Whamcloud - gitweb
- merge 0.7rc1 from b_devel to HEAD (20030612 merge point) [0.7.0]
author pschwan <pschwan>
Thu, 12 Jun 2003 07:12:50 +0000 (07:12 +0000)
committer pschwan <pschwan>
Thu, 12 Jun 2003 07:12:50 +0000 (07:12 +0000)
- remove extN/ files

726 files changed:
lnet/.cvsignore [moved from lustre/lib/.cvsignore with 60% similarity]
lnet/AUTHORS [new file with mode: 0644]
lnet/ChangeLog [new file with mode: 0644]
lnet/Kernelenv.in [new file with mode: 0644]
lnet/Kernelenv.mk [new file with mode: 0644]
lnet/Makefile.am [new file with mode: 0644]
lnet/Makefile.mk [new file with mode: 0644]
lnet/NEWS [new file with mode: 0644]
lnet/README [new file with mode: 0644]
lnet/Rules.linux [new file with mode: 0644]
lnet/archdep.m4 [new file with mode: 0644]
lnet/autogen.sh [new file with mode: 0644]
lnet/build.m4 [new file with mode: 0644]
lnet/configure.in [new file with mode: 0644]
lnet/doc/.cvsignore [new file with mode: 0644]
lnet/doc/Data-structures [new file with mode: 0644]
lnet/doc/Makefile.am [new file with mode: 0644]
lnet/doc/Message-life-cycle [new file with mode: 0644]
lnet/doc/NAL-HOWTO [new file with mode: 0644]
lnet/doc/file.fig [new file with mode: 0644]
lnet/doc/flow_new.fig [new file with mode: 0644]
lnet/doc/get.fig [new file with mode: 0644]
lnet/doc/ieee.bst [new file with mode: 0644]
lnet/doc/mpi.fig [new file with mode: 0644]
lnet/doc/portals.fig [new file with mode: 0644]
lnet/doc/portals3.bib [new file with mode: 0644]
lnet/doc/portals3.lyx [new file with mode: 0644]
lnet/doc/put.fig [new file with mode: 0644]
lnet/include/.cvsignore [new file with mode: 0644]
lnet/include/Makefile.am [new file with mode: 0644]
lnet/include/config.h.in [new file with mode: 0644]
lnet/include/linux/Makefile.am [new file with mode: 0644]
lnet/include/linux/kp30.h [new file with mode: 0644]
lnet/include/linux/portals_compat25.h [new file with mode: 0644]
lnet/include/linux/portals_lib.h [new file with mode: 0644]
lnet/include/lnet/Makefile.am [new file with mode: 0644]
lnet/include/lnet/api-support.h [new file with mode: 0644]
lnet/include/lnet/api.h [new file with mode: 0644]
lnet/include/lnet/arg-blocks.h [new file with mode: 0644]
lnet/include/lnet/defines.h [new file with mode: 0644]
lnet/include/lnet/errno.h [new file with mode: 0644]
lnet/include/lnet/internal.h [new file with mode: 0644]
lnet/include/lnet/lib-dispatch.h [new file with mode: 0644]
lnet/include/lnet/lib-lnet.h [new file with mode: 0644]
lnet/include/lnet/lib-nal.h [new file with mode: 0644]
lnet/include/lnet/lib-p30.h [new file with mode: 0644]
lnet/include/lnet/lib-types.h [new file with mode: 0644]
lnet/include/lnet/list.h [new file with mode: 0644]
lnet/include/lnet/lltrace.h [new file with mode: 0644]
lnet/include/lnet/lnet.h [new file with mode: 0644]
lnet/include/lnet/lnetctl.h [new file with mode: 0644]
lnet/include/lnet/myrnal.h [new file with mode: 0644]
lnet/include/lnet/nal.h [new file with mode: 0644]
lnet/include/lnet/nalids.h [new file with mode: 0644]
lnet/include/lnet/p30.h [new file with mode: 0644]
lnet/include/lnet/ppid.h [new file with mode: 0644]
lnet/include/lnet/ptlctl.h [new file with mode: 0644]
lnet/include/lnet/stringtab.h [new file with mode: 0644]
lnet/include/lnet/types.h [new file with mode: 0644]
lnet/klnds/.cvsignore [new file with mode: 0644]
lnet/klnds/Makefile.am [new file with mode: 0644]
lnet/klnds/Makefile.mk [new file with mode: 0644]
lnet/klnds/gmlnd/.cvsignore [new file with mode: 0644]
lnet/klnds/gmlnd/Makefile.am [new file with mode: 0644]
lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch [new file with mode: 0644]
lnet/klnds/gmlnd/gmlnd.h [new file with mode: 0644]
lnet/klnds/gmlnd/gmlnd_cb.c [new file with mode: 0644]
lnet/klnds/gmlnd/gmnal.c [new file with mode: 0644]
lnet/klnds/qswlnd/.cvsignore [new file with mode: 0644]
lnet/klnds/qswlnd/Makefile.am [new file with mode: 0644]
lnet/klnds/qswlnd/qswlnd.c [new file with mode: 0644]
lnet/klnds/qswlnd/qswlnd.h [new file with mode: 0644]
lnet/klnds/qswlnd/qswlnd_cb.c [new file with mode: 0644]
lnet/klnds/scimaclnd/.cvsignore [new file with mode: 0644]
lnet/klnds/scimaclnd/Makefile.am [new file with mode: 0644]
lnet/klnds/scimaclnd/README.scimacnal [new file with mode: 0644]
lnet/klnds/scimaclnd/scimac.conf [new file with mode: 0644]
lnet/klnds/scimaclnd/scimacnal.c [new file with mode: 0644]
lnet/klnds/scimaclnd/scimacnal.h [new file with mode: 0644]
lnet/klnds/scimaclnd/scimacnal_cb.c [new file with mode: 0644]
lnet/klnds/socklnd/.cvsignore [new file with mode: 0644]
lnet/klnds/socklnd/Makefile.am [new file with mode: 0644]
lnet/klnds/socklnd/Makefile.mk [new file with mode: 0644]
lnet/klnds/socklnd/socklnd.c [new file with mode: 0644]
lnet/klnds/socklnd/socklnd.h [new file with mode: 0644]
lnet/klnds/socklnd/socklnd_cb.c [new file with mode: 0644]
lnet/klnds/toelnd/.cvsignore [new file with mode: 0644]
lnet/klnds/toelnd/Makefile.am [new file with mode: 0644]
lnet/klnds/toelnd/toenal.c [new file with mode: 0644]
lnet/klnds/toelnd/toenal.h [new file with mode: 0644]
lnet/klnds/toelnd/toenal_cb.c [new file with mode: 0644]
lnet/libcfs/.cvsignore [new file with mode: 0644]
lnet/libcfs/Makefile.am [new file with mode: 0644]
lnet/libcfs/Makefile.mk [new file with mode: 0644]
lnet/libcfs/debug.c [new file with mode: 0644]
lnet/libcfs/module.c [new file with mode: 0644]
lnet/libcfs/proc.c [new file with mode: 0644]
lnet/lnet/.cvsignore [new file with mode: 0644]
lnet/lnet/Makefile.am [new file with mode: 0644]
lnet/lnet/Makefile.mk [new file with mode: 0644]
lnet/lnet/api-eq.c [new file with mode: 0644]
lnet/lnet/api-errno.c [new file with mode: 0644]
lnet/lnet/api-init.c [new file with mode: 0644]
lnet/lnet/api-me.c [new file with mode: 0644]
lnet/lnet/api-ni.c [new file with mode: 0644]
lnet/lnet/api-wrap.c [new file with mode: 0644]
lnet/lnet/lib-dispatch.c [new file with mode: 0644]
lnet/lnet/lib-eq.c [new file with mode: 0644]
lnet/lnet/lib-init.c [new file with mode: 0644]
lnet/lnet/lib-md.c [new file with mode: 0644]
lnet/lnet/lib-me.c [new file with mode: 0644]
lnet/lnet/lib-move.c [new file with mode: 0644]
lnet/lnet/lib-msg.c [new file with mode: 0644]
lnet/lnet/lib-ni.c [new file with mode: 0644]
lnet/lnet/lib-pid.c [new file with mode: 0644]
lnet/packaging/.cvsignore [new file with mode: 0644]
lnet/packaging/Makefile.am [new file with mode: 0644]
lnet/packaging/portals.spec.in [new file with mode: 0644]
lnet/router/.cvsignore [new file with mode: 0644]
lnet/router/Makefile.am [new file with mode: 0644]
lnet/router/Makefile.mk [new file with mode: 0644]
lnet/router/proc.c [new file with mode: 0644]
lnet/router/router.c [new file with mode: 0644]
lnet/router/router.h [new file with mode: 0644]
lnet/tests/.cvsignore [new file with mode: 0644]
lnet/tests/Makefile.am [new file with mode: 0644]
lnet/tests/ping.h [new file with mode: 0644]
lnet/tests/ping_cli.c [new file with mode: 0644]
lnet/tests/ping_srv.c [new file with mode: 0644]
lnet/tests/sping_cli.c [new file with mode: 0644]
lnet/tests/sping_srv.c [new file with mode: 0644]
lnet/tests/startclient.sh [new file with mode: 0644]
lnet/tests/startserver.sh [new file with mode: 0644]
lnet/tests/stopclient.sh [new file with mode: 0644]
lnet/tests/stopserver.sh [new file with mode: 0644]
lnet/ulnds/.cvsignore [new file with mode: 0644]
lnet/ulnds/Makefile.am [new file with mode: 0644]
lnet/ulnds/README [new file with mode: 0644]
lnet/ulnds/address.c [new file with mode: 0644]
lnet/ulnds/bridge.h [new file with mode: 0644]
lnet/ulnds/connection.c [new file with mode: 0644]
lnet/ulnds/connection.h [new file with mode: 0644]
lnet/ulnds/debug.c [new file with mode: 0644]
lnet/ulnds/dispatch.h [new file with mode: 0644]
lnet/ulnds/ipmap.h [new file with mode: 0644]
lnet/ulnds/pqtimer.c [new file with mode: 0644]
lnet/ulnds/pqtimer.h [new file with mode: 0644]
lnet/ulnds/procapi.c [new file with mode: 0644]
lnet/ulnds/procbridge.h [new file with mode: 0644]
lnet/ulnds/proclib.c [new file with mode: 0644]
lnet/ulnds/select.c [new file with mode: 0644]
lnet/ulnds/socklnd/Makefile.am [new file with mode: 0644]
lnet/ulnds/socklnd/README [new file with mode: 0644]
lnet/ulnds/socklnd/address.c [new file with mode: 0644]
lnet/ulnds/socklnd/bridge.h [new file with mode: 0644]
lnet/ulnds/socklnd/connection.c [new file with mode: 0644]
lnet/ulnds/socklnd/connection.h [new file with mode: 0644]
lnet/ulnds/socklnd/debug.c [new file with mode: 0644]
lnet/ulnds/socklnd/dispatch.h [new file with mode: 0644]
lnet/ulnds/socklnd/ipmap.h [new file with mode: 0644]
lnet/ulnds/socklnd/pqtimer.c [new file with mode: 0644]
lnet/ulnds/socklnd/pqtimer.h [new file with mode: 0644]
lnet/ulnds/socklnd/procapi.c [new file with mode: 0644]
lnet/ulnds/socklnd/procbridge.h [new file with mode: 0644]
lnet/ulnds/socklnd/proclib.c [new file with mode: 0644]
lnet/ulnds/socklnd/select.c [new file with mode: 0644]
lnet/ulnds/socklnd/table.c [new file with mode: 0644]
lnet/ulnds/socklnd/table.h [new file with mode: 0644]
lnet/ulnds/socklnd/tcplnd.c [new file with mode: 0644]
lnet/ulnds/socklnd/timer.h [new file with mode: 0644]
lnet/ulnds/socklnd/utypes.h [new file with mode: 0644]
lnet/ulnds/table.c [new file with mode: 0644]
lnet/ulnds/table.h [new file with mode: 0644]
lnet/ulnds/tcplnd.c [new file with mode: 0644]
lnet/ulnds/timer.h [new file with mode: 0644]
lnet/ulnds/utypes.h [new file with mode: 0644]
lnet/utils/.cvsignore [new file with mode: 0644]
lnet/utils/Makefile.am [new file with mode: 0644]
lnet/utils/acceptor.c [new file with mode: 0644]
lnet/utils/debug.c [new file with mode: 0644]
lnet/utils/debugctl.c [new file with mode: 0644]
lnet/utils/l_ioctl.c [new file with mode: 0644]
lnet/utils/parser.c [new file with mode: 0644]
lnet/utils/parser.h [new file with mode: 0644]
lnet/utils/portals.c [new file with mode: 0644]
lnet/utils/ptlctl.c [new file with mode: 0644]
lnet/utils/routerstat.c [new file with mode: 0644]
lnet/utils/wirecheck.c [new file with mode: 0644]
lustre/.cvsignore
lustre/ChangeLog
lustre/Makefile.am
lustre/Makefile.mk [new file with mode: 0644]
lustre/README
lustre/Rules
lustre/archdep.m4 [deleted file]
lustre/autogen.sh
lustre/cobd/cache_obd.c
lustre/cobd/lproc_cache.c
lustre/conf/lustre.dtd
lustre/conf/lustre2ldif.xsl
lustre/conf/slapd-lustre.conf
lustre/configure.in
lustre/doc/lconf.lyx
lustre/doc/lctl.lyx
lustre/doc/lmc.lyx
lustre/extN/Makefile.am [deleted file]
lustre/extN/ext3-largefile.diff [deleted file]
lustre/extN/ext3-unmount_sync.diff [deleted file]
lustre/extN/extN-2.4.18-exports.diff [deleted file]
lustre/extN/extN-2.4.18-ino_sb_fixup.diff [deleted file]
lustre/extN/extN-san.diff [deleted file]
lustre/extN/extN-wantedi.diff [deleted file]
lustre/include/.cvsignore
lustre/include/ioctl.h [new file with mode: 0644]
lustre/include/liblustre.h
lustre/include/linux/lprocfs_status.h
lustre/include/linux/lustre_compat25.h [new file with mode: 0644]
lustre/include/linux/lustre_dlm.h
lustre/include/linux/lustre_export.h
lustre/include/linux/lustre_fsfilt.h
lustre/include/linux/lustre_ha.h
lustre/include/linux/lustre_idl.h
lustre/include/linux/lustre_import.h
lustre/include/linux/lustre_lib.h
lustre/include/linux/lustre_lite.h
lustre/include/linux/lustre_mds.h
lustre/include/linux/lustre_net.h
lustre/include/linux/obd.h
lustre/include/linux/obd_class.h
lustre/include/linux/obd_echo.h
lustre/include/linux/obd_filter.h
lustre/include/linux/obd_lov.h
lustre/include/linux/obd_ost.h
lustre/include/linux/obd_ptlbd.h
lustre/include/linux/obd_support.h
lustre/kernel_patches/README
lustre/kernel_patches/kernel_configs/config-linux-2.4.18-i386 [new file with mode: 0644]
lustre/kernel_patches/kernel_configs/config-linux-2.4.18-p4smp-61chaos [new file with mode: 0644]
lustre/kernel_patches/kernel_configs/config-linux-2.4.18-uml [new file with mode: 0644]
lustre/kernel_patches/kernel_configs/config-linux-2.4.20-i386-rh [new file with mode: 0644]
lustre/kernel_patches/kernel_configs/config-linux-2.4.20-uml [new file with mode: 0644]
lustre/kernel_patches/kernel_configs/jdike-2.5.69-uml.config [new file with mode: 0644]
lustre/kernel_patches/patches/dev_read_only_2.4.20-rh.patch [new file with mode: 0644]
lustre/kernel_patches/patches/dev_read_only_2.4.20.patch [moved from lustre/kernel_patches/patches/dev_read_only_hp.patch with 62% similarity]
lustre/kernel_patches/patches/dev_read_only_hp_2.4.20.patch [new file with mode: 0644]
lustre/kernel_patches/patches/dsp.patch [new file with mode: 0644]
lustre/kernel_patches/patches/export-truncate-2.5.63.patch [new file with mode: 0644]
lustre/kernel_patches/patches/export-truncate.patch [new file with mode: 0644]
lustre/kernel_patches/patches/exports.patch
lustre/kernel_patches/patches/exports_2.4.20-rh-hp.patch [moved from lustre/kernel_patches/patches/exports_hp.patch with 61% similarity]
lustre/kernel_patches/patches/exports_2.4.20.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext-2.4-patch-1-chaos.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext-2.4-patch-1.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext-2.4-patch-2.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext-2.4-patch-3.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext-2.4-patch-4.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-2.4-ino_t.patch [moved from lustre/extN/ext3-2.4-ino_t.diff with 73% similarity]
lustre/kernel_patches/patches/ext3-2.4.18-fixes.patch [moved from lustre/extN/ext3-2.4.18-fixes.diff with 100% similarity]
lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro.patch [moved from lustre/extN/ext3-2.4.18-ino_sb_macro.diff with 99% similarity]
lustre/kernel_patches/patches/ext3-2.4.20-fixes.patch [moved from lustre/extN/patch-2.4.18-chaos22 with 60% similarity]
lustre/kernel_patches/patches/ext3-2.5-noread.patch [moved from lustre/extN/ext3-2.5-noread.diff with 99% similarity]
lustre/kernel_patches/patches/ext3-2.5.63.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-delete_thread-2.4.18.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-largefile.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-noread-2.4.20.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-orphan_lock.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-san-2.4.20.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-truncate_blocks-chaos.patch.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-truncate_blocks.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-unmount_sync.patch [new file with mode: 0644]
lustre/kernel_patches/patches/ext3-use-after-free.patch [moved from lustre/extN/ext3-use-after-free.diff with 56% similarity]
lustre/kernel_patches/patches/ext3-xattr-2.5.patch [deleted file]
lustre/kernel_patches/patches/ext3_orphan_lock-2.4.20-rh.patch [new file with mode: 0644]
lustre/kernel_patches/patches/extN-2.4.18-ino_sb_fixup.patch [new file with mode: 0644]
lustre/kernel_patches/patches/extN-delete_thread.patch [new file with mode: 0644]
lustre/kernel_patches/patches/extN-iget-debug.patch [moved from lustre/extN/extN-iget-debug.diff with 78% similarity]
lustre/kernel_patches/patches/extN-misc-fixup.patch [moved from lustre/extN/extN-misc-fixup.diff with 58% similarity]
lustre/kernel_patches/patches/extN-noread.patch [moved from lustre/extN/extN-noread.diff with 54% similarity]
lustre/kernel_patches/patches/extN-san.patch [new file with mode: 0644]
lustre/kernel_patches/patches/extN-wantedi.patch [new file with mode: 0644]
lustre/kernel_patches/patches/htree-ext3-2.4.18.patch [moved from lustre/extN/htree-ext3-2.4.18.diff with 99% similarity]
lustre/kernel_patches/patches/invalidate_show-2.4.20-rh.patch [new file with mode: 0644]
lustre/kernel_patches/patches/invalidate_show.patch
lustre/kernel_patches/patches/iod-rmap-exports-2.4.20.patch [new file with mode: 0644]
lustre/kernel_patches/patches/iod-rmap-exports.patch
lustre/kernel_patches/patches/iod-stock-24-exports.patch [new file with mode: 0644]
lustre/kernel_patches/patches/iod-stock-24-exports_hp.patch
lustre/kernel_patches/patches/iopen-2.4.18.patch [new file with mode: 0644]
lustre/kernel_patches/patches/iopen-2.4.20.patch [new file with mode: 0644]
lustre/kernel_patches/patches/kmem_cache_validate_2.4.20-rh.patch [new file with mode: 0644]
lustre/kernel_patches/patches/kmem_cache_validate_2.4.20.patch [new file with mode: 0644]
lustre/kernel_patches/patches/kmem_cache_validate_hp.patch
lustre/kernel_patches/patches/linux-2.4.18ea-0.8.26.patch [moved from lustre/extN/linux-2.4.18ea-0.8.26.diff with 93% similarity]
lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-chaos.patch [new file with mode: 0644]
lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-hp.patch [new file with mode: 0644]
lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54.patch [new file with mode: 0644]
lustre/kernel_patches/patches/lustre-2.5.63.patch [new file with mode: 0644]
lustre/kernel_patches/patches/lustre-2.5.patch [deleted file]
lustre/kernel_patches/patches/lustre_version.patch
lustre/kernel_patches/patches/mcore-2.4.20-8.patch [new file with mode: 0644]
lustre/kernel_patches/patches/patch-2.4.18-hp1_pnnl18.2.8qsnet.patch [deleted file]
lustre/kernel_patches/patches/tcp-zero-copy.patch [new file with mode: 0644]
lustre/kernel_patches/patches/uml-patch-2.4.20-4.patch [new file with mode: 0644]
lustre/kernel_patches/patches/uml_check_get_page.patch
lustre/kernel_patches/patches/uml_no_panic.patch
lustre/kernel_patches/patches/vanilla-2.4.18.patch [deleted file]
lustre/kernel_patches/patches/vanilla-2.4.19.patch [deleted file]
lustre/kernel_patches/patches/vfs_intent-2.4.18-18.patch
lustre/kernel_patches/patches/vfs_intent-2.4.20-rh.patch [moved from lustre/kernel_patches/patches/vfs_intent.patch with 50% similarity]
lustre/kernel_patches/patches/vfs_intent-2.4.20-vanilla.patch [moved from lustre/kernel_patches/patches/vfs_intent_hp.patch with 76% similarity]
lustre/kernel_patches/pc/dev_read_only_2.4.20-rh.pc [moved from lustre/kernel_patches/pc/dev_read_only_hp.pc with 100% similarity]
lustre/kernel_patches/pc/dev_read_only_2.4.20.pc [new file with mode: 0644]
lustre/kernel_patches/pc/dev_read_only_hp_2.4.20.pc [new file with mode: 0644]
lustre/kernel_patches/pc/dsp.pc [new file with mode: 0644]
lustre/kernel_patches/pc/export-truncate-2.5.63.pc [new file with mode: 0644]
lustre/kernel_patches/pc/export-truncate.pc [new file with mode: 0644]
lustre/kernel_patches/pc/exports_2.4.20-rh-hp.pc [moved from lustre/kernel_patches/pc/exports_hp.pc with 100% similarity]
lustre/kernel_patches/pc/exports_2.4.20.pc [new file with mode: 0644]
lustre/kernel_patches/pc/exports_hp_2.4.20.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext-2.4-patch-1-chaos.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext-2.4-patch-1.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext-2.4-patch-2.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext-2.4-patch-3.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext-2.4-patch-4.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-2.4-ino_t.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-2.4.18-fixes.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-2.4.20-fixes.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-2.5-noread.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-2.5.63.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-delete_thread-2.4.18.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-delete_thread-2.4.20.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-largefile.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-noread-2.4.20.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-orphan_lock.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-san-2.4.20.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-truncate_blocks-chaos.patch.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-truncate_blocks.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-unmount_sync.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3-use-after-free.pc [new file with mode: 0644]
lustre/kernel_patches/pc/ext3_orphan_lock-2.4.20-rh.pc [new file with mode: 0644]
lustre/kernel_patches/pc/extN-2.4.18-ino_sb_fixup.pc [new file with mode: 0644]
lustre/kernel_patches/pc/extN-delete_thread.pc [new file with mode: 0644]
lustre/kernel_patches/pc/extN-iget-debug.pc [new file with mode: 0644]
lustre/kernel_patches/pc/extN-misc-fixup.pc [new file with mode: 0644]
lustre/kernel_patches/pc/extN-noread.pc [new file with mode: 0644]
lustre/kernel_patches/pc/extN-san.pc [new file with mode: 0644]
lustre/kernel_patches/pc/extN-wantedi.pc [new file with mode: 0644]
lustre/kernel_patches/pc/htree-ext3-2.4.18.pc [new file with mode: 0644]
lustre/kernel_patches/pc/invalidate_show-2.4.20-rh.pc [new file with mode: 0644]
lustre/kernel_patches/pc/invalidate_show.pc
lustre/kernel_patches/pc/iod-rmap-exports-2.4.20.pc [new file with mode: 0644]
lustre/kernel_patches/pc/iod-rmap-exports.pc
lustre/kernel_patches/pc/iod-stock-24-exports.pc [new file with mode: 0644]
lustre/kernel_patches/pc/iod-stock-24-exports_hp.pc [new file with mode: 0644]
lustre/kernel_patches/pc/iopen-2.4.18.pc [new file with mode: 0644]
lustre/kernel_patches/pc/iopen-2.4.20.pc [new file with mode: 0644]
lustre/kernel_patches/pc/kmem_cache_validate_2.4.20-rh.pc [new file with mode: 0644]
lustre/kernel_patches/pc/kmem_cache_validate_2.4.20.pc [new file with mode: 0644]
lustre/kernel_patches/pc/kmem_cache_validate_hp.pc
lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26.pc [new file with mode: 0644]
lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54-chaos.pc [new file with mode: 0644]
lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54-hp.pc [new file with mode: 0644]
lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54.pc [new file with mode: 0644]
lustre/kernel_patches/pc/lustre-2.5.63.pc [new file with mode: 0644]
lustre/kernel_patches/pc/mcore-2.4.20-8.pc [new file with mode: 0644]
lustre/kernel_patches/pc/patch-2.4.18-hp1_pnnl18.2.8qsnet.pc [deleted file]
lustre/kernel_patches/pc/tcp-zero-copy.pc [new file with mode: 0644]
lustre/kernel_patches/pc/uml-patch-2.4.20-4.pc [new file with mode: 0644]
lustre/kernel_patches/pc/uml_check_get_page.pc
lustre/kernel_patches/pc/uml_compile_fixes.pc
lustre/kernel_patches/pc/uml_no_panic.pc
lustre/kernel_patches/pc/vanilla-2.4.18.pc [deleted file]
lustre/kernel_patches/pc/vanilla-2.4.19.pc [deleted file]
lustre/kernel_patches/pc/vfs_intent-2.4.18-18.pc
lustre/kernel_patches/pc/vfs_intent-2.4.20-rh.pc [moved from lustre/kernel_patches/pc/vfs_intent.pc with 82% similarity]
lustre/kernel_patches/pc/vfs_intent-2.4.20-vanilla.pc [moved from lustre/kernel_patches/pc/vfs_intent_hp.pc with 82% similarity]
lustre/kernel_patches/prepare_tree.sh
lustre/kernel_patches/scripts/apatch
lustre/kernel_patches/scripts/cat-series [new file with mode: 0755]
lustre/kernel_patches/scripts/combine-applied
lustre/kernel_patches/scripts/forkpatch [new file with mode: 0755]
lustre/kernel_patches/scripts/join-patch [new file with mode: 0755]
lustre/kernel_patches/scripts/patchfns
lustre/kernel_patches/scripts/poppatch
lustre/kernel_patches/scripts/pushpatch
lustre/kernel_patches/scripts/refpatch
lustre/kernel_patches/scripts/rpatch
lustre/kernel_patches/scripts/sum-series [new file with mode: 0755]
lustre/kernel_patches/scripts/trypatch [new file with mode: 0755]
lustre/kernel_patches/scripts/unused-patches [new file with mode: 0755]
lustre/kernel_patches/series/chaos
lustre/kernel_patches/series/hp-pnnl [deleted file]
lustre/kernel_patches/series/hp-pnnl-2.4.20 [new file with mode: 0644]
lustre/kernel_patches/series/rh-2.4.18-18
lustre/kernel_patches/series/rh-2.4.20 [new file with mode: 0644]
lustre/kernel_patches/series/vanilla-2.4.18 [deleted file]
lustre/kernel_patches/series/vanilla-2.4.19 [deleted file]
lustre/kernel_patches/series/vanilla-2.4.20 [new file with mode: 0644]
lustre/kernel_patches/series/vanilla-2.5
lustre/kernel_patches/txt/ext3-2.4.20-fixes.txt [new file with mode: 0644]
lustre/kernel_patches/txt/vfs_intent.txt [deleted file]
lustre/kernel_patches/which_patch
lustre/ldlm/Makefile.am
lustre/ldlm/ldlm_extent.c
lustre/ldlm/ldlm_internal.h [new file with mode: 0644]
lustre/ldlm/ldlm_lib.c [new file with mode: 0644]
lustre/ldlm/ldlm_lock.c
lustre/ldlm/ldlm_lockd.c
lustre/ldlm/ldlm_request.c
lustre/ldlm/ldlm_resource.c
lustre/lib/Makefile.am [deleted file]
lustre/lib/client.c [deleted file]
lustre/lib/mds_updates.c [deleted file]
lustre/lib/obd_pack.c [deleted file]
lustre/lib/target.c [deleted file]
lustre/liblustre/Makefile.am
lustre/liblustre/file.c [new file with mode: 0644]
lustre/liblustre/libtest.c
lustre/liblustre/llite_lib.c [new file with mode: 0644]
lustre/liblustre/llite_lib.h [new file with mode: 0644]
lustre/liblustre/lltest.c [new file with mode: 0644]
lustre/liblustre/rw.c [new file with mode: 0644]
lustre/liblustre/super.c [new file with mode: 0644]
lustre/llite/Makefile.am
lustre/llite/commit_callback.c
lustre/llite/dcache.c
lustre/llite/dir.c
lustre/llite/file.c
lustre/llite/iod.c
lustre/llite/llite_internal.h [new file with mode: 0644]
lustre/llite/lproc_llite.c
lustre/llite/namei.c
lustre/llite/recover.c [deleted file]
lustre/llite/rw.c
lustre/llite/super.c
lustre/llite/super25.c
lustre/llite/symlink.c
lustre/lov/Makefile.am
lustre/lov/lov_obd.c
lustre/lov/lov_pack.c
lustre/mdc/Makefile.am
lustre/mdc/mdc_internal.h [new file with mode: 0644]
lustre/mdc/mdc_lib.c [new file with mode: 0644]
lustre/mdc/mdc_reint.c
lustre/mdc/mdc_request.c
lustre/mds/Makefile.am
lustre/mds/Makefile.mk [new file with mode: 0644]
lustre/mds/handler.c
lustre/mds/lproc_mds.c
lustre/mds/mds_fs.c
lustre/mds/mds_internal.h [new file with mode: 0644]
lustre/mds/mds_lib.c [new file with mode: 0644]
lustre/mds/mds_lov.c
lustre/mds/mds_open.c
lustre/mds/mds_reint.c
lustre/obdclass/Makefile.am
lustre/obdclass/class_obd.c
lustre/obdclass/debug.c
lustre/obdclass/fsfilt.c
lustre/obdclass/fsfilt_ext3.c
lustre/obdclass/fsfilt_extN.c
lustre/obdclass/fsfilt_reiserfs.c
lustre/obdclass/genops.c
lustre/obdclass/lprocfs_status.c
lustre/obdclass/lustre_handles.c
lustre/obdclass/simple.c [moved from lustre/lib/simple.c with 83% similarity]
lustre/obdclass/statfs_pack.c
lustre/obdclass/sysctl.c
lustre/obdclass/uuid.c
lustre/obdecho/Makefile.am
lustre/obdecho/echo.c
lustre/obdecho/echo_client.c
lustre/obdfilter/Makefile.am
lustre/obdfilter/filter.c
lustre/obdfilter/lproc_obdfilter.c
lustre/osc/Makefile.am
lustre/osc/osc_lib.c [new file with mode: 0644]
lustre/osc/osc_request.c
lustre/ost/Makefile.am
lustre/ost/ost_handler.c
lustre/portals/.cvsignore [new file with mode: 0644]
lustre/portals/AUTHORS [new file with mode: 0644]
lustre/portals/ChangeLog [new file with mode: 0644]
lustre/portals/Kernelenv.in [new file with mode: 0644]
lustre/portals/Kernelenv.mk [new file with mode: 0644]
lustre/portals/Makefile.am [new file with mode: 0644]
lustre/portals/Makefile.mk [new file with mode: 0644]
lustre/portals/NEWS [new file with mode: 0644]
lustre/portals/README [new file with mode: 0644]
lustre/portals/Rules.linux [new file with mode: 0644]
lustre/portals/archdep.m4 [new file with mode: 0644]
lustre/portals/autogen.sh [new file with mode: 0755]
lustre/portals/build.m4 [new file with mode: 0644]
lustre/portals/configure.in [new file with mode: 0644]
lustre/portals/doc/.cvsignore [new file with mode: 0644]
lustre/portals/doc/Data-structures [new file with mode: 0644]
lustre/portals/doc/Makefile.am [new file with mode: 0644]
lustre/portals/doc/Message-life-cycle [new file with mode: 0644]
lustre/portals/doc/NAL-HOWTO [new file with mode: 0644]
lustre/portals/doc/file.fig [new file with mode: 0644]
lustre/portals/doc/flow_new.fig [new file with mode: 0644]
lustre/portals/doc/get.fig [new file with mode: 0644]
lustre/portals/doc/ieee.bst [new file with mode: 0644]
lustre/portals/doc/mpi.fig [new file with mode: 0644]
lustre/portals/doc/portals.fig [new file with mode: 0644]
lustre/portals/doc/portals3.bib [new file with mode: 0644]
lustre/portals/doc/portals3.lyx [new file with mode: 0644]
lustre/portals/doc/put.fig [new file with mode: 0644]
lustre/portals/include/.cvsignore [new file with mode: 0644]
lustre/portals/include/Makefile.am [new file with mode: 0644]
lustre/portals/include/config.h.in [new file with mode: 0644]
lustre/portals/include/linux/Makefile.am [new file with mode: 0644]
lustre/portals/include/linux/kp30.h [new file with mode: 0644]
lustre/portals/include/linux/portals_compat25.h [new file with mode: 0644]
lustre/portals/include/linux/portals_lib.h [new file with mode: 0644]
lustre/portals/include/portals/Makefile.am [new file with mode: 0644]
lustre/portals/include/portals/api-support.h [new file with mode: 0644]
lustre/portals/include/portals/api.h [new file with mode: 0644]
lustre/portals/include/portals/arg-blocks.h [new file with mode: 0644]
lustre/portals/include/portals/defines.h [new file with mode: 0644]
lustre/portals/include/portals/errno.h [new file with mode: 0644]
lustre/portals/include/portals/internal.h [new file with mode: 0644]
lustre/portals/include/portals/lib-dispatch.h [new file with mode: 0644]
lustre/portals/include/portals/lib-nal.h [new file with mode: 0644]
lustre/portals/include/portals/lib-p30.h [new file with mode: 0644]
lustre/portals/include/portals/lib-types.h [new file with mode: 0644]
lustre/portals/include/portals/list.h [new file with mode: 0644]
lustre/portals/include/portals/lltrace.h [new file with mode: 0644]
lustre/portals/include/portals/myrnal.h [new file with mode: 0644]
lustre/portals/include/portals/nal.h [new file with mode: 0644]
lustre/portals/include/portals/nalids.h [new file with mode: 0644]
lustre/portals/include/portals/p30.h [new file with mode: 0644]
lustre/portals/include/portals/ppid.h [new file with mode: 0644]
lustre/portals/include/portals/ptlctl.h [new file with mode: 0644]
lustre/portals/include/portals/stringtab.h [new file with mode: 0644]
lustre/portals/include/portals/types.h [new file with mode: 0644]
lustre/portals/knals/.cvsignore [new file with mode: 0644]
lustre/portals/knals/Makefile.am [new file with mode: 0644]
lustre/portals/knals/Makefile.mk [new file with mode: 0644]
lustre/portals/knals/gmnal/.cvsignore [new file with mode: 0644]
lustre/portals/knals/gmnal/Makefile.am [new file with mode: 0644]
lustre/portals/knals/gmnal/gm-1.5.2.1-exports.patch [new file with mode: 0644]
lustre/portals/knals/gmnal/gmnal.c [new file with mode: 0644]
lustre/portals/knals/gmnal/gmnal.h [new file with mode: 0644]
lustre/portals/knals/gmnal/gmnal_cb.c [new file with mode: 0644]
lustre/portals/knals/qswnal/.cvsignore [new file with mode: 0644]
lustre/portals/knals/qswnal/Makefile.am [new file with mode: 0644]
lustre/portals/knals/qswnal/qswnal.c [new file with mode: 0644]
lustre/portals/knals/qswnal/qswnal.h [new file with mode: 0644]
lustre/portals/knals/qswnal/qswnal_cb.c [new file with mode: 0644]
lustre/portals/knals/scimacnal/.cvsignore [new file with mode: 0644]
lustre/portals/knals/scimacnal/Makefile.am [new file with mode: 0644]
lustre/portals/knals/scimacnal/README.scimacnal [new file with mode: 0644]
lustre/portals/knals/scimacnal/scimac.conf [new file with mode: 0644]
lustre/portals/knals/scimacnal/scimacnal.c [new file with mode: 0644]
lustre/portals/knals/scimacnal/scimacnal.h [new file with mode: 0644]
lustre/portals/knals/scimacnal/scimacnal_cb.c [new file with mode: 0644]
lustre/portals/knals/socknal/.cvsignore [new file with mode: 0644]
lustre/portals/knals/socknal/Makefile.am [new file with mode: 0644]
lustre/portals/knals/socknal/Makefile.mk [new file with mode: 0644]
lustre/portals/knals/socknal/socknal.c [new file with mode: 0644]
lustre/portals/knals/socknal/socknal.h [new file with mode: 0644]
lustre/portals/knals/socknal/socknal_cb.c [new file with mode: 0644]
lustre/portals/knals/toenal/.cvsignore [new file with mode: 0644]
lustre/portals/knals/toenal/Makefile.am [new file with mode: 0644]
lustre/portals/knals/toenal/toenal.c [new file with mode: 0644]
lustre/portals/knals/toenal/toenal.h [new file with mode: 0644]
lustre/portals/knals/toenal/toenal_cb.c [new file with mode: 0644]
lustre/portals/libcfs/.cvsignore [new file with mode: 0644]
lustre/portals/libcfs/Makefile.am [new file with mode: 0644]
lustre/portals/libcfs/Makefile.mk [new file with mode: 0644]
lustre/portals/libcfs/debug.c [new file with mode: 0644]
lustre/portals/libcfs/module.c [new file with mode: 0644]
lustre/portals/libcfs/proc.c [new file with mode: 0644]
lustre/portals/packaging/.cvsignore [new file with mode: 0644]
lustre/portals/packaging/Makefile.am [new file with mode: 0644]
lustre/portals/packaging/portals.spec.in [new file with mode: 0644]
lustre/portals/portals/.cvsignore [new file with mode: 0644]
lustre/portals/portals/Makefile.am [new file with mode: 0644]
lustre/portals/portals/Makefile.mk [new file with mode: 0644]
lustre/portals/portals/api-eq.c [new file with mode: 0644]
lustre/portals/portals/api-errno.c [new file with mode: 0644]
lustre/portals/portals/api-init.c [new file with mode: 0644]
lustre/portals/portals/api-me.c [new file with mode: 0644]
lustre/portals/portals/api-ni.c [new file with mode: 0644]
lustre/portals/portals/api-wrap.c [new file with mode: 0644]
lustre/portals/portals/lib-dispatch.c [new file with mode: 0644]
lustre/portals/portals/lib-eq.c [new file with mode: 0644]
lustre/portals/portals/lib-init.c [new file with mode: 0644]
lustre/portals/portals/lib-md.c [new file with mode: 0644]
lustre/portals/portals/lib-me.c [new file with mode: 0644]
lustre/portals/portals/lib-move.c [new file with mode: 0644]
lustre/portals/portals/lib-msg.c [new file with mode: 0644]
lustre/portals/portals/lib-ni.c [new file with mode: 0644]
lustre/portals/portals/lib-pid.c [new file with mode: 0644]
lustre/portals/router/.cvsignore [new file with mode: 0644]
lustre/portals/router/Makefile.am [new file with mode: 0644]
lustre/portals/router/Makefile.mk [new file with mode: 0644]
lustre/portals/router/proc.c [new file with mode: 0644]
lustre/portals/router/router.c [new file with mode: 0644]
lustre/portals/router/router.h [new file with mode: 0644]
lustre/portals/tests/.cvsignore [new file with mode: 0644]
lustre/portals/tests/Makefile.am [new file with mode: 0644]
lustre/portals/tests/ping.h [new file with mode: 0644]
lustre/portals/tests/ping_cli.c [new file with mode: 0644]
lustre/portals/tests/ping_srv.c [new file with mode: 0644]
lustre/portals/tests/sping_cli.c [new file with mode: 0644]
lustre/portals/tests/sping_srv.c [new file with mode: 0644]
lustre/portals/tests/startclient.sh [new file with mode: 0755]
lustre/portals/tests/startserver.sh [new file with mode: 0755]
lustre/portals/tests/stopclient.sh [new file with mode: 0755]
lustre/portals/tests/stopserver.sh [new file with mode: 0644]
lustre/portals/unals/.cvsignore [new file with mode: 0644]
lustre/portals/unals/Makefile.am [new file with mode: 0644]
lustre/portals/unals/README [new file with mode: 0644]
lustre/portals/unals/address.c [new file with mode: 0644]
lustre/portals/unals/bridge.h [new file with mode: 0644]
lustre/portals/unals/connection.c [new file with mode: 0644]
lustre/portals/unals/connection.h [new file with mode: 0644]
lustre/portals/unals/debug.c [new file with mode: 0644]
lustre/portals/unals/dispatch.h [new file with mode: 0644]
lustre/portals/unals/ipmap.h [new file with mode: 0644]
lustre/portals/unals/pqtimer.c [new file with mode: 0644]
lustre/portals/unals/pqtimer.h [new file with mode: 0644]
lustre/portals/unals/procapi.c [new file with mode: 0644]
lustre/portals/unals/procbridge.h [new file with mode: 0644]
lustre/portals/unals/proclib.c [new file with mode: 0644]
lustre/portals/unals/select.c [new file with mode: 0644]
lustre/portals/unals/table.c [new file with mode: 0644]
lustre/portals/unals/table.h [new file with mode: 0644]
lustre/portals/unals/tcpnal.c [new file with mode: 0644]
lustre/portals/unals/timer.h [new file with mode: 0644]
lustre/portals/unals/utypes.h [new file with mode: 0644]
lustre/portals/utils/.cvsignore [new file with mode: 0644]
lustre/portals/utils/Makefile.am [new file with mode: 0644]
lustre/portals/utils/acceptor.c [new file with mode: 0644]
lustre/portals/utils/debug.c [new file with mode: 0644]
lustre/portals/utils/debugctl.c [new file with mode: 0644]
lustre/portals/utils/l_ioctl.c [new file with mode: 0644]
lustre/portals/utils/parser.c [new file with mode: 0644]
lustre/portals/utils/parser.h [new file with mode: 0644]
lustre/portals/utils/portals.c [new file with mode: 0644]
lustre/portals/utils/ptlctl.c [new file with mode: 0644]
lustre/portals/utils/routerstat.c [new file with mode: 0644]
lustre/portals/utils/wirecheck.c [new file with mode: 0644]
lustre/ptlbd/blk.c
lustre/ptlbd/client.c
lustre/ptlbd/rpc.c
lustre/ptlbd/server.c
lustre/ptlrpc/Makefile.am
lustre/ptlrpc/client.c
lustre/ptlrpc/connection.c
lustre/ptlrpc/events.c
lustre/ptlrpc/lproc_ptlrpc.c
lustre/ptlrpc/niobuf.c
lustre/ptlrpc/pack_generic.c
lustre/ptlrpc/pinger.c [new file with mode: 0644]
lustre/ptlrpc/ptlrpc_internal.h [new file with mode: 0644]
lustre/ptlrpc/ptlrpc_lib.c [new file with mode: 0644]
lustre/ptlrpc/ptlrpc_module.c [new file with mode: 0644]
lustre/ptlrpc/recovd.c [deleted file]
lustre/ptlrpc/recover.c
lustre/ptlrpc/rpc.c [deleted file]
lustre/ptlrpc/service.c
lustre/scripts/llite-group.sh [new file with mode: 0644]
lustre/scripts/lustre.spec.in
lustre/scripts/version_tag.pl
lustre/tests/.cvsignore
lustre/tests/Makefile.am
lustre/tests/acceptance-small.sh
lustre/tests/ba-echo.sh
lustre/tests/checkstat.c
lustre/tests/cobd.sh
lustre/tests/createtest.c
lustre/tests/directio.c
lustre/tests/echo.sh
lustre/tests/fchdir_test.c [new file with mode: 0644]
lustre/tests/llecho.sh
lustre/tests/llmount.sh
lustre/tests/llmountcleanup.sh
lustre/tests/llrmount.sh
lustre/tests/local.sh
lustre/tests/mcr-routed-config.sh
lustre/tests/mkdirdeep.c [new file with mode: 0644]
lustre/tests/opendevunlink.c [new file with mode: 0644]
lustre/tests/opendirunlink.c [new file with mode: 0644]
lustre/tests/openfile.c [new file with mode: 0644]
lustre/tests/recovery-cleanup.sh
lustre/tests/recovery-small-upcall.sh [new file with mode: 0755]
lustre/tests/recovery-small.sh
lustre/tests/runas.c
lustre/tests/runobdstat [new file with mode: 0644]
lustre/tests/runvmstat
lustre/tests/sanity-ldlm.sh [new file with mode: 0644]
lustre/tests/sanity.sh
lustre/tests/sanityN.sh
lustre/tests/test_brw.c
lustre/tests/uml.sh
lustre/tests/unlinkmany.c [new file with mode: 0644]
lustre/tests/writeme.c
lustre/utils/.cvsignore
lustre/utils/Lustre/.cvsignore [new file with mode: 0644]
lustre/utils/Lustre/Makefile.am [new file with mode: 0644]
lustre/utils/Lustre/__init__.py [new file with mode: 0644]
lustre/utils/Lustre/cmdline.py [new file with mode: 0644]
lustre/utils/Lustre/error.py [new file with mode: 0644]
lustre/utils/Lustre/lustredb.py [new file with mode: 0644]
lustre/utils/Makefile.am
lustre/utils/lactive [new file with mode: 0644]
lustre/utils/lconf [moved from lustre/utils/lconf.in with 60% similarity]
lustre/utils/lctl.c
lustre/utils/llparser.pm [deleted file]
lustre/utils/llstat.pl [new file with mode: 0755]
lustre/utils/lmc
lustre/utils/load_ldap.sh [new file with mode: 0755]
lustre/utils/lstripe.c
lustre/utils/obd.c
lustre/utils/obdctl.c
lustre/utils/obdctl.h
lustre/utils/obdiolib.c
lustre/utils/obdiolib.h
lustre/utils/obdstat.c
lustre/utils/parser.c
lustre/utils/wirecheck.c [new file with mode: 0644]

diff --git a/lustre/lib/.cvsignore b/lnet/.cvsignore
similarity index 60%
rename from lustre/lib/.cvsignore
rename to lnet/.cvsignore
index e530020..99ac885 100644 (file)
@@ -1,8 +1,8 @@
-.Xrefs
+Kernelenv
+Makefile
+Makefile.in
+aclocal.m4
+autom4te.cache
 config.log
 config.status
 configure
-Makefile
-Makefile.in
-.deps
-TAGS
diff --git a/lnet/AUTHORS b/lnet/AUTHORS
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lnet/ChangeLog b/lnet/ChangeLog
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lnet/Kernelenv.in b/lnet/Kernelenv.in
new file mode 100644 (file)
index 0000000..29a713f
--- /dev/null
@@ -0,0 +1 @@
+EXTRA_CFLAGS= -Ifs/lustre/include -Ifs/lustre/portals/include
diff --git a/lnet/Kernelenv.mk b/lnet/Kernelenv.mk
new file mode 100644 (file)
index 0000000..29a713f
--- /dev/null
@@ -0,0 +1 @@
+EXTRA_CFLAGS= -Ifs/lustre/include -Ifs/lustre/portals/include
diff --git a/lnet/Makefile.am b/lnet/Makefile.am
new file mode 100644 (file)
index 0000000..1a223f2
--- /dev/null
@@ -0,0 +1,12 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+EXTRA_DIST = Rules.linux archdep.m4 include 
+DIST_SUBDIRS = libcfs portals knals unals utils tests doc router
+if LIBLUSTRE
+SUBDIRS = portals unals utils
+else
+SUBDIRS = libcfs portals knals unals utils tests doc router
+endif
diff --git a/lnet/Makefile.mk b/lnet/Makefile.mk
new file mode 100644 (file)
index 0000000..be0e51a
--- /dev/null
@@ -0,0 +1,6 @@
+include fs/lustre/portals/Kernelenv
+
+obj-y += portals/
+obj-y += libcfs/
+obj-y += knals/
+obj-y += router/
diff --git a/lnet/NEWS b/lnet/NEWS
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lnet/README b/lnet/README
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lnet/Rules.linux b/lnet/Rules.linux
new file mode 100644 (file)
index 0000000..93943b7
--- /dev/null
@@ -0,0 +1,25 @@
+# included in Linux kernel directories
+# Rules for module building
+
+if LINUX25
+
+basename=$(shell echo $< | sed -e 's/\.c//g' | sed -e 's/-//g' | sed -e 's/\.o//g')
+AM_CPPFLAGS= -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -mpreferred-stack-boundary=2  -DKBUILD_MODNAME=$(MODULE) -DKBUILD_BASENAME=$(basename)
+
+$(MODULE).o: $($(MODULE)_OBJECTS)
+       $(LD) -m $(MOD_LINK) -r -o $(MODULE).o $($(MODULE)_OBJECTS)
+
+else
+
+$(MODULE).o: $($(MODULE)_OBJECTS)
+       $(LD) -m $(MOD_LINK) -r -o $(MODULE).o $($(MODULE)_OBJECTS)
+
+endif
+
+tags:
+       rm -f $(top_srcdir)/TAGS
+       rm -f $(top_srcdir)/tags
+       find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs etags -a
+       find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs etags -a
+       find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs ctags -a
+       find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs ctags -a
diff --git a/lnet/archdep.m4 b/lnet/archdep.m4
new file mode 100644 (file)
index 0000000..7cb00cf
--- /dev/null
@@ -0,0 +1,317 @@
+
+# -------- in kernel compilation? (2.5 only) -------------
+AC_ARG_ENABLE(inkernel, [ --enable-inkernel set up 2.5 kernel makefiles])
+AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes)
+echo "Makefile for in kernel build: $INKERNEL"
+
+# -------- liblustre compilation --------------
+AC_ARG_WITH(lib, [  --with-lib compile lustre library], host_cpu="lib")
+
+# -------- set linuxdir ------------
+
+AC_ARG_WITH(linux, [  --with-linux=[path] set path to Linux source (default=/usr/src/linux)],LINUX=$with_linux,LINUX=/usr/src/linux)
+AC_SUBST(LINUX)
+
+# --------- UML?  --------------------
+AC_MSG_CHECKING(if you are running user mode linux for $host_cpu ...)
+if test $host_cpu = "lib" ; then 
+        host_cpu="lib"
+       AC_MSG_RESULT(no building Lustre library)
+else
+  if test -e $LINUX/include/asm-um ; then
+    if test  X`ls -id $LINUX/include/asm/ | awk '{print $1}'` = X`ls -id $LINUX/include/asm-um | awk '{print $1}'` ; then
+       host_cpu="um";
+       AC_MSG_RESULT(yes)
+    else
+       AC_MSG_RESULT(no (asm doesn't point at asm-um))
+    fi
+
+  else 
+        AC_MSG_RESULT(no (asm-um missing))
+  fi
+fi
+
+# --------- Linux 25 ------------------
+
+AC_MSG_CHECKING(if you are running linux 2.5)
+if test -e $LINUX/include/linux/namei.h ; then
+        linux25="yes"
+        AC_MSG_RESULT(yes)
+else
+        linux25="no"
+        AC_MSG_RESULT(no)
+fi
+AM_CONDITIONAL(LINUX25, test x$linux25 = xyes)
+echo "Makefiles for in linux 2.5 build: $LINUX25"
+
+# -------  Makeflags ------------------
+
+AC_MSG_CHECKING(setting make flags system architecture: )
+case ${host_cpu} in
+       lib )
+       AC_MSG_RESULT($host_cpu)
+       KCFLAGS='-g -Wall '
+       KCPPFLAGS='-D__arch_lib__ '
+       libdir='${exec_prefix}/lib/lustre'
+        MOD_LINK=elf_i386
+;;
+       um )
+       AC_MSG_RESULT($host_cpu)
+       KCFLAGS='-g -Wall -pipe -Wno-trigraphs -Wstrict-prototypes -fno-strict-aliasing -fno-common '
+        case ${linux25} in
+                yes )
+                KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE  -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/include -I$(LINUX)/arch/um/kernel/tt/include -I$(LINUX)/arch/um/kernel/skas/include -O2 -nostdinc -iwithprefix include -DKBUILD_BASENAME=$(MODULE) -DKBUILD_MODNAME=$(MODULE) '
+        ;;
+                * )
+                KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE  -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/kernel/tt/include -I$(LINUX)/arch/um/include '
+        ;;
+        esac
+
+        MOD_LINK=elf_i386
+;;
+       i*86 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -pipe'
+        case ${linux25} in
+                yes )
+                KCPPFLAGS='-D__KERNEL__ -DMODULE -march=i686 -I$(LINUX)/include/asm-i386/mach-default -nostdinc -iwithprefix include '
+        ;;
+                * )
+                KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        ;;
+        esac
+        MOD_LINK=elf_i386
+;;
+
+       alphaev6 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-g -O2  -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6'
+        KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        MOD_LINK=elf64alpha
+;;
+
+       alphaev67 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-g -O2  -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6'
+        KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        MOD_LINK=elf64alpha
+;;
+
+       alpha* )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-g -O2  -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev5'
+        KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        MOD_LINK=elf64alpha
+;;
+
+       ia64 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-gstabs -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -ffixed-r13 -mfixed-range=f10-f15,f32-f127 -falign-functions=32 -mb-step'
+       KCPPFLAGS='-D__KERNEL__ -DMODULE'
+        MOD_LINK=elf64_ia64
+;;
+
+       sparc64 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -Wno-unused -m64 -pipe -mno-fpu -mcpu=ultrasparc -mcmodel=medlow -ffixed-g4 -fcall-used-g5 -fcall-used-g7 -Wno-sign-compare -Wa,--undeclared-regs'
+        KCPPFLAGS='-D__KERNEL__'
+        MOD_LINK=elf64_sparc
+
+;;
+
+       powerpc )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -D__powerpc__ -fsigned-char -msoft-float -pipe -ffixed-r2 -Wno-uninitialized -mmultiple -mstring'
+        KCPPFLAGS='-D__KERNEL__'
+        MOD_LINK=elf32ppclinux
+;;
+
+        *)
+       AC_ERROR("Unknown Linux Platform: $host_cpu")
+;;
+esac
+
+# ----------- make dep run? ------------------
+
+if test $host_cpu != "lib" ; then 
+  AC_MSG_CHECKING(if make dep has been run in kernel source (host $host_cpu) )
+  if test -f $LINUX/include/linux/config.h ; then
+  AC_MSG_RESULT(yes)
+ else
+  AC_MSG_ERROR(** cannot find $LINUX/include/linux/config.h. Run make dep in $LINUX.)
+  fi
+fi
+
+# ------------ include paths ------------------
+
+if test $host_cpu != "lib" ; then 
+    KINCFLAGS="-I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include -I$LINUX/include"
+else
+    KINCFLAGS='-I$(top_srcdir)/include -I$(top_srcdir)/portals/include'
+fi
+CPPFLAGS="$KINCFLAGS $ARCHCPPFLAGS"
+
+if test $host_cpu != "lib" ; then 
+# ------------ autoconf.h ------------------
+  AC_MSG_CHECKING(if autoconf.h is in kernel source)
+  if test -f $LINUX/include/linux/autoconf.h ; then
+      AC_MSG_RESULT(yes)
+  else
+      AC_MSG_ERROR(** cannot find $LINUX/include/linux/autoconf.h. Run make config in $LINUX.)
+  fi
+
+# ------------ RELEASE and moduledir ------------------
+  AC_MSG_CHECKING(for Linux release)
+  
+  dnl We need to rid ourselves of the nasty [ ] quotes.
+  changequote(, )
+  dnl Get release from version.h
+  RELEASE="`sed -ne 's/.*UTS_RELEASE[ \"]*\([0-9.a-zA-Z_-]*\).*/\1/p' $LINUX/include/linux/version.h`"
+  changequote([, ])
+  
+  moduledir='$(libdir)/modules/'$RELEASE/kernel
+  AC_SUBST(moduledir)
+  
+  modulefsdir='$(moduledir)/fs/$(PACKAGE)'
+  AC_SUBST(modulefsdir)
+  
+  AC_MSG_RESULT($RELEASE)
+  AC_SUBST(RELEASE)
+
+# ---------- modversions? --------------------
+  AC_MSG_CHECKING(for MODVERSIONS)
+  if egrep -e 'MODVERSIONS.*1' $LINUX/include/linux/autoconf.h >/dev/null 2>&1;
+  then
+        MFLAGS="-DMODULE -DMODVERSIONS -include $LINUX/include/linux/modversions.h -DEXPORT_SYMTAB"
+        AC_MSG_RESULT(yes)
+  else
+        MFLAGS=
+        AC_MSG_RESULT(no)
+  fi
+fi
+
+# ---------- Portals flags --------------------
+
+#AC_PREFIX_DEFAULT([])
+#if test "x$prefix" = xNONE || test "x$prefix" = x; then
+#  usrprefix=/usr
+#else
+#  usrprefix='${prefix}'
+#fi
+#AC_SUBST(usrprefix)
+
+AC_MSG_CHECKING(if kernel has CPU affinity support)
+if test "$target_cpu" != ia64 ; then
+  enable_affinity_temp="-DCPU_AFFINITY=1"
+  AC_MSG_RESULT(yes)
+else
+  enable_affinity_temp=""
+  AC_MSG_RESULT(no)
+fi
+
+AC_MSG_CHECKING(if kernel has zero-copy TCP support)
+ZCCD="`grep -c zccd $LINUX/include/linux/skbuff.h`"
+if test "$ZCCD" != 0 ; then
+  enable_zerocopy_temp="-DSOCKNAL_ZC=1"
+  AC_MSG_RESULT(yes)
+else
+  enable_zerocopy_temp=""
+  AC_MSG_RESULT(no)
+fi
+
+AC_ARG_ENABLE(zerocopy, [  --enable-zerocopy enable socknal zerocopy],enable_zerocopy=$enable_zerocopy_temp, enable_zerocopy="")
+
+AC_ARG_ENABLE(affinity, [  --enable-affinity enable process/irq affinity],enable_affinity="-DCPU_AFFINITY=1", enable_affinity=$enable_affinity_temp)
+#####################################
+
+AC_MSG_CHECKING(if quadrics kernel headers are present)
+if test -d $LINUX/drivers/net/qsnet ; then
+  AC_MSG_RESULT(yes)
+  QSWNAL="qswnal"
+  with_quadrics="-I$LINUX/drivers/net/qsnet/include"
+  :
+elif test -d $LINUX/drivers/qsnet1 ; then
+  AC_MSG_RESULT(yes)
+  QSWNAL="qswnal"
+  with_quadrics="-I$LINUX/drivers/qsnet1/include -DPROPRIETARY_ELAN"
+  :
+elif test -d $LINUX/drivers/quadrics ; then
+  AC_MSG_RESULT(yes)
+  QSWNAL="qswnal"
+  with_quadrics="-I$LINUX/drivers/quadrics/include -DPROPRIETARY_ELAN"
+  :
+#elif test -d /usr/include/elan3 ; then
+#  AC_MSG_RESULT(yes)
+#  QSWNAL="qswnal"
+#  with_quadrics=""
+#  :
+else
+  AC_MSG_RESULT(no)
+  QSWNAL=""
+  with_quadrics=""
+  :
+fi
+AC_SUBST(with_quadrics)
+AC_SUBST(QSWNAL)
+
+# R. Read 5/02
+GMNAL=""
+echo "checking with-gm=" ${with_gm}
+if test "${with_gm+set}" = set; then
+  if test "${with_gm}" = yes; then
+    with_gm="-I/usr/local/gm/include"
+  else
+    with_gm=-I"$with_gm/include"
+  fi
+  GMNAL="gmnal"
+else
+# default case - no GM
+  with_gm=""
+fi
+AC_SUBST(with_gm)
+AC_SUBST(GMNAL)
+
+
+def_scamac=/opt/scali/include
+AC_ARG_WITH(scamac, [  --with-scamac=[yes/no/path] Path to ScaMAC includes (default=/opt/scali/include)], with_scamac=$withval, with_scamac=$def_scamac)
+AC_MSG_CHECKING(if ScaMAC headers are present)
+if test "$with_scamac" = yes; then
+  with_scamac=$def_scamac
+fi
+if test "$with_scamac" != no -a -f ${with_scamac}/scamac.h; then
+  AC_MSG_RESULT(yes)
+  SCIMACNAL="scimacnal"
+  with_scamac="-I${with_scamac} -I${with_scamac}/icm"
+else
+  AC_MSG_RESULT(no)
+  SCIMACNAL=""
+  with_scamac=""
+fi
+
+AC_SUBST(with_scamac)
+AC_SUBST(SCIMACNAL)
+
+CFLAGS="$KCFLAGS"
+CPPFLAGS="$KINCFLAGS $KCPPFLAGS $MFLAGS $enable_zerocopy $enable_affinity $with_quadrics $with_gm $with_scamac "
+
+AC_SUBST(MOD_LINK)
+AC_SUBST(LINUX25)
+AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib)
+
+# ---------- Red Hat 2.4.20 backports some 2.5 bits --------
+# This needs to run after we've defined the KCPPFLAGS
+
+AC_MSG_CHECKING(for kernel version)
+AC_TRY_LINK([#define __KERNEL__
+             #include <linux/sched.h>],
+            [struct task_struct p;
+             p.sighand = NULL;],
+            [RH_2_4_20=1],
+            [RH_2_4_20=0])
+
+if test $RH_2_4_20 = 1; then
+       AC_MSG_RESULT(redhat-2.4.20)
+       CPPFLAGS="$CPPFLAGS -DCONFIG_RH_2_4_20"
+else
+       AC_MSG_RESULT($RELEASE)
+fi 
diff --git a/lnet/autogen.sh b/lnet/autogen.sh
new file mode 100644 (file)
index 0000000..9deed73
--- /dev/null
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+aclocal &&
+automake --add-missing &&
+${AUTOCONF:-autoconf}
diff --git a/lnet/build.m4 b/lnet/build.m4
new file mode 100644 (file)
index 0000000..025f243
--- /dev/null
@@ -0,0 +1,95 @@
+# ----------  other tests and settings ---------
+
+
+# ---------  unsigned long long sane? -------
+
+AC_CHECK_SIZEOF(unsigned long long, 0)
+echo "---> size SIZEOF $SIZEOF_unsigned_long_long"
+echo "---> size SIZEOF $ac_cv_sizeof_unsigned_long_long"
+if test $ac_cv_sizeof_unsigned_long_long != 8 ; then
+        AC_MSG_ERROR([** we assume that sizeof(long long) == 8.  Tell phil@clusterfs.com])
+fi
+
+# directories for binaries
+ac_default_prefix=
+bindir='${exec_prefix}/usr/bin'
+sbindir='${exec_prefix}/usr/sbin'
+includedir='${prefix}/usr/include'
+
+# Directories for documentation and demos.
+docdir='${prefix}/usr/share/doc/$(PACKAGE)'
+AC_SUBST(docdir)
+demodir='$(docdir)/demo'
+AC_SUBST(demodir)
+pkgexampledir='${prefix}/usr/lib/$(PACKAGE)/examples'
+AC_SUBST(pkgexampledir)
+pymoddir='${prefix}/usr/lib/${PACKAGE}/python/Lustre'
+AC_SUBST(pymoddir)
+modulenetdir='$(moduledir)/net/$(PACKAGE)'
+AC_SUBST(modulenetdir)
+
+
+# ----------  BAD gcc? ------------
+AC_PROG_RANLIB
+AC_PROG_CC
+AC_MSG_CHECKING(for buggy compiler)
+CC_VERSION=`$CC -v 2>&1 | grep "^gcc version"`
+bad_cc() {
+       echo
+       echo "   '$CC_VERSION'"
+       echo "  has been known to generate bad code, "
+       echo "  please get an updated compiler."
+       AC_MSG_ERROR(sorry)
+}
+TMP_VERSION=`echo $CC_VERSION | cut -c 1-16`
+if test "$TMP_VERSION" = "gcc version 2.95"; then
+        bad_cc
+fi
+case "$CC_VERSION" in 
+       # ost_pack_niobuf putting 64bit NTOH temporaries on the stack
+       # without "sub    $0xc,%esp" to protect the stack from being
+       # stomped on by interrupts (bug 606)
+       "gcc version 2.96 20000731 (Red Hat Linux 7.1 2.96-98)")
+               bad_cc
+               ;;
+       # mandrake's similar sub 0xc compiler bug
+       # http://marc.theaimsgroup.com/?l=linux-kernel&m=104748366226348&w=2
+       "gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)")
+               bad_cc
+               ;;
+       *)
+               AC_MSG_RESULT(no known problems)
+               ;;
+esac
+# end ------  BAD gcc? ------------
+
+# --------  Check for required packages  --------------
+
+# this doesn't seem to work on older autoconf
+# AC_CHECK_LIB(readline, readline,,)
+AC_ARG_ENABLE(readline,        [  --enable-readline  use readline library],,
+                       enable_readline="yes")
+if test "$enable_readline" = "yes" ; then
+   LIBREADLINE="-lreadline -lncurses"
+   HAVE_LIBREADLINE="-DHAVE_LIBREADLINE=1"
+else 
+   LIBREADLINE=""
+   HAVE_LIBREADLINE=""
+fi
+AC_SUBST(LIBREADLINE)
+AC_SUBST(HAVE_LIBREADLINE)
+
+AC_ARG_ENABLE(efence,  [  --enable-efence  use efence library],,
+                       enable_efence="no")
+if test "$enable_efence" = "yes" ; then
+   LIBEFENCE="-lefence"
+   HAVE_LIBEFENCE="-DHAVE_LIBEFENCE=1"
+else 
+   LIBEFENCE=""
+   HAVE_LIBEFENCE=""
+fi
+AC_SUBST(LIBEFENCE)
+AC_SUBST(HAVE_LIBEFENCE)
+
diff --git a/lnet/configure.in b/lnet/configure.in
new file mode 100644 (file)
index 0000000..31d3492
--- /dev/null
@@ -0,0 +1,34 @@
+# This version is here to make autoconf happy; the name is a file which is
+# "unique" to this directory so that configure knows where it should run.
+AC_INIT(knals/Makefile.am, 3.0)
+AC_CANONICAL_SYSTEM
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+# Automake variables.  Steal the version number from packaging/intersync.spec
+AM_INIT_AUTOMAKE(portals, builtin([esyscmd], [sed -ne '/.*define IVERSION /{ s/.*IVERSION //; p; }' libcfs/module.c]))
+# AM_MAINTAINER_MODE
+
+sinclude(build.m4)
+sinclude(archdep.m4)
+
+if test x$enable_inkernel = xyes ; then
+cp Kernelenv.mk Kernelenv.in
+cp Makefile.mk Makefile.in
+cp libcfs/Makefile.mk libcfs/Makefile.in
+cp portals/Makefile.mk portals/Makefile.in
+cp knals/Makefile.mk knals/Makefile.in
+cp knals/socknal/Makefile.mk knals/socknal/Makefile.in
+cp router/Makefile.mk router/Makefile.in
+fi
+
+AM_CONFIG_HEADER(include/config.h)
+
+AC_OUTPUT([Makefile Kernelenv libcfs/Makefile portals/Makefile \
+          unals/Makefile knals/Makefile router/Makefile \
+         knals/socknal/Makefile knals/gmnal/Makefile knals/qswnal/Makefile \
+         knals/scimacnal/Makefile knals/toenal/Makefile \
+          utils/Makefile tests/Makefile doc/Makefile ])
+
diff --git a/lnet/doc/.cvsignore b/lnet/doc/.cvsignore
new file mode 100644 (file)
index 0000000..827dca4
--- /dev/null
@@ -0,0 +1,4 @@
+Makefile
+Makefile.in
+*.eps
+*.pdf
diff --git a/lnet/doc/Data-structures b/lnet/doc/Data-structures
new file mode 100644 (file)
index 0000000..b5532b1
--- /dev/null
@@ -0,0 +1,65 @@
+In this document I will try to draw the data structures and how they
+interrelate in the Portals 3 reference implementation.  It is probably
+best shown with a drawing, so there may be an additional xfig or
+Postscript figure.
+
+
+MEMORY POOLS:
+------------
+
+First, a digression on memory allocation in the library.  As mentioned
+in the NAL Writer's Guide, the library does not link against any
+standard C libraries and as such is unable to dynamically allocate
+memory on its own.  It requires that the NAL implement a method
+for allocation that is appropriate for the protection domain in
+which the library lives.  This is only called when a network
+interface is initialized to allocate the Portals object pools.
+
+These pools are preallocated blocks of objects that the library
+can rapidly make active and manage with a minimum of overhead.
+This also cuts down on the overhead of setting up structures,
+since the NAL->malloc() callback does not need to be called
+for each object.
+
+The objects are maintained on a per-object type singly linked free
+list and contain a pointer to the next free object.  This pointer
+is NULL if the object is not on the free list and is non-zero
+if it is on the list.  The special sentinel value of 0xDEADBEEF
+is used to mark the end of the free list since NULL could
+indicate that the last object in the list is not free.
+
+When one of the lib_*_alloc() functions is called, the library
+returns the head of the free list and advances the head pointer
+to the next item on the list.  The special case of 0xDEADBEEF is
+checked and a NULL pointer is returned if there are no more
+objects of this type available.   The lib_*_free() functions
+are even simpler -- check to ensure that the object is not already
+free, set its next pointer to the current head and then set
+the head to be this newly freed object.
+
+Since C does not have templates, I did the next best thing and wrote
+the memory pool allocation code as a macro that expands based on the
+type of the argument.  The mk_alloc(T) macro expands to
+write the _lib_T_alloc() and lib_T_free() functions.
+It requires that the object have a pointer of the type T named
+"next_free".  There are also functions that map _lib_T_alloc()
+to lib_T_alloc() so that the library can add some extra
+functionality to the T constructor.
+
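+To make the scheme concrete, here is a minimal, self-contained sketch
+of such a free-list pool.  The names (pool_obj, pool_alloc, and so on)
+are invented for the example and are not the library's; the real
+functions are generated by the mk_alloc() macro described above.
+
+        #include <stddef.h>
+        #include <stdio.h>
+
+        struct pool_obj {
+                struct pool_obj *next_free;  /* NULL when allocated */
+                int              payload;
+        };
+
+        /* sentinel marking the end of the free list */
+        #define FREE_LIST_END ((struct pool_obj *) 0xDEADBEEF)
+
+        static struct pool_obj *free_head = FREE_LIST_END;
+
+        /* Carve a preallocated block into a free list (done at init). */
+        static void pool_init(struct pool_obj *block, int nobj)
+        {
+                int i;
+                for (i = 0; i < nobj; i++)
+                        block[i].next_free = (i + 1 < nobj) ?
+                                &block[i + 1] : FREE_LIST_END;
+                free_head = &block[0];
+        }
+
+        static struct pool_obj *pool_alloc(void)
+        {
+                struct pool_obj *obj = free_head;
+
+                if (obj == FREE_LIST_END)
+                        return NULL;            /* pool exhausted */
+                free_head = obj->next_free;
+                obj->next_free = NULL;          /* mark as allocated */
+                return obj;
+        }
+
+        static void pool_free(struct pool_obj *obj)
+        {
+                if (obj->next_free != NULL)     /* already free */
+                        return;
+                obj->next_free = free_head;
+                free_head = obj;
+        }
+
+        int main(void)
+        {
+                struct pool_obj block[4], *a, *b;
+
+                pool_init(block, 4);
+                a = pool_alloc();
+                b = pool_alloc();
+                pool_free(a);
+                pool_free(b);
+                printf("allocated and freed two objects\n");
+                return 0;
+        }
+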
+
+
+LINKED LISTS:
+------------
+
+Many of the active Portals objects are stored in doubly linked lists
+when they are active.  These are always implemented with the pointer
+to the next object and a pointer to the next pointer of the
+previous object.  This avoids the "dummy head" object or
+special cases for inserting at the beginning or end of the list.
+The pointer manipulations are a little hairy at times, but
+I hope that they are understandable.
+
+The actual linked list code is implemented as macros in <lib-p30.h>,
+although the object has to know about 
+
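+As a concrete (if simplified) sketch of the technique, with names
+invented for the example rather than taken from <lib-p30.h>: each
+object carries a next pointer plus the address of the pointer that
+points at it, so insertion and removal need no dummy head and no
+special cases.
+
+        #include <stddef.h>
+        #include <stdio.h>
+
+        struct node {
+                struct node  *next;
+                struct node **prevp;    /* &(pointer that points at us) */
+                int           value;
+        };
+
+        /* Insert n at the head of the list whose head pointer is *headp. */
+        static void list_insert(struct node **headp, struct node *n)
+        {
+                n->next  = *headp;
+                n->prevp = headp;
+                if (*headp != NULL)
+                        (*headp)->prevp = &n->next;
+                *headp = n;
+        }
+
+        /* Remove n from whatever list it is currently on. */
+        static void list_remove(struct node *n)
+        {
+                *n->prevp = n->next;
+                if (n->next != NULL)
+                        n->next->prevp = n->prevp;
+        }
+
+        int main(void)
+        {
+                struct node *head = NULL, *p;
+                struct node a = { NULL, NULL, 1 };
+                struct node b = { NULL, NULL, 2 };
+
+                list_insert(&head, &a);
+                list_insert(&head, &b);
+                list_remove(&a);
+                for (p = head; p != NULL; p = p->next)
+                        printf("%d\n", p->value);       /* prints 2 */
+                return 0;
+        }
+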
+
diff --git a/lnet/doc/Makefile.am b/lnet/doc/Makefile.am
new file mode 100644 (file)
index 0000000..7c65e6c
--- /dev/null
@@ -0,0 +1,46 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+LYX2PDF = lyx --export pdf
+LYX2TXT = lyx --export text
+LYX2HTML = lyx --export html
+SUFFIXES = .lin .lyx .pdf .sgml .html .txt .fig .eps
+
+DOCS = portals3.pdf 
+IMAGES = file.eps flow_new.eps get.eps mpi.eps portals.eps put.eps
+LYXFILES= portals3.lyx
+
+MAINTAINERCLEANFILES =  $(IMAGES) $(DOCS) $(GENERATED)
+GENERATED = 
+EXTRA_DIST = $(DOCS) $(IMAGES) $(LYXFILES) 
+
+all: $(DOCS)
+
+# update date and version in document
+date := $(shell date +%x)
+tag := $(shell echo '$$Name:  $$' | sed -e 's/^\$$Na''me: *\$$$$/HEAD/; s/^\$$Na''me: \(.*\) \$$$$/\1/')
+addversion = sed -e 's|@T''AG@|$(tag)|g; s|@VER''SION@|$(VERSION)|g; s|@DA''TE@|$(date)|g'
+
+# Regenerate when the $(VERSION) or $Name:  $ changes.
+.INTERMEDIATE: $(GENERATED)
+$(GENERATED) : %.lyx: %.lin Makefile
+       $(addversion) $< > $@
+
+.lyx.pdf:
+       @$(LYX2PDF) $< || printf "\n*** Warning: not creating PDF docs; install lyx to rectify this\n"
+
+.lyx.txt:
+       @$(LYX2TXT) $< || printf "\n*** Warning: not creating text docs; install lyx to rectify this\n"
+.lyx.html:
+       @$(LYX2HTML) $< || printf "\n*** Warning: not creating HTML docs; install lyx to rectify this\n"
+.fig.eps:
+       -fig2dev -L eps $< > $@
+
+portals3.pdf portals3.txt portals3.html: $(IMAGES) portals3.lyx
+
+syncweb: portals3.pdf
+#      cp lustre.pdf /usr/src/www/content/lustre/docs/lustre.pdf
+#      ( cd /usr/src/www ; make lustre ; make synclustre )
+
diff --git a/lnet/doc/Message-life-cycle b/lnet/doc/Message-life-cycle
new file mode 100644 (file)
index 0000000..e8cc7e2
--- /dev/null
@@ -0,0 +1,118 @@
+This documents the life cycle of a message as it arrives and is handled by
+a basic async, packetized NAL.  There are four types of messages that have
+slightly different life cycles, so they are addressed independently.
+
+
+Put request
+-----------
+
+1.  The NAL notices that there is an incoming message header on the network
+and reads a ptl_hdr_t in from the wire.
+
+2.  It may store additional NAL specific data that provides context
+for this event in a void* that it will interpret in some fashion
+later.
+
+3.  The NAL calls lib_parse() with a pointer to the header and its
+private data structure.
+
+4.  The library decodes the header and may build a message state
+object that describes the event to be written and the ACK to be
+sent, if any.  It then calls nal->recv() with the private data
+that the NAL passed in, a pointer to the message state object
+and a translated user address.
+
+       The NAL will have been given a chance to pretranslate
+       all user addresses when the buffers are created.  This
+       process is described in the NAL-HOWTO.
+
+5.  The NAL should restore whatever context it required from the
+private data pointer, begin receiving the bytes and possibly store
+some extra state of its own.  It should return at this point.  (A
+toy model of this flow is sketched below.)
+
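+The toy program below models this header-then-body flow.  The hdr_t
+type and the parse() and finalize() functions are simple stand-ins for
+ptl_hdr_t, lib_parse()/nal->recv() and lib_finalize(); it shows only
+the shape of the state a NAL keeps between packets, not the real API.
+
+        #include <stdio.h>
+        #include <string.h>
+
+        typedef struct { size_t body_len; } hdr_t;
+
+        struct recv_state {
+                char   *dst;    /* translated user address from parse() */
+                size_t  want;   /* bytes still expected for the message */
+        };
+
+        /* Stand-in for lib_parse()/nal->recv(): pick the destination. */
+        static void parse(const hdr_t *hdr, struct recv_state *st,
+                          char *user_buf)
+        {
+                st->dst  = user_buf;
+                st->want = hdr->body_len;
+        }
+
+        /* Stand-in for lib_finalize(): the message is complete. */
+        static void finalize(void)
+        {
+                printf("message complete\n");
+        }
+
+        /* Called for each chunk of body bytes arriving off the wire. */
+        static void on_packet(struct recv_state *st, const char *data,
+                              size_t len)
+        {
+                memcpy(st->dst, data, len);
+                st->dst  += len;
+                st->want -= len;
+                if (st->want == 0)
+                        finalize();
+        }
+
+        int main(void)
+        {
+                char user_buf[16] = "";
+                hdr_t hdr = { 5 };
+                struct recv_state st;
+
+                parse(&hdr, &st, user_buf);     /* steps 3-4 above */
+                on_packet(&st, "hel", 3);       /* later packet arrivals */
+                on_packet(&st, "lo", 2);
+                printf("%s\n", user_buf);       /* prints "hello" */
+                return 0;
+        }
+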
+
+
+Get request
+-----------
+
+1.  As with a Put, the NAL notices the incoming message header and
+passes it to lib_parse().
+
+2.  The library decodes the header and calls nal->recv() with a
+zero byte length, offset and destination to instruct it to clean
+up the wire after reading the header.  The private data will
+be passed in as well, allowing the NAL to retrieve any state
+or context that it requires.
+
+3.  The library may build a message state object to possibly
+write an event log or invalidate a memory region.
+
+4.  The library will build a ptl_msg_t header that specifies the
+Portals protocol information for delivery at the remote end.
+
+5.  The library calls nal->send() with the pre-built header,
+the optional message state object, the four part address
+component, a translated user pointer + offset, and some
+other things.
+
+6.  The NAL is to put the header on the wire or copy it at
+this point (since it is on the stack).  It should store some
+amount of state about its current position in the message and
+the destination address.
+
+7.  And then return to the library.
+
+
+Reply request
+-------------
+
+1.  Starting at "The library decodes the header..."
+
+2.  The library decodes the header and calls nal->recv()
+to bring in the rest of the message.  Flow continues in
+exactly the same fashion as with all other receives.
+
+
+Ack request
+-----------
+
+1.  The library decodes the header, builds the appropriate data
+structures for the event in a message state object and calls nal->recv()
+with a zero byte length, etc.
+
+
+Packet arrival
+--------------
+
+1.  The NAL should notice the arrival of a packet, retrieve whatever
+state it needs from the message ID or other NAL specific header data
+and place the data bytes directly into the user address that was
+given to nal->recv().
+
+       How this happens is outside the scope of the Portals library
+       and solely determined by the NAL...
+
+2.  If this is the last packet in a message, the NAL should retrieve
+the lib_msg_t *cookie that it was given in the call to nal->recv()
+and pass it to lib_finalize().  lib_finalize() may call nal->send()
+to send an ACK, nal->write() to record an entry in the event log,
+nal->invalidate() to unregister a region of memory or do nothing at all.
+
+3.  It should then clean up any remaining NAL specific state about
+the message and go back into the main loop.
+
+
+Outgoing packets
+----------------
+
+1.  When the NAL has pending output, it should put the packets on
+the wire, wrapped in whatever implementation-specific wrappers it requires.
+
+2.  Once it has output all the packets of a message it should
+call lib_finalize() with the message state object that was
+handed to nal->send().  This allows the library to clean
+up its state regarding the message and write any pending event
+entries.
+
+
+
diff --git a/lnet/doc/NAL-HOWTO b/lnet/doc/NAL-HOWTO
new file mode 100644 (file)
index 0000000..ea38aed
--- /dev/null
@@ -0,0 +1,293 @@
+This document is a first attempt at describing how to write a NAL
+for the Portals 3 library.  It also defines the library architecture
+and the abstraction of protection domains.
+
+
+First, an overview of the architecture:
+
+    Application
+
+----|----+--------
+         |
+   API  === NAL        (User space)
+         |   
+---------+---|-----
+         |    
+   LIB  === NAL        (Library space)
+         |
+---------+---|-----
+          
+    Physical wire      (NIC space)
+          
+
+Application
+    API
+API-side NAL
+------------
+LIB-side NAL
+    LIB
+LIB-side NAL
+   wire
+
+Communication is through the indicated paths via well-defined
+interfaces.  The API and LIB portions are written to be portable
+across platforms and do not depend on the network interface.
+
+Communication between the application and the API code is
+defined in the Portals 3 API specification.  This is the
+user-visible portion of the interface and should be the most
+stable.
+
+
+
+API-side NAL:
+------------
+
+The user space NAL needs to implement only a few functions
+that are stored in a nal_t data structure and called by the
+API-side library:
+
+       int forward( nal_t *nal,
+               int     index,
+               void    *args,
+               size_t  arg_len,
+               void    *ret,
+               size_t  ret_len
+       );
+
+Most of the data structures in the portals library are held in
+the LIB section of the code, so it is necessary to forward API
+calls across the protection domain to the library.  This is
+handled by the NAL's forward method.  Once the argument and return
+blocks are on the remote side the NAL should call lib_dispatch()
+to invoke the appropriate API function.
+
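+As a rough picture of what a forward method does, imagine API calls
+reduced to an index plus flat argument and return blocks.  Everything
+in the sketch below is invented for illustration; it is not the real
+nal_t or lib_dispatch() interface.
+
+        #include <stdio.h>
+        #include <string.h>
+
+        struct add_args { int a, b; };
+        struct add_ret  { int sum; };
+
+        /* "LIB side": one entry in the dispatch table. */
+        static void do_add(void *args, void *ret)
+        {
+                const struct add_args *in = args;
+                struct add_ret *out = ret;
+
+                out->sum = in->a + in->b;
+        }
+
+        typedef void (*dispatch_fn)(void *args, void *ret);
+        static dispatch_fn dispatch_table[] = { do_add };
+
+        /* What lib_dispatch() conceptually does with (index, args, ret). */
+        static void toy_dispatch(int index, void *args, void *ret)
+        {
+                dispatch_table[index](args, ret);
+        }
+
+        /* "API-side NAL": copy the blocks across the boundary, dispatch,
+         * and copy the result back.  A kernel NAL would marshal these
+         * through an ioctl or a message instead of a direct memcpy(). */
+        static int toy_forward(int index, void *args, size_t arg_len,
+                               void *ret, size_t ret_len)
+        {
+                union { char bytes[64]; long long align; } acopy, rcopy;
+
+                memcpy(acopy.bytes, args, arg_len);
+                toy_dispatch(index, acopy.bytes, rcopy.bytes);
+                memcpy(ret, rcopy.bytes, ret_len);
+                return 0;
+        }
+
+        int main(void)
+        {
+                struct add_args args = { 2, 3 };
+                struct add_ret ret;
+
+                toy_forward(0, &args, sizeof(args), &ret, sizeof(ret));
+                printf("2 + 3 = %d\n", ret.sum);
+                return 0;
+        }
+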
+       int validate( nal_t *nal,
+               void    *base,
+               size_t  extent,
+               void    **trans_base,
+               void    **trans_data
+       );
+
+The validate method provides a means for the NAL to prevalidate
+and possibly pretranslate user addresses into a form suitable
+for fast use by the network card or kernel module.  The trans_base
+pointer will be used by the library everytime it needs to
+refer to the block of memory.  The trans_data result is a
+cookie that will be handed to the NAL along with the trans_base.
+
+The library never performs calculations on the trans_base value;
+it only computes offsets that are then handed to the NAL.
+
+
+       int shutdown( nal_t *nal, int interface );
+
+Brings down the network interface.  The remote NAL side should
+call lib_fini() to bring down the library side of the network.
+
+       void yield( nal_t *nal );
+
+This allows the user application to gracefully give up the processor
+while busy waiting.  Performance-critical applications may not
+want to take the time to call this function, so it should be an
+option to the PtlEQWait call.  Right now it is not implemented as such.
+
+Lastly, the NAL must implement a function named PTL_IFACE_*, where
+* is the name of the NAL such as PTL_IFACE_IP or PTL_IFACE_MYR.
+This initialization function is to set up communication with the
+library-side NAL, which should call lib_init() to bring up the
+network interface.
+
+
+
+LIB-side NAL:
+------------
+
+On the library side, the NAL has much more responsibility.  It
+is responsible for calling lib_dispatch() on behalf of the user,
+and it is also responsible for bringing packets off the wire and
+pushing bits out.  As on the user side, the methods are stored
+in a nal_cb_t structure that is defined on a per network
+interface basis.
+
+The calls to lib_dispatch() need to be examined.  The prototype:
+
+       void    lib_dispatch(
+                       nal_cb_t                *nal,
+                       void                    *private,
+                       int                     index,
+                       void                    *arg_block,
+                       void                    *ret_block
+       );
+
+has two complications.  The private field is a NAL-specific
+value that will be passed to any callbacks produced as a result
+of this API call.  Kernel module implementations may use this
+for task structures, or perhaps network card data.  It is ignored
+by the library.
+
+Secondly, the arg_block and ret_block must be in the same protection
+domain as the library.  The NAL's two halves must communicate the
+sizes and perform the copies.  After the call, the buffer pointed
+to by ret_block will be filled in and should be copied back to
+the user space.  How this is to be done is NAL specific.
+
+       int lib_parse(
+                       nal_cb_t                *nal,
+                       ptl_hdr_t               *hdr,
+                       void                    *private
+       );
+
+This is the only other entry point into the library from the NAL.
+When the NAL detects an incoming message on the wire it should read
+sizeof(ptl_hdr_t) bytes and pass a pointer to the header to
+lib_parse().  It may set private to be anything that it needs to
+tie the incoming message to callbacks that are made as a result
+of this event.
+
+The method calls are:
+
+       int     (*send)(
+                       nal_cb_t                *nal,
+                       void                    *private,
+                       lib_msg_t               *cookie,
+                       ptl_hdr_t               *hdr,
+                       int                     nid,
+                       int                     pid,
+                       int                     gid,
+                       int                     rid,
+                       user_ptr                trans_base,
+                       user_ptr                trans_data,
+                       size_t                  offset,
+                       size_t                  len
+       );
+
+This is a tricky function -- it must support async output
+of messages as well as properly synchronized event log writing.
+The private field is the same that was passed into lib_dispatch()
+or lib_parse() and may be used to tie this call to the event
+that initiated the entry to the library.
+
+The cookie is a pointer to a library-private value that must
+be passed to lib_finalize() once the message has been completely
+sent.  It should not be examined by the NAL for any meaning.
+
+The four ID fields are passed in, although some implementations
+may not use all of them.
+
+The single base pointer has been replaced with the translated
+address that the API NAL generated in the api_nal->validate()
+call.  The trans_data is unchanged and the offset is in bytes.
+
+
+       int     (*recv)(
+                       nal_cb_t                *nal,
+                       void                    *private,
+                       lib_msg_t               *cookie,
+                       user_ptr                trans_base,
+                       user_ptr                trans_data,
+                       size_t                  offset,
+                       size_t                  mlen,
+                       size_t                  rlen
+       );
+
+This callback will only be called in response to lib_parse().
+The cookie, trans_base and trans_data are as discussed in send().
+The NAL should read mlen bytes from the wire, deposit them into
+trans_base + offset and then discard (rlen - mlen) bytes.
+Once the entire message has been received the NAL should call
+lib_finalize() with the lib_msg_t *cookie.
+
+The special arguments of base=NULL, data=NULL, offset=0, mlen=0, rlen=0
+are used to indicate that the NAL should clean up the wire.  This could
+be implemented as a blocking call, although having it return as quickly
+as possible is desirable.
+
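+A toy model of this mlen/rlen contract, with the wire simulated by a
+flat buffer and every name invented for the example:
+
+        #include <stdio.h>
+        #include <string.h>
+
+        /* Consume len bytes from the simulated wire; dst == NULL means
+         * the bytes are discarded rather than stored. */
+        static const char *wire_read(const char *wire, void *dst,
+                                     size_t len)
+        {
+                if (dst != NULL)
+                        memcpy(dst, wire, len);
+                return wire + len;
+        }
+
+        static void toy_recv(const char *wire, char *trans_base,
+                             size_t offset, size_t mlen, size_t rlen)
+        {
+                wire = wire_read(wire, trans_base + offset, mlen);
+                wire_read(wire, NULL, rlen - mlen);   /* drop the excess */
+                /* a real NAL would now call lib_finalize() with its cookie */
+        }
+
+        int main(void)
+        {
+                char buffer[16] = "";
+
+                toy_recv("hello, portals!", buffer, 0, 5, 15);
+                printf("%s\n", buffer);         /* prints "hello" */
+                return 0;
+        }
+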
+       int     (*write)(
+                       nal_cb_t                *nal,
+                       void                    *private,
+                       user_ptr                trans_addr,
+                       user_ptr                trans_data,
+                       size_t                  offset,
+
+                       void                    *src_addr,
+                       size_t                  len
+       );
+
+This is essentially a cross-protection domain memcpy().  The user address
+has been pretranslated by the api_nal->validate() call.
+
+       void    *(*malloc)(
+                       nal_cb_t                *nal,
+                       size_t                  len
+       );
+
+       void    (*free)(
+                       nal_cb_t                *nal,
+                       void                    *buf
+       );
+
+Since the NAL may be in a non-standard hosted environment it cannot
+call malloc().  This allows the library-side NAL to implement the
+system-specific malloc().  In the current reference implementation
+the library only calls nal->malloc() when the network interface is
+initialized and then calls nal->free() when it is brought down.  The
+library maintains its own pool of objects for allocation so only one
+call to malloc is made per object type.
+
+       void    (*invalidate)(
+                       nal_cb_t                *nal,
+                       user_ptr                trans_base,
+                       user_ptr                trans_data,
+                       size_t                  extent
+       );
+
+User addresses are validated/translated at the user-level API NAL
+method, which is likely to push them to this level.  Meanwhile,
+the library NAL will be notified when the library no longer
+needs the buffer.  Overlapped buffers are not detected by the
+library, so the NAL should ref count each page involved.
+
+Unfortunately we have a few bugs when the invalidate method is
+called.  It is still in progress...
+
+       void    (*printf)(
+                       nal_cb_t                *nal,
+                       const char              *fmt,
+                       ...
+       );
+
+As with malloc(), the library does not have any way to do printf
+or printk.  It is not necessary for the NAL to implement this
+call, although leaving it out will make debugging difficult.
+
+       void    (*cli)(
+                       nal_cb_t                *nal,
+                       unsigned long           *flags
+       );
+
+       void    (*sti)(
+                       nal_cb_t                *nal,
+                       unsigned long           *flags
+       );
+
+These are used by the library to mark critical sections.
+
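+One plausible user-space mapping, given here as an assumption rather
+than what the reference implementation does, is to back them with a
+single mutex and ignore the flags word (compile with -pthread):
+
+        #include <pthread.h>
+        #include <stdio.h>
+
+        static pthread_mutex_t lib_lock = PTHREAD_MUTEX_INITIALIZER;
+
+        static void my_cli(unsigned long *flags)
+        {
+                (void) flags;           /* unused in user space */
+                pthread_mutex_lock(&lib_lock);
+        }
+
+        static void my_sti(unsigned long *flags)
+        {
+                (void) flags;
+                pthread_mutex_unlock(&lib_lock);
+        }
+
+        int main(void)
+        {
+                unsigned long flags = 0;
+
+                my_cli(&flags);
+                printf("inside a library critical section\n");
+                my_sti(&flags);
+                return 0;
+        }
+
+A kernel-space NAL would more likely map these to an interrupt-disabling
+spinlock, which is where the flags argument comes from.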
+       int     (*gidrid2nidpid)(
+                       nal_cb_t                *nal,
+                       ptl_id_t                gid,
+                       ptl_id_t                rid,
+                       ptl_id_t                *nid,
+                       ptl_id_t                *pid
+       );
+
+
+       int     (*nidpid2gidrid)(
+                       nal_cb_t                *nal,
+                       ptl_id_t                nid,
+                       ptl_id_t                pid,
+                       ptl_id_t                *gid,
+                       ptl_id_t                *rid
+       );
+
+Rolf added these.  I haven't looked at how they have to work yet.
diff --git a/lnet/doc/file.fig b/lnet/doc/file.fig
new file mode 100644 (file)
index 0000000..914c294
--- /dev/null
@@ -0,0 +1,111 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 1200 750 1650 1050
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1650 1050 1650 750 1200 750 1200 1050 1650 1050
+4 1 0 100 0 0 10 0.0000 0 105 240 1425 952 FS0\001
+-6
+6 1200 2325 1650 2625
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1650 2625 1650 2325 1200 2325 1200 2625 1650 2625
+4 1 0 100 0 0 10 0.0000 0 105 240 1425 2527 FS3\001
+-6
+6 1200 1800 1650 2100
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1650 2100 1650 1800 1200 1800 1200 2100 1650 2100
+4 1 0 100 0 0 10 0.0000 0 105 240 1425 2002 FS2\001
+-6
+6 1200 1275 1650 1575
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1650 1575 1650 1275 1200 1275 1200 1575 1650 1575
+4 1 0 100 0 0 10 0.0000 0 105 240 1425 1477 FS1\001
+-6
+6 450 750 900 1200
+5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 750.000 450 1050 675 1125 900 1050
+1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 825 225 75 450 900 900 750
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        450 825 450 1050
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 1050 900 825
+-6
+6 450 2325 900 2775
+5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 2325.000 450 2625 675 2700 900 2625
+1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 2400 225 75 450 2475 900 2325
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        450 2400 450 2625
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 2625 900 2400
+-6
+6 450 1800 900 2250
+5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1800.000 450 2100 675 2175 900 2100
+1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1875 225 75 450 1950 900 1800
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        450 1875 450 2100
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 2100 900 1875
+-6
+6 450 1275 900 1725
+5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1275.000 450 1575 675 1650 900 1575
+1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1350 225 75 450 1425 900 1275
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        450 1350 450 1575
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 1575 900 1350
+-6
+6 2250 750 3450 2625
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 1200 3150 1200
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 1500 3150 1500
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 1800 3150 1800
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 2100 3150 2100
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2550 975 3150 975 3150 2625 2550 2625 2550 975
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 2400 3150 2400
+4 1 0 100 0 0 10 0.0000 0 135 1185 2850 900 Application Buffer\001
+-6
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 2400 2550 1350
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 1875 2550 1050
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 1425 2550 1950
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 900 2550 1650
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 900 1200 900
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 1425 1200 1425
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 1950 1200 1950
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 2475 1200 2475
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 2025 2550 2250
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 2550 2550 2475
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1875 2850 1875 600 225 600 225 2850 1875 2850
+4 1 0 100 0 0 10 0.0000 0 105 1215 1050 525 Parallel File Server\001
diff --git a/lnet/doc/flow_new.fig b/lnet/doc/flow_new.fig
new file mode 100644 (file)
index 0000000..d828dea
--- /dev/null
@@ -0,0 +1,213 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 525 2175 1575 2925
+6 675 2287 1425 2812
+4 1 0 50 0 0 10 0.0000 4 105 255 1050 2437 MD\001
+4 1 0 50 0 0 10 0.0000 4 105 645 1050 2587 Exists and\001
+4 1 0 50 0 0 10 0.0000 4 135 555 1050 2737 Accepts?\001
+-6
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        1575 2550 1050 2175 525 2550 1050 2925 1575 2550
+-6
+6 3450 1275 4350 1725
+6 3600 1312 4200 1687
+4 1 0 100 0 0 10 0.0000 0 135 525 3900 1612 Message\001
+4 1 0 100 0 0 10 0.0000 0 105 465 3900 1462 Discard\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        3450 1275 4350 1275 4350 1725 3450 1725 3450 1275
+-6
+6 4650 1275 5550 1725
+6 4725 1312 5475 1687
+4 1 0 100 0 0 10 0.0000 0 135 735 5100 1612 Drop Count\001
+4 1 0 100 0 0 10 0.0000 0 105 630 5100 1462 Increment\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        4650 1275 5550 1275 5550 1725 4650 1725 4650 1275
+-6
+6 1350 525 2250 975
+6 1350 562 2250 937
+4 1 0 100 0 0 10 0.0000 0 135 795 1800 862 Match Entry\001
+4 1 0 100 0 0 10 0.0000 0 105 585 1800 712 Get Next\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1350 525 2250 525 2250 975 1350 975 1350 525
+-6
+6 525 1125 1575 1875
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        1575 1500 1050 1125 525 1500 1050 1875 1575 1500
+4 1 0 100 0 0 10 0.0000 0 105 465 1049 1552 Match?\001
+-6
+6 2340 1237 2940 1687
+6 2340 1237 2940 1687
+4 1 0 100 0 0 10 0.0000 0 105 345 2640 1387 More\001
+4 1 0 100 0 0 10 0.0000 0 105 405 2640 1537 Match\001
+4 1 0 100 0 0 10 0.0000 0 105 510 2640 1687 Entries?\001
+-6
+-6
+6 525 3225 1575 3975
+6 675 3375 1425 3750
+4 1 0 50 0 0 10 0.0000 4 105 255 1050 3525 MD\001
+4 1 0 50 0 0 10 0.0000 4 105 615 1050 3720 has room?\001
+-6
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
+        525 3600 1050 3225 1575 3600 1050 3975 525 3600
+-6
+6 3300 3375 4350 3825
+6 3300 3412 4350 3787
+4 1 0 50 0 0 10 0.0000 4 105 735 3825 3562 Unlink MD\001
+4 1 0 50 0 0 10 0.0000 4 135 945 3825 3712 & Match Entry\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        3300 3375 4350 3375 4350 3825 3300 3825 3300 3375
+-6
+6 1950 3225 3000 3975
+6 2250 3450 2700 3750
+4 1 0 50 0 0 10 0.0000 4 105 450 2475 3600 Unlink\001
+4 1 0 50 0 0 10 0.0000 4 105 315 2475 3750 full?\001
+-6
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        3000 3600 2475 3225 1950 3600 2475 3975 3000 3600
+-6
+6 3150 4500 4200 4950
+6 3150 4537 4200 4912
+4 1 0 50 0 0 10 0.0000 4 105 735 3675 4687 Unlink MD\001
+4 1 0 50 0 0 10 0.0000 4 135 945 3675 4837 & Match Entry\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        3150 4500 4200 4500 4200 4950 3150 4950 3150 4500
+-6
+6 600 4500 1500 4950
+6 675 4537 1425 4912
+4 1 0 50 0 0 10 0.0000 4 135 615 1050 4837 Operation\001
+4 1 0 50 0 0 10 0.0000 4 105 525 1050 4687 Perform\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        600 4500 1500 4500 1500 4950 600 4950 600 4500
+-6
+6 4650 4350 5700 5100
+6 4950 4537 5400 4912
+6 4950 4537 5400 4912
+4 1 0 50 0 0 10 0.0000 4 135 435 5175 4837 Queue?\001
+4 1 0 50 0 0 10 0.0000 4 105 360 5175 4687 Event\001
+-6
+-6
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        5700 4725 5175 4350 4650 4725 5175 5100 5700 4725
+-6
+6 6000 4500 6900 4950
+6 6225 4575 6675 4875
+4 1 0 50 0 0 10 0.0000 4 105 360 6450 4875 Event\001
+4 1 0 50 0 0 10 0.0000 4 105 435 6450 4725 Record\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        6000 4500 6900 4500 6900 4950 6000 4950 6000 4500
+-6
+6 1800 4350 2850 5100
+6 2100 4575 2550 4875
+4 1 0 50 0 0 10 0.0000 4 105 450 2325 4725 Unlink\001
+4 1 0 50 0 0 10 0.0000 4 105 450 2325 4875 thresh?\001
+-6
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        2850 4725 2325 4350 1800 4725 2325 5100 2850 4725
+-6
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1050 1875 1050 2175
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1575 1500 2100 1500
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1050 450 1050 1125
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1350 750 1050 750
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1050 2925 1050 3225
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3150 1500 3450 1500
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        4350 1500 4650 1500
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
+        2100 1500 2625 1125 3150 1500 2625 1875 2100 1500
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1575 3600 1950 3600
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1050 3975 1050 4500
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3000 3600 3300 3600
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 4725 1800 4725
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        5700 4725 6000 4725
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        2850 4725 3150 4725
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        4200 4725 4650 4725
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        6900 4725 7950 4725
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        1575 2550 1650 2550 1800 2550 1800 2400 1800 1500
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5
+       0 0 1.00 60.00 120.00
+        2250 750 2475 750 2625 750 2625 900 2625 1125
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5
+       0 0 1.00 60.00 120.00
+        7500 4725 7500 1650 7500 1500 7350 1500 5550 1500
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        2475 3225 2475 2400 2475 2250 2325 2250 1800 2250
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        3825 3375 3825 2175 3825 2025 3675 2025 1800 2025
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8
+       0 0 1.00 60.00 120.00
+        2325 4350 2325 4275 2325 4125 2475 4125 4275 4125 4425 4125
+        4425 4275 4425 4725
+        0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8
+       0 0 1.00 60.00 120.00
+        5175 4350 5175 4275 5175 4125 5325 4125 7125 4125 7275 4125
+        7275 4275 7275 4725
+        0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000
+4 1 0 100 0 0 10 0.0000 0 75 150 1575 1425 no\001
+4 1 0 100 0 0 10 0.0000 0 135 360 825 525 Entry\001
+4 1 0 100 0 0 10 0.0000 0 75 150 1575 2475 no\001
+4 1 0 100 0 0 10 0.0000 0 105 195 1200 1950 yes\001
+4 1 0 100 0 0 10 0.0000 0 105 195 1200 3000 yes\001
+4 1 0 100 0 0 10 0.0000 0 105 195 2775 1050 yes\001
+4 1 0 100 0 0 10 0.0000 0 75 150 3225 1425 no\001
+4 1 0 100 0 0 10 0.0000 0 75 150 1650 3525 no\001
+4 1 0 100 0 0 10 0.0000 0 105 195 1200 4050 yes\001
+4 1 0 100 0 0 10 0.0000 0 105 195 3150 3525 yes\001
+4 1 0 100 0 0 10 0.0000 0 75 150 2625 3150 no\001
+4 1 0 100 0 0 10 0.0000 0 105 195 3000 4650 yes\001
+4 1 0 100 0 0 10 0.0000 0 105 195 5850 4650 yes\001
+4 1 0 100 0 0 10 0.0000 0 75 150 2475 4275 no\001
+4 1 0 100 0 0 10 0.0000 0 75 150 5325 4275 no\001
+4 1 0 50 0 0 10 0.0000 4 105 285 7800 4650 Exit\001
diff --git a/lnet/doc/get.fig b/lnet/doc/get.fig
new file mode 100644 (file)
index 0000000..28db949
--- /dev/null
@@ -0,0 +1,33 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 2775 900 3525 1200
+4 0 0 100 0 0 10 0.0000 0 105 720 2775 1200 Translation\001
+4 0 0 100 0 0 10 0.0000 0 105 405 2850 1050 Portal\001
+-6
+6 1350 1725 2175 2025
+4 0 0 100 0 0 10 0.0000 0 105 825 1350 2025 Transmission\001
+4 0 0 100 0 0 10 0.0000 0 105 285 1620 1875 Data\001
+-6
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        900 525 2700 750
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        2700 825 2700 1275
+2 1 0 1 0 7 100 0 -1 3.000 0 0 7 1 0 2
+       0 0 1.00 60.00 120.00
+        2700 1350 900 1950
+2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5
+        2400 300 3600 300 3600 2250 2400 2250 2400 300
+2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5
+        0 300 1200 300 1200 2250 0 2250 0 300
+4 1 0 100 0 0 10 0.0000 4 135 495 1800 825 Request\001
+4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001
+4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001
diff --git a/lnet/doc/ieee.bst b/lnet/doc/ieee.bst
new file mode 100644 (file)
index 0000000..4df7c50
--- /dev/null
@@ -0,0 +1,1112 @@
+% ---------------------------------------------------------------
+%
+% by Paolo.Ienne@di.epfl.ch
+%
+% ---------------------------------------------------------------
+%
+% no guarantee is given that the format corresponds perfectly to 
+% IEEE 8.5" x 11" Proceedings, but most features should be ok.
+%
+% ---------------------------------------------------------------
+%
+% `ieee' from BibTeX standard bibliography style `abbrv'
+% version 0.99a for BibTeX versions 0.99a or later, LaTeX version 2.09.
+% Copyright (C) 1985, all rights reserved.
+% Copying of this file is authorized only if either
+% (1) you make absolutely no changes to your copy, including name, or
+% (2) if you do make changes, you name it something other than
+% btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst.
+% This restriction helps ensure that all standard styles are identical.
+% The file btxbst.doc has the documentation for this style.
+
+ENTRY
+  { address
+    author
+    booktitle
+    chapter
+    edition
+    editor
+    howpublished
+    institution
+    journal
+    key
+    month
+    note
+    number
+    organization
+    pages
+    publisher
+    school
+    series
+    title
+    type
+    volume
+    year
+  }
+  {}
+  { label }
+
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }
+
+FUNCTION {init.state.consts}
+{ #0 'before.all :=
+  #1 'mid.sentence :=
+  #2 'after.sentence :=
+  #3 'after.block :=
+}
+
+STRINGS { s t }
+
+FUNCTION {output.nonnull}
+{ 's :=
+  output.state mid.sentence =
+    { ", " * write$ }
+    { output.state after.block =
+ { add.period$ write$
+   newline$
+   "\newblock " write$
+ }
+ { output.state before.all =
+     'write$
+     { add.period$ " " * write$ }
+   if$
+ }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+FUNCTION {output}
+{ duplicate$ empty$
+    'pop$
+    'output.nonnull
+  if$
+}
+
+FUNCTION {output.check}
+{ 't :=
+  duplicate$ empty$
+    { pop$ "empty " t * " in " * cite$ * warning$ }
+    'output.nonnull
+  if$
+}
+
+FUNCTION {output.bibitem}
+{ newline$
+  "\bibitem{" write$
+  cite$ write$
+  "}" write$
+  newline$
+  ""
+  before.all 'output.state :=
+}
+
+FUNCTION {fin.entry}
+{ add.period$
+  write$
+  newline$
+}
+
+FUNCTION {new.block}
+{ output.state before.all =
+    'skip$
+    { after.block 'output.state := }
+  if$
+}
+
+FUNCTION {new.sentence}
+{ output.state after.block =
+    'skip$
+    { output.state before.all =
+ 'skip$
+ { after.sentence 'output.state := }
+      if$
+    }
+  if$
+}
+
+FUNCTION {not}
+{   { #0 }
+    { #1 }
+  if$
+}
+
+FUNCTION {and}
+{   'skip$
+    { pop$ #0 }
+  if$
+}
+
+FUNCTION {or}
+{   { pop$ #1 }
+    'skip$
+  if$
+}
+
+FUNCTION {new.block.checka}
+{ empty$
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.block.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.sentence.checka}
+{ empty$
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {new.sentence.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {field.or.null}
+{ duplicate$ empty$
+    { pop$ "" }
+    'skip$
+  if$
+}
+
+FUNCTION {emphasize}
+{ duplicate$ empty$
+    { pop$ "" }
+    { "{\em " swap$ * "}" * }
+  if$
+}
+
+INTEGERS { nameptr namesleft numnames }
+
+FUNCTION {format.names}
+{ 's :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr "{f.~}{vv~}{ll}{, jj}" format.name$ 't :=
+      nameptr #1 >
+ { namesleft #1 >
+     { ", " * t * }
+     { numnames #2 >
+  { "," * }
+  'skip$
+       if$
+       t "others" =
+  { " et~al." * }
+  { " and " * t * }
+       if$
+     }
+   if$
+ }
+ 't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {format.authors}
+{ author empty$
+    { "" }
+    { author format.names }
+  if$
+}
+
+FUNCTION {format.editors}
+{ editor empty$
+    { "" }
+    { editor format.names
+      editor num.names$ #1 >
+ { ", editors" * }
+ { ", editor" * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.title}
+{ title empty$
+    { "" }
+    { title "t" change.case$ }
+  if$
+}
+
+FUNCTION {n.dashify}
+{ 't :=
+  ""
+    { t empty$ not }
+    { t #1 #1 substring$ "-" =
+ { t #1 #2 substring$ "--" = not
+     { "--" *
+       t #2 global.max$ substring$ 't :=
+     }
+     {   { t #1 #1 substring$ "-" = }
+  { "-" *
+    t #2 global.max$ substring$ 't :=
+  }
+       while$
+     }
+   if$
+ }
+ { t #1 #1 substring$ *
+   t #2 global.max$ substring$ 't :=
+ }
+      if$
+    }
+  while$
+}
+
+FUNCTION {format.date}
+{ year empty$
+    { month empty$
+ { "" }
+ { "there's a month but no year in " cite$ * warning$
+   month
+ }
+      if$
+    }
+    { month empty$
+ 'year
+ { month " " * year * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.btitle}
+{ title emphasize
+}
+
+FUNCTION {tie.or.space.connect}
+{ duplicate$ text.length$ #3 <
+    { "~" }
+    { " " }
+  if$
+  swap$ * *
+}
+
+FUNCTION {either.or.check}
+{ empty$
+    'pop$
+    { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+  if$
+}
+
+FUNCTION {format.bvolume}
+{ volume empty$
+    { "" }
+    { "volume" volume tie.or.space.connect
+      series empty$
+ 'skip$
+ { " of " * series emphasize * }
+      if$
+      "volume and number" number either.or.check
+    }
+  if$
+}
+
+FUNCTION {format.number.series}
+{ volume empty$
+    { number empty$
+ { series field.or.null }
+ { output.state mid.sentence =
+     { "number" }
+     { "Number" }
+   if$
+   number tie.or.space.connect
+   series empty$
+     { "there's a number but no series in " cite$ * warning$ }
+     { " in " * series * }
+   if$
+ }
+      if$
+    }
+    { "" }
+  if$
+}
+
+FUNCTION {format.edition}
+{ edition empty$
+    { "" }
+    { output.state mid.sentence =
+ { edition "l" change.case$ " edition" * }
+ { edition "t" change.case$ " edition" * }
+      if$
+    }
+  if$
+}
+
+INTEGERS { multiresult }
+
+FUNCTION {multi.page.check}
+{ 't :=
+  #0 'multiresult :=
+    { multiresult not
+      t empty$ not
+      and
+    }
+    { t #1 #1 substring$
+      duplicate$ "-" =
+      swap$ duplicate$ "," =
+      swap$ "+" =
+      or or
+ { #1 'multiresult := }
+ { t #2 global.max$ substring$ 't := }
+      if$
+    }
+  while$
+  multiresult
+}
+
+FUNCTION {format.pages}
+{ pages empty$
+    { "" }
+    { pages multi.page.check
+ { "pages" pages n.dashify tie.or.space.connect }
+ { "page" pages tie.or.space.connect }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.vol.num.pages}
+{ volume field.or.null
+  number empty$
+    'skip$
+    { "(" number * ")" * *
+      volume empty$
+ { "there's a number but no volume in " cite$ * warning$ }
+ 'skip$
+      if$
+    }
+  if$
+  pages empty$
+    'skip$
+    { duplicate$ empty$
+ { pop$ format.pages }
+ { ":" * pages n.dashify * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.chapter.pages}
+{ chapter empty$
+    'format.pages
+    { type empty$
+ { "chapter" }
+ { type "l" change.case$ }
+      if$
+      chapter tie.or.space.connect
+      pages empty$
+ 'skip$
+ { ", " * format.pages * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.in.ed.booktitle}
+{ booktitle empty$
+    { "" }
+    { editor empty$
+ { "In " booktitle emphasize * }
+ { "In " format.editors * ", " * booktitle emphasize * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {empty.misc.check}
+{ author empty$ title empty$ howpublished empty$
+  month empty$ year empty$ note empty$
+  and and and and and
+  key empty$ not and
+    { "all relevant fields are empty in " cite$ * warning$ }
+    'skip$
+  if$
+}
+
+FUNCTION {format.thesis.type}
+{ type empty$
+    'skip$
+    { pop$
+      type "t" change.case$
+    }
+  if$
+}
+
+FUNCTION {format.tr.number}
+{ type empty$
+    { "Technical Report" }
+    'type
+  if$
+  number empty$
+    { "t" change.case$ }
+    { number tie.or.space.connect }
+  if$
+}
+
+FUNCTION {format.article.crossref}
+{ key empty$
+    { journal empty$
+ { "need key or journal for " cite$ * " to crossref " * crossref *
+   warning$
+   ""
+ }
+ { "In {\em " journal * "\/}" * }
+      if$
+    }
+    { "In " key * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {format.crossref.editor}
+{ editor #1 "{vv~}{ll}" format.name$
+  editor num.names$ duplicate$
+  #2 >
+    { pop$ " et~al." * }
+    { #2 <
+ 'skip$
+ { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+     { " et~al." * }
+     { " and " * editor #2 "{vv~}{ll}" format.name$ * }
+   if$
+ }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.book.crossref}
+{ volume empty$
+    { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+      "In "
+    }
+    { "Volume" volume tie.or.space.connect
+      " of " *
+    }
+  if$
+  editor empty$
+  editor field.or.null author field.or.null =
+  or
+    { key empty$
+ { series empty$
+     { "need editor, key, or series for " cite$ * " to crossref " *
+       crossref * warning$
+       "" *
+     }
+     { "{\em " * series * "\/}" * }
+   if$
+ }
+ { key * }
+      if$
+    }
+    { format.crossref.editor * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {format.incoll.inproc.crossref}
+{ editor empty$
+  editor field.or.null author field.or.null =
+  or
+    { key empty$
+ { booktitle empty$
+     { "need editor, key, or booktitle for " cite$ * " to crossref " *
+       crossref * warning$
+       ""
+     }
+     { "In {\em " booktitle * "\/}" * }
+   if$
+ }
+ { "In " key * }
+      if$
+    }
+    { "In " format.crossref.editor * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {article}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { journal emphasize "journal" output.check
+      format.vol.num.pages output
+      format.date "year" output.check
+    }
+    { format.article.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {book}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check }
+    { format.authors output.nonnull
+      crossref missing$
+ { "author and editor" editor either.or.check }
+ 'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {booklet}
+{ output.bibitem
+  format.authors output
+  new.block
+  format.title "title" output.check
+  howpublished address new.block.checkb
+  howpublished output
+  address output
+  format.date output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inbook}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check }
+    { format.authors output.nonnull
+      crossref missing$
+ { "author and editor" editor either.or.check }
+ 'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {incollection}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.chapter.pages output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+      format.edition output
+      format.date "year" output.check
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.chapter.pages output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inproceedings}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.pages output
+      address empty$
+ { organization publisher new.sentence.checkb
+   organization output
+   publisher output
+   format.date "year" output.check
+ }
+ { address output.nonnull
+   format.date "year" output.check
+   new.sentence
+   organization output
+   publisher output
+ }
+      if$
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {conference} { inproceedings }
+
+FUNCTION {manual}
+{ output.bibitem
+  author empty$
+    { organization empty$
+ 'skip$
+ { organization output.nonnull
+   address output
+ }
+      if$
+    }
+    { format.authors output.nonnull }
+  if$
+  new.block
+  format.btitle "title" output.check
+  author empty$
+    { organization empty$
+ { address new.block.checka
+   address output
+ }
+ 'skip$
+      if$
+    }
+    { organization address new.block.checkb
+      organization output
+      address output
+    }
+  if$
+  format.edition output
+  format.date output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {mastersthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  "Master's thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {misc}
+{ output.bibitem
+  format.authors output
+  title howpublished new.block.checkb
+  format.title output
+  howpublished new.block.checka
+  howpublished output
+  format.date output
+  new.block
+  note output
+  fin.entry
+  empty.misc.check
+}
+
+FUNCTION {phdthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.btitle "title" output.check
+  new.block
+  "PhD thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {proceedings}
+{ output.bibitem
+  editor empty$
+    { organization output }
+    { format.editors output.nonnull }
+  if$
+  new.block
+  format.btitle "title" output.check
+  format.bvolume output
+  format.number.series output
+  address empty$
+    { editor empty$
+ { publisher new.sentence.checka }
+ { organization publisher new.sentence.checkb
+   organization output
+ }
+      if$
+      publisher output
+      format.date "year" output.check
+    }
+    { address output.nonnull
+      format.date "year" output.check
+      new.sentence
+      editor empty$
+ 'skip$
+ { organization output }
+      if$
+      publisher output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {techreport}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  format.tr.number output.nonnull
+  institution "institution" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {unpublished}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  note "note" output.check
+  format.date output
+  fin.entry
+}
+
+FUNCTION {default.type} { misc }
+
+MACRO {jan} {"Jan."}
+
+MACRO {feb} {"Feb."}
+
+MACRO {mar} {"Mar."}
+
+MACRO {apr} {"Apr."}
+
+MACRO {may} {"May"}
+
+MACRO {jun} {"June"}
+
+MACRO {jul} {"July"}
+
+MACRO {aug} {"Aug."}
+
+MACRO {sep} {"Sept."}
+
+MACRO {oct} {"Oct."}
+
+MACRO {nov} {"Nov."}
+
+MACRO {dec} {"Dec."}
+
+MACRO {acmcs} {"ACM Comput. Surv."}
+
+MACRO {acta} {"Acta Inf."}
+
+MACRO {cacm} {"Commun. ACM"}
+
+MACRO {ibmjrd} {"IBM J. Res. Dev."}
+
+MACRO {ibmsj} {"IBM Syst.~J."}
+
+MACRO {ieeese} {"IEEE Trans. Softw. Eng."}
+
+MACRO {ieeetc} {"IEEE Trans. Comput."}
+
+MACRO {ieeetcad}
+ {"IEEE Trans. Comput.-Aided Design Integrated Circuits"}
+
+MACRO {ipl} {"Inf. Process. Lett."}
+
+MACRO {jacm} {"J.~ACM"}
+
+MACRO {jcss} {"J.~Comput. Syst. Sci."}
+
+MACRO {scp} {"Sci. Comput. Programming"}
+
+MACRO {sicomp} {"SIAM J. Comput."}
+
+MACRO {tocs} {"ACM Trans. Comput. Syst."}
+
+MACRO {tods} {"ACM Trans. Database Syst."}
+
+MACRO {tog} {"ACM Trans. Gr."}
+
+MACRO {toms} {"ACM Trans. Math. Softw."}
+
+MACRO {toois} {"ACM Trans. Office Inf. Syst."}
+
+MACRO {toplas} {"ACM Trans. Prog. Lang. Syst."}
+
+MACRO {tcs} {"Theoretical Comput. Sci."}
+
+READ
+
+FUNCTION {sortify}
+{ purify$
+  "l" change.case$
+}
+
+INTEGERS { len }
+
+FUNCTION {chop.word}
+{ 's :=
+  'len :=
+  s #1 len substring$ =
+    { s len #1 + global.max$ substring$ }
+    's
+  if$
+}
+
+FUNCTION {sort.format.names}
+{ 's :=
+  #1 'nameptr :=
+  ""
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { nameptr #1 >
+ { "   " * }
+ 'skip$
+      if$
+      s nameptr "{vv{ } }{ll{ }}{  f{ }}{  jj{ }}" format.name$ 't :=
+      nameptr numnames = t "others" = and
+ { "et al" * }
+ { t sortify * }
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {sort.format.title}
+{ 't :=
+  "A " #2
+    "An " #3
+      "The " #4 t chop.word
+    chop.word
+  chop.word
+  sortify
+  #1 global.max$ substring$
+}
+
+FUNCTION {author.sort}
+{ author empty$
+    { key empty$
+ { "to sort, need author or key in " cite$ * warning$
+   ""
+ }
+ { key sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.editor.sort}
+{ author empty$
+    { editor empty$
+ { key empty$
+     { "to sort, need author, editor, or key in " cite$ * warning$
+       ""
+     }
+     { key sortify }
+   if$
+ }
+ { editor sort.format.names }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.organization.sort}
+{ author empty$
+    { organization empty$
+ { key empty$
+     { "to sort, need author, organization, or key in " cite$ * warning$
+       ""
+     }
+     { key sortify }
+   if$
+ }
+ { "The " #4 organization chop.word sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {editor.organization.sort}
+{ editor empty$
+    { organization empty$
+ { key empty$
+     { "to sort, need editor, organization, or key in " cite$ * warning$
+       ""
+     }
+     { key sortify }
+   if$
+ }
+ { "The " #4 organization chop.word sortify }
+      if$
+    }
+    { editor sort.format.names }
+  if$
+}
+
+FUNCTION {presort}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.sort
+    { type$ "proceedings" =
+ 'editor.organization.sort
+ { type$ "manual" =
+     'author.organization.sort
+     'author.sort
+   if$
+ }
+      if$
+    }
+  if$
+  "    "
+  *
+  year field.or.null sortify
+  *
+  "    "
+  *
+  title field.or.null
+  sort.format.title
+  *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+
+ITERATE {presort}
+
+SORT
+
+STRINGS { longest.label }
+
+INTEGERS { number.label longest.label.width }
+
+FUNCTION {initialize.longest.label}
+{ "" 'longest.label :=
+  #1 'number.label :=
+  #0 'longest.label.width :=
+}
+
+FUNCTION {longest.label.pass}
+{ number.label int.to.str$ 'label :=
+  number.label #1 + 'number.label :=
+  label width$ longest.label.width >
+    { label 'longest.label :=
+      label width$ 'longest.label.width :=
+    }
+    'skip$
+  if$
+}
+
+EXECUTE {initialize.longest.label}
+
+ITERATE {longest.label.pass}
+
+FUNCTION {begin.bib}
+{ preamble$ empty$
+    'skip$
+    { preamble$ write$ newline$ }
+  if$
+  "\begin{thebibliography}{"  longest.label  * 
+  "}\setlength{\itemsep}{-1ex}\small" * write$ newline$
+}
+
+EXECUTE {begin.bib}
+
+EXECUTE {init.state.consts}
+
+ITERATE {call.type$}
+
+FUNCTION {end.bib}
+{ newline$
+  "\end{thebibliography}" write$ newline$
+}
+
+EXECUTE {end.bib}
+
+% end of file ieee.bst
+% ---------------------------------------------------------------
diff --git a/lnet/doc/mpi.fig b/lnet/doc/mpi.fig
new file mode 100644 (file)
index 0000000..e1a91b5
--- /dev/null
@@ -0,0 +1,117 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 150 1650 900 2025
+4 1 0 100 0 0 10 0.0000 0 135 735 525 1800 Unexpected\001
+4 1 0 100 0 0 10 0.0000 0 135 585 525 1995 Messages\001
+-6
+6 150 150 900 525
+4 1 0 100 0 0 10 0.0000 0 135 615 525 300 Preposted\001
+4 1 0 100 0 0 10 0.0000 0 105 525 525 495 Receives\001
+-6
+6 2550 4125 3150 4725
+4 1 0 100 0 0 10 0.0000 0 135 600 2850 4275 Length=0\001
+4 1 0 100 0 0 10 0.0000 0 105 540 2850 4470 Truncate\001
+4 1 0 100 0 0 10 0.0000 0 105 480 2850 4665 No Ack\001
+-6
+6 1050 1575 1950 1875
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1050 1575 1950 1575 1950 1875 1050 1875 1050 1575
+4 1 0 100 0 0 10 0.0000 0 105 780 1500 1725 Match Short\001
+-6
+6 5400 1575 6300 2175
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        5400 1575 6300 1575 6300 2175 5400 2175 5400 1575
+4 1 0 100 0 0 10 0.0000 0 105 405 5850 1875 Buffer\001
+-6
+6 5400 2400 6300 3000
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        5400 2400 6300 2400 6300 3000 5400 3000 5400 2400
+4 1 0 100 0 0 10 0.0000 0 105 405 5850 2700 Buffer\001
+-6
+6 1050 2400 1950 2700
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1050 2400 1950 2400 1950 2700 1050 2700 1050 2400
+4 1 0 100 0 0 10 0.0000 0 105 780 1500 2550 Match Short\001
+-6
+6 1050 825 1950 1125
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1050 825 1950 825 1950 1125 1050 1125 1050 825
+4 1 0 100 0 0 10 0.0000 0 105 765 1500 975 Match None\001
+-6
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 1125 1500 1575
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 2025 4050 3375
+2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+        150 675 6600 675
+2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+        150 1350 6600 1350
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2400 4125 3300 4125 3300 4725 2400 4725 2400 4125
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 4500 4050 3675
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 1725 5400 1725
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 2550 5400 2550
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 2850 4050 3450
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 1800 1500 2400
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2400 825 3300 825 3300 1275 2400 1275 2400 825
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 2625 1500 4125
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1050 4125 1950 4125 1950 4425 1050 4425 1050 4125
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 300 1500 825
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1875 975 2400 975
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1875 1725 2400 1725
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1875 2550 2400 2550
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1875 4275 2400 4275
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2400 1575 3300 1575 3300 2175 2400 2175 2400 1575
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2400 2400 3300 2400 3300 3000 2400 3000 2400 2400
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        4050 3300 5250 3300 5250 3750 4050 3750 4050 3300
+4 1 0 100 0 0 10 0.0000 0 105 885 1500 150 Match Entries\001
+4 1 0 100 0 0 10 0.0000 0 135 1290 2850 150 Memory Descriptors\001
+4 1 0 100 0 0 10 0.0000 0 135 1065 5850 150 Memory Regions\001
+4 1 0 100 0 0 10 0.0000 0 135 825 4500 150 Event Queues\001
+4 1 0 100 0 0 10 0.0000 0 105 585 525 1050 RcvMark\001
+4 1 0 100 0 0 10 0.0000 0 105 330 2850 1102 None\001
+4 1 0 100 0 0 10 0.0000 0 135 705 1500 4275 Match Any\001
+4 1 0 50 0 0 10 0.0000 0 150 810 2850 1725 max_offset=\001
+4 1 0 50 0 0 10 0.0000 0 150 840 2850 1875 n - short_len\001
+4 1 0 50 0 0 10 0.0000 0 150 810 2850 2550 max_offset=\001
+4 1 0 50 0 0 10 0.0000 0 150 840 2850 2700 n - short_len\001
+4 1 0 50 0 0 10 0.0000 0 105 405 2850 2100 unlink\001
+4 1 0 50 0 0 10 0.0000 0 105 405 2850 2925 unlink\001
+4 1 0 100 0 0 10 0.0000 0 135 930 4650 3675 Message Queue\001
+4 1 0 100 0 0 10 0.0000 0 135 735 4650 3525 Unexpected\001
diff --git a/lnet/doc/portals.fig b/lnet/doc/portals.fig
new file mode 100644 (file)
index 0000000..9b1271b
--- /dev/null
@@ -0,0 +1,68 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1350 900 1650 900 1650 1200 1350 1200 1350 900
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1800 1350 2100 1350 2100 1650 1800 1650 1800 1350
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2250 1800 2550 1800 2550 2100 2250 2100 2250 1800
+2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+        4200 375 4200 2100
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        525 600 1125 600 1125 2100 525 2100 525 600
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        4425 1275 4875 1275 4875 1950 4425 1950 4425 1275
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2550 1200 3150 1200 3150 1500 2550 1500 2550 1200
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3000 1425 4425 1425
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        3600 825 3750 825 3750 1125 3600 1125 3600 825
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        2025 1425 2550 1425
+2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
+        4425 750 4875 750 4875 1125 4425 1125 4425 750
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3675 975 4425 975
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 2
+       0 0 1.00 60.00 120.00
+        825 1050 1350 1050
+        0.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        1500 1125 1500 1350 1500 1500 1650 1500 1800 1500
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        1950 1575 1950 1800 1950 1950 2100 1950 2250 1950
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2
+        525 975 1125 975
+        0.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2
+        525 1125 1125 1125
+        0.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 7
+       0 0 1.00 60.00 120.00
+        3000 1275 3150 1275 3300 1275 3300 1125 3300 975 3450 975
+        3600 975
+        0.000 1.000 1.000 1.000 1.000 1.000 0.000
+4 0 0 100 0 0 10 0.0000 0 105 690 1275 750 Match List\001
+4 1 0 100 0 0 10 0.0000 0 105 780 825 525 Portal Table\001
+4 2 0 100 0 0 10 0.0000 0 135 825 4050 2025 Library Space\001
+4 0 0 100 0 0 10 0.0000 0 135 1110 4350 2175 Application Space\001
+4 1 0 100 0 0 10 0.0000 0 135 660 2850 1050 Descriptor\001
+4 1 0 100 0 0 10 0.0000 0 135 540 2850 825 Memory\001
+4 1 0 100 0 0 10 0.0000 0 135 765 3750 675 Event Queue\001
+4 1 0 100 0 0 10 0.0000 0 135 495 4650 675 Regions\001
+4 1 0 100 0 0 10 0.0000 0 135 540 4650 525 Memory\001
diff --git a/lnet/doc/portals3.bib b/lnet/doc/portals3.bib
new file mode 100644 (file)
index 0000000..323b99f
--- /dev/null
@@ -0,0 +1,124 @@
+@Article{           Cplant,
+    title       = { {M}assively {P}arallel {C}omputing with
+                    {C}ommodity {C}omponents },
+    author      = { Ron Brightwell and David S. Greenberg and Arthur
+                    B. Maccabe and Rolf Riesen },
+    journal     = { Parallel Computing },
+    volume      = { 26 },
+    month       = { February },
+    pages       = { 243-266 },
+    year        = { 2000 }
+}
+
+@Manual{     Portals,
+    organization = { Sandia National Laboratories },
+    title        = { {P}uma {P}ortals },
+    note         = { http://www.cs.sandia.gov/puma/portals },
+    year         = { 1997 }
+}
+
+@Techreport{      VIA,
+  title         = { {V}irtual {I}nterface {A}rchitecture
+                    {S}pecification {V}ersion 1.0 }, 
+  author        = { {Compaq, Microsoft, and Intel} },
+  institution   = { Compaq, Microsoft, and Intel },
+  month         = { December },
+  year          = { 1997 }
+}
+
+@Techreport{      ST,
+  title         = { {I}nformation {T}echnology - {S}cheduled
+                  {T}ransfer {P}rotocol - {W}orking {D}raft 2.0 },
+  author        = { {Task Group of Technical Committee T11} },
+  institution   = { Accredited Standards Committee NCITS },
+  month         = { July },
+  year          = { 1998 }
+}
+
+@Manual{     TFLOPS,
+    organization = { Sandia National Laboratories },
+    title        = { ASCI Red },
+    note         = { http://www.sandia.gov/ASCI/TFLOP },
+    year         = { 1996 }
+}
+
+@Techreport{      GM,
+  title         = { The {GM} {M}essage {P}assing {S}ystem },
+  author         = { {Myricom, Inc.} },
+  institution    = { {Myricom, Inc.} },
+  year          = { 1997 },
+}
+
+@Article{           MPIstandard,
+    title        = { {MPI}: {A} {M}essage-{P}assing {I}nterface standard },
+    author       = { {Message Passing Interface Forum} },
+    journal      = { The International Journal of Supercomputer Applications
+                     and High Performance Computing },
+    volume       = { 8 },
+    year         = { 1994 }
+}
+
+@Inproceedings{    PumaOS,
+    author       = "Lance Shuler and Chu Jong and Rolf Riesen and
+                    David van Dresser and Arthur B. Maccabe and
+                    Lee Ann Fisk and T. Mack Stallcup",
+    booktitle    = "Proceedings of the 1995 Intel Supercomputer
+                    User's Group Conference",
+    title        = "The {P}uma Operating System for Massively Parallel Computers",
+    organization = "Intel Supercomputer User's Group",
+    year         = 1995
+}
+
+@InProceedings{   SUNMOS,
+author          = "Arthur B. Maccabe and Kevin S. McCurley and Rolf Riesen and
+                   Stephen R. Wheat",
+title           = "{SUNMOS} for the {Intel} {Paragon}: A Brief User's Guide",
+booktitle       = "Proceedings of the {Intel} Supercomputer Users' Group. 1994
+                   Annual North America Users' Conference.",
+year            = 1994,
+pages           = "245--251",
+month           = "June",
+location        = "ftp.cs.sandia.gov /pub/sunmos/papers/ISUG94-1.ps"
+}
+
+@InProceedings {   PumaMPI,
+    title        = { Design and Implementation of {MPI} on {P}uma Portals },
+    author       = { Ron Brightwell and Lance Shuler },
+    booktitle    = { Proceedings of the Second MPI Developer's Conference },
+    pages        = { 18-25 },
+    month        = { July },
+    year         = { 1996 }
+}
+
+@Inproceedings{     FM2,
+    author       = { Mario Lauria and Scott Pakin and Andrew Chien },
+    title        = { {E}fficient {L}ayering for {H}igh {S}peed
+                     {C}ommunication: {F}ast {M}essages 2.x },
+    Booktitle    = { Proceedings of the IEEE International Symposium
+                     on High Performance Distributed Computing },
+    year         = { 1998 }
+}
+
+@Manual {          CraySHMEM,
+    title        = "SHMEM Technical Note for C, SG-2516 2.3",
+    organization = "Cray Research, Inc.",
+    month        = "October",
+    year         = 1994
+}
+
+@Manual {          MPI2,
+    title        = "{MPI}-2: {E}xtensions to the {M}essage-{P}assing {I}nterface",
+    organization = "Message Passing Interface Forum",
+    note         = "http://www.mpi-forum.org/docs/mpi-20-html/mpi2-report.html",
+    month        = "July",
+    year         = 1997
+}
+
+@InProceedings {   PMMPI,
+    title        = { {The Design and Implementation of Zero Copy MPI Using
+                       Commodity Hardware with a High Performance Network} },
+    author       = { Francis O'Carroll and  Hiroshi Tezuka and Atsushi Hori
+                     and Yutaka Ishikawa  },
+    booktitle    = { Proceedings of the ICS },
+    year         = { 1998 }
+}
diff --git a/lnet/doc/portals3.lyx b/lnet/doc/portals3.lyx
new file mode 100644 (file)
index 0000000..8429280
--- /dev/null
@@ -0,0 +1,15944 @@
+#LyX 1.2 created this file. For more info see http://www.lyx.org/
+\lyxformat 220
+\textclass report
+\begin_preamble
+\usepackage{fullpage}
+\renewenvironment{comment}%
+{\begin{quote}\textbf{Discussion}: \slshape}%
+{\end{quote}}
+\pagestyle{myheadings}
+\end_preamble
+\language american
+\inputencoding auto
+\fontscheme pslatex
+\graphics default
+\paperfontsize 10
+\spacing single 
+\papersize letterpaper
+\paperpackage a4
+\use_geometry 0
+\use_amsmath 0
+\use_natbib 0
+\use_numerical_citations 0
+\paperorientation portrait
+\secnumdepth 2
+\tocdepth 2
+\paragraph_separation indent
+\defskip medskip
+\quotes_language english
+\quotes_times 2
+\papercolumns 1
+\papersides 2
+\paperpagestyle headings
+
+\layout Title
+
+The Portals 3.2 Message Passing Interface 
+\newline 
+ Revision 1.1
+\layout Author
+
+Ron Brightwell
+\begin_inset Foot
+collapsed true
+
+\layout Standard
+
+R.
+ Brightwell and R.
+ Riesen are with the Scalable Computing Systems Department, Sandia National
+ Laboratories, P.O.
+ Box 5800, Albuquerque, NM\SpecialChar ~
+\SpecialChar ~
+87111-1110, bright@cs.sandia.gov, rolf@cs.sandia.gov.
+\end_inset 
+
+, Arthur B.
+ Maccabe
+\begin_inset Foot
+collapsed true
+
+\layout Standard
+
+A.
+ B.
+ Maccabe is with the Computer Science Department, University of New Mexico,
+ Albuquerque, NM\SpecialChar ~
+\SpecialChar ~
+87131-1386, maccabe@cs.unm.edu.
+\end_inset 
+
+, Rolf Riesen and Trammell Hudson
+\layout Abstract
+
+This report presents a specification for the Portals 3.2 message passing
+ interface.
+ Portals 3.2 is intended to allow scalable, high-performance network communicatio
+n between nodes of a parallel computing system.
+ Specifically, it is designed to support a parallel computing platform composed
+ of clusters of commodity workstations connected by a commodity system area
+ network fabric.
+ In addition, Portals 3.2 is well suited to massively parallel processing
+ and embedded systems.
+ Portals 3.2 represents an adaption of the data movement layer developed
+ for massively parallel processing platforms, such as the 4500-node Intel
+ TeraFLOPS machine.
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+clearpage
+\backslash 
+pagenumbering{roman}
+\backslash 
+setcounter{page}{3}
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset LatexCommand \tableofcontents{}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+cleardoublepage
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset FloatList figure
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+cleardoublepage
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset FloatList table
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+cleardoublepage
+\end_inset 
+
+
+\layout Chapter*
+
+Summary of Changes for Revision 1.1
+\layout Enumerate
+
+Updated version number to 3.2 throughout the document
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sub:PtlGetId}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_SEGV
+\family default 
+ to error list for 
+\shape italic 
+PtlGetId
+\shape default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_ML_TOOLONG
+\family default 
+ to error list for 
+\shape italic 
+PtlMEAttach
+\shape default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:meunlink}
+
+\end_inset 
+
+: removed text referring to a list of associated memory descriptors.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:mdfree}
+
+\end_inset 
+
+: added text to describe unlinking a free-floating memory descriptor.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:types}
+
+\end_inset 
+
+: added entry for 
+\family typewriter 
+ptl_seq_t
+\family default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+:
+\begin_deeper 
+\layout Enumerate
+
+added definition of 
+\family typewriter 
+max_offset
+\family default 
+.
+\layout Enumerate
+
+added text to clarify 
+\family typewriter 
+PTL_MD_MANAGE_REMOTE
+\family default 
+.
+\end_deeper 
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+: modified text for 
+\family typewriter 
+unlink_op
+\family default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+: added text to clarify multiple calls to 
+\shape italic 
+PtlNIInit
+\shape default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+: added text to clarify 
+\family typewriter 
+unlink_nofit
+\family default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:receiving}
+
+\end_inset 
+
+: removed text indicating that an MD will reject a message if the associated
+ EQ is full.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:mdfree}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_MD_INUSE
+\family default 
+ error code and text to indicate that only MDs with no pending operations
+ can be unlinked.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:retcodes}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_MD_INUSE
+\family default 
+ return code.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:event-type}
+
+\end_inset 
+
+: added user id field, MD handle field, and NI specific failure field to
+ the 
+\family typewriter 
+ptl_event_t
+\family default 
+ structure.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:types}
+
+\end_inset 
+
+: added 
+\family typewriter 
+ptl_ni_fail_t
+\family default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:event-type}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_EVENT_UNLINK
+\family default 
+ event type.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:func}
+
+\end_inset 
+
+: removed 
+\shape slanted 
+PtlTransId
+\shape default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+, Section 
+\begin_inset LatexCommand \ref{sec:meinsert}
+
+\end_inset 
+
+, Section 
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+: listed allowable constants with relevant fields.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:func}
+
+\end_inset 
+
+: added 
+\shape italic 
+PtlMEAttachAny
+\shape default 
+ function.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:retcodes}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_PT_FULL
+\family default 
+ return code for 
+\shape italic 
+PtlMEAttachAny
+\shape default 
+.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:oconsts}
+
+\end_inset 
+
+: updated to reflect new event types.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+: added 
+\family typewriter 
+ptl_nid_t
+\family default 
+, 
+\family typewriter 
+ptl_pid_t
+\family default 
+, and 
+\family typewriter 
+ptl_uid_t
+\family default 
+.
+\layout Chapter*
+
+Summary of Changes for Version 3.1
+\layout Section*
+
+Thread Issues
+\layout Standard
+
+The most significant change to the interface from version 3.0 to 3.1 involves
+ the clarification of how the interface interacts with multi-threaded applicatio
+ns.
+ We adopted a generic thread model in which processes define an address
+ space and threads share the address space.
+ Consideration of the API in the light of threads led to several clarifications
+ throughout the document: 
+\layout Enumerate
+
+Glossary: 
+\begin_deeper 
+\layout Enumerate
+
+added a definition for 
+\emph on 
+thread
+\emph default 
+, 
+\layout Enumerate
+
+reworded the definition for 
+\emph on 
+process
+\emph default 
+.
+\end_deeper 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:apiover}
+
+\end_inset 
+
+: added section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:threads}
+
+\end_inset 
+
+ to describe the multi-threading model used by the Portals API.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ptlinit}
+
+\end_inset 
+
+: 
+\emph on 
+PtlInit
+\emph default 
+ must be called at least once and may be called any number of times.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ptlfini}
+
+\end_inset 
+
+: 
+\emph on 
+PtlFini
+\emph default 
+ should be called once as the process is terminating and not as each thread
+ terminates.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:pid}
+
+\end_inset 
+
+: Portals does not define thread ids.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+: network interfaces are associated with processes, not threads.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+: 
+\emph on 
+PtlNIInit
+\emph default 
+ must be called at least once and may be called any number of times.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:eqget}
+
+\end_inset 
+
+: 
+\emph on 
+PtlEQGet
+\emph default 
+ returns 
+\family typewriter 
+PTL_EQ_EMPTY
+\family default 
+ if a thread is blocked on 
+\emph on 
+PtlEQWait
+\emph default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:eqwait}
+
+\end_inset 
+
+: waiting threads are awakened in FIFO order.
+\layout Standard
+
+Two functions, 
+\emph on 
+PtlNIBarrier
+\emph default 
+ and 
+\emph on 
+PtlEQCount
+\emph default 
+ were removed from the API.
+\emph on 
+PtlNIBarrier
+\emph default 
+ was defined to block the calling process until all of the processes in
+ the application group had invoked 
+\emph on 
+PtlNIBarrier
+\emph default 
+.
+ We now consider this functionality, along with the concept of groups (see
+ the discussion under 
+\begin_inset Quotes eld
+\end_inset 
+
+other changes
+\begin_inset Quotes erd
+\end_inset 
+
+), to be part of the runtime system, not part of the Portals API.
+\emph on 
+PtlEQCount
+\emph default 
+ was defined to return the number of events in an event queue.
+ Because external operations may lead to new events being added and other
+ threads may remove events, the value returned by 
+\emph on 
+PtlEQCount
+\emph default 
+ would have to be a hint about the number of events in the event queue.
+\layout Section*
+
+Handling small, unexpected messages
+\layout Standard
+
+Another set of changes relates to handling small unexpected messages in
+ MPI.
+ In designing version 3.0, we assumed that each unexpected message would
+ be placed in a unique memory descriptor.
+ To avoid the need to process a long list of memory descriptors, we moved
+ the memory descriptors out of the match list and hung them off of a single
+ match list entry.
+ In this way, large unexpected messages would only encounter a single 
+\begin_inset Quotes eld
+\end_inset 
+
+short message
+\begin_inset Quotes erd
+\end_inset 
+
+ match list entry before encountering the 
+\begin_inset Quotes eld
+\end_inset 
+
+long message
+\begin_inset Quotes erd
+\end_inset 
+
+ match list entry.
+ Experience with this strategy identified resource management problems with
+ this approach.
+ In particular, a long sequence of very short (or zero length) messages
+ could quickly exhaust the memory descriptors constructed for handling unexpecte
+d messages.
+ Our new strategy involves the use of several very large memory descriptors
+ for small unexpected messages.
+ Consecutive unexpected messages will be written into the first of these
+ memory descriptors until the memory descriptor fills up.
+ When the first of the 
+\begin_inset Quotes eld
+\end_inset 
+
+small memory
+\begin_inset Quotes erd
+\end_inset 
+
+ descriptors fills up, it will be unlinked and subsequent short messages
+ will be written into the next 
+\begin_inset Quotes eld
+\end_inset 
+
+short message
+\begin_inset Quotes erd
+\end_inset 
+
+ memory descriptor.
+ In this case, a 
+\begin_inset Quotes eld
+\end_inset 
+
+short message
+\begin_inset Quotes erd
+\end_inset 
+
+ memory descriptor will be declared full when it does not have sufficient
+ space for the largest small unexpected message.
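
The buffer-recycling policy above can be made concrete with a short sketch. The C fragment below is illustrative only: the structure, the helper, and the 512-byte limit are assumptions chosen for exposition, not part of the Portals API or of any particular implementation.

    /* Illustrative sketch: consecutive small unexpected messages are packed
     * into one large buffer until the space left is too small for the largest
     * possible small message, at which point that buffer is retired
     * ("unlinked") and the next one takes over. */
    #include <stddef.h>

    #define MAX_SMALL_MSG 512            /* assumed largest small message */

    struct short_msg_buffer {
        char   *base;                    /* start of the large buffer */
        size_t  size;                    /* total size of the buffer */
        size_t  offset;                  /* local offset: next free byte */
        int     unlinked;                /* set once the buffer is retired */
    };

    /* Return the buffer that should receive the next unexpected message,
     * retiring any buffer that can no longer hold a maximum-sized small
     * message.  Returns NULL when every buffer has been exhausted. */
    struct short_msg_buffer *
    select_short_buffer(struct short_msg_buffer *bufs, int nbufs)
    {
        int i;

        for (i = 0; i < nbufs; i++) {
            if (bufs[i].unlinked)
                continue;
            if (bufs[i].size - bufs[i].offset < MAX_SMALL_MSG) {
                bufs[i].unlinked = 1;    /* "full": unlink and move on */
                continue;
            }
            return &bufs[i];
        }
        return NULL;                     /* resource exhaustion */
    }
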
+\layout Standard
+
+This led to two significant changes.
+ First, each match list entry now has a single memory descriptor rather
+ than a list of memory descriptors.
+ Second, in addition to exceeding the operation threshold, a memory descriptor
+ can be unlinked when the local offset exceeds a specified value.
+ These changes have led to several changes in this document: 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{subsec:paddress}
+
+\end_inset 
+
+: 
+\begin_deeper 
+\layout Enumerate
+
+removed references to the memory descriptor list, 
+\layout Enumerate
+
+changed the portals address translation description to indicate that unlinking
+ a memory descriptor implies unlinking the associated match list entry--match
+ list entries can no longer be unlinked independently from the memory descriptor.
+\end_deeper 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+: 
+\begin_deeper 
+\layout Enumerate
+
+removed unlink from argument list, 
+\layout Enumerate
+
+removed description of 
+\family typewriter 
+ptl_unlink
+\family default 
+ type, 
+\layout Enumerate
+
+changed wording of the error condition when the Portal table index already
+ has an associated match list.
+\end_deeper 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:meinsert}
+
+\end_inset 
+
+: removed unlink from argument list.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+: added 
+\family typewriter 
+max_offset
+\family default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+: 
+\begin_deeper 
+\layout Enumerate
+
+added description of 
+\family typewriter 
+ptl_unlink
+\family default 
+ type, 
+\layout Enumerate
+
+removed reference to memory descriptor lists, 
+\layout Enumerate
+
+changed wording of the error condition when a match list entry already has
+ an associated memory descriptor, 
+\layout Enumerate
+
+changed the description of the 
+\family typewriter 
+unlink
+\family default 
+ argument.
+\end_deeper 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+: removed 
+\family typewriter 
+PtlMDInsert
+\family default 
+ operation.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdbind}
+
+\end_inset 
+
+: removed references to memory descriptor list.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdfree}
+
+\end_inset 
+
+: removed reference to memory descriptor list.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:summary}
+
+\end_inset 
+
+: removed references to PtlMDInsert.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:semantics}
+
+\end_inset 
+
+: removed reference to memory descriptor list.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:exmpi}
+
+\end_inset 
+
+: revised the MPI example to reflect the changes to the interface.
+\layout Standard
+
+Several changes have been made to improve the general documentation of the
+ interface.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+: documented the special value 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+: documented the special value 
+\family typewriter 
+PTL_ID_ANY
+\family default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdbind}
+
+\end_inset 
+
+: documented the return value 
+\family typewriter 
+PTL_INV_EQ
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdupdate}
+
+\end_inset 
+
+: clarified the description of the 
+\emph on 
+PtlMDUpdate
+\emph default 
+ function.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:implvals}
+
+\end_inset 
+
+: introduced a new section to document the implementation defined values.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:summary}
+
+\end_inset 
+
+: modified Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:oconsts}
+
+\end_inset 
+
+ to indicate where each constant is introduced and where it is used.
+\layout Section*
+
+Other changes
+\layout Subsection*
+
+Implementation defined limits (Section 
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+)
+\layout Standard
+
+The earlier version provided implementation defined limits for the maximum
+ number of match entries, the maximum number of memory descriptors, etc.
+ Rather than spanning the entire implementation, these limits are now associated
+ with individual network interfaces.
+\layout Subsection*
+
+Added User Ids (Section 
+\begin_inset LatexCommand \ref{sec:uid}
+
+\end_inset 
+
+)
+\layout Standard
+
+Group Ids had been used to simplify access control entries.
+ In particular, a process could allow access for all of the processes in
+ a group.
+ User Ids have been introduced to fill this role.
+\layout Subsection*
+
+Removed Group Ids and Rank Ids (Section 
+\begin_inset LatexCommand \ref{sec:pid}
+
+\end_inset 
+
+)
+\layout Standard
+
+The earlier version of Portals had two forms for addressing processes: <node
+ id, process id> and <group id, rank id>.
+ A process group was defined as the collection of processes created during
+ application launch.
+ Each process in the group was given a unique rank id in the range 0 to
+\begin_inset Formula $n-1$
+\end_inset 
+
+ where 
+\begin_inset Formula $n$
+\end_inset 
+
+ was the number of processes in the group.
+ We removed groups because they are better handled in the runtime system.
+\layout Subsection*
+
+Match lists (Section 
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+)
+\layout Standard
+
+It is no longer illegal to have an existing match entry when calling PtlMEAttach.
+ A position argument was added to the list of arguments supplied to 
+\emph on 
+PtlMEAttach
+\emph default 
+ to specify whether the new match entry is prepended or appended to the
+ existing list.
+ If there is no existing match list, the position argument is ignored.
+\layout Subsection*
+
+Unlinking Memory Descriptors (Section 
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+)
+\layout Standard
+
+Previously, a memory descriptor could be unlinked if the offset exceeded
+ a threshold upon the completion of an operation.
+ In this version, the unlinking is delayed until there is a matching operation
+ which requires more memory than is currently available in the descriptor.
+ In addition to changes in that section, this led to a revision of Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:flow}
+
+\end_inset 
+
+.
+\layout Subsection*
+
+Split Phase Operations and Events (Section 
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+)
+\layout Standard
+
+Previously, there were five types of events: 
+\family typewriter 
+PTL_EVENT_PUT
+\family default 
+, 
+\family typewriter 
+PTL_EVENT_GET
+\family default 
+, 
+\family typewriter 
+PTL_EVENT_REPLY
+\family default 
+, 
+\family typewriter 
+PTL_EVENT_SENT
+\family default 
+, and 
+\family typewriter 
+PTL_EVENT_ACK.
+\family default 
+The first four of these reflected the completion of potentially long operations.
+ We have introduced new event types to reflect the fact that long operations
+ have a distinct starting point and a distinct completion point.
+ Moreover, the completion may be successful or unsuccessful.
+\layout Standard
+
+In addition to providing a mechanism for reporting failure to higher levels
+ of software, this split provides an opportunity for improved ordering
+ semantics.
+ Previously, if one process initiated two operations (e.g., two put operations)
+ on a remote process, these operations were guaranteed to complete in the
+ same order that they were initiated.
+ Now, we only guarantee that the initiation events are delivered in the
+ same order.
+ In particular, the operations do not need to complete in the order that
+ they were initiated.
+\layout Subsection*
+
+Well known process ids (Section 
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+)
+\layout Standard
+
+To support the notion of 
+\begin_inset Quotes eld
+\end_inset 
+
+well known process ids,
+\begin_inset Quotes erd
+\end_inset 
+
+ we added a process id argument to the arguments for PtlNIInit.
+\layout Chapter*
+
+Glossary
+\layout Description
+
+API Application Programming Interface.
+ A definition of the functions and semantics provided by a library of functions.
+\layout Description
+
+Initiator A 
+\emph on 
+process
+\emph default 
+ that initiates a message operation.
+\layout Description
+
+Message An application-defined unit of data that is exchanged between 
+\emph on 
+processes
+\emph default 
+.
+\layout Description
+
+Message\SpecialChar ~
+Operation Either a put operation, which writes data, or a get operation,
+ which reads data.
+\layout Description
+
+Network A network provides point-to-point communication between 
+\emph on 
+nodes
+\emph default 
+.
+ Internally, a network may provide multiple routes between endpoints (to
+ improve fault tolerance or to improve performance characteristics); however,
+ multiple paths will not be exposed outside of the network.
+\layout Description
+
+Node A node is an endpoint in a 
+\emph on 
+network
+\emph default 
+.
+ Nodes provide processing capabilities and memory.
+ A node may provide multiple processors (an SMP node) or it may act as a
+\emph on 
+gateway
+\emph default 
+ between networks.
+\layout Description
+
+Process A context of execution.
+ A process defines a virtual memory (VM) context.
+ This context is not shared with other processes.
+ Several threads may share the VM context defined by a process.
+\layout Description
+
+Target A 
+\emph on 
+process
+\emph default 
+ that is acted upon by a message operation.
+\layout Description
+
+Thread A context of execution that shares a VM context with other threads.
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+cleardoublepage
+\layout Standard
+
+\backslash 
+setcounter{page}{1}
+\backslash 
+pagenumbering{arabic}
+\end_inset 
+
+
+\layout Chapter
+
+Introduction
+\begin_inset LatexCommand \label{sec:intro}
+
+\end_inset 
+
+
+\layout Section
+
+Overview
+\layout Standard
+
+This document describes an application programming interface for message
+ passing between nodes in a system area network.
+ The goal of this interface is to improve the scalability and performance
+ of network communication by defining the functions and semantics of message
+ passing required for scaling a parallel computing system to ten thousand
+ nodes.
+ This goal is achieved by providing an interface that will allow a quality
+ implementation to take advantage of the inherently scalable design of Portals.
+\layout Standard
+
+This document is divided into several sections: 
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:intro}
+
+\end_inset 
+
+---Introduction This section describes the purpose and scope of the Portals
+ API.
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:apiover}
+
+\end_inset 
+
+---An\SpecialChar ~
+Overview\SpecialChar ~
+of\SpecialChar ~
+the\SpecialChar ~
+Portals\SpecialChar ~
+3.2\SpecialChar ~
+API This section gives a brief overview of the
+ Portals API.
+ The goal is to introduce the key concepts and terminology used in the descripti
+on of the API.
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:api}
+
+\end_inset 
+
+---The\SpecialChar ~
+Portals\SpecialChar ~
+3.2\SpecialChar ~
+API This section describes the functions and semantics of
+ the Portals application programming interface.
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:semantics}
+
+\end_inset 
+
+---The\SpecialChar ~
+Semantics\SpecialChar ~
+of\SpecialChar ~
+Message\SpecialChar ~
+Transmission This section describes the semantics
+ of message transmission.
+ In particular, the information transmitted in each type of message and
+ the processing of incoming messages.
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:examples}
+
+\end_inset 
+
+---Examples This section presents several examples intended to illustrate
+ the use of the Portals API.
+\layout Section
+
+Purpose
+\layout Standard
+
+Existing message passing technologies available for commodity cluster networking
+ hardware do not meet the scalability goals required by the Cplant\SpecialChar ~
+
+\begin_inset LatexCommand \cite{Cplant}
+
+\end_inset 
+
+ project at Sandia National Laboratories.
+ The goal of the Cplant project is to construct a commodity cluster that
+ can scale to the order of ten thousand nodes.
+ This number greatly exceeds the capacity for which existing message passing
+ technologies have been designed and implemented.
+\layout Standard
+
+In addition to the scalability requirements of the network, these technologies
+ must also be able to support a scalable implementation of the Message Passing
+ Interface (MPI)\SpecialChar ~
+
+\begin_inset LatexCommand \cite{MPIstandard}
+
+\end_inset 
+
+ standard, which has become the 
+\shape italic 
+de facto
+\shape default 
+ standard for parallel scientific computing.
+ While MPI does not impose any scalability limitations, existing message
+ passing technologies do not provide the functionality needed to allow implement
+ations of MPI to meet the scalability requirements of Cplant.
+\layout Standard
+
+The following are properties of a network architecture that do not impose
+ any inherent scalability limitations: 
+\layout Itemize
+
+Connectionless - Many connection-oriented architectures, such as VIA\SpecialChar ~
+
+\begin_inset LatexCommand \cite{VIA}
+
+\end_inset 
+
+ and TCP/IP sockets, have limitations on the number of peer connections
+ that can be established.
+\layout Itemize
+
+Network independence - Many communication systems depend on the host processor
+ to perform operations in order for messages in the network to be consumed.
+ Message consumption from the network should not be dependent on host processor
+ activity, such as the operating system scheduler or user-level thread scheduler.
+\layout Itemize
+
+User-level flow control - Many communication systems manage flow control
+ internally to avoid depleting resources, which can significantly impact
+ performance as the number of communicating processes increases.
+\layout Itemize
+
+OS Bypass - High performance network communication should not involve memory
+ copies into or out of a kernel-managed protocol stack.
+\layout Standard
+
+The following are properties of a network architecture that do not impose
+ scalability limitations for an implementation of MPI:
+\layout Itemize
+
+Receiver-managed - Sender-managed message passing implementations require
+ a persistent block of memory to be available for every process, requiring
+ memory resources to increase with job size and requiring user-level flow
+ control mechanisms to manage these resources.
+\layout Itemize
+
+User-level Bypass - While OS Bypass is necessary for high-performance, it
+ alone is not sufficient to support the Progress Rule of MPI asynchronous
+ operations.
+\layout Itemize
+
+Unexpected messages - Few communication systems have support for receiving
+ messages for which there is no prior notification.
+ Support for these types of messages is necessary to avoid flow control
+ and protocol overhead.
+\layout Section
+
+Background
+\layout Standard
+
+Portals was originally designed for and implemented on the nCube machine
+ as part of the SUNMOS (Sandia/UNM OS)\SpecialChar ~
+
+\begin_inset LatexCommand \cite{SUNMOS}
+
+\end_inset 
+
+ and Puma\SpecialChar ~
+
+\begin_inset LatexCommand \cite{PumaOS}
+
+\end_inset 
+
+ lightweight kernel development projects.
+ Portals went through two design phases, the latter of which is used on
+ the 4500-node Intel TeraFLOPS machine\SpecialChar ~
+
+\begin_inset LatexCommand \cite{TFLOPS}
+
+\end_inset 
+
+.
+ Portals have been very successful in meeting the needs of such a large
+ machine, not only as a layer for a high-performance MPI implementation\SpecialChar ~
+
+\begin_inset LatexCommand \cite{PumaMPI}
+
+\end_inset 
+
+, but also for implementing the scalable run-time environment and parallel
+ I/O capabilities of the machine.
+\layout Standard
+
+The second generation Portals implementation was designed to take full advantage
+ of the hardware architecture of large MPP machines.
+ However, efforts to implement this same design on commodity cluster technology
+ identified several limitations, due to the differences in network hardware
+ as well as to shortcomings in the design of Portals.
+\layout Section
+
+Scalability
+\layout Standard
+
+The primary goal in the design of Portals is scalability.
+ Portals are designed specifically for an implementation capable of supporting
+ a parallel job running on tens of thousands of nodes.
+ Performance is critical only in terms of scalability.
+ That is, the level of message passing performance is characterized by how
+ far it allows an application to scale and not by how it performs in micro-bench
+marks (e.g., a two node bandwidth or latency test).
+\layout Standard
+
+The Portals API is designed to allow for scalability, not to guarantee it.
+ Portals cannot overcome the shortcomings of a poorly designed application
+ program.
+ Applications that have inherent scalability limitations, either through
+ design or implementation, will not be transformed by Portals into scalable
+ applications.
+ Scalability must be addressed at all levels.
+ Portals do not inhibit scalability, but do not guarantee it either.
+\layout Standard
+
+To support scalability, the Portals interface maintains a minimal amount
+ of state.
+ Portals provide reliable, ordered delivery of messages between pairs of
+ processes.
+ They are connectionless: a process is not required to explicitly establish
+ a point-to-point connection with another process in order to communicate.
+ Moreover, all buffers used in the transmission of messages are maintained
+ in user space.
+ The target process determines how to respond to incoming messages, and
+ messages for which there are no buffers are discarded.
+\layout Section
+
+Communication Model
+\layout Standard
+
+Portals combine the characteristics of both one-sided and two-sided communication.
+ They define a 
+\begin_inset Quotes eld
+\end_inset 
+
+matching put
+\begin_inset Quotes erd
+\end_inset 
+
+ operation and a 
+\begin_inset Quotes eld
+\end_inset 
+
+matching get
+\begin_inset Quotes erd
+\end_inset 
+
+ operation.
+ The destination of a put (or send) is not an explicit address; instead,
+ each message contains a set of match bits that allow the receiver to determine
+ where incoming messages should be placed.
+ This flexibility allows Portals to support both traditional one-sided operation
+s and two-sided send/receive operations.
+\layout Standard
+
+Portals allow the target to determine whether incoming messages are acceptable.
+ A target process can choose to accept message operations from any specific
+ process or can choose to ignore message operations from any specific process.
+\layout Section
+
+Zero Copy, OS Bypass and Application Bypass
+\layout Standard
+
+In traditional system architectures, network packets arrive at the network
+ interface card (NIC), are passed through one or more protocol layers in
+ the operating system, and eventually copied into the address space of the
+ application.
+ As network bandwidth began to approach memory copy rates, reduction of
+ memory copies became a critical concern.
+ This concern led to the development of zero-copy message passing protocols
+ in which message copies are eliminated or pipelined to avoid the loss of
+ bandwidth.
+\layout Standard
+
+A typical zero-copy protocol has the NIC generate an interrupt for the CPU
+ when a message arrives from the network.
+ The interrupt handler then controls the transfer of the incoming message
+ into the address space of the appropriate application.
+ The interrupt latency, the time from the initiation of an interrupt until
+ the interrupt handler is running, is fairly significant.
+ To avoid this cost, some modern NICs have processors that can be programmed
+ to implement part of a message passing protocol.
+ Given a properly designed protocol, it is possible to program the NIC to
+ control the transfer of incoming messages, without needing to interrupt
+ the CPU.
+ Because this strategy does not need to involve the OS on every message
+ transfer, it is frequently called 
+\begin_inset Quotes eld
+\end_inset 
+
+OS Bypass.
+\begin_inset Quotes erd
+\end_inset 
+
+ ST\SpecialChar ~
+
+\begin_inset LatexCommand \cite{ST}
+
+\end_inset 
+
+, VIA\SpecialChar ~
+
+\begin_inset LatexCommand \cite{VIA}
+
+\end_inset 
+
+, FM\SpecialChar ~
+
+\begin_inset LatexCommand \cite{FM2}
+
+\end_inset 
+
+, GM\SpecialChar ~
+
+\begin_inset LatexCommand \cite{GM}
+
+\end_inset 
+
+, and Portals are examples of OS Bypass protocols.
+\layout Standard
+
+Many protocols that support OS Bypass still require that the application
+ actively participate in the protocol to ensure progress.
+ As an example, the long message protocol of PM requires that the application
+ receive and reply to a request to put or get a long message.
+ This complicates the runtime environment, requiring a thread to process
+ incoming requests, and significantly increases the latency required to
+ initiate a long message protocol.
+ The Portals message passing protocol does not require activity on the part
+ of the application to ensure progress.
+ We use the term 
+\begin_inset Quotes eld
+\end_inset 
+
+Application Bypass
+\begin_inset Quotes erd
+\end_inset 
+
+ to refer to this aspect of the Portals protocol.
+\layout Section
+
+Faults 
+\layout Standard
+
+Given the number of components that we are dealing with and the fact that
+ we are interested in supporting applications that run for very long times,
+ failures are inevitable.
+ The Portals API recognizes that the underlying transport may not be able
+ to successfully complete an operation once it has been initiated.
+ This is reflected in the fact that the Portals API reports three types
+ of events: events indicating the initiation of an operation, events indicating
+ the successful completion of an operation, and events indicating the unsuccessf
+ul completion of an operation.
+ Every initiation event is eventually followed by a successful completion
+ event or an unsuccessful completion event.
+\layout Standard
+
+Between the time an operation is started and the time that the operation
+ completes (successfully or unsuccessfully), any memory associated with
+ the operation should be considered volatile.
+ That is, the memory may be changed in unpredictable ways while the operation
+ is progressing.
+ Once the operation completes, the memory associated with the operation
+ will not be subject to further modification (from this operation).
+ Notice that unsuccessful operations may alter memory in an essentially
+ unpredictable fashion.
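
The pairing rule above can be illustrated with a small sketch. The event names and types below are placeholders invented for exposition; they are not the PTL_EVENT_* constants or the ptl_event_t structure defined later in this document.

    /* Placeholder event model: every operation produces a start event and
     * exactly one completion event, successful or not.  The buffer tied to
     * the operation must be treated as volatile until the completion event
     * has been seen. */
    enum ev_kind { EV_START, EV_END_OK, EV_END_FAIL };

    struct ev {
        enum ev_kind kind;
        void        *buffer;    /* memory associated with the operation */
    };

    /* next_event() stands in for whatever event-queue wait call is used. */
    extern struct ev next_event(void);

    void wait_for_completion(void)
    {
        for (;;) {
            struct ev e = next_event();

            if (e.kind == EV_START)
                continue;       /* buffer is volatile from here on */
            if (e.kind == EV_END_OK)
                break;          /* buffer stable, contents valid */
            break;              /* EV_END_FAIL: buffer stable, contents undefined */
        }
    }
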
+\layout Chapter
+
+An Overview of the Portals API
+\begin_inset LatexCommand \label{sec:apiover}
+
+\end_inset 
+
+
+\layout Standard
+
+In this section, we give a conceptual overview of the Portals API.
+ The goal is to provide a context for understanding the detailed description
+ of the API presented in the next section.
+\layout Section
+
+Data Movement
+\begin_inset LatexCommand \label{sec:dmsemantics}
+
+\end_inset 
+
+
+\layout Standard
+
+A Portal represents an opening in the address space of a process.
+ Other processes can use a Portal to read (get) or write (put) the memory
+ associated with the portal.
+ Every data movement operation involves two processes, the 
+\series bold 
+initiator
+\series default 
+ and the 
+\series bold 
+target
+\series default 
+.
+ The initiator is the process that initiates the data movement operation.
+ The target is the process that responds to the operation by either accepting
+ the data for a put operation, or replying with the data for a get operation.
+\layout Standard
+
+In this discussion, activities attributed to a process may refer to activities
+ that are actually performed by the process or 
+\emph on 
+on behalf of the process
+\emph default 
+.
+ The inclusiveness of our terminology is important in the context of 
+\emph on 
+application bypass
+\emph default 
+.
+ In particular, when we note that the target sends a reply in the case of
+ a get operation, it is possible that the reply will be generated by another
+ component in the system, bypassing the application.
+\layout Standard
+
+Figures\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:put}
+
+\end_inset 
+
+ and 
+\begin_inset LatexCommand \ref{fig:get}
+
+\end_inset 
+
+ present graphical interpretations of the Portal data movement operations:
+ put and get.
+ In the case of a put operation, the initiator sends a put request message
+ containing the data to the target.
+ The target translates the Portal addressing information in the request
+ using its local Portal structures.
+ When the request has been processed, the target optionally sends an acknowledge
+ment message.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename put.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 218pt
+       lyxheight 119pt
+\end_inset 
+
+
+\layout Caption
+
+Portal Put (Send)
+\begin_inset LatexCommand \label{fig:put}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+In the case of a get operation, the initiator sends a get request to the
+ target.
+ As with the put operation, the target translates the Portal addressing
+ information in the request using its local Portal structures.
+ Once it has translated the Portal addressing information, the target sends
+ a reply that includes the requested data.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename get.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 218pt
+       lyxheight 119pt
+\end_inset 
+
+
+\layout Caption
+
+Portal Get
+\begin_inset LatexCommand \label{fig:get}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+We should note that Portal address translations are only performed on nodes
+ that respond to operations initiated by other nodes.
+ Acknowledgements and replies to get operations bypass the portals address
+ translation structures.
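
As a rough illustration of the two exchanges above, the following sketch models the target side of a put and a get. None of these types or helpers belong to the Portals API; they are assumptions made for exposition, with translate() standing in for the address translation described in the next section.

    #include <stddef.h>
    #include <string.h>

    struct request {
        int         is_put;     /* put (write) or get (read) */
        int         wants_ack;  /* put only: acknowledgement requested */
        const void *data;       /* put only: payload carried by the request */
        size_t      length;
    };

    /* Hypothetical helpers standing in for pieces of a real implementation. */
    extern void *translate(const struct request *req);
    extern void  send_ack(const struct request *req);
    extern void  send_reply(const void *mem, size_t length);

    void handle_request(const struct request *req)
    {
        void *mem = translate(req);     /* local portal structures */

        if (mem == NULL)
            return;                     /* no match: request is discarded */

        if (req->is_put) {
            memcpy(mem, req->data, req->length);   /* accept the data */
            if (req->wants_ack)
                send_ack(req);                     /* acknowledgement is optional */
        } else {
            send_reply(mem, req->length);          /* reply carries the data */
        }
    }
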
+\layout Section
+
+Portal Addressing
+\begin_inset LatexCommand \label{subsec:paddress}
+
+\end_inset 
+
+
+\layout Standard
+
+One-sided data movement models (e.g., shmem\SpecialChar ~
+
+\begin_inset LatexCommand \cite{CraySHMEM}
+
+\end_inset 
+
+, ST\SpecialChar ~
+
+\begin_inset LatexCommand \cite{ST}
+
+\end_inset 
+
+, MPI-2\SpecialChar ~
+
+\begin_inset LatexCommand \cite{MPI2}
+
+\end_inset 
+
+) typically use a triple to address memory on a remote node.
+ This triple consists of a process id, memory buffer id, and offset.
+ The process id identifies the target process, the memory buffer id specifies
+ the region of memory to be used for the operation, and the offset specifies
+ an offset within the memory buffer.
+\layout Standard
+
+In addition to the standard address components (process id, memory buffer
+ id, and offset), a Portal address includes a set of match bits.
+ This addressing model is appropriate for supporting one-sided operations
+ as well as traditional two-sided message passing operations.
+ Specifically, the Portals API provides the flexibility needed for an efficient
+ implementation of MPI-1, which defines two-sided operations with one-sided
+ completion semantics.
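
A minimal sketch of the resulting address follows; the field names and widths are chosen for illustration only and are not the ptl_* types defined later in this document.

    #include <stdint.h>

    /* A portal address extends the usual one-sided triple
     * (process id, memory buffer id, offset) with a set of match bits. */
    struct portal_address {
        uint32_t process_id;    /* routes the message to the target process */
        uint32_t portal_index;  /* memory buffer id: index into the portal table */
        uint64_t match_bits;    /* compared against the target's match lists */
        uint64_t offset;        /* offset within the selected memory region */
    };
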
+\layout Standard
+
+Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:portals}
+
+\end_inset 
+
+ presents a graphical representation of the structures used by a target
+ in the interpretation of a Portal address.
+ The process id is used to route the message to the appropriate node and
+ is not reflected in this diagram.
+ The memory buffer id, called the 
+\series bold 
+portal id
+\series default 
+, is used as an index into the Portal table.
+ Each element of the Portal table identifies a match list.
+ Each element of the match list specifies two bit patterns: a set of 
+\begin_inset Quotes eld
+\end_inset 
+
+don't care
+\begin_inset Quotes erd
+\end_inset 
+
+ bits, and a set of 
+\begin_inset Quotes eld
+\end_inset 
+
+must match
+\begin_inset Quotes erd
+\end_inset 
+
+ bits.
+ In addition to the two sets of match bits, each match list element has
+ at most one memory descriptor.
+ Each memory descriptor identifies a memory region and an optional event
+ queue.
+ The memory region specifies the memory to be used in the operation and
+ the event queue is used to record information about these operations.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename portals.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 305pt
+       lyxheight 106pt
+\end_inset 
+
+
+\layout Caption
+
+Portal Addressing Structures
+\begin_inset LatexCommand \label{fig:portals}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:flow}
+
+\end_inset 
+
+ illustrates the steps involved in translating a Portal address, starting
+ from the first element in a match list.
+ If the match criteria specified in the match list entry are met and the
+ memory descriptor accepts the operation
+\begin_inset Foot
+collapsed true
+
+\layout Standard
+
+Memory descriptors can reject operations because a threshold has been exceeded
+ or because the memory region does not have sufficient space, see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+
+, the operation (put or get) is performed using the memory region specified
+ in the memory descriptor.
+ If the memory descriptor specifies that it is to be unlinked when a threshold
+ has been exceeded, the match list entry is removed from the match list
+ and the resources associated with the memory descriptor and match list
+ entry are reclaimed.
+ Finally, if there is an event queue specified in the memory descriptor,
+ the operation is logged in the event queue.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename flow_new.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 447pt
+       lyxheight 282pt
+\end_inset 
+
+
+\layout Caption
+
+Portals Address Translation
+\begin_inset LatexCommand \label{fig:flow}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+If the match criteria specified in the match list entry are not met, or
+ there is no memory descriptor associated with the match list entry, or
+ the memory descriptor associated with the match list entry rejects the
+ operation, the address translation continues with the next match list entry.
+ If the end of the match list has been reached, the address translation
+ is aborted and the incoming request is discarded.
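
The walk described above can be summarized with a short pseudocode sketch. The types and helpers are illustrative assumptions, not Portals data structures; struct request is the same placeholder used in the data movement sketch earlier.

    struct request;                     /* placeholder incoming operation */
    struct mem_desc;                    /* placeholder memory descriptor */

    struct match_entry {
        struct match_entry *next;
        struct mem_desc    *md;         /* at most one descriptor, may be NULL */
    };

    /* Hypothetical helpers for the three decisions made at each entry. */
    extern int  criteria_match(const struct match_entry *me,
                               const struct request *req);
    extern int  md_accepts(const struct mem_desc *md,
                           const struct request *req);
    extern void perform_and_log(struct mem_desc *md,
                                const struct request *req);

    /* Returns 1 if the request was delivered, 0 if it was discarded. */
    int translate_request(struct match_entry *head, const struct request *req)
    {
        struct match_entry *me;

        for (me = head; me != NULL; me = me->next) {
            if (!criteria_match(me, req))
                continue;               /* match bits do not match */
            if (me->md == NULL || !md_accepts(me->md, req))
                continue;               /* threshold or space rejection */
            perform_and_log(me->md, req);   /* do the put/get, log the event */
            return 1;
        }
        return 0;                       /* end of list: discard the request */
    }
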
+\layout Section
+
+Access Control
+\layout Standard
+
+A process can control access to its portals using an access control list.
+ Each entry in the access control list specifies a process id and a Portal
+ table index.
+ The access control list is actually an array of entries.
+ Each incoming request includes an index into the access control list (i.e.,
+ a 
+\begin_inset Quotes eld
+\end_inset 
+
+cookie
+\begin_inset Quotes erd
+\end_inset 
+
+ or hint).
+ If the id of the process issuing the request doesn't match the id specified
+ in the access control list entry or the Portal table index specified in
+ the request doesn't match the Portal table index specified in the access
+ control list entry, the request is rejected.
+ Process identifiers and Portal table indexes may include wild card values
+ to increase the flexibility of this mechanism.
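
A minimal sketch of this check follows, assuming a placeholder ACL_ANY wildcard and illustrative field types rather than the Portals definitions.

    #include <stdint.h>

    #define ACL_ANY UINT32_MAX          /* placeholder wildcard value */

    struct acl_entry {
        uint32_t process_id;            /* who may use this entry, or ACL_ANY */
        uint32_t portal_index;          /* which portal it covers, or ACL_ANY */
    };

    /* The incoming request carries acl_index (the "cookie").  Returns 1 if
     * the request is allowed, 0 if it must be rejected. */
    int acl_allows(const struct acl_entry *acl, uint32_t nentries,
                   uint32_t acl_index, uint32_t sender_id, uint32_t portal_index)
    {
        const struct acl_entry *e;

        if (acl_index >= nentries)
            return 0;                   /* bad cookie */
        e = &acl[acl_index];
        if (e->process_id != ACL_ANY && e->process_id != sender_id)
            return 0;                   /* request not from the allowed process */
        if (e->portal_index != ACL_ANY && e->portal_index != portal_index)
            return 0;                   /* request names the wrong portal */
        return 1;
    }
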
+\layout Standard
+
+Two aspects of this design merit further discussion.
+ First, the model assumes that the information in a message header, the
+ sender's id in particular, is trustworthy.
+ In most contexts, we assume that the entity that constructs the header
+ is trustworthy; however, using cryptographic techniques, we could easily
+ devise a protocol that would ensure the authenticity of the sender.
+\layout Standard
+
+Second, because the access check is performed by the receiver, it is possible
+ that a malicious process will generate thousands of messages that will
+ be denied by the receiver.
+ This could saturate the network and/or the receiver, resulting in a 
+\emph on 
+denial of service
+\emph default 
+ attack.
+ Moving the check to the sender, using capabilities, would remove the potential
+ for this form of attack.
+ However, the solution introduces the complexities of capability management
+ (exchange of capabilities, revocation, protections, etc).
+\layout Section
+
+Multi-threaded Applications
+\begin_inset LatexCommand \label{sec:threads}
+
+\end_inset 
+
+\layout Standard
+
+The Portals API supports a generic view of multi-threaded applications.
+ From the perspective of the Portals API, an application program is defined
+ by a set of processes.
+ Each process defines a unique address space.
+ The Portals API defines access to this address space from other processes
+ (using portals addressing and the data movement operations).
+ A process may have one or more 
+\emph on 
+threads
+\emph default 
+ executing in its address space.
+\layout Standard
+
+With the exception of 
+\emph on 
+PtlEQWait
+\emph default 
+ every function in the Portals API is non-blocking and atomic with respect
+ to both other threads and external operations that result from data movement
+ operations.
+ While individual operations are atomic, sequences of these operations may
+ be interleaved between different threads and with external operations.
+ The Portals API does not provide any mechanisms to control this interleaving.
+ It is expected that these mechanisms will be provided by the API used to
+ create threads.
+\layout Chapter
+
+The Portals API
+\begin_inset LatexCommand \label{sec:api}
+
+\end_inset 
+
+
+\layout Section
+
+Naming Conventions
+\begin_inset LatexCommand \label{sec:conv}
+
+\end_inset 
+
+
+\layout Standard
+
+The Portals API defines two types of entities: functions and types.
+ Function names always start with 
+\emph on 
+Ptl
+\emph default 
+ and use mixed upper and lower case.
+ When used in the body of this report, function names appear in italic face,
+ e.g., 
+\emph on 
+PtlInit
+\emph default 
+.
+ The functions associated with an object type will have names that start
+ with 
+\emph on 
+Ptl
+\emph default 
+, followed by the two letter object type code shown in Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:objcodes}
+
+\end_inset 
+
+.
+ As an example, the function 
+\emph on 
+PtlEQAlloc
+\emph default 
+ allocates resources for an event queue.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Object Type Codes
+\begin_inset LatexCommand \label{tab:objcodes}
+
+\end_inset 
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+\backslash 
+medskip
+\newline 
+  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="5" columns="3">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\emph on 
+xx
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Name 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Section 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+EQ 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Event Queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ MD 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Memory Descriptor 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ ME 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Match list Entry 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ NI 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Network Interface 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Type names use lower case with underscores to separate words.
+ Each type name starts with 
+\family typewriter 
+ptl
+\family default 
+_ and ends with 
+\family typewriter 
+_t
+\family default 
+.
+ When used in the body of this report, type names appear in a fixed font,
+ e.g., 
+\family typewriter 
+ptl_match_bits_t
+\family default 
+.
+\layout Standard
+
+Names for constants use upper case with underscores to separate words.
+ Each constant name starts with 
+\family typewriter 
+PTL_
+\family default 
+.
+ When used in the body of this report, constant names appear in a fixed font,
+ e.g., 
+\family typewriter 
+PTL_OK
+\family default 
+.
+\layout Section
+
+Base Types
+\layout Standard
+
+The Portals API defines a variety of base types.
+ These types represent a simple renaming of the base types provided by the
+ C programming language.
+ In most cases these new type names have been introduced to improve type
+ safety and to avoid issues arising from differences in representation sizes
+ (e.g., 16-bit or 32-bit integers).
+\layout Subsection
+
+Sizes
+\begin_inset LatexCommand \label{sec:size-t}
+
+\end_inset 
+
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_size_t
+\family default 
+ is an unsigned 64-bit integral type used for representing sizes.
+\layout Subsection
+
+Handles
+\begin_inset LatexCommand \label{sec:handle-type}
+
+\end_inset 
+
+\layout Standard
+
+Objects maintained by the API are accessed through handles.
+ Handle types have names of the form 
+\family typewriter 
+ptl_handle_
+\emph on 
+xx
+\emph default 
+_t
+\family default 
+, where 
+\emph on 
+xx
+\emph default 
+ is one of the two letter object type codes shown in Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:objcodes}
+
+\end_inset 
+
+.
+ For example, the type 
+\family typewriter 
+ptl_handle_ni_t
+\family default 
+ is used for network interface handles.
+\layout Standard
+
+Each type of object is given a unique handle type to enhance type checking.
+ The type, 
+\family typewriter 
+ptl_handle_any_t
+\family default 
+, can be used when a generic handle is needed.
+ Every handle value can be converted into a value of type 
+\family typewriter 
+ptl_handle_any_t
+\family default 
+ without loss of information.
+\layout Standard
+
+Handles are not simple values.
+ Every Portals object is associated with a specific network interface, and
+ an identifier for this interface (along with an object identifier) is part
+ of the handle for the object.
+\layout Standard
+
+The special value 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+, of type 
+\family typewriter 
+ptl_handle_eq_t
+\family default 
+, is used to indicate the absence of an event queue.
+ See sections 
+\begin_inset LatexCommand \ref{sec:mdfree}
+
+\end_inset 
+
+ and\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdupdate}
+
+\end_inset 
+
+ for uses of this value.
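+\layout Standard
+
+The following sketch (variable names are hypothetical) declares handles using
+ the per-object handle types and uses the sentinel value described above:
+\layout LyX-Code
+
+ptl_handle_ni_t ni;   /* network interface handle */
+\newline 
+ptl_handle_me_t me;   /* match list entry handle  */
+\newline 
+ptl_handle_eq_t eq;   /* event queue handle       */
+\newline 
+
+\newline 
+eq = PTL_EQ_NONE;     /* indicates the absence of an event queue */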
+\layout Subsection
+
+Indexes
+\begin_inset LatexCommand \label{sec:index-type}
+
+\end_inset 
+
+\layout Standard
+
+The types 
+\family typewriter 
+ptl_pt_index_t
+\family default 
+ and 
+\family typewriter 
+ptl_ac_index_t
+\family default 
+ are integral types used for representing Portal table indexes and access
+ control tables indexes, respectively.
+ See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+ for limits on values of these types.
+\layout Subsection
+
+Match Bits
+\begin_inset LatexCommand \label{sec:mb-type}
+
+\end_inset 
+
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_match_bits_t
+\family default 
+ is capable of holding unsigned 64-bit integer values.
+\layout Subsection
+
+Network Interfaces
+\begin_inset LatexCommand \label{sec:ni-type}
+
+\end_inset 
+
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_interface_t
+\family default 
+ is an integral type used for identifying different network interfaces.
+ Users will need to consult the local documentation to determine appropriate
+ values for the interfaces available.
+ The special value 
+\family typewriter 
+PTL_IFACE_DEFAULT
+\family default 
+ identifies the default interface.
+\layout Subsection
+
+Identifiers
+\begin_inset LatexCommand \label{sec:id-type}
+
+\end_inset 
+
+
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_nid_t
+\family default 
+ is an integral type used for representing node ids
+\family typewriter 
+, ptl_pid_t
+\family default 
+ is an integral type for representing process ids, and 
+\family typewriter 
+ptl_uid_t 
+\family default 
+is an integral type for representing user ids.
+\layout Standard
+
+The special values 
+\family typewriter 
+PTL_PID_ANY
+\family default 
+, 
+\family typewriter 
+PTL_NID_ANY
+\family default 
+, and 
+\family typewriter 
+PTL_UID_ANY
+\family default 
+ match any process identifier, any node identifier, and any user identifier,
+ respectively.
+ See sections 
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+ and\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+ for uses of these values.
+\layout Subsection
+
+Status Registers
+\begin_inset LatexCommand \label{sec:stat-type}
+
+\end_inset 
+
+
+\layout Standard
+
+Each network interface maintains an array of status registers that can be
+ accessed using the 
+\family typewriter 
+PtlNIStatus
+\family default 
+ function (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:nistatus}
+
+\end_inset 
+
+).
+ The type 
+\family typewriter 
+ptl_sr_index_t
+\family default 
+ defines the types of indexes that can be used to access the status registers.
+ The only index defined for all implementations is 
+\family typewriter 
+PTL_SR_DROP_COUNT
+\family default 
+ which identifies the status register that counts the dropped requests for
+ the interface.
+ Other indexes (and registers) may be defined by the implementation.
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_sr_value_t
+\family default 
+ defines the types of values held in status registers.
+ This is a signed integer type.
+ The size is implementation dependent, but must be at least 32 bits.
+\layout Section
+
+Initialization and Cleanup
+\begin_inset LatexCommand \label{sec:init}
+
+\end_inset 
+
+
+\layout Standard
+
+The Portals API includes a function, 
+\emph on 
+PtlInit
+\emph default 
+, to initialize the library and a function, 
+\emph on 
+PtlFini
+\emph default 
+, to clean up after the application is done using the library.
+\layout Subsection
+
+PtlInit
+\begin_inset LatexCommand \label{sec:ptlinit}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlInit( int *max_interfaces );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlInit
+\emph default 
+ function initializes the Portals library.
+ PtlInit must be called at least once by a process before any thread makes
+ a Portals function call, but may be safely called more than once.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_FAIL Indicates an error during initialization.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+max_interfaces
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="5in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+max_interfaces
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the maximum number of interfaces
+ that can be initialized.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlFini
+\begin_inset LatexCommand \label{sec:ptlfini}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+void PtlFini( void );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlFini
+\emph default 
+ function cleans up after the Portals library is no longer needed by a process.
+ After this function is called, calls to any of the functions defined by
+ the Portals API or use of the structures set up by the Portals API will
+ result in undefined behavior.
+ This function should be called once and only once during termination by
+ a process.
+ Typically, this function will be called in the exit sequence of a process.
+ Individual threads should not call PtlFini when they terminate.
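+\layout Standard
+
+The following sketch outlines the intended lifetime of the library within
+ a process; the error handling shown is only illustrative:
+\layout LyX-Code
+
+int max_interfaces;
+\newline 
+int rc;
+\newline 
+
+\newline 
+rc = PtlInit( &max_interfaces );
+\newline 
+if (rc != PTL_OK) {
+\newline 
+    /* the library could not be initialized */
+\newline 
+}
+\newline 
+
+\newline 
+/* ... Portals operations ... */
+\newline 
+
+\newline 
+PtlFini();   /* exactly once, during process termination */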
+\layout Section
+
+Network Interfaces
+\begin_inset LatexCommand \label{sec:ni}
+
+\end_inset 
+
+\layout Standard
+
+The Portals API supports the use of multiple network interfaces.
+ However, each interface is treated as an independent entity.
+ Combining interfaces (e.g., 
+\begin_inset Quotes eld
+\end_inset 
+
+bonding
+\begin_inset Quotes erd
+\end_inset 
+
+ to create a higher bandwidth connection) must be implemented by the application
+ or embedded in the underlying network.
+ Interfaces are treated as independent entities to make it easier to cache
+ information on individual network interface cards.
+\layout Standard
+
+Once initialized, each interface provides a Portal table, an access control
+ table, and a collection of status registers.
+ See Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+ for a discussion of updating Portal table entries using the 
+\emph on 
+PtlMEAttach
+\emph default 
+ function.
+ See Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ac}
+
+\end_inset 
+
+ for a discussion of the initialization and updating of entries in the access
+ control table.
+ See Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:nistatus}
+
+\end_inset 
+
+ for a discussion of the 
+\emph on 
+PtlNIStatus
+\emph default 
+ function which can be used to determine the value of a status register.
+\layout Standard
+
+Every other type of Portal object (e.g., memory descriptor, event queue, or
+ match list entry) is associated with a specific network interface.
+ The association to a network interface is established when the object is
+ created and is encoded in the handle for the object.
+\layout Standard
+
+Each network interface is initialized and shut down independently.
+ The initialization routine, 
+\emph on 
+PtlNIInit
+\emph default 
+, returns a handle for an interface object which is used in all subsequent
+ Portal operations.
+ The 
+\emph on 
+PtlNIFini
+\emph default 
+ function is used to shut down an interface and release any resources that
+ are associated with the interface.
+ Network interface handles are associated with processes, not threads.
+ All threads in a process share all of the network interface handles.
+\layout Standard
+
+The Portals API also defines the 
+\emph on 
+PtlNIStatus
+\emph default 
+ function to query the status registers for a network interface, the 
+\emph on 
+PtlNIDist
+\emph default 
+ function to determine the 
+\begin_inset Quotes eld
+\end_inset 
+
+distance
+\begin_inset Quotes erd
+\end_inset 
+
+ to another process, and the 
+\emph on 
+PtlNIHandle
+\emph default 
+ function to determine the network interface that an object is associated
+ with.
+\layout Subsection
+
+PtlNIInit
+\begin_inset LatexCommand \label{sec:niinit}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef struct {
+\newline 
+    int            max_match_entries;
+\newline 
+    int            max_mem_descriptors;
+\newline 
+    int            max_event_queues;
+\newline 
+    ptl_ac_index_t max_atable_index; 
+\newline 
+    ptl_pt_index_t max_ptable_index;
+\newline 
+} ptl_ni_limits_t;
+\newline 
+
+\newline 
+int PtlNIInit( ptl_interface_t  interface,
+\newline 
+               ptl_pid_t        pid,
+\newline 
+               ptl_ni_limits_t* desired,
+\newline 
+               ptl_ni_limits_t* actual,
+\newline 
+               ptl_handle_ni_t* handle );
+\layout Standard
+
+Values of type 
+\family typewriter 
+ptl_ni_limits_t
+\family default 
+ include the following members:
+\layout Description
+
+max_match_entries Maximum number of match entries that can be allocated
+ at any one time.
+\layout Description
+
+max_mem_descriptors Maximum number of memory descriptors that can be allocated
+ at any one time.
+\layout Description
+
+max_event_queues Maximum number of event queues that can be allocated at
+ any one time.
+\layout Description
+
+max_atable_index Largest access control table index for this interface,
+ valid indexes range from zero to 
+\family typewriter 
+max_atable_index
+\family default 
+, inclusive.
+\layout Description
+
+max_ptable_index Largest Portal table index for this interface, valid indexes
+ range from zero to 
+\family typewriter 
+max_ptable_index
+\family default 
+, inclusive.
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIInit
+\emph default 
+ function is used to initialize the Portals API for a network interface.
+ This function must be called at least once by each process before any thread
+ in that process performs any other operation that applies to the interface.
+ For subsequent calls to 
+\shape italic 
+PtlNIInit
+\shape default 
+ from within the same process (either by different threads or the same thread),
+ the desired limits will be ignored and the call will return the existing
+ NI handle.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INIT_DUP Indicates a duplicate initialization of 
+\family typewriter 
+interface
+\family default 
+.
+\layout Description
+
+PTL_INIT_INV Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to initialize the
+ interface.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+pid
+\family default 
+ is not a valid process id.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+actual 
+\family default 
+or
+\family typewriter 
+ handle
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="5" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the network interface to be initialized.
+  (See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ni-type}
+
+\end_inset 
+
+ for a discussion of  values used to identify network interfaces.)
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+pid
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the desired process id (for well known process ids).
+ The value 
+\family typewriter 
+PTL_PID_ANY
+\family default 
+ may be used to have the process id assigned by the underlying library.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+desired
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+If non-NULL, points to a structure that holds the desired limits.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+actual
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, the location pointed to by actual will hold the actual
+ limits.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a  handle for the interface.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+The use of desired is implementation dependent.
+ In particular, an implementation may choose to ignore this argument.
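+\layout Standard
+
+The following sketch initializes the default interface with illustrative
+ limits (which, as noted above, an implementation may ignore):
+\layout LyX-Code
+
+ptl_ni_limits_t desired, actual;
+\newline 
+ptl_handle_ni_t ni;
+\newline 
+int             rc;
+\newline 
+
+\newline 
+desired.max_match_entries   = 1024;  /* illustrative limits */
+\newline 
+desired.max_mem_descriptors = 1024;
+\newline 
+desired.max_event_queues    = 16;
+\newline 
+desired.max_atable_index    = 7;
+\newline 
+desired.max_ptable_index    = 15;
+\newline 
+
+\newline 
+rc = PtlNIInit( PTL_IFACE_DEFAULT, PTL_PID_ANY, &desired, &actual, &ni );
+\newline 
+if (rc != PTL_OK) {
+\newline 
+    /* the interface could not be initialized */
+\newline 
+}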
+\layout Subsection
+
+PtlNIFini
+\begin_inset LatexCommand \label{sec:nifini}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlNIFini( ptl_handle_ni_t interface );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIFini
+\emph default 
+ function is used to release the resources allocated for a network interface.
+ Once the 
+\emph on 
+PtlNIFini
+\emph default 
+ operation has been started, the results of pending API operations (e.g.,
+ operations initiated by another thread) for this interface are undefined.
+ Similarly, the effects of incoming operations (puts and gets) or return
+ values (acknowledgements and replies) for this interface are undefined.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+A handle for the interface to shut down.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
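+\layout Standard
+
+A matching shutdown call, assuming ni is the handle returned by PtlNIInit,
+ might look as follows:
+\layout LyX-Code
+
+rc = PtlNIFini( ni );   /* release the resources held by this interface */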
+\layout Subsection
+
+PtlNIStatus
+\begin_inset LatexCommand \label{sec:nistatus}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlNIStatus( ptl_handle_ni_t interface,
+\newline 
+                 ptl_sr_index_t  status_register,
+\newline 
+                 ptl_sr_value_t* status );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIStatus
+\emph default 
+ function returns the value of a status register for the specified interface.
+ (See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:stat-type}
+
+\end_inset 
+
+ for more information on status register indexes and status register values.)
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_INV_SR_INDX Indicates that 
+\family typewriter 
+status_register
+\family default 
+ is not a valid status register.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+status
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="3" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+status_register
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+An index for the status register to read.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+status
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the current value of the status
+ register.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+The only status register that must be defined is a drop count register (
+\family typewriter 
+PTL_SR_DROP_COUNT
+\family default 
+).
+ Implementations may define additional status registers.
+ Identifiers for the indexes associated with these registers should start
+ with the prefix 
+\family typewriter 
+PTL_SR_
+\family default 
+.
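+\layout Standard
+
+For example, the drop count register could be read as follows (ni is a network
+ interface handle returned by PtlNIInit):
+\layout LyX-Code
+
+ptl_sr_value_t dropped;
+\newline 
+int            rc;
+\newline 
+
+\newline 
+rc = PtlNIStatus( ni, PTL_SR_DROP_COUNT, &dropped );
+\newline 
+if (rc == PTL_OK) {
+\newline 
+    /* dropped holds the number of requests dropped by this interface */
+\newline 
+}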
+\layout Subsection
+
+PtlNIDist
+\layout LyX-Code
+
+int PtlNIDist( ptl_handle_ni_t  interface,
+\newline 
+               ptl_process_id_t process,
+\newline 
+               unsigned long*   distance );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIDist
+\emph default 
+ function returns the distance to another process using the specified interface.
+ Distances are only defined relative to an interface.
+ Distance comparisons between different interfaces on the same process may
+ be meaningless.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+process
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+distance
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="3" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+process
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+An identifier for the process whose distance is being  requested.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+distance
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the  distance to the remote
+ process.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+This function should return a static measure of distance.
+ Examples include minimum latency, the inverse of available bandwidth, or
+ the number of switches between the two endpoints.
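+\layout Standard
+
+A brief sketch follows; the node id and process id used to fill in the peer
+ are placeholders:
+\layout LyX-Code
+
+ptl_process_id_t peer;
+\newline 
+unsigned long    distance;
+\newline 
+int              rc;
+\newline 
+
+\newline 
+peer.nid = 0;   /* placeholder node id    */
+\newline 
+peer.pid = 0;   /* placeholder process id */
+\newline 
+
+\newline 
+rc = PtlNIDist( ni, peer, &distance );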
+\layout Subsection
+
+PtlNIHandle
+\layout LyX-Code
+
+int PtlNIHandle( ptl_handle_any_t handle,
+\newline 
+                 ptl_handle_ni_t* interface );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIHandle
+\emph default 
+ function returns a handle for the network interface with which the object
+ identified by 
+\family typewriter 
+handle
+\family default 
+ is associated.
+ If the object identified by 
+\family typewriter 
+handle
+\family default 
+ is a network interface, this function returns the same value it is passed.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_HANDLE Indicates that 
+\family typewriter 
+handle
+\family default 
+ is not a valid handle.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the object.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a handle for the network interface
+ associated with 
+\family typewriter 
+handle
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+Every handle should encode the network interface and the object id relative
+ to this handle.
+ Both are presumably encoded using integer values.
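+\layout Standard
+
+For example, given a match entry handle me returned by PtlMEAttach, the owning
+ interface can be recovered as follows (the match entry handle is passed where
+ a generic handle is expected):
+\layout LyX-Code
+
+ptl_handle_ni_t owner;
+\newline 
+int             rc;
+\newline 
+
+\newline 
+rc = PtlNIHandle( me, &owner );   /* owner identifies the interface for me */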
+\layout Section
+
+User Identification
+\begin_inset LatexCommand \label{sec:uid}
+
+\end_inset 
+
+
+\layout Standard
+
+Every process runs on behalf of a user.
+\layout Subsection
+
+PtlGetUid
+\layout LyX-Code
+
+int PtlGetUid( ptl_handle_ni_t   ni_handle,
+\newline 
+               ptl_uid_t*        uid );
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+ni_handle
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+uid
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="5in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ni_handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A network interface handle.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+uid
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the user id for the calling
+ process.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+Note that user identifiers are dependent on the network interface(s).
+ In particular, if a node has multiple interfaces, a process may have multiple
+ user identifiers.
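+\layout Standard
+
+A minimal usage sketch, assuming ni_handle is a valid network interface handle:
+\layout LyX-Code
+
+ptl_uid_t uid;
+\newline 
+int       rc;
+\newline 
+
+\newline 
+rc = PtlGetUid( ni_handle, &uid );
+\newline 
+if (rc == PTL_OK) {
+\newline 
+    /* uid holds the user id for the calling process */
+\newline 
+}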
+\layout Section
+
+Process Identification
+\begin_inset LatexCommand \label{sec:pid}
+
+\end_inset 
+
+
+\layout Standard
+
+Processes that use the Portals API can be identified using a node id and
+ process id.
+ Every node accessible through a network interface has a unique node identifier
+ and every process running on a node has a unique process identifier.
+ As such, any process in the computing system can be identified by its node
+ id and process id.
+\layout Standard
+
+The Portals API defines a type, 
+\family typewriter 
+ptl_process_id_t
+\family default 
+ for representing process ids and a function, 
+\emph on 
+PtlGetId
+\emph default 
+, which can be used to obtain the id of the current process.
+\layout Comment
+
+The Portals API does not include thread identifiers.
+  Messages are delivered to processes (address spaces), not threads (contexts
+ of execution).
+\layout Subsection
+
+The Process Id Type
+\begin_inset LatexCommand \label{sec:pid-type}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef struct {
+\newline 
+    ptl_nid_t       nid; /* node id */
+\newline 
+    ptl_pid_t       pid; /* process id */
+\newline 
+} ptl_process_id_t;
+\layout Standard
+\noindent 
+The 
+\family typewriter 
+ptl_process_id_t
+\family default 
+ type uses two identifiers to represent a process id: a node id and a process
+ id.
+\layout Subsection
+
+PtlGetId
+\begin_inset LatexCommand \label{sub:PtlGetId}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlGetId( ptl_handle_ni_t   ni_handle,
+\newline 
+              ptl_process_id_t* id );
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+ni_handle
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+id
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="5in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ni_handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A network interface handle.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+id
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the id for the calling process.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+Note that process identifiers are dependent on the network interface(s).
+ In particular, if a node has multiple interfaces, it may have multiple
+ node identifiers.
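+\layout Standard
+
+A minimal usage sketch, assuming ni_handle is a valid network interface handle:
+\layout LyX-Code
+
+ptl_process_id_t id;
+\newline 
+int              rc;
+\newline 
+
+\newline 
+rc = PtlGetId( ni_handle, &id );
+\newline 
+if (rc == PTL_OK) {
+\newline 
+    /* id.nid and id.pid identify the calling process on this interface */
+\newline 
+}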
+\layout Section
+
+Match List Entries and Match Lists
+\begin_inset LatexCommand \label{sec:me}
+
+\end_inset 
+
+
+\layout Standard
+
+A match list is a chain of match list entries.
+ Each match list entry includes a memory descriptor and a set of match criteria.
+ The match criteria can be used to reject incoming requests based on process
+ id or the match bits provided in the request.
+ A match list is created using the 
+\emph on 
+PtlMEAttach
+\emph default 
+ or 
+\shape italic 
+PtlMEAttachAny
+\shape default 
+ functions, which create a match list consisting of a single match list
+ entry, attaches the match list to the specified Portal index, and returns
+ a handle for the match list entry.
+ Match entries can be dynamically inserted and removed from a match list
+ using the 
+\emph on 
+PtlMEInsert
+\emph default 
+ and 
+\emph on 
+PtlMEUnlink
+\emph default 
+ functions.
+\layout Subsection
+
+PtlMEAttach
+\begin_inset LatexCommand \label{sec:meattach}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef enum { PTL_RETAIN, PTL_UNLINK } ptl_unlink_t;
+\newline 
+
+\layout LyX-Code
+
+typedef enum { PTL_INS_BEFORE, PTL_INS_AFTER } ptl_ins_pos_t;
+\newline 
+
+\layout LyX-Code
+
+int PtlMEAttach( ptl_handle_ni_t  interface,
+\newline 
+                 ptl_pt_index_t   index,
+\newline 
+                 ptl_process_id_t matchid,
+\newline 
+                 ptl_match_bits_t match_bits,
+\newline 
+                 ptl_match_bits_t ignorebits,
+\newline 
+                 ptl_unlink_t     unlink,
+\newline 
+                 ptl_ins_pos_t    position,
+\newline 
+                 ptl_handle_me_t* handle );
+\layout Standard
+\noindent 
+Values of the type 
+\family typewriter 
+ptl_ins_pos_t
+\family default 
+ are used to control where a new item is inserted.
+ The value 
+\family typewriter 
+PTL_INS_BEFORE
+\family default 
+ is used to insert the new item before the current item or before the head
+ of the list.
+ The value 
+\family typewriter 
+PTL_INS_AFTER
+\family default 
+ is used to insert the new item after the current item or after the last
+ item in the list.
+\layout Standard
+
+The 
+\emph on 
+PtlMEAttach
+\emph default 
+ function creates a match list consisting of a single entry and attaches
+ this list to the Portal table for 
+\family typewriter 
+interface
+\family default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_PTINDEX Indicates that 
+\family typewriter 
+index
+\family default 
+ is not a valid Portal table index.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+matchid
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ match list entry.
+\layout Description
+
+PTL_ML_TOOLONG Indicates that the resulting match list is too long.
+ The maximum length for a match list is defined by the interface.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="7" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0.8in">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.75in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+index
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The Portal table index where the match list  should be attached.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+matchid
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Specifies the match criteria for the process id of the requestor.
+  The constants 
+\family typewriter 
+PTL_PID_ANY
+\family default 
+ and 
+\family typewriter 
+PTL_NID_ANY
+\family default 
+ can be used to  wildcard either of the ids in the 
+\family typewriter 
+ptl_process_id_t
+\family default 
+ structure.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+match_bits, ignorebits
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Specify the match criteria to apply  to the match bits in the incoming request.
+  The 
+\family typewriter 
+ignorebits
+\family default 
+ are used to mask out insignificant bits in the incoming match bits.
+  The resulting bits are then compared to the match list entry's match 
+ bits to determine if the incoming request meets the match criteria.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+unlink
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Indicates the match list entry should be unlinked when the last memory descriptor
+ associated with this match list entry is unlinked.
+  (Note, the check for unlinking a match entry  only occurs when a memory
+ descriptor is unlinked.) 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+position
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Indicates whether the new match entry should be prepended or appended to
+ the existing match list.
+ If there is no existing list, this argument is ignored and the new match
+ entry becomes the only entry in the list.
+ Allowed constants: 
+\family typewriter 
+PTL_INS_BEFORE
+\family default 
+, 
+\family typewriter 
+PTL_INS_AFTER
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a  handle for the newly created
+ match list entry.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
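+\layout Standard
+
+The following sketch attaches a single-entry match list that accepts requests
+ from any process; the Portal table index and match bits shown are illustrative:
+\layout LyX-Code
+
+ptl_process_id_t anyproc;
+\newline 
+ptl_handle_me_t  me;
+\newline 
+int              rc;
+\newline 
+
+\newline 
+anyproc.nid = PTL_NID_ANY;   /* accept requests from any node   */
+\newline 
+anyproc.pid = PTL_PID_ANY;   /* ... and from any process        */
+\newline 
+
+\newline 
+rc = PtlMEAttach( ni, 4, anyproc,            /* illustrative index 4    */
+\newline 
+                  0x1234, 0,                 /* match bits, ignore bits */
+\newline 
+                  PTL_UNLINK, PTL_INS_AFTER, &me );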
+\layout Subsection
+
+PtlMEAttachAny
+\begin_inset LatexCommand \label{sec:attachany}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMEAttachAny( ptl_handle_ni_t  interface,
+\newline 
+                    ptl_pt_index_t   *index,
+\newline 
+                    ptl_process_id_t matchid,
+\newline 
+                    ptl_match_bits_t match_bits,
+\newline 
+                    ptl_match_bits_t ignorebits,
+\newline 
+                    ptl_unlink_t     unlink,
+\newline 
+                    ptl_handle_me_t* handle );
+\layout Standard
+
+The 
+\emph on 
+PtlMEAttachAny
+\emph default 
+ function creates a match list consisting of a single entry and attaches
+ this list to an unused Portal table entry for 
+\family typewriter 
+interface
+\family default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+matchid
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ match list entry.
+\layout Description
+
+PTL_PT_FULL Indicates that there are no free entries in the Portal table.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="4" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0.8in">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.75in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+index
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the Portal index where the
+ match list  has been attached.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+matchid, match_bits, ignorebits, unlink
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+See the discussion for 
+\shape italic 
+PtlMEAttach
+\shape default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a  handle for the newly created
+ match list entry.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
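+\layout Standard
+
+Usage mirrors PtlMEAttach, except that the Portal table index is returned
+ rather than supplied (anyproc is as in the previous sketch):
+\layout LyX-Code
+
+ptl_pt_index_t  index;
+\newline 
+ptl_handle_me_t me;
+\newline 
+int             rc;
+\newline 
+
+\newline 
+rc = PtlMEAttachAny( ni, &index, anyproc, 0x1234, 0, PTL_UNLINK, &me );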
+\layout Subsection
+
+PtlMEInsert
+\begin_inset LatexCommand \label{sec:meinsert}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMEInsert( ptl_handle_me_t  current,
+\newline 
+                 ptl_process_id_t matchid,
+\newline 
+                 ptl_match_bits_t match_bits,
+\newline 
+                 ptl_match_bits_t ignorebits,
+\newline 
+                 ptl_ins_pos_t    position,
+\newline 
+                 ptl_handle_me_t* handle );
+\layout Standard
+
+The 
+\emph on 
+PtlMEInsert
+\emph default 
+ function creates a new match list entry and inserts this entry into the
+ match list containing 
+\family typewriter 
+current
+\family default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+matchid
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_INV_ME Indicates that 
+\family typewriter 
+current
+\family default 
+ is not a valid match entry handle.
+\layout Description
+
+PTL_ML_TOOLONG Indicates that the resulting match list is too long.
+ The maximum length for a match list is defined by the interface.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ match entry.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="4" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0.8in">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+current
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for a match entry.
+  The new match entry will be inserted immediately before or immediately
+ after this match entry.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+matchid
+\family default 
+, 
+\family typewriter 
+match_bits
+\family default 
+, 
+\family typewriter 
+ignorebits
+\family default 
+,  
+\family typewriter 
+unlink
+\family default 
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+See the discussion for
+\emph on 
+ PtlMEAttach
+\emph default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+position
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Indicates whether the new match entry should be inserted before or after
+ the 
+\family typewriter 
+current
+\family default 
+ entry.
+ Allowed constants: 
+\family typewriter 
+PTL_INS_BEFORE
+\family default 
+, 
+\family typewriter 
+PTL_INS_AFTER
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+See the discussion for 
+\emph on 
+PtlMEAttach
+\emph default 
+.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlMEUnlink
+\begin_inset LatexCommand \label{sec:meunlink}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMEUnlink( ptl_handle_me_t entry );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlMEUnlink
+\emph default 
+ function can be used to unlink a match entry from a match list.
+ This operation also releases any resources associated with the match entry
+ (including the associated memory descriptor).
+ It is an error to use the match entry handle after calling 
+\emph on 
+PtlMEUnlink
+\emph default 
+.
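+\layout Standard
+
+A minimal sketch of releasing a match entry follows; the handle is assumed
+ to come from an earlier attach or insert call and must not be used again
+ once the call succeeds, and a p30.h style header is assumed for the
+ declarations.
+\layout LyX-Code
+
+ #include <p30.h>  /* assumed header for the Portals declarations */
+\newline 
+/* Sketch: unlink a match entry; its handle is invalid afterwards. */
+\newline 
+int drop_entry( ptl_handle_me_t entry )
+\newline 
+{
+\newline 
+    int rc = PtlMEUnlink( entry );  /* also releases an attached descriptor */
+\newline 
+    return rc;                      /* PTL_OK or PTL_INV_ME */
+\newline 
+}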
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_ME Indicates that 
+\family typewriter 
+entry
+\family default 
+ is not a valid match entry handle.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+entry
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+A handle for the match entry to be unlinked.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Section
+
+Memory Descriptors
+\begin_inset LatexCommand \label{sec:md}
+
+\end_inset 
+
+
+\layout Standard
+
+A memory descriptor contains information about a region of an application
+ process' memory and an event queue where information about the operations
+ performed on the memory descriptor are recorded.
+ The Portals API provides two operations to create memory descriptors: 
+\emph on 
+PtlMDAttach
+\emph default 
+, and 
+\emph on 
+PtlMDBind
+\emph default 
+; an operation to update a memory descriptor, 
+\emph on 
+PtlMDUpdate
+\emph default 
+; and an operation to unlink and release the resources associated with a
+ memory descriptor, 
+\emph on 
+PtlMDUnlink
+\emph default 
+.
+\layout Subsection
+
+The Memory Descriptor Type
+\begin_inset LatexCommand \label{sec:md-type}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef struct {
+\newline 
+    void*           start;
+\newline 
+    ptl_size_t      length;
+\newline 
+    int             threshold;
+\newline 
+    unsigned int    max_offset;
+\newline 
+    unsigned int    options;
+\newline 
+    void*           user_ptr;
+\newline 
+    ptl_handle_eq_t eventq;
+\newline 
+} ptl_md_t;
+\layout Standard
+\noindent 
+The 
+\family typewriter 
+ptl_md_t
+\family default 
+ type defines the application view of a memory descriptor.
+ Values of this type are used to initialize and update the memory descriptors.
+\layout Subsubsection
+
+Members
+\layout Description
+
+start,\SpecialChar ~
+length Specify the memory region associated with the memory descriptor.
+ The 
+\family typewriter 
+start
+\family default 
+ member specifies the starting address for the memory region and the 
+\family typewriter 
+length
+\family default 
+ member specifies the length of the region.
+ The 
+\family typewriter 
+start
+\family default 
+ member can be NULL provided that the
+\family typewriter 
+ length
+\family default 
+ member is zero.
+ (Zero length buffers are useful to record events.) There are no alignment
+ restrictions on the starting address or the length of the region, although
+ unaligned messages may be slower (i.e., lower bandwidth and/or longer latency)
+ on some implementations.
+\layout Description
+
+threshold Specifies the maximum number of operations that can be performed
+ on the memory descriptor.
+ An operation is any action that could possibly generate an event (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+ for the different types of events).
+ In the usual case, the threshold value is decremented for each operation
+ on the memory descriptor.
+ When the threshold value is zero, the memory descriptor is 
+\emph on 
+inactive
+\emph default 
+, and does not respond to operations.
+ A memory descriptor can have an initial threshold value of zero to allow
+ for manipulation of an inactive memory descriptor by the local process.
+ A threshold value of 
+\family typewriter 
+PTL_MD_THRESH_INF
+\family default 
+ indicates that there is no bound on the number of operations that may be
+ applied to a memory descriptor.
+ Note that local operations (e.g., 
+\emph on 
+PtlMDUpdate
+\emph default 
+) are not applied to the threshold count.
+\layout Description
+
+max_offset Specifies the maximum local offset of a memory descriptor.
+ When the local offset of a memory descriptor exceeds this maximum, the
+ memory descriptor becomes 
+\shape italic 
+inactive
+\shape default 
+ and does not respond to further operations.
+\layout Description
+
+options Specifies the behavior of the memory descriptor.
+ There are five options that can be selected: enable put operations (yes
+ or no), enable get operations (yes or no), offset management (local or
+ remote), message truncation (yes or no), and acknowledgement (yes or no).
+ Values for this argument can be constructed using a bitwise or of the following
+ values: 
+\begin_deeper 
+\begin_deeper 
+\layout Description
+
+PTL_MD_OP_PUT Specifies that the memory descriptor will respond to 
+\emph on 
+put
+\emph default 
+ operations.
+ By default, memory descriptors reject 
+\emph on 
+put
+\emph default 
+ operations.
+\layout Description
+
+PTL_MD_OP_GET Specifies that the memory descriptor will respond to 
+\emph on 
+get
+\emph default 
+ operations.
+ By default, memory descriptors reject 
+\emph on 
+get
+\emph default 
+ operations.
+\layout Description
+
+PTL_MD_MANAGE_REMOTE Specifies that the offset used in accessing the memory
+ region is provided by the incoming request.
+ By default, the offset is maintained locally.
+ When the offset is maintained locally, the offset is incremented by the
+ length of the request so that the next operation (put and/or get) will
+ access the next part of the memory region.
+\layout Description
+
+PTL_MD_TRUNCATE Specifies that the length provided in the incoming request
+ can be reduced to match the memory available in the region.
+ (The memory available in a memory region is determined by subtracting the
+ offset from the length of the memory region.) By default, if the length
+ in the incoming operation is greater than the amount of memory available,
+ the operation is rejected.
+\layout Description
+
+PTL_MD_ACK_DISABLE Specifies that an acknowledgement should 
+\emph on 
+not
+\emph default 
+ be sent for incoming 
+\emph on 
+put
+\emph default 
+ operations, even if requested.
+ By default, acknowledgements are sent for 
+\emph on 
+put
+\emph default 
+ operations that request an acknowledgement.
+ Acknowledgements are never sent for 
+\emph on 
+get
+\emph default 
+ operations.
+ The value sent in the reply serves as an implicit acknowledgement.
+\end_deeper 
+\layout Standard
+
+
+\series bold 
+Note
+\series default 
+: It is not considered an error to have a memory descriptor that does not
+ respond to either 
+\emph on 
+put
+\emph default 
+ or 
+\emph on 
+get
+\emph default 
+ operations: Every memory descriptor responds to 
+\emph on 
+reply
+\emph default 
+ operations.
+ Nor is it considered an error to have a memory descriptor that responds
+ to both 
+\emph on 
+put
+\emph default 
+ and 
+\emph on 
+get
+\emph default 
+ operations.
+\end_deeper 
+\layout Description
+
+user_ptr A user-specified value that is associated with the memory descriptor.
+ The value does not need to be a pointer, but must fit in the space used
+ by a pointer.
+ This value (along with other values) is recorded in events associated with
+ operations on this memory descriptor.
+\begin_inset Foot
+collapsed true
+
+\layout Standard
+
+Tying the memory descriptor to a user-defined value can be useful when multiple
+ memory descriptors share the same event queue or when the memory descriptor
+ needs to be associated with a data structure maintained by the application.
+ For example, an MPI implementation can set the 
+\family typewriter 
+user_ptr
+\family default 
+ argument to the value of an MPI Request.
+ This direct association allows for processing of memory descriptors by
+ the MPI implementation without a table lookup or a search for the appropriate
+ MPI Request.
+\end_inset 
+
+
+\layout Description
+
+eventq A handle for the event queue used to log the operations performed
+ on the memory region.
+ If this argument is 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+, operations performed on this memory descriptor are not logged.
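+\layout Standard
+
+To make the members above concrete, the following sketch fills in a ptl_md_t
+ for a receive buffer that accepts put operations, allows truncation, and
+ logs to an existing event queue.
+ The event queue handle is assumed to come from PtlEQAlloc, the max_offset
+ choice is arbitrary, and the header name is a guess.
+\layout LyX-Code
+
+ #include <p30.h>  /* assumed header for the Portals declarations */
+\newline 
+/* Sketch: describe a receive buffer that accepts put operations. */
+\newline 
+ptl_md_t make_receive_md( void* buffer, ptl_size_t len, ptl_handle_eq_t eq )
+\newline 
+{
+\newline 
+    ptl_md_t md;
+\newline 
+    md.start      = buffer;
+\newline 
+    md.length     = len;
+\newline 
+    md.threshold  = PTL_MD_THRESH_INF;  /* no bound on operations     */
+\newline 
+    md.max_offset = len;                /* arbitrary illustrative cap */
+\newline 
+    md.options    = PTL_MD_OP_PUT | PTL_MD_TRUNCATE;
+\newline 
+    md.user_ptr   = 0;                  /* nothing to associate       */
+\newline 
+    md.eventq     = eq;                 /* log operations here        */
+\newline 
+    return md;
+\newline 
+}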
+\layout Subsection
+
+PtlMDAttach
+\begin_inset LatexCommand \label{sec:mdattach}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMDAttach( ptl_handle_me_t  match,
+\newline 
+                 ptl_md_t         mem_desc,
+\newline 
+                 ptl_unlink_t     unlink_op,
+\newline 
+                 ptl_unlink_t     unlink_nofit,
+\newline 
+                 ptl_handle_md_t* handle );
+\layout Standard
+\noindent 
+Values of the type 
+\family typewriter 
+ptl_unlink_t
+\family default 
+ are used to control whether an item is unlinked from a list.
+ The value 
+\family typewriter 
+PTL_UNLINK
+\family default 
+ enables unlinking.
+ The value 
+\family typewriter 
+PTL_RETAIN
+\family default 
+ disables unlinking.
+\layout Standard
+
+The 
+\emph on 
+PtlMDAttach
+\emph default 
+ operation is used to create a memory descriptor and attach it to a match
+ list entry.
+ An error code is returned if this match list entry already has an associated
+ memory descriptor.
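+\layout Standard
+
+The following sketch attaches a descriptor, such as the one built above,
+ to a match entry and requests automatic unlinking in both cases.
+ The match entry handle and the initialized ptl_md_t are assumed to exist,
+ with the declarations again assumed to come from a p30.h style header.
+\layout LyX-Code
+
+ #include <p30.h>  /* assumed header for the Portals declarations */
+\newline 
+/* Sketch: attach an initialized descriptor to a match entry. */
+\newline 
+int attach_md( ptl_handle_me_t match, ptl_md_t md, ptl_handle_md_t* handle )
+\newline 
+{
+\newline 
+    return PtlMDAttach( match, md,
+\newline 
+                        PTL_UNLINK,  /* unlink once the descriptor is inactive */
+\newline 
+                        PTL_UNLINK,  /* unlink when an arrival does not fit    */
+\newline 
+                        handle );    /* may be NULL if the handle is unneeded  */
+\newline 
+}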
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INUSE Indicates that 
+\family typewriter 
+match
+\family default 
+ already has a memory descriptor attached.
+\layout Description
+
+PTL_INV_ME Indicates that 
+\family typewriter 
+match
+\family default 
+ is not a valid match entry handle.
+\layout Description
+
+PTL_ILL_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a legal memory descriptor.
+ This may happen because the memory region defined in 
+\family typewriter 
+mem_desc
+\family default 
+ is invalid or because the network interface associated with the 
+\family typewriter 
+eventq
+\family default 
+ in 
+\family typewriter 
+mem_desc
+\family default 
+ is not the same as the network interface associated with 
+\family typewriter 
+match
+\family default 
+.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ memory descriptor.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="5" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the match entry that the memory descriptor will be associated
+ with.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Provides initial values for the application visible parts of a memory descriptor.
+  Other than its use for initialization, there is no linkage between this
+ structure and the memory descriptor maintained by the API.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+unlink_op
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A flag to indicate whether the memory descriptor is unlinked when it becomes
+ inactive, either because the operation threshold drops to zero or because
+ the maximum offset has been exceeded.
+  (Note, the check for unlinking a memory descriptor only occurs after
+ the completion of a successful operation.
+  If the threshold is set to zero during initialization or using
+\emph on 
+ PtlMDUpdate
+\emph default 
+, the memory descriptor is 
+\series bold 
+not
+\series default 
+  unlinked.) 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+unlink_nofit
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A flag to indicate whether the memory descriptor is unlinked when the space
+ remaining in the memory descriptor is not sufficient for a matching operation.
+ If an incoming message arrives at a memory descriptor that does
+ not have sufficient space and the 
+\series bold 
+PTL_MD_TRUNCATE
+\series default 
+ operation is not specified, the memory descriptor will be unlinked.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a handle for the newly created
+ memory descriptor.
+  The 
+\family typewriter 
+handle
+\family default 
+ argument can be NULL, in which case the handle will not be returned.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlMDBind
+\begin_inset LatexCommand \label{sec:mdbind}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMDBind( ptl_handle_ni_t  interface,
+\newline 
+               ptl_md_t         mem_desc,
+\newline 
+               ptl_handle_md_t* handle );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlMDBind
+\emph default 
+ operation is used to create a 
+\begin_inset Quotes eld
+\end_inset 
+
+free floating
+\begin_inset Quotes erd
+\end_inset 
+
+ memory descriptor, i.e., a memory descriptor that is not associated with
+ a match list entry.
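+\layout Standard
+
+The following sketch binds a free floating descriptor around a send buffer
+ with acknowledgements disabled.
+ The interface handle, event queue handle, and buffer are assumed to exist,
+ the structure values are only illustrative, and the include is again only
+ assumed.
+\layout LyX-Code
+
+ #include <p30.h>  /* assumed header for the Portals declarations */
+\newline 
+/* Sketch: create a free floating descriptor for a send buffer. */
+\newline 
+int bind_send_md( ptl_handle_ni_t interface, void* buf, ptl_size_t len,
+\newline 
+                  ptl_handle_eq_t eq, ptl_handle_md_t* handle )
+\newline 
+{
+\newline 
+    ptl_md_t md;
+\newline 
+    md.start      = buf;
+\newline 
+    md.length     = len;
+\newline 
+    md.threshold  = 1;                   /* one send, then inactive */
+\newline 
+    md.max_offset = len;
+\newline 
+    md.options    = PTL_MD_ACK_DISABLE;  /* no acknowledgements     */
+\newline 
+    md.user_ptr   = 0;
+\newline 
+    md.eventq     = eq;
+\newline 
+    return PtlMDBind( interface, md, handle );  /* handle must not be NULL */
+\newline 
+}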
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_ILL_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a legal memory descriptor.
+ This may happen because the memory region defined in 
+\family typewriter 
+mem_desc
+\family default 
+ is invalid or because the network interface associated with the 
+\family typewriter 
+eventq
+\family default 
+ in 
+\family typewriter 
+mem_desc
+\family default 
+ is not the same as the network interface, 
+\family typewriter 
+interface
+\family default 
+.
+\layout Description
+
+PTL_INV_EQ Indicates that the event queue associated with 
+\family typewriter 
+mem_desc
+\family default 
+ is not valid.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ memory descriptor.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+handle
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="3" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the network interface with which the memory descriptor will
+ be associated.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Provides initial values for the application visible parts of a memory descriptor.
+  Other than its use for initialization, there is no linkage between this
+ structure and the  memory descriptor maintained by the API.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a  handle for the newly created
+ memory descriptor.
+  The 
+\family typewriter 
+handle
+\family default 
+ argument must be a valid address and cannot be NULL.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlMDUnlink
+\begin_inset LatexCommand \label{sec:mdfree}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMDUnlink( ptl_handle_md_t mem_desc );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlMDUnlink
+\emph default 
+ function unlinks the memory descriptor from any match list entry it may
+ be linked to and releases the resources associated with a memory descriptor.
+ (This function does not free the memory region associated with the memory
+ descriptor.) This function also releases the resources associated with a
+ floating memory descriptor.
+ Only memory descriptors with no pending operations may be unlinked.
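+\layout Standard
+
+A minimal sketch of releasing a descriptor follows; when operations are
+ still pending the call returns PTL_MD_INUSE and the caller would retry
+ after draining the associated event queue.
+ The header name is again assumed.
+\layout LyX-Code
+
+ #include <p30.h>  /* assumed header for the Portals declarations */
+\newline 
+/* Sketch: unlink a descriptor, noting when the call must be retried. */
+\newline 
+int release_md( ptl_handle_md_t mem_desc )
+\newline 
+{
+\newline 
+    int rc = PtlMDUnlink( mem_desc );
+\newline 
+    if (rc == PTL_MD_INUSE) {
+\newline 
+        /* operations still pending; drain events and call again */
+\newline 
+    }
+\newline 
+    return rc;
+\newline 
+}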
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a valid memory descriptor handle.
+\layout Description
+
+PTL_MD_INUSE Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ has pending operations and cannot be unlinked.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the memory descriptor to be released.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlMDUpdate
+\begin_inset LatexCommand \label{sec:mdupdate}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMDUpdate( ptl_handle_md_t mem_desc,
+\newline 
+                 ptl_md_t*       old_md,
+\newline 
+                 ptl_md_t*       new_md,
+\newline 
+                 ptl_handle_eq_t testq );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlMDUpdate
+\emph default 
+ function provides a conditional, atomic update operation for memory descriptors.
+ The memory descriptor identified by 
+\family typewriter 
+mem_desc
+\family default 
+ is only updated if the event queue identified by 
+\family typewriter 
+testq
+\family default 
+ is empty.
+ The intent is to only enable updates to the memory descriptor when no new
+ messages have arrived since the last time the queue was checked.
+ See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:exmpi}
+
+\end_inset 
+
+ for an example of how this function can be used.
+\layout Standard
+
+If 
+\family typewriter 
+new_md
+\family default 
+ is not NULL, the memory descriptor identified by
+\family typewriter 
+ mem_desc
+\family default 
+ will be updated
+ to reflect the values in the structure pointed to by 
+\family typewriter 
+new_md
+\family default 
+ if 
+\family typewriter 
+testq
+\family default 
+ has the value 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+ or if the event queue identified by 
+\family typewriter 
+testq
+\family default 
+ is empty.
+ If 
+\family typewriter 
+old_md
+\family default 
+ is not NULL, the current value of the memory descriptor identified by 
+\family typewriter 
+mem_desc
+\family default 
+ is recorded in the location identified by 
+\family typewriter 
+old_md
+\family default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_NOUPDATE Indicates that the update was not performed because 
+\family typewriter 
+testq
+\family default 
+ was not empty.
+\layout Description
+
+PTL_INV_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a valid memory descriptor handle.
+\layout Description
+
+PTL_ILL_MD Indicates that the value pointed to by 
+\family typewriter 
+new_md
+\family default 
+ is not a legal memory descriptor (e.g., the memory region specified by the
+ memory descriptor may be invalid).
+\layout Description
+
+PTL_INV_EQ Indicates that 
+\family typewriter 
+testq
+\family default 
+ is not a valid event queue handle.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+new_md
+\family default 
+ or 
+\family typewriter 
+old_md
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="4" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the memory descriptor to update.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+old_md
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+If 
+\family typewriter 
+old_md
+\family default 
+ is not the value 
+\family typewriter 
+NULL
+\family default 
+, the current value of the memory descriptor will be stored in the location
+ identified by 
+\family typewriter 
+old_md
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+new_md
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+If 
+\family typewriter 
+new_md
+\family default 
+ is not the value 
+\family typewriter 
+NULL
+\family default 
+, this argument provides the new values for the memory descriptor, if the
+ update is performed.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+testq
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for an event queue used to predicate the update.
+ If 
+\family typewriter 
+testq
+\family default 
+ is equal to 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+, the update is performed unconditionally.
+  Otherwise, the update is performed if and only if 
+\family typewriter 
+testq
+\family default 
+ is empty.
+  If the update is  not performed, the function returns the value 
+\family typewriter 
+PTL_NOUPDATE
+\family default 
+.
+  (Note, the 
+\family typewriter 
+testq
+\family default 
+ argument does not need to be the same as  the event queue associated with
+ the memory descriptor.)
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Standard
+
+The conditional update can be used to ensure that the memory descriptor
+ has not changed between the time it was examined and the time it is updated.
+ In particular, it is needed to support an MPI implementation where the
+ activity of searching an unexpected message queue and posting a receive
+ must be atomic.
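+\layout Standard
+
+The following sketch shows the conditional update pattern just described:
+ the descriptor is modified only if no events have arrived on the queue
+ being tested, and a PTL_NOUPDATE return tells the caller to re-examine
+ the queue before retrying.
+ The handles are assumed to be valid and the include is assumed as before.
+\layout LyX-Code
+
+ #include <p30.h>  /* assumed header for the Portals declarations */
+\newline 
+/* Sketch: swap in new descriptor values unless events have arrived. */
+\newline 
+int try_update( ptl_handle_md_t mem_desc, ptl_md_t* new_md,
+\newline 
+                ptl_handle_eq_t testq )
+\newline 
+{
+\newline 
+    ptl_md_t old_md;
+\newline 
+    int rc = PtlMDUpdate( mem_desc, &old_md, new_md, testq );
+\newline 
+    if (rc == PTL_NOUPDATE) {
+\newline 
+        /* testq was not empty: drain the queue and decide again */
+\newline 
+    }
+\newline 
+    return rc;  /* PTL_OK if the update was applied */
+\newline 
+}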
+\layout Section
+
+Events and Event Queues
+\begin_inset LatexCommand \label{sec:eq}
+
+\end_inset 
+
+
+\layout Standard
+
+Event queues are used to log operations performed on memory descriptors.
+ They can also be used to hold acknowledgements for completed 
+\emph on 
+put
+\emph default 
+ operations and to note when the data specified in a 
+\emph on 
+put
+\emph default 
+ operation has been sent (i.e., when it is safe to reuse the buffer that holds
+ this data).
+ Multiple memory descriptors can share a single event queue.
+\layout Standard
+
+In addition to the 
+\family typewriter 
+ptl_handle_eq_t
+\family default 
+ type, the Portals API defines two types associated with events: The 
+\family typewriter 
+
+\newline 
+ptl_event_kind_t
+\family default 
+ type defines the kinds of events that can be stored in an event queue.
+ The 
+\family typewriter 
+ptl_event_t
+\family default 
+ type defines a structure that holds the information associated with an
+ event.
+\layout Standard
+
+The Portals API also provides four functions for dealing with event queues:
+ The 
+\emph on 
+PtlEQAlloc
+\emph default 
+ function is used to allocate the API resources needed for an event queue,
+ the 
+\emph on 
+PtlEQFree
+\emph default 
+ function is used to release these resources, the 
+\emph on 
+PtlEQGet
+\emph default 
+ function can be used to get the next event from an event queue, and the
+\emph on 
+PtlEQWait
+\emph default 
+ function can be used to block a process (or thread) until an event queue
+ has at least one event.
+\layout Subsection
+
+Kinds of Events
+\begin_inset LatexCommand \label{sec:ek-type}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef enum { 
+\newline 
+    PTL_EVENT_GET_START, PTL_EVENT_GET_END, PTL_EVENT_GET_FAIL,
+\newline 
+    PTL_EVENT_PUT_START, PTL_EVENT_PUT_END, PTL_EVENT_PUT_FAIL,
+\newline 
+    PTL_EVENT_REPLY_START, PTL_EVENT_REPLY_END, PTL_EVENT_REPLY_FAIL,
+\newline 
+    PTL_EVENT_SEND_START, PTL_EVENT_SEND_END, PTL_EVENT_SEND_FAIL,
+\newline 
+    PTL_EVENT_ACK,
+\newline 
+    PTL_EVENT_UNLINK
+\newline 
+} ptl_event_kind_t;
+\layout Standard
+\noindent 
+The Portals API defines fourteen types of events that can be logged in an
+ event queue: 
+\layout Description
+
+PTL_EVENT_GET_START A remote 
+\emph on 
+get
+\emph default 
+ operation has been started on the memory descriptor.
+ The memory region associated with this descriptor should not be altered
+ until the corresponding END or FAIL event is logged.
+\layout Description
+
+PTL_EVENT_GET_END A previously initiated 
+\emph on 
+get
+\emph default 
+ operation completed successfully.
+ This event is logged after the reply has been sent by the local node.
+ As such, the process could free the memory descriptor once it sees this
+ event.
+\layout Description
+
+PTL_EVENT_GET_FAIL A previously initiated 
+\emph on 
+get
+\emph default 
+ operation completed unsuccessfully.
+ This event is logged after the reply has been sent by the local node.
+ As such, the process could free the memory descriptor once it sees this
+ event.
+\layout Description
+
+PTL_EVENT_PUT_START A remote 
+\emph on 
+put
+\emph default 
+ operation has been started on the memory descriptor.
+ The memory region associated with this descriptor should be considered
+ volatile until the corresponding END or FAIL event is logged.
+\layout Description
+
+PTL_EVENT_PUT_END A previously initiated 
+\emph on 
+put
+\emph default 
+ operation completed successfully.
+ The underlying layers will not alter the memory (on behalf of this operation)
+ once this event has been logged.
+\layout Description
+
+PTL_EVENT_PUT_FAIL A previously initiated 
+\emph on 
+put
+\emph default 
+ operation completed unsuccessfully.
+ The underlying layers will not alter the memory (on behalf of this operation)
+ once this event has been logged.
+\layout Description
+
+PTL_EVENT_REPLY_START A 
+\emph on 
+reply
+\emph default 
+ operation has been started on the memory descriptor.
+\layout Description
+
+PTL_EVENT_REPLY_END A previously initiated 
+\emph on 
+reply
+\emph default 
+ operation has completed successfully.
+ This event is logged after the data (if any) from the reply has been written
+ into the memory descriptor.
+\layout Description
+
+PTL_EVENT_REPLY_FAIL A previously initiated 
+\emph on 
+reply
+\emph default 
+ operation has completed unsuccessfully.
+ This event is logged after the data (if any) from the reply has been written
+ into the memory descriptor.
+\layout Description
+
+PTL_EVENT_ACK An 
+\emph on 
+acknowledgement
+\emph default 
+ was received.
+ This event is logged when the acknowledgement is received.
+\layout Description
+
+PTL_EVENT_SEND_START An outgoing 
+\emph on 
+send
+\emph default 
+ operation has been started.
+ The memory region associated with this descriptor should not be altered
+ until the corresponding END or FAIL event is logged.
+\layout Description
+
+PTL_EVENT_SEND_END A previously initiated 
+\emph on 
+send
+\emph default 
+ operation has completed successfully.
+ This event is logged after the entire buffer has been sent and it is safe
+ for the application to reuse the buffer.
+\layout Description
+
+PTL_EVENT_SEND_FAIL A previously initiated 
+\emph on 
+send
+\emph default 
+ operation has completed unsuccessfully.
+ The process can safely manipulate the memory or free the memory descriptor
+ once it sees this event.
+\layout Description
+
+PTL_EVENT_UNLINK A memory descriptor associated with this event queue has
+ been automatically unlinked.
+ This event is not generated when a memory descriptor is explicitly unlinked
+ by calling 
+\shape italic 
+PtlMDUnlink
+\shape default 
+.
+ This event does not decrement the threshold count.
+\layout Subsection
+
+Event Ordering
+\layout Standard
+
+The Portals API guarantees that when a process initiates two operations
+ on a remote process, the operations will be initiated on the remote process
+ in the same order that they were initiated on the original process.
+ As an example, if process A initiates two
+\emph on 
+ put
+\emph default 
+ operations, 
+\emph on 
+x
+\emph default 
+ and 
+\emph on 
+y
+\emph default 
+, on process B, the Portals API guarantees that process A will receive the
+\family typewriter 
+PTL_EVENT_SEND_START
+\family default 
+ events for 
+\emph on 
+x
+\emph default 
+ and 
+\emph on 
+y
+\emph default 
+ in the same order that process B receives the 
+\family typewriter 
+PTL_EVENT_PUT_START
+\family default 
+ events for 
+\emph on 
+x
+\emph default 
+ and 
+\emph on 
+y
+\emph default 
+.
+ Notice that the API does not guarantee that the start events will be delivered
+ in the same order that process A initiated the 
+\emph on 
+x
+\emph default 
+ and 
+\emph on 
+y
+\emph default 
+ operations.
+ If process A needs to ensure the ordering of these operations, it should
+ include code to wait for the initiation of 
+\emph on 
+x
+\emph default 
+ before it initiates 
+\emph on 
+y
+\emph default 
+.
+\layout Subsection
+
+Failure Notification
+\layout Standard
+
+Operations may fail to complete successfully; however, unless the node itself
+ fails, every operation that is started will eventually complete.
+ While an operation is in progress, the memory associated with the operation
+ should not be viewed (in the case of a put or a reply) or altered (in the
+ case of a send or get).
+ Operation completion, whether successful or unsuccessful, is final.
+ That is, when an operation completes, the memory associated with the operation
+ will no longer be read or altered by the operation.
+ A network interface can use the 
+\family typewriter 
+ptl_ni_fail_t
+\family default 
+ type to define more specific information regarding the failure of the operation
+ and record this information in the 
+\family typewriter 
+ni_fail_type
+\family default 
+ field of the event.
+\layout Subsection
+
+The Event Type
+\begin_inset LatexCommand \label{sec:event-type}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef struct {
+\newline 
+    ptl_event_kind_t      type;
+\newline 
+    ptl_process_id_t      initiator;
+\newline 
+    ptl_uid_t             uid;
+\layout LyX-Code
+
+    ptl_pt_index_t        portal;
+\newline 
+    ptl_match_bits_t      match_bits;
+\newline 
+    ptl_size_t            rlength;
+\newline 
+    ptl_size_t            mlength;
+\newline 
+    ptl_size_t            offset; 
+\newline 
+    ptl_handle_md_t       md_handle;
+\newline 
+    ptl_md_t              mem_desc;
+\newline 
+    ptl_hdr_data_t        hdr_data;
+\newline 
+    ptl_seq_t             link;
+\newline 
+    ptl_ni_fail_t         ni_fail_type;
+\newline 
+    volatile ptl_seq_t    sequence;
+\newline 
+} ptl_event_t;
+\layout Standard
+\noindent 
+An event structure includes the following members: 
+\layout Description
+
+type Indicates the type of the event.
+\layout Description
+
+initiator The id of the initiator.
+\layout Description
+
+portal The Portal table index specified in the request.
+\layout Description
+
+match_bits A copy of the match bits specified in the request.
+ See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+ for more information on match bits.
+\layout Description
+
+rlength The length (in bytes) specified in the request.
+\layout Description
+
+mlength The length (in bytes) of the data that was manipulated by the operation.
+ For truncated operations, the manipulated length will be the number of
+ bytes specified by the memory descriptor (possibly with an offset).
+ For all other operations, the manipulated length will be the length of
+ the requested operation.
+\layout Description
+
+offset Is the displacement (in bytes) into the memory region that the operation
+ used.
+ The offset can be determined by the operation (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:datamovement}
+
+\end_inset 
+
+) for a remote managed memory descriptor, or by the local memory descriptor
+ (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+).
+\layout Description
+
+md_handle Is the handle to the memory descriptor associated with the event.
+\layout Description
+
+mem_desc Is the state of the memory descriptor immediately after the event
+ has been processed.
+\layout Description
+
+hdr_data 64 bits of out-of-band user data (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+).
+\layout Description
+
+link The 
+\emph on 
+link
+\emph default 
+ member is used to link 
+\family typewriter 
+START
+\family default 
+ events with the 
+\family typewriter 
+END
+\family default 
+ or 
+\family typewriter 
+FAIL
+\family default 
+ event that signifies completion of the operation.
+ The 
+\emph on 
+link
+\emph default 
+ member will be the same for the two events associated with an operation.
+ The link member is also used to link an 
+\family typewriter 
+UNLINK
+\family default 
+ event with the event that caused the memory descriptor to be unlinked.
+\layout Description
+
+sequence The sequence number for this event.
+ Sequence numbers are unique to each event.
+\layout Comment
+
+The 
+\emph on 
+sequence
+\emph default 
+ member is the last member and is volatile to support SMP implementations.
+ When an event structure is filled in, the 
+\emph on 
+sequence
+\emph default 
+ member should be written after all other members have been updated.
+ Moreover, a memory barrier should be inserted between the updating of other
+ members and the updating of the 
+\emph on 
+sequence
+\emph default 
+ member.
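+\layout Standard
+
+As a concrete illustration of the structure above, the following sketch
+ dispatches on the type member of one event; only a few event kinds are
+ shown, the handling comments are placeholders, and the include path is
+ a guess.
+\layout LyX-Code
+
+ #include <p30.h>  /* assumed header for the Portals declarations */
+\newline 
+/* Sketch: react to a single event taken from a queue. */
+\newline 
+void handle_event( const ptl_event_t* ev )
+\newline 
+{
+\newline 
+    switch (ev->type) {
+\newline 
+    case PTL_EVENT_PUT_END:
+\newline 
+        /* ev->mlength bytes were written at ev->offset */
+\newline 
+        break;
+\newline 
+    case PTL_EVENT_SEND_END:
+\newline 
+        /* the send buffer may now be reused */
+\newline 
+        break;
+\newline 
+    case PTL_EVENT_UNLINK:
+\newline 
+        /* a descriptor on this queue was automatically unlinked */
+\newline 
+        break;
+\newline 
+    default:
+\newline 
+        break;
+\newline 
+    }
+\newline 
+}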
+\layout Subsection
+
+PtlEQAlloc
+\begin_inset LatexCommand \label{sec:eqalloc}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlEQAlloc( ptl_handle_ni_t  interface,
+\newline 
+                ptl_size_t       count,
+\newline 
+                ptl_handle_eq_t* handle );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlEQAlloc
+\emph default 
+ function is used to build an event queue.
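+\layout Standard
+
+A minimal sketch of allocating an event queue follows; the interface handle
+ is assumed to be valid, the count of 64 is arbitrary, and the header is
+ assumed.
+\layout LyX-Code
+
+ #include <p30.h>  /* assumed header for the Portals declarations */
+\newline 
+/* Sketch: create an event queue with room for 64 events. */
+\newline 
+int make_queue( ptl_handle_ni_t interface, ptl_handle_eq_t* handle )
+\newline 
+{
+\newline 
+    return PtlEQAlloc( interface, 64, handle );  /* PTL_OK on success */
+\newline 
+}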
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ event queue.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+handle
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="3" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface with which the event queue  will be associated.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+count
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The number of events that can be stored in the event queue.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a handle for the newly created
+ event queue.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlEQFree
+\begin_inset LatexCommand \label{sec:eqfree}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlEQFree( ptl_handle_eq_t eventq );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlEQFree
+\emph default 
+ function releases the resources associated with an event queue.
+ It is up to the user to ensure that no memory descriptors are associated
+ with the event queue once it is freed.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_EQ Indicates that 
+\family typewriter 
+eventq
+\family default 
+ is not a valid event queue handle.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+eventq
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+A handle for the event queue to be released.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlEQGet
+\begin_inset LatexCommand \label{sec:eqget}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlEQGet( ptl_handle_eq_t eventq,
+\newline 
+              ptl_event_t*    event );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlEQGet
+\emph default 
+ function is a nonblocking function that can be used to get the next event
+ in an event queue.
+ The event is removed from the queue.
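+\layout Standard
+
+The following sketch builds a non-blocking polling loop on PtlEQGet: it
+ drains whatever events are currently queued, stops on an error return,
+ and notes when PTL_EQ_DROPPED indicates that earlier events were lost.
+ The queue handle is assumed to be valid and the header is assumed as in
+ the earlier sketches.
+\layout LyX-Code
+
+ #include <p30.h>  /* assumed header for the Portals declarations */
+\newline 
+/* Sketch: drain all currently queued events without blocking. */
+\newline 
+void poll_queue( ptl_handle_eq_t eventq )
+\newline 
+{
+\newline 
+    ptl_event_t ev;
+\newline 
+    int rc;
+\newline 
+    while ((rc = PtlEQGet( eventq, &ev )) != PTL_EQ_EMPTY) {
+\newline 
+        if (rc != PTL_OK && rc != PTL_EQ_DROPPED)
+\newline 
+            break;               /* invalid handle or bad address */
+\newline 
+        if (rc == PTL_EQ_DROPPED) {
+\newline 
+            /* at least one earlier event was lost */
+\newline 
+        }
+\newline 
+        /* process ev here, e.g. with a dispatcher like the earlier sketch */
+\newline 
+    }
+\newline 
+}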
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at
+ least one event between this event and the last event obtained (using 
+\emph on 
+PtlEQGet
+\emph default 
+ or 
+\emph on 
+PtlEQWait
+\emph default 
+) from this event queue has been dropped due to limited space in the event
+ queue.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_EQ_EMPTY Indicates that 
+\family typewriter 
+eventq
+\family default 
+ is empty or another thread is waiting on 
+\emph on 
+PtlEQWait
+\emph default 
+.
+\layout Description
+
+PTL_INV_EQ Indicates that 
+\family typewriter 
+eventq
+\family default 
+ is not a valid event queue handle.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+event
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.5in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+eventq
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the event queue.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+event
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the  values associated with
+ the next event in the event queue.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlEQWait
+\begin_inset LatexCommand \label{sec:eqwait}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlEQWait( ptl_handle_eq_t eventq,
+\newline 
+               ptl_event_t*    event );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlEQWait
+\emph default 
+ function can be used to block the calling process (thread) until there
+ is an event in an event queue.
+ This function also returns the next event in the event queue and removes
+ this event from the queue.
+ This is the only blocking operation in the Portals 3.2 API.
+ In the event that multiple threads are waiting on the same event queue,
+ PtlEQWait is guaranteed to wake exactly one thread, but the order in which
+ they are awakened is not specified.
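+\layout Standard
+
+A minimal sketch of blocking for the next event, as described above, follows;
+ the queue handle is assumed to be valid and the header is assumed.
+\layout LyX-Code
+
+ #include <p30.h>  /* assumed header for the Portals declarations */
+\newline 
+/* Sketch: block until one event is available, then return it. */
+\newline 
+int wait_one( ptl_handle_eq_t eventq, ptl_event_t* event )
+\newline 
+{
+\newline 
+    int rc = PtlEQWait( eventq, event );  /* blocks while eventq is empty */
+\newline 
+    if (rc == PTL_EQ_DROPPED) {
+\newline 
+        /* an event was returned, but earlier events were lost */
+\newline 
+    }
+\newline 
+    return rc;
+\newline 
+}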
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at
+ least one event between this event and the last event obtained (using 
+\emph on 
+PtlEQGet
+\emph default 
+ or 
+\emph on 
+PtlEQWait
+\emph default 
+) from this event queue has been dropped due to limited space in the event
+ queue.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_EQ Indicates that 
+\family typewriter 
+eventq
+\family default 
+ is not a valid event queue handle.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+event
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+\noindent 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+eventq
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the event queue to wait on.
+  The calling process (thread) will be blocked until 
+\family typewriter 
+eventq
+\family default 
+ is not empty.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+event
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the values associated with
+ the next event in the event queue.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Section
+
+The Access Control Table
+\begin_inset LatexCommand \label{sec:ac}
+
+\end_inset 
+
+
+\layout Standard
+
+Processes can use the access control table to control which processes are
+ allowed to perform operations on Portal table entries.
+ Each communication interface has a Portal table and an access control table.
+ The access control table for the default interface contains an entry at
+ index zero that allows all processes with the same user id to communicate.
+ Entries in the access control table can be manipulated using the 
+\emph on 
+PtlACEntry
+\emph default 
+ function.
+\layout Subsection
+
+PtlACEntry
+\begin_inset LatexCommand \label{sec:acentry}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlACEntry( ptl_handle_ni_t  interface,
+\newline 
+                ptl_ac_index_t   index,
+\newline 
+                ptl_process_id_t matchid,
+\newline 
+                ptl_uid_t        user_id,
+\newline 
+                ptl_pt_index_t   portal );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlACEntry
+\emph default 
+ function can be used to update an entry in the access control table for
+ an interface.
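+\layout Standard
+
+The following sketch fills one access control slot so that any process owned
+ by a single user may use every Portal table index.
+ The nid and pid member names of ptl_process_id_t are assumed from the
+ wildcard constants listed below, the slot index is chosen by the caller,
+ and the header name is assumed.
+\layout LyX-Code
+
+ #include <p30.h>  /* assumed header for the Portals declarations */
+\newline 
+/* Sketch: allow any process of one user on every Portal index. */
+\newline 
+int open_to_user( ptl_handle_ni_t interface, ptl_ac_index_t index,
+\newline 
+                  ptl_uid_t user_id )
+\newline 
+{
+\newline 
+    ptl_process_id_t any;
+\newline 
+    any.nid = PTL_NID_ANY;  /* member names are assumed */
+\newline 
+    any.pid = PTL_PID_ANY;
+\newline 
+    return PtlACEntry( interface, index, any, user_id, PTL_PT_INDEX_ANY );
+\newline 
+}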
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_AC_INV_INDEX Indicates that 
+\family typewriter 
+index
+\family default 
+ is not a valid access control table index.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+matchid
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_PT_INV_INDEX Indicates that 
+\family typewriter 
+portal
+\family default 
+ is not a valid Portal table index.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="5" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+index
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index of the entry in the access control table to update.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+matchid
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the process(es) that are allowed to  perform operations.
+ The constants 
+\family typewriter 
+PTL_PID_ANY
+\family default 
+ and 
+\family typewriter 
+PTL_NID_ANY
+\family default 
+ can be used to wildcard either of the ids in the 
+\family typewriter 
+ptl_process_id_t
+\family default 
+ structure.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+user_id
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the user that is allowed to  perform operations.
+ The value 
+\family typewriter 
+PTL_UID_ANY
+\family default 
+ can be used to wildcard the user.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the Portal index(es) that can be used.
+  The value 
+\family typewriter 
+PTL_PT_INDEX_ANY
+\family default 
+ can be used to wildcard the  Portal index.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
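+\layout Standard
+
+As an illustrative sketch (the handle 
+\family typewriter 
+ni
+\family default 
+ and the index values below are assumptions, not values taken from the
+ specification), the following call opens entry 1 of the table so that
+ any process and any user may target any Portal index:
+\layout LyX-Code
+
+/* Sketch: "ni" is assumed to be a network interface handle from */
+\newline 
+/* PtlNIInit; entry index 1 is arbitrary.  The nid/pid field     */
+\newline 
+/* names of ptl_process_id_t are also assumptions.               */
+\newline 
+ptl_process_id_t any;
+\newline 
+any.nid = PTL_NID_ANY;   /* wildcard the node id    */
+\newline 
+any.pid = PTL_PID_ANY;   /* wildcard the process id */
+\newline 
+
+\newline 
+if (PtlACEntry(ni, 1, any, PTL_UID_ANY, PTL_PT_INDEX_ANY) != PTL_OK) {
+\newline 
+    /* e.g. PTL_AC_INV_INDEX if index 1 is out of range */
+\newline 
+}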
+\layout Section
+
+Data Movement Operations
+\begin_inset LatexCommand \label{sec:datamovement}
+
+\end_inset 
+
+
+\layout Standard
+
+The Portals API provides two data movement operations: 
+\emph on 
+PtlPut
+\emph default 
+ and 
+\emph on 
+PtlGet
+\emph default 
+.
+\layout Subsection
+
+PtlPut
+\begin_inset LatexCommand \label{sec:put}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef enum { PTL_ACK_REQ, PTL_NOACK_REQ } ptl_ack_req_t;
+\newline 
+
+\newline 
+int PtlPut( ptl_handle_md_t  mem_desc,
+\newline 
+            ptl_ack_req_t    ack_req,
+\newline 
+            ptl_process_id_t target,
+\newline 
+            ptl_pt_index_t   portal,
+\newline 
+            ptl_ac_index_t   cookie,
+\newline 
+            ptl_match_bits_t match_bits,
+\newline 
+            ptl_size_t       offset,
+\newline 
+            ptl_hdr_data_t   hdr_data );
+\layout Standard
+\noindent 
+Values of the type 
+\family typewriter 
+ptl_ack_req_t
+\family default 
+ are used to control whether an acknowledgement should be sent when the
+ operation completes (i.e., when the data has been written to a memory descriptor
+ of the 
+\family typewriter 
+target
+\family default 
+ process).
+ The value 
+\family typewriter 
+PTL_ACK_REQ
+\family default 
+ requests an acknowledgement; the value 
+\family typewriter 
+PTL_NOACK_REQ
+\family default 
+ requests that no acknowledgement should be generated.
+\layout Standard
+
+The 
+\emph on 
+PtlPut
+\emph default 
+ function initiates an asynchronous put operation.
+ There are several events associated with a put operation: initiation of
+ the send on the local node (
+\family typewriter 
+PTL_EVENT_SEND_START
+\family default 
+), completion of the send on the local node (
+\family typewriter 
+PTL_EVENT_SEND_END
+\family default 
+ or 
+\family typewriter 
+PTL_EVENT_SEND_FAIL
+\family default 
+), and, when the send completes successfully and an acknowledgement was
+ requested, the receipt of an acknowledgement
+ (
+\family typewriter 
+PTL_EVENT_ACK
+\family default 
+) indicating that the operation was accepted by the target.
+ These events will be logged in the event queue associated with the memory
+ descriptor (
+\family typewriter 
+mem_desc
+\family default 
+) used in the put operation.
+ Using a memory descriptor that does not have an associated event queue
+ results in these events being discarded.
+ In this case, the application must have another mechanism (e.g., a higher
+ level protocol) for determining when it is safe to modify the memory region
+ associated with the memory descriptor.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a valid memory descriptor.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+target
+\family default 
+ is not a valid process id.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="8" columns="3">
+<features>
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the memory descriptor that describes the memory to be sent.
+  If the memory descriptor has an event queue  associated with it, it will
+ be used to record events when the  message has been sent (PTL_EVENT_SEND_START,
+ PTL_EVENT_SEND_END).
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ack_req
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Controls whether an acknowledgement event is requested.
+  Acknowledgements are only sent when they are requested by the initiating
+ process 
+\series bold 
+and
+\series default 
+ the memory descriptor has an event queue 
+\series bold 
+and
+\series default 
+ the target memory descriptor enables them.
+ Allowed constants: 
+\family typewriter 
+PTL_ACK_REQ
+\family default 
+, 
+\family typewriter 
+PTL_NOACK_REQ
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+target
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A process id for the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index in the remote Portal table.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+cookie
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index into the access control table of the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match_bits
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The match bits to use for message selection at the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+offset
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The offset into the target memory descriptor (only used when the target
+ memory descriptor has the 
+\family typewriter 
+PTL_MD_MANAGE_REMOTE
+\family default 
+ option set).
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+hdr_data
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+64 bits of user data that can be included in the message header.
+  This data is written to an event queue entry at the target if an event
+ queue is present on the matching memory descriptor.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
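+\layout Standard
+
+To make the argument order concrete, here is a hedged sketch of a put;
+ the handles 
+\family typewriter 
+md
+\family default 
+ and 
+\family typewriter 
+target
+\family default 
+ are assumed to have been obtained earlier (an MD handle from 
+\emph on 
+PtlMDBind
+\emph default 
+ and a 
+\family typewriter 
+ptl_process_id_t
+\family default 
+ for the destination), and the Portal index, access control index, and
+ match bits are arbitrary illustrative values:
+\layout LyX-Code
+
+/* Sketch: "md" describes the local data to send and "target"    */
+\newline 
+/* is the destination process; 4, 0 and 0x17 are arbitrary       */
+\newline 
+/* example values, not values required by the API.               */
+\newline 
+int rc = PtlPut(md, PTL_ACK_REQ,   /* request an acknowledgement */
+\newline 
+                target,            /* destination process id     */
+\newline 
+                4,                 /* remote Portal table index  */
+\newline 
+                0,                 /* access control table index */
+\newline 
+                0x17,              /* match bits                 */
+\newline 
+                0,                 /* offset                     */
+\newline 
+                0);                /* hdr_data seen by target    */
+\newline 
+
+\newline 
+if (rc == PTL_OK) {
+\newline 
+    /* PTL_EVENT_SEND_START/END and, if acknowledgements are     */
+\newline 
+    /* enabled at both ends, PTL_EVENT_ACK appear in the event   */
+\newline 
+    /* queue attached to md.                                     */
+\newline 
+}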
+\layout Subsection
+
+PtlGet
+\begin_inset LatexCommand \label{sec:get}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlGet( ptl_handle_md_t  mem_desc,
+\newline 
+            ptl_process_id_t target,
+\newline 
+            ptl_pt_index_t   portal,
+\newline 
+            ptl_ac_index_t   cookie,
+\newline 
+            ptl_match_bits_t match_bits,
+\newline 
+            ptl_size_t       offset );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlGet
+\emph default 
+ function initiates a remote read operation.
+ There are two event pairs associated with a get operation: when the data
+ is sent from the remote node, a 
+\family typewriter 
+PTL_EVENT_GET{START|END}
+\family default 
+ event pair is registered on the remote node; and when the data is returned
+ from the remote node, a 
+\family typewriter 
+PTL_EVENT_REPLY{START|END}
+\family default 
+ event pair is registered on the local node.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a valid memory descriptor.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+target
+\family default 
+ is not a valid process id.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="6" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the memory descriptor that describes the memory into which
+ the requested data will be received.
+  The memory descriptor can have an event queue associated with it to record
+ events, such as when the message receive has started (
+\family typewriter 
+PTL_EVENT_REPLY_START
+\family default 
+).
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+target
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A process id for the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index in the remote Portal table.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+cookie
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index into the access control table of the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match_bits
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The match bits to use for message selection at the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+offset
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The offset into the target memory descriptor (only used when the target
+ memory descriptor has the 
+\family typewriter 
+PTL_MD_MANAGE_REMOTE
+\family default 
+ option set).
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
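+\layout Standard
+
+A corresponding sketch for a get follows; as with the put example above, 
+\family typewriter 
+md
+\family default 
+, 
+\family typewriter 
+target
+\family default 
+, and the index values are assumptions used only for illustration:
+\layout LyX-Code
+
+/* Sketch: fetch remote data into the memory described by "md";  */
+\newline 
+/* 4, 0 and 0x17 are arbitrary example values.  A                */
+\newline 
+/* PTL_EVENT_REPLY_START/END pair appears in the event queue of  */
+\newline 
+/* md when the data is returned.                                 */
+\newline 
+if (PtlGet(md,
+\newline 
+           target,   /* process holding the data           */
+\newline 
+           4,        /* remote Portal table index          */
+\newline 
+           0,        /* access control table index         */
+\newline 
+           0x17,     /* match bits                         */
+\newline 
+           0) != PTL_OK) {   /* offset into the remote MD  */
+\newline 
+    /* PTL_INV_MD or PTL_INV_PROC indicate a bad argument */
+\newline 
+}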
+\layout Section
+
+Summary
+\layout Standard
+
+
+\begin_inset LatexCommand \label{sec:summary}
+
+\end_inset 
+
+ We conclude this section by summarizing the names introduced by the Portals
+ 3.2 API.
+ We start with the names of the types introduced by the API, followed by
+ a summary of the functions, then a summary of the function return codes,
+ and finally a summary of the other constant values introduced by the API.
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:types}
+
+\end_inset 
+
+ presents a summary of the types defined by the Portals API.
+ The first column in this table gives the type name, the second column gives
+ a brief description of the type, the third column identifies the section
+ where the type is defined, and the fourth column lists the functions that
+ have arguments of this type.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Types Defined by the Portals 3.2 API
+\begin_inset LatexCommand \label{tab:types}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\noindent 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="25" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="2in">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="2.2in">
+<row bottomline="true">
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+ Name
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+ Meaning 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+ Sect
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+ Functions 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ac_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+indexes for an access control table 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:index-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlACEntry, PtlPut, PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ack_req_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+acknowledgement request types 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlPut
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+kinds of events
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlGet
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+information about events 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:event-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlEQGet
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_seq_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+event sequence number
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:event-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlEQGet, PtlEQWait
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_any_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for any object 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIHandle 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_eq_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for event queues 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlEQAlloc, PtlEQFree, PtlEQGet, PtlEQWait, PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_md_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for memory descriptors 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach, PtlMDBind, PtlMDUnlink, PtlMDUpdate, PtlMEAttach, PtlMEAttachAny,
+ PtlMEInsert, PtlPut, PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_me_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for match entries 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMEUnlink 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_ni_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for network interfaces 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit, PtlNIFini, PtlNIStatus, PtlNIDist, PtlEQAlloc, PtlACEntry, PtlPut,
+ PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_nid_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+node identifiers
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlGetId, PtlACEntry
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+process identifier
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlGetId, PtlACEntry
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_uid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+user identifier
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlGetUid, PtlACEntry
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ins_pos_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+insertion position (before or after) 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlMEInsert 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_interface_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+identifiers for network interfaces 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_match_bits_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+match (and ignore) bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:mb-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlPut, PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_md_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+memory descriptors 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach, PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ni_fail_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+network interface-specific failures
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlEQGet, PtlEQWait
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+process identifiers 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:pid-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlGetId, PtlNIDist, PtlMEAttach, PtlMEAttachAny, PtlACEntry, PtlPut, PtlGet
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+indexes for Portal tables 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:index-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlACEntry 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+sizes 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:size-t}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlEQAlloc, PtlPut, PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_sr_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+indexes for status registers 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:stat-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIStatus 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_sr_value_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+values in status registers 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:stat-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIStatus 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_unlink_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+unlink options 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMDAttach 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:func}
+
+\end_inset 
+
+ presents a summary of the functions defined by the Portals API.
+ The first column in this table gives the name for the function, the second
+ column gives a brief description of the operation implemented by the function,
+ and the third column identifies the section where the function is defined.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Functions Defined by the Portals 3.2 API
+\begin_inset LatexCommand \label{tab:func}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="24" columns="3">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+Name 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Operation 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Section 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlACEntry 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ update an entry in an access control table 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ac}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlEQAlloc 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ create an event queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlEQGet 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ get the next event from an event queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlEQFree 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ release the resources for an event queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlEQWait 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ wait for a new event in an event queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlFini 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ shut down the Portals API 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:init}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlGet 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ perform a get operation 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:datamovement}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlGetId 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ get the id for the current process 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:pid}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlInit 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ initialize the Portals API 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:init}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMDAttach 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ create a memory descriptor and attach it to a match entry 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMDBind 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ create a free-floating memory descriptor 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:mdbind}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMDUnlink 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ remove a memory descriptor from a list and release its resources 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMDUpdate 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ update a memory descriptor 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMEAttach 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+create a match entry and attach it to a Portal table 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlMEAttachAny
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+create a match entry and attach it to a free Portal table entry
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:attachany}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMEInsert 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ create a match entry and insert it in a list 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMEUnlink 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ remove a match entry from a list and release its resources 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIDist 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ get the distance to another process 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIFini 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ shut down a network interface 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIHandle 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ get the network interface handle for an object 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIInit 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ initialize a network interface 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIStatus 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ read a network interface status register 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlPut 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ perform a put operation 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:datamovement}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:retcodes}
+
+\end_inset 
+
+ summarizes the return codes used by functions defined by the Portals API.
+ All of these constants are integer values.
+ The first column of this table gives the symbolic name for the constant,
+ the second column gives a brief description of the value, and the third
+ column identifies the functions that can return this value.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Function Return Codes for the Portals 3.2 API
+\begin_inset LatexCommand \label{tab:retcodes}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="27" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="2.6in">
+<row bottomline="true">
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Name
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Meaning 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Functions
+\series default 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_AC_INV_INDEX
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid access control table index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlACEntry 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EQ_DROPPED
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+at least one event has been dropped 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlEQGet, PtlEQWait 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EQ_EMPTY
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+no events available in an event queue 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlEQGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_FAIL 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+error during initialization or cleanup 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlInit, PtlFini 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_ILL_MD
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+illegal memory descriptor values 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach, PtlMDBind, PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INIT_DUP 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+duplicate initialization of an interface 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INIT_INV
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initialization of an invalid interface 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INUSE
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+the ME already has an MD
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_ASIZE
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid access control table size 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_EQ
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid event queue handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDUpdate, PtlEQFree, PtlEQGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_HANDLE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIHandle 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_MD 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid memory descriptor handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDUnlink, PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_ME
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid match entry handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_NI 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid network interface handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIDist, PtlNIFini, PtlMDBind, PtlEQAlloc 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_PROC 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid process identifier 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit, PtlNIDist, PtlMEAttach, PtlMEInsert, PtlACEntry, PtlPut, PtlGet
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_PTINDEX
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid Portal table index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlMEAttach 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_REG 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid status register 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlNIStatus 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_SR_INDX 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid status register index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlNIStatus 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_ML_TOOLONG 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match list too long 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlMEAttach, PtlMEInsert 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_INUSE
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+MD has pending operations
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlMDUnlink
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_NOINIT 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+uninitialized API 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+\emph on 
+all
+\emph default 
+, except PtlInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_NOSPACE
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+insufficient memory 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit, PtlMDAttach, PtlMDBind, PtlEQAlloc, PtlMEAttach, PtlMEInsert
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+PTL_NOUPDATE
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ no update was performed 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_PT_FULL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+Portal table is full
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlMEAttachAny
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_OK 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ success 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+\emph on 
+all
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_SEGV 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+addressing violation 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit, PtlNIStatus, PtlNIDist, PtlNIHandle, PtlMDBind, PtlMDUpdate,
+ PtlEQAlloc, PtlEQGet, PtlEQWait 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:oconsts}
+
+\end_inset 
+
+ summarizes the remaining constant values introduced by the Portals API.
+ The first column presents the symbolic name for the constant, the second
+ column gives a brief description of the value, the third column identifies
+ its base type, the fourth column identifies the section in which the value
+ is introduced, and the fifth column identifies the sections in which it
+ is referenced.
+ A short usage sketch follows the table.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Other Constants Defined by the Portals 3.2 API
+\begin_inset LatexCommand \label{tab:oconsts}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="36" columns="5">
+<features>
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Name
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Meaning 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Base type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Intr.
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Ref.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_ACK_REQ
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+request an acknowledgement 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ack_req_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EQ_NONE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a NULL event queue handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_eq_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:mdupdate}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_GET_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+get event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_GET_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+get event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_GET_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+get event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_PUT_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+put event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_PUT_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+put event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_PUT_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+put event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_REPLY_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+reply event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_REPLY_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+reply event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_REPLY_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+reply event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_ACK_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+acknowledgement event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_ACK_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+acknowledgement event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_ACK_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+acknowledgement event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_SEND_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+send event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_SEND_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+send event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_SEND_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+send event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_UNLINK
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+unlink event
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_PID_ANY 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+wildcard for process id fields 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pid_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_NID_ANY
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+wildcard for node id fields
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_nid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_UID_ANY
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+wildcard for user id
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_uid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_IFACE_DEFAULT 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+default interface 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_interface_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INS_AFTER 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+insert after 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ins_pos_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meinsert}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INS_BEFORE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+insert before 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ins_pos_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meinsert}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_ACK_DISABLE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to disable acknowledgements 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_MANAGE_REMOTE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to enable the use of remote offsets 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_OP_GET 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to enable get operations 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_OP_PUT 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to enable put operations 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_THRESH_INF 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+infinite threshold for a memory descriptor 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_TRUNCATE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to enable truncation of a request 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_NOACK_REQ 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+request no acknowledgement 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ack_req_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_PT_INDEX_ANY 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+wildcard for Portal indexes 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_RETAIN 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+disable unlinking 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_unlink_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_SR_DROP_COUNT 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+index for the dropped count register 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_sr_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:stat-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:nistatus}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_UNLINK 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+enable unlinking 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_unlink_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
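+\layout Standard
+
+The following sketch, provided for illustration only, shows how several
+ of these constants might be combined when a memory descriptor is created
+ and attached.
+ The 
+\family typewriter 
+ptl_md_t
+\family default 
+ member names, the header name, and the 
+\emph on 
+PtlMDAttach
+\emph default 
+ argument order are assumptions based on Sections\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+ and 
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+; every identifier that is not a 
+\family typewriter 
+PTL_
+\family default 
+ name is hypothetical.
+\layout LyX-Code
+
+#include <portals/p30.h>  /* assumed header name for the API declarations */
+\layout LyX-Code
+
+/* Sketch: attach a put-only MD that never generates acknowledgements
+\layout LyX-Code
+
+ * or events; member and argument names are assumptions (see above).  */
+\layout LyX-Code
+
+static int attach_silent_md(ptl_handle_me_t me, void *buf, ptl_size_t len,
+\layout LyX-Code
+
+                            ptl_handle_md_t *mdh)
+\layout LyX-Code
+
+{
+\layout LyX-Code
+
+        ptl_md_t md;
+\layout LyX-Code
+
+        md.start     = buf;
+\layout LyX-Code
+
+        md.length    = len;
+\layout LyX-Code
+
+        md.threshold = PTL_MD_THRESH_INF;   /* never exhausted by traffic */
+\layout LyX-Code
+
+        md.options   = PTL_MD_OP_PUT | PTL_MD_ACK_DISABLE;
+\layout LyX-Code
+
+        md.user_ptr  = NULL;
+\layout LyX-Code
+
+        md.eventq    = PTL_EQ_NONE;         /* no event queue for this MD */
+\layout LyX-Code
+
+        /* PTL_RETAIN keeps the match entry when the MD is unlinked later. */
+\layout LyX-Code
+
+        return PtlMDAttach(me, md, PTL_RETAIN, mdh);
+\layout LyX-Code
+
+}
+\layout Standard
+
+A caller would check the returned value against the return codes summarized
+ above, for example 
+\family typewriter 
+PTL_INUSE
+\family default 
+ or 
+\family typewriter 
+PTL_NOSPACE
+\family default 
+ for 
+\emph on 
+PtlMDAttach
+\emph default 
+.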
+
+\layout Chapter
+
+The Semantics of Message Transmission
+\begin_inset LatexCommand \label{sec:semantics}
+
+\end_inset 
+
+
+\layout Standard
+
+The Portals API uses four types of messages: put requests, acknowledgements,
+ get requests, and replies.
+ In this section, we describe the information passed on the wire for each
+ type of message.
+ We also describe how this information is used to process incoming messages.
+\layout Section
+
+Sending Messages
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:put-wire}
+
+\end_inset 
+
+ summarizes the information that is transmitted for a put request.
+ The first column provides a descriptive name for the information, the second
+ column provides the type for this information, the third column identifies
+ the source of the information, and the fourth column provides additional
+ notes.
+ Most information that is transmitted is obtained directly from the 
+\emph on 
+PtlPut
+\emph default 
+ operation.
+ Notice that the handle for the memory descriptor used in the 
+\emph on 
+PtlPut
+\emph default 
+ operation is transmitted even though this value cannot be interpreted by
+ the target.
+ A value other than 
+\family typewriter 
+PTL_MD_NONE
+\family default 
+ in this field is interpreted as a request for an acknowledgement.
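+\layout Standard
+
+As a sketch of how this choice appears at the API level, the following fragment
+ issues a put and selects, through the acknowledgement request argument,
+ whether the target should generate an acknowledgement.
+ The 
+\emph on 
+PtlPut
+\emph default 
+ argument order shown here is an assumption; the normative prototype is
+ the one defined in Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+, and every identifier that is not a 
+\family typewriter 
+PTL_
+\family default 
+ name is hypothetical.
+\layout LyX-Code
+
+/* Sketch only: argument order assumed, non-PTL_* identifiers hypothetical. */
+\layout LyX-Code
+
+static int send_block(ptl_handle_md_t mem_desc, ptl_process_id_t target,
+\layout LyX-Code
+
+                      ptl_pt_index_t portal, ptl_ac_index_t cookie,
+\layout LyX-Code
+
+                      ptl_match_bits_t match_bits, int want_ack)
+\layout LyX-Code
+
+{
+\layout LyX-Code
+
+        ptl_ack_req_t ack = want_ack ? PTL_ACK_REQ : PTL_NOACK_REQ;
+\layout LyX-Code
+
+        /* With PTL_NOACK_REQ the implementation places PTL_MD_NONE in the
+\layout LyX-Code
+
+         * memory descriptor field, so no acknowledgement is generated.   */
+\layout LyX-Code
+
+        return PtlPut(mem_desc, ack, target, portal, cookie, match_bits, 0);
+\layout LyX-Code
+
+}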
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Information Passed in a Put Request
+\begin_inset LatexCommand \label{tab:put-wire}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="12" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Information 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+\emph on 
+PtlPut
+\emph default 
+ arg
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Notes 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+operation 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+indicates a put request 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+local information 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+user
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_uid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+local information
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+target
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+cookie 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ac_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+cookie
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_match_bits_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match_bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+offset
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_md_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+no ack if 
+\family typewriter 
+PTL_MD_NONE
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+length
+\family default 
+ member 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+data 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family roman 
+\emph on 
+bytes
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+start
+\family default 
+ and 
+\family typewriter 
+length
+\family default 
+ members 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:ack-wire}
+
+\end_inset 
+
+ summarizes the information transmitted in an acknowledgement.
+ Most of the information is simply echoed from the put request.
+ Notice that the initiator and target are obtained directly from the put
+ request, but are swapped in generating the acknowledgement.
+ The only new piece of information in the acknowledgement is the manipulated
+ length, which is determined as the put request is satisfied.
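+\layout Standard
+
+Although the acknowledgement itself is generated by the implementation,
+ the initiator typically observes it as an event on the event queue associated
+ with the memory descriptor named in the put.
+ The following fragment is a sketch only; the 
+\family typewriter 
+ptl_event_t
+\family default 
+ member name is an assumption, and the handles involved are hypothetical.
+\layout LyX-Code
+
+/* Sketch only: wait for the acknowledgement of an earlier put issued with
+\layout LyX-Code
+
+ * PTL_ACK_REQ on an MD whose event queue is eq; member names assumed.    */
+\layout LyX-Code
+
+static int wait_for_ack(ptl_handle_eq_t eq)
+\layout LyX-Code
+
+{
+\layout LyX-Code
+
+        ptl_event_t ev;
+\layout LyX-Code
+
+        int rc;
+\layout LyX-Code
+
+        do {
+\layout LyX-Code
+
+                rc = PtlEQWait(eq, &ev);
+\layout LyX-Code
+
+        } while (rc == PTL_OK && ev.type != PTL_EVENT_ACK_END);
+\layout LyX-Code
+
+        return rc;
+\layout LyX-Code
+
+}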
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Information Passed in an Acknowledgement
+\begin_inset LatexCommand \label{tab:ack-wire}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="10" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Information
+\series default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Put Information 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Notes 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+operation 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ indicates an acknowledgement 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_pt_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_match_bits_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ ptl_handle_md_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ requested length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ manipulated length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ obtained from the operation 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:get-wire}
+
+\end_inset 
+
+ summarizes the information that is transmitted for a get request.
+ Like the information transmitted in a put request, most of the information
+ transmitted in a get request is obtained directly from the 
+\emph on 
+PtlGet
+\emph default 
+ operation.
+ Unlike put requests, get requests do not include the event queue handle.
+ In this case, the reply is generated whenever the operation succeeds, and
+ the memory descriptor must not be unlinked until the reply is received.
+ As such, there is no advantage to explicitly sending the event queue handle.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Information Passed in a Get Request
+\begin_inset LatexCommand \label{tab:get-wire}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="11" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Information
+\series default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+\emph on 
+PtlGet
+\emph default 
+ argument
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Notes 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+operation 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+indicates a get operation 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+local information 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+user
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_uid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+local information
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+cookie 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ac_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+cookie 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_match_bits_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match_bits
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_md_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+length
+\family default 
+ member 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
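+\layout Standard
+
+For concreteness, the information listed in Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:get-wire}
+
+\end_inset 
+
+ could be collected in a C structure such as the one below.
+ This fragment is illustrative only: the structure and field names are not
+ part of the specification, and no particular wire encoding is implied.
+\layout LyX-Code
+
+
+\size small 
+/* illustrative rendering of the get request fields; not a wire format */
+\newline 
+typedef struct {
+\newline 
+    int              operation;   /* indicates a get operation */
+\newline 
+    ptl_process_id_t initiator;   /* local information */
+\newline 
+    ptl_uid_t        user;        /* local information */
+\newline 
+    ptl_process_id_t target;      /* target argument of PtlGet */
+\newline 
+    ptl_pt_index_t   portal;      /* portal argument */
+\newline 
+    ptl_ac_index_t   cookie;      /* cookie argument */
+\newline 
+    ptl_match_bits_t match_bits;  /* match_bits argument */
+\newline 
+    ptl_size_t       offset;      /* offset argument */
+\newline 
+    ptl_handle_md_t  mem_desc;    /* mem_desc argument */
+\newline 
+    ptl_size_t       length;      /* length member of the memory descriptor */
+\newline 
+} get_request_t;                  /* hypothetical name */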
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:reply-wire}
+
+\end_inset 
+
+ summarizes the information transmitted in a reply.
+ Like an acknowledgement, most of the information is simply echoed from
+ the get request.
+ The initiator and target are obtained directly from the get request, but
+ are swapped in generating the reply.
+ The only new information in the reply is the manipulated length and the
+ data, which are determined as the get request is satisfied.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Information Passed in a Reply
+\begin_inset LatexCommand \label{tab:reply-wire}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="11" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Information
+\series default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Get Information 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Notes 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+operation 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+indicates a reply 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_match_bits_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_md_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+requested length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+manipulated length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+obtained from the operation 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+data 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\emph on 
+bytes
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+obtained from the operation 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Section
+
+Receiving Messages
+\begin_inset LatexCommand \label{sec:receiving}
+
+\end_inset 
+
+
+\layout Standard
+
+When an incoming message arrives on a network interface, the communication
+ system first checks that the target process identified in the request is
+ a valid process that has initialized the network interface (i.e., that the
+ target process has a valid Portal table).
+ If this test fails, the communication system discards the message and increment
+s the dropped message count for the interface.
+ The remainder of the processing depends on the type of the incoming message.
+ Put and get messages are subject to access control checks and translation
+ (searching a match list), while acknowledgement and reply messages bypass
+ the access control checks and the translation step.
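+\layout Standard
+
+The following fragment is a minimal sketch of this dispatch step.
+ It is purely illustrative: the structure, field, and helper names are
+ hypothetical and do not appear in the Portals specification.
+\layout LyX-Code
+
+
+\size small 
+/* illustrative sketch of incoming message dispatch; all names hypothetical */
+\newline 
+void deliver( ni_state_t *ni, incoming_msg_t *msg )
+\newline 
+{
+\newline 
+    process_t *p = lookup_target( ni, msg->target );
+\newline 
+
+\newline 
+    if( p == NULL || p->portal_table == NULL ) {
+\newline 
+        ni->dropped_count++;      /* invalid target: discard and count */
+\newline 
+        return;
+\newline 
+    }
+\newline 
+
+\newline 
+    switch( msg->operation ) {
+\newline 
+    case OP_ACK:
+\newline 
+    case OP_REPLY:
+\newline 
+        deliver_to_md( p, msg );  /* bypass access control and translation */
+\newline 
+        break;
+\newline 
+    case OP_PUT:
+\newline 
+    case OP_GET:
+\newline 
+        /* access control check, then search the match list */
+\newline 
+        if( check_access( p, msg ) && translate( p, msg ) )
+\newline 
+            deliver_to_md( p, msg );
+\newline 
+        else
+\newline 
+            ni->dropped_count++;
+\newline 
+        break;
+\newline 
+    }
+\newline 
+}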
+\layout Standard
+
+Acknowledgement messages include a handle for the memory descriptor used
+ in the original 
+\emph on 
+PtlPut
+\emph default 
+ operation.
+ This memory descriptor will identify the event queue where the event should
+ be recorded.
+ Upon receipt of an acknowledgement, the runtime system only needs to confirm
+ that the memory descriptor and event queue still exist and that there is
+ space for another event.
+ Should any of these conditions fail, the message is simply discarded
+ and the dropped message count for the interface is incremented.
+ Otherwise, the system builds an acknowledgement event from the information
+ in the acknowledgement message and adds it to the event queue.
+\layout Standard
+
+Reception of reply messages is also relatively straightforward.
+ Each reply message includes a handle for a memory descriptor.
+ If this descriptor exists, it is used to receive the message.
+ A reply message will be dropped if the memory descriptor identified in
+ the request doesn't exist.
+ In this case, the dropped message count for the interface is incremented.
+ This is the only reason for dropping reply messages.
+ Every memory descriptor accepts and truncates incoming reply messages,
+ eliminating the other potential reasons for rejecting a reply message.
+\layout Standard
+
+The critical step in processing an incoming put or get request involves
+ mapping the request to a memory descriptor.
+ This step starts by using the Portal index in the incoming request to identify
+ a list of match entries.
+ This list of match entries is searched in order until a match entry is
+ found whose match criteria matches the match bits in the incoming request
+ and whose memory descriptor accepts the request.
+\layout Standard
+
+Because acknowledgement and reply messages are generated in response to requests
+ made by the process receiving these messages, the checks performed by the
+ runtime system for acknowledgements and replies are minimal.
+ In contrast, put and get messages are generated by remote processes and
+ the checks performed for these messages are more extensive.
+ Incoming put or get messages may be rejected because: 
+\layout Itemize
+
+the Portal index supplied in the request is not valid; 
+\layout Itemize
+
+the cookie supplied in the request is not a valid access control entry;
+\layout Itemize
+
+the access control entry identified by the cookie does not match the identifier
+ of the requesting process; 
+\layout Itemize
+
+the access control entry identified by the cookie does not
+ match the Portal index supplied in the request; or 
+\layout Itemize
+
+the match bits supplied in the request do not match any of the match entries
+ with a memory descriptor that accepts the request.
+\layout Standard
+
+In all cases, if the message is rejected, the incoming message is discarded
+ and the dropped message count for the interface is incremented.
+\layout Standard
+
+A memory descriptor may reject an incoming request for any of the following
+ reasons: 
+\layout Itemize
+
+the 
+\family typewriter 
+PTL_MD_PUT
+\family default 
+ or 
+\family typewriter 
+PTL_MD_GET
+\family default 
+ option has not been enabled and the operation is put or get, respectively;
+\layout Itemize
+
+the length specified in the request is too long for the memory descriptor
+ and the 
+\family typewriter 
+PTL_MD_TRUNCATE
+\family default 
+ option has not been enabled (see the sketch below).
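+\layout Standard
+
+A compact way to state these two checks is the following fragment.
+ It is illustrative only: the helper function and its arguments are hypothetical,
+ the option names are those used in the list above, and local offsets are
+ ignored for brevity.
+\layout LyX-Code
+
+
+\size small 
+/* illustrative only: the per-descriptor acceptance test described above */
+\newline 
+int md_accepts( ptl_md_t *md, int is_put, ptl_size_t requested_len )
+\newline 
+{
+\newline 
+    if( is_put && !(md->options & PTL_MD_PUT) )
+\newline 
+        return 0;     /* put operations not enabled on this descriptor */
+\newline 
+    if( !is_put && !(md->options & PTL_MD_GET) )
+\newline 
+        return 0;     /* get operations not enabled on this descriptor */
+\newline 
+    if( requested_len > md->length && !(md->options & PTL_MD_TRUNCATE) )
+\newline 
+        return 0;     /* too long and truncation not enabled */
+\newline 
+    return 1;         /* the memory descriptor accepts the request */
+\newline 
+}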
+\layout Chapter
+
+Examples
+\begin_inset LatexCommand \label{sec:examples}
+
+\end_inset 
+
+
+\layout Comment
+
+The examples presented in this chapter have not been updated to reflect
+ the current API.
+\layout Standard
+
+In this chapter we present several examples to illustrate expected usage
+ patterns for the Portals 3.2 API.
+ The first example describes how to implement parallel servers using the
+ features of the Portals 3.2 API.
+ This example covers the access control list and the use of remote managed
+ offsets.
+ The second example presents an approach to dealing with dropped requests.
+ This example covers aspects of match lists and memory descriptors.
+ The final example covers message reception in MPI.
+ This example illustrates more sophisticated uses of matching and a procedure
+ to update a memory descriptor.
+\layout Section
+
+Parallel File Servers
+\begin_inset LatexCommand \label{sec:expfs}
+
+\end_inset 
+
+
+\layout Standard
+
+Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:file}
+
+\end_inset 
+
+ illustrates the logical structure of a parallel file server.
+ In this case, the parallel server consists of four servers that stripe
+ application data across four disks.
+ We would like to present applications with the illusion that the file server
+ is a single entity.
+ We will assume that all of the processes that constitute the parallel server
+ have the same user id.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename file.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 196pt
+       lyxheight 147pt
+\end_inset 
+
+
+\layout Caption
+
+Parallel File Server
+\begin_inset LatexCommand \label{fig:file}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+When an application establishes a connection to the parallel file server,
+ it will allocate a Portal and access control list entry for communicating
+ with the server.
+ The access control list entry will include the Portal and match any process
+ running with the parallel file server's user id, so all of the file server
+ processes will have access to the Portal.
+ The Portal information and access control entry will be sent to the file
+ server at this time.
+ If the application and server need to have multiple, concurrent I/O operations,
+ they can use additional portals or match entries to keep the operations
+ from interfering with one another.
+\layout Standard
+
+When an application initiates an I/O operation, it first builds a memory
+ descriptor that describes the memory region involved in the operation.
+ This memory descriptor will enable the appropriate operation (put for read
+ operations and get for write operations) and enable the use of remote offsets
+ (this lets the servers decide where their data should be placed in the
+ memory region).
+ After creating the memory descriptor and linking it into the appropriate
+ Portal entry, the application sends a read or write request (using 
+\emph on 
+PtlPut
+\emph default 
+) to one of the file server processes.
+ The file server processes can then use put or get operations with the appropria
+te offsets to fill or retrieve the contents of the application's buffer.
+ To know when the operation has completed, the application can add an event
+ queue to the memory descriptor and add up the lengths of the remote operations
+ until the sum is the size of the requested I/O operation.
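+\layout Standard
+
+The following fragment sketches the application side of a read under this
+ scheme, in the style of the (not yet updated) examples later in this chapter.
+ The option name PTL_MD_MANAGE_REMOTE, used here to enable remote managed
+ offsets, and the helper send_read_request, which would carry the read request
+ to one of the server processes using 
+\emph on 
+PtlPut
+\emph default 
+, are assumptions made for the sake of the example.
+\layout LyX-Code
+
+
+\size small 
+void fs_read( void *buf, ptl_size_t len, ptl_pt_index_t portal,
+\newline 
+                    ptl_handle_eq_t eventq )
+\newline 
+{
+\newline 
+    ptl_md_t io_desc;
+\newline 
+    ptl_handle_md_t md_handle;
+\newline 
+    ptl_handle_me_t me_handle;
+\newline 
+    ptl_process_id_t any_server;
+\newline 
+    ptl_event_t event;
+\newline 
+    ptl_size_t received = 0;
+\newline 
+
+\newline 
+    /* the access control entry restricts access to the server's user id */
+\newline 
+    any_server.nid = PTL_ID_ANY;
+\newline 
+    any_server.pid = PTL_ID_ANY;
+\newline 
+
+\newline 
+    /* a read is filled by server put operations at server chosen offsets */
+\newline 
+    io_desc.start = buf;
+\newline 
+    io_desc.length = len;
+\newline 
+    io_desc.threshold = PTL_MD_THRESH_INF;
+\newline 
+    io_desc.options = PTL_MD_OP_PUT | PTL_MD_MANAGE_REMOTE;  /* assumed name */
+\newline 
+    io_desc.user_ptr = NULL;
+\newline 
+    io_desc.eventq = eventq;
+\newline 
+
+\newline 
+    PtlMEAttach( portal, any_server, 0, ~(ptl_match_bits_t)0, PTL_UNLINK,
+\newline 
+                        &me_handle );
+\newline 
+    PtlMDAttach( me_handle, io_desc, PTL_UNLINK, &md_handle );
+\newline 
+
+\newline 
+    /* hypothetical helper: sends the read request to a server via PtlPut */
+\newline 
+    send_read_request( portal, len );
+\newline 
+
+\newline 
+    /* completion: add up the lengths of the remote put operations */
+\newline 
+    while( received < len ) {
+\newline 
+        if( PtlEQWait( eventq, &event ) != PTL_OK ) break;
+\newline 
+        received += event.mlength;
+\newline 
+    }
+\newline 
+}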
+\layout Section
+
+Dealing with Dropped Requests
+\begin_inset LatexCommand \label{sec:exdrop}
+
+\end_inset 
+
+
+\layout Standard
+
+If a process does not anticipate unexpected requests, they will be discarded.
+ Applications using the Portals API can query the dropped count for the
+ interface to determine the number of requests that have been dropped (see
+ Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:nistatus}
+
+\end_inset 
+
+).
+ While this approach minimizes resource consumption, it does not provide
+ information that might be critical in debugging the implementation of a
+ higher level protocol.
+\layout Standard
+
+To keep track of more information about dropped requests, we use a memory
+ descriptor that truncates each incoming request to zero bytes and logs
+ the 
+\begin_inset Quotes eld
+\end_inset 
+
+dropped
+\begin_inset Quotes erd
+\end_inset 
+
+ operations in an event queue.
+ Note that the operations are not dropped in the Portals sense, because
+ the operation succeeds.
+\layout Standard
+
+The following code fragment illustrates an implementation of this approach.
+ In this case, we assume that a thread is launched to execute the function
+\family typewriter 
+watch_drop
+\family default 
+.
+ This code starts by building an event queue to log truncated operations
+ and a memory descriptor to truncate the incoming requests.
+ This example only captures 
+\begin_inset Quotes eld
+\end_inset 
+
+dropped
+\begin_inset Quotes erd
+\end_inset 
+
+ requests for a single portal.
+ In a more realistic situation, the memory descriptor would be appended
+ to the match list for every portal.
+ We also assume that the thread is capable of keeping up with the 
+\begin_inset Quotes eld
+\end_inset 
+
+dropped
+\begin_inset Quotes erd
+\end_inset 
+
+ requests.
+ If this is not the case, we could use a finite threshold on the memory
+ descriptor to capture the first few dropped requests.
+\layout LyX-Code
+
+
+\size small 
+#include <stdio.h>
+\newline 
+#include <stdlib.h>
+\newline 
+#include <portals.h>
+\newline 
+
+\newline 
+#define DROP_SIZE 32       /* number of dropped requests to track */
+\newline 
+
+\newline 
+int watch_drop( ptl_handle_ni_t ni, ptl_pt_index_t index ) {
+\newline 
+    ptl_handle_eq_t drop_events;
+\newline 
+    ptl_event_t event;
+\newline 
+    ptl_handle_md_t drop_em;
+\newline 
+    ptl_md_t drop_desc;
+\newline 
+    ptl_process_id_t any_proc;
+\newline 
+    ptl_handle_me_t match_any;
+\newline 
+
+\newline 
+    /* create the event queue */
+\newline 
+    if( PtlEQAlloc(ni, DROP_SIZE, &drop_events) != PTL_OK ) {
+\newline 
+        fprintf( stderr, "Couldn't create the event queue
+\backslash 
+n" );
+\newline 
+        exit( 1 );
+\newline 
+    }
+\newline 
+
+\newline 
+    /* build a match entry */
+\newline 
+    any_proc.nid = PTL_ID_ANY;
+\newline 
+    any_proc.pid = PTL_ID_ANY;
+\newline 
+    PtlMEAttach( index, any_proc, 0, ~(ptl_match_bits_t)0, PTL_RETAIN,
+\newline 
+                        &match_any );
+\newline 
+
+\newline 
+    /* create the memory descriptor */
+\newline 
+    drop_desc.start = NULL;
+\newline 
+    drop_desc.length = 0;
+\newline 
+    drop_desc.threshold = PTL_MD_THRESH_INF;
+\newline 
+    drop_desc.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_TRUNCATE;
+\newline 
+    drop_desc.user_ptr = NULL;
+\newline 
+    drop_desc.eventq = drop_events;
+\newline 
+    if( PtlMDAttach(match_any, drop_desc, &drop_em) != PTL_OK ) {
+\newline 
+        fprintf( stderr, "Couldn't create the memory descriptor
+\backslash 
+n" );
+\newline 
+        exit( 1 );
+\newline 
+    }
+\newline 
+
+\newline 
+    /* watch for "dropped" requests */
+\newline 
+    while( 1 ) {
+\newline 
+        if( PtlEQWait( drop_events, &event ) != PTL_OK ) break;
+\newline 
+        fprintf( stderr, "Dropped request from gid = event.initiator.gid,
+ event.initiator.rid );
+\newline 
+    }
+\newline 
+}
+\layout Section
+
+Message Transmission in MPI
+\begin_inset LatexCommand \label{sec:exmpi}
+
+\end_inset 
+
+
+\layout Standard
+
+We conclude this section with a fairly extensive example that describes
+ an approach to implementing message transmission for MPI.
+ Like many MPI implementations, we distinguish two message transmission
+ protocols: a short message protocol and a long message protocol.
+ We use the constant 
+\family typewriter 
+MPI_LONG_LENGTH
+\family default 
+ to determine the size of a long message.
+\layout Standard
+
+For small messages, the sender simply sends the message and presumes that
+ the message will be received (i.e., the receiver has allocated a memory region
+ to receive the message body).
+ For large messages, the sender also sends the message, but does not presume
+ that the message body will be saved.
+ Instead, the sender builds a memory descriptor for the message and enables
+ get operations on this descriptor.
+ If the target does not save the body of the message, it will record an
+ event for the put operation.
+ When the process later issues a matching MPI receive, it will perform a
+ get operation to retrieve the body of the message.
+\layout Standard
+
+To facilitate receive side matching based on the protocol, we use the most
+ significant bit in the match bits to indicate the protocol: 1 for long
+ messages and 0 for short messages.
+\layout Standard
+
+The following code presents a function that implements the send side of
+ the protocol.
+ The global variable 
+\family typewriter 
+EndGet
+\family default 
+ is the last match entry attached to the Portal index used for posting long
+ messages.
+ This entry does not match any incoming requests (i.e., the memory descriptor
+ rejects all get operations) and is built during initialization of the MPI
+ library.
+ The other global variable, 
+\family typewriter 
+MPI_NI
+\family default 
+, is a handle for the network interface used by the MPI implementation.
+\layout LyX-Code
+
+
+\size small 
+extern ptl_handle_me_t EndGet;
+\newline 
+extern ptl_handle_ni_t MPI_NI;
+\newline 
+
+\newline 
+void MPIsend( void *buf, ptl_size_t len, void *data, ptl_handle_eq_t eventq,
+\newline 
+                    ptl_process_id_t target, ptl_match_bits_t match ) 
+\newline 
+{
+\newline 
+    ptl_handle_md_t send_handle;
+\newline 
+    ptl_md_t mem_desc;
+\newline 
+    ptl_ack_req_t want_ack;
+\newline 
+
+\newline 
+    mem_desc.start = buf;
+\newline 
+    mem_desc.length = len;
+\newline 
+    mem_desc.threshold = 1;
+\newline 
+    mem_desc.options = PTL_MD_GET_OP;
+\newline 
+    mem_desc.user_ptr = data;
+\newline 
+    mem_desc.eventq = eventq;
+\newline 
+
+\newline 
+    if( len >= MPI_LONG_LENGTH ) {
+\newline 
+        ptl_handle_me_t me_handle;
+\newline 
+
+\newline 
+        /* add a match entry to the end of the get list */
+\newline 
+        PtlMEInsert( target, match, 0, PTL_UNLINK, PTL_INS_BEFORE, EndGet,
+ &me_handle );
+\newline 
+        PtlMDAttach( me_handle, mem_desc, PTL_UNLINK, NULL );
+\newline 
+
+\newline 
+        /* we want an ack for long messages */
+\newline 
+        want_ack = PTL_ACK_REQ;
+\newline 
+
+\newline 
+        /* set the protocol bit to indicate that this is a long message
+ */
+\newline 
+        match |= (ptl_match_bits_t)1 << 63;
+\newline 
+    } else {
+\newline 
+        /* we don't want an ack for short messages */
+\newline 
+        want_ack = PTL_NOACK_REQ;
+\newline 
+
+\newline 
+        /* set the protocol bit to indicate that this is a short message
+ */
+\newline 
+        match &= ~((ptl_match_bits_t)1 << 63);
+\newline 
+    }
+\newline 
+
+\newline 
+   /* create a memory descriptor and send it */
+\newline 
+   PtlMDBind( MPI_NI, mem_desc, &send_handle );
+\newline 
+   PtlPut( send_handle, want_ack, target, MPI_SEND_PINDEX, MPI_AINDEX, match,
+ 0 );
+\newline 
+}
+\layout Standard
+
+The 
+\emph on 
+MPIsend
+\emph default 
+ function returns as soon as the message has been scheduled for transmission.
+ The event queue argument, 
+\family typewriter 
+eventq
+\family default 
+, can be used to determine the disposition of the message.
+ Assuming that 
+\family typewriter 
+eventq
+\family default 
+ is not 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+, a 
+\family typewriter 
+PTL_EVENT_SENT
+\family default 
+ event will be recorded for each message as the message is transmitted.
+ For small messages, this is the only event that will be recorded in 
+\family typewriter 
+eventq
+\family default 
+.
+ In contrast, long messages include an explicit request for an acknowledgement.
+ If the 
+\family typewriter 
+target
+\family default 
+ process has posted a matching receive, the acknowledgement will be sent
+ as the message is received.
+ If a matching receive has not been posted, the message will be discarded
+ and no acknowledgement will be sent.
+ When the 
+\family typewriter 
+target
+\family default 
+ process later issues a matching receive, the receive will be translated
+ into a get operation and a 
+\family typewriter 
+PTL_EVENT_GET
+\family default 
+ event will be recorded in 
+\family typewriter 
+eventq
+\family default 
+.
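+\layout Standard
+
+A sender that wants to track these outcomes can simply drain the event queue.
+ The sketch below is illustrative only; the event 
+\family typewriter 
+type
+\family default 
+ field and the 
+\family typewriter 
+PTL_EVENT_ACK
+\family default 
+ constant are assumed names and are not taken from the discussion above.
+\layout LyX-Code
+
+
+\size small 
+void drain_send_events( ptl_handle_eq_t eventq )
+\newline 
+{
+\newline 
+    ptl_event_t event;
+\newline 
+
+\newline 
+    while( PtlEQWait( eventq, &event ) == PTL_OK ) {
+\newline 
+        switch( event.type ) {           /* assumed field name */
+\newline 
+        case PTL_EVENT_SENT:
+\newline 
+            /* message left the sender; short messages are now complete */
+\newline 
+            break;
+\newline 
+        case PTL_EVENT_ACK:              /* assumed constant */
+\newline 
+            /* a long message was matched by a preposted receive */
+\newline 
+            break;
+\newline 
+        case PTL_EVENT_GET:
+\newline 
+            /* an unexpected long message was later retrieved by the target */
+\newline 
+            break;
+\newline 
+        }
+\newline 
+    }
+\newline 
+}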
+\layout Standard
+
+Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:mpi}
+
+\end_inset 
+
+ illustrates the organization of the match list used for receiving MPI messages.
+ The initial entries (not shown in this figure) would be used to match the
+ MPI receives that have been preposted by the application.
+ The preposted receives are followed by a match entry, 
+\emph on 
+RcvMark
+\emph default 
+, that marks the boundary between preposted receives and the memory descriptors
+ used for 
+\begin_inset Quotes eld
+\end_inset 
+
+unexpected
+\begin_inset Quotes erd
+\end_inset 
+
+ messages.
+ The 
+\emph on 
+RcvMark
+\emph default 
+ entry is followed by a small collection of match entries that match unexpected
+\begin_inset Quotes eld
+\end_inset 
+
+short
+\begin_inset Quotes erd
+\end_inset 
+
+ messages, i.e., messages that have a 0 in the most significant bit of their
+ match bits.
+ The memory descriptors associated with these match entries will append
+ the incoming message to the associated memory descriptor and record an
+ event in an event queue for unexpected messages.
+ The unexpected short message matching entries are followed by a match entry
+ that will match messages that were not matched by the preceding match entries,
+ i.e., the unexpected long messages.
+ The memory descriptor associated with this match entry truncates the message
+ body and records an event in the event queue for unexpected messages.
+ Note that all of the memory descriptors used for unexpected messages share
+ a common event queue.
+ This makes it possible to process the unexpected messages in the order
+ in which they arrived, regardless of which match entry accepted them.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename mpi.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 389pt
+       lyxheight 284pt
+\end_inset 
+
+
+\layout Caption
+
+Message Reception in MPI
+\begin_inset LatexCommand \label{fig:mpi}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+When the local MPI process posts an MPI receive, we must first search the
+ events in the unexpected message queue to see if a matching message has already
+ arrived.
+ If no matching message is found, a match entry for the receive is inserted
+ before the 
+\emph on 
+RcvMark
+\emph default 
+ entry--after the match entries for all of the previously posted receives
+ and before the match entries for the unexpected messages.
+ This ensures that preposted receives are matched in the order that they
+ were posted (a requirement of MPI).
+\layout Standard
+
+While this strategy respects the temporal semantics of MPI, it introduces
+ a race condition: a matching message might arrive after the events in the
+ unexpected message queue have been searched, but before the match entry
+ for the receive has been inserted in the match list.
+\layout Standard
+
+To avoid this race condition we start by setting the 
+\family typewriter 
+threshold
+\family default 
+ of the memory descriptor to 0, making the descriptor inactive.
+ We then insert the match entry into the match list and proceed to search
+ the events in the unexpected message queue.
+ A matching message that arrives as we are searching the unexpected message
+ queue will not be accepted by the memory descriptor and, if not matched
+ by an earlier match list element, will add an event to the unexpected message
+ queue.
+ After searching the events in the unexpected message queue, we update the
+ memory descriptor, setting its threshold to 1 to activate it.
+ This update is predicated on the unexpected message queue being empty.
+ We repeat the process of searching the unexpected message queue until the
+ update succeeds.
+\layout Standard
+
+The following code fragment illustrates this approach.
+ Because events must be removed from the unexpected message queue to be
+ examined, this code fragment assumes the existence of a user managed event
+ list, 
+\family typewriter 
+Rcvd
+\family default 
+, for the events that have already been removed from the unexpected message
+ queue.
+ In an effort to keep the example focused on the basic protocol, we have
+ omitted the code that would be needed to manage the memory descriptors
+ used for unexpected short messages.
+ In particular, we simply leave messages in these descriptors until they
+ are received by the application.
+ In a robust implementation, we would introduce code to ensure that short
+ unexpected messages are removed from these memory descriptors so that they
+ can be re-used.
+\layout LyX-Code
+
+
+\size small 
+extern ptl_handle_eq_t UnexpQueue;
+\newline 
+extern ptl_handle_me_t RcvMark;
+\newline 
+extern ptl_handle_me_t ShortMatch;
+\newline 
+
+\newline 
+typedef struct event_list_tag {
+\newline 
+    ptl_event_t            event;
+\newline 
+    struct event_list_tag* next;
+\newline 
+} event_list;
+\newline 
+
+\newline 
+extern event_list Rcvd;
+\newline 
+
+\newline 
+void AppendRcvd( ptl_event_t event )
+\newline 
+{
+\newline 
+    /* append an event onto the Rcvd list */
+\newline 
+}
+\newline 
+
+\newline 
+int SearchRcvd( void *buf, ptl_size_t *len, ptl_process_id_t sender, ptl_match_bi
+ts_t match,
+\newline 
+                       ptl_match_bits_t ignore, ptl_event_t *event )
+\newline 
+{
+\newline 
+    /* Search the Rcvd event queue, looking for a message that matches the
+ requested message.
+\newline 
+     * If one is found, remove the event from the Rcvd list and return it.
+ */
+\newline 
+}
+\newline 
+
+\newline 
+typedef enum { RECEIVED, POSTED } receive_state;
+\newline 
+
+\newline 
+receive_state CopyMsg( void *buf, ptl_size_t *length, ptl_event_t event,
+ ptl_md_t md_buf )
+\newline 
+{
+\newline 
+    ptl_handle_md_t md_handle;
+\newline 
+
+\newline 
+    if( event.rlength >= MPI_LONG_LENGTH ) {
+\newline 
+        PtlMDBind( MPI_NI, md_buf, &md_handle );
+\newline 
+        PtlGet( event.initiator, MPI_GET_PINDEX, 0, event.match_bits, MPI_AINDEX,
+ md_handle );
+\newline 
+        return POSTED;
+\newline 
+    } else {
+\newline 
+        /* copy the message */
+\newline 
+        if( event.mlength < *length ) *length = event.mlength;
+\newline 
+        memcpy( buf, (char*)event.md_desc.start+event.offset, *length );
+\newline 
+        return RECEIVED;
+\newline 
+    }
+\newline 
+}
+\newline 
+
+\newline 
+receive_state MPIreceive( void *buf, ptl_size_t *len, void *MPI_data, ptl_handle
+_eq_t eventq, 
+\newline 
+                           ptl_process_id_t sender, ptl_match_bits_t match,
+ ptl_match_bits_t ignore )
+\newline 
+{
+\newline 
+    ptl_md_t md_buf;
+\newline 
+    ptl_handle_md_t md_handle;
+\newline 
+    ptl_handle_me_t me_handle;
+\newline 
+    ptl_event_t event;
+\newline 
+
+\newline 
+    /* build a memory descriptor for the receive */
+\newline 
+    md_buf.start = buf;
+\newline 
+    md_buf.length = *len;
+\newline 
+    md_buf.threshold = 0;     /* temporarily disabled */
+\newline 
+    md_buf.options = PTL_MD_PUT_OP;
+\newline 
+    md_buf.user_ptr = MPI_data;
+\newline 
+    md_buf.eventq = eventq;
+\newline 
+
+\newline 
+    /* see if we have already received the message */
+\newline 
+    if( SearchRcvd(buf, len, sender, match, ignore, &event) )
+\newline 
+         return CopyMsg( buf, len, event, md_buf );
+\newline 
+
+\newline 
+    /* create the match entry and attach the  memory descriptor */
+\newline 
+    PtlMEInsert(sender, match, ignore, PTL_UNLINK, PTL_INS_BEFORE, RcvMark,
+ &me_handle);
+\newline 
+    PtlMDAttach( me_handle, md_buf, PTL_UNLINK, &md_handle );
+\newline 
+
+\newline 
+    md_buf.threshold = 1;
+\newline 
+    do
+\newline 
+        if( PtlEQGet( UnexpQueue, &event ) != PTL_EQ_EMPTY ) {
+\newline 
+            if( MPIMatch(event, match, ignore, sender) ) {
+\newline 
+                return CopyMsg( buf, len, event, md_buf );
+\newline 
+            } else {
+\newline 
+                AppendRcvd( event );
+\newline 
+            }
+\newline 
+        }
+\newline 
+    while( PtlMDUpdate(md_handle, NULL, &md_buf, UnexpQueue) == PTL_NOUPDATE
+ );
+\newline 
+    return POSTED;
+\newline 
+}
+\layout Chapter*
+
+Acknowledgments
+\layout Standard
+
+Several people have contributed to the philosophy, design, and implementation
+ of the Portals message passing architecture as it has evolved.
+ We acknowledge the following people for their contributions: Al Audette,
+ Lee Ann Fisk, David Greenberg, Tramm Hudson, Gabi Istrail, Chu Jong, Mike
+ Levenhagen, Jim Otto, Mark Sears, Lance Shuler, Mack Stallcup, Jeff VanDyke,
+ Dave van Dresser, Lee Ward, and Stephen Wheat.
+\layout Standard
+
+
+\begin_inset LatexCommand \BibTeX[ieee]{portals3}
+
+\end_inset 
+
+
+\the_end
diff --git a/lnet/doc/put.fig b/lnet/doc/put.fig
new file mode 100644 (file)
index 0000000..5235b6d
--- /dev/null
@@ -0,0 +1,32 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 1350 900 2175 1200
+4 0 0 100 0 0 10 0.0000 0 105 825 1350 1200 Transmission\001
+4 0 0 100 0 0 10 0.0000 0 105 285 1620 1050 Data\001
+-6
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        2700 1275 2700 1725
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        900 525 2700 1200
+2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5
+        0 300 1200 300 1200 2250 0 2250 0 300
+2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5
+        2400 300 3600 300 3600 2250 2400 2250 2400 300
+2 1 1 1 0 7 100 0 -1 4.000 0 0 7 1 0 2
+       0 0 1.00 60.00 120.00
+        2699 1788 899 1938
+4 0 0 100 0 0 10 0.0000 0 105 720 2775 1650 Translation\001
+4 1 0 100 0 0 10 0.0000 0 135 555 1800 2025 Optional\001
+4 1 0 100 0 0 10 0.0000 0 135 1170 1800 2175 Acknowledgement\001
+4 0 0 100 0 0 10 0.0000 0 105 405 2850 1500 Portal\001
+4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001
+4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001
diff --git a/lnet/include/.cvsignore b/lnet/include/.cvsignore
new file mode 100644 (file)
index 0000000..d45f796
--- /dev/null
@@ -0,0 +1,4 @@
+config.h
+stamp-h
+stamp-h1
+stamp-h.in
diff --git a/lnet/include/Makefile.am b/lnet/include/Makefile.am
new file mode 100644 (file)
index 0000000..2cf7f99
--- /dev/null
@@ -0,0 +1,8 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+SUBDIRS = portals linux
+EXTRA_DIST = config.h.in
+include $(top_srcdir)/Rules
diff --git a/lnet/include/config.h.in b/lnet/include/config.h.in
new file mode 100644 (file)
index 0000000..b05d0c4
--- /dev/null
@@ -0,0 +1,11 @@
+/* ../include/config.h.in.  Generated automatically from configure.in by autoheader.  */
+
+/* Define if you have the readline library (-lreadline).  */
+#undef HAVE_LIBREADLINE
+
+/* Name of package */
+#undef PACKAGE
+
+/* Version number of package */
+#undef VERSION
+
diff --git a/lnet/include/linux/Makefile.am b/lnet/include/linux/Makefile.am
new file mode 100644 (file)
index 0000000..6a65cb5
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include $(top_srcdir)/Rules
+
+linuxincludedir = $(includedir)/linux
+
+linuxinclude_HEADERS=kp30.h portals_lib.h
diff --git a/lnet/include/linux/kp30.h b/lnet/include/linux/kp30.h
new file mode 100644 (file)
index 0000000..6d7f3f3
--- /dev/null
@@ -0,0 +1,943 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef _KP30_INCLUDED
+#define _KP30_INCLUDED
+
+
+#define PORTAL_DEBUG
+
+#ifndef offsetof
+# define offsetof(typ,memb)    ((int)((char *)&(((typ *)0)->memb)))
+#endif
+
+#define LOWEST_BIT_SET(x)      ((x) & ~((x) - 1))
+
+#ifndef CONFIG_SMP
+# define smp_processor_id() 0
+#endif
+
+/*
+ *  Debugging
+ */
+extern unsigned int portal_subsystem_debug;
+extern unsigned int portal_stack;
+extern unsigned int portal_debug;
+extern unsigned int portal_printk;
+/* Debugging subsystems  (8 bit ID)
+ *
+ * If you add debug subsystem #32, you need to send email to phil, because
+ * you're going to break kernel subsystem debug filtering. */
+#define S_UNDEFINED    (0 << 24)
+#define S_MDC          (1 << 24)
+#define S_MDS          (2 << 24)
+#define S_OSC          (3 << 24)
+#define S_OST          (4 << 24)
+#define S_CLASS        (5 << 24)
+#define S_OBDFS        (6 << 24) /* obsolete */
+#define S_LLITE        (7 << 24)
+#define S_RPC          (8 << 24)
+#define S_EXT2OBD      (9 << 24) /* obsolete */
+#define S_PORTALS     (10 << 24)
+#define S_SOCKNAL     (11 << 24)
+#define S_QSWNAL      (12 << 24)
+#define S_PINGER      (13 << 24)
+#define S_FILTER      (14 << 24)
+#define S_TRACE       (15 << 24) /* obsolete */
+#define S_ECHO        (16 << 24)
+#define S_LDLM        (17 << 24)
+#define S_LOV         (18 << 24)
+#define S_GMNAL       (19 << 24)
+#define S_PTLROUTER   (20 << 24)
+#define S_COBD        (21 << 24)
+#define S_PTLBD       (22 << 24)
+#define S_LOG         (23 << 24)
+
+/* If you change these values, please keep portals/linux/utils/debug.c
+ * up to date! */
+
+/* Debugging masks (24 bits, non-overlapping) */
+#define D_TRACE     (1 << 0) /* ENTRY/EXIT markers */
+#define D_INODE     (1 << 1)
+#define D_SUPER     (1 << 2)
+#define D_EXT2      (1 << 3) /* anything from ext2_debug */
+#define D_MALLOC    (1 << 4) /* print malloc, free information */
+#define D_CACHE     (1 << 5) /* cache-related items */
+#define D_INFO      (1 << 6) /* general information */
+#define D_IOCTL     (1 << 7) /* ioctl related information */
+#define D_BLOCKS    (1 << 8) /* ext2 block allocation */
+#define D_NET       (1 << 9) /* network communications */
+#define D_WARNING   (1 << 10)
+#define D_BUFFS     (1 << 11)
+#define D_OTHER     (1 << 12)
+#define D_DENTRY    (1 << 13)
+#define D_PORTALS   (1 << 14) /* ENTRY/EXIT markers */
+#define D_PAGE      (1 << 15) /* bulk page handling */
+#define D_DLMTRACE  (1 << 16)
+#define D_ERROR     (1 << 17) /* CERROR(...) == CDEBUG (D_ERROR, ...) */
+#define D_EMERG     (1 << 18) /* CEMERG(...) == CDEBUG (D_EMERG, ...) */
+#define D_HA        (1 << 19) /* recovery and failover */
+#define D_RPCTRACE  (1 << 20) /* for distributed debugging */
+#define D_VFSTRACE  (1 << 21)
+
+#ifndef __KERNEL__
+#define THREAD_SIZE 8192
+#endif
+#ifdef  __ia64__
+#define CDEBUG_STACK() (THREAD_SIZE -                                      \
+                        ((unsigned long)__builtin_dwarf_cfa() &            \
+                         (THREAD_SIZE - 1)))
+#else
+#define CDEBUG_STACK() (THREAD_SIZE -                                      \
+                        ((unsigned long)__builtin_frame_address(0) &       \
+                         (THREAD_SIZE - 1)))
+#endif
+
+#ifdef __KERNEL__
+#define CHECK_STACK(stack)                                                    \
+        do {                                                                  \
+                if ((stack) > 3*THREAD_SIZE/4 && (stack) > portal_stack) {    \
+                        portals_debug_msg(DEBUG_SUBSYSTEM, D_ERROR,           \
+                                          __FILE__, __FUNCTION__, __LINE__,   \
+                                          (stack),                            \
+                                          "maximum lustre stack %u\n",        \
+                                          portal_stack = (stack));            \
+                      /*panic("LBUG");*/                                      \
+                }                                                             \
+        } while (0)
+#else
+#define CHECK_STACK(stack) do { } while(0)
+#endif
+
+#if 1
+#define CDEBUG(mask, format, a...)                                            \
+do {                                                                          \
+        CHECK_STACK(CDEBUG_STACK());                                          \
+        if (!(mask) || ((mask) & (D_ERROR | D_EMERG)) ||                      \
+            (portal_debug & (mask) &&                                         \
+             portal_subsystem_debug & (1 << (DEBUG_SUBSYSTEM >> 24))))        \
+                portals_debug_msg(DEBUG_SUBSYSTEM, mask,                      \
+                                  __FILE__, __FUNCTION__, __LINE__,           \
+                                  CDEBUG_STACK(), format , ## a);             \
+} while (0)
+
+#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a)
+#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a)
+#define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a)
+
+#define GOTO(label, rc)                                                 \
+do {                                                                    \
+        long GOTO__ret = (long)(rc);                                    \
+        CDEBUG(D_TRACE,"Process leaving via %s (rc=%lu : %ld : %lx)\n", \
+               #label, (unsigned long)GOTO__ret, (signed long)GOTO__ret,\
+               (signed long)GOTO__ret);                                 \
+        goto label;                                                     \
+} while (0)
+
+#define RETURN(rc)                                                      \
+do {                                                                    \
+        typeof(rc) RETURN__ret = (rc);                                  \
+        CDEBUG(D_TRACE, "Process leaving (rc=%lu : %ld : %lx)\n",       \
+               (long)RETURN__ret, (long)RETURN__ret, (long)RETURN__ret);\
+        return RETURN__ret;                                             \
+} while (0)
+
+#define ENTRY                                                           \
+do {                                                                    \
+        CDEBUG(D_TRACE, "Process entered\n");                           \
+} while (0)
+
+#define EXIT                                                            \
+do {                                                                    \
+        CDEBUG(D_TRACE, "Process leaving\n");                           \
+} while(0)
+#else
+#define CDEBUG(mask, format, a...)      do { } while (0)
+#define CWARN(format, a...)             do { } while (0)
+#define CERROR(format, a...)            printk("<3>" format, ## a)
+#define CEMERG(format, a...)            printk("<0>" format, ## a)
+#define GOTO(label, rc)                 do { (void)(rc); goto label; } while (0)
+#define RETURN(rc)                      return (rc)
+#define ENTRY                           do { } while (0)
+#define EXIT                            do { } while (0)
+#endif
+
+
+#ifdef __KERNEL__
+# include <linux/vmalloc.h>
+# include <linux/time.h>
+# include <linux/slab.h>
+# include <linux/interrupt.h>
+# include <linux/highmem.h>
+# include <linux/module.h>
+# include <linux/version.h>
+# include <portals/lib-nal.h>
+# include <linux/smp_lock.h>
+# include <asm/atomic.h>
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#define schedule_work schedule_task
+#define prepare_work(wq,cb,cbdata)                                            \
+do {                                                                          \
+        INIT_TQUEUE((wq), 0, 0);                                              \
+        PREPARE_TQUEUE((wq), (cb), (cbdata));                                 \
+} while (0)
+
+#define ll_invalidate_inode_pages invalidate_inode_pages
+#define PageUptodate Page_Uptodate
+#define our_recalc_sigpending(current) recalc_sigpending(current)
+#define num_online_cpus() smp_num_cpus
+static inline void our_cond_resched(void)
+{
+        if (current->need_resched)
+               schedule ();
+}
+
+#else
+
+#define prepare_work(wq,cb,cbdata)                                            \
+do {                                                                          \
+        INIT_WORK((wq), (void *)(cb), (void *)(cbdata));                      \
+} while (0)
+#define ll_invalidate_inode_pages(inode) invalidate_inode_pages((inode)->i_mapping)
+#define wait_on_page wait_on_page_locked
+#define our_recalc_sigpending(current) recalc_sigpending()
+#define strtok(a,b) strpbrk(a, b)
+static inline void our_cond_resched(void)
+{
+        cond_resched();
+}
+#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) */
+
+#ifdef PORTAL_DEBUG
+extern void kportal_assertion_failed(char *expr,char *file,char *func,int line);
+#define LASSERT(e) ((e) ? 0 : kportal_assertion_failed( #e , __FILE__,  \
+                                                        __FUNCTION__, __LINE__))
+#else
+#define LASSERT(e)
+#endif
+
+#ifdef __arch_um__
+#define LBUG_WITH_LOC(file, func, line)                                 \
+do {                                                                    \
+        CEMERG("LBUG - trying to dump log to /tmp/lustre-log\n");       \
+        portals_debug_dumplog();                                        \
+        portals_run_lbug_upcall(file, func, line);                      \
+        panic("LBUG");                                                  \
+} while (0)
+#else
+#define LBUG_WITH_LOC(file, func, line)                                 \
+do {                                                                    \
+        CEMERG("LBUG\n");                                               \
+        portals_debug_dumplog();                                        \
+        portals_run_lbug_upcall(file, func, line);                      \
+        set_task_state(current, TASK_UNINTERRUPTIBLE);                  \
+        schedule();                                                     \
+} while (0)
+#endif /* __arch_um__ */
+
+#define LBUG() LBUG_WITH_LOC(__FILE__, __FUNCTION__, __LINE__)
+
+/*
+ * Memory
+ */
+#ifdef PORTAL_DEBUG
+extern atomic_t portal_kmemory;
+
+# define portal_kmem_inc(ptr, size)                                           \
+do {                                                                          \
+        atomic_add(size, &portal_kmemory);                                    \
+} while (0)
+
+# define portal_kmem_dec(ptr, size) do {                                      \
+        atomic_sub(size, &portal_kmemory);                                    \
+} while (0)
+
+#else
+# define portal_kmem_inc(ptr, size) do {} while (0)
+# define portal_kmem_dec(ptr, size) do {} while (0)
+#endif /* PORTAL_DEBUG */
+
+#define PORTAL_VMALLOC_SIZE        16384
+
+#define PORTAL_ALLOC(ptr, size)                                           \
+do {                                                                      \
+        long s = size;                                                    \
+        LASSERT (!in_interrupt());                                        \
+        if (s > PORTAL_VMALLOC_SIZE)                                      \
+                (ptr) = vmalloc(s);                                       \
+        else                                                              \
+                (ptr) = kmalloc(s, GFP_NOFS);                             \
+        if ((ptr) == NULL)                                                \
+                CERROR("PORTALS: out of memory at %s:%d (tried to alloc"  \
+                       " '" #ptr "' = %ld)\n", __FILE__, __LINE__, s);    \
+        else {                                                            \
+                portal_kmem_inc((ptr), s);                                \
+                memset((ptr), 0, s);                                      \
+        }                                                                 \
+        CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %ld at %p (tot %d).\n",   \
+               s, (ptr), atomic_read (&portal_kmemory));                  \
+} while (0)
+
+#define PORTAL_FREE(ptr, size)                                          \
+do {                                                                    \
+        long s = (size);                                                \
+        if ((ptr) == NULL) {                                            \
+                CERROR("PORTALS: free NULL '" #ptr "' (%ld bytes) at "  \
+                       "%s:%d\n", s, __FILE__, __LINE__);               \
+                break;                                                  \
+        }                                                               \
+        if (s > PORTAL_VMALLOC_SIZE)                                    \
+                vfree(ptr);                                             \
+        else                                                            \
+                kfree(ptr);                                             \
+        portal_kmem_dec((ptr), s);                                      \
+        CDEBUG(D_MALLOC, "kfreed '" #ptr "': %ld at %p (tot %d).\n",    \
+               s, (ptr), atomic_read (&portal_kmemory));                \
+} while (0)
+
+#define PORTAL_SLAB_ALLOC(ptr, slab, size)                                \
+do {                                                                      \
+        long s = (size);                                                  \
+        LASSERT (!in_interrupt());                                        \
+        (ptr) = kmem_cache_alloc((slab), SLAB_KERNEL);                    \
+        if ((ptr) == NULL) {                                              \
+                CERROR("PORTALS: out of memory at %s:%d (tried to alloc"  \
+                       " '" #ptr "' from slab '" #slab "')\n", __FILE__,  \
+                       __LINE__);                                         \
+        } else {                                                          \
+                portal_kmem_inc((ptr), s);                                \
+                memset((ptr), 0, s);                                      \
+        }                                                                 \
+        CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %ld at %p (tot %d).\n",   \
+               s, (ptr), atomic_read (&portal_kmemory));                  \
+} while (0)
+
+#define PORTAL_SLAB_FREE(ptr, slab, size)                               \
+do {                                                                    \
+        long s = (size);                                                \
+        if ((ptr) == NULL) {                                            \
+                CERROR("PORTALS: free NULL '" #ptr "' (%ld bytes) at "  \
+                       "%s:%d\n", s, __FILE__, __LINE__);               \
+                break;                                                  \
+        }                                                               \
+        memset((ptr), 0x5a, s);                                         \
+        kmem_cache_free((slab), ptr);                                   \
+        portal_kmem_dec((ptr), s);                                      \
+        CDEBUG(D_MALLOC, "kfreed '" #ptr "': %ld at %p (tot %d).\n",    \
+               s, (ptr), atomic_read (&portal_kmemory));                \
+} while (0)
+
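/*
 * Illustration (not part of this patch): PORTAL_ALLOC/PORTAL_FREE are meant
 * to be used in matched pairs with the same size so the portal_kmemory
 * counter stays balanced.  The my_conn_t type and function names below are
 * hypothetical.
 */
typedef struct {
        int  mc_refcount;
        char mc_peername[64];
} my_conn_t;

static my_conn_t *my_conn_create(void)
{
        my_conn_t *conn;

        PORTAL_ALLOC(conn, sizeof(*conn));   /* zeroed on success, NULL on failure */
        if (conn == NULL)
                return NULL;

        conn->mc_refcount = 1;
        return conn;
}

static void my_conn_destroy(my_conn_t *conn)
{
        PORTAL_FREE(conn, sizeof(*conn));    /* same size as the allocation */
}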
+/* ------------------------------------------------------------------- */
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+
+#define PORTAL_SYMBOL_REGISTER(x) inter_module_register(#x, THIS_MODULE, &x)
+#define PORTAL_SYMBOL_UNREGISTER(x) inter_module_unregister(#x)
+
+#define PORTAL_SYMBOL_GET(x) ((typeof(&x))inter_module_get(#x))
+#define PORTAL_SYMBOL_PUT(x) inter_module_put(#x)
+
+#define PORTAL_MODULE_USE       MOD_INC_USE_COUNT
+#define PORTAL_MODULE_UNUSE     MOD_DEC_USE_COUNT
+#else
+
+#define PORTAL_SYMBOL_REGISTER(x)
+#define PORTAL_SYMBOL_UNREGISTER(x)
+
+#define PORTAL_SYMBOL_GET(x) symbol_get(x)
+#define PORTAL_SYMBOL_PUT(x) symbol_put(x)
+
+#define PORTAL_MODULE_USE       try_module_get(THIS_MODULE)
+#define PORTAL_MODULE_UNUSE     module_put(THIS_MODULE)
+
+#endif
+
+/******************************************************************************/
+/* Kernel Portals Router interface */
+
+typedef void (*kpr_fwd_callback_t)(void *arg, int error); // completion callback
+
+/* space for routing targets to stash "stuff" in a forwarded packet */
+typedef union {
+        long long        _alignment;
+        void            *_space[16];            /* scale with CPU arch */
+} kprfd_scratch_t;
+
+/* Kernel Portals Routing Forwarded message Descriptor */
+typedef struct {
+        struct list_head     kprfd_list;        /* stash in queues (routing target can use) */
+        ptl_nid_t            kprfd_target_nid;  /* final destination NID */
+        ptl_nid_t            kprfd_gateway_nid; /* gateway NID */
+        int                  kprfd_nob;         /* # message bytes (including header) */
+        int                  kprfd_niov;        /* # message frags (including header) */
+        struct iovec        *kprfd_iov;         /* message fragments */
+        void                *kprfd_router_arg;  // originating NAL's router arg
+        kpr_fwd_callback_t   kprfd_callback;    /* completion callback */
+        void                *kprfd_callback_arg; /* completion callback arg */
+        kprfd_scratch_t      kprfd_scratch;    // scratchpad for routing targets
+} kpr_fwd_desc_t;
+
+typedef void  (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd);
+
+/* NAL's routing interface (Kernel Portals Routing Nal Interface) */
+typedef const struct {
+        int             kprni_nalid;    /* NAL's id */
+        void           *kprni_arg;      /* Arg to pass when calling into NAL */
+        kpr_fwd_t       kprni_fwd;      /* NAL's forwarding entrypoint */
+} kpr_nal_interface_t;
+
+/* Router's routing interface (Kernel Portals Routing Router Interface) */
+typedef const struct {
+        /* register the calling NAL with the router and get back the handle for
+         * subsequent calls */
+        int     (*kprri_register) (kpr_nal_interface_t *nal_interface,
+                                   void **router_arg);
+
+        /* ask the router to find a gateway that forwards to 'nid' and is a peer
+         * of the calling NAL */
+        int     (*kprri_lookup) (void *router_arg, ptl_nid_t nid,
+                                 ptl_nid_t *gateway_nid);
+
+        /* hand a packet over to the router for forwarding */
+        kpr_fwd_t kprri_fwd_start;
+
+        /* hand a packet back to the router for completion */
+        void    (*kprri_fwd_done) (void *router_arg, kpr_fwd_desc_t *fwd,
+                                   int error);
+
+        /* the calling NAL is shutting down */
+        void    (*kprri_shutdown) (void *router_arg);
+
+        /* deregister the calling NAL with the router */
+        void    (*kprri_deregister) (void *router_arg);
+
+} kpr_router_interface_t;
+
+/* Convenient struct for NAL to stash router interface/args */
+typedef struct {
+        kpr_router_interface_t  *kpr_interface;
+        void                    *kpr_arg;
+} kpr_router_t;
+
+/* Router's control interface (Kernel Portals Routing Control Interface) */
+typedef const struct {
+        int     (*kprci_add_route)(int gateway_nal, ptl_nid_t gateway_nid,
+                                   ptl_nid_t lo_nid, ptl_nid_t hi_nid);
+        int     (*kprci_del_route)(ptl_nid_t nid);
+        int     (*kprci_get_route)(int index, int *gateway_nal,
+                                   ptl_nid_t *gateway, ptl_nid_t *lo_nid,
+                                   ptl_nid_t *hi_nid);
+} kpr_control_interface_t;
+
+extern kpr_control_interface_t  kpr_control_interface;
+extern kpr_router_interface_t   kpr_router_interface;
+
+static inline int
+kpr_register (kpr_router_t *router, kpr_nal_interface_t *nalif)
+{
+        int    rc;
+
+        router->kpr_interface = PORTAL_SYMBOL_GET (kpr_router_interface);
+        if (router->kpr_interface == NULL)
+                return (-ENOENT);
+
+        rc = (router->kpr_interface)->kprri_register (nalif, &router->kpr_arg);
+        if (rc != 0)
+                router->kpr_interface = NULL;
+
+        PORTAL_SYMBOL_PUT (kpr_router_interface);
+        return (rc);
+}
+
+static inline int
+kpr_routing (kpr_router_t *router)
+{
+        return (router->kpr_interface != NULL);
+}
+
+static inline int
+kpr_lookup (kpr_router_t *router, ptl_nid_t nid, ptl_nid_t *gateway_nid)
+{
+        if (!kpr_routing (router))
+                return (-EHOSTUNREACH);
+
+        return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid,
+                                                    gateway_nid));
+}
+
+static inline void
+kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid,
+              int nob, int niov, struct iovec *iov,
+              kpr_fwd_callback_t callback, void *callback_arg)
+{
+        fwd->kprfd_target_nid   = nid;
+        fwd->kprfd_gateway_nid  = nid;
+        fwd->kprfd_nob          = nob;
+        fwd->kprfd_niov         = niov;
+        fwd->kprfd_iov          = iov;
+        fwd->kprfd_callback     = callback;
+        fwd->kprfd_callback_arg = callback_arg;
+}
+
+static inline void
+kpr_fwd_start (kpr_router_t *router, kpr_fwd_desc_t *fwd)
+{
+        if (!kpr_routing (router))
+                fwd->kprfd_callback (fwd->kprfd_callback_arg, -EHOSTUNREACH);
+        else
+                router->kpr_interface->kprri_fwd_start (router->kpr_arg, fwd);
+}
+
+static inline void
+kpr_fwd_done (kpr_router_t *router, kpr_fwd_desc_t *fwd, int error)
+{
+        LASSERT (kpr_routing (router));
+        router->kpr_interface->kprri_fwd_done (router->kpr_arg, fwd, error);
+}
+
+static inline void
+kpr_shutdown (kpr_router_t *router)
+{
+        if (kpr_routing (router))
+                router->kpr_interface->kprri_shutdown (router->kpr_arg);
+}
+
+static inline void
+kpr_deregister (kpr_router_t *router)
+{
+        if (!kpr_routing (router))
+                return;
+        router->kpr_interface->kprri_deregister (router->kpr_arg);
+        router->kpr_interface = NULL;
+}
+
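/*
 * Illustration (not part of this patch): how a NAL might use the routing
 * helpers above.  All knal_* names are invented; SOCKNAL is just one example
 * of a kprni_nalid value.
 */
static kpr_router_t knal_router;

static void knal_fwd_rx(void *arg, kpr_fwd_desc_t *fwd)
{
        /* the router asks this NAL to transmit 'fwd' to fwd->kprfd_gateway_nid;
         * when the send has completed, hand the descriptor back */
        kpr_fwd_done(&knal_router, fwd, 0);
}

static kpr_nal_interface_t knal_router_iface = {
        SOCKNAL,                /* kprni_nalid */
        NULL,                   /* kprni_arg passed back on each callback */
        knal_fwd_rx,            /* kprni_fwd */
};

static int knal_startup(void)
{
        /* if no router module is loaded, this fails and kpr_routing() stays 0 */
        return kpr_register(&knal_router, &knal_router_iface);
}

static int knal_send_via_gateway(ptl_nid_t target_nid)
{
        ptl_nid_t gateway_nid;

        if (kpr_lookup(&knal_router, target_nid, &gateway_nid) != 0)
                return -EHOSTUNREACH;

        /* ... send to gateway_nid instead of target_nid ... */
        return 0;
}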
+/******************************************************************************/
+
+#ifdef PORTALS_PROFILING
+#define prof_enum(FOO) PROF__##FOO
+enum {
+        prof_enum(our_recvmsg),
+        prof_enum(our_sendmsg),
+        prof_enum(socknal_recv),
+        prof_enum(lib_parse),
+        prof_enum(conn_list_walk),
+        prof_enum(memcpy),
+        prof_enum(lib_finalize),
+        prof_enum(pingcli_time),
+        prof_enum(gmnal_send),
+        prof_enum(gmnal_recv),
+        MAX_PROFS
+};
+
+struct prof_ent {
+        char *str;
+        /* note: these 32-bit counters can wrap */
+        u32       starts;
+        u32       finishes;
+        cycles_t  total_cycles;
+        cycles_t  start;
+        cycles_t  end;
+};
+
+extern struct prof_ent prof_ents[MAX_PROFS];
+
+#define PROF_START(FOO)                                         \
+        do {                                                    \
+                struct prof_ent *pe = &prof_ents[PROF__##FOO];  \
+                pe->starts++;                                   \
+                pe->start = get_cycles();                       \
+        } while (0)
+
+#define PROF_FINISH(FOO)                                        \
+        do {                                                    \
+                struct prof_ent *pe = &prof_ents[PROF__##FOO];  \
+                pe->finishes++;                                 \
+                pe->end = get_cycles();                         \
+                pe->total_cycles += (pe->end - pe->start);      \
+        } while (0)
+#else /* !PORTALS_PROFILING */
+#define PROF_START(FOO) do {} while(0)
+#define PROF_FINISH(FOO) do {} while(0)
+#endif /* PORTALS_PROFILING */
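/*
 * Illustration (not part of this patch): bracketing a region with the
 * profiling macros above, using the existing 'memcpy' slot.  When
 * PORTALS_PROFILING is not defined both macros compile to nothing.
 */
static void copy_payload(void *dst, const void *src, size_t nob)
{
        PROF_START(memcpy);
        memcpy(dst, src, nob);
        PROF_FINISH(memcpy);    /* accumulates end - start into prof_ents[PROF__memcpy] */
}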
+
+/* debug.c */
+void portals_run_lbug_upcall(char * file, char *fn, int line);
+void portals_debug_dumplog(void);
+int portals_debug_init(unsigned long bufsize);
+int portals_debug_cleanup(void);
+int portals_debug_clear_buffer(void);
+int portals_debug_mark_buffer(char *text);
+int portals_debug_set_daemon(unsigned int cmd, unsigned int length,
+                char *file, unsigned int size);
+__s32 portals_debug_copy_to_user(char *buf, unsigned long len);
+#if (__GNUC__)
+/* Use the special GNU C __attribute__ hack to have the compiler check the
+ * printf style argument string against the actual argument count and
+ * types.
+ */
+#ifdef printf
+# warning printf has been defined as a macro...
+# undef printf
+#endif
+void portals_debug_msg (int subsys, int mask, char *file, char *fn, int line,
+                        unsigned long stack, const char *format, ...)
+        __attribute__ ((format (printf, 7, 8)));
+#else
+void portals_debug_msg (int subsys, int mask, char *file, char *fn,
+                        int line, unsigned long stack,
+                        const char *format, ...);
+#endif /* __GNUC__ */
+void portals_debug_set_level(unsigned int debug_level);
+
+# define fprintf(a, format, b...) CDEBUG(D_OTHER, format , ## b)
+# define printf(format, b...) CDEBUG(D_OTHER, format , ## b)
+# define time(a) CURRENT_TIME
+
+extern void kportal_daemonize (char *name);
+extern void kportal_blockallsigs (void);
+
+#else  /* !__KERNEL__ */
+# include <stdio.h>
+# include <stdlib.h>
+#ifndef __CYGWIN__
+# include <stdint.h>
+#endif
+# include <unistd.h>
+# include <time.h>
+# include <asm/types.h>
+# ifndef DEBUG_SUBSYSTEM
+#  define DEBUG_SUBSYSTEM S_UNDEFINED
+# endif
+# ifdef PORTAL_DEBUG
+#  undef NDEBUG
+#  include <assert.h>
+#  define LASSERT(e)     assert(e)
+# else
+#  define LASSERT(e)
+# endif
+# define printk(format, args...) printf (format, ## args)
+# define PORTAL_ALLOC(ptr, size) do { (ptr) = malloc(size); } while (0)
+# define PORTAL_FREE(a, b) do { free(a); } while (0)
+# define portals_debug_msg(subsys, mask, file, fn, line, stack, format, a...) \
+    printf ("%02x:%06x (@%lu %s:%s,l. %d %d %lu): " format,                    \
+            (subsys) >> 24, (mask), (long)time(0), file, fn, line,            \
+            getpid(), stack, ## a)
+#endif
+
+#ifndef CURRENT_TIME
+# define CURRENT_TIME time(0)
+#endif
+
+#include <linux/portals_lib.h>
+
+/*
+ * USER LEVEL STUFF BELOW
+ */
+
+#define PORTAL_IOCTL_VERSION 0x00010007
+#define PING_SYNC       0
+#define PING_ASYNC      1
+
+struct portal_ioctl_data {
+        __u32 ioc_len;
+        __u32 ioc_version;
+        __u64 ioc_nid;
+        __u64 ioc_nid2;
+        __u64 ioc_nid3;
+        __u32 ioc_count;
+        __u32 ioc_nal;
+        __u32 ioc_nal_cmd;
+        __u32 ioc_fd;
+        __u32 ioc_id;
+
+        __u32 ioc_flags;
+        __u32 ioc_size;
+
+        __u32 ioc_wait;
+        __u32 ioc_timeout;
+        __u32 ioc_misc;
+
+        __u32 ioc_inllen1;
+        char *ioc_inlbuf1;
+        __u32 ioc_inllen2;
+        char *ioc_inlbuf2;
+
+        __u32 ioc_plen1; /* buffers in userspace */
+        char *ioc_pbuf1;
+        __u32 ioc_plen2; /* buffers in userspace */
+        char *ioc_pbuf2;
+
+        char ioc_bulk[0];
+};
+
+struct portal_ioctl_hdr {
+        __u32 ioc_len;
+        __u32 ioc_version;
+};
+
+struct portals_debug_ioctl_data
+{
+        struct portal_ioctl_hdr hdr;
+        unsigned int subs;
+        unsigned int debug;
+};
+
+#define PORTAL_IOC_INIT(data)                           \
+do {                                                    \
+        memset(&data, 0, sizeof(data));                 \
+        data.ioc_version = PORTAL_IOCTL_VERSION;        \
+        data.ioc_len = sizeof(data);                    \
+} while (0)
+
+/* FIXME check conflict with lustre_lib.h */
+#define PTL_IOC_DEBUG_MASK             _IOWR('f', 250, long)
+
+static inline int portal_ioctl_packlen(struct portal_ioctl_data *data)
+{
+        int len = sizeof(*data);
+        len += size_round(data->ioc_inllen1);
+        len += size_round(data->ioc_inllen2);
+        return len;
+}
+
+static inline int portal_ioctl_is_invalid(struct portal_ioctl_data *data)
+{
+        if (data->ioc_len > (1<<30)) {
+                CERROR ("PORTALS ioctl: ioc_len larger than 1<<30\n");
+                return 1;
+        }
+        if (data->ioc_inllen1 > (1<<30)) {
+                CERROR ("PORTALS ioctl: ioc_inllen1 larger than 1<<30\n");
+                return 1;
+        }
+        if (data->ioc_inllen2 > (1<<30)) {
+                CERROR ("PORTALS ioctl: ioc_inllen2 larger than 1<<30\n");
+                return 1;
+        }
+        if (data->ioc_inlbuf1 && !data->ioc_inllen1) {
+                CERROR ("PORTALS ioctl: inlbuf1 pointer but 0 length\n");
+                return 1;
+        }
+        if (data->ioc_inlbuf2 && !data->ioc_inllen2) {
+                CERROR ("PORTALS ioctl: inlbuf2 pointer but 0 length\n");
+                return 1;
+        }
+        if (data->ioc_pbuf1 && !data->ioc_plen1) {
+                CERROR ("PORTALS ioctl: pbuf1 pointer but 0 length\n");
+                return 1;
+        }
+        if (data->ioc_pbuf2 && !data->ioc_plen2) {
+                CERROR ("PORTALS ioctl: pbuf2 pointer but 0 length\n");
+                return 1;
+        }
+        if (data->ioc_plen1 && !data->ioc_pbuf1) {
+                CERROR ("PORTALS ioctl: plen1 nonzero but no pbuf1 pointer\n");
+                return 1;
+        }
+        if (data->ioc_plen2 && !data->ioc_pbuf2) {
+                CERROR ("PORTALS ioctl: plen2 nonzero but no pbuf2 pointer\n");
+                return 1;
+        }
+        if (portal_ioctl_packlen(data) != data->ioc_len ) {
+                CERROR ("PORTALS ioctl: packlen != ioc_len\n");
+                return 1;
+        }
+        if (data->ioc_inllen1 &&
+            data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') {
+                CERROR ("PORTALS ioctl: inlbuf1 not 0 terminated\n");
+                return 1;
+        }
+        if (data->ioc_inllen2 &&
+            data->ioc_bulk[size_round(data->ioc_inllen1) +
+                           data->ioc_inllen2 - 1] != '\0') {
+                CERROR ("PORTALS ioctl: inlbuf2 not 0 terminated\n");
+                return 1;
+        }
+        return 0;
+}
+
+#ifndef __KERNEL__
+static inline int portal_ioctl_pack(struct portal_ioctl_data *data, char **pbuf,
+                                    int max)
+{
+        char *ptr;
+        struct portal_ioctl_data *overlay;
+        data->ioc_len = portal_ioctl_packlen(data);
+        data->ioc_version = PORTAL_IOCTL_VERSION;
+
+        if (*pbuf && portal_ioctl_packlen(data) > max)
+                return 1;
+        if (*pbuf == NULL) {
+                *pbuf = malloc(data->ioc_len);
+        }
+        if (!*pbuf)
+                return 1;
+        overlay = (struct portal_ioctl_data *)*pbuf;
+        memcpy(*pbuf, data, sizeof(*data));
+
+        ptr = overlay->ioc_bulk;
+        if (data->ioc_inlbuf1)
+                LOGL(data->ioc_inlbuf1, data->ioc_inllen1, ptr);
+        if (data->ioc_inlbuf2)
+                LOGL(data->ioc_inlbuf2, data->ioc_inllen2, ptr);
+        if (portal_ioctl_is_invalid(overlay))
+                return 1;
+
+        return 0;
+}
+#else
+#include <asm/uaccess.h>
+
+/* buffer MUST be at least the size of portal_ioctl_hdr */
+static inline int portal_ioctl_getdata(char *buf, char *end, void *arg)
+{
+        struct portal_ioctl_hdr *hdr;
+        struct portal_ioctl_data *data;
+        int err;
+        ENTRY;
+
+        hdr = (struct portal_ioctl_hdr *)buf;
+        data = (struct portal_ioctl_data *)buf;
+
+        err = copy_from_user(buf, (void *)arg, sizeof(*hdr));
+        if ( err ) {
+                EXIT;
+                return err;
+        }
+
+        if (hdr->ioc_version != PORTAL_IOCTL_VERSION) {
+                CERROR ("PORTALS: version mismatch kernel vs application\n");
+                return -EINVAL;
+        }
+
+        if (hdr->ioc_len + buf >= end) {
+                CERROR ("PORTALS: user buffer exceeds kernel buffer\n");
+                return -EINVAL;
+        }
+
+
+        if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) {
+                CERROR ("PORTALS: user buffer too small for ioctl\n");
+                return -EINVAL;
+        }
+
+        err = copy_from_user(buf, (void *)arg, hdr->ioc_len);
+        if ( err ) {
+                EXIT;
+                return err;
+        }
+
+        if (portal_ioctl_is_invalid(data)) {
+                CERROR ("PORTALS: ioctl not correctly formatted\n");
+                return -EINVAL;
+        }
+
+        if (data->ioc_inllen1) {
+                data->ioc_inlbuf1 = &data->ioc_bulk[0];
+        }
+
+        if (data->ioc_inllen2) {
+                data->ioc_inlbuf2 = &data->ioc_bulk[0] +
+                        size_round(data->ioc_inllen1);
+        }
+
+        EXIT;
+        return 0;
+}
+#endif
+
+/* ioctls for the portals device; command numbers start at 30 */
+#define IOC_PORTAL_TYPE                   'e'
+#define IOC_PORTAL_MIN_NR                 30
+
+#define IOC_PORTAL_PING                    _IOWR('e', 30, long)
+#define IOC_PORTAL_GET_DEBUG               _IOWR('e', 31, long)
+#define IOC_PORTAL_CLEAR_DEBUG             _IOWR('e', 32, long)
+#define IOC_PORTAL_MARK_DEBUG              _IOWR('e', 33, long)
+#define IOC_PORTAL_PANIC                   _IOWR('e', 34, long)
+#define IOC_PORTAL_ADD_ROUTE               _IOWR('e', 35, long)
+#define IOC_PORTAL_DEL_ROUTE               _IOWR('e', 36, long)
+#define IOC_PORTAL_GET_ROUTE               _IOWR('e', 37, long)
+#define IOC_PORTAL_NAL_CMD                 _IOWR('e', 38, long)
+#define IOC_PORTAL_GET_NID                 _IOWR('e', 39, long)
+#define IOC_PORTAL_FAIL_NID                _IOWR('e', 40, long)
+#define IOC_PORTAL_SET_DAEMON              _IOWR('e', 41, long)
+
+#define IOC_PORTAL_MAX_NR               41
+
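/*
 * Illustration (not part of this patch): a user-space tool fills in a
 * struct portal_ioctl_data, packs it with portal_ioctl_pack() and issues one
 * of the IOC_PORTAL_* commands; the kernel side unpacks it with
 * portal_ioctl_getdata().  The "/dev/portals" path and the assumption that
 * IOC_PORTAL_GET_NID returns the NID in ioc_nid are illustrative guesses.
 */
#include <sys/ioctl.h>
#include <fcntl.h>

static int get_local_nid_example(int nal, __u64 *nid)
{
        struct portal_ioctl_data data;
        char *buf = NULL;
        int fd, rc;

        PORTAL_IOC_INIT(data);                  /* zero, version, ioc_len */
        data.ioc_nal = nal;

        if (portal_ioctl_pack(&data, &buf, 0))  /* allocates and validates buf */
                return -1;

        fd = open("/dev/portals", O_RDWR);
        if (fd < 0) {
                free(buf);
                return -1;
        }

        rc = ioctl(fd, IOC_PORTAL_GET_NID, buf);
        if (rc == 0)
                *nid = ((struct portal_ioctl_data *)buf)->ioc_nid;

        close(fd);
        free(buf);
        return rc;
}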
+enum {
+        QSWNAL  =  1,
+        SOCKNAL,
+        GMNAL,
+        TOENAL,
+        TCPNAL,
+        SCIMACNAL,
+        NAL_ENUM_END_MARKER
+};
+
+#ifdef __KERNEL__
+extern ptl_handle_ni_t  kqswnal_ni;
+extern ptl_handle_ni_t  ksocknal_ni;
+extern ptl_handle_ni_t  ktoenal_ni;
+extern ptl_handle_ni_t  kgmnal_ni;
+extern ptl_handle_ni_t  kscimacnal_ni;
+#endif
+
+#define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1)
+
+#define NAL_CMD_REGISTER_PEER_FD     100
+#define NAL_CMD_CLOSE_CONNECTION     101
+#define NAL_CMD_REGISTER_MYNID       102
+#define NAL_CMD_PUSH_CONNECTION      103
+
+enum {
+        DEBUG_DAEMON_START       =  1,
+        DEBUG_DAEMON_STOP        =  2,
+        DEBUG_DAEMON_PAUSE       =  3,
+        DEBUG_DAEMON_CONTINUE    =  4,
+};
+
+/* XXX remove to lustre ASAP */
+struct lustre_peer {
+        ptl_nid_t       peer_nid;
+        ptl_handle_ni_t peer_ni;
+};
+
+/* module.c */
+typedef int (*nal_cmd_handler_t)(struct portal_ioctl_data *, void * private);
+int kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private);
+int kportal_nal_unregister(int nal);
+
+ptl_handle_ni_t *kportal_get_ni (int nal);
+void kportal_put_ni (int nal);
+
+#ifdef __CYGWIN__
+#ifndef BITS_PER_LONG
+#if (~0UL) == 0xffffffffUL
+#define BITS_PER_LONG 32
+#else
+#define BITS_PER_LONG 64
+#endif
+#endif
+#endif
+
+#if (BITS_PER_LONG == 32 || __WORDSIZE == 32)
+# define LPU64 "%Lu"
+# define LPD64 "%Ld"
+# define LPX64 "%#Lx"
+# define LPSZ  "%u"
+# define LPSSZ "%d"
+#endif
+#if (BITS_PER_LONG == 64 || __WORDSIZE == 64)
+# define LPU64 "%lu"
+# define LPD64 "%ld"
+# define LPX64 "%#lx"
+# define LPSZ  "%lu"
+# define LPSSZ "%ld"
+#endif
+#ifndef LPU64
+# error "No word size defined"
+#endif
+
+#endif
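/*
 * Illustration (not part of this patch): ptl_nid_t is a 64-bit quantity, so
 * it should always be printed through the LP* macros above rather than with
 * a hard-coded length modifier.
 */
static void log_unreachable(ptl_nid_t nid, size_t nob)
{
        CDEBUG(D_OTHER, "dropping "LPSZ" bytes for unreachable nid "LPX64"\n",
               nob, nid);
}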
diff --git a/lnet/include/linux/portals_compat25.h b/lnet/include/linux/portals_compat25.h
new file mode 100644 (file)
index 0000000..e28fbac
--- /dev/null
@@ -0,0 +1,13 @@
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) || defined(CONFIG_RH_2_4_20)
+# define SIGNAL_MASK_LOCK(task, flags)                              \
+  spin_lock_irqsave(&task->sighand->siglock, flags)
+# define SIGNAL_MASK_UNLOCK(task, flags)                            \
+  spin_unlock_irqrestore(&task->sighand->siglock, flags)
+# define RECALC_SIGPENDING         recalc_sigpending()
+#else
+# define SIGNAL_MASK_LOCK(task, flags)                              \
+  spin_lock_irqsave(&task->sigmask_lock, flags)
+# define SIGNAL_MASK_UNLOCK(task, flags)                            \
+  spin_unlock_irqrestore(&task->sigmask_lock, flags)
+# define RECALC_SIGPENDING         recalc_sigpending(current)
+#endif
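/*
 * Illustration (not part of this patch): blocking every signal for the
 * current kernel thread using the compatibility wrappers above; this mirrors
 * the intent of a kportal_blockallsigs()-style helper.
 */
static void block_all_signals_example(void)
{
        unsigned long flags;

        SIGNAL_MASK_LOCK(current, flags);
        sigfillset(&current->blocked);
        RECALC_SIGPENDING;
        SIGNAL_MASK_UNLOCK(current, flags);
}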
diff --git a/lnet/include/linux/portals_lib.h b/lnet/include/linux/portals_lib.h
new file mode 100644 (file)
index 0000000..a528a80
--- /dev/null
@@ -0,0 +1,188 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines. 
+ *
+ */
+
+#ifndef _PORTALS_LIB_H
+#define _PORTALS_LIB_H
+
+#ifndef __KERNEL__
+# include <string.h>
+#else 
+# include <asm/types.h>
+#endif
+
+#undef MIN
+#define MIN(a,b) (((a)<(b)) ? (a): (b))
+#undef MAX
+#define MAX(a,b) (((a)>(b)) ? (a): (b))
+#define MKSTR(ptr) ((ptr) ? (ptr) : "")
+
+static inline int size_round (int val)
+{
+        return (val + 7) & (~0x7);
+}
+
+static inline int size_round0(int val)
+{
+        if (!val)
+                return 0;
+        return (val + 1 + 7) & (~0x7);
+}
+
+static inline size_t round_strlen(char *fset)
+{
+        return size_round(strlen(fset) + 1);
+}
+
+#ifdef __KERNEL__
+static inline char *strdup(const char *str)
+{
+        int len = strlen(str) + 1;
+        char *tmp = kmalloc(len, GFP_KERNEL);
+        if (tmp)
+                memcpy(tmp, str, len);
+
+        return tmp;
+}
+#endif
+
+#ifdef __KERNEL__
+# define NTOH__u32(var) le32_to_cpu(var)
+# define NTOH__u64(var) le64_to_cpu(var)
+# define HTON__u32(var) cpu_to_le32(var)
+# define HTON__u64(var) cpu_to_le64(var)
+#else
+# define expansion_u64(var) \
+    ({  __u64 ret; \
+       switch (sizeof(var)) {   \
+       case 8: (ret) = (var); break; \
+       case 4: (ret) = (__u32)(var); break; \
+       case 2: (ret) = (__u16)(var); break; \
+       case 1: (ret) = (__u8)(var); break; \
+       };       \
+       (ret);     \
+    })
+# define NTOH__u32(var) (var)
+# define NTOH__u64(var) (expansion_u64(var))
+# define HTON__u32(var) (var)
+# define HTON__u64(var) (expansion_u64(var))
+#endif
+
+/*
+ * copy sizeof(type) bytes from ptr into var and move ptr forward;
+ * return -EFAULT from the calling function if ptr goes beyond end
+ */
+#define UNLOGV(var,type,ptr,end)                \
+do {                                            \
+        var = *(type *)ptr;                     \
+        ptr += sizeof(type);                    \
+        if (ptr > end )                         \
+                return -EFAULT;                 \
+} while (0)
+
+/* the byte-swapping variants (LUNLOGV here and LLOGV below) use little-endian
+ * wire order; type MUST be __u32 or __u64 */
+#define LUNLOGV(var,type,ptr,end)               \
+do {                                            \
+        var = NTOH##type(*(type *)ptr);         \
+        ptr += sizeof(type);                    \
+        if (ptr > end )                         \
+                return -EFAULT;                 \
+} while (0)
+
+/* now log values */
+#define LOGV(var,type,ptr)                      \
+do {                                            \
+        *((type *)ptr) = var;                   \
+        ptr += sizeof(type);                    \
+} while (0)
+
+/* and in network order */
+#define LLOGV(var,type,ptr)                     \
+do {                                            \
+        *((type *)ptr) = HTON##type(var);       \
+        ptr += sizeof(type);                    \
+} while (0)
+
+
+/* 
+ * set var to point at (type *)ptr, move ptr forward with sizeof(type)
+ * return from function with EFAULT if ptr goes beyond end
+ */
+#define UNLOGP(var,type,ptr,end)                \
+do {                                            \
+        var = (type *)ptr;                      \
+        ptr += sizeof(type);                    \
+        if (ptr > end )                         \
+                return -EFAULT;                 \
+} while (0)
+
+#define LOGP(var,type,ptr)                      \
+do {                                            \
+        memcpy(ptr, var, sizeof(type));         \
+        ptr += sizeof(type);                    \
+} while (0)
+
+/* 
+ * set var to point at (char *)ptr, move ptr forward by size_round(len);
+ * return from function with EFAULT if ptr goes beyond end
+ */
+#define UNLOGL(var,type,len,ptr,end)            \
+do {                                            \
+        var = (type *)ptr;                      \
+        ptr += size_round(len * sizeof(type));  \
+        if (ptr > end )                         \
+                return -EFAULT;                 \
+} while (0)
+
+#define UNLOGL0(var,type,len,ptr,end)                                   \
+do {                                                                    \
+        UNLOGL(var,type,len,ptr,end);                                   \
+        if ( *((char *)ptr - size_round(len) + len - 1) != '\0')        \
+                return -EFAULT;                                         \
+} while (0)
+
+#define LOGL(var,len,ptr)                                       \
+do {                                                            \
+        if (var)                                                \
+                memcpy((char *)ptr, (const char *)var, len);    \
+        ptr += size_round(len);                                 \
+} while (0)
+
+#define LOGU(var,len,ptr)                                       \
+do {                                                            \
+        if (var)                                                \
+                memcpy((char *)var, (const char *)ptr, len);    \
+        ptr += size_round(len);                                 \
+} while (0)
+
+#define LOGL0(var,len,ptr)                              \
+do {                                                    \
+        if (!len)                                       \
+                break;                                  \
+        memcpy((char *)ptr, (const char *)var, len);    \
+        *((char *)(ptr) + len) = 0;                     \
+        ptr += size_round(len + 1);                     \
+} while (0)
+
+#endif /* _PORTALS_LIB_H */
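/*
 * Illustration (not part of this patch): packing and unpacking a
 * length-prefixed string with the macros above, following the same
 * convention as portal_ioctl_pack() (the stored length includes the
 * terminating NUL).  The UNLOG* macros 'return -EFAULT' from the calling
 * function on overrun, so the unpack helper must return int.
 */
static int pack_name(char *buf, const char *name)
{
        char *ptr = buf;
        __u32 len = strlen(name) + 1;   /* include the terminating NUL */

        LOGV(len, __u32, ptr);          /* store the length, advance 4 bytes */
        LOGL(name, len, ptr);           /* copy string + NUL, round up to 8 bytes */
        return ptr - buf;               /* total bytes consumed */
}

static int unpack_name(char *buf, char *end, char **name)
{
        char *ptr = buf;
        __u32 len;

        UNLOGV(len, __u32, ptr, end);           /* may return -EFAULT */
        UNLOGL0(*name, char, len, ptr, end);    /* *name points into buf; NUL checked */
        return 0;
}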
diff --git a/lnet/include/lnet/Makefile.am b/lnet/include/lnet/Makefile.am
new file mode 100644 (file)
index 0000000..c61b084
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+SUBDIRS = base
+include $(top_srcdir)/Rules
+
+pkginclude_HEADERS=api-support.h api.h arg-blocks.h defines.h errno.h internal.h lib-dispatch.h lib-nal.h lib-p30.h lib-types.h myrnal.h nal.h p30.h ppid.h ptlctl.h stringtab.h types.h nalids.h list.h bridge.h ipmap.h procbridge.h lltrace.h
+
diff --git a/lnet/include/lnet/api-support.h b/lnet/include/lnet/api-support.h
new file mode 100644 (file)
index 0000000..af4a2dc
--- /dev/null
@@ -0,0 +1,27 @@
+# define DEBUG_SUBSYSTEM S_PORTALS
+# define PORTAL_DEBUG
+
+#ifndef __KERNEL__
+# include <stdio.h>
+# include <stdlib.h>
+# include <unistd.h>
+# include <time.h>
+
+/* Lots of POSIX dependencies to support PtlEQWait_timeout */
+# include <signal.h>
+# include <setjmp.h>
+# include <time.h>
+#endif
+
+#include <portals/types.h>
+#include <linux/kp30.h>
+#include <portals/p30.h>
+
+#include <portals/internal.h>
+#include <portals/nal.h>
+#include <portals/arg-blocks.h>
+
+/* Hack for 2.4.18 macro name collision */
+#ifdef yield
+#undef yield
+#endif
diff --git a/lnet/include/lnet/api.h b/lnet/include/lnet/api.h
new file mode 100644 (file)
index 0000000..a83749b
--- /dev/null
@@ -0,0 +1,159 @@
+#ifndef P30_API_H
+#define P30_API_H
+
+#include <portals/types.h>
+
+#ifndef PTL_NO_WRAP
+int PtlInit(void);
+int PtlInitialized(void);
+void PtlFini(void);
+
+int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size_in,
+              ptl_ac_index_t acl_size_in, ptl_pid_t requested_pid,
+              ptl_handle_ni_t * interface_out);
+
+int PtlNIInitialized(ptl_interface_t);
+
+int PtlNIFini(ptl_handle_ni_t interface_in);
+
+#endif
+
+int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id);
+
+
+/*
+ * Network interfaces
+ */
+
+#ifndef PTL_NO_WRAP
+int PtlNIBarrier(ptl_handle_ni_t interface_in);
+#endif
+
+int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in,
+                ptl_sr_value_t * status_out);
+
+int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in,
+              unsigned long *distance_out);
+
+#ifndef PTL_NO_WRAP
+int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * interface_out);
+#endif
+
+
+/*
+ * PtlNIDebug: 
+ *
+ * This is not an official Portals 3 API call.  It is provided
+ * by the reference implementation to allow the maintainers an
+ * easy way to turn on and off debugging information in the
+ * library.  Do not use it in code that is intended to work with
+ * any implementation other than the portable reference library.
+ */
+unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in);
+
+/* 
+ * PtlFailNid
+ *
+ * Not an official Portals 3 API call.  It provides a way of simulating
+ * communications failures to all peers (nid == PTL_NID_ANY) or to specific
+ * peers (via multiple calls), either until further notice (threshold == -1)
+ * or for a specific number of messages.  Passing a threshold of zero "heals"
+ * the given peer.
+ */
+int PtlFailNid (ptl_handle_ni_t ni, ptl_nid_t nid, unsigned int threshold);
+
+
+/*
+ * Match entries
+ */
+
+int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in,
+                ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in,
+                ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in,
+                ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out);
+
+int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in,
+                ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in,
+                ptl_unlink_t unlink_in, ptl_ins_pos_t position_in,
+                ptl_handle_me_t * handle_out);
+
+int PtlMEUnlink(ptl_handle_me_t current_in);
+
+int PtlMEUnlinkList(ptl_handle_me_t current_in);
+
+int PtlTblDump(ptl_handle_ni_t ni, int index_in);
+int PtlMEDump(ptl_handle_me_t current_in);
+
+
+
+/*
+ * Memory descriptors
+ */
+
+#ifndef PTL_NO_WRAP
+int PtlMDAttach(ptl_handle_me_t current_in, ptl_md_t md_in,
+                ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out);
+
+int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in,
+              ptl_handle_md_t * handle_out);
+
+int PtlMDUnlink(ptl_handle_md_t md_in);
+
+int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t * old_inout,
+                ptl_md_t * new_inout, ptl_handle_eq_t testq_in);
+
+#endif
+
+/* These should not be called by users */
+int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout,
+                         ptl_md_t * new_inout, ptl_handle_eq_t testq_in,
+                         ptl_seq_t sequence_in);
+
+
+
+
+/*
+ * Event queues
+ */
+#ifndef PTL_NO_WRAP
+
+/* These should be called by users */
+int PtlEQAlloc(ptl_handle_ni_t ni_in, ptl_size_t count_in,
+               int (*callback) (ptl_event_t * event),
+               ptl_handle_eq_t * handle_out);
+int PtlEQFree(ptl_handle_eq_t eventq_in);
+
+int PtlEQCount(ptl_handle_eq_t eventq_in, ptl_size_t * count_out);
+
+int PtlEQGet(ptl_handle_eq_t eventq_in, ptl_event_t * event_out);
+
+
+int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t * event_out);
+
+int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
+                      int timeout);
+#endif
+
+/*
+ * Access Control Table
+ */
+int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in,
+               ptl_process_id_t match_id_in, ptl_pt_index_t portal_in);
+
+
+/*
+ * Data movement
+ */
+
+int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in,
+           ptl_process_id_t target_in, ptl_pt_index_t portal_in,
+           ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in,
+           ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in);
+
+int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in,
+           ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in,
+           ptl_match_bits_t match_bits_in, ptl_size_t offset_in);
+
+
+
+#endif
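/*
 * Illustration (not part of this patch): the usual passive-receive setup with
 * the calls declared above -- attach a match entry to a portal index, then
 * attach a memory descriptor to it; a peer's PtlPut then lands in that buffer.
 * PTL_UNLINK and PTL_INS_AFTER are assumed to come from types.h (not shown
 * here), and the contents of 'md' and 'match_id' are left to the caller.
 */
static int post_recv_buffer(ptl_handle_ni_t ni, ptl_pt_index_t portal,
                            ptl_process_id_t match_id, ptl_md_t md)
{
        ptl_handle_me_t me;
        ptl_handle_md_t mdh;
        int rc;

        rc = PtlMEAttach(ni, portal, match_id,
                         0x1234ULL,     /* match bits */
                         0,             /* ignore bits */
                         PTL_UNLINK, PTL_INS_AFTER, &me);
        if (rc != PTL_OK)
                return rc;

        rc = PtlMDAttach(me, md, PTL_UNLINK, &mdh);
        if (rc != PTL_OK)
                PtlMEUnlink(me);

        return rc;
}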
diff --git a/lnet/include/lnet/arg-blocks.h b/lnet/include/lnet/arg-blocks.h
new file mode 100644 (file)
index 0000000..3c3b154
--- /dev/null
@@ -0,0 +1,265 @@
+#ifndef PTL_BLOCKS_H
+#define PTL_BLOCKS_H
+
+/*
+ * blocks.h
+ *
+ * Argument block types for the Portals 3.0 library
+ * Generated by idl
+ *
+ */
+
+#include <portals/types.h>
+
+/* put LIB_MAX_DISPATCH last here  -- these must match the
+   assignments to the dispatch table in lib-p30/dispatch.c */
+#define PTL_GETID     1
+#define PTL_NISTATUS  2
+#define PTL_NIDIST    3
+#define PTL_NIDEBUG   4
+#define PTL_MEATTACH  5
+#define PTL_MEINSERT  6
+// #define PTL_MEPREPEND 7
+#define PTL_MEUNLINK  8
+#define PTL_TBLDUMP   9 
+#define PTL_MEDUMP   10
+#define PTL_MDATTACH 11
+// #define PTL_MDINSERT 12
+#define PTL_MDBIND   13
+#define PTL_MDUPDATE 14
+#define PTL_MDUNLINK 15
+#define PTL_EQALLOC  16
+#define PTL_EQFREE   17
+#define PTL_ACENTRY  18
+#define PTL_PUT      19 
+#define PTL_GET      20
+#define PTL_FAILNID  21
+#define LIB_MAX_DISPATCH 21
+
+typedef struct PtlFailNid_in {
+       ptl_handle_ni_t interface;
+       ptl_nid_t       nid;
+       unsigned int    threshold;
+} PtlFailNid_in;
+
+typedef struct PtlFailNid_out {
+       int             rc;
+} PtlFailNid_out;
+
+typedef struct PtlGetId_in {
+        ptl_handle_ni_t handle_in;
+} PtlGetId_in;
+
+typedef struct PtlGetId_out {
+        int rc;
+        ptl_process_id_t id_out;
+} PtlGetId_out;
+
+typedef struct PtlNIStatus_in {
+        ptl_handle_ni_t interface_in;
+        ptl_sr_index_t register_in;
+} PtlNIStatus_in;
+
+typedef struct PtlNIStatus_out {
+        int rc;
+        ptl_sr_value_t status_out;
+} PtlNIStatus_out;
+
+
+typedef struct PtlNIDist_in {
+        ptl_handle_ni_t interface_in;
+        ptl_process_id_t process_in;
+} PtlNIDist_in;
+
+typedef struct PtlNIDist_out {
+        int rc;
+        unsigned long distance_out;
+} PtlNIDist_out;
+
+
+typedef struct PtlNIDebug_in {
+        unsigned int mask_in;
+} PtlNIDebug_in;
+
+typedef struct PtlNIDebug_out {
+        unsigned int rc;
+} PtlNIDebug_out;
+
+
+typedef struct PtlMEAttach_in {
+        ptl_handle_ni_t interface_in;
+        ptl_pt_index_t index_in;
+        ptl_ins_pos_t position_in;
+        ptl_process_id_t match_id_in;
+        ptl_match_bits_t match_bits_in;
+        ptl_match_bits_t ignore_bits_in;
+        ptl_unlink_t unlink_in;
+} PtlMEAttach_in;
+
+typedef struct PtlMEAttach_out {
+        int rc;
+        ptl_handle_me_t handle_out;
+} PtlMEAttach_out;
+
+
+typedef struct PtlMEInsert_in {
+        ptl_handle_me_t current_in;
+        ptl_process_id_t match_id_in;
+        ptl_match_bits_t match_bits_in;
+        ptl_match_bits_t ignore_bits_in;
+        ptl_unlink_t unlink_in;
+        ptl_ins_pos_t position_in;
+} PtlMEInsert_in;
+
+typedef struct PtlMEInsert_out {
+        int rc;
+        ptl_handle_me_t handle_out;
+} PtlMEInsert_out;
+
+typedef struct PtlMEUnlink_in {
+        ptl_handle_me_t current_in;
+        ptl_unlink_t unlink_in;
+} PtlMEUnlink_in;
+
+typedef struct PtlMEUnlink_out {
+        int rc;
+} PtlMEUnlink_out;
+
+
+typedef struct PtlTblDump_in {
+        int index_in;
+} PtlTblDump_in;
+
+typedef struct PtlTblDump_out {
+        int rc;
+} PtlTblDump_out;
+
+
+typedef struct PtlMEDump_in {
+        ptl_handle_me_t current_in;
+} PtlMEDump_in;
+
+typedef struct PtlMEDump_out {
+        int rc;
+} PtlMEDump_out;
+
+
+typedef struct PtlMDAttach_in {
+        ptl_handle_me_t me_in;
+        ptl_handle_eq_t eq_in;
+        ptl_md_t md_in;
+        ptl_unlink_t unlink_in;
+} PtlMDAttach_in;
+
+typedef struct PtlMDAttach_out {
+        int rc;
+        ptl_handle_md_t handle_out;
+} PtlMDAttach_out;
+
+
+typedef struct PtlMDBind_in {
+        ptl_handle_ni_t ni_in;
+        ptl_handle_eq_t eq_in;
+        ptl_md_t md_in;
+} PtlMDBind_in;
+
+typedef struct PtlMDBind_out {
+        int rc;
+        ptl_handle_md_t handle_out;
+} PtlMDBind_out;
+
+
+typedef struct PtlMDUpdate_internal_in {
+        ptl_handle_md_t md_in;
+        ptl_handle_eq_t testq_in;
+        ptl_seq_t sequence_in;
+
+        ptl_md_t old_inout;
+        int old_inout_valid;
+        ptl_md_t new_inout;
+        int new_inout_valid;
+} PtlMDUpdate_internal_in;
+
+typedef struct PtlMDUpdate_internal_out {
+        int rc;
+        ptl_md_t old_inout;
+        ptl_md_t new_inout;
+} PtlMDUpdate_internal_out;
+
+
+typedef struct PtlMDUnlink_in {
+        ptl_handle_md_t md_in;
+} PtlMDUnlink_in;
+
+typedef struct PtlMDUnlink_out {
+        int rc;
+        ptl_md_t status_out;
+} PtlMDUnlink_out;
+
+
+typedef struct PtlEQAlloc_in {
+        ptl_handle_ni_t ni_in;
+        ptl_size_t count_in;
+        void *base_in;
+        int len_in;
+        int (*callback_in) (ptl_event_t * event);
+} PtlEQAlloc_in;
+
+typedef struct PtlEQAlloc_out {
+        int rc;
+        ptl_handle_eq_t handle_out;
+} PtlEQAlloc_out;
+
+
+typedef struct PtlEQFree_in {
+        ptl_handle_eq_t eventq_in;
+} PtlEQFree_in;
+
+typedef struct PtlEQFree_out {
+        int rc;
+} PtlEQFree_out;
+
+
+typedef struct PtlACEntry_in {
+        ptl_handle_ni_t ni_in;
+        ptl_ac_index_t index_in;
+        ptl_process_id_t match_id_in;
+        ptl_pt_index_t portal_in;
+} PtlACEntry_in;
+
+typedef struct PtlACEntry_out {
+        int rc;
+} PtlACEntry_out;
+
+
+typedef struct PtlPut_in {
+        ptl_handle_md_t md_in;
+        ptl_ack_req_t ack_req_in;
+        ptl_process_id_t target_in;
+        ptl_pt_index_t portal_in;
+        ptl_ac_index_t cookie_in;
+        ptl_match_bits_t match_bits_in;
+        ptl_size_t offset_in;
+        ptl_hdr_data_t hdr_data_in;
+} PtlPut_in;
+
+typedef struct PtlPut_out {
+        int rc;
+} PtlPut_out;
+
+
+typedef struct PtlGet_in {
+        ptl_handle_md_t md_in;
+        ptl_process_id_t target_in;
+        ptl_pt_index_t portal_in;
+        ptl_ac_index_t cookie_in;
+        ptl_match_bits_t match_bits_in;
+        ptl_size_t offset_in;
+} PtlGet_in;
+
+typedef struct PtlGet_out {
+        int rc;
+} PtlGet_out;
+
+
+#endif
diff --git a/lnet/include/lnet/defines.h b/lnet/include/lnet/defines.h
new file mode 100644 (file)
index 0000000..785ce73
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+**
+** This file contains definitions that are used throughout the cplant code.
+*/
+
+#ifndef CPLANT_H
+#define CPLANT_H
+
+#define TITLE(fname,zmig)
+
+
+/*
+** TRUE and FALSE
+*/
+#undef TRUE
+#define TRUE           (1)
+#undef FALSE
+#define FALSE          (0)
+
+
+/*
+** Return codes from functions
+*/
+#undef OK
+#define OK             (0)
+#undef ERROR
+#define ERROR          (-1)
+
+
+
+/*
+** The GCC macro for a safe max() that works on all arithmetic types.
+*/
+#ifndef MAX
+#define MAX(a, b)      ((a) > (b) ? (a) : (b))
+#endif /* MAX */
+
+#ifndef MIN
+#define MIN(a, b)      ((a) < (b) ? (a) : (b))
+#endif /* MIN */
+
+/*
+** The rest is from the old qkdefs.h
+*/
+
+#ifndef __linux__
+#define __inline__
+#endif
+
+#ifndef NULL
+#define NULL ((void *)0)
+#endif
+
+#ifndef __osf__
+#define PRIVATE static
+#define PUBLIC
+#endif
+
+#ifndef __osf__
+typedef unsigned char           uchar;
+#endif
+
+typedef char                    CHAR;
+typedef unsigned char           UCHAR;
+typedef char                    INT8;
+typedef unsigned char           UINT8;
+typedef short int               INT16;
+typedef unsigned short int      UINT16;
+typedef int                     INT32;
+typedef unsigned int            UINT32;
+typedef long                    LONG32;
+typedef unsigned long           ULONG32;
+
+/* long may be 32 or 64, so we can't really append the size to the definition */
+typedef long                    LONG;
+typedef unsigned long           ULONG;
+
+#ifdef __alpha__
+typedef long int_t;
+#ifndef __osf__
+typedef unsigned long uint_t;
+#endif
+#endif
+
+#ifdef __i386__
+typedef int int_t;
+typedef unsigned int uint_t;
+#endif
+
+typedef float                   FLOAT32;
+typedef double                  FLOAT64;
+typedef void                    VOID;
+typedef INT32                   BOOLEAN;
+typedef void (*FCN_PTR)(void);
+
+#ifndef off64_t
+
+#if defined (__alpha__) || defined (__ia64__)
+typedef long                     off64_t;
+#else
+typedef long long                off64_t;
+#endif
+
+#endif
+
+/*
+** Process related typedefs
+*/
+typedef UINT16 PID_TYPE;  /* Type of Local process ID */
+typedef UINT16 NID_TYPE;  /* Type of Physical node ID */
+typedef UINT16 GID_TYPE;  /* Type of Group ID */
+typedef UINT16 RANK_TYPE; /* Type of Logical rank/process within a group */
+
+
+
+#endif /* CPLANT_H */
diff --git a/lnet/include/lnet/errno.h b/lnet/include/lnet/errno.h
new file mode 100644 (file)
index 0000000..817936a
--- /dev/null
@@ -0,0 +1,61 @@
+#ifndef _P30_ERRNO_H_
+#define _P30_ERRNO_H_
+
+/*
+ * include/portals/errno.h
+ *
+ * Shared error number lists
+ */
+
+/* If you change these, you must update the string table in api-errno.c */
+typedef enum {
+        PTL_OK              = 0,
+        PTL_SEGV            = 1,
+
+        PTL_NOSPACE         = 2,
+        PTL_INUSE           = 3,
+        PTL_VAL_FAILED      = 4,
+
+        PTL_NAL_FAILED      = 5,
+        PTL_NOINIT          = 6,
+        PTL_INIT_DUP        = 7,
+        PTL_INIT_INV        = 8,
+        PTL_AC_INV_INDEX    = 9,
+
+        PTL_INV_ASIZE       = 10,
+        PTL_INV_HANDLE      = 11,
+        PTL_INV_MD          = 12,
+        PTL_INV_ME          = 13,
+        PTL_INV_NI          = 14,
+/* If you change these, you must update the string table in api-errno.c */
+        PTL_ILL_MD          = 15,
+        PTL_INV_PROC        = 16,
+        PTL_INV_PSIZE       = 17,
+        PTL_INV_PTINDEX     = 18,
+        PTL_INV_REG         = 19,
+
+        PTL_INV_SR_INDX     = 20,
+        PTL_ML_TOOLONG      = 21,
+        PTL_ADDR_UNKNOWN    = 22,
+        PTL_INV_EQ          = 23,
+        PTL_EQ_DROPPED      = 24,
+
+        PTL_EQ_EMPTY        = 25,
+        PTL_NOUPDATE        = 26,
+        PTL_FAIL            = 27,
+        PTL_NOT_IMPLEMENTED = 28,
+        PTL_NO_ACK          = 29,
+
+        PTL_IOV_TOO_MANY    = 30,
+        PTL_IOV_TOO_SMALL   = 31,
+
+        PTL_EQ_INUSE        = 32,
+        PTL_MD_INUSE        = 33,
+
+        PTL_MAX_ERRNO       = 33
+} ptl_err_t;
+/* If you change these, you must update the string table in api-errno.c */
+
+extern const char *ptl_err_str[];
+
+#endif
diff --git a/lnet/include/lnet/internal.h b/lnet/include/lnet/internal.h
new file mode 100644 (file)
index 0000000..d78cad4
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+*/
+#ifndef _P30_INTERNAL_H_
+#define _P30_INTERNAL_H_
+
+/*
+ * p30/internal.h
+ *
+ * Internals for the API level library that are not needed
+ * by the user application
+ */
+
+#include <portals/p30.h>
+
+extern int ptl_init;           /* Has the library been initialized */
+
+extern int ptl_ni_init(void);
+extern int ptl_me_init(void);
+extern int ptl_md_init(void);
+extern int ptl_eq_init(void);
+
+extern int ptl_me_ni_init(nal_t * nal);
+extern int ptl_md_ni_init(nal_t * nal);
+extern int ptl_eq_ni_init(nal_t * nal);
+
+extern void ptl_ni_fini(void);
+extern void ptl_me_fini(void);
+extern void ptl_md_fini(void);
+extern void ptl_eq_fini(void);
+
+extern void ptl_me_ni_fini(nal_t * nal);
+extern void ptl_md_ni_fini(nal_t * nal);
+extern void ptl_eq_ni_fini(nal_t * nal);
+
+static inline ptl_eq_t *
+ptl_handle2usereq (ptl_handle_eq_t *handle)
+{
+        /* EQ handles are a little weird.  On the "user" side, the cookie
+         * is just a pointer to a queue of events in shared memory.  Its
+         * cb_eq_handle is the "real" handle which we pass when we
+         * call do_forward(). */
+        return (ptl_eq_t *)((unsigned long)handle->cookie);
+}
+
+#endif
diff --git a/lnet/include/lnet/lib-dispatch.h b/lnet/include/lnet/lib-dispatch.h
new file mode 100644 (file)
index 0000000..f87ff83
--- /dev/null
@@ -0,0 +1,45 @@
+#ifndef PTL_DISPATCH_H
+#define PTL_DISPATCH_H
+
+/*
+ * include/dispatch.h
+ *
+ * Dispatch table header and externs for remote side
+ * operations
+ *
+ * Generated by idl
+ *
+ */
+
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+extern int do_PtlGetId(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlNIStatus(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlNIDist(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlNIDebug(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMEAttach(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMEInsert(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMEPrepend(nal_cb_t * nal, void *private, void *args,
+                           void *ret);
+extern int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlTblDump(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMEDump(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMDAttach(nal_cb_t * nal, void *private, void *args,
+                                   void *ret);
+extern int do_PtlMDBind(nal_cb_t * nal, void *private, void *args,
+                                 void *ret);
+extern int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *args,
+                                   void *ret);
+extern int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *args,
+                                   void *ret);
+extern int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *args,
+                                  void *ret);
+extern int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *args,
+                                 void *ret);
+extern int do_PtlPut(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlGet(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlFailNid (nal_cb_t *nal, void *private, void *args, void *ret);
+
+extern char *dispatch_name(int index);
+#endif
diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h
new file mode 100644 (file)
index 0000000..b623b93
--- /dev/null
@@ -0,0 +1,385 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib-p30.h
+ *
+ * Top level include for library side routines
+ */
+
+#ifndef _LIB_P30_H_
+#define _LIB_P30_H_
+
+#ifdef __KERNEL__
+# include <asm/page.h>
+# include <linux/string.h>
+#else
+# include <portals/list.h>
+# include <string.h>
+#endif
+#include <portals/types.h>
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/errno.h>
+#include <portals/lib-types.h>
+#include <portals/lib-nal.h>
+#include <portals/lib-dispatch.h>
+
+static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh)
+{
+        return (wh->wh_interface_cookie == PTL_WIRE_HANDLE_NONE.wh_interface_cookie &&
+                wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie);
+}
+
+#ifdef __KERNEL__
+#define state_lock(nal,flagsp)                          \
+do {                                                    \
+        CDEBUG(D_PORTALS, "taking state lock\n");       \
+        nal->cb_cli(nal, flagsp);                       \
+} while (0)
+
+#define state_unlock(nal,flagsp)                        \
+{                                                       \
+        CDEBUG(D_PORTALS, "releasing state lock\n");    \
+        nal->cb_sti(nal, flagsp);                       \
+}
+#else
+/* not needed in user space until we thread there */
+#define state_lock(nal,flagsp)                          \
+do {                                                    \
+        CDEBUG(D_PORTALS, "taking state lock\n");       \
+        CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp);      \
+} while (0)
+
+#define state_unlock(nal,flagsp)                        \
+{                                                       \
+        CDEBUG(D_PORTALS, "releasing state lock\n");    \
+        CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp);      \
+}
+#endif /* __KERNEL__ */
+
+#ifndef PTL_USE_SLAB_CACHE
+
+#define MAX_MES         2048
+#define MAX_MDS         2048
+#define MAX_MSGS        2048    /* Outstanding messages */
+#define MAX_EQS         512
+
+extern int lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int nobj, int objsize);
+extern void lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl);
+
+static inline void *
+lib_freelist_alloc (lib_freelist_t *fl)
+{
+        /* ALWAYS called with statelock held */
+        lib_freeobj_t *o;
+
+        if (list_empty (&fl->fl_list))
+                return (NULL);
+        
+        o = list_entry (fl->fl_list.next, lib_freeobj_t, fo_list);
+        list_del (&o->fo_list);
+        return ((void *)&o->fo_contents);
+}
+
+static inline void
+lib_freelist_free (lib_freelist_t *fl, void *obj)
+{
+        /* ALWAYS called with statelock held */
+        lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents);
+        
+        list_add (&o->fo_list, &fl->fl_list);
+}
+
+
+static inline lib_eq_t *
+lib_eq_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        unsigned long  flags;
+        lib_eq_t      *eq;
+        
+        state_lock (nal, &flags);
+        eq = (lib_eq_t *)lib_freelist_alloc (&nal->ni.ni_free_eqs);
+        state_unlock (nal, &flags);
+
+        return (eq);
+}
+
+static inline void
+lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
+{
+        /* ALWAYS called with statelock held */
+        lib_freelist_free (&nal->ni.ni_free_eqs, eq);
+}
+
+static inline lib_md_t *
+lib_md_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        unsigned long  flags;
+        lib_md_t      *md;
+        
+        state_lock (nal, &flags);
+        md = (lib_md_t *)lib_freelist_alloc (&nal->ni.ni_free_mds);
+        state_unlock (nal, &flags);
+
+        return (md);
+}
+
+static inline void
+lib_md_free (nal_cb_t *nal, lib_md_t *md)
+{
+        /* ALWAYS called with statelock held */
+        lib_freelist_free (&nal->ni.ni_free_mds, md);
+}
+
+static inline lib_me_t *
+lib_me_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        unsigned long  flags;
+        lib_me_t      *me;
+        
+        state_lock (nal, &flags);
+        me = (lib_me_t *)lib_freelist_alloc (&nal->ni.ni_free_mes);
+        state_unlock (nal, &flags);
+        
+        return (me);
+}
+
+static inline void
+lib_me_free (nal_cb_t *nal, lib_me_t *me)
+{
+        /* ALWAYS called with statelock held */
+        lib_freelist_free (&nal->ni.ni_free_mes, me);
+}
+
+static inline lib_msg_t *
+lib_msg_alloc (nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs));
+}
+
+static inline void
+lib_msg_free (nal_cb_t *nal, lib_msg_t *msg)
+{
+        /* ALWAYS called with statelock held */
+        lib_freelist_free (&nal->ni.ni_free_msgs, msg);
+}
+
+#else
+
+extern kmem_cache_t *ptl_md_slab;
+extern kmem_cache_t *ptl_msg_slab;
+extern kmem_cache_t *ptl_me_slab;
+extern kmem_cache_t *ptl_eq_slab;
+extern atomic_t      md_in_use_count;
+extern atomic_t      msg_in_use_count;
+extern atomic_t      me_in_use_count;
+extern atomic_t      eq_in_use_count;
+
+static inline lib_eq_t *
+lib_eq_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        lib_eq_t *eq = kmem_cache_alloc(ptl_eq_slab, GFP_NOFS);
+
+        if (eq == NULL)
+                return (NULL);
+
+        atomic_inc (&eq_in_use_count);
+        return (eq);
+}
+
+static inline void
+lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
+{
+        /* ALWAYS called with statelock held */
+        atomic_dec (&eq_in_use_count);
+        kmem_cache_free(ptl_eq_slab, eq);
+}
+
+static inline lib_md_t *
+lib_md_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        lib_md_t *md = kmem_cache_alloc(ptl_md_slab, GFP_NOFS);
+
+        if (md == NULL)
+                return (NULL);
+
+        atomic_inc (&md_in_use_count);
+        return (md);
+}
+
+static inline void 
+lib_md_free (nal_cb_t *nal, lib_md_t *md)
+{
+        /* ALWAYS called with statelock held */
+        atomic_dec (&md_in_use_count);
+        kmem_cache_free(ptl_md_slab, md); 
+}
+
+static inline lib_me_t *
+lib_me_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        lib_me_t *me = kmem_cache_alloc(ptl_me_slab, GFP_NOFS);
+
+        if (me == NULL)
+                return (NULL);
+
+        atomic_inc (&me_in_use_count);
+        return (me);
+}
+
+static inline void 
+lib_me_free(nal_cb_t *nal, lib_me_t *me)
+{
+        /* ALWAYS called with statelock held */
+        atomic_dec (&me_in_use_count);
+        kmem_cache_free(ptl_me_slab, me);
+}
+
+static inline lib_msg_t *
+lib_msg_alloc(nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        lib_msg_t *msg = kmem_cache_alloc(ptl_msg_slab, GFP_ATOMIC);
+
+        if (msg == NULL)
+                return (NULL);
+        
+        atomic_inc (&msg_in_use_count);
+        return (msg);
+}
+
+static inline void 
+lib_msg_free(nal_cb_t *nal, lib_msg_t *msg)
+{
+        /* ALWAYS called with statelock held */
+        atomic_dec (&msg_in_use_count);
+        kmem_cache_free(ptl_msg_slab, msg); 
+}
+#endif
+
+extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type);
+extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type);
+extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh);
+
+static inline void
+ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq)
+{
+        handle->cookie = eq->eq_lh.lh_cookie;
+}
+
+static inline lib_eq_t *
+ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, 
+                                              PTL_COOKIE_TYPE_EQ);
+        if (lh == NULL)
+                return (NULL);
+
+        return (lh_entry (lh, lib_eq_t, eq_lh));
+}
+
+static inline void
+ptl_md2handle (ptl_handle_md_t *handle, lib_md_t *md)
+{
+        handle->cookie = md->md_lh.lh_cookie;
+}
+
+static inline lib_md_t *
+ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie,
+                                              PTL_COOKIE_TYPE_MD);
+        if (lh == NULL)
+                return (NULL);
+
+        return (lh_entry (lh, lib_md_t, md_lh));
+}
+
+static inline lib_md_t *
+ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        lib_handle_t *lh;
+        
+        if (wh->wh_interface_cookie != nal->ni.ni_interface_cookie)
+                return (NULL);
+        
+        lh = lib_lookup_cookie (nal, wh->wh_object_cookie,
+                                PTL_COOKIE_TYPE_MD);
+        if (lh == NULL)
+                return (NULL);
+
+        return (lh_entry (lh, lib_md_t, md_lh));
+}
+
+static inline void
+ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me)
+{
+        handle->cookie = me->me_lh.lh_cookie;
+}
+
+static inline lib_me_t *
+ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie,
+                                              PTL_COOKIE_TYPE_ME);
+        if (lh == NULL)
+                return (NULL);
+
+        return (lh_entry (lh, lib_me_t, me_lh));
+}
+
+extern int lib_init(nal_cb_t * cb, ptl_nid_t nid, ptl_pid_t pid, int gsize,
+                    ptl_pt_index_t tbl_size, ptl_ac_index_t ac_size);
+extern int lib_fini(nal_cb_t * cb);
+extern void lib_dispatch(nal_cb_t * cb, void *private, int index,
+                         void *arg_block, void *ret_block);
+extern char *dispatch_name(int index);
+
+/*
+ * When the NAL detects an incoming message, it should call
+ * lib_parse() to decode it.  The NAL callbacks will be handed
+ * the private cookie as a way for the NAL to maintain state
+ * about which transaction is being processed.  An extra parameter,
+ * lib_cookie, will contain the information needed to
+ * finalize the message.
+ *
+ * After it has finished handling the message, the NAL should
+ * call lib_finalize() with the lib_cookie parameter.
+ * Callbacks will be made to write events, send acks or
+ * replies, and so on.
+ */
+extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private);
+extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg);
+extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr);
+
+extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
+extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len);
+extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *dest, ptl_size_t len);
+
+extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov);
+extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len);
+extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len);
+extern void lib_assert_wire_constants (void);
+
+extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+                      ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
+extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                     lib_md_t *md, ptl_size_t offset, ptl_size_t len);
+
+extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in,
+                               ptl_md_t * md_out);
+extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in);
+extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in);
+#endif
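A minimal usage sketch for the allocation and handle helpers above, assuming only the signatures declared in this header; the function name and the exact placement of the locking are illustrative, not part of this commit:

static void example_md_lifecycle(nal_cb_t *nal, ptl_handle_md_t *handle)
{
        unsigned long  flags;
        lib_md_t      *md = lib_md_alloc(nal);   /* NEVER with statelock held */

        if (md == NULL)
                return;

        state_lock(nal, &flags);
        lib_initialise_handle(nal, &md->md_lh, PTL_COOKIE_TYPE_MD);
        ptl_md2handle(handle, md);               /* hand the cookie back to the caller */

        /* ...later, still (or again) under the statelock, a user handle is
         * translated back to the object; NULL means it has been invalidated */
        md = ptl_handle2md(handle, nal);
        state_unlock(nal, &flags);
}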
diff --git a/lnet/include/lnet/lib-nal.h b/lnet/include/lnet/lib-nal.h
new file mode 100644 (file)
index 0000000..4052c0c
--- /dev/null
@@ -0,0 +1,102 @@
+#ifndef _LIB_NAL_H_
+#define _LIB_NAL_H_
+
+/*
+ * nal.h
+ *
+ * Library side headers that define the abstraction layer's
+ * responsibilities and interfaces
+ */
+
+#include <portals/lib-types.h>
+
+struct nal_cb_t {
+       /*
+        * Per interface portal table, access control table
+        * and NAL private data field;
+        */
+       lib_ni_t ni;
+       void *nal_data;
+       /*
+        * send:  Sends a preformatted header and user data to a
+        * specified remote process.
+        * Can overwrite iov.
+        */
+       int (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, 
+                       ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
+                       unsigned int niov, struct iovec *iov, size_t mlen);
+
+       /* as send, but with a set of page fragments (NULL if not supported) */
+       int (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, 
+                             ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
+                             unsigned int niov, ptl_kiov_t *iov, size_t mlen);
+       /*
+        * recv: Receives an incoming message from a remote process
+        * Type of iov depends on options.  Can overwrite iov.
+        */
+       int (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+                       unsigned int niov, struct iovec *iov, size_t mlen, 
+                       size_t rlen);
+
+       /* as recv, but with a set of page fragments (NULL if not supported) */
+       int (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+                             unsigned int niov, ptl_kiov_t *iov, size_t mlen, 
+                             size_t rlen);
+       /*
+        * read: Reads a block of data from a specified user address
+        */
+       int (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr,
+                       user_ptr src_addr, size_t len);
+
+       /*
+        * write: Writes a block of data into a specified user address
+        */
+       int (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr,
+                        void *src_addr, size_t len);
+
+       /*
+        * callback: Calls an event callback
+        */
+       int (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq,
+                        ptl_event_t *ev);
+
+       /*
+        *  malloc: Acquire a block of memory in a system independent
+        * fashion.
+        */
+       void *(*cb_malloc) (nal_cb_t * nal, size_t len);
+
+       void (*cb_free) (nal_cb_t * nal, void *buf, size_t len);
+
+       /*
+        * (un)map: Tell the NAL about some memory it will access.
+        * *addrkey passed to cb_unmap() is what cb_map() set it to.
+        * type of *iov depends on options.
+        * Set to NULL if not required.
+        */
+       int (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, 
+                      void **addrkey);
+       void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, 
+                         void **addrkey);
+
+       /* as (un)map, but with a set of page fragments */
+       int (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, 
+                            void **addrkey);
+       void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, 
+                         void **addrkey);
+
+       void (*cb_printf) (nal_cb_t * nal, const char *fmt, ...);
+
+       /* Turn interrupts off (begin of protected area) */
+       void (*cb_cli) (nal_cb_t * nal, unsigned long *flags);
+
+       /* Turn interrupts on (end of protected area) */
+       void (*cb_sti) (nal_cb_t * nal, unsigned long *flags);
+
+       /*
+        * Calculate a network "distance" to given node
+        */
+       int (*cb_dist) (nal_cb_t * nal, ptl_nid_t nid, unsigned long *dist);
+};
+
+#endif
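To make the callback table concrete, here is a hedged sketch of how a NAL might populate nal_cb_t; the mynal_* names are illustrative stubs, and the optional page/map callbacks are left NULL as the comments above permit:

/* stub send: push hdr + iov onto the wire, later lib_finalize() the cookie */
static int mynal_send(nal_cb_t *nal, void *private, lib_msg_t *cookie,
                      ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
                      unsigned int niov, struct iovec *iov, size_t mlen)
{
        /* transmit, then call lib_finalize(nal, private, cookie) on completion */
        return 0;
}

static void mynal_cli(nal_cb_t *nal, unsigned long *flags) { /* take the state lock */ }
static void mynal_sti(nal_cb_t *nal, unsigned long *flags) { /* drop the state lock */ }

nal_cb_t mynal_cb = {
        .cb_send = mynal_send,
        .cb_cli  = mynal_cli,
        .cb_sti  = mynal_sti,
        /* cb_recv, cb_read, cb_write, cb_malloc, cb_free, cb_printf and
         * cb_dist would be filled in the same way; cb_send_pages, cb_map
         * and friends stay NULL when not supported */
};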
diff --git a/lnet/include/lnet/lib-p30.h b/lnet/include/lnet/lib-p30.h
new file mode 100644 (file)
index 0000000..b623b93
--- /dev/null
@@ -0,0 +1,385 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib-p30.h
+ *
+ * Top level include for library side routines
+ */
+
+#ifndef _LIB_P30_H_
+#define _LIB_P30_H_
+
+#ifdef __KERNEL__
+# include <asm/page.h>
+# include <linux/string.h>
+#else
+# include <portals/list.h>
+# include <string.h>
+#endif
+#include <portals/types.h>
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/errno.h>
+#include <portals/lib-types.h>
+#include <portals/lib-nal.h>
+#include <portals/lib-dispatch.h>
+
+static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh)
+{
+        return (wh->wh_interface_cookie == PTL_WIRE_HANDLE_NONE.wh_interface_cookie &&
+                wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie);
+}
+
+#ifdef __KERNEL__
+#define state_lock(nal,flagsp)                          \
+do {                                                    \
+        CDEBUG(D_PORTALS, "taking state lock\n");       \
+        nal->cb_cli(nal, flagsp);                       \
+} while (0)
+
+#define state_unlock(nal,flagsp)                        \
+{                                                       \
+        CDEBUG(D_PORTALS, "releasing state lock\n");    \
+        nal->cb_sti(nal, flagsp);                       \
+}
+#else
+/* not needed in user space until we thread there */
+#define state_lock(nal,flagsp)                          \
+do {                                                    \
+        CDEBUG(D_PORTALS, "taking state lock\n");       \
+        CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp);      \
+} while (0)
+
+#define state_unlock(nal,flagsp)                        \
+{                                                       \
+        CDEBUG(D_PORTALS, "releasing state lock\n");    \
+        CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp);      \
+}
+#endif /* __KERNEL__ */
+
+#ifndef PTL_USE_SLAB_CACHE
+
+#define MAX_MES         2048
+#define MAX_MDS         2048
+#define MAX_MSGS        2048    /* Outstanding messages */
+#define MAX_EQS         512
+
+extern int lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int nobj, int objsize);
+extern void lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl);
+
+static inline void *
+lib_freelist_alloc (lib_freelist_t *fl)
+{
+        /* ALWAYS called with statelock held */
+        lib_freeobj_t *o;
+
+        if (list_empty (&fl->fl_list))
+                return (NULL);
+        
+        o = list_entry (fl->fl_list.next, lib_freeobj_t, fo_list);
+        list_del (&o->fo_list);
+        return ((void *)&o->fo_contents);
+}
+
+static inline void
+lib_freelist_free (lib_freelist_t *fl, void *obj)
+{
+        /* ALWAYS called with statelock held */
+        lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents);
+        
+        list_add (&o->fo_list, &fl->fl_list);
+}
+
+
+static inline lib_eq_t *
+lib_eq_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        unsigned long  flags;
+        lib_eq_t      *eq;
+        
+        state_lock (nal, &flags);
+        eq = (lib_eq_t *)lib_freelist_alloc (&nal->ni.ni_free_eqs);
+        state_unlock (nal, &flags);
+
+        return (eq);
+}
+
+static inline void
+lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
+{
+        /* ALWAYS called with statelock held */
+        lib_freelist_free (&nal->ni.ni_free_eqs, eq);
+}
+
+static inline lib_md_t *
+lib_md_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        unsigned long  flags;
+        lib_md_t      *md;
+        
+        state_lock (nal, &flags);
+        md = (lib_md_t *)lib_freelist_alloc (&nal->ni.ni_free_mds);
+        state_unlock (nal, &flags);
+
+        return (md);
+}
+
+static inline void
+lib_md_free (nal_cb_t *nal, lib_md_t *md)
+{
+        /* ALWAYS called with statelock held */
+        lib_freelist_free (&nal->ni.ni_free_mds, md);
+}
+
+static inline lib_me_t *
+lib_me_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        unsigned long  flags;
+        lib_me_t      *me;
+        
+        state_lock (nal, &flags);
+        me = (lib_me_t *)lib_freelist_alloc (&nal->ni.ni_free_mes);
+        state_unlock (nal, &flags);
+        
+        return (me);
+}
+
+static inline void
+lib_me_free (nal_cb_t *nal, lib_me_t *me)
+{
+        /* ALWAYS called with statelock held */
+        lib_freelist_free (&nal->ni.ni_free_mes, me);
+}
+
+static inline lib_msg_t *
+lib_msg_alloc (nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs));
+}
+
+static inline void
+lib_msg_free (nal_cb_t *nal, lib_msg_t *msg)
+{
+        /* ALWAYS called with statelock held */
+        lib_freelist_free (&nal->ni.ni_free_msgs, msg);
+}
+
+#else
+
+extern kmem_cache_t *ptl_md_slab;
+extern kmem_cache_t *ptl_msg_slab;
+extern kmem_cache_t *ptl_me_slab;
+extern kmem_cache_t *ptl_eq_slab;
+extern atomic_t      md_in_use_count;
+extern atomic_t      msg_in_use_count;
+extern atomic_t      me_in_use_count;
+extern atomic_t      eq_in_use_count;
+
+static inline lib_eq_t *
+lib_eq_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        lib_eq_t *eq = kmem_cache_alloc(ptl_eq_slab, GFP_NOFS);
+
+        if (eq == NULL)
+                return (NULL);
+
+        atomic_inc (&eq_in_use_count);
+        return (eq);
+}
+
+static inline void
+lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
+{
+        /* ALWAYS called with statelock held */
+        atomic_dec (&eq_in_use_count);
+        kmem_cache_free(ptl_eq_slab, eq);
+}
+
+static inline lib_md_t *
+lib_md_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        lib_md_t *md = kmem_cache_alloc(ptl_md_slab, GFP_NOFS);
+
+        if (md == NULL)
+                return (NULL);
+
+        atomic_inc (&md_in_use_count);
+        return (md);
+}
+
+static inline void 
+lib_md_free (nal_cb_t *nal, lib_md_t *md)
+{
+        /* ALWAYS called with statelock held */
+        atomic_dec (&md_in_use_count);
+        kmem_cache_free(ptl_md_slab, md); 
+}
+
+static inline lib_me_t *
+lib_me_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        lib_me_t *me = kmem_cache_alloc(ptl_me_slab, GFP_NOFS);
+
+        if (me == NULL)
+                return (NULL);
+
+        atomic_inc (&me_in_use_count);
+        return (me);
+}
+
+static inline void 
+lib_me_free(nal_cb_t *nal, lib_me_t *me)
+{
+        /* ALWAYS called with statelock held */
+        atomic_dec (&me_in_use_count);
+        kmem_cache_free(ptl_me_slab, me);
+}
+
+static inline lib_msg_t *
+lib_msg_alloc(nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        lib_msg_t *msg = kmem_cache_alloc(ptl_msg_slab, GFP_ATOMIC);
+
+        if (msg == NULL)
+                return (NULL);
+        
+        atomic_inc (&msg_in_use_count);
+        return (msg);
+}
+
+static inline void 
+lib_msg_free(nal_cb_t *nal, lib_msg_t *msg)
+{
+        /* ALWAYS called with statelock held */
+        atomic_dec (&msg_in_use_count);
+        kmem_cache_free(ptl_msg_slab, msg); 
+}
+#endif
+
+extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type);
+extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type);
+extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh);
+
+static inline void
+ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq)
+{
+        handle->cookie = eq->eq_lh.lh_cookie;
+}
+
+static inline lib_eq_t *
+ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, 
+                                              PTL_COOKIE_TYPE_EQ);
+        if (lh == NULL)
+                return (NULL);
+
+        return (lh_entry (lh, lib_eq_t, eq_lh));
+}
+
+static inline void
+ptl_md2handle (ptl_handle_md_t *handle, lib_md_t *md)
+{
+        handle->cookie = md->md_lh.lh_cookie;
+}
+
+static inline lib_md_t *
+ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie,
+                                              PTL_COOKIE_TYPE_MD);
+        if (lh == NULL)
+                return (NULL);
+
+        return (lh_entry (lh, lib_md_t, md_lh));
+}
+
+static inline lib_md_t *
+ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        lib_handle_t *lh;
+        
+        if (wh->wh_interface_cookie != nal->ni.ni_interface_cookie)
+                return (NULL);
+        
+        lh = lib_lookup_cookie (nal, wh->wh_object_cookie,
+                                PTL_COOKIE_TYPE_MD);
+        if (lh == NULL)
+                return (NULL);
+
+        return (lh_entry (lh, lib_md_t, md_lh));
+}
+
+static inline void
+ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me)
+{
+        handle->cookie = me->me_lh.lh_cookie;
+}
+
+static inline lib_me_t *
+ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie,
+                                              PTL_COOKIE_TYPE_ME);
+        if (lh == NULL)
+                return (NULL);
+
+        return (lh_entry (lh, lib_me_t, me_lh));
+}
+
+extern int lib_init(nal_cb_t * cb, ptl_nid_t nid, ptl_pid_t pid, int gsize,
+                    ptl_pt_index_t tbl_size, ptl_ac_index_t ac_size);
+extern int lib_fini(nal_cb_t * cb);
+extern void lib_dispatch(nal_cb_t * cb, void *private, int index,
+                         void *arg_block, void *ret_block);
+extern char *dispatch_name(int index);
+
+/*
+ * When the NAL detects an incoming message, it should call
+ * lib_parse() to decode it.  The NAL callbacks will be handed
+ * the private cookie as a way for the NAL to maintain state
+ * about which transaction is being processed.  An extra parameter,
+ * lib_cookie, will contain the information needed to
+ * finalize the message.
+ *
+ * After it has finished handling the message, the NAL should
+ * call lib_finalize() with the lib_cookie parameter.
+ * Callbacks will be made to write events, send acks or
+ * replies, and so on.
+ */
+extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private);
+extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg);
+extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr);
+
+extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
+extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len);
+extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *dest, ptl_size_t len);
+
+extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov);
+extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len);
+extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len);
+extern void lib_assert_wire_constants (void);
+
+extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+                      ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
+extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                     lib_md_t *md, ptl_size_t offset, ptl_size_t len);
+
+extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in,
+                               ptl_md_t * md_out);
+extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in);
+extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in);
+#endif
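The lib_parse()/lib_finalize() comment above translates into roughly the following NAL-side skeleton (a sketch only; mynal_rx, mynal_rx_done and conn are illustrative names, not part of this commit):

static void mynal_rx(nal_cb_t *nal, void *conn, ptl_hdr_t *hdr)
{
        /* Hand the decoded header to the library.  lib_parse() calls back
         * into cb_recv()/cb_recv_pages() with 'conn' as 'private' so the
         * NAL can pull the payload into the matched MD. */
        lib_parse(nal, hdr, conn);
}

static void mynal_rx_done(nal_cb_t *nal, void *conn, lib_msg_t *libmsg)
{
        /* When the payload transfer completes, finalize the lib_msg_t the
         * library handed in as 'cookie' to cb_recv(); this writes events
         * and sends any ACK or reply. */
        lib_finalize(nal, conn, libmsg);
}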
diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h
new file mode 100644 (file)
index 0000000..47c0dd2
--- /dev/null
@@ -0,0 +1,282 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * p30/lib-types.h
+ *
+ * Types used by the library side routines that do not need to be
+ * exposed to the user application
+ */
+
+#ifndef _LIB_TYPES_H_
+#define _LIB_TYPES_H_
+
+#include <portals/types.h>
+#ifdef __KERNEL__
+# define PTL_USE_SLAB_CACHE
+# include <linux/uio.h>
+# include <linux/smp_lock.h>
+# include <linux/types.h>
+#else
+# include <sys/types.h>
+#endif
+
+/* struct nal_cb_t is defined in lib-nal.h */
+typedef struct nal_cb_t nal_cb_t;
+
+typedef char *user_ptr;
+typedef struct lib_msg_t lib_msg_t;
+typedef struct lib_ptl_t lib_ptl_t;
+typedef struct lib_ac_t lib_ac_t;
+typedef struct lib_me_t lib_me_t;
+typedef struct lib_md_t lib_md_t;
+typedef struct lib_eq_t lib_eq_t;
+
+#define WIRE_ATTR      __attribute__((packed))
+
+/* The wire handle's interface cookie only matches one network interface in
+ * one epoch (i.e. new cookie when the interface restarts or the node
+ * reboots).  The object cookie only matches one object on that interface
+ * during that object's lifetime (i.e. no cookie re-use). */
+typedef struct {
+        __u64 wh_interface_cookie;
+        __u64 wh_object_cookie;
+} WIRE_ATTR ptl_handle_wire_t;
+
+/* byte-flip insensitive! */
+#define PTL_WIRE_HANDLE_NONE \
+((const ptl_handle_wire_t) {.wh_interface_cookie = -1, .wh_object_cookie = -1})
+
+typedef enum {
+        PTL_MSG_ACK = 0,
+        PTL_MSG_PUT,
+        PTL_MSG_GET,
+        PTL_MSG_REPLY,
+        PTL_MSG_HELLO,
+} ptl_msg_type_t;
+
+/* Each of these structs should start with an odd number of
+ * __u32, or the compiler could add its own padding and confuse
+ * everyone.
+ *
+ * Also, "length" needs to be at offset 28 of each struct.
+ */
+typedef struct ptl_ack {
+        ptl_size_t mlength;
+        ptl_handle_wire_t dst_wmd;
+        ptl_match_bits_t match_bits;
+        ptl_size_t length;                      /* common length (0 for acks) moving out RSN */
+} WIRE_ATTR ptl_ack_t;
+
+typedef struct ptl_put {
+        ptl_pt_index_t ptl_index;
+        ptl_handle_wire_t ack_wmd;
+        ptl_match_bits_t match_bits;
+        ptl_size_t length;                      /* common length moving out RSN */
+        ptl_size_t offset;
+        ptl_hdr_data_t hdr_data;
+} WIRE_ATTR ptl_put_t;
+
+typedef struct ptl_get {
+        ptl_pt_index_t ptl_index;
+        ptl_handle_wire_t return_wmd;
+        ptl_match_bits_t match_bits;
+        ptl_size_t length;                      /* common length (0 for gets) moving out RSN */
+        ptl_size_t src_offset;
+        ptl_size_t return_offset;               /* unused: going RSN */
+        ptl_size_t sink_length;
+} WIRE_ATTR ptl_get_t;
+
+typedef struct ptl_reply {
+        __u32 unused1;                          /* unused fields going RSN */
+        ptl_handle_wire_t dst_wmd;
+        ptl_size_t dst_offset;                  /* unused: going RSN */
+        __u32 unused2;
+        ptl_size_t length;                      /* common length moving out RSN */
+} WIRE_ATTR ptl_reply_t;
+
+typedef struct {
+        ptl_nid_t dest_nid;
+        ptl_nid_t src_nid;
+        ptl_pid_t dest_pid;
+        ptl_pid_t src_pid;
+        __u32 type; /* ptl_msg_type_t */
+        union {
+                ptl_ack_t ack;
+                ptl_put_t put;
+                ptl_get_t get;
+                ptl_reply_t reply;
+        } msg;
+} WIRE_ATTR ptl_hdr_t;
+
+/* All length fields in individual unions at same offset */
+/* LASSERT for same in lib-move.c */
+#define PTL_HDR_LENGTH(h) ((h)->msg.ack.length)
+
+/* A HELLO message contains the portals magic number and protocol version
+ * code in the header's dest_nid, the peer's NID in the src_nid, and
+ * PTL_MSG_HELLO in the type field.  All other fields are zero (including
+ * PTL_HDR_LENGTH; i.e. no payload).
+ * This is for use by byte-stream NALs (e.g. TCP/IP) to check the peer is
+ * running the same protocol and to find out its NID, so that hosts with
+ * multiple IP interfaces can have a single NID. These NALs should exchange
+ * HELLO messages when a connection is first established. */
+typedef struct {
+        __u32  magic;                          /* PORTALS_PROTO_MAGIC */
+        __u16   version_major;                  /* increment on incompatible change */
+        __u16   version_minor;                  /* increment on compatible change */
+} WIRE_ATTR ptl_magicversion_t;
+
+#define PORTALS_PROTO_MAGIC                0xeebc0ded
+
+#define PORTALS_PROTO_VERSION_MAJOR        0
+#define PORTALS_PROTO_VERSION_MINOR        1
+
+typedef struct {
+        long recv_count, recv_length, send_count, send_length, drop_count,
+            drop_length, msgs_alloc, msgs_max;
+} lib_counters_t;
+
+/* temporary expedient: limit number of entries in discontiguous MDs */
+#if PTL_LARGE_MTU
+# define PTL_MD_MAX_IOV        64
+#else
+# define PTL_MD_MAX_IOV 16
+#endif
+
+struct lib_msg_t {
+        struct list_head  msg_list;
+        int               send_ack;
+        lib_md_t         *md;
+        ptl_nid_t         nid;
+        ptl_pid_t         pid;
+        ptl_event_t       ev;
+        ptl_handle_wire_t ack_wmd;
+        union {
+                struct iovec  iov[PTL_MD_MAX_IOV];
+                ptl_kiov_t    kiov[PTL_MD_MAX_IOV];
+        } msg_iov;
+};
+
+struct lib_ptl_t {
+        ptl_pt_index_t size;
+        struct list_head *tbl;
+};
+
+struct lib_ac_t {
+        int next_free;
+};
+
+typedef struct {
+        struct list_head  lh_hash_chain;
+        __u64             lh_cookie;
+} lib_handle_t;
+
+#define lh_entry(ptr, type, member) \
+       ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+struct lib_eq_t {
+        struct list_head  eq_list;
+        lib_handle_t      eq_lh;
+        ptl_seq_t         sequence;
+        ptl_size_t        size;
+        ptl_event_t      *base;
+        int               eq_refcount;
+        int (*event_callback) (ptl_event_t * event);
+        void             *eq_addrkey;
+};
+
+struct lib_me_t {
+        struct list_head  me_list;
+        lib_handle_t      me_lh;
+        ptl_process_id_t  match_id;
+        ptl_match_bits_t  match_bits, ignore_bits;
+        ptl_unlink_t      unlink;
+        lib_md_t         *md;
+};
+
+struct lib_md_t {
+        struct list_head  md_list;
+        lib_handle_t      md_lh;
+        lib_me_t         *me;
+        user_ptr          start;
+        ptl_size_t        offset;
+        ptl_size_t        length;
+        ptl_size_t        max_size;
+        int               threshold;
+        int               pending;
+        ptl_unlink_t      unlink;
+        unsigned int      options;
+        unsigned int      md_flags;
+        void             *user_ptr;
+        lib_eq_t         *eq;
+        void             *md_addrkey;
+        unsigned int      md_niov;                /* # frags */
+        union {
+                struct iovec  iov[PTL_MD_MAX_IOV];
+                ptl_kiov_t    kiov[PTL_MD_MAX_IOV];
+        } md_iov;
+};
+
+#define PTL_MD_FLAG_UNLINK            (1 << 0)
+#define PTL_MD_FLAG_AUTO_UNLINKED     (1 << 1)
+
+#ifndef PTL_USE_SLAB_CACHE
+typedef struct
+{
+        void             *fl_objs;             /* single contiguous array of objects */
+        int                fl_nobjs;            /* the number of them */
+        int                fl_objsize;          /* the size (including overhead) of each of them */
+        struct list_head   fl_list;             /* where they are enqueued */
+} lib_freelist_t;
+
+typedef struct
+{
+        struct list_head   fo_list;             /* enqueue on fl_list */
+        void              *fo_contents;         /* aligned contents */
+} lib_freeobj_t;
+#endif
+
+typedef struct {
+        /* info about peers we are trying to fail */
+        struct list_head  tp_list;             /* stash in ni.ni_test_peers */
+        ptl_nid_t         tp_nid;              /* matching nid */
+        unsigned int      tp_threshold;        /* # failures to simulate */
+} lib_test_peer_t;
+
+#define PTL_COOKIE_TYPE_MD    1
+#define PTL_COOKIE_TYPE_ME    2
+#define PTL_COOKIE_TYPE_EQ    3
+#define PTL_COOKIE_TYPES      4
+/* PTL_COOKIE_TYPES must be a power of 2, so the cookie type can be
+ * extracted by masking with (PTL_COOKIE_TYPES - 1) */
+
+typedef struct {
+        int up;
+        int refcnt;
+        ptl_nid_t nid;
+        ptl_pid_t pid;
+        int num_nodes;
+        unsigned int debug;
+        lib_ptl_t tbl;
+        lib_ac_t ac;
+        lib_counters_t counters;
+
+        int               ni_lh_hash_size;      /* size of lib handle hash table */
+        struct list_head *ni_lh_hash_table;     /* all extant lib handles, this interface */
+        __u64             ni_next_object_cookie; /* cookie generator */
+        __u64             ni_interface_cookie;  /* uniquely identifies this ni in this epoch */
+        
+        struct list_head ni_test_peers;
+        
+#ifndef PTL_USE_SLAB_CACHE
+        lib_freelist_t   ni_free_mes;
+        lib_freelist_t   ni_free_msgs;
+        lib_freelist_t   ni_free_mds;
+        lib_freelist_t   ni_free_eqs;
+#endif
+        struct list_head ni_active_msgs;
+        struct list_head ni_active_mds;
+        struct list_head ni_active_eqs;
+} lib_ni_t;
+
+#endif
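As an illustration of the HELLO convention described above, a byte-stream NAL could build the header roughly like this; a sketch only, and the helper name and byte-ordering handling are assumptions not defined by this header:

static void build_hello_hdr(ptl_hdr_t *hdr, ptl_nid_t my_nid)
{
        /* magic/version pair is packed into dest_nid; memset() comes from
         * <string.h> (or <linux/string.h> in the kernel) */
        ptl_magicversion_t *mv = (ptl_magicversion_t *)&hdr->dest_nid;

        memset(hdr, 0, sizeof(*hdr));            /* all other fields zero */
        mv->magic         = PORTALS_PROTO_MAGIC;
        mv->version_major = PORTALS_PROTO_VERSION_MAJOR;
        mv->version_minor = PORTALS_PROTO_VERSION_MINOR;

        hdr->src_nid = my_nid;                   /* sender's own NID */
        hdr->type    = PTL_MSG_HELLO;
        /* PTL_HDR_LENGTH(hdr) stays 0: HELLO carries no payload */
}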
diff --git a/lnet/include/lnet/list.h b/lnet/include/lnet/list.h
new file mode 100644 (file)
index 0000000..2b63312
--- /dev/null
@@ -0,0 +1,245 @@
+#ifndef _LINUX_LIST_H
+#define _LINUX_LIST_H
+
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+#define prefetch(a) ((void)a)
+
+struct list_head {
+       struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+       struct list_head name = LIST_HEAD_INIT(name)
+
+#define INIT_LIST_HEAD(ptr) do { \
+       (ptr)->next = (ptr); (ptr)->prev = (ptr); \
+} while (0)
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_add(struct list_head * new,
+                             struct list_head * prev,
+                             struct list_head * next)
+{
+       next->prev = new;
+       new->next = next;
+       new->prev = prev;
+       prev->next = new;
+}
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+       __list_add(new, head, head->next);
+}
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+       __list_add(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+       next->prev = prev;
+       prev->next = next;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is in an undefined state.
+ */
+static inline void list_del(struct list_head *entry)
+{
+       __list_del(entry->prev, entry->next);
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+       __list_del(entry->prev, entry->next);
+       INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+       __list_del(list->prev, list->next);
+       list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ */
+static inline void list_move_tail(struct list_head *list,
+                                 struct list_head *head)
+{
+       __list_del(list->prev, list->next);
+       list_add_tail(list, head);
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static inline int list_empty(struct list_head *head)
+{
+       return head->next == head;
+}
+
+static inline void __list_splice(struct list_head *list,
+                                struct list_head *head)
+{
+       struct list_head *first = list->next;
+       struct list_head *last = list->prev;
+       struct list_head *at = head->next;
+
+       first->prev = head;
+       head->next = first;
+
+       last->next = at;
+       at->prev = last;
+}
+
+/**
+ * list_splice - join two lists
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice(struct list_head *list, struct list_head *head)
+{
+       if (!list_empty(list))
+               __list_splice(list, head);
+}
+
+/**
+ * list_splice_init - join two lists and reinitialise the emptied list.
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * The list at @list is reinitialised
+ */
+static inline void list_splice_init(struct list_head *list,
+                                   struct list_head *head)
+{
+       if (!list_empty(list)) {
+               __list_splice(list, head);
+               INIT_LIST_HEAD(list);
+       }
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr:       the &struct list_head pointer.
+ * @type:      the type of the struct this is embedded in.
+ * @member:    the name of the list_struct within the struct.
+ */
+#define list_entry(ptr, type, member) \
+       ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+/**
+ * list_for_each       -       iterate over a list
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @head:      the head for your list.
+ */
+#define list_for_each(pos, head) \
+       for (pos = (head)->next, prefetch(pos->next); pos != (head); \
+               pos = pos->next, prefetch(pos->next))
+
+/**
+ * list_for_each_prev  -       iterate over a list in reverse order
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @head:      the head for your list.
+ */
+#define list_for_each_prev(pos, head) \
+       for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \
+               pos = pos->prev, prefetch(pos->prev))
+
+/**
+ * list_for_each_safe  -       iterate over a list safe against removal of list entry
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @n:         another &struct list_head to use as temporary storage
+ * @head:      the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+       for (pos = (head)->next, n = pos->next; pos != (head); \
+               pos = n, n = pos->next)
+
+#endif
+
+#ifndef list_for_each_entry
+/**
+ * list_for_each_entry  -       iterate over list of given type
+ * @pos:        the type * to use as a loop counter.
+ * @head:       the head for your list.
+ * @member:     the name of the list_struct within the struct.
+ */
+#define list_for_each_entry(pos, head, member)                         \
+        for (pos = list_entry((head)->next, typeof(*pos), member),     \
+                    prefetch(pos->member.next);                        \
+            &pos->member != (head);                                    \
+            pos = list_entry(pos->member.next, typeof(*pos), member),  \
+            prefetch(pos->member.next))
+#endif
+
+#ifndef list_for_each_entry_safe
+/**
+ * list_for_each_entry_safe  -       iterate over list of given type safe against removal of list entry
+ * @pos:        the type * to use as a loop counter.
+ * @n:          another type * to use as temporary storage
+ * @head:       the head for your list.
+ * @member:     the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member)                 \
+        for (pos = list_entry((head)->next, typeof(*pos), member),     \
+               n = list_entry(pos->member.next, typeof(*pos), member); \
+            &pos->member != (head);                                    \
+            pos = n, n = list_entry(n->member.next, typeof(*n), member))
+#endif
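A small usage sketch for the list primitives above; struct item and its field names are illustrative:

struct item {
        struct list_head i_list;        /* chain on a queue */
        int              i_value;
};

static LIST_HEAD(item_queue);

static void item_demo(struct item *a, struct item *b)
{
        struct item *pos, *tmp;

        list_add_tail(&a->i_list, &item_queue);
        list_add_tail(&b->i_list, &item_queue);

        /* the _safe variant allows list_del() of 'pos' while walking */
        list_for_each_entry_safe(pos, tmp, &item_queue, i_list) {
                if (pos->i_value < 0)
                        list_del(&pos->i_list);
        }
}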
diff --git a/lnet/include/lnet/lltrace.h b/lnet/include/lnet/lltrace.h
new file mode 100644 (file)
index 0000000..7d1b304
--- /dev/null
@@ -0,0 +1,175 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Compile with:
+ * cc -I../../portals/include -o fio fio.c -L../../portals/linux/utils -lptlctl 
+ */
+#ifndef __LTRACE_H_
+#define __LTRACE_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <portals/types.h>
+#include <portals/ptlctl.h>
+#include <linux/kp30.h>
+#include <linux/limits.h>
+#include <asm/page.h>
+#include <linux/version.h>
+
+static inline int ltrace_write_file(char* fname)
+{
+        char* argv[3];
+
+        argv[0] = "debug_kernel";
+        argv[1] = fname;
+        argv[2] = "1";
+        
+        fprintf(stderr, "[ptlctl] %s %s %s\n", argv[0], argv[1], argv[2]);
+        
+        return jt_dbg_debug_kernel(3, argv);
+}
+
+static inline int ltrace_clear()
+{
+        char* argv[1];
+        
+        argv[0] = "clear";
+        
+        fprintf(stderr, "[ptlctl] %s\n", argv[0]);
+        
+        return jt_dbg_clear_debug_buf(1, argv);
+}
+
+static inline int ltrace_mark(int indent_level, char* text)
+{
+        char* argv[2];
+        char mark_buf[PATH_MAX];
+        
+        snprintf(mark_buf, PATH_MAX, "====%d=%s", indent_level, text);
+        
+        argv[0] = "mark";
+        argv[1] = mark_buf;
+        return jt_dbg_mark_debug_buf(2, argv);
+}
+
+static inline int ltrace_applymasks()
+{
+        char* argv[2];
+        argv[0] = "list";
+        argv[1] = "applymasks";
+        
+        fprintf(stderr, "[ptlctl] %s %s\n", argv[0], argv[1]);
+        
+        return jt_dbg_list(2, argv);
+}
+
+
+static inline int ltrace_filter(char* subsys_or_mask)
+{
+        char* argv[2];
+        argv[0] = "filter";
+        argv[1] = subsys_or_mask;
+        return jt_dbg_filter(2, argv);
+}
+
+static inline int ltrace_show(char* subsys_or_mask)
+{
+        char* argv[2];
+        argv[0] = "show";
+        argv[1] = subsys_or_mask;
+        return jt_dbg_show(2, argv);
+}
+
+static inline int ltrace_start()
+{
+        int rc = 0;
+        dbg_initialize(0, NULL);
+#ifdef PORTALS_DEV_ID
+        rc = register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH);
+#endif
+        ltrace_filter("class"); 
+        ltrace_filter("socknal");
+        ltrace_filter("qswnal"); 
+        ltrace_filter("gmnal");  
+        ltrace_filter("portals");  
+        
+        ltrace_show("all_types");  
+        ltrace_filter("trace");  
+        ltrace_filter("malloc"); 
+        ltrace_filter("net"); 
+        ltrace_filter("page"); 
+        ltrace_filter("other"); 
+        ltrace_filter("info"); 
+        ltrace_applymasks();
+
+        return rc;
+}
+
+
+static inline void ltrace_stop()
+{
+#ifdef PORTALS_DEV_ID
+        unregister_ioc_dev(PORTALS_DEV_ID);
+#endif
+}
+
+static inline int not_uml()
+{
+        /* Return values:
+         *   0 when run under UML
+         *   1 when run on host (a failed lookup other than ENOENT is
+         *     reported and treated as "host")
+         */
+        struct stat buf;
+        int rc = stat("/dev/ubd", &buf);
+        rc = ((rc < 0) && (errno == ENOENT)) ? 1 : rc;
+        if (rc < 0) {
+                fprintf(stderr, "Cannot stat /dev/ubd: %s\n", strerror(errno));
+                rc = 1; /* Assume host */
+        }
+        return rc;
+}
+
+#define LTRACE_MAX_NOB   256
+static inline void ltrace_add_processnames(char* fname)
+{
+        char cmdbuf[LTRACE_MAX_NOB];
+        struct timeval tv;
+        struct timezone tz;
+        int nob;
+        int underuml = !not_uml();
+        
+        gettimeofday(&tv, &tz);
+
+        nob = snprintf(cmdbuf, LTRACE_MAX_NOB, "ps --no-headers -eo \"");
+
+        /* Careful - these format strings need to match the CDEBUG
+         * formats in portals/linux/debug.c EXACTLY
+         */
+        nob += snprintf(cmdbuf+nob, LTRACE_MAX_NOB, "%02x:%06x:%d:%lu.%06lu ",
+                        S_RPC >> 24, D_VFSTRACE, 0, tv.tv_sec, tv.tv_usec);
+
+        if (underuml && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))) {
+                nob += snprintf (cmdbuf+nob, LTRACE_MAX_NOB,
+                                 "(%s:%d:%s() %d | %d+%lu): ",
+                                 "lltrace.h", __LINE__, __FUNCTION__, 0, 0, 0L);
+        }
+        else {
+                nob += snprintf (cmdbuf+nob, LTRACE_MAX_NOB,
+                                 "(%s:%d:%s() %d+%lu): ",
+                                 "lltrace.h", __LINE__, __FUNCTION__, 0, 0L);
+        }
+         
+        nob += snprintf(cmdbuf+nob, LTRACE_MAX_NOB, " %%p %%c\" >> %s", fname);
+        system(cmdbuf);
+}
+
+#endif
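Typical use of these helpers from a test program might look like the following sketch; the output path is illustrative:

static void trace_example(void)
{
        ltrace_start();                          /* set filters, apply masks */
        ltrace_mark(0, "before-test");
        /* ... run the code under test ... */
        ltrace_mark(0, "after-test");
        ltrace_write_file("/tmp/ltrace.out");    /* dump the kernel debug buffer */
        ltrace_clear();
        ltrace_stop();
}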
diff --git a/lnet/include/lnet/lnet.h b/lnet/include/lnet/lnet.h
new file mode 100644 (file)
index 0000000..a4ea39b
--- /dev/null
@@ -0,0 +1,72 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef _P30_H_
+#define _P30_H_
+
+/*
+ * p30.h
+ *
+ * User application interface file
+ */
+
+#if defined (__KERNEL__)
+#include <linux/uio.h>
+#include <linux/types.h>
+#else
+#include <sys/types.h>
+#include <sys/uio.h>
+#endif
+
+#include <portals/types.h>
+#include <portals/nal.h>
+#include <portals/api.h>
+#include <portals/errno.h>
+#include <portals/nalids.h>
+
+extern int __p30_initialized;  /* for libraries & test codes  */
+extern int __p30_myr_initialized;      /*   that don't know if p30    */
+extern int __p30_ip_initialized;       /*   had been initialized yet  */
+extern ptl_handle_ni_t __myr_ni_handle, __ip_ni_handle;
+
+extern int __p30_myr_timeout;  /* in seconds, for PtlNIBarrier,     */
+extern int __p30_ip_timeout;   /* PtlReduce_all, & PtlBroadcast_all */
+
+/*
+ * Debugging flags reserved for the Portals reference library.
+ * These are not part of the API as described in the SAND report
+ * but are for the use of the maintainers of the reference implementation.
+ *
+ * It is not expected that the real implementations will export
+ * this functionality.
+ */
+#define PTL_DEBUG_NONE          0ul
+#define PTL_DEBUG_ALL           (0x0FFFul)     /* Only the Portals flags */
+
+#define __bit(x)                ((unsigned long) 1<<(x))
+#define PTL_DEBUG_PUT           __bit(0)
+#define PTL_DEBUG_GET           __bit(1)
+#define PTL_DEBUG_REPLY         __bit(2)
+#define PTL_DEBUG_ACK           __bit(3)
+#define PTL_DEBUG_DROP          __bit(4)
+#define PTL_DEBUG_REQUEST       __bit(5)
+#define PTL_DEBUG_DELIVERY      __bit(6)
+#define PTL_DEBUG_UNLINK        __bit(7)
+#define PTL_DEBUG_THRESHOLD     __bit(8)
+#define PTL_DEBUG_API           __bit(9)
+
+/*
+ * These eight are reserved for the NAL to define
+ * It should probably give them better names...
+ */
+#define PTL_DEBUG_NI_ALL        (0xF000ul)     /* Only the NAL flags */
+#define PTL_DEBUG_NI0           __bit(24)
+#define PTL_DEBUG_NI1           __bit(25)
+#define PTL_DEBUG_NI2           __bit(26)
+#define PTL_DEBUG_NI3           __bit(27)
+#define PTL_DEBUG_NI4           __bit(28)
+#define PTL_DEBUG_NI5           __bit(29)
+#define PTL_DEBUG_NI6           __bit(30)
+#define PTL_DEBUG_NI7           __bit(31)
+
+#endif
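For example, a caller interested only in data movement might combine the reference-library bits above into a single mask (illustrative only):

static const unsigned long example_debug_mask =
        PTL_DEBUG_PUT | PTL_DEBUG_GET | PTL_DEBUG_REPLY | PTL_DEBUG_ACK;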
diff --git a/lnet/include/lnet/lnetctl.h b/lnet/include/lnet/lnetctl.h
new file mode 100644 (file)
index 0000000..dc02780
--- /dev/null
@@ -0,0 +1,75 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * header for libptlctl.a
+ */
+#ifndef _PTLCTL_H_
+#define _PTLCTL_H_
+
+#define PORTALS_DEV_ID 0
+#define PORTALS_DEV_PATH "/dev/portals"
+#define OBD_DEV_ID 1
+#define OBD_DEV_PATH "/dev/obd"
+
+int ptl_name2nal(char *str);
+int ptl_parse_nid (ptl_nid_t *nidp, char *str);
+char * ptl_nid2str (char *buffer, ptl_nid_t nid);
+
+int ptl_initialize(int argc, char **argv);
+int jt_ptl_network(int argc, char **argv);
+int jt_ptl_connect(int argc, char **argv);
+int jt_ptl_disconnect(int argc, char **argv);
+int jt_ptl_push_connection(int argc, char **argv);
+int jt_ptl_ping(int argc, char **argv);
+int jt_ptl_shownid(int argc, char **argv);
+int jt_ptl_mynid(int argc, char **argv);
+int jt_ptl_add_uuid(int argc, char **argv);
+int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility  */
+int jt_ptl_close_uuid(int argc, char **argv);
+int jt_ptl_del_uuid(int argc, char **argv);
+int jt_ptl_rxmem (int argc, char **argv);
+int jt_ptl_txmem (int argc, char **argv);
+int jt_ptl_nagle (int argc, char **argv);
+int jt_ptl_add_route (int argc, char **argv);
+int jt_ptl_del_route (int argc, char **argv);
+int jt_ptl_print_routes (int argc, char **argv);
+int jt_ptl_fail_nid (int argc, char **argv);
+
+int dbg_initialize(int argc, char **argv);
+int jt_dbg_filter(int argc, char **argv);
+int jt_dbg_show(int argc, char **argv);
+int jt_dbg_list(int argc, char **argv);
+int jt_dbg_debug_kernel(int argc, char **argv);
+int jt_dbg_debug_daemon(int argc, char **argv);
+int jt_dbg_debug_file(int argc, char **argv);
+int jt_dbg_clear_debug_buf(int argc, char **argv);
+int jt_dbg_mark_debug_buf(int argc, char **argv);
+int jt_dbg_modules(int argc, char **argv);
+int jt_dbg_panic(int argc, char **argv);
+
+/* l_ioctl.c */
+int register_ioc_dev(int dev_id, const char * dev_name);
+void unregister_ioc_dev(int dev_id);
+int set_ioctl_dump(char * file);
+int l_ioctl(int dev_id, int opc, void *buf);
+int parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *));
+int jt_ioc_dump(int argc, char **argv);
+
+#endif
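A short sketch of the NID helpers declared above; the buffer size and the zero-on-success convention for ptl_parse_nid() are assumptions:

#include <stdio.h>

static int show_nid(char *str)
{
        ptl_nid_t nid;
        char      buf[64];               /* size is an assumption */

        if (ptl_parse_nid(&nid, str) != 0) {
                fprintf(stderr, "cannot parse NID '%s'\n", str);
                return -1;
        }
        printf("%s -> %s\n", str, ptl_nid2str(buf, nid));
        return 0;
}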
diff --git a/lnet/include/lnet/myrnal.h b/lnet/include/lnet/myrnal.h
new file mode 100644 (file)
index 0000000..12b1925
--- /dev/null
@@ -0,0 +1,26 @@
+/*
+*/
+
+#ifndef MYRNAL_H
+#define MYRNAL_H
+
+#define MAX_ARGS_LEN            (256)
+#define MAX_RET_LEN             (128)
+#define MYRNAL_MAX_ACL_SIZE     (64)
+#define MYRNAL_MAX_PTL_SIZE     (64)
+
+#define P3CMD                   (100)
+#define P3SYSCALL               (200)
+#define P3REGISTER              (300)
+
+enum { PTL_MLOCKALL };
+
+typedef struct {
+       void *args;
+       size_t args_len;
+       void *ret;
+       size_t ret_len;
+       int p3cmd;
+} myrnal_forward_t;
+
+#endif                         /* MYRNAL_H */
diff --git a/lnet/include/lnet/nal.h b/lnet/include/lnet/nal.h
new file mode 100644 (file)
index 0000000..88be63c
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+*/
+#ifndef _NAL_H_
+#define _NAL_H_
+
+/*
+ * p30/nal.h
+ *
+ * The API side NAL declarations
+ */
+
+#include <portals/types.h>
+
+#ifdef yield
+#undef yield
+#endif
+
+typedef struct nal_t nal_t;
+
+struct nal_t {
+       ptl_ni_t ni;
+       int refct;
+       void *nal_data;
+       int *timeout;           /* for libp30api users */
+       int (*forward) (nal_t * nal, int index, /* Function ID */
+                       void *args, size_t arg_len, void *ret, size_t ret_len);
+
+       int (*shutdown) (nal_t * nal, int interface);
+
+       int (*validate) (nal_t * nal, void *base, size_t extent);
+
+       void (*yield) (nal_t * nal);
+
+       void (*lock) (nal_t * nal, unsigned long *flags);
+
+       void (*unlock) (nal_t * nal, unsigned long *flags);
+};
+
+typedef nal_t *(ptl_interface_t) (int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
+extern nal_t *PTL_IFACE_IP(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
+extern nal_t *PTL_IFACE_MYR(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
+
+extern nal_t *ptl_hndl2nal(ptl_handle_any_t * any);
+
+#ifndef PTL_IFACE_DEFAULT
+#define PTL_IFACE_DEFAULT (PTL_IFACE_IP)
+#endif
+
+#endif
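Bringing up the API-side NAL through the constructor type above might look like this sketch; the interface number, table sizes and requested pid are illustrative values, not defaults from this header:

static nal_t *bringup_default_nal(void)
{
        /* interface 0, 64 portal-table slots, 64 ACL slots, requested pid 0 */
        return PTL_IFACE_DEFAULT(0, 64, 64, 0);
}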
diff --git a/lnet/include/lnet/nalids.h b/lnet/include/lnet/nalids.h
new file mode 100644 (file)
index 0000000..1b837b4
--- /dev/null
@@ -0,0 +1,4 @@
+#define PTL_IFACE_TCP 1
+#define PTL_IFACE_ER 2
+#define PTL_IFACE_SS 3
+#define PTL_IFACE_MAX 4
diff --git a/lnet/include/lnet/p30.h b/lnet/include/lnet/p30.h
new file mode 100644 (file)
index 0000000..a4ea39b
--- /dev/null
@@ -0,0 +1,72 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef _P30_H_
+#define _P30_H_
+
+/*
+ * p30.h
+ *
+ * User application interface file
+ */
+
+#if defined (__KERNEL__)
+#include <linux/uio.h>
+#include <linux/types.h>
+#else
+#include <sys/types.h>
+#include <sys/uio.h>
+#endif
+
+#include <portals/types.h>
+#include <portals/nal.h>
+#include <portals/api.h>
+#include <portals/errno.h>
+#include <portals/nalids.h>
+
+extern int __p30_initialized;  /* for libraries & test codes  */
+extern int __p30_myr_initialized;      /*   that don't know if p30    */
+extern int __p30_ip_initialized;       /*   had been initialized yet  */
+extern ptl_handle_ni_t __myr_ni_handle, __ip_ni_handle;
+
+extern int __p30_myr_timeout;  /* in seconds, for PtlNIBarrier,     */
+extern int __p30_ip_timeout;   /* PtlReduce_all, & PtlBroadcast_all */
+
+/*
+ * Debugging flags reserved for the Portals reference library.
+ * These are not part of the API as described in the SAND report
+ * but are for the use of the maintainers of the reference implementation.
+ *
+ * It is not expected that the real implementations will export
+ * this functionality.
+ */
+#define PTL_DEBUG_NONE          0ul
+#define PTL_DEBUG_ALL           (0x0FFFul)     /* Only the Portals flags */
+
+#define __bit(x)                ((unsigned long) 1<<(x))
+#define PTL_DEBUG_PUT           __bit(0)
+#define PTL_DEBUG_GET           __bit(1)
+#define PTL_DEBUG_REPLY         __bit(2)
+#define PTL_DEBUG_ACK           __bit(3)
+#define PTL_DEBUG_DROP          __bit(4)
+#define PTL_DEBUG_REQUEST       __bit(5)
+#define PTL_DEBUG_DELIVERY      __bit(6)
+#define PTL_DEBUG_UNLINK        __bit(7)
+#define PTL_DEBUG_THRESHOLD     __bit(8)
+#define PTL_DEBUG_API           __bit(9)
+
+/*
+ * These eight are reserved for the NAL to define
+ * It should probably give them better names...
+ */
+#define PTL_DEBUG_NI_ALL        (0xF000ul)     /* Only the NAL flags */
+#define PTL_DEBUG_NI0           __bit(24)
+#define PTL_DEBUG_NI1           __bit(25)
+#define PTL_DEBUG_NI2           __bit(26)
+#define PTL_DEBUG_NI3           __bit(27)
+#define PTL_DEBUG_NI4           __bit(28)
+#define PTL_DEBUG_NI5           __bit(29)
+#define PTL_DEBUG_NI6           __bit(30)
+#define PTL_DEBUG_NI7           __bit(31)
+
+#endif
diff --git a/lnet/include/lnet/ppid.h b/lnet/include/lnet/ppid.h
new file mode 100644 (file)
index 0000000..4727599
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ */
+
+#ifndef _INCppidh_
+#define _INCppidh_
+
+#include "defines.h"
+// #include "idtypes.h"
+
+
+#define MAX_PPID         1000    /* This needs to fit into 16 bits, so the
+                                    maximum value is 65535.  Having it "large"
+                                    can help with debugging process accounting,
+                                    but there are reasons for keeping it
+                                    somewhat smaller than the maximum --
+                                    e.g. arrays that index on the ppid need
+                                    storage proportional to it...  */
+
+#define MAX_GID          1000    /* this needs to fit into 16 bits... */
+
+#define MAX_FIXED_PPID   100
+#define MAX_FIXED_GID    100
+#define PPID_FLOATING    (MAX_FIXED_PPID + 1)  /* Floating area starts here */
+#define GID_FLOATING     (MAX_FIXED_GID + 1)   /* Floating area starts here */
+#define NUM_PTL_TASKS    (MAX_FIXED_PPID + 80) /* Maximum no. of portals tasks */
+
+#define PPID_AUTO        0
+
+/* Minimum PPID is 1 */
+#define PPID_BEBOPD      1            /* bebopd */
+#define  GID_BEBOPD      1            /* bebopd */
+
+#define PPID_PCT         2            /* pct */
+#define  GID_PCT         2            /* pct */
+
+#define PPID_FYOD        3            /* fyod */
+#define  GID_FYOD        3            /* fyod */
+
+#define PPID_GDBWRAP     11           /* portals proxy for gdb */
+#define  GID_GDBWRAP     11           /* portals proxy for gdb */
+
+#define PPID_TEST        15           /* for portals tests */
+#define  GID_TEST        15
+
+#define  GID_YOD         5            /* yod */
+#define  GID_PINGD       6            /* pingd */
+#define  GID_BT          7            /* bt */
+#define  GID_PTLTEST     8            /* ptltest */
+#define  GID_CGDB        9            /* cgdb */
+#define  GID_TVDSVR     10            /* start-tvdsvr */
+
+#endif /* _INCppidh_ */
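An illustrative helper against the layout above, assuming MAX_PPID is an exclusive upper bound:

static inline int ppid_is_floating(int ppid)
{
        /* fixed ppids occupy [1, MAX_FIXED_PPID]; floating ones start at
         * PPID_FLOATING and stay below MAX_PPID */
        return ppid >= PPID_FLOATING && ppid < MAX_PPID;
}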
diff --git a/lnet/include/lnet/ptlctl.h b/lnet/include/lnet/ptlctl.h
new file mode 100644 (file)
index 0000000..dc02780
--- /dev/null
@@ -0,0 +1,75 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * header for libptlctl.a
+ */
+#ifndef _PTLCTL_H_
+#define _PTLCTL_H_
+
+#define PORTALS_DEV_ID 0
+#define PORTALS_DEV_PATH "/dev/portals"
+#define OBD_DEV_ID 1
+#define OBD_DEV_PATH "/dev/obd"
+
+int ptl_name2nal(char *str);
+int ptl_parse_nid (ptl_nid_t *nidp, char *str);
+char * ptl_nid2str (char *buffer, ptl_nid_t nid);
+
+int ptl_initialize(int argc, char **argv);
+int jt_ptl_network(int argc, char **argv);
+int jt_ptl_connect(int argc, char **argv);
+int jt_ptl_disconnect(int argc, char **argv);
+int jt_ptl_push_connection(int argc, char **argv);
+int jt_ptl_ping(int argc, char **argv);
+int jt_ptl_shownid(int argc, char **argv);
+int jt_ptl_mynid(int argc, char **argv);
+int jt_ptl_add_uuid(int argc, char **argv);
+int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility  */
+int jt_ptl_close_uuid(int argc, char **argv);
+int jt_ptl_del_uuid(int argc, char **argv);
+int jt_ptl_rxmem (int argc, char **argv);
+int jt_ptl_txmem (int argc, char **argv);
+int jt_ptl_nagle (int argc, char **argv);
+int jt_ptl_add_route (int argc, char **argv);
+int jt_ptl_del_route (int argc, char **argv);
+int jt_ptl_print_routes (int argc, char **argv);
+int jt_ptl_fail_nid (int argc, char **argv);
+
+int dbg_initialize(int argc, char **argv);
+int jt_dbg_filter(int argc, char **argv);
+int jt_dbg_show(int argc, char **argv);
+int jt_dbg_list(int argc, char **argv);
+int jt_dbg_debug_kernel(int argc, char **argv);
+int jt_dbg_debug_daemon(int argc, char **argv);
+int jt_dbg_debug_file(int argc, char **argv);
+int jt_dbg_clear_debug_buf(int argc, char **argv);
+int jt_dbg_mark_debug_buf(int argc, char **argv);
+int jt_dbg_modules(int argc, char **argv);
+int jt_dbg_panic(int argc, char **argv);
+
+/* l_ioctl.c */
+int register_ioc_dev(int dev_id, const char * dev_name);
+void unregister_ioc_dev(int dev_id);
+int set_ioctl_dump(char * file);
+int l_ioctl(int dev_id, int opc, void *buf);
+int parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *));
+int jt_ioc_dump(int argc, char **argv);
+
+#endif
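
This header is the complete userspace interface to libptlctl.a. Below is a minimal standalone sketch of a caller; the header install paths, the 64-byte NID string buffer, and the 0-on-success / negative-on-error conventions are assumptions rather than guarantees made by the header:

    #include <stdio.h>
    #include <portals/p30.h>      /* assumed path; provides ptl_nid_t */
    #include <portals/ptlctl.h>   /* assumed install path of the header above */

    int main(int argc, char **argv)
    {
            char      buf[64];    /* assumed large enough for a NID string */
            ptl_nid_t nid;

            /* bind the control library to /dev/portals before issuing ioctls */
            if (register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH) < 0)
                    return 1;

            /* round-trip a NID string through the parse/print helpers */
            if (argc > 1 && ptl_parse_nid(&nid, argv[1]) == 0)
                    printf("nid: %s\n", ptl_nid2str(buf, nid));

            unregister_ioc_dev(PORTALS_DEV_ID);
            return 0;
    }
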
diff --git a/lnet/include/lnet/stringtab.h b/lnet/include/lnet/stringtab.h
new file mode 100644 (file)
index 0000000..c9683f7
--- /dev/null
@@ -0,0 +1,5 @@
+/*
+*/
+/*
+ * stringtab.h
+ */
diff --git a/lnet/include/lnet/types.h b/lnet/include/lnet/types.h
new file mode 100644 (file)
index 0000000..d4038b6
--- /dev/null
@@ -0,0 +1,157 @@
+#ifndef _P30_TYPES_H_
+#define _P30_TYPES_H_
+
+#ifdef __linux__
+#include <asm/types.h>
+#include <asm/timex.h>
+#else
+#include <sys/types.h>
+typedef u_int32_t __u32;
+typedef u_int64_t __u64;
+typedef unsigned long long cycles_t;
+static inline cycles_t get_cycles(void) { return 0; }
+#endif
+
+typedef __u64 ptl_nid_t;
+typedef __u32 ptl_pid_t;
+typedef __u32 ptl_pt_index_t;
+typedef __u32 ptl_ac_index_t;
+typedef __u64 ptl_match_bits_t;
+typedef __u64 ptl_hdr_data_t;
+typedef __u32 ptl_size_t;
+
+typedef struct {
+        unsigned long nal_idx;                 /* which network interface */
+        __u64         cookie;                  /* which thing on that interface */
+} ptl_handle_any_t;
+
+typedef ptl_handle_any_t ptl_handle_ni_t;
+typedef ptl_handle_any_t ptl_handle_eq_t;
+typedef ptl_handle_any_t ptl_handle_md_t;
+typedef ptl_handle_any_t ptl_handle_me_t;
+
+#define PTL_HANDLE_NONE \
+((const ptl_handle_any_t){.nal_idx = -1, .cookie = -1})
+#define PTL_EQ_NONE PTL_HANDLE_NONE
+
+static inline int PtlHandleEqual (ptl_handle_any_t h1, ptl_handle_any_t h2)
+{
+       return (h1.nal_idx == h2.nal_idx && h1.cookie == h2.cookie);
+}
+
+#define PTL_NID_ANY      ((ptl_nid_t) -1)
+#define PTL_PID_ANY      ((ptl_pid_t) -1)
+
+typedef struct {
+        ptl_nid_t nid;
+        ptl_pid_t pid;   /* node id / process id */
+} ptl_process_id_t;
+
+typedef enum {
+        PTL_RETAIN = 0,
+        PTL_UNLINK
+} ptl_unlink_t;
+
+typedef enum {
+        PTL_INS_BEFORE,
+        PTL_INS_AFTER
+} ptl_ins_pos_t;
+
+typedef struct {
+       struct page     *kiov_page;
+       unsigned int     kiov_len;
+       unsigned int     kiov_offset;
+} ptl_kiov_t;
+
+typedef struct {
+        void            *start;
+        ptl_size_t       length;
+        int              threshold;
+        int              max_size;
+        unsigned int     options;
+        void            *user_ptr;
+        ptl_handle_eq_t  eventq;
+       unsigned int     niov;
+} ptl_md_t;
+
+/* Options for the MD structure */
+#define PTL_MD_OP_PUT           (1 << 0)
+#define PTL_MD_OP_GET           (1 << 1)
+#define PTL_MD_MANAGE_REMOTE    (1 << 2)
+#define PTL_MD_AUTO_UNLINK      (1 << 3)
+#define PTL_MD_TRUNCATE         (1 << 4)
+#define PTL_MD_ACK_DISABLE      (1 << 5)
+#define PTL_MD_IOV              (1 << 6)
+#define PTL_MD_MAX_SIZE         (1 << 7)
+#define PTL_MD_KIOV             (1 << 8)
+
+#define PTL_MD_THRESH_INF       (-1)
+
+typedef enum {
+        PTL_EVENT_GET,
+        PTL_EVENT_PUT,
+        PTL_EVENT_REPLY,
+        PTL_EVENT_ACK,
+        PTL_EVENT_SENT
+} ptl_event_kind_t;
+
+#define PTL_SEQ_BASETYPE       long
+typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t;
+#define PTL_SEQ_GT(a,b)        (((signed PTL_SEQ_BASETYPE)((a) - (b))) > 0)
+
+typedef struct {
+        ptl_event_kind_t type;
+        ptl_process_id_t initiator;
+        ptl_pt_index_t portal;
+        ptl_match_bits_t match_bits;
+        ptl_size_t rlength, mlength, offset;
+        ptl_handle_me_t unlinked_me;
+        ptl_md_t mem_desc;
+        ptl_hdr_data_t hdr_data;
+        cycles_t  arrival_time;
+        volatile ptl_seq_t sequence;
+} ptl_event_t;
+
+
+typedef enum {
+        PTL_ACK_REQ,
+        PTL_NOACK_REQ
+} ptl_ack_req_t;
+
+
+typedef struct {
+        volatile ptl_seq_t sequence;
+        ptl_size_t size;
+        ptl_event_t *base;
+        ptl_handle_any_t cb_eq_handle;
+} ptl_eq_t;
+
+typedef struct {
+        ptl_eq_t *eq;
+} ptl_ni_t;
+
+
+typedef struct {
+        int max_match_entries;    /* max number of match entries */
+        int max_mem_descriptors;  /* max number of memory descriptors */
+        int max_event_queues;     /* max number of event queues */
+        int max_atable_index;     /* maximum access control list table index */
+        int max_ptable_index;     /* maximum portals table index */
+} ptl_ni_limits_t;
+
+/*
+ * Status registers
+ */
+typedef enum {
+        PTL_SR_DROP_COUNT,
+        PTL_SR_DROP_LENGTH,
+        PTL_SR_RECV_COUNT,
+        PTL_SR_RECV_LENGTH,
+        PTL_SR_SEND_COUNT,
+        PTL_SR_SEND_LENGTH,
+        PTL_SR_MSGS_MAX,
+} ptl_sr_index_t;
+
+typedef int ptl_sr_value_t;
+
+#endif
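
The handle, memory-descriptor and event types above are plain value types consumed by the Portals API calls declared elsewhere. A short sketch of how they compose when filling in a memory descriptor (the helper is illustrative only and not part of the patch):

    /* Describe a contiguous buffer that accepts PUTs and suppresses ACKs. */
    static void example_fill_md(void *buffer, ptl_size_t len,
                                ptl_handle_eq_t eq, ptl_md_t *md)
    {
            md->start     = buffer;
            md->length    = len;
            md->threshold = PTL_MD_THRESH_INF;  /* never exhausts */
            md->max_size  = 0;
            md->options   = PTL_MD_OP_PUT | PTL_MD_ACK_DISABLE;
            md->user_ptr  = NULL;
            md->eventq    = eq;
            md->niov      = 0;                  /* contiguous, not an iovec */

            if (PtlHandleEqual(eq, PTL_EQ_NONE)) {
                    /* caller did not attach an event queue to this MD */
            }
    }
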
diff --git a/lnet/klnds/.cvsignore b/lnet/klnds/.cvsignore
new file mode 100644 (file)
index 0000000..282522d
--- /dev/null
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
diff --git a/lnet/klnds/Makefile.am b/lnet/klnds/Makefile.am
new file mode 100644 (file)
index 0000000..fed2785
--- /dev/null
@@ -0,0 +1,7 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+DIST_SUBDIRS= socknal toenal qswnal gmnal scimacnal 
+SUBDIRS= socknal toenal        @QSWNAL@ @GMNAL@ @SCIMACNAL@
diff --git a/lnet/klnds/Makefile.mk b/lnet/klnds/Makefile.mk
new file mode 100644 (file)
index 0000000..ce40a60
--- /dev/null
@@ -0,0 +1,4 @@
+include ../Kernelenv
+
+obj-y = socknal/
+# more coming...
\ No newline at end of file
diff --git a/lnet/klnds/gmlnd/.cvsignore b/lnet/klnds/gmlnd/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lnet/klnds/gmlnd/Makefile.am b/lnet/klnds/gmlnd/Makefile.am
new file mode 100644 (file)
index 0000000..1dc6f4e
--- /dev/null
@@ -0,0 +1,13 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = kgmnal
+modulenet_DATA = kgmnal.o
+EXTRA_PROGRAMS = kgmnal
+
+DEFS =
+kgmnal_SOURCES = gmnal.c gmnal_cb.c gmnal.h
diff --git a/lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch b/lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch
new file mode 100644 (file)
index 0000000..23c80d9
--- /dev/null
@@ -0,0 +1,43 @@
+diff -ru gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c
+--- gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c        Mon Jul  1 10:35:09 2002
++++ gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c    Thu Sep 19 14:19:38 2002
+@@ -30,6 +30,8 @@
+  *
+  ************************************************************************/
++#define EXPORT_SYMTAB
++
+ #include <linux/config.h>
+ #include <linux/module.h>
+@@ -4075,6 +4077,28 @@
+   return 0;
+ }
++EXPORT_SYMBOL(gm_blocking_receive_no_spin);
++EXPORT_SYMBOL(gm_close);
++EXPORT_SYMBOL(gm_dma_free);
++EXPORT_SYMBOL(gm_dma_malloc);
++EXPORT_SYMBOL(gm_drop_sends);
++EXPORT_SYMBOL(gm_finalize);
++EXPORT_SYMBOL(gm_get_node_id);
++EXPORT_SYMBOL(gm_init);
++EXPORT_SYMBOL(gm_initialize_alarm);
++EXPORT_SYMBOL(gm_max_node_id_in_use);
++EXPORT_SYMBOL(gm_min_size_for_length);
++EXPORT_SYMBOL(gm_num_receive_tokens);
++EXPORT_SYMBOL(gm_num_send_tokens);
++EXPORT_SYMBOL(gm_open);
++EXPORT_SYMBOL(gm_provide_receive_buffer);
++EXPORT_SYMBOL(gm_resume_sending);
++EXPORT_SYMBOL(gm_send_with_callback);
++EXPORT_SYMBOL(gm_set_acceptable_sizes);
++EXPORT_SYMBOL(gm_set_alarm);
++EXPORT_SYMBOL(gm_unknown);
++
++
+ /*
+   This file uses GM standard indentation.
+Only in gm-1.5.2.1_Linux-cfs/drivers/linux/gm: gm_arch.c~
+Only in gm-1.5.2.1_Linux-cfs/: trace
diff --git a/lnet/klnds/gmlnd/gmlnd.h b/lnet/klnds/gmlnd/gmlnd.h
new file mode 100644 (file)
index 0000000..47e8c3c
--- /dev/null
@@ -0,0 +1,101 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef _GMNAL_H
+#define _GMNAL_H
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/locks.h>
+#include <linux/unistd.h>
+#include <linux/init.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_GMNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#include <gm.h>
+
+
+/*
+ *  Myrinet GM NAL
+ */
+#define NPAGES_LARGE            16
+#define NPAGES_SMALL            1
+#define MSG_LEN_LARGE            (NPAGES_LARGE*PAGE_SIZE)
+#define MSG_LEN_SMALL            (NPAGES_SMALL*PAGE_SIZE)
+#define MSG_SIZE_LARGE           (gm_min_size_for_length(MSG_LEN_LARGE))
+#define MSG_SIZE_SMALL           (gm_min_size_for_length(MSG_LEN_SMALL))
+
+#define TXMSGS                  64 /* Number of Transmit Messages */
+#define ENVELOPES               8  /* Number of outstanding receive msgs */
+
+#define KGM_PORT_NUM 3
+#define KGM_HOSTNAME "kgmnal"
+
+
+typedef struct {
+        char *krx_buffer;
+        unsigned long   krx_len;
+        unsigned int   krx_size;
+        unsigned int   krx_priority;
+        struct list_head krx_item;
+}  kgmnal_rx_t;
+
+
+typedef struct {
+        nal_cb_t  *ktx_nal;
+        void      *ktx_private;
+        lib_msg_t *ktx_cookie;
+        char      *ktx_buffer;
+        size_t     ktx_len;
+        unsigned long ktx_size;
+        int        ktx_ndx;
+        unsigned int ktx_priority;
+        unsigned int ktx_tgt_node;
+        unsigned int ktx_tgt_port_id;
+}  kgmnal_tx_t;
+
+
+typedef struct {
+        char              kgm_init;
+        char              kgm_shuttingdown;
+        struct gm_port   *kgm_port;
+        struct list_head  kgm_list;
+        ptl_nid_t         kgm_nid;
+        nal_cb_t         *kgm_cb;
+        struct kgm_trans *kgm_trans;
+        struct tq_struct  kgm_ready_tq;
+        spinlock_t        kgm_dispatch_lock;
+        spinlock_t        kgm_update_lock;
+        spinlock_t        kgm_send_lock;
+}  kgmnal_data_t;
+
+int kgm_init(kgmnal_data_t *kgm_data);
+int kgmnal_recv_thread(void *);
+int gm_return_mynid(void);
+void kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+
+extern kgmnal_data_t      kgmnal_data;
+extern nal_t              kgmnal_api;
+extern nal_cb_t           kgmnal_lib;
+
+#endif  /* _GMNAL_H */
+
diff --git a/lnet/klnds/gmlnd/gmlnd_cb.c b/lnet/klnds/gmlnd/gmlnd_cb.c
new file mode 100644 (file)
index 0000000..3d4c86d
--- /dev/null
@@ -0,0 +1,517 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Based on ksocknal and qswnal
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *  Author: Robert Read  <rread@datarithm.net>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* TODO
+ * preallocate send buffers, store on list
+ * put receive buffers on queue, handle with receive threads
+ * use routing
+ */
+
+#include "gmnal.h"
+
+extern kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *,int);
+
+static kgmnal_tx_t *
+get_trans(void)
+{
+        kgmnal_tx_t *t;
+        PORTAL_ALLOC(t, (sizeof(kgmnal_tx_t)));
+        return t;
+}
+
+static void
+put_trans(kgmnal_tx_t *t)
+{
+        PORTAL_FREE(t, sizeof(kgmnal_tx_t));
+}
+
+int
+kgmnal_ispeer (ptl_nid_t nid)
+{
+   unsigned int gmnid = (unsigned int)nid;
+   unsigned int nnids;
+
+   gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids);
+
+   return ((ptl_nid_t)gmnid == nid && /* didn't lose high bits on conversion? */
+           gmnid < nnids); /* it's in this machine */
+}
+
+/*
+ *  LIB functions follow
+ *
+ */
+static int
+kgmnal_read (nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
+             size_t len)
+{
+        CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+static int
+kgmnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
+             size_t len)
+{
+        CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+static void *
+kgmnal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+        return buf;
+}
+
+static void
+kgmnal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+static void
+kgmnal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+        va_list                ap;
+        char msg[256];
+
+        if (portal_debug & D_NET) {
+                va_start( ap, fmt );
+                vsnprintf( msg, sizeof(msg), fmt, ap );
+                va_end( ap );
+
+                printk("CPUId: %d %s",smp_processor_id(), msg);
+        }
+}
+
+
+static void
+kgmnal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *data= nal->nal_data;
+
+        spin_lock_irqsave(&data->kgm_dispatch_lock,*flags);
+}
+
+
+static void
+kgmnal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *data= nal->nal_data;
+
+        spin_unlock_irqrestore(&data->kgm_dispatch_lock,*flags);
+}
+
+
+static int
+kgmnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* network distance doesn't mean much for this nal */
+        if ( nal->ni.nid == nid ) {
+                *dist = 0;
+        } else {
+                *dist = 1;
+        }
+
+        return 0;
+}
+
+/* FIXME rmr: add routing code here */
+static void
+kgmnal_tx_done(kgmnal_tx_t  *trans, int error)
+{
+        lib_finalize(trans->ktx_nal, trans->ktx_private, trans->ktx_cookie);
+
+        gm_dma_free(kgmnal_data.kgm_port, trans->ktx_buffer);
+
+        trans->ktx_buffer = NULL;
+        trans->ktx_len = 0;
+
+        put_trans(trans);
+}
+static char * gm_error_strings[GM_NUM_STATUS_CODES] = {
+        [GM_SUCCESS] = "GM_SUCCESS",
+        [GM_SEND_TIMED_OUT] = "GM_SEND_TIMED_OUT",
+        [GM_SEND_REJECTED] = "GM_SEND_REJECTED",
+        [GM_SEND_TARGET_PORT_CLOSED] = "GM_SEND_TARGET_PORT_CLOSED",
+        [GM_SEND_TARGET_NODE_UNREACHABLE] = "GM_SEND_TARGET_NODE_UNREACHABLE",
+        [GM_SEND_DROPPED] = "GM_SEND_DROPPED",
+        [GM_SEND_PORT_CLOSED] = "GM_SEND_PORT_CLOSED",
+};
+
+inline char * get_error(int status)
+{
+        if (gm_error_strings[status] != NULL)
+                return gm_error_strings[status];
+        else
+                return "Unknown error";
+}
+
+static void
+kgmnal_errhandler(struct gm_port *p, void *context, gm_status_t status)
+{
+        CDEBUG(D_NET,"error callback: ktx %p status %d\n", context, status);
+}
+
+static void
+kgmnal_txhandler(struct gm_port *p, void *context, gm_status_t status)
+{
+        kgmnal_tx_t *ktx = (kgmnal_tx_t *)context;
+        int err = 0;
+
+        LASSERT (p != NULL);
+        LASSERT (ktx != NULL);
+
+        CDEBUG(D_NET,"ktx %p status %d nid 0x%x pid %d\n", ktx, status,
+                ktx->ktx_tgt_node, ktx->ktx_tgt_port_id);
+
+        switch((int)status) {
+        case GM_SUCCESS:        /* normal */
+                break;
+        case GM_SEND_TIMED_OUT: /* application error */
+        case GM_SEND_REJECTED:  /* size of msg unacceptable */
+        case GM_SEND_TARGET_PORT_CLOSED:
+                CERROR("%s (%d):\n", get_error(status), status);
+                gm_resume_sending(kgmnal_data.kgm_port, ktx->ktx_priority,
+                                  ktx->ktx_tgt_node, ktx->ktx_tgt_port_id,
+                                  kgmnal_errhandler, NULL);
+                err = -EIO;
+                break;
+        case GM_SEND_TARGET_NODE_UNREACHABLE:
+        case GM_SEND_PORT_CLOSED:
+                CERROR("%s (%d):\n", get_error(status), status);
+                gm_drop_sends(kgmnal_data.kgm_port, ktx->ktx_priority,
+                              ktx->ktx_tgt_node, ktx->ktx_tgt_port_id,
+                              kgmnal_errhandler, NULL);
+                err = -EIO;
+                break;
+        case GM_SEND_DROPPED:
+                CERROR("%s (%d):\n", get_error(status), status);
+                err = -EIO;
+                break;
+        default:
+                CERROR("Unknown status: %d\n", status);
+                err = -EIO;
+                break;
+        }
+
+        kgmnal_tx_done(ktx, err);
+}
+
+/*
+ */
+
+static int
+kgmnal_send(nal_cb_t        *nal,
+           void            *private,
+           lib_msg_t       *cookie,
+           ptl_hdr_t       *hdr,
+           int              type,
+           ptl_nid_t        nid,
+           ptl_pid_t        pid,
+           int              options,
+           unsigned int     niov,
+           lib_md_iov_t    *iov,
+           size_t           len)
+{
+        /*
+         * ipnal assumes that this is the private as passed to lib_dispatch..
+         * so do we :/
+         */
+        kgmnal_tx_t *ktx=NULL;
+        int rc=0;
+        void * buf;
+        int buf_len = sizeof(ptl_hdr_t) + len;
+        int buf_size = 0;
+
+        LASSERT ((options & PTL_MD_KIOV) == 0);
+        
+        PROF_START(gmnal_send);
+
+
+        CDEBUG(D_NET, "sending %d bytes from %p to nid: 0x%Lx pid %d\n",
+               len, iov, nid, KGM_PORT_NUM);
+
+        /* ensure there is an available tx handle */
+
+        /* save transaction info to trans for later finalize and cleanup */
+        ktx = get_trans();
+        if (ktx == NULL) {
+                rc = -ENOMEM;
+                goto send_exit;
+        }
+
+        /* GM doesn't support vectored writes, so we need to allocate a buffer
+           to coalesce the header and data.
+           Also, the memory must be DMA-able or registered with GM. */
+
+        if (buf_len <= MSG_LEN_SMALL) {
+                buf_size = MSG_SIZE_SMALL;
+        } else if (buf_len <= MSG_LEN_LARGE) {
+                buf_size = MSG_SIZE_LARGE;
+        } else {
+                printk("kgmnal: request exceeds TX MTU size (%d).\n",
+                       MSG_SIZE_LARGE);
+                rc = -1;
+                goto send_exit;
+        }
+
+        buf = gm_dma_malloc(kgmnal_data.kgm_port, buf_len);
+        if (buf == NULL) {
+                rc = -ENOMEM;
+                goto send_exit;
+        }
+        memcpy(buf, hdr, sizeof(ptl_hdr_t));
+
+        if (len != 0)
+                lib_copy_iov2buf(((char *)buf) + sizeof (ptl_hdr_t), 
+                                 options, niov, iov, len);
+
+        ktx->ktx_nal = nal;
+        ktx->ktx_private = private;
+        ktx->ktx_cookie = cookie;
+        ktx->ktx_len = buf_len;
+        ktx->ktx_size = buf_size;
+        ktx->ktx_buffer = buf;
+        ktx->ktx_priority = GM_LOW_PRIORITY;
+        ktx->ktx_tgt_node = nid;
+        ktx->ktx_tgt_port_id = KGM_PORT_NUM;
+
+        CDEBUG(D_NET, "gm_send %d bytes (size %d) from %p to nid: 0x%Lx "
+               "pid %d pri %d\n", buf_len, buf_size, iov, nid, KGM_PORT_NUM,
+               GM_LOW_PRIORITY);
+
+        gm_send_with_callback(kgmnal_data.kgm_port, buf, buf_size,
+                              buf_len, GM_LOW_PRIORITY,
+                              nid, KGM_PORT_NUM,
+                              kgmnal_txhandler, ktx);
+
+        PROF_FINISH(gmnal_send);
+ send_exit:
+        return rc;
+}
+void
+kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        CERROR ("forwarding not implemented\n");
+}
+
+void
+kgmnal_fwd_callback (void *arg, int error)
+{
+        CERROR ("forwarding not implemented\n");
+}
+
+
+static inline void
+kgmnal_requeue_rx(kgmnal_rx_t *krx)
+{
+        gm_provide_receive_buffer(kgmnal_data.kgm_port, krx->krx_buffer,
+                                  krx->krx_size, krx->krx_priority);
+}
+
+/* Process a received portals packet */
+
+/* Receive Interrupt Handler */
+static void kgmnal_rx(kgmnal_data_t *kgm, unsigned long len, unsigned int size,
+                      void * buf, unsigned int pri)
+{
+        ptl_hdr_t  *hdr = buf;
+        kgmnal_rx_t krx;
+
+        CDEBUG(D_NET,"buf %p, len %ld\n", buf, len);
+
+        if ( len < sizeof( ptl_hdr_t ) ) {
+                /* XXX what's this for? */
+                if (kgm->kgm_shuttingdown)
+                        return;
+                CERROR("kgmnal: did not receive complete portal header, "
+                       "len= %ld", len);
+                gm_provide_receive_buffer(kgm->kgm_port, buf, size, pri);
+                return;
+        }
+
+        /* might want to use separate threads to handle receive */
+        krx.krx_buffer = buf;
+        krx.krx_len = len;
+        krx.krx_size = size;
+        krx.krx_priority = pri;
+
+        if ( hdr->dest_nid == kgmnal_lib.ni.nid ) {
+                PROF_START(lib_parse);
+                lib_parse(&kgmnal_lib, (ptl_hdr_t *)krx.krx_buffer, &krx);
+                PROF_FINISH(lib_parse);
+        } else if (kgmnal_ispeer(hdr->dest_nid)) {
+                /* should have gone direct to peer */
+                CERROR("dropping packet from 0x%llx to 0x%llx: target is "
+                       "a peer", hdr->src_nid, hdr->dest_nid);
+                kgmnal_requeue_rx(&krx);
+        } else {
+                /* forward to gateway */
+                CERROR("forwarding not implemented yet");
+                kgmnal_requeue_rx(&krx);
+        }
+
+        return;
+}
+
+
+static int kgmnal_recv(nal_cb_t     *nal,
+                      void         *private,
+                      lib_msg_t    *cookie,
+                      int           options,
+                      unsigned int  niov,
+                      lib_md_iov_t *iov,
+                      size_t        mlen,
+                      size_t        rlen)
+{
+        kgmnal_rx_t *krx = private;
+
+        LASSERT ((options & PTL_MD_KIOV) == 0);
+
+        CDEBUG(D_NET,"mlen=%d, rlen=%d\n", mlen, rlen);
+
+        /* What was actually received must be >= what sender claims to
+         * have sent.  This is an LASSERT, since lib-move doesn't
+         * check cb return code yet. */
+        LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen);
+        LASSERT (mlen <= rlen);
+
+        PROF_START(gmnal_recv);
+
+        if(mlen != 0) {
+                PROF_START(memcpy);
+                lib_copy_buf2iov (options, niov, iov, 
+                                  krx->krx_buffer + sizeof (ptl_hdr_t), mlen);
+                PROF_FINISH(memcpy);
+        }
+
+        PROF_START(lib_finalize);
+        lib_finalize(nal, private, cookie);
+        PROF_FINISH(lib_finalize);
+
+        kgmnal_requeue_rx(krx);
+
+        PROF_FINISH(gmnal_recv);
+
+        return rlen;
+}
+
+
+static void kgmnal_shutdown(void * none)
+{
+        CERROR("called\n");
+        return;
+}
+
+/*
+ * Set terminate and use alarm to wake up the recv thread.
+ */
+static void  recv_shutdown(kgmnal_data_t *kgm)
+{
+        gm_alarm_t alarm;
+
+        kgm->kgm_shuttingdown = 1;
+        gm_initialize_alarm(&alarm);
+        gm_set_alarm(kgm->kgm_port, &alarm, 1, kgmnal_shutdown, NULL);
+}
+
+int kgmnal_end(kgmnal_data_t *kgm)
+{
+
+        /* wait for sends to finish ? */
+        /* remove receive buffers */
+        /* shutdown receive thread */
+
+        recv_shutdown(kgm);
+
+        return 0;
+}
+
+/* Used only for the spinner */
+int kgmnal_recv_thread(void *arg)
+{
+        kgmnal_data_t *kgm = arg;
+
+        LASSERT(kgm != NULL);
+
+        kportal_daemonize("kgmnal_rx");
+        
+        while(1) {
+                gm_recv_event_t *e;
+                int priority = GM_LOW_PRIORITY;
+                if (kgm->kgm_shuttingdown)
+                        break;
+
+                e = gm_blocking_receive_no_spin(kgm->kgm_port);
+                if (e == NULL) {
+                        CERROR("gm_blocking_receive returned NULL\n");
+                        break;
+                }
+
+                switch(gm_ntohc(e->recv.type)) {
+                case GM_HIGH_RECV_EVENT:
+                        priority = GM_HIGH_PRIORITY;
+                        /* fall through */
+                case GM_RECV_EVENT:
+                        kgmnal_rx(kgm, gm_ntohl(e->recv.length),
+                                  gm_ntohc(e->recv.size),
+                                  gm_ntohp(e->recv.buffer), priority);
+                        break;
+                case GM_ALARM_EVENT:
+                        CERROR("received alarm");
+                        gm_unknown(kgm->kgm_port, e);
+                        break;
+                case GM_BAD_SEND_DETECTED_EVENT: /* ?? */
+                        CERROR("received bad send!\n");
+                        break;
+                default:
+                        gm_unknown(kgm->kgm_port, e);
+                }
+        }
+
+        CERROR("shutting down.\n");
+        return 0;
+}
+
+nal_cb_t kgmnal_lib = {
+        nal_data: &kgmnal_data,                /* NAL private data */
+        cb_send: kgmnal_send,
+        cb_recv: kgmnal_recv,
+        cb_read: kgmnal_read,
+        cb_write: kgmnal_write,
+        cb_malloc: kgmnal_malloc,
+        cb_free: kgmnal_free,
+        cb_printf: kgmnal_printf,
+        cb_cli: kgmnal_cli,
+        cb_sti: kgmnal_sti,
+        cb_dist: kgmnal_dist
+};
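
kgmnal_send() above coalesces the Portals header and payload into one DMA-able buffer and picks one of the two pre-registered GM buffer buckets for it. The same bucket choice, pulled out as a helper for clarity (a sketch only; kgmnal_choose_size is a hypothetical name, not part of the patch):

    /* Choose the GM size bucket for a message of buf_len bytes (header
     * included).  Returns 0, leaving *buf_size untouched, when the message
     * exceeds the large MTU -- mirroring the inline check in kgmnal_send(). */
    static int kgmnal_choose_size(int buf_len, int *buf_size)
    {
            if (buf_len <= MSG_LEN_SMALL)
                    *buf_size = MSG_SIZE_SMALL;
            else if (buf_len <= MSG_LEN_LARGE)
                    *buf_size = MSG_SIZE_LARGE;
            else
                    return 0;           /* too big for either bucket */
            return 1;
    }
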
diff --git a/lnet/klnds/gmlnd/gmnal.c b/lnet/klnds/gmlnd/gmnal.c
new file mode 100644 (file)
index 0000000..ceeea2a
--- /dev/null
@@ -0,0 +1,284 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Based on ksocknal and qswnal
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Robert Read  <rread@datarithm.net>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "gmnal.h"
+
+ptl_handle_ni_t kgmnal_ni;
+nal_t  kgmnal_api;
+
+kgmnal_data_t kgmnal_data;
+int gmnal_debug = 0;
+
+kpr_nal_interface_t kgmnal_router_interface = {
+        kprni_nalid:      GMNAL,
+        kprni_arg:        NULL,
+        kprni_fwd:        kgmnal_fwd_packet,
+};
+
+static int kgmnal_forward(nal_t   *nal,
+                          int     id,
+                          void    *args,  size_t args_len,
+                          void    *ret,   size_t ret_len)
+{
+        kgmnal_data_t *k = nal->nal_data;
+        nal_cb_t      *nal_cb = k->kgm_cb;
+
+        LASSERT (nal == &kgmnal_api);
+        LASSERT (k == &kgmnal_data);
+        LASSERT (nal_cb == &kgmnal_lib);
+
+        lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */
+        return PTL_OK;
+}
+
+static void kgmnal_lock(nal_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *k = nal->nal_data;
+        nal_cb_t      *nal_cb = k->kgm_cb;
+
+
+        LASSERT (nal == &kgmnal_api);
+        LASSERT (k == &kgmnal_data);
+        LASSERT (nal_cb == &kgmnal_lib);
+
+        nal_cb->cb_cli(nal_cb,flags);
+}
+
+static void kgmnal_unlock(nal_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *k = nal->nal_data;
+        nal_cb_t      *nal_cb = k->kgm_cb;
+
+
+        LASSERT (nal == &kgmnal_api);
+        LASSERT (k == &kgmnal_data);
+        LASSERT (nal_cb == &kgmnal_lib);
+
+        nal_cb->cb_sti(nal_cb,flags);
+}
+
+static int kgmnal_shutdown(nal_t *nal, int ni)
+{
+        LASSERT (nal == &kgmnal_api);
+        return 0;
+}
+
+static void kgmnal_yield( nal_t *nal )
+{
+        LASSERT (nal == &kgmnal_api);
+
+        if (current->need_resched)
+                schedule();
+        return;
+}
+
+kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *data,int ndx)
+{
+        kgmnal_rx_t *conn;
+
+        PORTAL_ALLOC(conn, sizeof(kgmnal_rx_t));
+        /* Check for out of mem here */
+        if (conn==NULL) {
+                printk("kgm_add_recv: memory alloc failed\n");
+                return NULL;
+        }
+
+        list_add(&conn->krx_item,(struct list_head *)&data->kgm_list);
+        //        conn->ndx=ndx;
+        //        conn->len=conn->ptlhdr_copied=0;
+        //        conn->loopback=0;
+        return conn;
+}
+
+static nal_t *kgmnal_init(int interface, ptl_pt_index_t ptl_size,
+                          ptl_ac_index_t  ac_size, ptl_pid_t requested_pid)
+{
+        unsigned int nnids;
+
+        gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids);
+
+        CDEBUG(D_NET, "calling lib_init with nid 0x%Lx of %d\n",
+               kgmnal_data.kgm_nid, nnids);
+        lib_init(&kgmnal_lib, kgmnal_data.kgm_nid, 0, nnids,ptl_size, ac_size);
+        return &kgmnal_api;
+}
+
+static void __exit
+kgmnal_finalize(void)
+{
+        struct list_head *tmp;
+
+        PORTAL_SYMBOL_UNREGISTER (kgmnal_ni);
+        PtlNIFini(kgmnal_ni);
+        lib_fini(&kgmnal_api);
+
+        if (kgmnal_data.kgm_port) {
+                gm_close(kgmnal_data.kgm_port);
+        }
+
+        /* FIXME: free dma buffers */
+        /* FIXME: kill receiver thread */
+
+        PORTAL_FREE (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS);
+
+        list_for_each(tmp, &kgmnal_data.kgm_list) {
+                kgmnal_rx_t *conn;
+                conn = list_entry(tmp, kgmnal_rx_t, krx_item);
+                CDEBUG(D_IOCTL, "freeing conn %p\n",conn);
+                tmp = tmp->next;
+                list_del(&conn->krx_item);
+                PORTAL_FREE(conn, sizeof(*conn));
+        }
+
+        CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory));
+
+        return;
+}
+
+static int __init
+kgmnal_initialize(void)
+{
+        int rc;
+        int ntok;
+        unsigned long sizemask;
+        unsigned int nid;
+
+        CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory));
+
+        kgmnal_api.forward = kgmnal_forward;
+        kgmnal_api.shutdown = kgmnal_shutdown;
+        kgmnal_api.yield = kgmnal_yield;
+        kgmnal_api.validate = NULL;         /* our api validate is a NOOP */
+        kgmnal_api.lock= kgmnal_lock;
+        kgmnal_api.unlock= kgmnal_unlock;
+        kgmnal_api.nal_data = &kgmnal_data;
+
+        kgmnal_lib.nal_data = &kgmnal_data;
+
+        memset(&kgmnal_data, 0, sizeof(kgmnal_data));
+
+        INIT_LIST_HEAD(&kgmnal_data.kgm_list);
+        kgmnal_data.kgm_cb = &kgmnal_lib;
+
+        /* Allocate transmit descriptors */
+        PORTAL_ALLOC (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS);
+        if (kgmnal_data.kgm_trans==NULL) {
+                printk("kgmnal: init: failed to allocate transmit "
+                       "descriptors\n");
+                return -1;
+        }
+        memset(kgmnal_data.kgm_trans,-1,sizeof(kgmnal_tx_t)*(TXMSGS));
+
+        spin_lock_init(&kgmnal_data.kgm_dispatch_lock);
+        spin_lock_init(&kgmnal_data.kgm_update_lock);
+        spin_lock_init(&kgmnal_data.kgm_send_lock);
+
+        /* Do the receiver and xmtr allocation */
+
+        rc = gm_init();
+        if (rc != GM_SUCCESS) {
+                CERROR("gm_init failed: %d\n", rc);
+                return -1;
+        }
+
+        rc = gm_open(&kgmnal_data.kgm_port, 0 , KGM_PORT_NUM, KGM_HOSTNAME,
+                     GM_API_VERSION_1_1);
+        if (rc != GM_SUCCESS) {
+                gm_finalize();
+                kgmnal_data.kgm_port = NULL;
+                CERROR("gm_open failed: %d\n", rc);
+                return -1;
+        }
+        gm_get_node_id(kgmnal_data.kgm_port, &nid);
+        kgmnal_data.kgm_nid = nid;
+        /* Allocate 2 different sizes of buffers. For now, use half
+           the tokens for each. */
+        ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2;
+        CDEBUG(D_NET, "gmnal_init: creating %d large %d byte recv buffers\n",
+               ntok, MSG_LEN_LARGE);
+        while (ntok-- > 0) {
+                void * buffer = gm_dma_malloc(kgmnal_data.kgm_port,
+                                              MSG_LEN_LARGE);
+                if (buffer == NULL) {
+                        CERROR("gm_dma_malloc failed for large recv buffer\n");
+                        return (-ENOMEM);
+                }
+                CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d "
+                       "pri %d\n ", kgmnal_data.kgm_port, buffer,
+                       MSG_LEN_LARGE, MSG_SIZE_LARGE, GM_LOW_PRIORITY);
+
+                gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer,
+                                          MSG_SIZE_LARGE, GM_LOW_PRIORITY);
+        }
+
+        ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2;
+        CDEBUG(D_NET, "gmnal_init: creating %d small %d byte recv buffers\n",
+               ntok, MSG_LEN_SMALL);
+        while (ntok-- > 0) {
+                void * buffer = gm_dma_malloc(kgmnal_data.kgm_port,
+                                              MSG_LEN_SMALL);
+                if (buffer == NULL) {
+                        CERROR("gm_dma_malloc failed for small recv buffer\n");
+                        return (-ENOMEM);
+                }
+                CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d "
+                       "pri %d\n ", kgmnal_data.kgm_port, buffer,
+                       MSG_LEN_SMALL, MSG_SIZE_SMALL, GM_LOW_PRIORITY);
+
+                gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer,
+                                          MSG_SIZE_SMALL, GM_LOW_PRIORITY);
+
+        }
+        sizemask = (1 << MSG_SIZE_LARGE) | (1 << MSG_SIZE_SMALL);
+        CDEBUG(D_NET, "gm_set_acceptable_sizes port %p pri %d mask 0x%x\n",
+                        kgmnal_data.kgm_port, GM_LOW_PRIORITY, sizemask);
+        gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_LOW_PRIORITY,
+                                sizemask);
+        gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_HIGH_PRIORITY, 0);
+
+        /* Initialize Network Interface */
+        rc = PtlNIInit(kgmnal_init, 32, 4, 0, &kgmnal_ni);
+        if (rc) {
+                CERROR("PtlNIInit failed %d\n", rc);
+                return (-ENOMEM);
+        }
+
+        /* Start receiver thread */
+        kernel_thread(kgmnal_recv_thread, &kgmnal_data, 0);
+
+        PORTAL_SYMBOL_REGISTER(kgmnal_ni);
+
+        kgmnal_data.kgm_init = 1;
+
+        return 0;
+}
+
+MODULE_AUTHOR("Robert Read <rread@datarithm.net>");
+MODULE_DESCRIPTION("Kernel Myrinet GM NAL v0.1");
+MODULE_LICENSE("GPL");
+
+module_init (kgmnal_initialize);
+module_exit (kgmnal_finalize);
+
+EXPORT_SYMBOL (kgmnal_ni);
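
kgmnal_initialize() splits the port's receive tokens evenly between the small and large buffers, then restricts the acceptable sizes to exactly those two buckets. As the (1 << MSG_SIZE_*) usage implies, the mask is over GM size indices (the values returned by gm_min_size_for_length()), not byte lengths. A condensed restatement of that step (sketch only; the helper name is hypothetical):

    /* MSG_SIZE_SMALL/LARGE are GM size-bucket indices, so the acceptable-sizes
     * argument is a bitmask with one bit per bucket. */
    static void kgmnal_set_acceptable_sizes(struct gm_port *port)
    {
            unsigned long sizemask =
                    (1 << MSG_SIZE_SMALL) | (1 << MSG_SIZE_LARGE);

            gm_set_acceptable_sizes(port, GM_LOW_PRIORITY, sizemask);
            gm_set_acceptable_sizes(port, GM_HIGH_PRIORITY, 0);  /* none */
    }
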
diff --git a/lnet/klnds/qswlnd/.cvsignore b/lnet/klnds/qswlnd/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lnet/klnds/qswlnd/Makefile.am b/lnet/klnds/qswlnd/Makefile.am
new file mode 100644 (file)
index 0000000..3eb4dd5
--- /dev/null
@@ -0,0 +1,17 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = kqswnal
+modulenet_DATA = kqswnal.o
+EXTRA_PROGRAMS = kqswnal
+
+
+#CFLAGS:= @KCFLAGS@ 
+#CPPFLAGS:=@KCPPFLAGS@
+DEFS =
+CPPFLAGS=@CPPFLAGS@ @with_quadrics@
+kqswnal_SOURCES = qswnal.c qswnal_cb.c qswnal.h
diff --git a/lnet/klnds/qswlnd/qswlnd.c b/lnet/klnds/qswlnd/qswlnd.c
new file mode 100644 (file)
index 0000000..1a8fb74
--- /dev/null
@@ -0,0 +1,608 @@
+/*
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * W. Marcus Miller - Based on ksocknal
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "qswnal.h"
+
+ptl_handle_ni_t                kqswnal_ni;
+nal_t                  kqswnal_api;
+kqswnal_data_t         kqswnal_data;
+
+kpr_nal_interface_t kqswnal_router_interface = {
+       kprni_nalid:    QSWNAL,
+       kprni_arg:      NULL,
+       kprni_fwd:      kqswnal_fwd_packet,
+};
+
+
+static int
+kqswnal_forward(nal_t   *nal,
+               int     id,
+               void    *args,  size_t args_len,
+               void    *ret,   size_t ret_len)
+{
+       kqswnal_data_t *k = nal->nal_data;
+       nal_cb_t       *nal_cb = k->kqn_cb;
+
+       LASSERT (nal == &kqswnal_api);
+       LASSERT (k == &kqswnal_data);
+       LASSERT (nal_cb == &kqswnal_lib);
+
+       lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */
+       return (PTL_OK);
+}
+
+static void
+kqswnal_lock (nal_t *nal, unsigned long *flags)
+{
+       kqswnal_data_t *k = nal->nal_data;
+       nal_cb_t       *nal_cb = k->kqn_cb;
+
+       LASSERT (nal == &kqswnal_api);
+       LASSERT (k == &kqswnal_data);
+       LASSERT (nal_cb == &kqswnal_lib);
+
+       nal_cb->cb_cli(nal_cb,flags);
+}
+
+static void
+kqswnal_unlock(nal_t *nal, unsigned long *flags)
+{
+       kqswnal_data_t *k = nal->nal_data;
+       nal_cb_t       *nal_cb = k->kqn_cb;
+
+       LASSERT (nal == &kqswnal_api);
+       LASSERT (k == &kqswnal_data);
+       LASSERT (nal_cb == &kqswnal_lib);
+
+       nal_cb->cb_sti(nal_cb,flags);
+}
+
+static int
+kqswnal_shutdown(nal_t *nal, int ni)
+{
+       CDEBUG (D_NET, "shutdown\n");
+
+       LASSERT (nal == &kqswnal_api);
+       return (0);
+}
+
+static void
+kqswnal_yield( nal_t *nal )
+{
+       CDEBUG (D_NET, "yield\n");
+
+       if (current->need_resched)
+               schedule();
+       return;
+}
+
+static nal_t *
+kqswnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
+            ptl_pid_t requested_pid)
+{
+       ptl_nid_t mynid = kqswnal_elanid2nid (kqswnal_data.kqn_elanid);
+       int       nnids = kqswnal_data.kqn_nnodes;
+
+        CDEBUG(D_NET, "calling lib_init with nid "LPX64" of %d\n", mynid, nnids);
+
+       lib_init(&kqswnal_lib, mynid, 0, nnids, ptl_size, ac_size);
+
+       return (&kqswnal_api);
+}
+
+int
+kqswnal_cmd (struct portal_ioctl_data *data, void *private)
+{
+       LASSERT (data != NULL);
+       
+       switch (data->ioc_nal_cmd) {
+       case NAL_CMD_REGISTER_MYNID:
+               CDEBUG (D_IOCTL, "setting NID offset to "LPX64" (was "LPX64")\n",
+                       data->ioc_nid - kqswnal_data.kqn_elanid,
+                       kqswnal_data.kqn_nid_offset);
+               kqswnal_data.kqn_nid_offset =
+                       data->ioc_nid - kqswnal_data.kqn_elanid;
+               kqswnal_lib.ni.nid = data->ioc_nid;
+               return (0);
+               
+       default:
+               return (-EINVAL);
+       }
+}
+
+void __exit
+kqswnal_finalise (void)
+{
+       switch (kqswnal_data.kqn_init)
+       {
+       default:
+               LASSERT (0);
+
+       case KQN_INIT_ALL:
+               PORTAL_SYMBOL_UNREGISTER (kqswnal_ni);
+               /* fall through */
+
+       case KQN_INIT_PTL:
+               PtlNIFini (kqswnal_ni);
+               lib_fini (&kqswnal_lib);
+               /* fall through */
+
+       case KQN_INIT_DATA:
+               break;
+
+       case KQN_INIT_NOTHING:
+               return;
+       }
+
+       /**********************************************************************/
+       /* Make router stop her calling me and fail any more call-ins */
+       kpr_shutdown (&kqswnal_data.kqn_router);
+
+       /**********************************************************************/
+       /* flag threads to terminate, wake them and wait for them to die */
+
+       kqswnal_data.kqn_shuttingdown = 1;
+       wake_up_all (&kqswnal_data.kqn_sched_waitq);
+
+       while (atomic_read (&kqswnal_data.kqn_nthreads) != 0) {
+               CDEBUG(D_NET, "waiting for %d threads to terminate\n",
+                      atomic_read (&kqswnal_data.kqn_nthreads));
+               set_current_state (TASK_UNINTERRUPTIBLE);
+               schedule_timeout (HZ);
+       }
+
+       /**********************************************************************/
+       /* close elan comms */
+
+       if (kqswnal_data.kqn_eprx_small != NULL)
+               ep_remove_large_rcvr (kqswnal_data.kqn_eprx_small);
+
+       if (kqswnal_data.kqn_eprx_large != NULL)
+               ep_remove_large_rcvr (kqswnal_data.kqn_eprx_large);
+
+       if (kqswnal_data.kqn_eptx != NULL)
+               ep_free_large_xmtr (kqswnal_data.kqn_eptx);
+
+       /**********************************************************************/
+       /* No more threads.  No more portals, router or comms callbacks!
+        * I control the horizontals and the verticals...
+        */
+
+       /**********************************************************************/
+       /* Complete any blocked forwarding packets with error
+        */
+
+       while (!list_empty (&kqswnal_data.kqn_idletxd_fwdq))
+       {
+               kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next,
+                                                 kpr_fwd_desc_t, kprfd_list);
+               list_del (&fwd->kprfd_list);
+               kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH);
+       }
+
+       while (!list_empty (&kqswnal_data.kqn_delayedfwds))
+       {
+               kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_delayedfwds.next,
+                                                 kpr_fwd_desc_t, kprfd_list);
+               list_del (&fwd->kprfd_list);
+               kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH);
+       }
+
+       /**********************************************************************/
+       /* Wait for router to complete any packets I sent her
+        */
+
+       kpr_deregister (&kqswnal_data.kqn_router);
+
+
+       /**********************************************************************/
+       /* Unmap message buffers and free all descriptors and buffers
+        */
+
+       if (kqswnal_data.kqn_eprxdmahandle != NULL)
+       {
+               elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+                                 kqswnal_data.kqn_eprxdmahandle, 0,
+                                 KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
+                                 KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE);
+
+               elan3_dma_release(kqswnal_data.kqn_epdev->DmaState,
+                                 kqswnal_data.kqn_eprxdmahandle);
+       }
+
+       if (kqswnal_data.kqn_eptxdmahandle != NULL)
+       {
+               elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+                                 kqswnal_data.kqn_eptxdmahandle, 0,
+                                 KQSW_NTXMSGPAGES * (KQSW_NTXMSGS +
+                                                     KQSW_NNBLK_TXMSGS));
+
+               elan3_dma_release(kqswnal_data.kqn_epdev->DmaState,
+                                 kqswnal_data.kqn_eptxdmahandle);
+       }
+
+       if (kqswnal_data.kqn_txds != NULL)
+       {
+               int   i;
+
+               for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++)
+               {
+                       kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
+
+                       if (ktx->ktx_buffer != NULL)
+                               PORTAL_FREE(ktx->ktx_buffer,
+                                           KQSW_TX_BUFFER_SIZE);
+               }
+
+               PORTAL_FREE(kqswnal_data.kqn_txds,
+                           sizeof (kqswnal_tx_t) * (KQSW_NTXMSGS +
+                                                    KQSW_NNBLK_TXMSGS));
+       }
+
+       if (kqswnal_data.kqn_rxds != NULL)
+       {
+               int   i;
+               int   j;
+
+               for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+               {
+                       kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+                       for (j = 0; j < krx->krx_npages; j++)
+                               if (krx->krx_pages[j] != NULL)
+                                       __free_page (krx->krx_pages[j]);
+               }
+
+               PORTAL_FREE(kqswnal_data.kqn_rxds,
+                           sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL +
+                                                   KQSW_NRXMSGS_LARGE));
+       }
+
+       /* resets flags, pointers to NULL etc */
+       memset(&kqswnal_data, 0, sizeof (kqswnal_data));
+
+       CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory));
+
+       printk (KERN_INFO "Routing QSW NAL unloaded (final mem %d)\n",
+                atomic_read(&portal_kmemory));
+}
+
+static int __init
+kqswnal_initialise (void)
+{
+       ELAN3_DMA_REQUEST dmareq;
+       int               rc;
+       int               i;
+       int               elan_page_idx;
+       int               pkmem = atomic_read(&portal_kmemory);
+
+       LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING);
+
+       CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory));
+
+       kqswnal_api.forward  = kqswnal_forward;
+       kqswnal_api.shutdown = kqswnal_shutdown;
+       kqswnal_api.yield    = kqswnal_yield;
+       kqswnal_api.validate = NULL;            /* our api validate is a NOOP */
+       kqswnal_api.lock     = kqswnal_lock;
+       kqswnal_api.unlock   = kqswnal_unlock;
+       kqswnal_api.nal_data = &kqswnal_data;
+
+       kqswnal_lib.nal_data = &kqswnal_data;
+
+       /* ensure all pointers NULL etc */
+       memset (&kqswnal_data, 0, sizeof (kqswnal_data));
+
+       kqswnal_data.kqn_cb = &kqswnal_lib;
+
+       INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds);
+       spin_lock_init (&kqswnal_data.kqn_idletxd_lock);
+       init_waitqueue_head (&kqswnal_data.kqn_idletxd_waitq);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_idletxd_fwdq);
+
+       INIT_LIST_HEAD (&kqswnal_data.kqn_delayedfwds);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_delayedtxds);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds);
+
+       spin_lock_init (&kqswnal_data.kqn_sched_lock);
+       init_waitqueue_head (&kqswnal_data.kqn_sched_waitq);
+
+       spin_lock_init (&kqswnal_data.kqn_statelock);
+
+       /* pointers/lists/locks initialised */
+       kqswnal_data.kqn_init = KQN_INIT_DATA;
+
+       /**********************************************************************/
+       /* Find the first Elan device */
+
+       kqswnal_data.kqn_epdev = ep_device (0);
+       if (kqswnal_data.kqn_epdev == NULL)
+       {
+               CERROR ("Can't get elan device 0\n");
+               return (-ENOMEM);
+       }
+
+       kqswnal_data.kqn_nid_offset = 0;
+       kqswnal_data.kqn_nnodes     = ep_numnodes (kqswnal_data.kqn_epdev);
+       kqswnal_data.kqn_elanid     = ep_nodeid (kqswnal_data.kqn_epdev);
+       
+       /**********************************************************************/
+       /* Get the transmitter */
+
+       kqswnal_data.kqn_eptx = ep_alloc_large_xmtr (kqswnal_data.kqn_epdev);
+       if (kqswnal_data.kqn_eptx == NULL)
+       {
+               CERROR ("Can't allocate transmitter\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Get the receivers */
+
+       kqswnal_data.kqn_eprx_small = ep_install_large_rcvr (kqswnal_data.kqn_epdev,
+                                                            EP_SVC_LARGE_PORTALS_SMALL,
+                                                            KQSW_EP_ENVELOPES_SMALL);
+       if (kqswnal_data.kqn_eprx_small == NULL)
+       {
+               CERROR ("Can't install small msg receiver\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       kqswnal_data.kqn_eprx_large = ep_install_large_rcvr (kqswnal_data.kqn_epdev,
+                                                            EP_SVC_LARGE_PORTALS_LARGE,
+                                                            KQSW_EP_ENVELOPES_LARGE);
+       if (kqswnal_data.kqn_eprx_large == NULL)
+       {
+               CERROR ("Can't install large msg receiver\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Reserve Elan address space for transmit buffers */
+
+        dmareq.Waitfn   = DDI_DMA_SLEEP;
+        dmareq.ElanAddr = (E3_Addr) 0;
+        dmareq.Attr     = PTE_LOAD_LITTLE_ENDIAN;
+        dmareq.Perm     = ELAN_PERM_REMOTEREAD;
+
+       rc = elan3_dma_reserve(kqswnal_data.kqn_epdev->DmaState,
+                             KQSW_NTXMSGPAGES*(KQSW_NTXMSGS+KQSW_NNBLK_TXMSGS),
+                             &dmareq, &kqswnal_data.kqn_eptxdmahandle);
+       if (rc != DDI_SUCCESS)
+       {
+		CERROR ("Can't reserve tx dma space\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Reserve Elan address space for receive buffers */
+
+        dmareq.Waitfn   = DDI_DMA_SLEEP;
+        dmareq.ElanAddr = (E3_Addr) 0;
+        dmareq.Attr     = PTE_LOAD_LITTLE_ENDIAN;
+        dmareq.Perm     = ELAN_PERM_REMOTEWRITE;
+
+       rc = elan3_dma_reserve (kqswnal_data.kqn_epdev->DmaState,
+                               KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
+                               KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE,
+                               &dmareq, &kqswnal_data.kqn_eprxdmahandle);
+       if (rc != DDI_SUCCESS)
+       {
+               CERROR ("Can't reserve rx dma space\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Allocate/Initialise transmit descriptors */
+
+       PORTAL_ALLOC(kqswnal_data.kqn_txds,
+                    sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+       if (kqswnal_data.kqn_txds == NULL)
+       {
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /* clear flags, null pointers etc */
+       memset(kqswnal_data.kqn_txds, 0,
+              sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+       for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++)
+       {
+               int           premapped_pages;
+               kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
+               int           basepage = i * KQSW_NTXMSGPAGES;
+
+               PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
+               if (ktx->ktx_buffer == NULL)
+               {
+                       kqswnal_finalise ();
+                       return (-ENOMEM);
+               }
+
+               /* Map pre-allocated buffer NOW, to save latency on transmit */
+               premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer,
+                                                       KQSW_TX_BUFFER_SIZE);
+
+               elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+                                      kqswnal_data.kqn_eptxdmahandle,
+                                      ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE,
+                                      basepage, &ktx->ktx_ebuffer);
+
+               ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */
+               ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */
+
+               if (i < KQSW_NTXMSGS)
+                       ktx->ktx_idle = &kqswnal_data.kqn_idletxds;
+               else
+                       ktx->ktx_idle = &kqswnal_data.kqn_nblk_idletxds;
+
+               list_add_tail (&ktx->ktx_list, ktx->ktx_idle);
+       }
+
+       /**********************************************************************/
+       /* Allocate/Initialise receive descriptors */
+
+       PORTAL_ALLOC (kqswnal_data.kqn_rxds,
+                     sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE));
+       if (kqswnal_data.kqn_rxds == NULL)
+       {
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       memset(kqswnal_data.kqn_rxds, 0, /* clear flags, null pointers etc */
+              sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL+KQSW_NRXMSGS_LARGE));
+
+       elan_page_idx = 0;
+       for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+       {
+               E3_Addr       elanaddr;
+               int           j;
+               kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+               if (i < KQSW_NRXMSGS_SMALL)
+               {
+                       krx->krx_npages = KQSW_NRXMSGPAGES_SMALL;
+                       krx->krx_eprx   = kqswnal_data.kqn_eprx_small;
+               }
+               else
+               {
+                       krx->krx_npages = KQSW_NRXMSGPAGES_LARGE;
+                       krx->krx_eprx   = kqswnal_data.kqn_eprx_large;
+               }
+
+               LASSERT (krx->krx_npages > 0);
+               for (j = 0; j < krx->krx_npages; j++)
+               {
+                       krx->krx_pages[j] = alloc_page(GFP_KERNEL);
+                       if (krx->krx_pages[j] == NULL)
+                       {
+                               kqswnal_finalise ();
+                               return (-ENOMEM);
+                       }
+
+                       LASSERT(page_address(krx->krx_pages[j]) != NULL);
+
+                       elan3_dvma_kaddr_load(kqswnal_data.kqn_epdev->DmaState,
+                                             kqswnal_data.kqn_eprxdmahandle,
+                                             page_address(krx->krx_pages[j]),
+                                             PAGE_SIZE, elan_page_idx,
+                                             &elanaddr);
+                       elan_page_idx++;
+
+                       if (j == 0)
+                               krx->krx_elanaddr = elanaddr;
+
+                       /* NB we assume the pages map a contiguous chunk of Elan address space */
+                       LASSERT (elanaddr == krx->krx_elanaddr + j * PAGE_SIZE);
+               }
+       }
+       LASSERT (elan_page_idx ==
+                (KQSW_NRXMSGS_SMALL * KQSW_NRXMSGPAGES_SMALL) +
+                (KQSW_NRXMSGS_LARGE * KQSW_NRXMSGPAGES_LARGE));
+
+       /**********************************************************************/
+       /* Network interface ready to initialise */
+
+        rc = PtlNIInit(kqswnal_init, 32, 4, 0, &kqswnal_ni);
+        if (rc != 0)
+       {
+               CERROR ("PtlNIInit failed %d\n", rc);
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       kqswnal_data.kqn_init = KQN_INIT_PTL;
+
+       /**********************************************************************/
+       /* Queue receives, now that it's OK to run their completion callbacks */
+
+       for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+       {
+               kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+               /* NB this enqueue can allocate/sleep (attr == 0) */
+               rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx,
+                                     krx->krx_elanaddr,
+                                     krx->krx_npages * PAGE_SIZE, 0);
+               if (rc != 0)
+               {
+                       CERROR ("failed ep_queue_receive %d\n", rc);
+                       kqswnal_finalise ();
+                       return (-ENOMEM);
+               }
+       }
+
+       /**********************************************************************/
+       /* Spawn scheduling threads */
+       for (i = 0; i < smp_num_cpus; i++)
+       {
+               rc = kqswnal_thread_start (kqswnal_scheduler, NULL);
+               if (rc != 0)
+               {
+                       CERROR ("failed to spawn scheduling thread: %d\n", rc);
+                       kqswnal_finalise ();
+                       return (rc);
+               }
+       }
+
+       /**********************************************************************/
+       /* Connect to the router */
+       rc = kpr_register (&kqswnal_data.kqn_router, &kqswnal_router_interface);
+       if (rc != 0)
+               CDEBUG(D_NET, "Can't initialise routing interface (rc = %d): not routing\n", rc);
+
+       rc = kportal_nal_register (QSWNAL, &kqswnal_cmd, NULL);
+       if (rc != 0) {
+               CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+               kqswnal_finalise ();
+               return (rc);
+       }
+
+       PORTAL_SYMBOL_REGISTER(kqswnal_ni);
+       kqswnal_data.kqn_init = KQN_INIT_ALL;
+
+       printk(KERN_INFO "Routing QSW NAL loaded on node %d of %d "
+              "(Routing %s, initial mem %d)\n", 
+              kqswnal_data.kqn_elanid, kqswnal_data.kqn_nnodes,
+              kpr_routing (&kqswnal_data.kqn_router) ? "enabled" : "disabled",
+              pkmem);
+
+       return (0);
+}
+
+
+MODULE_AUTHOR("W. Marcus Miller <marcusm@llnl.gov>");
+MODULE_DESCRIPTION("Kernel Quadrics Switch NAL v1.00");
+MODULE_LICENSE("GPL");
+
+module_init (kqswnal_initialise);
+module_exit (kqswnal_finalise);
+
+EXPORT_SYMBOL (kqswnal_ni);
diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h
new file mode 100644 (file)
index 0000000..88ab74f
--- /dev/null
@@ -0,0 +1,270 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines. 
+ *
+ */
+
+#ifndef _QSWNAL_H
+#define _QSWNAL_H
+#define EXPORT_SYMTAB
+
+#ifdef PROPRIETARY_ELAN
+# include <qsw/kernel.h>
+#else
+# include <qsnet/kernel.h>
+#endif
+
+#undef printf                                   /* nasty QSW #define */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <elan3/elanregs.h>
+#include <elan3/elandev.h>
+#include <elan3/elanvp.h>
+#include <elan3/elan3mmu.h>
+#include <elan3/elanctxt.h>
+#include <elan3/elandebug.h>
+#include <elan3/urom_addrs.h>
+#include <elan3/busops.h>
+#include <elan3/kcomm.h>
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/locks.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_QSWNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#define KQSW_CHECKSUM  0
+#if KQSW_CHECKSUM
+typedef unsigned long kqsw_csum_t;
+#define KQSW_CSUM_SIZE (2 * sizeof (kqsw_csum_t))
+#else
+#define KQSW_CSUM_SIZE 0
+#endif
+#define KQSW_HDR_SIZE  (sizeof (ptl_hdr_t) + KQSW_CSUM_SIZE)
+
+/*
+ *  Elan NAL
+ */
+#define EP_SVC_LARGE_PORTALS_SMALL     (0x10)  /* Portals over elan port number (small payloads) */
+#define EP_SVC_LARGE_PORTALS_LARGE     (0x11)  /* Portals over elan port number (large payloads) */
+/* NB small/large message sizes are GLOBAL constants */
+
+/*
+ * Performance Tuning defines
+ * NB no mention of PAGE_SIZE for interoperability
+ */
+#if PTL_LARGE_MTU
+# define KQSW_MAXPAYLOAD               (256<<10) /* biggest message this NAL will cope with */
+#else
+# define KQSW_MAXPAYLOAD               (64<<10) /* biggest message this NAL will cope with */
+#endif
+
+#define KQSW_SMALLPAYLOAD              ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */
+
+#define KQSW_TX_MAXCONTIG              (1<<10) /* largest payload that gets made contiguous on transmit */
+
+#define KQSW_NTXMSGS                   8       /* # normal transmit messages */
+#define KQSW_NNBLK_TXMSGS              128     /* # reserved transmit messages if can't block */
+
+#define KQSW_NRXMSGS_LARGE             64      /* # large receive buffers */
+#define KQSW_EP_ENVELOPES_LARGE        128     /* # large ep envelopes */
+
+#define KQSW_NRXMSGS_SMALL             256     /* # small receive buffers */
+#define KQSW_EP_ENVELOPES_SMALL                2048    /* # small ep envelopes */
+
+#define KQSW_RESCHED                   100     /* # busy loops that forces scheduler to yield */
+
+/*
+ * derived constants
+ */
+
+#define KQSW_TX_BUFFER_SIZE    (KQSW_HDR_SIZE + KQSW_TX_MAXCONTIG)
+/* The pre-allocated tx buffer (hdr + small payload) */
+
+#define KQSW_NTXMSGPAGES       (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(KQSW_MAXPAYLOAD) + 1)
+/* Reserve elan address space for pre-allocated and pre-mapped transmit
+ * buffer and a full payload too.  Extra pages allow for page alignment */
+
+#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD))
+/* receive hdr/payload always contiguous and page aligned */
+#define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE)
+
+#define KQSW_NRXMSGPAGES_LARGE (btopr(KQSW_HDR_SIZE + KQSW_MAXPAYLOAD))
+/* receive hdr/payload always contiguous and page aligned */
+#define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE)
+/* biggest complete packet we can receive (or transmit) */
+
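[annotation, not part of this commit] A quick sanity check of the derived
constants above, assuming 4 KiB pages, that btopr() rounds a byte count up to
whole pages, that the header plus 1 KiB of contiguous payload fits in one page,
and the default 64 KiB KQSW_MAXPAYLOAD (all assumptions ours):

    KQSW_TX_BUFFER_SIZE    = KQSW_HDR_SIZE + (1<<10)               /* <  1 page */
    KQSW_NTXMSGPAGES       = btopr(KQSW_TX_BUFFER_SIZE) + 1
                             + btopr(64<<10) + 1 = 1 + 1 + 16 + 1  /* 19 pages  */
    KQSW_NRXMSGPAGES_SMALL = btopr(4<<10)                          /*  1 page   */
    KQSW_NRXMSGPAGES_LARGE = btopr(KQSW_HDR_SIZE + (64<<10))       /* 17 pages  */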
+
+typedef struct 
+{
+        struct list_head krx_list;              /* enqueue -> thread */
+        EP_RCVR                *krx_eprx;              /* port to post receives to */
+        EP_RXD          *krx_rxd;               /* receive descriptor (for repost) */
+        E3_Addr          krx_elanaddr;          /* Elan address of buffer (contiguous in elan vm) */
+        int              krx_npages;            /* # pages in receive buffer */
+        int              krx_nob;               /* Number Of Bytes received into buffer */
+        kpr_fwd_desc_t   krx_fwd;               /* embedded forwarding descriptor */
+        struct page     *krx_pages[KQSW_NRXMSGPAGES_LARGE]; /* pages allocated */
+        struct iovec     krx_iov[KQSW_NRXMSGPAGES_LARGE]; /* iovec for forwarding */
+}  kqswnal_rx_t;
+
+typedef struct
+{
+        struct list_head  ktx_list;             /* enqueue idle/delayed */
+        struct list_head *ktx_idle;             /* where to put when idle */
+        char              ktx_state;            /* What I'm doing */
+        uint32_t          ktx_basepage;         /* page offset in reserved elan tx vaddrs for mapping pages */
+        int               ktx_npages;           /* pages reserved for mapping messages */
+        int               ktx_nmappedpages;     /* # pages mapped for current message */
+        EP_IOVEC         ktx_iov[EP_MAXFRAG];  /* msg frags (elan vaddrs) */
+        int               ktx_niov;             /* # message frags */
+        int               ktx_port;             /* destination ep port */
+        ptl_nid_t         ktx_nid;              /* destination node */
+        void             *ktx_args[2];          /* completion passthru */
+        E3_Addr                  ktx_ebuffer;          /* elan address of ktx_buffer */
+        char             *ktx_buffer;           /* pre-allocated contiguous buffer for hdr + small payloads */
+} kqswnal_tx_t;
+
+#define KTX_IDLE       0                       /* MUST BE ZERO (so zeroed ktx is idle) */
+#define KTX_SENDING    1                       /* local send */
+#define KTX_FORWARDING 2                       /* routing a packet */
+
+typedef struct
+{
+        char               kqn_init;            /* what's been initialised */
+        char               kqn_shuttingdown;    /* I'm trying to shut down */
+        atomic_t           kqn_nthreads;        /* # threads still running */
+
+        kqswnal_rx_t      *kqn_rxds;            /* all the receive descriptors */
+        kqswnal_tx_t      *kqn_txds;            /* all the transmit descriptors */
+
+        struct list_head   kqn_idletxds;        /* transmit descriptors free to use */
+        struct list_head   kqn_nblk_idletxds;   /* reserve of tx descriptors for senders that can't block */
+        spinlock_t         kqn_idletxd_lock;    /* serialise idle txd access */
+        wait_queue_head_t  kqn_idletxd_waitq;   /* sender blocks here waiting for idle txd */
+        struct list_head   kqn_idletxd_fwdq;    /* forwarded packets block here waiting for idle txd */
+        
+        spinlock_t         kqn_sched_lock;      /* serialise packet schedulers */
+        wait_queue_head_t  kqn_sched_waitq;     /* scheduler blocks here */
+
+        struct list_head   kqn_readyrxds;       /* rxds full of data */
+        struct list_head   kqn_delayedfwds;     /* delayed forwards */
+        struct list_head   kqn_delayedtxds;     /* delayed transmits */
+
+        spinlock_t         kqn_statelock;       /* cb_cli/cb_sti */
+        nal_cb_t          *kqn_cb;              /* -> kqswnal_lib */
+       EP_DEV            *kqn_epdev;           /* elan device */
+       EP_XMTR           *kqn_eptx;            /* elan transmitter */
+       EP_RCVR           *kqn_eprx_small;      /* elan receiver (small messages) */
+        EP_RCVR                  *kqn_eprx_large;      /* elan receiver (large messages) */
+       ELAN3_DMA_HANDLE  *kqn_eptxdmahandle;   /* elan reserved tx vaddrs */
+       ELAN3_DMA_HANDLE  *kqn_eprxdmahandle;   /* elan reserved rx vaddrs */
+        kpr_router_t       kqn_router;          /* connection to Kernel Portals Router module */
+
+        ptl_nid_t          kqn_nid_offset;      /* this cluster's NID offset */
+        int                kqn_nnodes;          /* this cluster's size */
+        int                kqn_elanid;          /* this nodes's elan ID */
+}  kqswnal_data_t;
+
+/* kqn_init state */
+#define KQN_INIT_NOTHING       0               /* MUST BE ZERO so zeroed state is initialised OK */
+#define KQN_INIT_DATA          1
+#define KQN_INIT_PTL           2
+#define KQN_INIT_ALL           3
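[annotation, not part of this commit] kqn_init records how far
kqswnal_initialise() got, so every failure path there can simply call
kqswnal_finalise() and only what was actually set up gets torn down.  A
teardown keyed off this marker might look like the sketch below;
PtlNIFini() and PORTAL_SYMBOL_UNREGISTER() are assumed to be the counterparts
of the PtlNIInit()/PORTAL_SYMBOL_REGISTER() calls made in kqswnal_initialise().

        switch (kqswnal_data.kqn_init) {
        case KQN_INIT_ALL:
                PORTAL_SYMBOL_UNREGISTER (kqswnal_ni);
                /* fall through */
        case KQN_INIT_PTL:
                PtlNIFini (kqswnal_ni);
                /* fall through */
        case KQN_INIT_DATA:
        case KQN_INIT_NOTHING:
                break;
        }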
+
+extern nal_cb_t        kqswnal_lib;
+extern nal_t           kqswnal_api;
+extern kqswnal_data_t  kqswnal_data;
+
+extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg);
+extern void kqswnal_rxhandler(EP_RXD *rxd);
+extern int kqswnal_scheduler (void *);
+extern void kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+
+static inline ptl_nid_t
+kqswnal_elanid2nid (int elanid) 
+{
+        return (kqswnal_data.kqn_nid_offset + elanid);
+}
+
+static inline int
+kqswnal_nid2elanid (ptl_nid_t nid) 
+{
+        /* not in this cluster? */
+        if (nid < kqswnal_data.kqn_nid_offset ||
+            nid >= kqswnal_data.kqn_nid_offset + kqswnal_data.kqn_nnodes)
+                return (-1);
+        
+        return (nid - kqswnal_data.kqn_nid_offset);
+}
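[annotation, not part of this commit] The NID <-> Elan ID mapping is a plain
offset, so it round-trips for any node inside the cluster; the Elan ID 3 below
is an arbitrary example value:

        ptl_nid_t nid = kqswnal_elanid2nid (3);
        LASSERT (kqswnal_nid2elanid (nid) == 3);   /* in-cluster: lossless */
        /* NIDs outside [kqn_nid_offset, kqn_nid_offset + kqn_nnodes) give -1,
         * i.e. "not a peer, route via a gateway" */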
+
+static inline void
+kqswnal_requeue_rx (kqswnal_rx_t *krx)
+{
+        ep_requeue_receive (krx->krx_rxd, kqswnal_rxhandler, krx,
+                            krx->krx_elanaddr, krx->krx_npages * PAGE_SIZE);
+}
+
+static inline int
+kqswnal_pages_spanned (void *base, int nob)
+{
+        unsigned long first_page = ((unsigned long)base) >> PAGE_SHIFT;
+        unsigned long last_page  = (((unsigned long)base) + (nob - 1)) >> PAGE_SHIFT;
+
+        LASSERT (last_page >= first_page);      /* can't wrap address space */
+        return (last_page - first_page + 1);
+}
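[annotation, not part of this commit] Example of the page-span arithmetic,
assuming 4 KiB pages; the addresses are made up for illustration:

        LASSERT (kqswnal_pages_spanned ((void *)0x1ffce, 100) == 2); /* straddles a boundary */
        LASSERT (kqswnal_pages_spanned ((void *)0x20000, 100) == 1); /* within one page      */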
+
+#if KQSW_CHECKSUM
+static inline kqsw_csum_t kqsw_csum (kqsw_csum_t sum, void *base, int nob)
+{
+        unsigned char *ptr = (unsigned char *)base;
+        
+        while (nob-- > 0)
+                sum += *ptr++;
+        
+        return (sum);
+}
+#endif
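[annotation, not part of this commit] kqsw_csum() is a plain byte sum, so a
payload spread over several fragments can be checksummed incrementally, which
is how the send path in qswlnd_cb.c uses it; the hdr/frag pointers and sizes
below are hypothetical:

        kqsw_csum_t sum = kqsw_csum (0, hdr, sizeof (*hdr));
        sum = kqsw_csum (sum, frag0, frag0_nob);
        sum = kqsw_csum (sum, frag1, frag1_nob);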
+
+#endif /* _QSWNAL_H */
diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c
new file mode 100644 (file)
index 0000000..3b47a25
--- /dev/null
@@ -0,0 +1,1239 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * W. Marcus Miller - Based on ksocknal
+ *
+ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "qswnal.h"
+
+atomic_t kqswnal_packets_launched;
+atomic_t kqswnal_packets_transmitted;
+atomic_t kqswnal_packets_received;
+
+
+/*
+ *  LIB functions follow
+ *
+ */
+static int
+kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
+             size_t len)
+{
+        CDEBUG (D_NET, LPX64": reading "LPSZ" bytes from %p -> %p\n",
+                nal->ni.nid, len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+
+        return (0);
+}
+
+static int
+kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
+              size_t len)
+{
+        CDEBUG (D_NET, LPX64": writing "LPSZ" bytes from %p -> %p\n",
+                nal->ni.nid, len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+
+        return (0);
+}
+
+static void *
+kqswnal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+        return (buf);
+}
+
+static void
+kqswnal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+static void
+kqswnal_printf (nal_cb_t * nal, const char *fmt, ...)
+{
+        va_list ap;
+        char msg[256];
+
+        va_start (ap, fmt);
+        vsnprintf (msg, sizeof (msg), fmt, ap);        /* sprintf safely */
+        va_end (ap);
+
+        msg[sizeof (msg) - 1] = 0;                /* ensure terminated */
+
+        CDEBUG (D_NET, "%s", msg);
+}
+
+
+static void
+kqswnal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        kqswnal_data_t *data= nal->nal_data;
+
+        spin_lock_irqsave(&data->kqn_statelock, *flags);
+}
+
+
+static void
+kqswnal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        kqswnal_data_t *data= nal->nal_data;
+
+        spin_unlock_irqrestore(&data->kqn_statelock, *flags);
+}
+
+
+static int
+kqswnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        if (nid == nal->ni.nid)
+                *dist = 0;                      /* it's me */
+        else if (kqswnal_nid2elanid (nid) >= 0)
+                *dist = 1;                      /* it's my peer */
+        else
+                *dist = 2;                      /* via router */
+        return (0);
+}
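[annotation, not part of this commit] The distance metric is deliberately
coarse: 0 for the local NID, 1 for any node on this Elan cluster, 2 for
anything that must go via the portals router.  some_peer_nid and
some_remote_nid below are hypothetical:

        unsigned long d;
        kqswnal_dist (&kqswnal_lib, kqswnal_lib.ni.nid, &d);   /* d == 0 */
        kqswnal_dist (&kqswnal_lib, some_peer_nid, &d);        /* d == 1 */
        kqswnal_dist (&kqswnal_lib, some_remote_nid, &d);      /* d == 2 */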
+
+void
+kqswnal_unmap_tx (kqswnal_tx_t *ktx)
+{
+        if (ktx->ktx_nmappedpages == 0)
+                return;
+
+        CDEBUG (D_NET, "%p[%d] unloading pages %d for %d\n",
+                ktx, ktx->ktx_niov, ktx->ktx_basepage, ktx->ktx_nmappedpages);
+
+        LASSERT (ktx->ktx_nmappedpages <= ktx->ktx_npages);
+        LASSERT (ktx->ktx_basepage + ktx->ktx_nmappedpages <=
+                 kqswnal_data.kqn_eptxdmahandle->NumDvmaPages);
+
+        elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+                          kqswnal_data.kqn_eptxdmahandle,
+                          ktx->ktx_basepage, ktx->ktx_nmappedpages);
+        ktx->ktx_nmappedpages = 0;
+}
+
+int
+kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
+{
+        int       nfrags    = ktx->ktx_niov;
+        const int maxfrags  = sizeof (ktx->ktx_iov)/sizeof (ktx->ktx_iov[0]);
+        int       nmapped   = ktx->ktx_nmappedpages;
+        int       maxmapped = ktx->ktx_npages;
+        uint32_t  basepage  = ktx->ktx_basepage + nmapped;
+        char     *ptr;
+        
+        LASSERT (nmapped <= maxmapped);
+        LASSERT (nfrags <= maxfrags);
+        LASSERT (niov > 0);
+        LASSERT (nob > 0);
+        
+        do {
+                int  fraglen = kiov->kiov_len;
+
+                /* nob exactly spans the iovs */
+                LASSERT (fraglen <= nob);
+                /* each frag fits in a page */
+                LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE);
+
+                nmapped++;
+                if (nmapped > maxmapped) {
+                        CERROR("Can't map message in %d pages (max %d)\n",
+                               nmapped, maxmapped);
+                        return (-EMSGSIZE);
+                }
+
+                if (nfrags == maxfrags) {
+                        CERROR("Message too fragmented in Elan VM (max %d frags)\n",
+                               maxfrags);
+                        return (-EMSGSIZE);
+                }
+
+                /* XXX this is really crap, but we'll have to kmap until
+                 * EKC has a page (rather than vaddr) mapping interface */
+
+                ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+
+                CDEBUG(D_NET,
+                       "%p[%d] loading %p for %d, page %d, %d total\n",
+                        ktx, nfrags, ptr, fraglen, basepage, nmapped);
+
+                elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+                                       kqswnal_data.kqn_eptxdmahandle,
+                                       ptr, fraglen,
+                                       basepage, &ktx->ktx_iov[nfrags].Base);
+
+                kunmap (kiov->kiov_page);
+                
+                /* keep in loop for failure case */
+                ktx->ktx_nmappedpages = nmapped;
+
+                if (nfrags > 0 &&                /* previous frag mapped */
+                    ktx->ktx_iov[nfrags].Base == /* contiguous with this one */
+                    (ktx->ktx_iov[nfrags-1].Base + ktx->ktx_iov[nfrags-1].Len))
+                        /* just extend previous */
+                        ktx->ktx_iov[nfrags - 1].Len += fraglen;
+                else {
+                        ktx->ktx_iov[nfrags].Len = fraglen;
+                        nfrags++;                /* new frag */
+                }
+
+                basepage++;
+                kiov++;
+                niov--;
+                nob -= fraglen;
+
+                /* iov must not run out before end of data */
+                LASSERT (nob == 0 || niov > 0);
+
+        } while (nob > 0);
+
+        ktx->ktx_niov = nfrags;
+        CDEBUG (D_NET, "%p got %d frags over %d pages\n",
+                ktx, ktx->ktx_niov, ktx->ktx_nmappedpages);
+
+        return (0);
+}
+
+int
+kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
+{
+        int       nfrags    = ktx->ktx_niov;
+        const int maxfrags  = sizeof (ktx->ktx_iov)/sizeof (ktx->ktx_iov[0]);
+        int       nmapped   = ktx->ktx_nmappedpages;
+        int       maxmapped = ktx->ktx_npages;
+        uint32_t  basepage  = ktx->ktx_basepage + nmapped;
+
+        LASSERT (nmapped <= maxmapped);
+        LASSERT (nfrags <= maxfrags);
+        LASSERT (niov > 0);
+        LASSERT (nob > 0);
+
+        do {
+                int  fraglen = iov->iov_len;
+                long npages  = kqswnal_pages_spanned (iov->iov_base, fraglen);
+
+                /* nob exactly spans the iovs */
+                LASSERT (fraglen <= nob);
+                
+                nmapped += npages;
+                if (nmapped > maxmapped) {
+                        CERROR("Can't map message in %d pages (max %d)\n",
+                               nmapped, maxmapped);
+                        return (-EMSGSIZE);
+                }
+
+                if (nfrags == maxfrags) {
+                        CERROR("Message too fragmented in Elan VM (max %d frags)\n",
+                               maxfrags);
+                        return (-EMSGSIZE);
+                }
+
+                CDEBUG(D_NET,
+                       "%p[%d] loading %p for %d, pages %d for %ld, %d total\n",
+                        ktx, nfrags, iov->iov_base, fraglen, basepage, npages,
+                        nmapped);
+
+                elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+                                       kqswnal_data.kqn_eptxdmahandle,
+                                       iov->iov_base, fraglen,
+                                       basepage, &ktx->ktx_iov[nfrags].Base);
+                /* keep in loop for failure case */
+                ktx->ktx_nmappedpages = nmapped;
+
+                if (nfrags > 0 &&                /* previous frag mapped */
+                    ktx->ktx_iov[nfrags].Base == /* contiguous with this one */
+                    (ktx->ktx_iov[nfrags-1].Base + ktx->ktx_iov[nfrags-1].Len))
+                        /* just extend previous */
+                        ktx->ktx_iov[nfrags - 1].Len += fraglen;
+                else {
+                        ktx->ktx_iov[nfrags].Len = fraglen;
+                        nfrags++;                /* new frag */
+                }
+
+                basepage += npages;
+                iov++;
+                niov--;
+                nob -= fraglen;
+
+                /* iov must not run out before end of data */
+                LASSERT (nob == 0 || niov > 0);
+
+        } while (nob > 0);
+
+        ktx->ktx_niov = nfrags;
+        CDEBUG (D_NET, "%p got %d frags over %d pages\n",
+                ktx, ktx->ktx_niov, ktx->ktx_nmappedpages);
+
+        return (0);
+}
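[annotation, not part of this commit] Both mapping routines merge fragments
whose Elan mappings happen to come out back to back, so ktx_niov can end up
smaller than the caller's niov; the addresses are made up for illustration:

        /* frag A maps to {Base 0x10000, Len 0x1000}, frag B to
         * {Base 0x11000, Len 0x2000}; since 0x10000 + 0x1000 == 0x11000 they
         * collapse into a single {Base 0x10000, Len 0x3000} EP_IOVEC entry */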
+
+void
+kqswnal_put_idle_tx (kqswnal_tx_t *ktx)
+{
+        kpr_fwd_desc_t   *fwd = NULL;
+        struct list_head *idle = ktx->ktx_idle;
+        unsigned long     flags;
+
+        kqswnal_unmap_tx (ktx);                /* release temporary mappings */
+        ktx->ktx_state = KTX_IDLE;
+
+        spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
+
+        list_add (&ktx->ktx_list, idle);
+
+        /* reserved for non-blocking tx */
+        if (idle == &kqswnal_data.kqn_nblk_idletxds) {
+                spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+                return;
+        }
+
+        /* anything blocking for a tx descriptor? */
+        if (!list_empty(&kqswnal_data.kqn_idletxd_fwdq)) /* forwarded packet? */
+        {
+                CDEBUG(D_NET,"wakeup fwd\n");
+
+                fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next,
+                                  kpr_fwd_desc_t, kprfd_list);
+                list_del (&fwd->kprfd_list);
+        }
+
+        if (waitqueue_active (&kqswnal_data.kqn_idletxd_waitq))  /* process? */
+        {
+                /* local sender waiting for tx desc */
+                CDEBUG(D_NET,"wakeup process\n");
+                wake_up (&kqswnal_data.kqn_idletxd_waitq);
+        }
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+
+        if (fwd == NULL)
+                return;
+
+        /* schedule packet for forwarding again */
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        list_add_tail (&fwd->kprfd_list, &kqswnal_data.kqn_delayedfwds);
+        if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
+                wake_up (&kqswnal_data.kqn_sched_waitq);
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+}
+
+kqswnal_tx_t *
+kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block)
+{
+        unsigned long  flags;
+        kqswnal_tx_t  *ktx = NULL;
+
+        for (;;) {
+                spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
+
+                /* "normal" descriptor is free */
+                if (!list_empty (&kqswnal_data.kqn_idletxds)) {
+                        ktx = list_entry (kqswnal_data.kqn_idletxds.next,
+                                          kqswnal_tx_t, ktx_list);
+                        list_del (&ktx->ktx_list);
+                        break;
+                }
+
+                /* "normal" descriptor pool is empty */
+
+                if (fwd != NULL) { /* forwarded packet => queue for idle txd */
+                        CDEBUG (D_NET, "blocked fwd [%p]\n", fwd);
+                        list_add_tail (&fwd->kprfd_list,
+                                       &kqswnal_data.kqn_idletxd_fwdq);
+                        break;
+                }
+
+                /* doing a local transmit */
+                if (!may_block) {
+                        if (list_empty (&kqswnal_data.kqn_nblk_idletxds)) {
+                                CERROR ("intr tx desc pool exhausted\n");
+                                break;
+                        }
+
+                        ktx = list_entry (kqswnal_data.kqn_nblk_idletxds.next,
+                                          kqswnal_tx_t, ktx_list);
+                        list_del (&ktx->ktx_list);
+                        break;
+                }
+
+                /* block for idle tx */
+
+                spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+
+                CDEBUG (D_NET, "blocking for tx desc\n");
+                wait_event (kqswnal_data.kqn_idletxd_waitq,
+                            !list_empty (&kqswnal_data.kqn_idletxds));
+        }
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+
+        /* Idle descs can't have any mapped (as opposed to pre-mapped) pages */
+        LASSERT (ktx == NULL || ktx->ktx_nmappedpages == 0);
+        return (ktx);
+}
+
+void
+kqswnal_tx_done (kqswnal_tx_t *ktx, int error)
+{
+        switch (ktx->ktx_state) {
+        case KTX_FORWARDING:       /* router asked me to forward this packet */
+                kpr_fwd_done (&kqswnal_data.kqn_router,
+                              (kpr_fwd_desc_t *)ktx->ktx_args[0], error);
+                break;
+
+        case KTX_SENDING:          /* packet sourced locally */
+                lib_finalize (&kqswnal_lib, ktx->ktx_args[0],
+                              (lib_msg_t *)ktx->ktx_args[1]);
+                break;
+
+        default:
+                LASSERT (0);
+        }
+
+        kqswnal_put_idle_tx (ktx);
+}
+
+static void
+kqswnal_txhandler(EP_TXD *txd, void *arg, int status)
+{
+        kqswnal_tx_t      *ktx = (kqswnal_tx_t *)arg;
+
+        LASSERT (txd != NULL);
+        LASSERT (ktx != NULL);
+
+        CDEBUG(D_NET, "txd %p, arg %p status %d\n", txd, arg, status);
+
+        if (status == EP_SUCCESS)
+                atomic_inc (&kqswnal_packets_transmitted);
+
+        if (status != EP_SUCCESS)
+        {
+                CERROR ("kqswnal: Transmit failed with %d\n", status);
+                status = -EIO;
+        }
+
+        kqswnal_tx_done (ktx, status);
+}
+
+int
+kqswnal_launch (kqswnal_tx_t *ktx)
+{
+        /* Don't block for transmit descriptor if we're in interrupt context */
+        int   attr = in_interrupt() ? (EP_NO_SLEEP | EP_NO_ALLOC) : 0;
+        int   dest = kqswnal_nid2elanid (ktx->ktx_nid);
+        unsigned long flags;
+        int   rc;
+        
+        LASSERT (dest >= 0);                    /* must be a peer */
+        rc = ep_transmit_large(kqswnal_data.kqn_eptx, dest,
+                               ktx->ktx_port, attr, kqswnal_txhandler,
+                               ktx, ktx->ktx_iov, ktx->ktx_niov);
+        if (rc == 0)
+                atomic_inc (&kqswnal_packets_launched);
+
+        if (rc != ENOMEM)
+                return (rc);
+
+        /* can't allocate ep txd => queue for later */
+
+        LASSERT (in_interrupt());      /* not called by thread (not looping) */
+
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_delayedtxds);
+        if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
+                wake_up (&kqswnal_data.kqn_sched_waitq);
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+
+        return (0);
+}
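[annotation, not part of this commit] A minimal sketch of the transmit
descriptor life-cycle these routines implement; the real send and forward
paths below add header/payload setup and completion handling, and the error
recovery here is simplified:

        kqswnal_tx_t *ktx = kqswnal_get_idle_tx (NULL, !in_interrupt ());
        if (ktx != NULL) {
                /* ... fill ktx_iov[]/ktx_niov, ktx_port, ktx_nid,
                 *     ktx_state and ktx_args[] ... */
                if (kqswnal_launch (ktx) != 0)
                        kqswnal_put_idle_tx (ktx);   /* hand it back on failure */
        }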
+
+
+static char *
+hdr_type_string (ptl_hdr_t *hdr)
+{
+        switch (hdr->type) {
+        case PTL_MSG_ACK:
+                return ("ACK");
+        case PTL_MSG_PUT:
+                return ("PUT");
+        case PTL_MSG_GET:
+                return ("GET");
+        case PTL_MSG_REPLY:
+                return ("REPLY");
+        default:
+                return ("<UNKNOWN>");
+        }
+}
+
+static void
+kqswnal_cerror_hdr(ptl_hdr_t * hdr)
+{
+        char *type_str = hdr_type_string (hdr);
+
+        CERROR("P3 Header at %p of type %s\n", hdr, type_str);
+        CERROR("    From nid/pid "LPU64"/%u", NTOH__u64(hdr->src_nid),
+               NTOH__u32(hdr->src_pid));
+        CERROR("    To nid/pid "LPU64"/%u\n", NTOH__u64(hdr->dest_nid),
+               NTOH__u32(hdr->dest_pid));
+
+        switch (NTOH__u32(hdr->type)) {
+        case PTL_MSG_PUT:
+                CERROR("    Ptl index %d, ack md "LPX64"."LPX64", "
+                       "match bits "LPX64"\n",
+                       NTOH__u32 (hdr->msg.put.ptl_index),
+                       hdr->msg.put.ack_wmd.wh_interface_cookie,
+                       hdr->msg.put.ack_wmd.wh_object_cookie,
+                       NTOH__u64 (hdr->msg.put.match_bits));
+                CERROR("    Length %d, offset %d, hdr data "LPX64"\n",
+                       NTOH__u32(PTL_HDR_LENGTH(hdr)),
+                       NTOH__u32(hdr->msg.put.offset),
+                       hdr->msg.put.hdr_data);
+                break;
+
+        case PTL_MSG_GET:
+                CERROR("    Ptl index %d, return md "LPX64"."LPX64", "
+                       "match bits "LPX64"\n",
+                       NTOH__u32 (hdr->msg.get.ptl_index),
+                       hdr->msg.get.return_wmd.wh_interface_cookie,
+                       hdr->msg.get.return_wmd.wh_object_cookie,
+                       hdr->msg.get.match_bits);
+                CERROR("    Length %d, src offset %d\n",
+                       NTOH__u32 (hdr->msg.get.sink_length),
+                       NTOH__u32 (hdr->msg.get.src_offset));
+                break;
+
+        case PTL_MSG_ACK:
+                CERROR("    dst md "LPX64"."LPX64", manipulated length %d\n",
+                       hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                       hdr->msg.ack.dst_wmd.wh_object_cookie,
+                       NTOH__u32 (hdr->msg.ack.mlength));
+                break;
+
+        case PTL_MSG_REPLY:
+                CERROR("    dst md "LPX64"."LPX64", length %d\n",
+                       hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                       hdr->msg.reply.dst_wmd.wh_object_cookie,
+                       NTOH__u32 (PTL_HDR_LENGTH(hdr)));
+        }
+
+}                               /* end of kqswnal_cerror_hdr() */
+
+static int
+kqswnal_sendmsg (nal_cb_t     *nal,
+                 void         *private,
+                 lib_msg_t    *cookie,
+                 ptl_hdr_t    *hdr,
+                 int           type,
+                 ptl_nid_t     nid,
+                 ptl_pid_t     pid,
+                 unsigned int  payload_niov,
+                 struct iovec *payload_iov,
+                 ptl_kiov_t   *payload_kiov,
+                 size_t        payload_nob)
+{
+        kqswnal_tx_t      *ktx;
+        int                rc;
+        ptl_nid_t          gatewaynid;
+#if KQSW_CHECKSUM
+        int                i;
+        kqsw_csum_t        csum;
+        int                sumnob;
+#endif
+        
+        /* NB, the return code from this procedure is ignored.
+         * If we can't send, we must still complete with lib_finalize().
+         * We'll have to wait for 3.2 to return an error event.
+         */
+
+        CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64
+               " pid %u\n", payload_nob, payload_niov, nid, pid);
+
+        LASSERT (payload_nob == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        /* It must be OK to kmap() if required */
+        LASSERT (payload_kiov == NULL || !in_interrupt ());
+        /* payload is either all vaddrs or all pages */
+        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+        
+        if (payload_nob > KQSW_MAXPAYLOAD) {
+                CERROR ("request exceeds MTU size "LPSZ" (max %u).\n",
+                        payload_nob, KQSW_MAXPAYLOAD);
+                lib_finalize (&kqswnal_lib, private, cookie);
+                return (-1);
+        }
+
+        if (kqswnal_nid2elanid (nid) < 0) {     /* Can't send direct: find gateway? */
+                rc = kpr_lookup (&kqswnal_data.kqn_router, nid, &gatewaynid);
+                if (rc != 0) {
+                        CERROR("Can't route to "LPX64": router error %d\n",
+                               nid, rc);
+                        lib_finalize (&kqswnal_lib, private, cookie);
+                        return (-1);
+                }
+                if (kqswnal_nid2elanid (gatewaynid) < 0) {
+                        CERROR("Bad gateway "LPX64" for "LPX64"\n",
+                               gatewaynid, nid);
+                        lib_finalize (&kqswnal_lib, private, cookie);
+                        return (-1);
+                }
+                nid = gatewaynid;
+        }
+
+        /* I may not block for a transmit descriptor if I might block the
+         * receiver, or an interrupt handler. */
+        ktx = kqswnal_get_idle_tx(NULL, !(type == PTL_MSG_ACK ||
+                                          type == PTL_MSG_REPLY ||
+                                          in_interrupt()));
+        if (ktx == NULL) {
+                kqswnal_cerror_hdr (hdr);
+                lib_finalize (&kqswnal_lib, private, cookie);
+                return (-1);
+        }
+
+        memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */
+
+#if KQSW_CHECKSUM
+        csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr));
+        memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum));
+        for (csum = 0, i = 0, sumnob = payload_nob; sumnob > 0; i++) {
+                if (payload_kiov != NULL) {
+                        ptl_kiov_t *kiov = &payload_kiov[i];
+                        char       *addr = ((char *)kmap (kiov->kiov_page)) +
+                                           kiov->kiov_offset;
+                        
+                        csum = kqsw_csum (csum, addr, MIN (sumnob, kiov->kiov_len));
+                        kunmap (kiov->kiov_page);
+                        sumnob -= kiov->kiov_len;
+                } else {
+                        struct iovec *iov = &payload_iov[i];
+
+                        csum = kqsw_csum (csum, iov->iov_base, MIN (sumnob, iov->iov_len));
+                        sumnob -= iov->iov_len;
+                }
+        }
+        memcpy(ktx->ktx_buffer +sizeof(*hdr) +sizeof(csum), &csum,sizeof(csum));
+#endif
+
+        /* Set up first frag from pre-mapped buffer (it's at least the
+         * portals header) */
+        ktx->ktx_iov[0].Base = ktx->ktx_ebuffer;
+        ktx->ktx_iov[0].Len = KQSW_HDR_SIZE;
+        ktx->ktx_niov = 1;
+
+        if (payload_nob > 0) { /* got some payload (something more to do) */
+                /* make a single contiguous message? */
+                if (payload_nob <= KQSW_TX_MAXCONTIG) {
+                        /* copy payload to ktx_buffer, immediately after hdr */
+                        if (payload_kiov != NULL)
+                                lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
+                                                   payload_niov, payload_kiov, payload_nob);
+                        else
+                                lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
+                                                  payload_niov, payload_iov, payload_nob);
+                        /* first frag includes payload */
+                        ktx->ktx_iov[0].Len += payload_nob;
+                } else {
+                        if (payload_kiov != NULL)
+                                rc = kqswnal_map_tx_kiov (ktx, payload_nob, 
+                                                          payload_niov, payload_kiov);
+                        else
+                                rc = kqswnal_map_tx_iov (ktx, payload_nob,
+                                                         payload_niov, payload_iov);
+                        if (rc != 0) {
+                                kqswnal_put_idle_tx (ktx);
+                                lib_finalize (&kqswnal_lib, private, cookie);
+                                return (-1);
+                        }
+                } 
+        }
+
+        ktx->ktx_port    = (payload_nob <= KQSW_SMALLPAYLOAD) ?
+                        EP_SVC_LARGE_PORTALS_SMALL : EP_SVC_LARGE_PORTALS_LARGE;
+        ktx->ktx_nid     = nid;
+        ktx->ktx_state   = KTX_SENDING;   /* => lib_finalize() on completion */
+        ktx->ktx_args[0] = private;
+        ktx->ktx_args[1] = cookie;
+
+        rc = kqswnal_launch (ktx);
+        if (rc != 0) {                    /* failed? */
+                CERROR ("Failed to send packet to "LPX64": %d\n", nid, rc);
+                lib_finalize (&kqswnal_lib, private, cookie);
+                return (-1);
+        }
+
+        CDEBUG(D_NET, "sent "LPSZ" bytes to "LPX64"\n", payload_nob, nid);
+        return (0);
+}
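[annotation, not part of this commit] The send path above picks one of two
strategies per message: payloads up to KQSW_TX_MAXCONTIG (1 KiB) are copied in
behind the header so the whole message leaves from the pre-mapped ktx_buffer
as a single fragment, while larger payloads are DMA-mapped in place to avoid
the copy.  Schematically (payload here is a hypothetical flat buffer standing
in for the lib_copy_[k]iov2buf() helpers actually used):

        if (payload_nob <= KQSW_TX_MAXCONTIG) {
                memcpy (ktx->ktx_buffer + KQSW_HDR_SIZE, payload, payload_nob);
                ktx->ktx_iov[0].Len += payload_nob;     /* one mapped fragment */
        } else {
                rc = kqswnal_map_tx_iov (ktx, payload_nob,
                                         payload_niov, payload_iov);
        }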
+
+static int
+kqswnal_send (nal_cb_t     *nal,
+              void         *private,
+              lib_msg_t    *cookie,
+              ptl_hdr_t    *hdr,
+              int           type,
+              ptl_nid_t     nid,
+              ptl_pid_t     pid,
+              unsigned int  payload_niov,
+              struct iovec *payload_iov,
+              size_t        payload_nob)
+{
+        return (kqswnal_sendmsg (nal, private, cookie, hdr, type, nid, pid,
+                                 payload_niov, payload_iov, NULL, payload_nob));
+}
+
+static int
+kqswnal_send_pages (nal_cb_t     *nal,
+                    void         *private,
+                    lib_msg_t    *cookie,
+                    ptl_hdr_t    *hdr,
+                    int           type,
+                    ptl_nid_t     nid,
+                    ptl_pid_t     pid,
+                    unsigned int  payload_niov,
+                    ptl_kiov_t   *payload_kiov,
+                    size_t        payload_nob)
+{
+        return (kqswnal_sendmsg (nal, private, cookie, hdr, type, nid, pid,
+                                 payload_niov, NULL, payload_kiov, payload_nob));
+}
+
+int kqswnal_fwd_copy_contig = 0;
+
+void
+kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        int             rc;
+        kqswnal_tx_t   *ktx;
+        struct iovec   *iov = fwd->kprfd_iov;
+        int             niov = fwd->kprfd_niov;
+        int             nob = fwd->kprfd_nob;
+        ptl_nid_t       nid = fwd->kprfd_gateway_nid;
+
+#if KQSW_CHECKSUM
+        CERROR ("checksums for forwarded packets not implemented\n");
+        LBUG ();
+#endif
+        /* The router wants this NAL to forward a packet */
+        CDEBUG (D_NET, "forwarding [%p] to "LPX64", %d frags %d bytes\n",
+                fwd, nid, niov, nob);
+
+        LASSERT (niov > 0);
+        
+        ktx = kqswnal_get_idle_tx (fwd, FALSE);
+        if (ktx == NULL)        /* can't get txd right now */
+                return;         /* fwd will be scheduled when tx desc freed */
+
+        if (nid == kqswnal_lib.ni.nid)          /* gateway is me */
+                nid = fwd->kprfd_target_nid;    /* target is final dest */
+
+        if (kqswnal_nid2elanid (nid) < 0) {
+                CERROR("Can't forward [%p] to "LPX64": not a peer\n", fwd, nid);
+                rc = -EHOSTUNREACH;
+                goto failed;
+        }
+
+        if (nob > KQSW_NRXMSGBYTES_LARGE) {
+                CERROR ("Can't forward [%p] to "LPX64
+                        ": size %d bigger than max packet size %ld\n",
+                        fwd, nid, nob, (long)KQSW_NRXMSGBYTES_LARGE);
+                rc = -EMSGSIZE;
+                goto failed;
+        }
+
+        if ((kqswnal_fwd_copy_contig || niov > 1) &&
+            nob <= KQSW_TX_BUFFER_SIZE) 
+        {
+                /* send from ktx's pre-allocated/mapped contiguous buffer? */
+                lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, nob);
+                ktx->ktx_iov[0].Base = ktx->ktx_ebuffer; /* already mapped */
+                ktx->ktx_iov[0].Len = nob;
+                ktx->ktx_niov = 1;
+        }
+        else
+        {
+                /* zero copy */
+                ktx->ktx_niov = 0;        /* no frags mapped yet */
+                rc = kqswnal_map_tx_iov (ktx, nob, niov, iov);
+                if (rc != 0)
+                        goto failed;
+        }
+
+        ktx->ktx_port    = (nob <= (sizeof (ptl_hdr_t) + KQSW_SMALLPAYLOAD)) ?
+                        EP_SVC_LARGE_PORTALS_SMALL : EP_SVC_LARGE_PORTALS_LARGE;
+        ktx->ktx_nid     = nid;
+        ktx->ktx_state   = KTX_FORWARDING; /* kpr_put_packet() on completion */
+        ktx->ktx_args[0] = fwd;
+
+        rc = kqswnal_launch (ktx);
+        if (rc == 0)
+                return;
+
+ failed:
+        LASSERT (rc != 0);
+        CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc);
+
+        kqswnal_put_idle_tx (ktx);
+        /* complete now (with failure) */
+        kpr_fwd_done (&kqswnal_data.kqn_router, fwd, rc);
+}
+
+void
+kqswnal_fwd_callback (void *arg, int error)
+{
+        kqswnal_rx_t *krx = (kqswnal_rx_t *)arg;
+
+        /* The router has finished forwarding this packet */
+
+        if (error != 0)
+        {
+                ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]);
+
+                CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n",
+                       NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),error);
+        }
+
+        kqswnal_requeue_rx (krx);
+}
+
+void
+kqswnal_rx (kqswnal_rx_t *krx)
+{
+        ptl_hdr_t      *hdr = (ptl_hdr_t *) page_address (krx->krx_pages[0]);
+        ptl_nid_t       dest_nid = NTOH__u64 (hdr->dest_nid);
+        int             nob;
+        int             niov;
+
+        if (dest_nid == kqswnal_lib.ni.nid) { /* It's for me :) */
+                /* NB krx requeued when lib_parse() calls back kqswnal_recv */
+                lib_parse (&kqswnal_lib, hdr, krx);
+                return;
+        }
+
+#if KQSW_CHECKSUM
+        CERROR ("checksums for forwarded packets not implemented\n");
+        LBUG ();
+#endif
+        if (kqswnal_nid2elanid (dest_nid) >= 0)  /* should have gone direct to peer */
+        {
+                CERROR("dropping packet from "LPX64" for "LPX64
+                       ": target is peer\n", NTOH__u64(hdr->src_nid), dest_nid);
+                kqswnal_requeue_rx (krx);
+                return;
+        }
+
+        /* NB forwarding may destroy iov; rebuild every time */
+        for (nob = krx->krx_nob, niov = 0; nob > 0; nob -= PAGE_SIZE, niov++)
+        {
+                LASSERT (niov < krx->krx_npages);
+                krx->krx_iov[niov].iov_base= page_address(krx->krx_pages[niov]);
+                krx->krx_iov[niov].iov_len = MIN(PAGE_SIZE, nob);
+        }
+
+        kpr_fwd_init (&krx->krx_fwd, dest_nid,
+                      krx->krx_nob, niov, krx->krx_iov,
+                      kqswnal_fwd_callback, krx);
+
+        kpr_fwd_start (&kqswnal_data.kqn_router, &krx->krx_fwd);
+}
+
+/* Receive Interrupt Handler: posts to schedulers */
+void 
+kqswnal_rxhandler(EP_RXD *rxd)
+{
+        unsigned long flags;
+        int           nob    = ep_rxd_len (rxd);
+        int           status = ep_rxd_status (rxd);
+        kqswnal_rx_t *krx    = (kqswnal_rx_t *)ep_rxd_arg (rxd);
+
+        CDEBUG(D_NET, "kqswnal_rxhandler: rxd %p, krx %p, nob %d, status %d\n",
+               rxd, krx, nob, status);
+
+        LASSERT (krx != NULL);
+
+        krx->krx_rxd = rxd;
+        krx->krx_nob = nob;
+
+        /* must receive a whole header to be able to parse */
+        if (status != EP_SUCCESS || nob < sizeof (ptl_hdr_t))
+        {
+                /* receives complete with failure when receiver is removed */
+                if (kqswnal_data.kqn_shuttingdown)
+                        return;
+
+                CERROR("receive status failed with status %d nob %d\n",
+                       ep_rxd_status(rxd), nob);
+                kqswnal_requeue_rx (krx);
+                return;
+        }
+
+        atomic_inc (&kqswnal_packets_received);
+
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        list_add_tail (&krx->krx_list, &kqswnal_data.kqn_readyrxds);
+        if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
+                wake_up (&kqswnal_data.kqn_sched_waitq);
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+}
+
+#if KQSW_CHECKSUM
+void
+kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr)
+{
+        ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]);
+
+        CERROR ("%s checksum mismatch %p: dnid "LPX64", snid "LPX64
+                ", dpid %d, spid %d, type %d\n",
+                ishdr ? "Header" : "Payload", krx,
+                NTOH__u64(hdr->dest_nid), NTOH__u64(hdr->src_nid),
+                NTOH__u32(hdr->dest_pid), NTOH__u32(hdr->src_pid),
+                NTOH__u32(hdr->type));
+
+        switch (NTOH__u32 (hdr->type))
+        {
+        case PTL_MSG_ACK:
+                CERROR("ACK: mlen %d dmd "LPX64"."LPX64" match "LPX64
+                       " len %u\n",
+                       NTOH__u32(hdr->msg.ack.mlength),
+                       hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                       hdr->msg.ack.dst_wmd.wh_object_cookie,
+                       NTOH__u64(hdr->msg.ack.match_bits),
+                       NTOH__u32(hdr->msg.ack.length));
+                break;
+        case PTL_MSG_PUT:
+                CERROR("PUT: ptl %d amd "LPX64"."LPX64" match "LPX64
+                       " len %u off %u data "LPX64"\n",
+                       NTOH__u32(hdr->msg.put.ptl_index),
+                       hdr->msg.put.ack_wmd.wh_interface_cookie,
+                       hdr->msg.put.ack_wmd.wh_object_cookie,
+                       NTOH__u64(hdr->msg.put.match_bits),
+                       NTOH__u32(hdr->msg.put.length),
+                       NTOH__u32(hdr->msg.put.offset),
+                       hdr->msg.put.hdr_data);
+                break;
+        case PTL_MSG_GET:
+                CERROR ("GET: <>\n");
+                break;
+        case PTL_MSG_REPLY:
+                CERROR ("REPLY: <>\n");
+                break;
+        default:
+                CERROR ("TYPE?: <>\n");
+        }
+}
+#endif
+
+static int
+kqswnal_recvmsg (nal_cb_t     *nal,
+                 void         *private,
+                 lib_msg_t    *cookie,
+                 unsigned int  niov,
+                 struct iovec *iov,
+                 ptl_kiov_t   *kiov,
+                 size_t        mlen,
+                 size_t        rlen)
+{
+        kqswnal_rx_t *krx = (kqswnal_rx_t *)private;
+        int           page;
+        char         *page_ptr;
+        int           page_nob;
+        char         *iov_ptr;
+        int           iov_nob;
+        int           frag;
+#if KQSW_CHECKSUM
+        kqsw_csum_t   senders_csum;
+        kqsw_csum_t   payload_csum = 0;
+        kqsw_csum_t   hdr_csum = kqsw_csum(0, page_address(krx->krx_pages[0]),
+                                           sizeof(ptl_hdr_t));
+        size_t        csum_len = mlen;
+        int           csum_frags = 0;
+        int           csum_nob = 0;
+        static atomic_t csum_counter;
+        int           csum_verbose = (atomic_read(&csum_counter)%1000001) == 0;
+
+        atomic_inc (&csum_counter);
+
+        memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) +
+                                sizeof (ptl_hdr_t), sizeof (kqsw_csum_t));
+        if (senders_csum != hdr_csum)
+                kqswnal_csum_error (krx, 1);
+#endif
+        CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen);
+
+        /* What was actually received must be >= payload.
+         * This is an LASSERT, as lib_finalize() doesn't have a completion status. */
+        LASSERT (krx->krx_nob >= KQSW_HDR_SIZE + mlen);
+        LASSERT (mlen <= rlen);
+
+        /* It must be OK to kmap() if required */
+        LASSERT (kiov == NULL || !in_interrupt ());
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
+        
+        if (mlen != 0)
+        {
+                page     = 0;
+                page_ptr = ((char *) page_address(krx->krx_pages[0])) +
+                        KQSW_HDR_SIZE;
+                page_nob = PAGE_SIZE - KQSW_HDR_SIZE;
+
+                LASSERT (niov > 0);
+                if (kiov != NULL) {
+                        iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                        iov_nob = kiov->kiov_len;
+                } else {
+                        iov_ptr = iov->iov_base;
+                        iov_nob = iov->iov_len;
+                }
+
+                for (;;)
+                {
+                        /* We expect the iov to exactly match mlen */
+                        LASSERT (iov_nob <= mlen);
+                        
+                        frag = MIN (page_nob, iov_nob);
+                        memcpy (iov_ptr, page_ptr, frag);
+#if KQSW_CHECKSUM
+                        payload_csum = kqsw_csum (payload_csum, iov_ptr, frag);
+                        csum_nob += frag;
+                        csum_frags++;
+#endif
+                        mlen -= frag;
+                        if (mlen == 0)
+                                break;
+
+                        page_nob -= frag;
+                        if (page_nob != 0)
+                                page_ptr += frag;
+                        else
+                        {
+                                page++;
+                                LASSERT (page < krx->krx_npages);
+                                page_ptr = page_address(krx->krx_pages[page]);
+                                page_nob = PAGE_SIZE;
+                        }
+
+                        iov_nob -= frag;
+                        if (iov_nob != 0)
+                                iov_ptr += frag;
+                        else if (kiov != NULL) {
+                                kunmap (kiov->kiov_page);
+                                kiov++;
+                                niov--;
+                                LASSERT (niov > 0);
+                                iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                                iov_nob = kiov->kiov_len;
+                        } else {
+                                iov++;
+                                niov--;
+                                LASSERT (niov > 0);
+                                iov_ptr = iov->iov_base;
+                                iov_nob = iov->iov_len;
+                        }
+                }
+
+                if (kiov != NULL)
+                        kunmap (kiov->kiov_page);
+        }
+
+#if KQSW_CHECKSUM
+        memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) +
+                sizeof(ptl_hdr_t) + sizeof(kqsw_csum_t), sizeof(kqsw_csum_t));
+
+        if (csum_len != rlen)
+                CERROR("Unable to checksum data in user's buffer\n");
+        else if (senders_csum != payload_csum)
+                kqswnal_csum_error (krx, 0);
+
+        if (csum_verbose)
+                CERROR("hdr csum %lx, payload_csum %lx, csum_frags %d, "
+                       "csum_nob %d\n",
+                        hdr_csum, payload_csum, csum_frags, csum_nob);
+#endif
+        lib_finalize(nal, private, cookie);
+
+        kqswnal_requeue_rx (krx);
+
+        return (rlen);
+}
+
+static int
+kqswnal_recv(nal_cb_t     *nal,
+             void         *private,
+             lib_msg_t    *cookie,
+             unsigned int  niov,
+             struct iovec *iov,
+             size_t        mlen,
+             size_t        rlen)
+{
+        return (kqswnal_recvmsg (nal, private, cookie, niov, iov, NULL, mlen, rlen));
+}
+
+static int
+kqswnal_recv_pages (nal_cb_t     *nal,
+                    void         *private,
+                    lib_msg_t    *cookie,
+                    unsigned int  niov,
+                    ptl_kiov_t   *kiov,
+                    size_t        mlen,
+                    size_t        rlen)
+{
+        return (kqswnal_recvmsg (nal, private, cookie, niov, NULL, kiov, mlen, rlen));
+}
+
+int
+kqswnal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long    pid = kernel_thread (fn, arg, 0);
+
+        if (pid < 0)
+                return ((int)pid);
+
+        atomic_inc (&kqswnal_data.kqn_nthreads);
+        return (0);
+}
+
+void
+kqswnal_thread_fini (void)
+{
+        atomic_dec (&kqswnal_data.kqn_nthreads);
+}
+
+int
+kqswnal_scheduler (void *arg)
+{
+        kqswnal_rx_t    *krx;
+        kqswnal_tx_t    *ktx;
+        kpr_fwd_desc_t  *fwd;
+        long             flags;
+        int              rc;
+        int              counter = 0;
+        int              did_something;
+
+        kportal_daemonize ("kqswnal_sched");
+        kportal_blockallsigs ();
+        
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        while (!kqswnal_data.kqn_shuttingdown)
+        {
+                did_something = FALSE;
+
+                if (!list_empty (&kqswnal_data.kqn_readyrxds))
+                {
+                        krx = list_entry(kqswnal_data.kqn_readyrxds.next,
+                                         kqswnal_rx_t, krx_list);
+                        list_del (&krx->krx_list);
+                        spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
+                                               flags);
+
+                        kqswnal_rx (krx);
+
+                        did_something = TRUE;
+                        spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags);
+                }
+
+                if (!list_empty (&kqswnal_data.kqn_delayedtxds))
+                {
+                        ktx = list_entry(kqswnal_data.kqn_delayedtxds.next,
+                                         kqswnal_tx_t, ktx_list);
+                        list_del (&ktx->ktx_list);
+                        spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
+                                               flags);
+
+                        rc = kqswnal_launch (ktx);
+                        if (rc != 0)          /* failed: ktx_nid down? */
+                        {
+                                CERROR("Failed delayed transmit to "LPX64
+                                       ": %d\n", ktx->ktx_nid, rc);
+                                kqswnal_tx_done (ktx, rc);
+                        }
+
+                        did_something = TRUE;
+                        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+                }
+
+                if (!list_empty (&kqswnal_data.kqn_delayedfwds))
+                {
+                        fwd = list_entry (kqswnal_data.kqn_delayedfwds.next, kpr_fwd_desc_t, kprfd_list);
+                        list_del (&fwd->kprfd_list);
+                        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+
+                        kqswnal_fwd_packet (NULL, fwd);
+
+                        did_something = TRUE;
+                        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+                }
+
+                /* nothing to do or hogging CPU */
+                if (!did_something || counter++ == KQSW_RESCHED) {
+                        spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
+                                               flags);
+
+                        counter = 0;
+
+                        if (!did_something) {
+                                rc = wait_event_interruptible (kqswnal_data.kqn_sched_waitq,
+                                                               kqswnal_data.kqn_shuttingdown ||
+                                                               !list_empty(&kqswnal_data.kqn_readyrxds) ||
+                                                               !list_empty(&kqswnal_data.kqn_delayedtxds) ||
+                                                               !list_empty(&kqswnal_data.kqn_delayedfwds));
+                                LASSERT (rc == 0);
+                        } else if (current->need_resched)
+                                schedule ();
+
+                        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+                }
+        }
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+
+        kqswnal_thread_fini ();
+        return (0);
+}
+
+nal_cb_t kqswnal_lib =
+{
+        nal_data:       &kqswnal_data,         /* NAL private data */
+        cb_send:        kqswnal_send,
+        cb_send_pages:  kqswnal_send_pages,
+        cb_recv:        kqswnal_recv,
+        cb_recv_pages:  kqswnal_recv_pages,
+        cb_read:        kqswnal_read,
+        cb_write:       kqswnal_write,
+        cb_malloc:      kqswnal_malloc,
+        cb_free:        kqswnal_free,
+        cb_printf:      kqswnal_printf,
+        cb_cli:         kqswnal_cli,
+        cb_sti:         kqswnal_sti,
+        cb_dist:        kqswnal_dist
+};
diff --git a/lnet/klnds/scimaclnd/.cvsignore b/lnet/klnds/scimaclnd/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lnet/klnds/scimaclnd/Makefile.am b/lnet/klnds/scimaclnd/Makefile.am
new file mode 100644 (file)
index 0000000..6da31f0
--- /dev/null
@@ -0,0 +1,11 @@
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = kscimacnal
+modulenet_DATA = kscimacnal.o
+EXTRA_PROGRAMS = kscimacnal
+
+DEFS =
+kscimacnal_SOURCES = scimacnal.c scimacnal_cb.c scimacnal.h
diff --git a/lnet/klnds/scimaclnd/README.scimacnal b/lnet/klnds/scimaclnd/README.scimacnal
new file mode 100644 (file)
index 0000000..d4c6a49
--- /dev/null
@@ -0,0 +1,14 @@
+
+scimacnal - A NAL for the Scali ScaMAC midlayer.
+
+The ScaMAC midlayer is a simplified API to the SCI high performance
+interconnect.
+
+In order to use this NAL you'll need to tune scimac to use larger buffers.
+See scimac.conf in this directory for an example.
+
+Overall performance and stability aren't great, but this can be attributed
+to the scimac driver, which apparently needs further development.
+
+TODO:
+Routing isn't yet implemented.
diff --git a/lnet/klnds/scimaclnd/scimac.conf b/lnet/klnds/scimaclnd/scimac.conf
new file mode 100644 (file)
index 0000000..bfb6d02
--- /dev/null
@@ -0,0 +1,35 @@
+#  Configuration file for the scimac driver - lustre friendly settings
+#
+
+#  The maximal number of message headers to use in the system.
+scimac_max_no_hdrs = 32
+
+#  The maximal number of eager buffers to use in the system.
+scimac_max_no_ebufs = 8
+
+#  The maximal size in bytes of each eager buffer.
+scimac_max_ebuf_size = 65536
+
+#  Enable use of a kernel thread to defer reception of packets.
+#  Default is to use a tasklet (sw interrupt).
+scimac_use_ulevel_recv = 1
+
+#  The maximal number of packets queued for transfer per path at any one time. 
+scimac_max_send_queuelen = 2000
+
+#  The packet retransmit time in milliseconds.
+#  The time that elapses after a send attempt before the packet is resent.
+scimac_pkt_rexmit_time = 200
+
+#  The packet's maximal retransmit time in milliseconds.
+#  The total time that sending a packet will be retried before it is dropped.
+scimac_max_rexmit_time = 5000
+
+#  The lowest valid node identifier in the system.
+scimac_min_nodeid_number = 0x100
+
+#  The largest valid node identifier in the system.
+scimac_max_nodeid_number = 0xff00
+
+#  The incremental nodeid step in the system.
+scimac_nodeid_increment = 0x100
diff --git a/lnet/klnds/scimaclnd/scimacnal.c b/lnet/klnds/scimaclnd/scimacnal.c
new file mode 100644 (file)
index 0000000..1066d69
--- /dev/null
@@ -0,0 +1,219 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:cindent:
+ *
+ * Copyright (C) 2003 High Performance Computing Center North (HPC2N)
+ *   Author: Niklas Edmundsson <nikke@hpc2n.umu.se>
+
+ * Based on gmnal, which is based on ksocknal and qswnal
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+#include "scimacnal.h"
+
+ptl_handle_ni_t kscimacnal_ni;
+nal_t  kscimacnal_api;
+
+kscimacnal_data_t kscimacnal_data;
+
+kpr_nal_interface_t kscimacnal_router_interface = {
+        kprni_nalid:    SCIMACNAL,
+        kprni_arg:      NULL,
+        kprni_fwd:      kscimacnal_fwd_packet,
+};
+
+
+static int kscimacnal_forward(nal_t   *nal,
+                          int     id,
+                          void    *args,  size_t args_len,
+                          void    *ret,   size_t ret_len)
+{
+        kscimacnal_data_t *ksci = nal->nal_data;
+        nal_cb_t      *nal_cb = ksci->ksci_cb;
+
+        LASSERT (nal == &kscimacnal_api);
+        LASSERT (ksci == &kscimacnal_data);
+        LASSERT (nal_cb == &kscimacnal_lib);
+
+        lib_dispatch(nal_cb, ksci, id, args, ret); /* nal needs ksci */
+        return PTL_OK;
+}
+
+
+static void kscimacnal_lock(nal_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *ksci = nal->nal_data;
+        nal_cb_t      *nal_cb = ksci->ksci_cb;
+
+
+        LASSERT (nal == &kscimacnal_api);
+        LASSERT (ksci == &kscimacnal_data);
+        LASSERT (nal_cb == &kscimacnal_lib);
+
+        nal_cb->cb_cli(nal_cb,flags);
+}
+
+
+static void kscimacnal_unlock(nal_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *ksci = nal->nal_data;
+        nal_cb_t      *nal_cb = ksci->ksci_cb;
+
+
+        LASSERT (nal == &kscimacnal_api);
+        LASSERT (ksci == &kscimacnal_data);
+        LASSERT (nal_cb == &kscimacnal_lib);
+
+        nal_cb->cb_sti(nal_cb,flags);
+}
+
+
+static int kscimacnal_shutdown(nal_t *nal, int ni)
+{
+        LASSERT (nal == &kscimacnal_api);
+        return 0;
+}
+
+
+static void kscimacnal_yield( nal_t *nal )
+{
+        LASSERT (nal == &kscimacnal_api);
+
+        if (current->need_resched) 
+                schedule();
+        return;
+}
+
+
+static nal_t *kscimacnal_init(int interface, ptl_pt_index_t  ptl_size,
+                ptl_ac_index_t  ac_size, ptl_pid_t requested_pid)
+{
+        int     nnids = 512; /* FIXME: Need ScaMac function to get #nodes */
+
+        CDEBUG(D_NET, "calling lib_init with nid 0x%Lx nnids %d\n", kscimacnal_data.ksci_nid, nnids);
+        lib_init(&kscimacnal_lib, kscimacnal_data.ksci_nid, 0, nnids,ptl_size, ac_size); 
+        return &kscimacnal_api;
+}
+
+
+/* Called by kernel at module unload time */
+static void __exit 
+kscimacnal_finalize(void)
+{
+        /* FIXME: How should the shutdown procedure really look? */
+        kscimacnal_data.ksci_shuttingdown=1;
+
+        PORTAL_SYMBOL_UNREGISTER(kscimacnal_ni);
+
+        PtlNIFini(kscimacnal_ni);
+        lib_fini(&kscimacnal_lib);
+
+        mac_finish(kscimacnal_data.ksci_machandle);
+
+        CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory));
+
+        return;
+}
+
+
+/* Called by kernel at module insertion time */
+static int __init
+kscimacnal_initialize(void)
+{
+        int rc;
+        unsigned long     nid=0;
+        mac_handle_t    *machandle = NULL;
+
+
+        CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory));
+
+        kscimacnal_api.forward = kscimacnal_forward;
+        kscimacnal_api.shutdown = kscimacnal_shutdown;
+        kscimacnal_api.yield = kscimacnal_yield;
+        kscimacnal_api.validate = NULL;         /* our api validate is a NOOP */
+        kscimacnal_api.lock= kscimacnal_lock;
+        kscimacnal_api.unlock= kscimacnal_unlock;
+        kscimacnal_api.nal_data = &kscimacnal_data;
+
+        kscimacnal_lib.nal_data = &kscimacnal_data;
+
+        memset(&kscimacnal_data, 0, sizeof(kscimacnal_data));
+
+        kscimacnal_data.ksci_cb = &kscimacnal_lib;
+
+        /* We're not using this, but the cli/sti callbacks do... ??? */
+        spin_lock_init(&kscimacnal_data.ksci_dispatch_lock);
+
+        /* FIXME: We only support one adapter for now */
+        machandle = mac_init(0, MAC_SAPID_LUSTRE, kscimacnal_rx,
+                        &kscimacnal_data);
+
+        if(!machandle) {
+                CERROR("mac_init() failed\n");
+                return -1;
+        }
+
+        kscimacnal_data.ksci_machandle = machandle;
+
+        /* Make sure the scimac MTU is tuned */
+        if(mac_get_mtusize(machandle) < SCIMACNAL_MTU) {
+                CERROR("scimac mtu of %ld smaller than SCIMACNAL MTU of %d\n",
+                                mac_get_mtusize(machandle), SCIMACNAL_MTU);
+                CERROR("Consult README.scimacnal for more information\n");
+                mac_finish(machandle);
+                return -1;
+        }
+
+        /* Get the node ID */
+        /* mac_get_physaddrlen() is a function instead of a define, sigh */
+        LASSERT(mac_get_physaddrlen(machandle) <= sizeof(nid));
+        if(mac_get_physaddr(machandle, (mac_physaddr_t *) &nid)) {
+                CERROR("mac_get_physaddr() failed\n");
+                mac_finish(machandle);
+                return -1;
+        }
+        nid = ntohl(nid);
+        kscimacnal_data.ksci_nid = nid;
+
+
+        /* Initialize Network Interface */
+        /* FIXME: What do the magic numbers mean? Documentation anyone? */
+        rc = PtlNIInit(kscimacnal_init, 32, 4, 0, &kscimacnal_ni);
+        if (rc) {
+                CERROR("PtlNIInit failed %d\n", rc);
+                mac_finish(machandle);
+                return (-ENOMEM);
+        }
+
+        PORTAL_SYMBOL_REGISTER(kscimacnal_ni);
+
+        /* We're done now, it's OK for the RX callback to do stuff */
+        kscimacnal_data.ksci_init = 1;
+
+        return 0;
+}
+
+
+MODULE_AUTHOR("Niklas Edmundsson <nikke@hpc2n.umu.se>");
+MODULE_DESCRIPTION("Kernel Scali ScaMAC SCI NAL v0.0");
+MODULE_LICENSE("GPL");
+
+module_init (kscimacnal_initialize);
+module_exit (kscimacnal_finalize);
+
+EXPORT_SYMBOL(kscimacnal_ni);
diff --git a/lnet/klnds/scimaclnd/scimacnal.h b/lnet/klnds/scimaclnd/scimacnal.h
new file mode 100644 (file)
index 0000000..1ff180e
--- /dev/null
@@ -0,0 +1,85 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:cindent:
+ *
+ * Copyright (C) 2003 High Performance Computing Center North (HPC2N)
+ *   Author: Niklas Edmundsson <nikke@hpc2n.umu.se>
+ */
+
+
+#ifndef _SCIMACNAL_H
+#define _SCIMACNAL_H
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/locks.h>
+#include <linux/unistd.h>
+#include <linux/init.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+#include <asm/page.h>            /* For PAGE_SIZE */
+
+#define DEBUG_SUBSYSTEM S_UNDEFINED
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#include <scamac.h>
+
+#ifndef MAC_SAPID_LUSTRE
+#define MAC_SAPID_LUSTRE MAC_SAPID_TEST1
+#endif /* MAC_SAPID_LUSTRE */
+
+#define SCIMACNAL_MTU 65536
+/* FIXME: What is really the MTU of lustre? */
+#if PTL_MD_MAX_IOV*PAGE_SIZE > SCIMACNAL_MTU
+#error Max MTU of ScaMAC is 64k, PTL_MD_MAX_IOV*PAGE_SIZE is bigger.
+#endif
+
+typedef struct {
+        mac_handle_t    *handle;
+        mac_mblk_t      *msg;
+        mac_msg_type_t   type;
+        void            *userdata;
+}  kscimacnal_rx_t;
+
+
+typedef struct {
+        nal_cb_t        *ktx_nal;
+        void            *ktx_private;
+        lib_msg_t       *ktx_cookie;
+        ptl_hdr_t       ktx_hdr;
+}  kscimacnal_tx_t;
+
+
+typedef struct {
+        char              ksci_init;
+        char              ksci_shuttingdown;
+        ptl_nid_t         ksci_nid;
+        nal_cb_t         *ksci_cb;
+        spinlock_t        ksci_dispatch_lock;
+        mac_handle_t     *ksci_machandle;
+}  kscimacnal_data_t;
+
+extern kscimacnal_data_t   kscimacnal_data;
+extern nal_t            kscimacnal_api;
+extern nal_cb_t         kscimacnal_lib;
+
+void kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+void kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type, void *userdata);
+
+
+#endif  /* _SCIMACNAL_H */
diff --git a/lnet/klnds/scimaclnd/scimacnal_cb.c b/lnet/klnds/scimaclnd/scimacnal_cb.c
new file mode 100644 (file)
index 0000000..7e4a2e8
--- /dev/null
@@ -0,0 +1,468 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:cindent:
+ *
+ * Copyright (C) 2003 High Performance Computing Center North (HPC2N)
+ *   Author: Niklas Edmundsson <nikke@hpc2n.umu.se>
+
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "scimacnal.h"
+
+static int 
+kscimacnal_read (nal_cb_t *nal, void *private,
+                void *dst_addr, user_ptr src_addr, size_t len)
+{
+        CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+
+static int 
+kscimacnal_write(nal_cb_t *nal, void *private,
+                user_ptr dst_addr, void *src_addr, size_t len)
+{
+        CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+
+static void *
+kscimacnal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+        return buf;
+}
+
+
+static void 
+kscimacnal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+
+static void 
+kscimacnal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+        va_list         ap;
+        char msg[256]; 
+
+        if (portal_debug & D_NET) {
+                va_start( ap, fmt );
+                vsnprintf( msg, sizeof(msg), fmt, ap );
+                va_end( ap );
+
+                printk("CPUId: %d %s",smp_processor_id(), msg);
+        }
+}
+
+
+static void 
+kscimacnal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *data= nal->nal_data;
+
+        spin_lock_irqsave(&data->ksci_dispatch_lock,*flags);
+}
+
+
+static void 
+kscimacnal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *data= nal->nal_data; 
+
+        spin_unlock_irqrestore(&data->ksci_dispatch_lock,*flags);
+}
+
+
+static int 
+kscimacnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* FIXME: Network distance has a meaning, but there is no easy
+         * way to figure it out (it depends on routing) */
+
+        if ( nal->ni.nid == nid ) {
+                *dist = 0;
+        } else {
+                *dist = 1;
+        }
+
+        return 0;
+}
+
+
+static
+char * get_mac_error(mac_status_t status) 
+{
+        switch(status) {
+                case MAC_MSG_STAT_OK:
+                        return "MAC_MSG_STAT_OK";
+                case MAC_MSG_STAT_FREED:
+                        return "MAC_MSG_STAT_FREED";
+                case MAC_MSG_STAT_ABORTED:
+                        return "MAC_MSG_STAT_ABORTED";
+                case MAC_MSG_STAT_TIMEDOUT:
+                        return "MAC_MSG_STAT_TIMEDOUT";
+                case MAC_MSG_STAT_NODEUNREACH:
+                        return "MAC_MSG_STAT_NODEUNREACH";
+                case MAC_MSG_STAT_NETDOWN:
+                        return "MAC_MSG_STAT_NETDOWN";
+                case MAC_MSG_STAT_RESET:
+                        return "MAC_MSG_STAT_RESET";
+                case MAC_MSG_STAT_INITFAILED:
+                        return "MAC_MSG_STAT_INITFAILED";
+                case MAC_MSG_STAT_SYNCFAILED:
+                        return "MAC_MSG_STAT_SYNCFAILED";
+                case MAC_MSG_STAT_BADPROTO:
+                        return "MAC_MSG_STAT_BADPROTO";
+                case MAC_MSG_STAT_NOBUFSPACE:
+                        return "MAC_MSG_STAT_NOBUFSPACE";
+                case MAC_MSG_STAT_CONGESTION:
+                        return "MAC_MSG_STAT_CONGESTION";
+                case MAC_MSG_STAT_OTHER:
+                        return "MAC_MSG_STAT_OTHER";
+                default:
+                        return "Unknown error";
+        }
+}
+
+
+/* FIXME add routing code here ? */
+
+/* Called by ScaMac when transmission is complete  (ie. message is released) */
+static void 
+kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context)
+{
+        kscimacnal_tx_t *ktx = (kscimacnal_tx_t *)context;
+        int err=0;
+        
+        LASSERT (ktx != NULL);
+
+        /* Euh, there is no feedback when transmission fails?! */
+        switch(status) {
+                case MAC_MSG_STAT_OK:        /* normal */
+                        break;
+                default:
+                        CERROR("%s (%d):\n", get_mac_error(status), status);
+                        err = -EIO;
+                        break;
+        }
+
+        lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie);
+
+        PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+}
+
+
+/* Called by portals when it wants to send a message.
+ * Since ScaMAC has its own TX thread we don't bother setting up our own. */
+static int 
+kscimacnal_send(nal_cb_t        *nal,
+           void            *private,
+           lib_msg_t       *cookie,
+           ptl_hdr_t       *hdr,
+           int              type, 
+           ptl_nid_t        nid,
+           ptl_pid_t        pid,
+           unsigned int     payload_niov,
+           struct iovec    *payload_iov,
+           size_t           payload_len)
+{
+        kscimacnal_tx_t    *ktx=NULL;
+        kscimacnal_data_t  *ksci = nal->nal_data;
+        int              rc=0;
+        int              buf_len = sizeof(ptl_hdr_t) + payload_len;
+        mac_mblk_t      *msg=NULL, *lastblk, *newblk;
+        unsigned long   physaddr;
+        
+
+        CDEBUG(D_NET, "sending %d bytes from %p to nid 0x%Lx niov: %d\n",
+               payload_len, payload_iov, nid, payload_niov);
+
+        LASSERT(ksci != NULL);
+
+        LASSERT(hdr != NULL);
+
+        /* Do real check if we can send this */
+        if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) {
+                CERROR("kscimacnal:request exceeds TX MTU size (%ld).\n",
+                                mac_get_mtusize(ksci->ksci_machandle));
+                return -EINVAL;
+        }
+
+
+        /* save transaction info for later finalize and cleanup */
+        PORTAL_ALLOC(ktx, (sizeof(kscimacnal_tx_t)));
+        if (!ktx) {
+                return -ENOMEM;
+        }
+
+        /* *SIGH* hdr is a stack variable in the calling function, so we
+         * need to copy it to a buffer. Zerocopy magic (or is it just
+         * deferred memcpy?) is annoying sometimes.  */
+        memcpy(&ktx->ktx_hdr, hdr, sizeof(ptl_hdr_t));
+
+        /* First, put the header in the main message mblk */
+        msg = mac_alloc_mblk(&ktx->ktx_hdr, sizeof(ptl_hdr_t),
+                        kscimacnal_txrelease, ktx);
+        if (!msg) {
+                PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+                return -ENOMEM;
+        }
+        mac_put_mblk(msg, sizeof(ptl_hdr_t));
+        lastblk=msg;
+
+        /* Allocate additional mblks for each iov as needed.
+         * Essentially lib_copy_iov2buf with a twist or two */
+        while (payload_len > 0)
+        {
+                ptl_size_t nob;
+
+                LASSERT (payload_niov > 0);
+
+                nob = MIN (payload_iov->iov_len, payload_len);
+
+                /* We don't need a callback on the additional mblks, since
+                 * all release callbacks seem to be called when the entire
+                 * message has been sent */
+                newblk=mac_alloc_mblk(payload_iov->iov_base, nob, NULL, NULL);
+                if(!newblk) {
+                        mac_free_msg(msg);
+                        PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+                        return -ENOMEM;
+                }
+                mac_put_mblk(newblk, nob);
+                mac_link_mblk(lastblk, newblk);
+                lastblk=newblk;
+
+                payload_len -= nob;
+                payload_niov--;
+                payload_iov++;
+        }
+
+        ktx->ktx_nal = nal;
+        ktx->ktx_private = private;
+        ktx->ktx_cookie = cookie;
+
+        CDEBUG(D_NET, "mac_send %d bytes to nid: 0x%Lx\n", buf_len, nid);
+
+        physaddr = htonl(nid);
+
+        if((rc=mac_send(ksci->ksci_machandle, msg,
+                                        (mac_physaddr_t *) &physaddr))) {
+                CERROR("kscimacnal: mac_send() failed, rc=%d\n", rc);
+                mac_free_msg(msg);
+                PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+                return rc;
+        }
+
+        return 0;
+}
+
+
+void
+kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        CERROR ("forwarding not implemented\n");
+}
+
+
+/* Process a received portals packet */
+/* Called by the ScaMac RX thread when a packet is received */
+void
+kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type,
+                void *userdata)
+{
+        ptl_hdr_t       *hdr = NULL;
+        kscimacnal_rx_t     krx; 
+        mac_size_t       size;
+        kscimacnal_data_t  *ksci = userdata;
+
+        LASSERT(ksci != NULL);
+
+        if ( !ksci->ksci_init || ksci->ksci_shuttingdown || 
+                    type == MAC_MSG_TYPE_CTRL || type == MAC_MSG_TYPE_OTHER ) {
+                /* We're not interested in messages not for us, ignore */
+                mac_free_msg(msg);
+                return;
+        }
+
+        size = mac_msg_size(msg);
+
+        CDEBUG(D_NET,"msg %p type %d, size %ld bytes (%ld mblks)\n", 
+                        msg, type, size, mac_msg_mblks(msg));
+
+        if( size < sizeof( ptl_hdr_t ) ) {
+                /* XXX what's this for? */
+                if (ksci->ksci_shuttingdown)
+                        return;
+                CERROR("kscimacnal: did not receive complete portal header,"
+                                "size= %ld\n", size);
+                /* Free the message before exiting */
+                mac_free_msg(msg);
+                return;
+        }
+
+        /* Provide everything we know */
+        krx.handle = handle;
+        krx.msg = msg;
+        krx.type = type;
+        krx.userdata = userdata;
+
+        /* mac_msg_next returns the next mblk with unread data */
+        hdr = mac_get_mblk(mac_msg_next(msg), sizeof(ptl_hdr_t) );
+
+        if(!hdr) {
+                CERROR("kscimacnal: no data block in message %p\n", msg);
+                mac_free_msg(msg);
+                return;
+        }
+
+        if ( hdr->dest_nid == kscimacnal_lib.ni.nid ) {
+                PROF_START(lib_parse);
+                /* sets wanted_len, iovs etc and calls our callback */
+                lib_parse(&kscimacnal_lib, hdr, &krx);
+                PROF_FINISH(lib_parse);
+#if 0 /* FIXME: Is it possible to detect this? */
+        } else if (kgmnal_ispeer(hdr->dest_nid)) {
+                /* should have gone direct to peer */
+                CERROR("dropping packet from 0x%llx to 0x%llx:"
+                                "target is a  peer\n",
+                                hdr->src_nid, hdr->dest_nid);
+                kgmnal_requeue_rx(&krx);
+#endif /* if 0 FIXME */
+        } else {
+                /* forward to gateway */
+                CERROR("forwarding not implemented, mynid=0x%llx dest=0x%llx\n",
+                                kscimacnal_lib.ni.nid, hdr->dest_nid);
+        }
+
+        mac_free_msg(msg);
+
+        CDEBUG(D_NET, "msg %p: Done\n", msg);
+}
+
+
+/* Called by portals to process a received packet */
+static int kscimacnal_recv(nal_cb_t     *nal, 
+                      void         *private, 
+                      lib_msg_t    *cookie, 
+                      unsigned int  niov, 
+                      struct iovec *iov, 
+                      size_t        mlen, 
+                      size_t        rlen)
+{
+        kscimacnal_rx_t    *krx = private;
+        mac_mblk_t      *mblk;
+        void            *src;
+        mac_size_t       pkt_len;
+        ptl_size_t       iovused=0;
+
+        LASSERT (krx != NULL);
+        LASSERT (krx->msg != NULL);
+
+        CDEBUG(D_NET,"msg %p: mlen=%d, rlen=%d, niov=%d\n",
+                        krx->msg, mlen, rlen, niov);
+
+        /* What was actually received must be >= what sender claims to have
+         * sent.  This is an LASSERT, since lib-move doesn't check cb return
+         * code yet. Also, rlen seems to be negative when mlen==0 so don't
+         * assert on that.
+         */
+        LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen);
+        LASSERT (mlen==0 || mlen <= rlen);
+
+        PROF_START(memcpy);
+
+        /* mac_msg_next returns the next mblk with unread data (ie. it can
+         * be the same mblk) */
+        while (mlen != 0 && (mblk = mac_msg_next(krx->msg))) {
+                pkt_len = mac_mblk_len(mblk);
+                src = mac_get_mblk(mblk, pkt_len); /* Next unread block */
+
+                CDEBUG(D_NET,"msg %p: mblk: %p pkt_len: %ld  src: %p\n",
+                                krx->msg, mblk, pkt_len, src);
+
+                LASSERT(src != NULL);
+
+                /* Essentially lib_copy_buf2iov but with continuation support,
+                 * we "gracefully" thrash the argument vars ;) */
+                while (pkt_len > 0) {
+                        ptl_size_t nob;
+
+                        LASSERT (niov > 0);
+
+                        LASSERT(iovused < iov->iov_len);
+
+                        nob = MIN (iov->iov_len-iovused, pkt_len);
+                        CDEBUG(D_NET, "iovbase: %p iovlen: %d src: %p  nob: %d "
+                                        "iovused: %d\n",
+                                        iov->iov_base, iov->iov_len,
+                                        src, nob, iovused);
+
+                        memcpy (iov->iov_base+iovused, src, nob);
+                        pkt_len -= nob;
+                        src += nob;
+
+                        if(nob+iovused < iov->iov_len) {
+                                /* We didn't use all of the iov */
+                                iovused+=nob;
+                        }
+                        else {
+                                niov--;
+                                iov++;
+                                iovused=0;
+                        }
+                }
+        }
+        PROF_FINISH(memcpy);
+
+        CDEBUG(D_NET, "Calling lib_finalize.\n");
+
+        PROF_START(lib_finalize);
+        lib_finalize(nal, private, cookie);
+        PROF_FINISH(lib_finalize);
+
+        CDEBUG(D_NET, "Done.\n");
+
+        return rlen;
+}
+
+
+nal_cb_t kscimacnal_lib = {
+        nal_data:       &kscimacnal_data,               /* NAL private data */
+        cb_send:         kscimacnal_send,
+        cb_send_pages:   NULL,                  /* Ignore for now */
+        cb_recv:         kscimacnal_recv,
+        cb_recv_pages:   NULL,
+        cb_read:         kscimacnal_read,
+        cb_write:        kscimacnal_write,
+        cb_malloc:       kscimacnal_malloc,
+        cb_free:         kscimacnal_free,
+        cb_printf:       kscimacnal_printf,
+        cb_cli:          kscimacnal_cli,
+        cb_sti:          kscimacnal_sti,
+        cb_dist:         kscimacnal_dist
+};
diff --git a/lnet/klnds/socklnd/.cvsignore b/lnet/klnds/socklnd/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lnet/klnds/socklnd/Makefile.am b/lnet/klnds/socklnd/Makefile.am
new file mode 100644 (file)
index 0000000..437d7fc
--- /dev/null
@@ -0,0 +1,13 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = ksocknal
+modulenet_DATA = ksocknal.o
+EXTRA_PROGRAMS = ksocknal
+
+DEFS =
+ksocknal_SOURCES = socknal.c socknal_cb.c socknal.h
diff --git a/lnet/klnds/socklnd/Makefile.mk b/lnet/klnds/socklnd/Makefile.mk
new file mode 100644 (file)
index 0000000..46edf01
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Kernelenv
+
+obj-y += ksocknal.o
+ksocknal-objs    := socknal.o socknal_cb.o
+
diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c
new file mode 100644 (file)
index 0000000..91d971c
--- /dev/null
@@ -0,0 +1,860 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socknal.h"
+
+ptl_handle_ni_t         ksocknal_ni;
+static nal_t            ksocknal_api;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+ksock_nal_data_t ksocknal_data;
+#else
+static ksock_nal_data_t ksocknal_data;
+#endif
+
+kpr_nal_interface_t ksocknal_router_interface = {
+        kprni_nalid:      SOCKNAL,
+        kprni_arg:        &ksocknal_data,
+        kprni_fwd:        ksocknal_fwd_packet,
+};
+
+
+int
+ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len,
+                       void *ret, size_t ret_len)
+{
+        ksock_nal_data_t *k;
+        nal_cb_t *nal_cb;
+
+        k = nal->nal_data;
+        nal_cb = k->ksnd_nal_cb;
+
+        lib_dispatch(nal_cb, k, id, args, ret); /* ksocknal_send needs k */
+        return PTL_OK;
+}
+
+int
+ksocknal_api_shutdown(nal_t *nal, int ni)
+{
+        CDEBUG (D_NET, "closing all connections\n");
+
+        return ksocknal_close_sock(0);          /* close all sockets */
+}
+
+void
+ksocknal_api_yield(nal_t *nal)
+{
+        our_cond_resched();
+        return;
+}
+
+void
+ksocknal_api_lock(nal_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *k;
+        nal_cb_t *nal_cb;
+
+        k = nal->nal_data;
+        nal_cb = k->ksnd_nal_cb;
+        nal_cb->cb_cli(nal_cb,flags);
+}
+
+void
+ksocknal_api_unlock(nal_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *k;
+        nal_cb_t *nal_cb;
+
+        k = nal->nal_data;
+        nal_cb = k->ksnd_nal_cb;
+        nal_cb->cb_sti(nal_cb,flags);
+}
+
+nal_t *
+ksocknal_init(int interface, ptl_pt_index_t ptl_size,
+              ptl_ac_index_t ac_size, ptl_pid_t requested_pid)
+{
+        CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", (ptl_nid_t)0);
+        lib_init(&ksocknal_lib, (ptl_nid_t)0, 0, 10, ptl_size, ac_size);
+        return (&ksocknal_api);
+}
+
+/*
+ *  EXTRA functions follow
+ */
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#define SOCKET_I(inode) (&(inode)->u.socket_i)
+#endif
+static __inline__ struct socket *
+socki_lookup(struct inode *inode)
+{
+        return SOCKET_I(inode);
+}
+
+int
+ksocknal_set_mynid(ptl_nid_t nid)
+{
+        lib_ni_t *ni = &ksocknal_lib.ni;
+
+        /* FIXME: we have to do this because we call lib_init() at module
+         * insertion time, which is before we have 'mynid' available.  lib_init
+         * sets the NAL's nid, which it uses to tell other nodes where packets
+         * are coming from.  This is not a very graceful solution to this
+         * problem. */
+
+        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
+               nid, ni->nid);
+
+        ni->nid = nid;
+        return (0);
+}
+
+void
+ksocknal_bind_irq (unsigned int irq, int cpu)
+{
+#if (defined(CONFIG_SMP) && CPU_AFFINITY)
+        char  cmdline[64];
+        char *argv[] = {"/bin/sh",
+                        "-c",
+                        cmdline,
+                        NULL};
+        char *envp[] = {"HOME=/",
+                        "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+                        NULL};
+
+        snprintf (cmdline, sizeof (cmdline),
+                  "echo %d > /proc/irq/%u/smp_affinity", 1 << cpu, irq);
+
+        printk (KERN_INFO "Binding irq %u to CPU %d with cmd: %s\n",
+                irq, cpu, cmdline);
+
+        /* FIXME: Find a better method of setting IRQ affinity...
+         */
+
+        call_usermodehelper (argv[0], argv, envp);
+#endif
+}
+
+int
+ksocknal_add_sock (ptl_nid_t nid, int fd, int bind_irq)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        struct file       *file = NULL;
+        struct socket     *sock = NULL;
+        ksock_sched_t     *sched = NULL;
+        unsigned int       irq = 0;
+        struct net_device *dev = NULL;
+        int                ret;
+        int                idx;
+        ENTRY;
+
+        LASSERT (!in_interrupt());
+
+        file = fget(fd);
+        if (file == NULL)
+                RETURN(-EINVAL);
+
+        ret = -EINVAL;
+        sock = socki_lookup(file->f_dentry->d_inode);
+        if (sock == NULL)
+                GOTO(error, ret);
+
+        ret = -ENOMEM;
+        PORTAL_ALLOC(conn, sizeof(*conn));
+        if (!conn)
+                GOTO(error, ret);
+
+        sock->sk->allocation = GFP_NOFS;    /* don't call into fs for alloc */
+
+        conn->ksnc_file = file;
+        conn->ksnc_sock = sock;
+        conn->ksnc_saved_data_ready = sock->sk->data_ready;
+        conn->ksnc_saved_write_space = sock->sk->write_space;
+        conn->ksnc_peernid = nid;
+        atomic_set (&conn->ksnc_refcount, 1);    /* 1 ref for socklist */
+
+        conn->ksnc_rx_ready = 0;
+        conn->ksnc_rx_scheduled = 0;
+        ksocknal_new_packet (conn, 0);
+
+        INIT_LIST_HEAD (&conn->ksnc_tx_queue);
+        conn->ksnc_tx_ready = 0;
+        conn->ksnc_tx_scheduled = 0;
+
+#warning check it is OK to dereference sk->dst_cache->dev like this...
+        lock_sock (conn->ksnc_sock->sk);
+
+        if (conn->ksnc_sock->sk->dst_cache != NULL) {
+                dev = conn->ksnc_sock->sk->dst_cache->dev;
+                if (dev != NULL) {
+                        irq = dev->irq;
+                        if (irq >= NR_IRQS) {
+                                CERROR ("Unexpected IRQ %x\n", irq);
+                                irq = 0;
+                        }
+                }
+        }
+
+        release_sock (conn->ksnc_sock->sk);
+
+        write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags);
+
+        if (irq == 0 ||
+            ksocknal_data.ksnd_irq_info[irq] == SOCKNAL_IRQ_UNASSIGNED) {
+                /* This is a software NIC, or we haven't associated it with
+                 * a CPU yet */
+
+                /* Choose the CPU with the fewest connections */
+                sched = ksocknal_data.ksnd_schedulers;
+                for (idx = 1; idx < SOCKNAL_N_SCHED; idx++)
+                        if (sched->kss_nconns >
+                            ksocknal_data.ksnd_schedulers[idx].kss_nconns)
+                                sched = &ksocknal_data.ksnd_schedulers[idx];
+
+                if (irq != 0) {                 /* Hardware NIC */
+                        /* Remember which scheduler we chose */
+                        idx = sched - ksocknal_data.ksnd_schedulers;
+
+                        LASSERT (idx < SOCKNAL_IRQ_SCHED_MASK);
+
+                        if (bind_irq)       /* remember if we will bind below */
+                                idx |= SOCKNAL_IRQ_BOUND;
+
+                        ksocknal_data.ksnd_irq_info[irq] = idx;
+                }
+        } else { 
+                /* This is a hardware NIC, associated with a CPU */
+                idx = ksocknal_data.ksnd_irq_info[irq];
+
+                /* Don't bind again if we've bound already */
+                if ((idx & SOCKNAL_IRQ_BOUND) != 0)
+                        bind_irq = 0;
+                
+                sched = &ksocknal_data.ksnd_schedulers[idx & SOCKNAL_IRQ_SCHED_MASK];
+        }
+
+        sched->kss_nconns++;
+        conn->ksnc_scheduler = sched;
+
+        list_add(&conn->ksnc_list, &ksocknal_data.ksnd_socklist);
+
+        write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags);
+
+        if (bind_irq &&                         /* irq binding required */
+            irq != 0)                           /* hardware NIC */
+                ksocknal_bind_irq (irq, sched - ksocknal_data.ksnd_schedulers);
+
+        /* NOW it's safe to get called back when socket is ready... */
+        sock->sk->user_data = conn;
+        sock->sk->data_ready = ksocknal_data_ready;
+        sock->sk->write_space = ksocknal_write_space;
+
+        /* ...which I call right now to get things going */
+        ksocknal_data_ready (sock->sk, 0);
+        ksocknal_write_space (sock->sk);
+
+        CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n",
+               conn, conn->ksnc_peernid);
+
+        /* Can't unload while connection active */
+        PORTAL_MODULE_USE;
+        RETURN(0);
+
+error:
+        fput(file);
+        return (ret);
+}
+
+/* Passing in a zero nid will close all connections */
+int
+ksocknal_close_sock(ptl_nid_t nid)
+{
+        long               flags;
+        ksock_conn_t      *conn;
+        LIST_HEAD         (death_row);
+        struct list_head  *tmp;
+
+        LASSERT (!in_interrupt());
+        write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags);
+
+        if (nid == 0) {                         /* close ALL connections */
+                /* insert 'death row' into the socket list... */
+                list_add (&death_row, &ksocknal_data.ksnd_socklist);
+                /* ...extract and reinitialise the socket list itself... */
+                list_del_init (&ksocknal_data.ksnd_socklist);
+                /* ...and voila, death row is the proud owner of all conns */
+        } else list_for_each (tmp, &ksocknal_data.ksnd_socklist) {
+
+                conn = list_entry (tmp, ksock_conn_t, ksnc_list);
+
+                if (conn->ksnc_peernid == nid) {
+                        list_del (&conn->ksnc_list);
+                        list_add (&conn->ksnc_list, &death_row);
+                        break;
+                }
+        }
+
+        write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags);
+
+        if (nid && list_empty (&death_row))
+                return (-ENOENT);
+
+        while (!list_empty (&death_row)) {
+                conn = list_entry (death_row.next, ksock_conn_t, ksnc_list);
+                list_del (&conn->ksnc_list);
+
+                /* NB I _have_ to restore the callback, rather than storing
+                 * a noop, since the socket could survive past this module
+                 * being unloaded!! */
+                conn->ksnc_sock->sk->data_ready = conn->ksnc_saved_data_ready;
+                conn->ksnc_sock->sk->write_space = conn->ksnc_saved_write_space;
+
+                /* OK; no more callbacks, but they could be in progress now,
+                 * so wait for them to complete... */
+                write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags);
+
+                /* ...however if I get the lock before a callback gets it,
+                 * this will make them noop
+                 */
+                conn->ksnc_sock->sk->user_data = NULL;
+
+                /* And drop the scheduler's connection count while I've got
+                 * the exclusive lock */
+                conn->ksnc_scheduler->kss_nconns--;
+
+                write_unlock_irqrestore(&ksocknal_data.ksnd_socklist_lock,
+                                        flags);
+
+                ksocknal_put_conn (conn);       /* drop ref for ksnd_socklist */
+        }
+
+        return (0);
+}
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+struct tcp_opt *sock2tcp_opt(struct sock *sk)
+{
+        return &(sk->tp_pinfo.af_tcp);
+}
+#else
+struct tcp_opt *sock2tcp_opt(struct sock *sk)
+{
+        struct tcp_sock *s = (struct tcp_sock *)sk;
+        return &s->tcp;
+}
+#endif
+
+void
+ksocknal_push_conn (ksock_conn_t *conn)
+{
+        struct sock    *sk = conn->ksnc_sock->sk;
+        struct tcp_opt *tp = sock2tcp_opt(sk);
+        int             nonagle;
+        int             val = 1;
+        int             rc;
+        mm_segment_t    oldmm;
+
+        lock_sock (sk);
+        nonagle = tp->nonagle;
+        tp->nonagle = 1;
+        release_sock (sk);
+
+        oldmm = get_fs ();
+        set_fs (KERNEL_DS);
+
+        rc = sk->prot->setsockopt (sk, SOL_TCP, TCP_NODELAY,
+                                   (char *)&val, sizeof (val));
+        LASSERT (rc == 0);
+
+        set_fs (oldmm);
+
+        lock_sock (sk);
+        tp->nonagle = nonagle;
+        release_sock (sk);
+}
+
+/* Passing in a zero nid pushes all connections */
+int
+ksocknal_push_sock (ptl_nid_t nid)
+{
+        ksock_conn_t      *conn;
+        struct list_head  *tmp;
+        int                index;
+        int                i;
+
+        if (nid != 0) {
+                conn = ksocknal_get_conn (nid);
+
+                if (conn == NULL)
+                        return (-ENOENT);
+
+                ksocknal_push_conn (conn);
+                ksocknal_put_conn (conn);
+
+                return (0);
+        }
+
+        /* NB we can't remove connections from the socket list so we have to
+         * cope with them being removed from under us...
+         */
+        for (index = 0; ; index++) {
+                read_lock (&ksocknal_data.ksnd_socklist_lock);
+
+                i = 0;
+                conn = NULL;
+
+                list_for_each (tmp, &ksocknal_data.ksnd_socklist) {
+                        if (i++ == index) {
+                                conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+                                atomic_inc (&conn->ksnc_refcount); // take a ref
+                                break;
+                        }
+                }
+
+                read_unlock (&ksocknal_data.ksnd_socklist_lock);
+
+                if (conn == NULL)
+                        break;
+
+                ksocknal_push_conn (conn);
+                ksocknal_put_conn (conn);
+        }
+
+        return (0);
+}
+
+ksock_conn_t *
+ksocknal_get_conn (ptl_nid_t nid)
+{
+        struct list_head *tmp;
+        ksock_conn_t     *conn;
+
+        PROF_START(conn_list_walk);
+
+        read_lock (&ksocknal_data.ksnd_socklist_lock);
+
+        list_for_each(tmp, &ksocknal_data.ksnd_socklist) {
+
+                conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+                if (conn->ksnc_peernid == nid) {
+                        /* caller is referencing */
+                        atomic_inc (&conn->ksnc_refcount);
+
+                        read_unlock (&ksocknal_data.ksnd_socklist_lock);
+
+                        CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n",
+                               conn, nid, atomic_read (&conn->ksnc_refcount));
+
+                        PROF_FINISH(conn_list_walk);
+                        return (conn);
+                }
+        }
+
+        read_unlock (&ksocknal_data.ksnd_socklist_lock);
+
+        CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n",
+               nid);
+        PROF_FINISH(conn_list_walk);
+        return (NULL);
+}
+
+void
+ksocknal_close_conn (ksock_conn_t *conn)
+{
+        CDEBUG (D_NET, "connection [%p] closed \n", conn);
+
+        fput (conn->ksnc_file);
+        PORTAL_FREE (conn, sizeof (*conn));
+
+        /* One less connection keeping us hanging on */
+        PORTAL_MODULE_UNUSE;
+}
+
+void
+_ksocknal_put_conn (ksock_conn_t *conn)
+{
+        unsigned long flags;
+
+        CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn);
+
+        /* "But what is the black spot, captain?" I asked.
+         * "That's a summons, mate..." */
+
+        LASSERT (atomic_read (&conn->ksnc_refcount) == 0);
+        LASSERT (conn->ksnc_sock->sk->data_ready != ksocknal_data_ready);
+        LASSERT (conn->ksnc_sock->sk->write_space != ksocknal_write_space);
+        LASSERT (conn->ksnc_sock->sk->user_data == NULL);
+        LASSERT (!conn->ksnc_rx_scheduled);
+
+        if (!in_interrupt()) {
+                ksocknal_close_conn (conn);
+                return;
+        }
+
+        spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+
+        list_add (&conn->ksnc_list, &ksocknal_data.ksnd_reaper_list);
+        wake_up (&ksocknal_data.ksnd_reaper_waitq);
+
+        spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+}
+
+int
+ksocknal_cmd(struct portal_ioctl_data * data, void * private)
+{
+        int rc = -EINVAL;
+
+        LASSERT (data != NULL);
+
+        switch(data->ioc_nal_cmd) {
+        case NAL_CMD_REGISTER_PEER_FD: {
+                rc = ksocknal_add_sock(data->ioc_nid, data->ioc_fd,
+                                       data->ioc_flags);
+                break;
+        }
+        case NAL_CMD_CLOSE_CONNECTION: {
+                rc = ksocknal_close_sock(data->ioc_nid);
+                break;
+        }
+        case NAL_CMD_REGISTER_MYNID: {
+                rc = ksocknal_set_mynid (data->ioc_nid);
+                break;
+        }
+        case NAL_CMD_PUSH_CONNECTION: {
+                rc = ksocknal_push_sock (data->ioc_nid);
+                break;
+        }
+        }
+
+        return rc;
+}
+
+void
+ksocknal_free_buffers (void)
+{
+        if (ksocknal_data.ksnd_fmbs != NULL) {
+                ksock_fmb_t *fmb = (ksock_fmb_t *)ksocknal_data.ksnd_fmbs;
+                int          i;
+                int          j;
+
+                for (i = 0;
+                     i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS);
+                     i++, fmb++)
+                        for (j = 0; j < fmb->fmb_npages; j++)
+                                if (fmb->fmb_pages[j] != NULL)
+                                        __free_page (fmb->fmb_pages[j]);
+
+                PORTAL_FREE (ksocknal_data.ksnd_fmbs,
+                             sizeof (ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS +
+                                                     SOCKNAL_LARGE_FWD_NMSGS));
+        }
+
+        if (ksocknal_data.ksnd_ltxs != NULL)
+                PORTAL_FREE (ksocknal_data.ksnd_ltxs,
+                             sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS +
+                                                     SOCKNAL_NNBLK_LTXS));
+
+        if (ksocknal_data.ksnd_schedulers != NULL)
+                PORTAL_FREE (ksocknal_data.ksnd_schedulers,
+                             sizeof (ksock_sched_t) * SOCKNAL_N_SCHED);
+}
+
+void __exit
+ksocknal_module_fini (void)
+{
+        int   i;
+
+        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        switch (ksocknal_data.ksnd_init) {
+        default:
+                LASSERT (0);
+
+        case SOCKNAL_INIT_ALL:
+                kportal_nal_unregister(SOCKNAL);
+                PORTAL_SYMBOL_UNREGISTER (ksocknal_ni);
+                /* fall through */
+
+        case SOCKNAL_INIT_PTL:
+                PtlNIFini(ksocknal_ni);
+                lib_fini(&ksocknal_lib);
+                /* fall through */
+
+        case SOCKNAL_INIT_DATA:
+                /* Module refcount only gets to zero when all connections
+                 * have been closed so all lists must be empty */
+                LASSERT (list_empty (&ksocknal_data.ksnd_socklist));
+                LASSERT (list_empty (&ksocknal_data.ksnd_reaper_list));
+                LASSERT (list_empty (&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns));
+                LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns));
+
+                if (ksocknal_data.ksnd_schedulers != NULL)
+                        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+                                ksock_sched_t *kss =
+                                        &ksocknal_data.ksnd_schedulers[i];
+
+                                LASSERT (list_empty (&kss->kss_tx_conns));
+                                LASSERT (list_empty (&kss->kss_rx_conns));
+                                LASSERT (kss->kss_nconns == 0);
+                        }
+
+                /* stop router calling me */
+                kpr_shutdown (&ksocknal_data.ksnd_router);
+
+                /* flag threads to terminate; wake and wait for them to die */
+                ksocknal_data.ksnd_shuttingdown = 1;
+                wake_up_all (&ksocknal_data.ksnd_reaper_waitq);
+
+                for (i = 0; i < SOCKNAL_N_SCHED; i++)
+                       wake_up_all(&ksocknal_data.ksnd_schedulers[i].kss_waitq);
+
+                while (atomic_read (&ksocknal_data.ksnd_nthreads) != 0) {
+                        CDEBUG (D_NET, "waiting for %d threads to terminate\n",
+                                atomic_read (&ksocknal_data.ksnd_nthreads));
+                        set_current_state (TASK_UNINTERRUPTIBLE);
+                        schedule_timeout (HZ);
+                }
+
+                kpr_deregister (&ksocknal_data.ksnd_router);
+
+                ksocknal_free_buffers();
+                /* fall through */
+
+        case SOCKNAL_INIT_NOTHING:
+                break;
+        }
+
+        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n",
+               atomic_read(&portal_kmemory));
+}
+
+
+int __init
+ksocknal_module_init (void)
+{
+        int   pkmem = atomic_read(&portal_kmemory);
+        int   rc;
+        int   i;
+        int   j;
+
+        /* packet descriptor must fit in a router descriptor's scratchpad */
+        LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t));
+
+        LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+
+        ksocknal_api.forward  = ksocknal_api_forward;
+        ksocknal_api.shutdown = ksocknal_api_shutdown;
+        ksocknal_api.yield    = ksocknal_api_yield;
+        ksocknal_api.validate = NULL;           /* our api validate is a NOOP */
+        ksocknal_api.lock     = ksocknal_api_lock;
+        ksocknal_api.unlock   = ksocknal_api_unlock;
+        ksocknal_api.nal_data = &ksocknal_data;
+
+        ksocknal_lib.nal_data = &ksocknal_data;
+
+        memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */
+
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_socklist);
+        rwlock_init(&ksocknal_data.ksnd_socklist_lock);
+
+        ksocknal_data.ksnd_nal_cb = &ksocknal_lib;
+        spin_lock_init (&ksocknal_data.ksnd_nal_cb_lock);
+
+        spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns);
+
+        spin_lock_init(&ksocknal_data.ksnd_large_fmp.fmp_lock);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_idle_fmbs);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns);
+
+        spin_lock_init(&ksocknal_data.ksnd_idle_ltx_lock);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_nblk_ltx_list);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_ltx_list);
+        init_waitqueue_head(&ksocknal_data.ksnd_idle_ltx_waitq);
+
+        spin_lock_init (&ksocknal_data.ksnd_reaper_lock);
+        INIT_LIST_HEAD (&ksocknal_data.ksnd_reaper_list);
+        init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq);
+
+        memset (&ksocknal_data.ksnd_irq_info, SOCKNAL_IRQ_UNASSIGNED,
+                sizeof (ksocknal_data.ksnd_irq_info));
+
+        /* flag lists/ptrs/locks initialised */
+        ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
+
+        PORTAL_ALLOC(ksocknal_data.ksnd_schedulers,
+                     sizeof(ksock_sched_t) * SOCKNAL_N_SCHED);
+        if (ksocknal_data.ksnd_schedulers == NULL)
+                RETURN(-ENOMEM);
+
+        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+                ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i];
+
+                spin_lock_init (&kss->kss_lock);
+                INIT_LIST_HEAD (&kss->kss_rx_conns);
+                INIT_LIST_HEAD (&kss->kss_tx_conns);
+#if SOCKNAL_ZC
+                INIT_LIST_HEAD (&kss->kss_zctxdone_list);
+#endif
+                init_waitqueue_head (&kss->kss_waitq);
+        }
+
+        CERROR ("ltx "LPSZ", total "LPSZ"\n", sizeof (ksock_ltx_t),
+                sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+
+        PORTAL_ALLOC(ksocknal_data.ksnd_ltxs,
+                     sizeof(ksock_ltx_t) * (SOCKNAL_NLTXS +SOCKNAL_NNBLK_LTXS));
+        if (ksocknal_data.ksnd_ltxs == NULL) {
+                ksocknal_module_fini ();
+                return (-ENOMEM);
+        }
+
+        /* Deterministic bugs please */
+        memset (ksocknal_data.ksnd_ltxs, 0xeb,
+                sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+
+        for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++) {
+                ksock_ltx_t *ltx = &((ksock_ltx_t *)ksocknal_data.ksnd_ltxs)[i];
+
+                ltx->ltx_idle = i < SOCKNAL_NLTXS ?
+                                &ksocknal_data.ksnd_idle_ltx_list :
+                                &ksocknal_data.ksnd_idle_nblk_ltx_list;
+                list_add (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+        }
+
+        rc = PtlNIInit(ksocknal_init, 32, 4, 0, &ksocknal_ni);
+        if (rc != 0) {
+                CERROR("ksocknal: PtlNIInit failed: error %d\n", rc);
+                ksocknal_module_fini ();
+                RETURN (rc);
+        }
+        PtlNIDebug(ksocknal_ni, ~0);
+
+        ksocknal_data.ksnd_init = SOCKNAL_INIT_PTL;    /* flag PtlNIInit() called */
+
+        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+                rc = ksocknal_thread_start (ksocknal_scheduler,
+                                            &ksocknal_data.ksnd_schedulers[i]);
+                if (rc != 0) {
+                        CERROR("Can't spawn socknal scheduler[%d]: %d\n",
+                               i, rc);
+                        ksocknal_module_fini ();
+                        RETURN (rc);
+                }
+        }
+
+        rc = ksocknal_thread_start (ksocknal_reaper, NULL);
+        if (rc != 0) {
+                CERROR("Can't spawn socknal reaper: %d\n", rc);
+                ksocknal_module_fini ();
+                RETURN (rc);
+        }
+
+        rc = kpr_register(&ksocknal_data.ksnd_router,
+                          &ksocknal_router_interface);
+        if (rc != 0) {
+                CDEBUG(D_NET, "Can't initialise routing interface "
+                       "(rc = %d): not routing\n", rc);
+        } else {
+                /* Only allocate forwarding buffers if I'm on a gateway */
+
+                PORTAL_ALLOC(ksocknal_data.ksnd_fmbs,
+                             sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS +
+                                                    SOCKNAL_LARGE_FWD_NMSGS));
+                if (ksocknal_data.ksnd_fmbs == NULL) {
+                        ksocknal_module_fini ();
+                        RETURN(-ENOMEM);
+                }
+
+                /* NULL out buffer pointers etc */
+                memset(ksocknal_data.ksnd_fmbs, 0,
+                       sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS +
+                                              SOCKNAL_LARGE_FWD_NMSGS));
+
+                for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS +
+                                 SOCKNAL_LARGE_FWD_NMSGS); i++) {
+                        ksock_fmb_t *fmb =
+                                &((ksock_fmb_t *)ksocknal_data.ksnd_fmbs)[i];
+
+                        if (i < SOCKNAL_SMALL_FWD_NMSGS) {
+                                fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES;
+                                fmb->fmb_pool = &ksocknal_data.ksnd_small_fmp;
+                        } else {
+                                fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES;
+                                fmb->fmb_pool = &ksocknal_data.ksnd_large_fmp;
+                        }
+
+                        LASSERT (fmb->fmb_npages > 0);
+                        for (j = 0; j < fmb->fmb_npages; j++) {
+                                fmb->fmb_pages[j] = alloc_page(GFP_KERNEL);
+
+                                if (fmb->fmb_pages[j] == NULL) {
+                                        ksocknal_module_fini ();
+                                        return (-ENOMEM);
+                                }
+
+                                LASSERT(page_address (fmb->fmb_pages[j]) !=
+                                        NULL);
+                        }
+
+                        list_add(&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs);
+                }
+        }
+
+        rc = kportal_nal_register(SOCKNAL, &ksocknal_cmd, NULL);
+        if (rc != 0) {
+                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+                ksocknal_module_fini ();
+                return (rc);
+        }
+
+        PORTAL_SYMBOL_REGISTER(ksocknal_ni);
+
+        /* flag everything initialised */
+        ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+        printk(KERN_INFO "Routing socket NAL loaded (Routing %s, initial "
+               "mem %d)\n",
+               kpr_routing (&ksocknal_data.ksnd_router) ?
+               "enabled" : "disabled", pkmem);
+
+        return (0);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01");
+MODULE_LICENSE("GPL");
+
+module_init(ksocknal_module_init);
+module_exit(ksocknal_module_fini);
+
+EXPORT_SYMBOL (ksocknal_ni);
diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h
new file mode 100644 (file)
index 0000000..86cdeb0
--- /dev/null
@@ -0,0 +1,292 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_PORTAL_ALLOC
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_SOCKNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#define SOCKNAL_N_SCHED num_online_cpus()       /* # socknal schedulers */
+
+#if PTL_LARGE_MTU
+# define SOCKNAL_MAX_FWD_PAYLOAD (256<<10)      /* biggest payload I can forward */
+#else
+# define SOCKNAL_MAX_FWD_PAYLOAD (64<<10)       /* biggest payload I can forward */
+#endif
+
+#define SOCKNAL_NLTXS           128             /* # normal transmit messages */
+#define SOCKNAL_NNBLK_LTXS     128             /* # transmit messages reserved if can't block */
+
+#define SOCKNAL_SMALL_FWD_NMSGS        128             /* # small messages I can be forwarding at any time */
+#define SOCKNAL_LARGE_FWD_NMSGS 64              /* # large messages I can be forwarding at any time */
+
+#define SOCKNAL_SMALL_FWD_PAGES        1               /* # pages in a small message fwd buffer */
+
+#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT)
+                                               /* # pages in a large message fwd buffer */
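+
+/* Editorial worked example (not part of the original source): assuming 4 KiB
+ * pages and a ptl_hdr_t smaller than one page (asserted elsewhere), the
+ * 64 KiB case works out as
+ *   PAGE_ALIGN (sizeof (ptl_hdr_t) + (64 << 10)) >> PAGE_SHIFT
+ *     = PAGE_ALIGN (65536 + hdr) >> 12 = 69632 >> 12 = 17
+ * i.e. 17 pages per large forwarding buffer. */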
+
+#define SOCKNAL_RESCHED         100             /* # scheduler loops before reschedule */
+
+#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sndbuf*8)/10)
+
+typedef struct                                  /* pool of forwarding buffers */
+{
+        spinlock_t        fmp_lock;             /* serialise */
+        struct list_head  fmp_idle_fmbs;        /* buffers waiting for a connection */
+        struct list_head  fmp_blocked_conns;    /* connections waiting for a buffer */
+} ksock_fmb_pool_t;
+
+
+typedef struct                                  /* per scheduler state */
+{
+        spinlock_t        kss_lock;             /* serialise */
+        struct list_head  kss_rx_conns;         /* conn waiting to be read */
+        struct list_head  kss_tx_conns;         /* conn waiting to be written */
+#if SOCKNAL_ZC
+        struct list_head  kss_zctxdone_list;    /* completed ZC transmits */
+#endif
+        wait_queue_head_t kss_waitq;            /* where scheduler sleeps */
+        int               kss_nconns;           /* # connections assigned to this scheduler */
+} ksock_sched_t;
+
+typedef struct {
+        int               ksnd_init;            /* initialisation state */
+        
+        struct list_head  ksnd_socklist;        /* all my connections */
+        rwlock_t          ksnd_socklist_lock;   /* stabilise add/find/remove */
+
+        nal_cb_t         *ksnd_nal_cb;
+        spinlock_t        ksnd_nal_cb_lock;     /* lib cli/sti lock */
+
+        atomic_t          ksnd_nthreads;        /* # live threads */
+        int               ksnd_shuttingdown;    /* tell threads to exit */
+        ksock_sched_t    *ksnd_schedulers;      /* scheduler state */
+        
+        kpr_router_t      ksnd_router;          /* THE router */
+
+        void             *ksnd_fmbs;            /* all the pre-allocated FMBs */
+        ksock_fmb_pool_t  ksnd_small_fmp;       /* small message forwarding buffers */
+        ksock_fmb_pool_t  ksnd_large_fmp;       /* large message forwarding buffers */
+
+        void             *ksnd_ltxs;            /* all the pre-allocated LTXs */
+        spinlock_t        ksnd_idle_ltx_lock;   /* serialise ltx alloc/free */
+        struct list_head  ksnd_idle_ltx_list;   /* where to get an idle LTX */
+        struct list_head  ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */
+        wait_queue_head_t ksnd_idle_ltx_waitq;  /* where to block for an idle LTX */
+
+        struct list_head  ksnd_reaper_list;     /* conn waiting to be reaped */
+        wait_queue_head_t ksnd_reaper_waitq;    /* reaper sleeps here */
+        spinlock_t        ksnd_reaper_lock;     /* serialise */
+        unsigned char     ksnd_irq_info[NR_IRQS]; /* irq->scheduler lookup */
+} ksock_nal_data_t;
+
+#define SOCKNAL_INIT_NOTHING    0
+#define SOCKNAL_INIT_DATA       1
+#define SOCKNAL_INIT_PTL        2
+#define SOCKNAL_INIT_ALL        3
+
+#define SOCKNAL_IRQ_BOUND       0x80            /* flag we _did_ bind already */
+#define SOCKNAL_IRQ_SCHED_MASK 0x7f            /* we assume < 127 CPUs */
+#define SOCKNAL_IRQ_UNASSIGNED  0xff            /* flag unassigned */
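+
+/* Editorial sketch (not part of the original source): a guess at how one
+ * entry of ksnd_irq_info might be decoded, assuming the low 7 bits carry the
+ * scheduler index and the top bit records that the IRQ was already bound. */
+static inline int
+ksocknal_irq_info_example (unsigned char info, int *bound)
+{
+        if (info == SOCKNAL_IRQ_UNASSIGNED)     /* no scheduler chosen yet */
+                return (-1);
+
+        *bound = (info & SOCKNAL_IRQ_BOUND) != 0;
+        return (info & SOCKNAL_IRQ_SCHED_MASK); /* index into ksnd_schedulers */
+}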
+
+/* A packet just assembled for transmission is represented by 1 or more
+ * struct iovec fragments and 0 or more ptl_kiov_t fragments.  Forwarded
+ * messages, or messages from an MD with PTL_MD_KIOV _not_ set have 0
+ * ptl_kiov_t fragments.  Messages from an MD with PTL_MD_KIOV set, have 1
+ * struct iovec fragment (the header) and up to PTL_MD_MAX_IOV ptl_kiov_t
+ * fragments.
+ *
+ * On the receive side, initially 1 struct iovec fragment is posted for
+ * receive (the header).  Once the header has been received, if the message
+ * requires forwarding or will be received into mapped memory, up to
+ * PTL_MD_MAX_IOV struct iovec fragments describe the target memory.
+ * Otherwise up to PTL_MD_MAX_IOV ptl_kiov_t fragments are used.
+ */
+
+typedef struct                                  /* transmit packet */
+{
+        struct list_head        tx_list;        /* queue on conn for transmission etc */
+        char                    tx_isfwd;       /* forwarding / sourced here */
+        int                     tx_nob;         /* # packet bytes */
+        int                     tx_niov;        /* # packet iovec frags */
+        struct iovec           *tx_iov;         /* packet iovec frags */
+        int                     tx_nkiov;       /* # packet page frags */
+        ptl_kiov_t             *tx_kiov;        /* packet page frags */
+#if SOCKNAL_ZC        
+        ksock_sched_t          *tx_sched;       /* who to wake on callback */
+        zccd_t                  tx_zccd;        /* zero copy callback descriptor */
+#endif
+} ksock_tx_t;
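+
+/* Editorial sketch (not part of the original source): the fragment lists
+ * above must account for exactly tx_nob bytes; ksocknal_launch_packet()
+ * asserts this with lib_iov_nob()/lib_kiov_nob(), and a hand-rolled check
+ * would look like: */
+static inline int
+ksocknal_tx_nob_consistent_example (ksock_tx_t *tx)
+{
+        int nob = 0;
+        int i;
+
+        for (i = 0; i < tx->tx_niov; i++)       /* mapped (iovec) frags */
+                nob += tx->tx_iov[i].iov_len;
+        for (i = 0; i < tx->tx_nkiov; i++)      /* page (kiov) frags */
+                nob += tx->tx_kiov[i].kiov_len;
+
+        return (nob == tx->tx_nob);
+}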
+
+#define KSOCK_ZCCD_2_TX(ptr)   list_entry (ptr, ksock_tx_t, tx_zccd)
+/* network zero copy callback descriptor embedded in ksock_tx_t */
+
+/* space for the tx frag descriptors: hdr is always 1 iovec
+ * and payload is PTL_MD_MAX of either type. */
+typedef struct
+{
+        struct iovec            hdr;
+        union {
+                struct iovec    iov[PTL_MD_MAX_IOV];
+                ptl_kiov_t      kiov[PTL_MD_MAX_IOV];
+        }                       payload;
+} ksock_txiovspace_t;
+
+typedef struct                                  /* locally transmitted packet */
+{
+        ksock_tx_t              ltx_tx;         /* send info */
+        struct list_head       *ltx_idle;       /* where to put when idle */
+        void                   *ltx_private;    /* lib_finalize() callback arg */
+        void                   *ltx_cookie;     /* lib_finalize() callback arg */
+        ksock_txiovspace_t      ltx_iov_space;  /* where to stash frag descriptors */
+        ptl_hdr_t               ltx_hdr;        /* buffer for packet header */
+} ksock_ltx_t;
+
+#define KSOCK_TX_2_KPR_FWD_DESC(ptr)    list_entry ((kprfd_scratch_t *)ptr, kpr_fwd_desc_t, kprfd_scratch)
+/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */
+
+#define KSOCK_TX_2_KSOCK_LTX(ptr)       list_entry (ptr, ksock_ltx_t, ltx_tx)
+/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */
+
+/* NB list_entry() is used here as convenient macro for calculating a
+ * pointer to a struct from the address of a member.
+ */
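+
+/* Editorial sketch (not part of the original source): the list_entry() usage
+ * above is the usual "container_of" idiom -- recover the enclosing struct
+ * from the address of an embedded member, e.g.: */
+static inline ksock_ltx_t *
+ksocknal_tx_to_ltx_example (ksock_tx_t *tx)
+{
+        /* equivalent to KSOCK_TX_2_KSOCK_LTX (tx) */
+        return ((ksock_ltx_t *)((char *)tx -
+                (unsigned long)&((ksock_ltx_t *)0)->ltx_tx));
+}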
+
+typedef struct                                  /* Kernel portals Socket Forwarding message buffer */
+{                                               /* (socknal->router) */
+        struct list_head        fmb_list;       /* queue idle */
+        kpr_fwd_desc_t          fmb_fwd;        /* router's descriptor */
+        int                     fmb_npages;     /* # pages allocated */
+        ksock_fmb_pool_t       *fmb_pool;       /* owning pool */
+        struct page            *fmb_pages[SOCKNAL_LARGE_FWD_PAGES];
+        struct iovec            fmb_iov[SOCKNAL_LARGE_FWD_PAGES];
+} ksock_fmb_t;
+
+/* space for the rx frag descriptors; we either read a single contiguous
+ * header, or PTL_MD_MAX_IOV frags of payload of either type. */
+typedef union {
+        struct iovec    iov[PTL_MD_MAX_IOV];
+        ptl_kiov_t      kiov[PTL_MD_MAX_IOV];
+} ksock_rxiovspace_t;
+
+#define SOCKNAL_RX_HEADER       1               /* reading header */
+#define SOCKNAL_RX_BODY         2               /* reading body (to deliver here) */
+#define SOCKNAL_RX_BODY_FWD     3               /* reading body (to forward) */
+#define SOCKNAL_RX_SLOP         4               /* skipping body */
+#define SOCKNAL_RX_GET_FMB      5               /* scheduled for forwarding */
+#define SOCKNAL_RX_FMB_SLEEP    6               /* blocked waiting for a fwd desc */
+
+typedef struct 
+{ 
+        struct list_head    ksnc_list;          /* stash on global socket list */
+        struct file        *ksnc_file;          /* socket filp */
+        struct socket      *ksnc_sock;          /* actual socket */
+        void               *ksnc_saved_data_ready; /* socket's original data_ready() callback */
+        void               *ksnc_saved_write_space; /* socket's original write_space() callback */
+        ptl_nid_t           ksnc_peernid;       /* who's on the other end */
+        atomic_t            ksnc_refcount;      /* # users */
+        ksock_sched_t     *ksnc_scheduler;     /* who schedules this connection */
+        
+        /* READER */
+        struct list_head    ksnc_rx_list;       /* where I enq waiting input or a forwarding descriptor */
+        volatile int        ksnc_rx_ready;      /* data ready to read */
+        int                 ksnc_rx_scheduled;  /* being progressed */
+        int                 ksnc_rx_state;      /* what is being read */
+        int                 ksnc_rx_nob_left;   /* # bytes to next hdr/body  */
+        int                 ksnc_rx_nob_wanted; /* bytes actually wanted */
+        int                 ksnc_rx_niov;       /* # iovec frags */
+        struct iovec       *ksnc_rx_iov;        /* the iovec frags */
+        int                 ksnc_rx_nkiov;      /* # page frags */
+        ptl_kiov_t         *ksnc_rx_kiov;       /* the page frags */
+        ksock_rxiovspace_t  ksnc_rx_iov_space;  /* space for frag descriptors */
+        void               *ksnc_cookie;        /* rx lib_finalize passthru arg */
+        ptl_hdr_t           ksnc_hdr;           /* where I read headers into */
+
+        /* WRITER */
+        struct list_head    ksnc_tx_list;       /* where I enq waiting for output space */
+        struct list_head    ksnc_tx_queue;      /* packets waiting to be sent */
+        volatile int        ksnc_tx_ready;      /* write space */
+        int                 ksnc_tx_scheduled;  /* being progressed */
+
+} ksock_conn_t;
+
+extern int ksocknal_add_sock (ptl_nid_t nid, int fd, int client);
+extern int ksocknal_close_sock(ptl_nid_t nid);
+extern int ksocknal_set_mynid(ptl_nid_t nid);
+extern int ksocknal_push_sock(ptl_nid_t nid);
+extern ksock_conn_t *ksocknal_get_conn (ptl_nid_t nid);
+extern void _ksocknal_put_conn (ksock_conn_t *conn);
+extern void ksocknal_close_conn (ksock_conn_t *conn);
+
+static inline void
+ksocknal_put_conn (ksock_conn_t *conn)
+{
+        CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n",
+                conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount));
+
+        if (atomic_dec_and_test (&conn->ksnc_refcount))
+                _ksocknal_put_conn (conn);
+}
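+
+/* Editorial usage sketch (not part of the original source): ksocknal_get_conn()
+ * returns the connection with an extra reference taken, which the caller
+ * drops with ksocknal_put_conn() when it has finished with it. */
+static inline void
+ksocknal_conn_ref_example (ptl_nid_t nid)
+{
+        ksock_conn_t *conn = ksocknal_get_conn (nid);
+
+        if (conn == NULL)                       /* no such peer */
+                return;
+
+        /* ... use conn ... */
+
+        ksocknal_put_conn (conn);               /* drop the ref from get above */
+}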
+
+extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg);
+extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
+extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+extern int ksocknal_scheduler (void *arg);
+extern int ksocknal_reaper (void *arg);
+extern void ksocknal_data_ready(struct sock *sk, int n);
+extern void ksocknal_write_space(struct sock *sk);
+
+
+extern nal_cb_t         ksocknal_lib;
+extern ksock_nal_data_t ksocknal_data;
diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c
new file mode 100644 (file)
index 0000000..6147d8a
--- /dev/null
@@ -0,0 +1,1613 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socknal.h"
+
+atomic_t   ksocknal_packets_received;
+atomic_t   ksocknal_packets_launched;
+atomic_t   ksocknal_packets_being_sent;
+
+#if SOCKNAL_ZC
+int        ksocknal_do_zc = 1;
+int        ksocknal_zc_min_frag = 2048;
+#endif
+
+/*
+ *  LIB functions follow
+ *
+ */
+int
+ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr,
+              user_ptr src_addr, size_t len)
+{
+        CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr);
+
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+int
+ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr,
+               void *src_addr, size_t len)
+{
+        CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr);
+
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+int
+ksocknal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq,
+                         ptl_event_t *ev)
+{
+        CDEBUG(D_NET, LPX64": callback eq %p ev %p\n",
+               nal->ni.nid, eq, ev);
+
+        if (eq->event_callback != NULL)
+                eq->event_callback(ev);
+
+        return 0;
+}
+
+void *
+ksocknal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+
+        if (buf != NULL)
+                memset(buf, 0, len);
+
+        return (buf);
+}
+
+void
+ksocknal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+void
+ksocknal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+        va_list ap;
+        char msg[256];
+
+        va_start (ap, fmt);
+        vsnprintf (msg, sizeof (msg), fmt, ap); /* sprintf safely */
+        va_end (ap);
+
+        msg[sizeof (msg) - 1] = 0;              /* ensure terminated */
+
+        CDEBUG (D_NET, "%s", msg);
+}
+
+void
+ksocknal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+
+        spin_lock(&data->ksnd_nal_cb_lock);
+}
+
+void
+ksocknal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data;
+        data = nal->nal_data;
+
+        spin_unlock(&data->ksnd_nal_cb_lock);
+}
+
+int
+ksocknal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* I would guess that if ksocknal_get_conn(nid) == NULL,
+           and we're not routing, then 'nid' is very distant :) */
+        if ( nal->ni.nid == nid ) {
+                *dist = 0;
+        } else {
+                *dist = 1;
+        }
+
+        return 0;
+}
+
+ksock_ltx_t *
+ksocknal_get_ltx (int may_block)
+{
+        long             flags;
+        ksock_ltx_t *ltx = NULL;
+
+        for (;;) {
+                spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+
+                if (!list_empty (&ksocknal_data.ksnd_idle_ltx_list)) {
+                        ltx = list_entry(ksocknal_data.ksnd_idle_ltx_list.next,
+                                         ksock_ltx_t, ltx_tx.tx_list);
+                        list_del (&ltx->ltx_tx.tx_list);
+                        break;
+                }
+
+                if (!may_block) {
+                        if (!list_empty(&ksocknal_data.ksnd_idle_nblk_ltx_list)) {
+                                ltx = list_entry(ksocknal_data.ksnd_idle_nblk_ltx_list.next,
+                                                 ksock_ltx_t, ltx_tx.tx_list);
+                                list_del (&ltx->ltx_tx.tx_list);
+                        }
+                        break;
+                }
+
+                spin_unlock_irqrestore(&ksocknal_data.ksnd_idle_ltx_lock,
+                                       flags);
+
+                wait_event (ksocknal_data.ksnd_idle_ltx_waitq,
+                            !list_empty (&ksocknal_data.ksnd_idle_ltx_list));
+        }
+
+        spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+
+        return (ltx);
+}
+
+#if SOCKNAL_ZC
+struct page *
+ksocknal_kvaddr_to_page (unsigned long vaddr)
+{
+        struct page *page;
+
+        if (vaddr >= VMALLOC_START &&
+            vaddr < VMALLOC_END)
+                page = vmalloc_to_page ((void *)vaddr);
+#if CONFIG_HIGHMEM
+        else if (vaddr >= PKMAP_BASE &&
+                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
+                page = vmalloc_to_page ((void *)vaddr);
+                /* in 2.4 ^ just walks the page tables */
+#endif
+        else
+                page = virt_to_page (vaddr);
+
+        if (page == NULL ||
+            !VALID_PAGE (page))
+                return (NULL);
+
+        return (page);
+}
+#endif
+
+int
+ksocknal_send_iov (struct socket *sock, ksock_tx_t *tx, int more)
+{
+        struct iovec  *iov = tx->tx_iov;
+        int            fragsize = iov->iov_len;
+        unsigned long  vaddr = (unsigned long)iov->iov_base;
+#if SOCKNAL_ZC
+        int            offset = vaddr & (PAGE_SIZE - 1);
+        int            zcsize = MIN (fragsize, PAGE_SIZE - offset);
+        struct page   *page;
+#endif
+        int            rc;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only send 1 frag at a time. */
+        LASSERT (fragsize <= tx->tx_nob);
+        LASSERT (tx->tx_niov > 0);
+        more |= (tx->tx_niov > 1);
+        
+#if SOCKNAL_ZC
+        if (ksocknal_do_zc &&
+            (sock->sk->route_caps & NETIF_F_SG) &&
+            (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
+            zcsize >= ksocknal_zc_min_frag &&
+            (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) {
+                
+                CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n",
+                       (void *)vaddr, page, page_address(page), offset, zcsize);
+
+                more |= (zcsize < fragsize);
+
+                rc = tcp_sendpage_zccd(sock, page, offset, zcsize, 
+                                       more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT,
+                                       &tx->tx_zccd);
+        } else
+#endif
+        {
+                /* NB don't pass tx's iov; sendmsg may or may not update it */
+                struct iovec fragiov = { .iov_base = (void *)vaddr,
+                                         .iov_len  = fragsize};
+                struct msghdr msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &fragiov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT
+                };
+                mm_segment_t oldmm = get_fs();
+                
+                set_fs (KERNEL_DS);
+                rc = sock->sk->prot->sendmsg(sock->sk, &msg, fragsize);
+                set_fs (oldmm);
+        } 
+
+        if (rc <= 0)
+                return (rc);
+
+        tx->tx_nob -= rc;
+
+        if (rc < fragsize) {
+                /* didn't send whole frag */
+                iov->iov_base = (void *)(vaddr + rc);
+                iov->iov_len  = fragsize - rc;
+                return (-EAGAIN);
+        }
+
+        /* everything went */
+        LASSERT (rc == fragsize);
+        tx->tx_iov++;
+        tx->tx_niov--;
+        return (1);
+}
+
+int
+ksocknal_send_kiov (struct socket *sock, ksock_tx_t *tx, int more)
+{
+        ptl_kiov_t    *kiov = tx->tx_kiov;
+        int            fragsize = kiov->kiov_len;
+        struct page   *page = kiov->kiov_page;
+        int            offset = kiov->kiov_offset;
+        int            rc;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only send 1 frag at a time. */
+        LASSERT (fragsize <= tx->tx_nob);
+        LASSERT (offset + fragsize <= PAGE_SIZE);
+        LASSERT (tx->tx_nkiov > 0);
+        more |= (tx->tx_nkiov > 1);
+
+#if SOCKNAL_ZC
+        if (ksocknal_do_zc &&
+            (sock->sk->route_caps & NETIF_F_SG) &&
+            (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
+            fragsize >= ksocknal_zc_min_frag) {
+
+                CDEBUG(D_NET, "page %p + offset %x for %d\n",
+                               page, offset, fragsize);
+
+                rc = tcp_sendpage_zccd(sock, page, offset, fragsize,
+                                       more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT,
+                                       &tx->tx_zccd);
+        } else
+#endif
+        {
+                char *addr = ((char *)kmap (page)) + offset;
+                struct iovec fragiov = {.iov_base = addr,
+                                        .iov_len  = fragsize};
+                struct msghdr msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &fragiov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT
+                };
+                mm_segment_t  oldmm = get_fs();
+                
+                set_fs (KERNEL_DS);
+                rc = sock->sk->prot->sendmsg(sock->sk, &msg, fragsize);
+                set_fs (oldmm);
+                kunmap (page);
+        }
+
+        if (rc <= 0)
+                return (rc);
+
+        tx->tx_nob -= rc;
+
+        if (rc < fragsize) {
+                /* didn't send whole frag */
+                kiov->kiov_offset = offset + rc;
+                kiov->kiov_len    = fragsize - rc;
+                return (-EAGAIN);
+        }
+
+        /* everything went */
+        LASSERT (rc == fragsize);
+        tx->tx_kiov++;
+        tx->tx_nkiov--;
+        return (1);
+}
+
+int
+ksocknal_sendmsg (struct socket *sock, ksock_tx_t *tx, int more)
+{
+        int    rc;
+        int    sent_some = 0;
+        ENTRY;
+        
+        LASSERT (!in_interrupt());
+
+        for (;;) {
+                if (tx->tx_niov != 0)
+                        rc = ksocknal_send_iov (sock, tx, more || tx->tx_nkiov != 0);
+                else
+                        rc = ksocknal_send_kiov (sock, tx, more);
+
+                /* Interpret a zero rc the same as -EAGAIN (Adaptec TOE) */
+                if (rc <= 0)                    /* error or partial send */
+                        RETURN ((sent_some || rc == -EAGAIN) ? 0 : rc);
+                
+                if (tx->tx_nob == 0)            /* sent everything */
+                        RETURN (0);
+
+                sent_some = 1;
+        }
+}
+
+int
+ksocknal_recv_iov (ksock_conn_t *conn)
+{
+        struct iovec *iov = conn->ksnc_rx_iov;
+        int           fragsize  = iov->iov_len;
+        unsigned long vaddr = (unsigned long)iov->iov_base;
+        struct iovec  fragiov = { .iov_base = (void *)vaddr,
+                                  .iov_len  = fragsize};
+        struct msghdr msg = {
+                .msg_name       = NULL,
+                .msg_namelen    = 0,
+                .msg_iov        = &fragiov,
+                .msg_iovlen     = 1,
+                .msg_control    = NULL,
+                .msg_controllen = 0,
+                .msg_flags      = 0
+        };
+        mm_segment_t oldmm = get_fs();
+        int          rc;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only receive 1 frag at a time. */
+        LASSERT (conn->ksnc_rx_niov > 0);
+        LASSERT (fragsize <= conn->ksnc_rx_nob_wanted);
+        
+        set_fs (KERNEL_DS);
+        rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT);
+        /* NB this is just a boolean............................^ */
+        set_fs (oldmm);
+
+        if (rc <= 0)
+                return (rc);
+
+        conn->ksnc_rx_nob_wanted -= rc;
+        conn->ksnc_rx_nob_left -= rc;
+                
+        if (rc < fragsize) {
+                iov->iov_base = (void *)(vaddr + rc);
+                iov->iov_len = fragsize - rc;
+                return (-EAGAIN);
+        }
+
+        LASSERT (rc == fragsize);
+        conn->ksnc_rx_iov++;
+        conn->ksnc_rx_niov--;
+        return (1);
+}
+
+int
+ksocknal_recv_kiov (ksock_conn_t *conn)
+{
+        ptl_kiov_t   *kiov = conn->ksnc_rx_kiov;
+        struct page  *page = kiov->kiov_page;
+        int           offset = kiov->kiov_offset;
+        int           fragsize = kiov->kiov_len;
+        unsigned long vaddr = ((unsigned long)kmap (page)) + offset;
+        struct iovec  fragiov = { .iov_base = (void *)vaddr,
+                                  .iov_len  = fragsize};
+        struct msghdr msg = {
+                .msg_name       = NULL,
+                .msg_namelen    = 0,
+                .msg_iov        = &fragiov,
+                .msg_iovlen     = 1,
+                .msg_control    = NULL,
+                .msg_controllen = 0,
+                .msg_flags      = 0
+        };
+        mm_segment_t oldmm = get_fs();
+        int          rc;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only receive 1 frag at a time. */
+        LASSERT (fragsize <= conn->ksnc_rx_nob_wanted);
+        LASSERT (conn->ksnc_rx_nkiov > 0);
+        LASSERT (offset + fragsize <= PAGE_SIZE);
+        
+        set_fs (KERNEL_DS);
+        rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT);
+        /* NB this is just a boolean............................^ */
+        set_fs (oldmm);
+        kunmap (page);
+        
+        if (rc <= 0)
+                return (rc);
+        
+        conn->ksnc_rx_nob_wanted -= rc;
+        conn->ksnc_rx_nob_left -= rc;
+                
+        if (rc < fragsize) {
+                kiov->kiov_offset = offset + rc;
+                kiov->kiov_len = fragsize - rc;
+                return (-EAGAIN);
+        }
+
+        LASSERT (rc == fragsize);
+        conn->ksnc_rx_kiov++;
+        conn->ksnc_rx_nkiov--;
+        return (1);
+}
+
+int
+ksocknal_recvmsg (ksock_conn_t *conn) 
+{
+        int    rc;
+        int    got_some = 0;
+        ENTRY;
+        
+        LASSERT (!in_interrupt ());
+
+        for (;;) {
+                LASSERT (conn->ksnc_rx_nob_wanted > 0);
+                
+                if (conn->ksnc_rx_niov != 0)
+                        rc = ksocknal_recv_iov (conn);
+                else
+                        rc = ksocknal_recv_kiov (conn);
+
+                /* CAVEAT EMPTOR: we return...
+                 * <= 0 for error (0 == EOF) and > 0 for success (unlike sendmsg()) */
+
+                if (rc <= 0)                    /* error/EOF or partial receive */
+                        RETURN ((got_some || rc == -EAGAIN) ? 1 : rc);
+                
+                if (conn->ksnc_rx_nob_wanted == 0)
+                        RETURN (1);
+
+                got_some = 1;
+        }
+}
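+
+/* Editorial sketch (not part of the original source): acting on the return
+ * conventions noted above -- ksocknal_recvmsg() returns > 0 when it made
+ * progress, 0 on EOF and < 0 on a socket error, while ksocknal_sendmsg()
+ * returns 0 on success and < 0 on error. */
+static inline int
+ksocknal_recvmsg_status_example (ksock_conn_t *conn)
+{
+        int rc = ksocknal_recvmsg (conn);
+
+        if (rc > 0)                             /* made progress */
+                return (conn->ksnc_rx_nob_wanted == 0); /* 1 == all wanted bytes in */
+
+        return (rc);                            /* 0 == EOF, < 0 == socket error */
+}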
+
+#if SOCKNAL_ZC
+void
+ksocknal_zc_callback (zccd_t *zcd)
+{
+        ksock_tx_t    *tx = KSOCK_ZCCD_2_TX(zcd);
+        ksock_sched_t *sched = tx->tx_sched;
+        unsigned long  flags;
+        ENTRY;
+
+        /* Schedule tx for cleanup (can't do it now due to lock conflicts) */
+
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list);
+        if (waitqueue_active (&sched->kss_waitq))
+                wake_up (&sched->kss_waitq);
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+        EXIT;
+}
+#endif
+
+void
+ksocknal_tx_done (ksock_tx_t *tx)
+{
+        long           flags;
+        ksock_ltx_t   *ltx;
+        ENTRY;
+
+        atomic_dec (&ksocknal_packets_being_sent);
+
+        if (tx->tx_isfwd) {             /* was a forwarded packet? */
+                kpr_fwd_done (&ksocknal_data.ksnd_router,
+                              KSOCK_TX_2_KPR_FWD_DESC (tx), 0);
+                EXIT;
+                return;
+        }
+
+        /* local send */
+        ltx = KSOCK_TX_2_KSOCK_LTX (tx);
+
+        lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie);
+
+        spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+
+        list_add_tail (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+
+        /* normal tx desc => wakeup anyone blocking for one */
+        if (ltx->ltx_idle == &ksocknal_data.ksnd_idle_ltx_list &&
+            waitqueue_active (&ksocknal_data.ksnd_idle_ltx_waitq))
+                wake_up (&ksocknal_data.ksnd_idle_ltx_waitq);
+
+        spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+        EXIT;
+}
+
+void
+ksocknal_process_transmit (ksock_sched_t *sched, long *irq_flags)
+{
+        ksock_conn_t *conn;
+        ksock_tx_t *tx;
+        int         rc;
+
+        LASSERT (!list_empty (&sched->kss_tx_conns));
+        conn = list_entry(sched->kss_tx_conns.next, ksock_conn_t, ksnc_tx_list);
+        list_del (&conn->ksnc_tx_list);
+
+        LASSERT (conn->ksnc_tx_scheduled);
+        LASSERT (conn->ksnc_tx_ready);
+        LASSERT (!list_empty (&conn->ksnc_tx_queue));
+        tx = list_entry (conn->ksnc_tx_queue.next, ksock_tx_t, tx_list);
+        /* assume transmit will complete now, so dequeue while I've got lock */
+        list_del (&tx->tx_list);
+
+        spin_unlock_irqrestore (&sched->kss_lock, *irq_flags);
+
+        LASSERT (tx->tx_nob > 0);
+
+        conn->ksnc_tx_ready = 0;/* write_space may race with me and set ready */
+        mb();                   /* => clear BEFORE trying to write */
+
+        rc = ksocknal_sendmsg (conn->ksnc_sock, tx, 
+                               !list_empty (&conn->ksnc_tx_queue)); /* more to come? */
+
+        CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc);
+
+        if (rc != 0) {
+#warning FIXME: handle socket errors properly
+                CERROR("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc);
+                /* pretend for now that the whole packet went.
+                 * NB when we handle the error better, we'll still need to
+                 * block for zccd completion.
+                 */
+                tx->tx_nob = 0;
+        }
+
+        if (tx->tx_nob == 0)                    /* nothing left to send */
+        {
+                /* everything went; assume more can go, so prevent write_space locking */
+                conn->ksnc_tx_ready = 1;
+
+                ksocknal_put_conn (conn);       /* release packet's ref */
+                atomic_inc (&ksocknal_packets_being_sent);
+#if SOCKNAL_ZC
+                if (atomic_read (&tx->tx_zccd.zccd_count) != 1) {
+                        /* zccd skbufs are still in-flight.  Release my
+                         * initial ref on zccd, so callback can occur */
+                        zccd_put (&tx->tx_zccd);
+                } else
+#endif
+                        ksocknal_tx_done (tx);
+
+                spin_lock_irqsave (&sched->kss_lock, *irq_flags);
+        } else {
+                spin_lock_irqsave (&sched->kss_lock, *irq_flags);
+
+                /* back onto HEAD of tx_queue */
+                list_add (&tx->tx_list, &conn->ksnc_tx_queue);
+        }
+
+        if (!conn->ksnc_tx_ready ||             /* no space to write now */
+            list_empty (&conn->ksnc_tx_queue)) {/* nothing to write */
+                conn->ksnc_tx_scheduled = 0;    /* not being scheduled */
+                ksocknal_put_conn (conn);       /* release scheduler's ref */
+        } else                                 /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns);
+}
+
+void
+ksocknal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        unsigned long  flags;
+        ksock_sched_t *sched = conn->ksnc_scheduler;
+
+        /* Ensure the frags we've been given EXACTLY match the number of
+         * bytes we want to send.  Many TCP/IP stacks disregard any total
+         * size parameters passed to them and just look at the frags. 
+         *
+         * We always expect at least 1 mapped fragment containing the
+         * complete portals header.
+         */
+        LASSERT (lib_iov_nob (tx->tx_niov, tx->tx_iov) +
+                 lib_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob);
+        LASSERT (tx->tx_niov >= 1);
+        LASSERT (tx->tx_iov[0].iov_len >= sizeof (ptl_hdr_t));
+        
+        CDEBUG (D_NET, "type %d, nob %d niov %d nkiov %d\n",
+                ((ptl_hdr_t *)tx->tx_iov[0].iov_base)->type, tx->tx_nob, 
+                tx->tx_niov, tx->tx_nkiov);
+
+#if SOCKNAL_ZC
+        zccd_init (&tx->tx_zccd, ksocknal_zc_callback);
+        /* NB this sets 1 ref on zccd, so the callback can only occur
+         * after I've released this ref */
+        tx->tx_sched = sched;
+#endif
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
+
+        if (conn->ksnc_tx_ready &&              /* able to send */
+            !conn->ksnc_tx_scheduled) {          /* not scheduled to send */
+                list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns);
+                conn->ksnc_tx_scheduled = 1;
+                atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */
+                if (waitqueue_active (&sched->kss_waitq))
+                        wake_up (&sched->kss_waitq);
+        }
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+        atomic_inc (&ksocknal_packets_launched);
+}
+
+ksock_conn_t *
+ksocknal_send_target (ptl_nid_t nid) 
+{
+        ptl_nid_t     gatewaynid;
+        ksock_conn_t *conn;
+        int           rc;
+
+        if ((conn = ksocknal_get_conn (nid)) == NULL) {
+                /* It's not a peer; try to find a gateway */
+                rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, &gatewaynid);
+                if (rc != 0) {
+                        CERROR("Can't route to "LPX64": router error %d\n",
+                               nid, rc);
+                        return (NULL);
+                }
+
+                if ((conn = ksocknal_get_conn (gatewaynid)) == NULL) {
+                        CERROR ("Can't route to "LPX64": gateway "LPX64
+                                " is not a peer\n", nid, gatewaynid);
+                        return (NULL);
+                }
+        }
+
+        return (conn);
+}
+
+ksock_ltx_t *
+ksocknal_setup_hdr (nal_cb_t *nal, void *private, lib_msg_t *cookie, 
+                    ptl_hdr_t *hdr, int type)
+{
+        ksock_ltx_t  *ltx;
+
+        /* I may not block for a transmit descriptor if I might block the
+         * receiver, or an interrupt handler. */
+        ltx = ksocknal_get_ltx (!(type == PTL_MSG_ACK ||
+                                  type == PTL_MSG_REPLY ||
+                                  in_interrupt ()));
+        if (ltx == NULL) {
+                CERROR ("Can't allocate tx desc\n");
+                return (NULL);
+        }
+
+        /* Init local send packet (storage for hdr, finalize() args) */
+        ltx->ltx_hdr = *hdr;
+        ltx->ltx_private = private;
+        ltx->ltx_cookie = cookie;
+        
+        /* Init common ltx_tx */
+        ltx->ltx_tx.tx_isfwd = 0;
+        ltx->ltx_tx.tx_nob = sizeof (*hdr);
+
+        /* We always have 1 mapped frag for the header */
+        ltx->ltx_tx.tx_niov = 1;
+        ltx->ltx_tx.tx_iov = &ltx->ltx_iov_space.hdr;
+        ltx->ltx_tx.tx_iov[0].iov_base = &ltx->ltx_hdr;
+        ltx->ltx_tx.tx_iov[0].iov_len = sizeof (ltx->ltx_hdr);
+
+        ltx->ltx_tx.tx_kiov  = NULL;
+        ltx->ltx_tx.tx_nkiov = 0;
+
+        return (ltx);
+}
+
+int
+ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie,
+               ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+               unsigned int payload_niov, struct iovec *payload_iov,
+               size_t payload_len)
+{
+        ksock_ltx_t  *ltx;
+        ksock_conn_t *conn;
+
+        /* NB 'private' is different depending on what we're sending.
+         * Just ignore it until we can rely on it
+         *
+         * Also, the return code from this procedure is ignored.
+         * If we can't send, we must still complete with lib_finalize().
+         * We'll have to wait for 3.2 to return an error event.
+         */
+
+        CDEBUG(D_NET,
+               "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64
+               " pid %d\n", payload_len, payload_niov, nid, pid);
+
+        conn = ksocknal_send_target (nid);
+        if (conn == NULL) {
+                lib_finalize (&ksocknal_lib, private, cookie);
+                return (-1);
+        }
+
+        ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type);
+        if (ltx == NULL) {
+                ksocknal_put_conn (conn);
+                lib_finalize (&ksocknal_lib, private, cookie);
+                return (-1);
+        }
+
+        /* append the payload_iovs to the one pointing at the header */
+        LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        memcpy (ltx->ltx_tx.tx_iov + 1, payload_iov,
+                payload_niov * sizeof (*payload_iov));
+        ltx->ltx_tx.tx_niov = 1 + payload_niov;
+        ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len;
+
+        ksocknal_launch_packet (conn, &ltx->ltx_tx);
+        return (0);
+}
+
+int
+ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, 
+                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                     unsigned int payload_niov, ptl_kiov_t *payload_iov, size_t payload_len)
+{
+        ksock_ltx_t *ltx;
+        ksock_conn_t *conn;
+        
+        /* NB 'private' is different depending on what we're sending.
+         * Just ignore it until we can rely on it */
+
+        CDEBUG(D_NET,
+               "sending "LPSZ" bytes in %d page frags to nid: "LPX64" pid %d\n",
+               payload_len, payload_niov, nid, pid);
+
+        conn = ksocknal_send_target (nid);
+        if (conn == NULL)
+                return (-1);
+
+        ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type);
+        if (ltx == NULL) {
+                ksocknal_put_conn (conn);
+                return (-1);
+        }
+
+        LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+        
+        ltx->ltx_tx.tx_kiov = ltx->ltx_iov_space.payload.kiov;
+        memcpy (ltx->ltx_tx.tx_kiov, payload_iov, 
+                payload_niov * sizeof (*payload_iov));
+        ltx->ltx_tx.tx_nkiov = payload_niov;
+        ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len;
+
+        ksocknal_launch_packet (conn, &ltx->ltx_tx);
+        return (0);
+}
+
+void
+ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        ksock_conn_t *conn;
+        ptl_nid_t     nid = fwd->kprfd_gateway_nid;
+        ksock_tx_t   *tx  = (ksock_tx_t *)&fwd->kprfd_scratch;
+
+        CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64")\n", fwd,
+                fwd->kprfd_gateway_nid, fwd->kprfd_target_nid);
+
+        /* I'm the gateway; must be the last hop */
+        if (nid == ksocknal_lib.ni.nid)
+                nid = fwd->kprfd_target_nid;
+
+        conn = ksocknal_get_conn (nid);
+        if (conn == NULL) {
+                CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid);
+                kpr_fwd_done (&ksocknal_data.ksnd_router, fwd, -EHOSTUNREACH);
+                return;
+        }
+
+        /* This forward has now got a ref on conn */
+
+        tx->tx_isfwd = 1;                   /* This is a forwarding packet */
+        tx->tx_nob   = fwd->kprfd_nob;
+        tx->tx_niov  = fwd->kprfd_niov;
+        tx->tx_iov   = fwd->kprfd_iov;
+        tx->tx_nkiov = 0;
+        tx->tx_kiov  = NULL;
+        
+        ksocknal_launch_packet (conn, tx);
+}
+
+int
+ksocknal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long    pid = kernel_thread (fn, arg, 0);
+
+        if (pid < 0)
+                return ((int)pid);
+
+        atomic_inc (&ksocknal_data.ksnd_nthreads);
+        return (0);
+}
+
+void
+ksocknal_thread_fini (void)
+{
+        atomic_dec (&ksocknal_data.ksnd_nthreads);
+}
+
+void
+ksocknal_fmb_callback (void *arg, int error)
+{
+        ksock_fmb_t       *fmb = (ksock_fmb_t *)arg;
+        ksock_fmb_pool_t  *fmp = fmb->fmb_pool;
+        ptl_hdr_t         *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]);
+        ksock_conn_t      *conn = NULL;
+        ksock_sched_t     *sched;
+        long               flags;
+
+        if (error != 0)
+                CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n",
+                       NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),
+                       error);
+        else
+                CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": OK\n",
+                        NTOH__u64 (hdr->src_nid), NTOH__u64 (hdr->dest_nid));
+
+        spin_lock_irqsave (&fmp->fmp_lock, flags);
+
+        list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs);
+
+        if (!list_empty (&fmp->fmp_blocked_conns)) {
+                conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next,
+                                   ksock_conn_t, ksnc_rx_list);
+                list_del (&conn->ksnc_rx_list);
+        }
+
+        spin_unlock_irqrestore (&fmp->fmp_lock, flags);
+
+        if (conn == NULL)
+                return;
+
+        CDEBUG (D_NET, "Scheduling conn %p\n", conn);
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP);
+
+        conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;
+
+        sched = conn->ksnc_scheduler;
+
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns);
+
+        if (waitqueue_active (&sched->kss_waitq))
+                wake_up (&sched->kss_waitq);
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+}
+
+ksock_fmb_t *
+ksocknal_get_idle_fmb (ksock_conn_t *conn)
+{
+        int               payload_nob = conn->ksnc_rx_nob_left;
+        int               packet_nob = sizeof (ptl_hdr_t) + payload_nob;
+        long              flags;
+        ksock_fmb_pool_t *pool;
+        ksock_fmb_t      *fmb;
+
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
+        LASSERT (ksocknal_data.ksnd_fmbs != NULL);
+
+        if (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE)
+                pool = &ksocknal_data.ksnd_small_fmp;
+        else
+                pool = &ksocknal_data.ksnd_large_fmp;
+
+        spin_lock_irqsave (&pool->fmp_lock, flags);
+
+        if (!list_empty (&pool->fmp_idle_fmbs)) {
+                fmb = list_entry(pool->fmp_idle_fmbs.next,
+                                 ksock_fmb_t, fmb_list);
+                list_del (&fmb->fmb_list);
+                spin_unlock_irqrestore (&pool->fmp_lock, flags);
+
+                return (fmb);
+        }
+
+        /* deschedule until fmb free */
+
+        conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP;
+
+        list_add_tail (&conn->ksnc_rx_list,
+                       &pool->fmp_blocked_conns);
+
+        spin_unlock_irqrestore (&pool->fmp_lock, flags);
+        return (NULL);
+}
+
+
+int
+ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb)
+{
+        int payload_nob = conn->ksnc_rx_nob_left;
+        int packet_nob = sizeof (ptl_hdr_t) + payload_nob;
+        ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid);
+        int niov;                               /* at least the header */
+        int nob;
+
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
+        LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left);
+        LASSERT (payload_nob >= 0);
+        LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE);
+        LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE);
+
+        /* Got a forwarding buffer; copy the header we just read into the
+ * forwarding buffer.  If there's payload, start reading it
+         * into the buffer, otherwise the forwarding buffer can be kicked
+         * off immediately.
+         *
+         * NB fmb->fmb_iov spans the WHOLE packet.
+         *    conn->ksnc_rx_iov spans just the payload.
+         */
+
+        fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]);
+
+        /* copy header */
+        memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t));
+
+        if (payload_nob == 0) {         /* got complete packet already */
+                atomic_inc (&ksocknal_packets_received);
+
+                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n",
+                        conn, NTOH__u64 (conn->ksnc_hdr.src_nid),
+                        dest_nid, packet_nob);
+
+                fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t);
+
+                kpr_fwd_init (&fmb->fmb_fwd, dest_nid,
+                              packet_nob, 1, fmb->fmb_iov,
+                              ksocknal_fmb_callback, fmb);
+
+                /* forward it now */
+                kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd);
+
+                ksocknal_new_packet (conn, 0);  /* on to next packet */
+                return (1);
+        }
+
+        niov = 1;
+        if (packet_nob <= PAGE_SIZE) {  /* whole packet fits in first page */
+                fmb->fmb_iov[0].iov_len = packet_nob;
+        } else {
+                fmb->fmb_iov[0].iov_len = PAGE_SIZE;
+                nob = packet_nob - PAGE_SIZE;
+
+                do {
+                        LASSERT (niov < fmb->fmb_npages);
+                        fmb->fmb_iov[niov].iov_base =
+                                page_address (fmb->fmb_pages[niov]);
+                        fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob);
+                        nob -= PAGE_SIZE;
+                        niov++;
+                } while (nob > 0);
+        }
+
+        kpr_fwd_init (&fmb->fmb_fwd, dest_nid,
+                      packet_nob, niov, fmb->fmb_iov,
+                      ksocknal_fmb_callback, fmb);
+
+        /* stash router's descriptor ready for call to kpr_fwd_start */
+        conn->ksnc_cookie = &fmb->fmb_fwd;
+
+        conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */
+
+        /* payload is desc's iov-ed buffer, but skipping the hdr */
+        LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) /
+                 sizeof (struct iovec));
+
+        conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+        conn->ksnc_rx_iov[0].iov_base =
+                (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) +
+                         sizeof (ptl_hdr_t));
+        conn->ksnc_rx_iov[0].iov_len =
+                fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t);
+
+        if (niov > 1)
+                memcpy(&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1],
+                       (niov - 1) * sizeof (struct iovec));
+
+        conn->ksnc_rx_niov = niov;
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn,
+                NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, payload_nob);
+        return (0);
+}
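+
+/* Worked example (illustrative; assumes 4 KB pages, a header much smaller
+ * than a page, and a 6000-byte payload):
+ *
+ *      packet_nob = hdr + 6000                     (hdr == sizeof (ptl_hdr_t))
+ *      fmb_iov[0] = page 0,       len 4096         (header + start of payload)
+ *      fmb_iov[1] = page 1,       len hdr + 1904   (rest of payload)
+ *      rx_iov[0]  = page 0 + hdr, len 4096 - hdr
+ *      rx_iov[1]  = fmb_iov[1]                     (copied verbatim)
+ *
+ * i.e. fmb_iov describes the whole packet for the router, while rx_iov
+ * receives exactly the 6000 payload bytes behind the header copied in above.
+ */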
+
+void
+ksocknal_fwd_parse (ksock_conn_t *conn)
+{
+        ksock_conn_t *conn2;
+        ptl_nid_t     dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid);
+        int           body_len = NTOH__u32 (PTL_HDR_LENGTH(&conn->ksnc_hdr));
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn,
+                NTOH__u64 (conn->ksnc_hdr.src_nid),
+                dest_nid, conn->ksnc_rx_nob_left);
+
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER);
+        LASSERT (conn->ksnc_rx_scheduled);
+
+        if (body_len < 0) {                 /* length corrupt (overflow) */
+                CERROR("dropping packet from "LPX64" for "LPX64": packet "
+                       "size %d illegal\n", NTOH__u64 (conn->ksnc_hdr.src_nid),
+                       dest_nid, body_len);
+                ksocknal_new_packet (conn, 0);          /* on to new packet */
+                return;
+        }
+
+        if (ksocknal_data.ksnd_fmbs == NULL) {        /* not forwarding */
+                CERROR("dropping packet from "LPX64" for "LPX64": not "
+                       "forwarding\n", conn->ksnc_hdr.src_nid,
+                       conn->ksnc_hdr.dest_nid);
+                /* on to new packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        if (body_len > SOCKNAL_MAX_FWD_PAYLOAD) {      /* too big to forward */
+                CERROR ("dropping packet from "LPX64" for "LPX64
+                        ": packet size %d too big\n", conn->ksnc_hdr.src_nid,
+                        conn->ksnc_hdr.dest_nid, body_len);
+                /* on to new packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        /* should have gone direct */
+        conn2 = ksocknal_get_conn (conn->ksnc_hdr.dest_nid);
+        if (conn2 != NULL) {
+                CERROR ("dropping packet from "LPX64" for "LPX64
+                        ": target is a peer\n", conn->ksnc_hdr.src_nid,
+                        conn->ksnc_hdr.dest_nid);
+                ksocknal_put_conn (conn2);  /* drop ref from get above */
+
+                /* on to next packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;       /* Getting FMB now */
+        conn->ksnc_rx_nob_left = body_len;              /* stash packet size */
+        conn->ksnc_rx_nob_wanted = body_len;            /* (no slop) */
+}
+
+int
+ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+{
+        static char ksocknal_slop_buffer[4096];
+
+        int   nob;
+        int   niov;
+        int   skipped;
+
+        if (nob_to_skip == 0) {         /* right at next packet boundary now */
+                conn->ksnc_rx_state = SOCKNAL_RX_HEADER;
+                conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t);
+
+                conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+                conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr;
+                conn->ksnc_rx_iov[0].iov_len  = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_niov = 1;
+
+                conn->ksnc_rx_kiov = NULL;
+                conn->ksnc_rx_nkiov = 0;
+                return (1);
+        }
+
+        /* Set up to skip as much as possible now.  If there's more left
+         * (ran out of iov entries) we'll get called again */
+
+        conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+        conn->ksnc_rx_nob_left = nob_to_skip;
+        conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+        skipped = 0;
+        niov = 0;
+
+        do {
+                nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer));
+
+                conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer;
+                conn->ksnc_rx_iov[niov].iov_len  = nob;
+                niov++;
+                skipped += nob;
+                nob_to_skip -= nob;
+
+        } while (nob_to_skip != 0 &&    /* mustn't overflow conn's rx iov */
+                 niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec));
+
+        conn->ksnc_rx_niov = niov;
+        conn->ksnc_rx_kiov = NULL;
+        conn->ksnc_rx_nkiov = 0;
+        conn->ksnc_rx_nob_wanted = skipped;
+        return (0);
+}
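+
+/* Worked example (illustrative): skipping a 10000-byte body builds three iov
+ * entries of 4096, 4096 and 1808 bytes, all aimed at the same static slop
+ * buffer, and returns 0 with ksnc_rx_nob_wanted == 10000.  If nob_to_skip
+ * ever exceeds what ksnc_rx_iov_space can describe, the read leaves the
+ * remainder in ksnc_rx_nob_left and the SOCKNAL_RX_SLOP case calls back in
+ * here for another pass.
+ */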
+
+void
+ksocknal_process_receive (ksock_sched_t *sched, long *irq_flags)
+{
+        ksock_conn_t *conn;
+        ksock_fmb_t  *fmb;
+        int           rc;
+
+        /* NB: sched->kss_lock held */
+
+        LASSERT (!list_empty (&sched->kss_rx_conns));
+        conn = list_entry(sched->kss_rx_conns.next, ksock_conn_t, ksnc_rx_list);
+        list_del (&conn->ksnc_rx_list);
+
+        spin_unlock_irqrestore (&sched->kss_lock, *irq_flags);
+
+        CDEBUG(D_NET, "sched %p conn %p\n", sched, conn);
+        LASSERT (atomic_read (&conn->ksnc_refcount) > 0);
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_ready);
+
+        /* doesn't need a forwarding buffer */
+        if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB)
+                goto try_read;
+
+ get_fmb:
+        fmb = ksocknal_get_idle_fmb (conn);
+        if (fmb == NULL) {      /* conn descheduled waiting for idle fmb */
+                spin_lock_irqsave (&sched->kss_lock, *irq_flags);
+                return;
+        }
+
+        if (ksocknal_init_fmb (conn, fmb)) /* packet forwarded ? */
+                goto out;               /* come back later for next packet */
+
+ try_read:
+        /* NB: sched lock NOT held */
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+
+        LASSERT (conn->ksnc_rx_nob_wanted > 0);
+
+        conn->ksnc_rx_ready = 0;/* data ready may race with me and set ready */
+        mb();                   /* => clear BEFORE trying to read */
+
+        rc = ksocknal_recvmsg(conn);
+
+        if (rc == 0)
+                goto out;
+        if (rc < 0) {
+#warning FIXME: handle socket errors properly
+                CERROR ("Error socknal read %p: %d\n", conn, rc);
+                goto out;
+        }
+
+        if (conn->ksnc_rx_nob_wanted != 0)      /* short read */
+                goto out;                       /* try again later */
+
+        /* got all I wanted, assume there's more - prevent data_ready locking */
+        conn->ksnc_rx_ready = 1;
+
+        switch (conn->ksnc_rx_state) {
+        case SOCKNAL_RX_HEADER:
+                /* It's not for me */
+                if (conn->ksnc_hdr.type != PTL_MSG_HELLO &&
+                    NTOH__u64(conn->ksnc_hdr.dest_nid) != ksocknal_lib.ni.nid) {
+                        ksocknal_fwd_parse (conn);
+                        switch (conn->ksnc_rx_state) {
+                        case SOCKNAL_RX_HEADER: /* skipped (zero payload) */
+                                goto out;       /* => come back later */
+                        case SOCKNAL_RX_SLOP:   /* skipping packet's body */
+                                goto try_read;  /* => go read it */
+                        case SOCKNAL_RX_GET_FMB: /* forwarding */
+                                goto get_fmb;   /* => go get a fwd msg buffer */
+                        default:
+                                LBUG ();
+                        }
+                        /* Not Reached */
+                }
+
+                PROF_START(lib_parse);
+                /* sets wanted_len, iovs etc */
+                lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn);
+                PROF_FINISH(lib_parse);
+
+                if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */
+                        conn->ksnc_rx_state = SOCKNAL_RX_BODY;
+                        goto try_read;          /* go read the payload */
+                }
+                /* Fall through (completed packet for me) */
+
+        case SOCKNAL_RX_BODY:
+                atomic_inc (&ksocknal_packets_received);
+                /* packet is done now */
+                lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie);
+                /* Fall through */
+
+        case SOCKNAL_RX_SLOP:
+                /* starting new packet? */
+                if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left))
+                        goto out;       /* come back later */
+                goto try_read;          /* try to finish reading slop now */
+
+        case SOCKNAL_RX_BODY_FWD:
+                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n",
+                        conn, NTOH__u64 (conn->ksnc_hdr.src_nid),
+                        NTOH__u64 (conn->ksnc_hdr.dest_nid),
+                        conn->ksnc_rx_nob_left);
+
+                atomic_inc (&ksocknal_packets_received);
+
+                /* ksocknal_init_fmb() put router desc. in conn->ksnc_cookie */
+                kpr_fwd_start (&ksocknal_data.ksnd_router,
+                               (kpr_fwd_desc_t *)conn->ksnc_cookie);
+
+                /* no slop in forwarded packets */
+                LASSERT (conn->ksnc_rx_nob_left == 0);
+
+                ksocknal_new_packet (conn, 0);  /* on to next packet */
+                goto out;                       /* (later) */
+
+        default:
+                break;
+        }
+
+        /* Not Reached */
+        LBUG ();
+
+ out:
+        spin_lock_irqsave (&sched->kss_lock, *irq_flags);
+
+        /* no data there to read? */
+        if (!conn->ksnc_rx_ready) {
+                /* let socket callback schedule again */
+                conn->ksnc_rx_scheduled = 0;
+                ksocknal_put_conn (conn);       /* release scheduler's ref */
+        } else                              /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns);
+}
+
+int
+ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg,
+               unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen)
+{
+        ksock_conn_t *conn = (ksock_conn_t *)private;
+
+        LASSERT (mlen <= rlen);
+        LASSERT (niov <= PTL_MD_MAX_IOV);
+        
+        conn->ksnc_cookie = msg;
+        conn->ksnc_rx_nob_wanted = mlen;
+        conn->ksnc_rx_nob_left   = rlen;
+
+        conn->ksnc_rx_nkiov = 0;
+        conn->ksnc_rx_kiov = NULL;
+        conn->ksnc_rx_niov = niov;
+        conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
+        memcpy (conn->ksnc_rx_iov, iov, niov * sizeof (*iov));
+
+        LASSERT (mlen == 
+                 lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
+                 lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
+
+        return (rlen);
+}
+
+int
+ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg,
+                     unsigned int niov, ptl_kiov_t *kiov, size_t mlen, size_t rlen)
+{
+        ksock_conn_t *conn = (ksock_conn_t *)private;
+
+        LASSERT (mlen <= rlen);
+        LASSERT (niov <= PTL_MD_MAX_IOV);
+        
+        conn->ksnc_cookie = msg;
+        conn->ksnc_rx_nob_wanted = mlen;
+        conn->ksnc_rx_nob_left   = rlen;
+
+        conn->ksnc_rx_niov = 0;
+        conn->ksnc_rx_iov  = NULL;
+        conn->ksnc_rx_nkiov = niov;
+        conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
+        memcpy (conn->ksnc_rx_kiov, kiov, niov * sizeof (*kiov));
+
+        LASSERT (mlen == 
+                 lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
+                 lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
+
+        return (rlen);
+}
+
+int ksocknal_scheduler (void *arg)
+{
+        ksock_sched_t     *sched = (ksock_sched_t *)arg;
+        unsigned long      flags;
+        int                rc;
+        int                nloops = 0;
+        int                id = sched - ksocknal_data.ksnd_schedulers;
+        char               name[16];
+#if (CONFIG_SMP && CPU_AFFINITY)
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        int                cpu = cpu_logical_map(id % num_online_cpus());
+#else
+#warning "Take care of architecure specific logical APIC map"
+        int cpu = 1;    /* Have to change later. */
+#endif /* LINUX_VERSION_CODE */
+        
+        set_cpus_allowed (current, 1 << cpu);
+        id = cpu;
+#endif /* CONFIG_SMP && CPU_AFFINITY */
+
+        snprintf (name, sizeof (name), "ksocknald[%d]", id);
+        kportal_daemonize (name);
+        kportal_blockallsigs ();
+        
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        while (!ksocknal_data.ksnd_shuttingdown) {
+                int did_something = 0;
+
+                /* Ensure I progress everything semi-fairly */
+
+                if (!list_empty (&sched->kss_rx_conns)) {
+                        did_something = 1;
+                        /* drops & regains kss_lock */
+                        ksocknal_process_receive (sched, &flags);
+                }
+
+                if (!list_empty (&sched->kss_tx_conns)) {
+                        did_something = 1;
+                        /* drops and regains kss_lock */
+                        ksocknal_process_transmit (sched, &flags);
+                }
+#if SOCKNAL_ZC
+                if (!list_empty (&sched->kss_zctxdone_list)) {
+                        ksock_tx_t *tx =
+                                list_entry(sched->kss_zctxdone_list.next,
+                                           ksock_tx_t, tx_list);
+                        did_something = 1;
+
+                        list_del (&tx->tx_list);
+                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+                        ksocknal_tx_done (tx);
+
+                        spin_lock_irqsave (&sched->kss_lock, flags);
+                }
+#endif
+                if (!did_something ||           /* nothing to do */
+                    ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */
+                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+                        nloops = 0;
+
+                        if (!did_something) {   /* wait for something to do */
+#if SOCKNAL_ZC
+                                rc = wait_event_interruptible (sched->kss_waitq,
+                                                               ksocknal_data.ksnd_shuttingdown ||
+                                                               !list_empty(&sched->kss_rx_conns) ||
+                                                               !list_empty(&sched->kss_tx_conns) ||
+                                                               !list_empty(&sched->kss_zctxdone_list));
+#else
+                                rc = wait_event_interruptible (sched->kss_waitq,
+                                                               ksocknal_data.ksnd_shuttingdown ||
+                                                               !list_empty(&sched->kss_rx_conns) ||
+                                                               !list_empty(&sched->kss_tx_conns));
+#endif
+                                LASSERT (rc == 0);
+                        } else
+                               our_cond_resched();
+
+                        spin_lock_irqsave (&sched->kss_lock, flags);
+                }
+        }
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+        ksocknal_thread_fini ();
+        return (0);
+}
+
+void
+ksocknal_data_ready (struct sock *sk, int n)
+{
+        unsigned long  flags;
+        ksock_conn_t  *conn;
+        ksock_sched_t *sched;
+        ENTRY;
+
+        /* interleave correctly with closing sockets... */
+        read_lock (&ksocknal_data.ksnd_socklist_lock);
+
+        conn = sk->user_data;
+        if (conn == NULL) {             /* raced with ksocknal_close_sock */
+                LASSERT (sk->data_ready != &ksocknal_data_ready);
+                sk->data_ready (sk, n);
+        } else if (!conn->ksnc_rx_ready) {        /* new news */
+                /* Set ASAP in case of concurrent calls to me */
+                conn->ksnc_rx_ready = 1;
+
+                sched = conn->ksnc_scheduler;
+
+                spin_lock_irqsave (&sched->kss_lock, flags);
+
+                /* Set again (process_receive may have cleared while I blocked for the lock) */
+                conn->ksnc_rx_ready = 1;
+
+                if (!conn->ksnc_rx_scheduled) {  /* not being progressed */
+                        list_add_tail(&conn->ksnc_rx_list,
+                                      &sched->kss_rx_conns);
+                        conn->ksnc_rx_scheduled = 1;
+                        /* extra ref for scheduler */
+                        atomic_inc (&conn->ksnc_refcount);
+
+                        if (waitqueue_active (&sched->kss_waitq))
+                                wake_up (&sched->kss_waitq);
+                }
+
+                spin_unlock_irqrestore (&sched->kss_lock, flags);
+        }
+
+        read_unlock (&ksocknal_data.ksnd_socklist_lock);
+
+        EXIT;
+}
+
+void
+ksocknal_write_space (struct sock *sk)
+{
+        unsigned long  flags;
+        ksock_conn_t  *conn;
+        ksock_sched_t *sched;
+
+        /* interleave correctly with closing sockets... */
+        read_lock (&ksocknal_data.ksnd_socklist_lock);
+
+        conn = sk->user_data;
+
+        CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
+               sk, tcp_wspace(sk), SOCKNAL_TX_LOW_WATER(sk), conn,
+               (conn == NULL) ? "" : (test_bit (0, &conn->ksnc_tx_ready) ?
+                                      " ready" : " blocked"),
+               (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ?
+                                      " scheduled" : " idle"),
+               (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ?
+                                      " empty" : " queued"));
+
+        if (conn == NULL) {             /* raced with ksocknal_close_sock */
+                LASSERT (sk->write_space != &ksocknal_write_space);
+                sk->write_space (sk);
+        } else if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */
+                clear_bit (SOCK_NOSPACE, &sk->socket->flags);
+
+                if (!conn->ksnc_tx_ready) {      /* new news */
+                        /* Set ASAP in case of concurrent calls to me */
+                        conn->ksnc_tx_ready = 1;
+
+                        sched = conn->ksnc_scheduler;
+
+                        spin_lock_irqsave (&sched->kss_lock, flags);
+
+                        /* Set again (process_transmit may have
+                           cleared while I blocked for the lock) */
+                        conn->ksnc_tx_ready = 1;
+
+                        if (!conn->ksnc_tx_scheduled &&   /* not being progressed */
+                            !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */
+                                list_add_tail (&conn->ksnc_tx_list,
+                                               &sched->kss_tx_conns);
+                                conn->ksnc_tx_scheduled = 1;
+                                /* extra ref for scheduler */
+                                atomic_inc (&conn->ksnc_refcount);
+
+                                if (waitqueue_active (&sched->kss_waitq))
+                                        wake_up (&sched->kss_waitq);
+                        }
+
+                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+                }
+        }
+
+        read_unlock (&ksocknal_data.ksnd_socklist_lock);
+}
+
+int
+ksocknal_reaper (void *arg)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        int                rc;
+        
+        kportal_daemonize ("ksocknal_reaper");
+        kportal_blockallsigs ();
+
+        while (!ksocknal_data.ksnd_shuttingdown) {
+                spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+
+                if (list_empty (&ksocknal_data.ksnd_reaper_list)) {
+                        conn = NULL;
+                } else {
+                        conn = list_entry (ksocknal_data.ksnd_reaper_list.next,
+                                           ksock_conn_t, ksnc_list);
+                        list_del (&conn->ksnc_list);
+                }
+
+                spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+
+                if (conn != NULL)
+                        ksocknal_close_conn (conn);
+                else {
+                        rc = wait_event_interruptible (ksocknal_data.ksnd_reaper_waitq,
+                                                       ksocknal_data.ksnd_shuttingdown ||
+                                                       !list_empty(&ksocknal_data.ksnd_reaper_list));
+                        LASSERT (rc == 0);
+                }
+        }
+
+        ksocknal_thread_fini ();
+        return (0);
+}
+
+nal_cb_t ksocknal_lib = {
+        nal_data:       &ksocknal_data,                /* NAL private data */
+        cb_send:         ksocknal_send,
+        cb_send_pages:   ksocknal_send_pages,
+        cb_recv:         ksocknal_recv,
+        cb_recv_pages:   ksocknal_recv_pages,
+        cb_read:         ksocknal_read,
+        cb_write:        ksocknal_write,
+        cb_callback:     ksocknal_callback,
+        cb_malloc:       ksocknal_malloc,
+        cb_free:         ksocknal_free,
+        cb_printf:       ksocknal_printf,
+        cb_cli:          ksocknal_cli,
+        cb_sti:          ksocknal_sti,
+        cb_dist:         ksocknal_dist
+};
diff --git a/lnet/klnds/toelnd/.cvsignore b/lnet/klnds/toelnd/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lnet/klnds/toelnd/Makefile.am b/lnet/klnds/toelnd/Makefile.am
new file mode 100644 (file)
index 0000000..9bfff64
--- /dev/null
@@ -0,0 +1,13 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = ktoenal
+modulenet_DATA = ktoenal.o
+EXTRA_PROGRAMS = ktoenal
+
+DEFS =
+ktoenal_SOURCES = toenal.c toenal_cb.c toenal.h
diff --git a/lnet/klnds/toelnd/toenal.c b/lnet/klnds/toelnd/toenal.c
new file mode 100644 (file)
index 0000000..1f5dc38
--- /dev/null
@@ -0,0 +1,629 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Kedar Sovani <kedar@calsoftinc.com>
+ *   Author: Amey Inamdar <amey@calsoftinc.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include <linux/poll.h>
+#include "toenal.h"
+
+ptl_handle_ni_t         ktoenal_ni;
+static nal_t            ktoenal_api;
+static ksock_nal_data_t ktoenal_data;
+
+/*
+ksocknal_interface_t ktoenal_interface = {
+        ksni_add_sock:         ktoenal_add_sock,
+        ksni_close_sock:       ktoenal_close_sock,
+        ksni_set_mynid:                ktoenal_set_mynid,
+};
+*/
+
+kpr_nal_interface_t ktoenal_router_interface = {
+        kprni_nalid:   TOENAL,
+        kprni_arg:     &ktoenal_data,
+        kprni_fwd:     ktoenal_fwd_packet,
+};
+
+
+int
+ktoenal_api_forward(nal_t *nal, int id, void *args, size_t args_len,
+                       void *ret, size_t ret_len)
+{
+        ksock_nal_data_t *k;
+        nal_cb_t *nal_cb;
+
+        k = nal->nal_data;
+        nal_cb = k->ksnd_nal_cb;
+
+        lib_dispatch(nal_cb, k, id, args, ret); /* ktoenal_send needs k */
+        return PTL_OK;
+}
+
+int
+ktoenal_api_shutdown(nal_t *nal, int ni)
+{
+       CDEBUG (D_NET, "closing all connections\n");
+
+        return ktoenal_close_sock(0);          /* close all sockets */
+}
+
+void
+ktoenal_api_yield(nal_t *nal)
+{
+        our_cond_resched();
+        return;
+}
+
+void
+ktoenal_api_lock(nal_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *k;
+        nal_cb_t *nal_cb;
+
+        k = nal->nal_data;
+        nal_cb = k->ksnd_nal_cb;
+        nal_cb->cb_cli(nal_cb,flags);
+}
+
+void
+ktoenal_api_unlock(nal_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *k;
+        nal_cb_t *nal_cb;
+
+        k = nal->nal_data;
+        nal_cb = k->ksnd_nal_cb;
+        nal_cb->cb_sti(nal_cb,flags);
+}
+
+nal_t *
+ktoenal_init(int interface, ptl_pt_index_t ptl_size,
+              ptl_ac_index_t ac_size, ptl_pid_t requested_pid)
+{
+        CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n",
+               ktoenal_data.ksnd_mynid);
+        lib_init(&ktoenal_lib, ktoenal_data.ksnd_mynid, 0, 10, ptl_size,
+                 ac_size);
+        return (&ktoenal_api);
+}
+
+/*
+ *  EXTRA functions follow
+ */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#define SOCKET_I(inode) (&(inode)->u.socket_i)
+#endif
+static __inline__ struct socket *
+socki_lookup(struct inode *inode)
+{
+        return SOCKET_I(inode);
+}
+
+int
+ktoenal_set_mynid(ptl_nid_t nid)
+{
+        lib_ni_t *ni = &ktoenal_lib.ni;
+
+        /* FIXME: we have to do this because we call lib_init() at module
+         * insertion time, which is before we have 'mynid' available.  lib_init
+         * sets the NAL's nid, which it uses to tell other nodes where packets
+         * are coming from.  This is not a very graceful solution to this
+         * problem. */
+
+        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", nid, ni->nid);
+
+        ktoenal_data.ksnd_mynid = nid;
+        ni->nid = nid;
+        return (0);
+}
+
+int
+ktoenal_add_sock (ptl_nid_t nid, int fd)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        struct file       *file = NULL;
+        struct socket     *sock = NULL;
+        int                ret;
+        ENTRY;
+
+        file = fget(fd);
+        if (file == NULL)
+                RETURN(-EINVAL);
+
+        ret = -EINVAL;
+        sock = socki_lookup(file->f_dentry->d_inode);
+        if (sock == NULL)
+                GOTO(error, ret);
+
+        ret = -ENOMEM;
+        PORTAL_ALLOC(conn, sizeof(*conn));
+        if (!conn)
+                GOTO(error, ret);
+
+        memset (conn, 0, sizeof (*conn));       /* zero for consistency */
+        file->f_flags |= O_NONBLOCK;  /* Does this have any conflicts? */
+        conn->ksnc_file = file;
+        conn->ksnc_sock = sock;
+        conn->ksnc_peernid = nid;
+        atomic_set (&conn->ksnc_refcount, 1);    /* 1 ref for socklist */
+
+        conn->ksnc_rx_ready = 0;
+        conn->ksnc_rx_scheduled = 0;
+        ktoenal_new_packet (conn, 0);
+
+        INIT_LIST_HEAD (&conn->ksnc_tx_queue);
+        conn->ksnc_tx_ready = 0;
+        conn->ksnc_tx_scheduled = 0;
+
+        LASSERT (!in_interrupt());
+        write_lock_irqsave (&ktoenal_data.ksnd_socklist_lock, flags);
+
+        list_add(&conn->ksnc_list, &ktoenal_data.ksnd_socklist);
+        write_unlock_irqrestore (&ktoenal_data.ksnd_socklist_lock, flags);
+
+        ktoenal_data_ready(conn);
+        ktoenal_write_space(conn);
+
+        /* Schedule the pollthread so that it will poll for the newly
+         * created socket */
+        ktoenal_data.ksnd_slistchange = 1;
+        wake_up_process(ktoenal_data.ksnd_pollthread_tsk);
+
+        CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n",
+               conn, conn->ksnc_peernid);
+
+        /* Can't unload while connection active */
+        PORTAL_MODULE_USE;
+        RETURN(0);
+
+error:
+        fput(file);
+        return (ret);
+}
+
+/* Passing in a zero nid will close all connections */
+int
+ktoenal_close_sock(ptl_nid_t nid)
+{
+        long               flags;
+        ksock_conn_t      *conn;
+        LIST_HEAD         (death_row);
+        struct list_head  *tmp;
+
+        LASSERT (!in_interrupt());
+        write_lock_irqsave (&ktoenal_data.ksnd_socklist_lock, flags);
+
+        if (nid == 0)                           /* close ALL connections */
+        {
+                /* insert 'death row' into the socket list... */
+                list_add (&death_row, &ktoenal_data.ksnd_socklist);
+                /* ...extract and reinitialise the socket list itself... */
+                list_del_init (&ktoenal_data.ksnd_socklist);
+                /* ...and voila, death row is the proud owner of all conns */
+        } else list_for_each (tmp, &ktoenal_data.ksnd_socklist) {
+
+                conn = list_entry (tmp, ksock_conn_t, ksnc_list);
+
+                if (conn->ksnc_peernid == nid)
+                {
+                        list_del (&conn->ksnc_list);
+                        list_add (&conn->ksnc_list, &death_row);
+                        break;
+                }
+        }
+
+
+        write_unlock_irqrestore (&ktoenal_data.ksnd_socklist_lock, flags);
+
+        if (list_empty (&death_row))
+                return (-ENOENT);
+
+        do {
+                conn = list_entry (death_row.next, ksock_conn_t, ksnc_list);
+                list_del (&conn->ksnc_list);
+                ktoenal_put_conn (conn);       /* drop ref for ksnd_socklist */
+        } while (!list_empty (&death_row));
+
+        ktoenal_data.ksnd_slistchange = 1;
+        wake_up_process(ktoenal_data.ksnd_pollthread_tsk);
+
+        return (0);
+}
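+
+/* The open-coded splice above (list_add() of death_row into the socket list,
+ * then list_del_init() of the list head) hands every connection to the local
+ * death_row in O(1), so they can be released after the lock is dropped.
+ * Where list_splice_init() is available the same move can be written directly
+ * (illustrative only):
+ *
+ *      list_splice_init (&ktoenal_data.ksnd_socklist, &death_row);
+ */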
+
+
+ksock_conn_t *
+ktoenal_get_conn (ptl_nid_t nid)
+{
+        struct list_head *tmp;
+        ksock_conn_t     *conn;
+
+        PROF_START(conn_list_walk);
+
+        read_lock (&ktoenal_data.ksnd_socklist_lock);
+
+        list_for_each(tmp, &ktoenal_data.ksnd_socklist) {
+
+                conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+                if (conn->ksnc_peernid == nid)
+                {
+                        /* caller is referencing */
+                        atomic_inc (&conn->ksnc_refcount);
+
+                        read_unlock (&ktoenal_data.ksnd_socklist_lock);
+
+                        CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n",
+                               conn, nid, atomic_read (&conn->ksnc_refcount));
+
+                        PROF_FINISH(conn_list_walk);
+                        return (conn);
+                }
+        }
+
+        read_unlock (&ktoenal_data.ksnd_socklist_lock);
+
+        CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n", nid);
+        PROF_FINISH(conn_list_walk);
+        return (NULL);
+}
+
+void
+ktoenal_close_conn (ksock_conn_t *conn)
+{
+        CDEBUG (D_NET, "connection [%p] closed \n", conn);
+
+        fput (conn->ksnc_file);
+        PORTAL_FREE (conn, sizeof (*conn));
+        /* One less connection keeping us hanging on */
+        PORTAL_MODULE_UNUSE;
+}
+
+void
+_ktoenal_put_conn (ksock_conn_t *conn)
+{
+        unsigned long flags;
+
+        CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn);
+
+        /* "But what is the black spot, captain?" I asked.
+         * "That's a summons, mate..." */
+
+        LASSERT (atomic_read (&conn->ksnc_refcount) == 0);
+        LASSERT (!conn->ksnc_rx_scheduled);
+
+        if (!in_interrupt())
+        {
+                ktoenal_close_conn (conn);
+                return;
+        }
+
+        spin_lock_irqsave (&ktoenal_data.ksnd_reaper_lock, flags);
+
+        list_add (&conn->ksnc_list, &ktoenal_data.ksnd_reaper_list);
+        wake_up (&ktoenal_data.ksnd_reaper_waitq);
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_reaper_lock, flags);
+}
+
+void
+ktoenal_free_buffers (void)
+{
+        if (ktoenal_data.ksnd_fmbs != NULL)
+        {
+                ksock_fmb_t *fmb = (ksock_fmb_t *)ktoenal_data.ksnd_fmbs;
+                int          i;
+                int          j;
+
+                for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++, fmb++)
+                        for (j = 0; j < fmb->fmb_npages; j++)
+                                if (fmb->fmb_pages[j] != NULL)
+                                        __free_page (fmb->fmb_pages[j]);
+
+                PORTAL_FREE (ktoenal_data.ksnd_fmbs,
+                             sizeof (ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS));
+        }
+
+        if (ktoenal_data.ksnd_ltxs != NULL)
+                PORTAL_FREE (ktoenal_data.ksnd_ltxs,
+                             sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+}
+
+int
+ktoenal_cmd(struct portal_ioctl_data * data, void * private)
+{
+        int rc = -EINVAL;
+
+        LASSERT (data != NULL);
+
+        switch(data->ioc_nal_cmd) {
+        case NAL_CMD_REGISTER_PEER_FD: {
+                rc = ktoenal_add_sock(data->ioc_nid, data->ioc_fd);
+                break;
+        }
+        case NAL_CMD_CLOSE_CONNECTION: {
+                rc = ktoenal_close_sock(data->ioc_nid);
+                break;
+        }
+        case NAL_CMD_REGISTER_MYNID: {
+                rc = ktoenal_set_mynid (data->ioc_nid);
+                break;
+        }
+        }
+
+        return rc;
+}
+
+
+void __exit
+ktoenal_module_fini (void)
+{
+        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        switch (ktoenal_data.ksnd_init)
+        {
+        default:
+                LASSERT (0);
+
+        case SOCKNAL_INIT_ALL:
+                kportal_nal_unregister(TOENAL);
+                PORTAL_SYMBOL_UNREGISTER (ktoenal_ni);
+                /* fall through */
+
+        case SOCKNAL_INIT_PTL:
+                PtlNIFini(ktoenal_ni);
+                lib_fini(&ktoenal_lib);
+                /* fall through */
+
+        case SOCKNAL_INIT_DATA:
+                /* Module refcount only gets to zero when all connections
+                 * have been closed so all lists must be empty */
+                LASSERT (list_empty (&ktoenal_data.ksnd_socklist));
+                LASSERT (list_empty (&ktoenal_data.ksnd_reaper_list));
+                LASSERT (list_empty (&ktoenal_data.ksnd_rx_conns));
+                LASSERT (list_empty (&ktoenal_data.ksnd_tx_conns));
+                LASSERT (list_empty (&ktoenal_data.ksnd_small_fmp.fmp_blocked_conns));
+                LASSERT (list_empty (&ktoenal_data.ksnd_large_fmp.fmp_blocked_conns));
+
+                kpr_shutdown (&ktoenal_data.ksnd_router); /* stop router calling me */
+
+                /* flag threads to terminate; wake and wait for them to die */
+                ktoenal_data.ksnd_shuttingdown = 1;
+                wake_up_all (&ktoenal_data.ksnd_reaper_waitq);
+                wake_up_all (&ktoenal_data.ksnd_sched_waitq);
+                wake_up_process(ktoenal_data.ksnd_pollthread_tsk);
+
+                while (atomic_read (&ktoenal_data.ksnd_nthreads) != 0)
+                {
+                        CDEBUG (D_NET, "waiting for %d threads to terminate\n",
+                                atomic_read (&ktoenal_data.ksnd_nthreads));
+                        set_current_state (TASK_UNINTERRUPTIBLE);
+                        schedule_timeout (HZ);
+                }
+
+                kpr_deregister (&ktoenal_data.ksnd_router);
+
+                ktoenal_free_buffers();
+                /* fall through */
+
+        case SOCKNAL_INIT_NOTHING:
+                break;
+        }
+
+        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n",
+               atomic_read(&portal_kmemory));
+}
+
+int __init
+ktoenal_module_init (void)
+{
+        int   pkmem = atomic_read(&portal_kmemory);
+        int   rc;
+        int   i;
+        int   j;
+
+        /* packet descriptor must fit in a router descriptor's scratchpad */
+        LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t));
+
+        LASSERT (ktoenal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+
+        ktoenal_api.forward  = ktoenal_api_forward;
+        ktoenal_api.shutdown = ktoenal_api_shutdown;
+        ktoenal_api.yield    = ktoenal_api_yield;
+        ktoenal_api.validate = NULL;           /* our api validate is a NOOP */
+        ktoenal_api.lock     = ktoenal_api_lock;
+        ktoenal_api.unlock   = ktoenal_api_unlock;
+        ktoenal_api.nal_data = &ktoenal_data;
+
+        ktoenal_lib.nal_data = &ktoenal_data;
+
+        memset (&ktoenal_data, 0, sizeof (ktoenal_data)); /* zero pointers */
+
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_socklist);
+        rwlock_init(&ktoenal_data.ksnd_socklist_lock);
+
+        ktoenal_data.ksnd_nal_cb = &ktoenal_lib;
+        spin_lock_init (&ktoenal_data.ksnd_nal_cb_lock);
+
+        spin_lock_init (&ktoenal_data.ksnd_sched_lock);
+
+        init_waitqueue_head (&ktoenal_data.ksnd_sched_waitq);
+
+        INIT_LIST_HEAD (&ktoenal_data.ksnd_rx_conns);
+        INIT_LIST_HEAD (&ktoenal_data.ksnd_tx_conns);
+
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_small_fmp.fmp_idle_fmbs);
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_small_fmp.fmp_blocked_conns);
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_large_fmp.fmp_idle_fmbs);
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_large_fmp.fmp_blocked_conns);
+
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_idle_nblk_ltx_list);
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_idle_ltx_list);
+        init_waitqueue_head(&ktoenal_data.ksnd_idle_ltx_waitq);
+
+        INIT_LIST_HEAD (&ktoenal_data.ksnd_reaper_list);
+        init_waitqueue_head(&ktoenal_data.ksnd_reaper_waitq);
+        spin_lock_init (&ktoenal_data.ksnd_reaper_lock);
+
+        ktoenal_data.ksnd_init = SOCKNAL_INIT_DATA; /* flag lists/ptrs/locks initialised */
+
+        PORTAL_ALLOC(ktoenal_data.ksnd_fmbs,
+                     sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS));
+        if (ktoenal_data.ksnd_fmbs == NULL)
+                RETURN(-ENOMEM);
+
+        /* NULL out buffer pointers etc */
+        memset(ktoenal_data.ksnd_fmbs, 0,
+               sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS));
+
+        for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++)
+        {
+                ksock_fmb_t *fmb = &((ksock_fmb_t *)ktoenal_data.ksnd_fmbs)[i];
+
+                if (i < SOCKNAL_SMALL_FWD_NMSGS)
+                {
+                        fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES;
+                        fmb->fmb_pool = &ktoenal_data.ksnd_small_fmp;
+                }
+                else
+                {
+                        fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES;
+                        fmb->fmb_pool = &ktoenal_data.ksnd_large_fmp;
+                }
+
+                LASSERT (fmb->fmb_npages > 0);
+                for (j = 0; j < fmb->fmb_npages; j++)
+                {
+                        fmb->fmb_pages[j] = alloc_page(GFP_KERNEL);
+
+                        if (fmb->fmb_pages[j] == NULL)
+                        {
+                                ktoenal_module_fini ();
+                                return (-ENOMEM);
+                        }
+
+                        LASSERT (page_address (fmb->fmb_pages[j]) != NULL);
+                }
+
+                list_add (&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs);
+        }
+
+        PORTAL_ALLOC(ktoenal_data.ksnd_ltxs,
+                     sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+        if (ktoenal_data.ksnd_ltxs == NULL)
+        {
+                ktoenal_module_fini ();
+                return (-ENOMEM);
+        }
+
+        /* Deterministic bugs please */
+        memset (ktoenal_data.ksnd_ltxs, 0xeb,
+                sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+
+        for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++)
+        {
+                ksock_ltx_t *ltx = &((ksock_ltx_t *)ktoenal_data.ksnd_ltxs)[i];
+
+                ltx->ltx_idle = i < SOCKNAL_NLTXS ?
+                                &ktoenal_data.ksnd_idle_ltx_list :
+                                &ktoenal_data.ksnd_idle_nblk_ltx_list;
+                list_add (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+        }
+
+        rc = PtlNIInit(ktoenal_init, 32, 4, 0, &ktoenal_ni);
+        if (rc != 0)
+        {
+                CERROR("ktoenal: PtlNIInit failed: error %d\n", rc);
+                ktoenal_module_fini ();
+                RETURN (rc);
+        }
+        PtlNIDebug(ktoenal_ni, ~0);
+
+        ktoenal_data.ksnd_init = SOCKNAL_INIT_PTL; /* flag PtlNIInit() called */
+
+        ktoenal_data.ksnd_slistchange = 1;
+        for (i = 0; i < TOENAL_N_SCHED; i++)
+        {
+                rc = ktoenal_thread_start (ktoenal_scheduler, NULL);
+                if (rc != 0)
+                {
+                        CERROR("Can't spawn socknal scheduler[%d]: %d\n", i, rc);
+                        ktoenal_module_fini ();
+                        RETURN (rc);
+                }
+        }
+
+        rc = ktoenal_thread_start (ktoenal_reaper, NULL);
+        if (rc != 0)
+        {
+                CERROR("Can't spawn socknal reaper: %d\n", rc);
+                ktoenal_module_fini ();
+                RETURN (rc);
+        }
+
+        rc = ktoenal_thread_start (ktoenal_pollthread, NULL);
+        if (rc != 0)
+        {
+                CERROR("Can't spawn socknal pollthread: %d\n", rc);
+                ktoenal_module_fini ();
+                RETURN (rc);
+        }
+
+        rc = kpr_register(&ktoenal_data.ksnd_router,
+                          &ktoenal_router_interface);
+        if (rc != 0)
+                CDEBUG (D_NET, "Can't initialise routing interface (rc = %d): not routing\n", rc);
+
+        rc = kportal_nal_register(TOENAL, &ktoenal_cmd, NULL);
+        if (rc != 0)
+                CDEBUG(D_NET, "Can't initialise command interface (rc = %d)\n",
+                       rc);
+
+        PORTAL_SYMBOL_REGISTER(ktoenal_ni);
+
+        /* flag everything initialised */
+        ktoenal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+        printk(KERN_INFO "Routing TOE NAL loaded (Routing %s, initial mem %d)\n",
+               kpr_routing(&ktoenal_data.ksnd_router) ? "enabled" : "disabled",
+               pkmem);
+
+        return (0);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01");
+MODULE_LICENSE("GPL");
+
+module_init(ktoenal_module_init);
+module_exit(ktoenal_module_fini);
+
+EXPORT_SYMBOL (ktoenal_ni);
diff --git a/lnet/klnds/toelnd/toenal.h b/lnet/klnds/toelnd/toenal.h
new file mode 100644 (file)
index 0000000..f793d3b
--- /dev/null
@@ -0,0 +1,236 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Kedar Sovani <kedar@calsoftinc.com>
+ *   Author: Amey Inamdar <amey@calsoftinc.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_PORTAL_ALLOC
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <net/tcp.h>
+#include <linux/uio.h>
+#include <linux/sched.h> 
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_SOCKNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#define SOCKNAL_MAX_FWD_PAYLOAD (64<<10)        /* biggest payload I can forward */
+
+#define SOCKNAL_NLTXS           128             /* # normal transmit messages */
+#define SOCKNAL_NNBLK_LTXS     128             /* # transmit messages reserved if can't block */
+
+#define SOCKNAL_SMALL_FWD_NMSGS        128             /* # small messages I can be forwarding at any time */
+#define SOCKNAL_LARGE_FWD_NMSGS 32              /* # large messages I can be forwarding at any time */
+
+#define SOCKNAL_SMALL_FWD_PAGES        1               /* # pages in a small message fwd buffer */
+
+#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT)
+                                               /* # pages in a large message fwd buffer */
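+/* Worked example (illustrative; assumes 4 KB pages and a header smaller than
+ * one page): with SOCKNAL_MAX_FWD_PAYLOAD == 64K this rounds
+ * sizeof (ptl_hdr_t) + 65536 up to 69632 bytes, i.e. 17 pages per large
+ * forwarding buffer. */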
+
+#define SOCKNAL_RESCHED         100             /* # scheduler loops before reschedule */
+
+#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sndbuf*8)/10)
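+/* Worked example (illustrative): with a 64K socket send buffer the low-water
+ * mark is (65536 * 8) / 10 == 52428 bytes, so the write-space path only treats
+ * the socket as writable again once roughly 80% of the buffer has drained. */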
+
+#define TOENAL_N_SCHED 1
+
+typedef struct                                  /* pool of forwarding buffers */
+{
+        struct list_head  fmp_idle_fmbs;        /* buffers waiting for a connection */
+        struct list_head  fmp_blocked_conns;    /* connections waiting for a buffer */
+} ksock_fmb_pool_t;
+
+typedef struct {
+        int               ksnd_init;            /* initialisation state */
+        
+        struct list_head  ksnd_socklist;        /* all my connections */
+        rwlock_t          ksnd_socklist_lock;   /* stabilise add/find/remove */
+
+
+        ptl_nid_t         ksnd_mynid;
+        nal_cb_t         *ksnd_nal_cb;
+        spinlock_t        ksnd_nal_cb_lock;     /* lib cli/sti lock */
+
+        atomic_t          ksnd_nthreads;        /* # live threads */
+        int               ksnd_shuttingdown;    /* tell threads to exit */
+        
+        kpr_router_t      ksnd_router;          /* THE router */
+
+        spinlock_t        ksnd_sched_lock;      /* serialise packet scheduling */
+        wait_queue_head_t ksnd_sched_waitq;     /* where scheduler(s) wait */
+
+        struct list_head  ksnd_rx_conns;        /* conn waiting to be read */
+        struct list_head  ksnd_tx_conns;        /* conn waiting to be written */
+        
+        void             *ksnd_fmbs;            /* all the pre-allocated FMBs */
+        ksock_fmb_pool_t  ksnd_small_fmp;       /* small message forwarding buffers */
+        ksock_fmb_pool_t  ksnd_large_fmp;       /* large message forwarding buffers */
+
+        void             *ksnd_ltxs;            /* all the pre-allocated LTXs */
+        struct list_head  ksnd_idle_ltx_list;   /* where to get an idle LTX */
+        struct list_head  ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */
+        wait_queue_head_t ksnd_idle_ltx_waitq;  /* where to block for an idle LTX */
+
+        struct list_head  ksnd_reaper_list;     /* conn waiting to be reaped */
+        wait_queue_head_t ksnd_reaper_waitq;    /* reaper sleeps here */
+        spinlock_t        ksnd_reaper_lock;     /* serialise */
+        
+        struct task_struct *ksnd_pollthread_tsk;/* task_struct for the poll thread */
+        poll_table          ksnd_pwait;         /* poll wait table for the socket */
+        int                 ksnd_slistchange;   /* informs the pollthread that
+                                                 * the socklist has changed */  
+} ksock_nal_data_t;
+
+#define SOCKNAL_INIT_NOTHING    0
+#define SOCKNAL_INIT_DATA       1
+#define SOCKNAL_INIT_PTL        2
+#define SOCKNAL_INIT_ALL        3
+
+typedef struct                                  /* transmit packet */
+{
+        struct list_head        tx_list;       /* queue on conn for transmission etc */
+        char                    tx_isfwd;      /* forwarding / sourced here */
+        int                     tx_nob;        /* # packet bytes */
+        int                     tx_niov;       /* # packet frags */
+        struct iovec           *tx_iov;        /* packet frags */
+} ksock_tx_t;
+
+typedef struct                                  /* locally transmitted packet */
+{
+        ksock_tx_t              ltx_tx;         /* send info */
+        struct list_head       *ltx_idle;       /* where to put when idle */
+        void                   *ltx_private;    /* lib_finalize() callback arg */
+        void                   *ltx_cookie;     /* lib_finalize() callback arg */
+        struct iovec            ltx_iov[1 + PTL_MD_MAX_IOV]; /* msg frags */
+        ptl_hdr_t               ltx_hdr;        /* buffer for packet header */
+} ksock_ltx_t;
+
+#define KSOCK_TX_2_KPR_FWD_DESC(ptr)    list_entry (ptr, kpr_fwd_desc_t, kprfd_scratch)
+/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */
+
+#define KSOCK_TX_2_KSOCK_LTX(ptr)       list_entry (ptr, ksock_ltx_t, ltx_tx)
+/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */
+
+/* NB list_entry() is used here as convenient macro for calculating a
+ * pointer to a struct from the address of a member.
+ */
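+
+#if 0   /* illustrative sketch only (not compiled; example_tx2ltx is a
+         * hypothetical helper): recovering the container of an embedded
+         * member with the macro above */
+static inline ksock_ltx_t *
+example_tx2ltx (ksock_tx_t *tx)
+{
+        /* list_entry(ptr, type, member) subtracts offsetof(type, member)
+         * from ptr, so this returns the ksock_ltx_t that embeds *tx */
+        return (KSOCK_TX_2_KSOCK_LTX (tx));
+}
+#endif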
+
+typedef struct                                  /* Kernel portals Socket Forwarding message buffer */
+{                                               /* (socknal->router) */
+        struct list_head        fmb_list;       /* queue idle */
+        kpr_fwd_desc_t          fmb_fwd;        /* router's descriptor */
+        int                     fmb_npages;     /* # pages allocated */
+        ksock_fmb_pool_t       *fmb_pool;       /* owning pool */
+        struct page            *fmb_pages[SOCKNAL_LARGE_FWD_PAGES];
+        struct iovec            fmb_iov[SOCKNAL_LARGE_FWD_PAGES];
+} ksock_fmb_t;
+
+#define SOCKNAL_RX_HEADER       1               /* reading header */
+#define SOCKNAL_RX_BODY         2               /* reading body (to deliver here) */
+#define SOCKNAL_RX_BODY_FWD     3               /* reading body (to forward) */
+#define SOCKNAL_RX_SLOP         4               /* skipping body */
+#define SOCKNAL_RX_GET_FMB      5               /* scheduled for forwarding */
+#define SOCKNAL_RX_FMB_SLEEP    6               /* blocked waiting for a fwd desc */
+
+typedef struct 
+{ 
+        struct list_head    ksnc_list;          /* stash on global socket list */
+        struct file        *ksnc_file;          /* socket filp */
+        struct socket      *ksnc_sock;          /* socket */
+        ptl_nid_t           ksnc_peernid;       /* who's on the other end */
+        atomic_t            ksnc_refcount;      /* # users */
+        
+        /* READER */
+        struct list_head    ksnc_rx_list;       /* where I enq waiting input or a forwarding descriptor */
+        unsigned long       ksnc_rx_ready;      /* data ready to read */
+        int                 ksnc_rx_scheduled;  /* being progressed */
+        int                 ksnc_rx_state;      /* what is being read */
+        int                 ksnc_rx_nob_left;   /* # bytes to next hdr/body  */
+        int                 ksnc_rx_nob_wanted; /* bytes actually wanted */
+        int                 ksnc_rx_niov;       /* # frags */
+        struct iovec        ksnc_rx_iov[1 + PTL_MD_MAX_IOV]; /* the frags */
+
+        void               *ksnc_cookie;        /* rx lib_finalize passthru arg */
+        ptl_hdr_t           ksnc_hdr;           /* where I read headers into */
+
+        /* WRITER */
+        struct list_head    ksnc_tx_list;       /* where I enq waiting for output space */
+        struct list_head    ksnc_tx_queue;      /* packets waiting to be sent */
+        unsigned long       ksnc_tx_ready;      /* write space */
+        int                 ksnc_tx_scheduled;  /* being progressed */
+        
+} ksock_conn_t;
+
+extern int ktoenal_add_sock (ptl_nid_t nid, int fd);
+extern int ktoenal_close_sock(ptl_nid_t nid);
+extern int ktoenal_set_mynid(ptl_nid_t nid);
+extern int ktoenal_push_sock(ptl_nid_t nid);
+extern ksock_conn_t *ktoenal_get_conn (ptl_nid_t nid);
+extern void _ktoenal_put_conn (ksock_conn_t *conn);
+extern void ktoenal_close_conn (ksock_conn_t *conn);
+
+static inline void
+ktoenal_put_conn (ksock_conn_t *conn)
+{
+        CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", 
+                conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount));
+        
+        if (atomic_dec_and_test (&conn->ksnc_refcount))
+                _ktoenal_put_conn (conn);
+}
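+
+#if 0   /* illustrative usage sketch only (not compiled; example_use_conn is
+         * hypothetical): every successful ktoenal_get_conn() takes a
+         * reference the caller must drop again */
+static void
+example_use_conn (ptl_nid_t nid)
+{
+        ksock_conn_t *conn = ktoenal_get_conn (nid);    /* +1 ref, or NULL */
+
+        if (conn == NULL)
+                return;
+
+        /* ... use the connection ... */
+
+        ktoenal_put_conn (conn);        /* drop ref; last put reaps the conn */
+}
+#endif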
+
+extern int ktoenal_thread_start (int (*fn)(void *arg), void *arg);
+extern int ktoenal_new_packet (ksock_conn_t *conn, int skip);
+extern void ktoenal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+extern int ktoenal_scheduler (void *arg);
+extern int ktoenal_reaper (void *arg);
+extern int ktoenal_pollthread (void *arg);
+extern void ktoenal_data_ready(ksock_conn_t *conn);
+extern void ktoenal_write_space(ksock_conn_t *conn);
+
+
+extern nal_cb_t         ktoenal_lib;
+extern ksock_nal_data_t ktoenal_data;
diff --git a/lnet/klnds/toelnd/toenal_cb.c b/lnet/klnds/toelnd/toenal_cb.c
new file mode 100644 (file)
index 0000000..ec37f6f
--- /dev/null
@@ -0,0 +1,1219 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Kedar Sovani <kedar@calsoftinc.com>
+ *   Author: Amey Inamdar <amey@calsoftinc.com>
+ *   
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/poll.h>
+#include "toenal.h"
+
+atomic_t   ktoenal_packets_received;
+long       ktoenal_packets_launched;
+long       ktoenal_packets_transmitted;
+
+/*
+ *  LIB functions follow
+ *
+ */
+int
+ktoenal_read(nal_cb_t *nal, void *private, void *dst_addr,
+              user_ptr src_addr, size_t len)
+{
+        CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr);
+
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+int
+ktoenal_write(nal_cb_t *nal, void *private, user_ptr dst_addr,
+               void *src_addr, size_t len)
+{
+        CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr);
+
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+int 
+ktoenal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq,
+                        ptl_event_t *ev)
+{
+        CDEBUG(D_NET, LPX64": callback eq %p ev %p\n",
+               nal->ni.nid, eq, ev);
+
+        if (eq->event_callback != NULL) 
+                eq->event_callback(ev);
+
+        return 0;
+}
+
+void *
+ktoenal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+
+        if (buf != NULL)
+                memset(buf, 0, len);
+
+        return (buf);
+}
+
+void
+ktoenal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+void
+ktoenal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+       va_list ap;
+       char msg[256];
+
+       va_start (ap, fmt);
+       vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */
+       va_end (ap);
+
+       msg[sizeof (msg) - 1] = 0;              /* ensure terminated */
+
+        CDEBUG (D_NET, "%s", msg);
+}
+
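+/* cb_cli/cb_sti ("clear/set interrupts"): the lib's critical-section hooks;
+ * here they simply take and release ksnd_nal_cb_lock (flags is unused). */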
+void
+ktoenal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+
+        spin_lock(&data->ksnd_nal_cb_lock);
+}
+
+void
+ktoenal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data;
+        data = nal->nal_data;
+
+        spin_unlock(&data->ksnd_nal_cb_lock);
+}
+
+int
+ktoenal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* I would guess that if ktoenal_get_conn(nid) == NULL,
+           and we're not routing, then 'nid' is very distant :) */
+        if ( nal->ni.nid == nid ) {
+                *dist = 0;
+        } else {
+                *dist = 1;
+        }
+
+        return 0;
+}
+
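+/* Grab an idle local transmit descriptor.  If may_block, sleep until one is
+ * free on the blocking pool; otherwise fall back to the reserved non-blocking
+ * pool and possibly return NULL. */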
+ksock_ltx_t *
+ktoenal_get_ltx (int may_block)
+{
+        long        flags;
+        ksock_ltx_t *ltx = NULL;
+        
+        for (;;)
+        {
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+        
+                if (!list_empty (&ktoenal_data.ksnd_idle_ltx_list))
+                {
+                        ltx = list_entry (ktoenal_data.ksnd_idle_ltx_list.next, ksock_ltx_t, ltx_tx.tx_list);
+                        list_del (&ltx->ltx_tx.tx_list);
+                        break;
+                }
+
+                if (!may_block)
+                {
+                        if (!list_empty (&ktoenal_data.ksnd_idle_nblk_ltx_list))
+                        {
+                                ltx = list_entry (ktoenal_data.ksnd_idle_nblk_ltx_list.next, 
+                                                  ksock_ltx_t, ltx_tx.tx_list);
+                                list_del (&ltx->ltx_tx.tx_list);
+                        }
+                        break;
+                }
+                
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+                
+                wait_event (ktoenal_data.ksnd_idle_ltx_waitq,
+                            !list_empty (&ktoenal_data.ksnd_idle_ltx_list));
+        }
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+
+        return (ltx);
+}
+
+int
+ktoenal_sendmsg (struct file *sock, struct iovec *iov, int niov, int nob, int flags)
+{
+        /* NB This procedure "consumes" the iov: tcp_sendmsg doesn't advance it,
+         * so we do it here after a partial send.
+         */
+        mm_segment_t oldmm;
+        int           rc;
+
+        LASSERT (niov > 0);
+        LASSERT (nob > 0);
+        
+        oldmm = get_fs();
+        set_fs (KERNEL_DS);
+
+#ifdef PORTAL_DEBUG
+        {
+                int total_nob;
+                int i;
+                
+                for (i = total_nob = 0; i < niov; i++)
+                        total_nob += iov[i].iov_len;
+                
+                LASSERT (nob == total_nob);
+        }
+#endif        
+        LASSERT (!in_interrupt());
+       
+        rc = sock->f_op->writev(sock, iov, niov, NULL);
+
+        set_fs (oldmm);
+
+        if (rc > 0)                             /* sent something? */
+        {
+                nob = rc;                       /* consume iov */
+                for (;;)
+                {
+                        LASSERT (niov > 0);
+                        
+                        if (iov->iov_len >= nob)
+                        {
+                                iov->iov_len -= nob;
+                                iov->iov_base = (void *)(((unsigned long)iov->iov_base) + nob);
+                                break;
+                        }
+                        nob -= iov->iov_len;
+                        iov->iov_len = 0;
+                        iov++;
+                        niov--;
+                }
+        }
+
+        return (rc);
+}
+
+int
+ktoenal_recvmsg(struct file *sock, struct iovec *iov, int niov, int toread)
+{
+        /* NB This procedure "consumes" iov (actually tcp_recvmsg does)
+         */
+        mm_segment_t oldmm;
+        int ret, i, len = 0, origlen = 0;
+        
+        PROF_START(our_recvmsg);
+        for(i = 0; i < niov; i++) {
+                len += iov[i].iov_len;
+                if(len >= toread)
+                        break;
+        }
+
+        if(len >= toread) {
+                origlen = iov[i].iov_len;
+                iov[i].iov_len -= (len - toread);
+        }
+        else {  /* i == niov */
+                i = niov - 1;
+        }
+
+        oldmm = get_fs();
+        set_fs(KERNEL_DS);
+
+        ret = sock->f_op->readv(sock, iov, i + 1, NULL);
+        
+        set_fs(oldmm);
+
+        if(origlen)
+                iov[i].iov_len = origlen;
+
+        PROF_FINISH(our_recvmsg);
+        return ret;
+}
+
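+/* Try to send the first packet queued on conn.  Called with ksnd_sched_lock
+ * held; the lock is dropped around the actual send and retaken before return. */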
+void
+ktoenal_process_transmit (ksock_conn_t *conn, long *irq_flags)
+{
+        ksock_tx_t *tx = list_entry (conn->ksnc_tx_queue.next, ksock_tx_t, tx_list);
+        int         rc;
+        
+        LASSERT (conn->ksnc_tx_scheduled);
+        LASSERT (conn->ksnc_tx_ready);
+        LASSERT (!list_empty (&conn->ksnc_tx_queue));
+
+        /* assume transmit will complete now, so dequeue while I've got the lock */
+        list_del (&tx->tx_list);
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+
+        LASSERT (tx->tx_nob > 0);
+
+        conn->ksnc_tx_ready = 0;                /* write_space may race with me and set ready */
+        mb();                                   /* => clear BEFORE trying to write */
+
+        rc = ktoenal_sendmsg (conn->ksnc_file,
+                               tx->tx_iov, tx->tx_niov, tx->tx_nob,
+                               list_empty (&conn->ksnc_tx_queue) ? 
+                               MSG_DONTWAIT : (MSG_DONTWAIT | MSG_MORE));
+
+        CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc);
+
+        if (rc < 0)                             /* error */
+        {
+                if (rc == -EAGAIN)              /* socket full => */
+                        rc = 0;                 /* nothing sent */
+                else
+                {
+#warning FIXME: handle socket errors properly
+                        CERROR ("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc);
+                        rc = tx->tx_nob;        /* pretend for now the whole packet went */
+                }
+        }
+
+        if (rc == tx->tx_nob)                   /* everything went */
+        {
+                conn->ksnc_tx_ready = 1;        /* assume more can go (ASAP) */
+                ktoenal_put_conn (conn);       /* release packet's ref */
+
+                if (tx->tx_isfwd)               /* was a forwarded packet? */
+                {
+                        kpr_fwd_done (&ktoenal_data.ksnd_router,
+                                      KSOCK_TX_2_KPR_FWD_DESC (tx), 0);
+
+                        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                }
+                else                            /* local send */
+                {
+                        ksock_ltx_t *ltx = KSOCK_TX_2_KSOCK_LTX (tx);
+
+                        lib_finalize (&ktoenal_lib, ltx->ltx_private, ltx->ltx_cookie);
+
+                        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                        
+                        list_add (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+
+                        /* normal tx desc => wakeup anyone blocking for one */
+                        if (ltx->ltx_idle == &ktoenal_data.ksnd_idle_ltx_list &&
+                            waitqueue_active (&ktoenal_data.ksnd_idle_ltx_waitq))
+                                wake_up (&ktoenal_data.ksnd_idle_ltx_waitq);
+                }
+                ktoenal_packets_transmitted++;
+        }
+        else
+        {
+                tx->tx_nob -= rc;
+
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+
+                /* back onto HEAD of tx_queue */
+                list_add (&tx->tx_list, &conn->ksnc_tx_queue);
+        }
+
+        if (!conn->ksnc_tx_ready ||             /* no space to write now */
+            list_empty (&conn->ksnc_tx_queue))  /* nothing to write */
+        {
+                conn->ksnc_tx_scheduled = 0;    /* not being scheduled */
+                ktoenal_put_conn (conn);       /* release scheduler's ref */
+        }
+        else                                    /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_tx_list, &ktoenal_data.ksnd_tx_conns);
+}
+
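+/* Queue tx on conn and, if the socket has write space and the conn isn't
+ * already scheduled, hand the conn to the scheduler thread. */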
+void
+ktoenal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        long          flags;
+        int           nob = tx->tx_nob;
+        struct iovec *iov = tx->tx_iov;
+        int           niov = 1;
+        
+        LASSERT (nob >= sizeof (ptl_hdr_t));
+
+        /* Truncate iov to exactly match total packet length
+         * since socket sendmsg pays no attention to requested length.
+         */
+        for (;;)
+        {
+                LASSERT (niov <= tx->tx_niov);
+                LASSERT (iov->iov_len >= 0);
+                
+                if (iov->iov_len >= nob)
+                {
+                        iov->iov_len = nob;
+                        break;
+                }
+                nob -= iov->iov_len;
+                iov++;
+                niov++;
+        }
+        tx->tx_niov = niov;
+        
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+        list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
+
+        if (conn->ksnc_tx_ready &&              /* able to send */
+            !conn->ksnc_tx_scheduled)           /* not scheduled to send */
+        {
+                list_add_tail (&conn->ksnc_tx_list, &ktoenal_data.ksnd_tx_conns);
+                conn->ksnc_tx_scheduled = 1;
+                atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */
+                if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq))
+                        wake_up (&ktoenal_data.ksnd_sched_waitq);
+        }
+
+        ktoenal_packets_launched++;
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+}
+
+int
+ktoenal_send(nal_cb_t *nal, void *private, lib_msg_t *cookie,
+              ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+              unsigned int payload_niov, struct iovec *payload_iov, size_t payload_len)
+{
+        ptl_nid_t     gatewaynid;
+        ksock_conn_t *conn;
+        ksock_ltx_t  *ltx;
+        int           rc;
+        int           i;
+
+        /* By this point, as it happens, we have absolutely no idea what
+         * 'private' is.  It might be ksock_nal_data or it might be ksock_conn.
+         * Ha ha, isn't that a funny joke?
+         *
+         * FIXME: this is not the right way to fix this; the right way is to
+         * always pass in the same kind of structure.  This is hard right now.
+         * To revisit this issue, set a breakpoint in here and watch for when
+         * it's called from lib_finalize.  I think this occurs when we send a
+         * packet as a side-effect of another packet, such as when an ACK has
+         * been requested. -phil */
+
+        CDEBUG(D_NET, "sending %d bytes from [%d](%p,%d)... to nid: "
+               LPX64" pid %d\n", (int)payload_len, payload_niov,
+               payload_niov > 0 ? payload_iov[0].iov_base : NULL,
+               (int)(payload_niov > 0 ? payload_iov[0].iov_len : 0), nid, pid);
+
+        if ((conn = ktoenal_get_conn (nid)) == NULL)
+        {
+                /* It's not a peer; try to find a gateway */
+                rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, &gatewaynid);
+                if (rc != 0)
+                {
+                        CERROR ("Can't route to "LPX64": router error %d\n", nid, rc);
+                        return (-1);
+                }
+
+                if ((conn = ktoenal_get_conn (gatewaynid)) == NULL)
+                {
+                        CERROR ("Can't route to "LPX64": gateway "LPX64" is not a peer\n", 
+                                nid, gatewaynid);
+                        return (-1);
+                }
+        }
+
+        /* This transmit has now got a ref on conn */
+
+        /* I may not block for a transmit descriptor if I might block the
+         * receiver, or an interrupt handler. */
+        ltx = ktoenal_get_ltx (!(type == PTL_MSG_ACK ||
+                                 type == PTL_MSG_REPLY ||
+                                 in_interrupt ()));
+        if (ltx == NULL)
+        {
+                CERROR ("Can't allocate tx desc\n");
+                ktoenal_put_conn (conn);
+                return (-1);
+        }
+        
+        /* Init common (to sends and forwards) packet part */
+        ltx->ltx_tx.tx_isfwd = 0;
+        ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len;
+        ltx->ltx_tx.tx_niov = 1 + payload_niov;
+        ltx->ltx_tx.tx_iov = ltx->ltx_iov;
+
+        /* Init local send packet (storage for hdr, finalize() args, iov) */
+        ltx->ltx_hdr = *hdr;
+        ltx->ltx_private = private;
+        ltx->ltx_cookie = cookie;
+
+        ltx->ltx_iov[0].iov_base = &ltx->ltx_hdr;
+        ltx->ltx_iov[0].iov_len = sizeof (ltx->ltx_hdr);
+
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        for (i = 0; i < payload_niov; i++)
+        {
+                ltx->ltx_iov[1 + i].iov_base = payload_iov[i].iov_base;
+                ltx->ltx_iov[1 + i].iov_len  = payload_iov[i].iov_len;
+        }
+
+        ktoenal_launch_packet (conn, &ltx->ltx_tx);
+        return (0);
+}
+
+void
+ktoenal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        ksock_conn_t *conn;
+        ptl_nid_t     nid = fwd->kprfd_gateway_nid;
+        ksock_tx_t   *tx  = (ksock_tx_t *)&fwd->kprfd_scratch;
+
+        CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd, 
+                fwd->kprfd_gateway_nid, fwd->kprfd_target_nid);
+
+        if (nid == ktoenal_lib.ni.nid)         /* I'm the gateway; must be the last hop */
+                nid = fwd->kprfd_target_nid;
+        
+        conn = ktoenal_get_conn (nid);
+        if (conn == NULL)
+        {
+                CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid);
+                kpr_fwd_done (&ktoenal_data.ksnd_router, fwd, -EHOSTUNREACH);
+                return;
+        }
+
+        /* This forward has now got a ref on conn */
+
+        tx->tx_isfwd = 1;                       /* This is a forwarding packet */
+        tx->tx_nob   = fwd->kprfd_nob;
+        tx->tx_niov  = fwd->kprfd_niov;
+        tx->tx_iov   = fwd->kprfd_iov;
+
+        ktoenal_launch_packet (conn, tx);
+}
+
+int
+ktoenal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long    pid = kernel_thread (fn, arg, 0);
+
+        if (pid < 0)
+                return ((int)pid);
+
+        atomic_inc (&ktoenal_data.ksnd_nthreads);
+        return (0);
+}
+
+void
+ktoenal_thread_fini (void)
+{
+        atomic_dec (&ktoenal_data.ksnd_nthreads);
+}
+
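+/* Router completion callback for a forwarded packet: return the forwarding
+ * buffer to its pool and reschedule any conn blocked waiting for one. */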
+void
+ktoenal_fmb_callback (void *arg, int error)
+{
+        ksock_fmb_t       *fmb = (ksock_fmb_t *)arg;
+        ptl_hdr_t         *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]);
+        ksock_conn_t      *conn;
+        long               flags;
+
+        CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": %d\n", 
+                hdr->src_nid, hdr->dest_nid, error);
+
+        if (error != 0)
+                CERROR ("Failed to route packet from "LPX64" to "LPX64": %d\n", 
+                        hdr->src_nid, hdr->dest_nid, error);
+
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+        
+        list_add (&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs);
+
+        if (!list_empty (&fmb->fmb_pool->fmp_blocked_conns))
+        {
+                conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next, ksock_conn_t, ksnc_rx_list);
+                list_del (&conn->ksnc_rx_list);
+
+                CDEBUG (D_NET, "Scheduling conn %p\n", conn);
+                LASSERT (conn->ksnc_rx_scheduled);
+                LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP);
+
+                conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;
+                list_add_tail (&conn->ksnc_rx_list, &ktoenal_data.ksnd_rx_conns);
+
+                if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq))
+                        wake_up (&ktoenal_data.ksnd_sched_waitq);
+        }
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+}
+
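+/* Pick an idle forwarding buffer large enough for the packet being received,
+ * or park the conn on the pool's blocked list (returning NULL) until one is
+ * freed. */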
+ksock_fmb_t *
+ktoenal_get_idle_fmb (ksock_conn_t *conn)
+{
+        /* NB called with sched lock held */
+        int               payload_nob = conn->ksnc_rx_nob_left;
+        int               packet_nob = sizeof (ptl_hdr_t) + payload_nob;
+        ksock_fmb_pool_t *pool;
+        ksock_fmb_t      *fmb;
+        
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
+
+        if (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE)
+                pool = &ktoenal_data.ksnd_small_fmp;
+        else
+                pool = &ktoenal_data.ksnd_large_fmp;
+        
+        if (!list_empty (&pool->fmp_idle_fmbs))
+        {
+                fmb = list_entry (pool->fmp_idle_fmbs.next, ksock_fmb_t, fmb_list);
+                list_del (&fmb->fmb_list);
+                return (fmb);
+        }
+
+        /* deschedule until fmb free */
+
+        conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP;
+
+        list_add_tail (&conn->ksnc_rx_list,
+                       &pool->fmp_blocked_conns);
+        return (NULL);
+}
+
+
+int
+ktoenal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb)
+{
+        int payload_nob = conn->ksnc_rx_nob_left;
+        int packet_nob = sizeof (ptl_hdr_t) + payload_nob;
+        int niov;                               /* at least the header */
+        int nob;
+        
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
+        LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left);
+        LASSERT (payload_nob >= 0);
+        LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE);
+        LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE);
+        
+        /* Got a forwarding buffer; copy the header we just read into the
+         * forwarding buffer.  If there's payload, start reading it
+         * into the buffer, otherwise the forwarding buffer can be kicked
+         * off immediately.
+         *
+         * NB fmb->fmb_iov spans the WHOLE packet.
+         *    conn->ksnc_rx_iov spans just the payload.
+         */
+
+        fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]);
+                
+        memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t)); /* copy header */
+
+        if (payload_nob == 0)                   /* got complete packet already */
+        {
+                atomic_inc (&ktoenal_packets_received);
+
+                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n", conn,
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, packet_nob);
+
+                fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t);
+
+                kpr_fwd_init (&fmb->fmb_fwd, conn->ksnc_hdr.dest_nid, 
+                              packet_nob, 1, fmb->fmb_iov, 
+                              ktoenal_fmb_callback, fmb);
+
+                kpr_fwd_start (&ktoenal_data.ksnd_router, &fmb->fmb_fwd); /* forward it now */
+
+                ktoenal_new_packet (conn, 0);  /* on to next packet */
+                return (1);
+        }
+
+        niov = 1;
+        if (packet_nob <= PAGE_SIZE)            /* whole packet fits in first page */
+                fmb->fmb_iov[0].iov_len = packet_nob;
+        else
+        {
+                fmb->fmb_iov[0].iov_len = PAGE_SIZE;
+                nob = packet_nob - PAGE_SIZE;
+                
+                do
+                {
+                        LASSERT (niov < fmb->fmb_npages);
+                        fmb->fmb_iov[niov].iov_base = page_address (fmb->fmb_pages[niov]);
+                        fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob);
+                        nob -= PAGE_SIZE;
+                        niov++;
+                } while (nob > 0);
+        }
+
+        kpr_fwd_init (&fmb->fmb_fwd, conn->ksnc_hdr.dest_nid, 
+                      packet_nob, niov, fmb->fmb_iov, 
+                      ktoenal_fmb_callback, fmb);
+
+        /* stash router's descriptor ready for call to kpr_fwd_start */        
+        conn->ksnc_cookie = &fmb->fmb_fwd;
+
+        conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */
+
+        /* payload is desc's iov-ed buffer, but skipping the hdr */
+        LASSERT (niov <= sizeof (conn->ksnc_rx_iov) / sizeof (conn->ksnc_rx_iov[0]));
+
+        conn->ksnc_rx_iov[0].iov_base = (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) + sizeof (ptl_hdr_t));
+        conn->ksnc_rx_iov[0].iov_len = fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t);
+
+        if (niov > 1)
+                memcpy (&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1], (niov - 1) * sizeof (struct iovec));
+
+        conn->ksnc_rx_niov = niov;
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn,
+                conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, payload_nob);
+        return (0);
+}
+
+void
+ktoenal_fwd_parse (ksock_conn_t *conn)
+{
+        ksock_conn_t *conn2;
+        int           body_len;
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn,
+                conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, conn->ksnc_rx_nob_left);
+
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER);
+        LASSERT (conn->ksnc_rx_scheduled);
+
+        switch (conn->ksnc_hdr.type)
+        {
+        case PTL_MSG_GET:
+        case PTL_MSG_ACK:
+                body_len = 0;
+                break;
+        case PTL_MSG_PUT:
+                body_len = conn->ksnc_hdr.msg.put.length;
+                break;
+        case PTL_MSG_REPLY:
+                body_len = conn->ksnc_hdr.msg.reply.length;
+                break;
+        default:
+                /* Unrecognised packet type */
+                CERROR ("Unrecognised packet type %d from "LPX64" for "LPX64"\n",
+                        conn->ksnc_hdr.type, conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid);
+                /* Ignore this header and go back to reading a new packet. */
+                ktoenal_new_packet (conn, 0);
+                return;
+        }
+
+        if (body_len < 0)                               /* length corrupt */
+        {
+                CERROR ("dropping packet from "LPX64" for "LPX64": packet size %d illegal\n",
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, body_len);
+                ktoenal_new_packet (conn, 0);          /* on to new packet */
+                return;
+        }
+
+        if (body_len > SOCKNAL_MAX_FWD_PAYLOAD)         /* too big to forward */
+        {
+                CERROR ("dropping packet from "LPX64" for "LPX64": packet size %d too big\n",
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, body_len);
+                ktoenal_new_packet (conn, body_len);    /* on to new packet (skip this one's body) */
+                return;
+        }
+
+        conn2 = ktoenal_get_conn (conn->ksnc_hdr.dest_nid); /* should have gone direct */
+        if (conn2 != NULL)
+        {
+                CERROR ("dropping packet from "LPX64" for "LPX64": target is a peer\n",
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid);
+                ktoenal_put_conn (conn2);          /* drop ref from get above */
+
+                ktoenal_new_packet (conn, body_len);  /* on to next packet (skip this one's body) */
+                return;
+        }
+
+        conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;       /* Getting FMB now */
+        conn->ksnc_rx_nob_left = body_len;              /* stash packet size */
+        conn->ksnc_rx_nob_wanted = body_len;            /* (no slop) */
+}
+
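+/* Reset conn to read the next packet header, or set up "slop" iovs to skip
+ * nob_to_skip bytes of unwanted body.  Returns 1 when positioned at a packet
+ * boundary, 0 while there is still slop to discard. */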
+int
+ktoenal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+{
+        static char ktoenal_slop_buffer[4096];
+
+        int   nob;
+        int   niov;
+        int   skipped;
+
+        if (nob_to_skip == 0)                   /* right at next packet boundary now */
+        {
+                conn->ksnc_rx_state = SOCKNAL_RX_HEADER;
+                conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t);
+
+                conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr;
+                conn->ksnc_rx_iov[0].iov_len  = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_niov = 1;
+                return (1);
+        }
+
+        /* Set up to skip as much as possible now; if there's more left
+         * (we ran out of iov entries) we'll get called again. */
+
+        conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+        conn->ksnc_rx_nob_left = nob_to_skip;
+        skipped = 0;
+        niov = 0;
+
+        do
+        {
+                nob = MIN (nob_to_skip, sizeof (ktoenal_slop_buffer));
+
+                conn->ksnc_rx_iov[niov].iov_base = ktoenal_slop_buffer;
+                conn->ksnc_rx_iov[niov].iov_len  = nob;
+                niov++;
+                skipped += nob;
+                nob_to_skip -= nob;
+
+        } while (nob_to_skip != 0 &&            /* mustn't overflow conn's rx iov */
+                 niov < sizeof (conn->ksnc_rx_iov)/sizeof (conn->ksnc_rx_iov[0]));
+
+        conn->ksnc_rx_niov = niov;
+        conn->ksnc_rx_nob_wanted = skipped;
+        return (0);
+}
+
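+/* Advance the receive state machine for conn.  Called with ksnd_sched_lock
+ * held; the lock is dropped around socket reads and retaken before return. */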
+void
+ktoenal_process_receive (ksock_conn_t *conn, long *irq_flags)
+{
+        ksock_fmb_t *fmb;
+        int          len;
+        LASSERT (atomic_read (&conn->ksnc_refcount) > 0);
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_ready);
+
+        /* NB: sched lock held */
+        CDEBUG(D_NET, "conn %p\n", conn);
+
+        if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB)     /* doesn't need a forwarding buffer */
+        {
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                goto try_read;
+        }
+
+ get_fmb:
+        /* NB: sched lock held */
+        fmb = ktoenal_get_idle_fmb (conn);
+        if (fmb == NULL)                        /* conn descheduled waiting for idle fmb */
+                return;
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+        
+        if (ktoenal_init_fmb (conn, fmb)) /* packet forwarded ? */
+                goto out;               /* come back later for next packet */
+
+ try_read:
+        /* NB: sched lock NOT held */
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+
+        LASSERT (conn->ksnc_rx_niov > 0);
+        LASSERT (conn->ksnc_rx_nob_wanted > 0);
+
+        conn->ksnc_rx_ready = 0;                /* data ready may race with me and set ready */
+        mb();                                   /* => clear BEFORE trying to read */
+
+        /* NB ktoenal_recvmsg "consumes" the iov passed to it */
+        len = ktoenal_recvmsg(conn->ksnc_file,
+                               conn->ksnc_rx_iov, conn->ksnc_rx_niov,
+                               conn->ksnc_rx_nob_wanted);
+        CDEBUG (D_NET, "%p read(%d) %d\n", conn, conn->ksnc_rx_nob_wanted, len);
+
+        if (len <= 0)                           /* nothing ready (EAGAIN) or EOF or error */
+        {
+                if (len != -EAGAIN &&           /* ! nothing to read now */
+                    len != 0)                   /* ! nothing to read ever */
+                {
+#warning FIXME: handle socket errors properly
+                        CERROR ("Error socknal read(%d) %p: %d\n",
+                                conn->ksnc_rx_nob_wanted, conn, len);
+                }
+                goto out;                       /* come back when there's data ready */
+        }
+
+        LASSERT (len <= conn->ksnc_rx_nob_wanted);
+        conn->ksnc_rx_nob_wanted -= len;
+        conn->ksnc_rx_nob_left -= len;
+
+        if (conn->ksnc_rx_nob_wanted != 0)      /* short read */
+                goto out;                       /* try again later */
+
+        conn->ksnc_rx_ready = 1;                /* assume there's more to be had */
+
+        switch (conn->ksnc_rx_state)
+        {
+        case SOCKNAL_RX_HEADER:
+                if (conn->ksnc_hdr.dest_nid != ktoenal_lib.ni.nid) /* It's not for me */
+                {
+                        ktoenal_fwd_parse (conn);
+                        switch (conn->ksnc_rx_state)
+                        {
+                        case SOCKNAL_RX_HEADER: /* skipped this packet (zero payload) */
+                                goto out;       /* => come back later */
+                        case SOCKNAL_RX_SLOP:   /* skipping this packet's body */
+                                goto try_read;  /* => go read it */
+                        case SOCKNAL_RX_GET_FMB: /* forwarding */
+                                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                                goto get_fmb;   /* => go get a fwd msg buffer */
+                        default:
+                                break;
+                        }
+                        /* Not Reached */
+                        LBUG ();
+                }
+
+                PROF_START(lib_parse);
+                lib_parse(&ktoenal_lib, &conn->ksnc_hdr, conn); /* sets wanted_len, iovs etc */
+                PROF_FINISH(lib_parse);
+
+                if (conn->ksnc_rx_nob_wanted != 0) /* need to get some payload? */
+                {
+                        conn->ksnc_rx_state = SOCKNAL_RX_BODY;
+                        goto try_read;          /* go read the payload */
+                }
+                /* Fall through (completed packet for me) */
+
+        case SOCKNAL_RX_BODY:
+                atomic_inc (&ktoenal_packets_received);
+                lib_finalize(&ktoenal_lib, NULL, conn->ksnc_cookie); /* packet is done now */
+                /* Fall through */
+
+        case SOCKNAL_RX_SLOP:
+                if (ktoenal_new_packet (conn, conn->ksnc_rx_nob_left)) /* starting new packet? */
+                        goto out;               /* come back later */
+                goto try_read;                  /* try to finish reading slop now */
+
+        case SOCKNAL_RX_BODY_FWD:
+                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", conn,
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, conn->ksnc_rx_nob_left);
+
+                atomic_inc (&ktoenal_packets_received);
+
+                /* ktoenal_init_fmb() stashed router descriptor in conn->ksnc_cookie */
+                kpr_fwd_start (&ktoenal_data.ksnd_router, (kpr_fwd_desc_t *)conn->ksnc_cookie);
+
+                LASSERT (conn->ksnc_rx_nob_left == 0); /* no slop in forwarded packets */
+
+                ktoenal_new_packet (conn, 0);  /* on to next packet */
+                goto out;                       /* (later) */
+
+        default:
+                break;
+        }
+
+        /* Not Reached */
+        LBUG ();
+
+ out:
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+
+        if (!conn->ksnc_rx_ready)               /* no data there to read? */
+        {
+                conn->ksnc_rx_scheduled = 0;    /* let socket callback schedule again */
+                ktoenal_put_conn (conn);       /* release scheduler's ref */
+        }
+        else                                    /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_rx_list, &ktoenal_data.ksnd_rx_conns);
+}
+
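+/* lib_parse() recv callback: record where the payload should land; the data
+ * is actually read later in ktoenal_process_receive(). */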
+int
+ktoenal_recv(nal_cb_t *nal, void *private, lib_msg_t *msg,
+             unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen)
+{
+        ksock_conn_t *conn = (ksock_conn_t *)private;
+        int           i;
+
+        conn->ksnc_cookie = msg;
+
+        LASSERT (niov <= PTL_MD_MAX_IOV);
+        for (i = 0; i < niov; i++)
+        {
+                conn->ksnc_rx_iov[i].iov_len = iov[i].iov_len;
+                conn->ksnc_rx_iov[i].iov_base = iov[i].iov_base;
+        }
+
+        conn->ksnc_rx_niov       = niov;
+        conn->ksnc_rx_nob_wanted = mlen;
+        conn->ksnc_rx_nob_left   = rlen;
+
+        return (rlen);
+}
+
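+/* Scheduler thread: alternates between conns with data to read and conns with
+ * packets to send, yielding every SOCKNAL_RESCHED iterations to avoid hogging
+ * the CPU. */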
+int
+ktoenal_scheduler (void *arg)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        int                rc;
+        int                nloops = 0;
+
+        kportal_daemonize ("ktoenal_sched");
+        kportal_blockallsigs ();
+        
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+
+        while (!ktoenal_data.ksnd_shuttingdown)
+        {
+                int did_something = 0;
+
+                /* Ensure I progress everything semi-fairly */
+
+                if (!list_empty (&ktoenal_data.ksnd_rx_conns))
+                {
+                        did_something = 1;
+                        conn = list_entry (ktoenal_data.ksnd_rx_conns.next,
+                                           ksock_conn_t, ksnc_rx_list);
+                        list_del (&conn->ksnc_rx_list);
+
+                        ktoenal_process_receive (conn, &flags); /* drops & regains ksnd_sched_lock */
+                }
+
+                if (!list_empty (&ktoenal_data.ksnd_tx_conns))
+                {
+                        did_something = 1;
+                        conn = list_entry (ktoenal_data.ksnd_tx_conns.next,
+                                           ksock_conn_t, ksnc_tx_list);
+
+                        list_del (&conn->ksnc_tx_list);
+                        ktoenal_process_transmit (conn, &flags); /* drops and regains ksnd_sched_lock */
+                }
+
+                if (!did_something ||           /* nothing to do */
+                    ++nloops == SOCKNAL_RESCHED) /* hogging CPU? */
+                {
+                        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+
+                        nloops = 0;
+
+                        if (!did_something) {   /* wait for something to do */
+                                rc = wait_event_interruptible (ktoenal_data.ksnd_sched_waitq,
+                                                               ktoenal_data.ksnd_shuttingdown ||
+                                                               !list_empty (&ktoenal_data.ksnd_rx_conns) ||
+                                                               !list_empty (&ktoenal_data.ksnd_tx_conns));
+                                LASSERT (rc == 0);
+                        } else 
+                                our_cond_resched();
+
+                        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+                }
+        }
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+        ktoenal_thread_fini ();
+        return (0);
+}
+
+
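+/* Reaper thread: closes connections queued on ksnd_reaper_list, outside the
+ * context that queued them. */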
+int
+ktoenal_reaper (void *arg)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        int                rc;
+        
+        kportal_daemonize ("ktoenal_reaper");
+        kportal_blockallsigs ();
+
+        while (!ktoenal_data.ksnd_shuttingdown)
+        {
+                spin_lock_irqsave (&ktoenal_data.ksnd_reaper_lock, flags);
+
+                if (list_empty (&ktoenal_data.ksnd_reaper_list))
+                        conn = NULL;
+                else
+                {
+                        conn = list_entry (ktoenal_data.ksnd_reaper_list.next,
+                                           ksock_conn_t, ksnc_list);
+                        list_del (&conn->ksnc_list);
+                }
+
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_reaper_lock, flags);
+
+                if (conn != NULL)
+                        ktoenal_close_conn (conn);
+                else {
+                        rc = wait_event_interruptible (ktoenal_data.ksnd_reaper_waitq,
+                                                       ktoenal_data.ksnd_shuttingdown ||
+                                                       !list_empty(&ktoenal_data.ksnd_reaper_list));
+                        LASSERT (rc == 0);
+                }
+        }
+
+        ktoenal_thread_fini ();
+        return (0);
+}
+
+#define POLLREAD        (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)
+#define POLLWRITE       (POLLOUT | POLLWRNORM | POLLWRBAND)
+
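+/* Poll thread: polls every socket on ksnd_socklist and feeds readable/writable
+ * connections to ktoenal_data_ready()/ktoenal_write_space(); ksnd_slistchange
+ * forces the poll wait tables to be rebuilt when the socket list changes. */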
+int
+ktoenal_pollthread(void *arg)
+{
+        unsigned int mask;
+        struct list_head *tmp;
+        ksock_conn_t *conn;
+        
+        /* Save the task struct for waking it up */
+        ktoenal_data.ksnd_pollthread_tsk = current; 
+        
+        kportal_daemonize ("ktoenal_pollthread");
+        kportal_blockallsigs ();
+        
+        poll_initwait(&ktoenal_data.ksnd_pwait);
+        
+        while(!ktoenal_data.ksnd_shuttingdown) {
+                
+                set_current_state(TASK_INTERRUPTIBLE);
+                
+                read_lock (&ktoenal_data.ksnd_socklist_lock);
+                list_for_each(tmp, &ktoenal_data.ksnd_socklist) {
+                        
+                        conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+                        atomic_inc(&conn->ksnc_refcount);
+                        read_unlock (&ktoenal_data.ksnd_socklist_lock);
+                        
+                        mask = conn->ksnc_file->f_op->poll(conn->ksnc_file,
+                                  ktoenal_data.ksnd_slistchange ? 
+                                  &ktoenal_data.ksnd_pwait : NULL);
+                         
+                        if(mask & POLLREAD) {
+                                ktoenal_data_ready(conn);
+                                                        
+                        } 
+                        if (mask & POLLWRITE) {
+                                ktoenal_write_space(conn);  
+                              
+                        }
+                        if (mask & (POLLERR | POLLHUP)) {
+                                         /* Do error processing */          
+                        }      
+                        
+                        read_lock (&ktoenal_data.ksnd_socklist_lock);
+                        if(atomic_dec_and_test(&conn->ksnc_refcount))
+                                _ktoenal_put_conn(conn);
+                }
+                ktoenal_data.ksnd_slistchange = 0;
+                read_unlock (&ktoenal_data.ksnd_socklist_lock);
+                
+                schedule_timeout(MAX_SCHEDULE_TIMEOUT);
+                if(ktoenal_data.ksnd_slistchange) {
+                        poll_freewait(&ktoenal_data.ksnd_pwait); 
+                        poll_initwait(&ktoenal_data.ksnd_pwait);
+                }
+         }
+        poll_freewait(&ktoenal_data.ksnd_pwait);
+        ktoenal_thread_fini();
+        return (0);
+}
+
+void
+ktoenal_data_ready (ksock_conn_t *conn)
+{
+        unsigned long  flags;
+        ENTRY;
+
+        if (!test_and_set_bit (0, &conn->ksnc_rx_ready)) { 
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+
+                if (!conn->ksnc_rx_scheduled) {  /* not being progressed */
+                        list_add_tail (&conn->ksnc_rx_list, 
+                                        &ktoenal_data.ksnd_rx_conns);
+                        conn->ksnc_rx_scheduled = 1;
+                        /* extra ref for scheduler */
+                        atomic_inc (&conn->ksnc_refcount);
+
+                        /* This is done to avoid the effects of a sequence
+                         * of events in which the rx_ready is lost
+                         */
+                        conn->ksnc_rx_ready = 1;
+                          
+                        if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq))
+                                wake_up (&ktoenal_data.ksnd_sched_waitq);
+                }
+
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+        }
+
+        EXIT;
+}
+
+void
+ktoenal_write_space (ksock_conn_t *conn)
+{
+        unsigned long  flags;
+
+        CDEBUG (D_NET, "conn %p%s%s%s\n",
+                         conn,
+                        (conn == NULL) ? "" : (test_bit (0, &conn->ksnc_tx_ready) ? " ready" : " blocked"),
+                        (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? " scheduled" : " idle"),
+                        (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ? " empty" : " queued"));
+
+
+        if (!test_and_set_bit (0, &conn->ksnc_tx_ready)) {
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+
+                if (!list_empty (&conn->ksnc_tx_queue) && /* packets to send */
+                                !conn->ksnc_tx_scheduled) { /* not being progressed */
+
+                        list_add_tail (&conn->ksnc_tx_list, 
+                                        &ktoenal_data.ksnd_tx_conns);
+                        conn->ksnc_tx_scheduled = 1;
+                        /* extra ref for scheduler */
+                        atomic_inc (&conn->ksnc_refcount);
+
+                        if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq))
+                                wake_up (&ktoenal_data.ksnd_sched_waitq);
+                }
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+        }
+}
+
+nal_cb_t ktoenal_lib = {
+        nal_data:       &ktoenal_data,                /* NAL private data */
+        cb_send:         ktoenal_send,
+        cb_recv:         ktoenal_recv,
+        cb_read:         ktoenal_read,
+        cb_write:        ktoenal_write,
+        cb_callback:     ktoenal_callback,
+        cb_malloc:       ktoenal_malloc,
+        cb_free:         ktoenal_free,
+        cb_printf:       ktoenal_printf,
+        cb_cli:          ktoenal_cli,
+        cb_sti:          ktoenal_sti,
+        cb_dist:         ktoenal_dist
+};
diff --git a/lnet/libcfs/.cvsignore b/lnet/libcfs/.cvsignore
new file mode 100644 (file)
index 0000000..67d1a3d
--- /dev/null
@@ -0,0 +1,4 @@
+.deps
+Makefile
+Makefile.in
+link-stamp
diff --git a/lnet/libcfs/Makefile.am b/lnet/libcfs/Makefile.am
new file mode 100644 (file)
index 0000000..20d7fbd
--- /dev/null
@@ -0,0 +1,29 @@
+# Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+
+MODULE = portals
+modulenet_DATA = portals.o
+EXTRA_PROGRAMS = portals
+
+LIBLINKS := lib-dispatch.c lib-eq.c lib-init.c lib-md.c lib-me.c lib-move.c lib-msg.c lib-ni.c lib-pid.c
+APILINKS := api-eq.c api-errno.c api-init.c api-me.c api-ni.c api-wrap.c
+LINKS = $(APILINKS) $(LIBLINKS) 
+DISTCLEANFILES = $(LINKS) link-stamp *.orig *.rej
+
+$(LINKS): link-stamp
+link-stamp:
+       -list='$(LIBLINKS)'; for f in $$list; do echo $$f ; ln -sf $(srcdir)/../portals/$$f .; done
+       -list='$(APILINKS)'; for f in $$list; do echo $$f ; ln -sf $(srcdir)/../portals/$$f .; done
+       echo timestamp > link-stamp
+
+DEFS =
+portals_SOURCES = $(LINKS) module.c proc.c debug.c
+
+# Don't distribute any patched files.
+dist-hook:
+       list='$(EXT2C)'; for f in $$list; do rm -f $(distdir)/$$f; done
+
+include ../Rules.linux
diff --git a/lnet/libcfs/Makefile.mk b/lnet/libcfs/Makefile.mk
new file mode 100644 (file)
index 0000000..3196ea2
--- /dev/null
@@ -0,0 +1,9 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include fs/lustre/portals/Kernelenv
+
+obj-y += libcfs.o
+libcfs-objs    := module.o proc.o debug.o
\ No newline at end of file
diff --git a/lnet/libcfs/debug.c b/lnet/libcfs/debug.c
new file mode 100644 (file)
index 0000000..8d26dbb
--- /dev/null
@@ -0,0 +1,830 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <linux/interrupt.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/completion.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+#include <linux/miscdevice.h>
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <linux/kp30.h>
+
+#define DEBUG_OVERFLOW 1024
+static char *debug_buf = NULL;
+static unsigned long debug_size = 0;
+static atomic_t debug_off_a = ATOMIC_INIT(0);
+static int debug_wrapped;
+wait_queue_head_t debug_ctlwq;
+#define DAEMON_SND_SIZE      (64 << 10)
+
+/*
+ * Used by the daemon to keep track of the offset into debug_buf for the next
+ * write to the file.  Usually, the daemon writes out the buffer
+ * from debug_daemon_next_write up to debug_off.
+ *  variable usage
+ *      Reader - portals_debug_msg()
+ *      Writer - portals_debug_daemon()
+ *               portals_debug_daemon_start() during daemon init time
+ *               portals_debug_daemon_continue() to reset to debug_off
+ *               portals_debug_clear_buffer() reset to debug_off for clear
+ *      Note that *_start(), *_continue() & *_clear_buffer() should be serialized.
+ */
+static atomic_t   debug_daemon_next_write;
+
+/*
+ * A debug_daemon can be in following states
+ *      stopped - the stopped state means there is no debug_daemon running;
+ *                accordingly, it must also be in the paused state.
+ *                A daemon is in the !stopped && !paused state after
+ *                "lctl debug_daemon start" creates it successfully.
+ *                Variable Usage
+ *                Variable Usage
+ *                      Reader - portals_debug_daemon()
+ *                               portals_debug_set_daemon() routines
+ *                      Writer - portals_debug_set_daemon() routines
+ *                              portals_debug_daemon() on IO error
+ *      paused -  a debug_daemon changes from !paused to paused
+ *                when "lctl debug_daemon paused" is issued;
+ *                "lctl debug_daemon continue" gets the daemon back into !paused mode
+ *                      Reader - portals_debug_set_daemon() routines
+ *                               portals_debug_msg()
+ *                      Writer - portals_debug_set_daemon() on init
+ *                               portals_debug_daemon()
+ *
+ *        Daemon  state diagram.
+ *                      (stopped, paused)
+ *                              |  <-- debug_daemon start
+ *                              V
+ *                      (!stopped, !paused)
+ *                              |  <-- debug_daemon pause
+ *                              V
+ *                      (!stopped, paused)
+ *                              |  <-- debug_daemon continue
+ *                              V
+ *                      (!stopped, !paused)
+ *                              |  <-- debug_daemon stop
+ *                              V
+ *                      (stopped, paused)
+ *      Overlapped - this is the state when CDEBUG is too fast for the daemon to
+ *                   write out the debug_buffer; that is, debug_off has
+ *                   overlapped debug_daemon_next_write.
+ *                     Reader - portals_debug_msg()
+ *                     Writer - portals_debug_msg()
+ */
+
+/*
+ * Description on Trace Daemon Synchronization
+ *
+ * Three categories of code are synchronizing between each other
+ * 1.   lctl, portals_debug_set_daemon(), the user debug control code, 
+ *      as well as portals_debug_clear_buffer()
+ * 2.   CDEBUG, portals_debug_msg(), the debug put messages routine
+ * 3.   Daemon, portals_debug_daemon(), to write out debug log file
+ *
+ *
+ * Three different controls for synchronizations
+ *
+ * 1.   debug_daemon_semaphore
+ *      This semaphore serializes multiple lctl controls manipulating the
+ *      debug daemon state.  It acts as a gatekeeper, allowing only one user
+ *      control thread at any given time to access the daemon state and
+ *      keeping other control requests waiting until the current request
+ *      has been serviced.
+ *
+ * 2.   wait_queue_head_t lctl (paired with lctl_event flag)
+ *      The lctl event links portals_debug_set_daemon() and
+ *      portals_debug_daemon().  It tells portals_debug_daemon() to flush
+ *      data out to the file, and portals_debug_daemon() uses it as the
+ *      signal channel to wake up portals_debug_set_daemon() once the flush
+ *      operation is done.
+ *
+ *      Producer :
+ *              portals_debug_daemon() uses the event to wake up the
+ *              portals_debug_set_daemon() pause and stop routines
+ *      Consumer :
+ *              the portals_debug_set_daemon() stop and pause operations
+ *              wait and sleep on the event
+ *
+ * 3.   wait_queue_head_t daemon (paired with daemon_event flag)
+ *      This is an event channel to wake up portals_debug_daemon.  The daemon
+ *      wakes up to run whenever an event is posted.  It handles two types of
+ *      operations: 1. writing data out to the debug file; 2. flushing the
+ *      file and terminating, based on the lctl event.
+ *      File operation -
+ *              Daemon is normally in a sleep state.  
+ *              Daemon is woken up through daemon event whenever CDEBUG is 
+ *              putting data over any 64K boundary. 
+ *      File flush and termination -
+ *              On portals_debug_daemon_stop/pause() operations, the lctl
+ *              control wakes up the daemon through the daemon event.
+ *
+ *      We can't use sleep_on() and wake_up() to replace daemon event because 
+ *      portals_debug_daemon() must catch the wakeup operation posted by 
+ *      portals_debug_daemon_stop/pause().  Otherwise, stop and pause may get
+ *      stuck in the lctl wait event.
+ *
+ *      Producer :
+ *           a. portals_debug_daemon_pause() and portals_debug_daemon_stop() 
+ *              uses the event to wake up portals_debug_daemon()
+ *           b. portals_debug_msg() uses the event to wake up
+ *              portals_debug_daemon() whenever the data output crosses
+ *              a 64K byte boundary.
+ *      Consumer :
+ *              portals_debug_daemon() wakes up upon daemon event.
+ *
+ * Sequence for portals_debug_daemon_stop() operation
+ *
+ * _Portals_debug_daemon_stop()_          _Daemon_
+ *                                      Wait_event(daemon) or running
+ *      Paused = 1;
+ *      Wakeup_event (daemon)
+ *      Wait_event(lctl)
+ *                                      Set force_flush flag if lctlevnt
+ *                                      Flush data
+ *                                      Wakeup_event (lctl)
+ *                                      Wait_event(daemon)
+ *      Stopped = 1;
+ *      Wakeup_event (daemon)
+ *      Wait_event(lctl)
+ *                                      Exit daemon loop if (Stopped)
+ *                                      Wakeup_event (lctl)
+ *                                      Exit
+ *      Return to user application
+ *
+ *
+ * _Portals_debug_msg()_                  _Daemon_
+ *                                      Wait_event(daemon) or running
+ *      If (WriteStart<64K<WriteEnd)
+ *         Wakeup_event(daemon)
+ *                                      Do file IO
+ *                                      Wait_event(daemon)
+ */
+struct debug_daemon_state {
+        unsigned long overlapped;
+        unsigned long stopped;
+        atomic_t paused;
+        unsigned long   lctl_event;     /* event for lctl */
+        wait_queue_head_t lctl;
+        unsigned long   daemon_event;   /* event for daemon */
+        wait_queue_head_t daemon;
+};
+static struct debug_daemon_state debug_daemon_state;
+static DECLARE_MUTEX(debug_daemon_semaphore);
+
+static loff_t daemon_file_size_limit;
+char debug_daemon_file_path[1024] = "";
+
+spinlock_t portals_debug_lock = SPIN_LOCK_UNLOCKED;
+char debug_file_path[1024] = "/tmp/lustre-log";
+char debug_file_name[1024];
+int handled_panic; /* to avoid recursive calls to notifiers */
+char portals_upcall[1024] = "/usr/lib/lustre/portals_upcall";
+
+
+int portals_do_debug_dumplog(void *arg)
+{
+        struct file *file;
+        void *journal_info;
+        int rc;
+        mm_segment_t oldfs;
+        unsigned long debug_off;
+
+        kportal_daemonize("");
+
+        reparent_to_init();
+        journal_info = current->journal_info;
+        current->journal_info = NULL;
+        sprintf(debug_file_name, "%s.%ld", debug_file_path, CURRENT_TIME);
+        file = filp_open(debug_file_name, O_CREAT|O_TRUNC|O_RDWR, 0644);
+
+        if (!file || IS_ERR(file)) {
+                CERROR("cannot open %s for dumping: %ld\n", debug_file_name,
+                       PTR_ERR(file));
+                GOTO(out, PTR_ERR(file));
+        } else {
+                printk(KERN_ALERT "dumping log to %s ... writing ...\n",
+                       debug_file_name);
+        }
+
+        debug_off = atomic_read(&debug_off_a);
+        oldfs = get_fs();
+        set_fs(get_ds());
+        if (debug_wrapped) {
+                rc = file->f_op->write(file, debug_buf + debug_off + 1,
+                                       debug_size-debug_off-1, &file->f_pos);
+                rc += file->f_op->write(file, debug_buf, debug_off + 1,
+                                        &file->f_pos);
+        } else {
+                rc = file->f_op->write(file, debug_buf, debug_off,&file->f_pos);
+        }
+        printk("wrote %d bytes\n", rc);
+        set_fs(oldfs);
+
+        rc = file->f_op->fsync(file, file->f_dentry, 1);
+        if (rc)
+                CERROR("sync returns %d\n", rc);
+        filp_close(file, 0);
+out:
+        current->journal_info = journal_info;
+        wake_up(&debug_ctlwq);
+        return 0;
+}
+
+int portals_debug_daemon(void *arg)
+{
+        struct file *file;
+        void *journal_info;
+        mm_segment_t oldfs;
+        unsigned long force_flush = 0;
+        unsigned long size, off, flags;
+        int rc;
+
+        kportal_daemonize("ldebug_daemon");
+        reparent_to_init();
+        journal_info = current->journal_info;
+        current->journal_info = NULL;
+
+        file = filp_open(debug_daemon_file_path,
+                         O_CREAT|O_TRUNC|O_RDWR|O_LARGEFILE, 0644);
+
+        if (!file || IS_ERR(file)) {
+                CERROR("cannot open %s for logging", debug_daemon_file_path);
+                GOTO(out1, PTR_ERR(file));
+        } else {
+                printk(KERN_ALERT "daemon dumping log to %s ... writing ...\n",
+                       debug_daemon_file_path);
+        }
+
+        debug_daemon_state.overlapped = 0;
+        debug_daemon_state.stopped = 0;
+
+        spin_lock_irqsave(&portals_debug_lock, flags);
+        off = atomic_read(&debug_off_a) + 1;
+        if (debug_wrapped)
+                off = (off >= debug_size)? 0 : off;
+        else
+                off = 0;
+        atomic_set(&debug_daemon_next_write, off);
+        atomic_set(&debug_daemon_state.paused, 0);
+        spin_unlock_irqrestore(&portals_debug_lock, flags);
+
+        oldfs = get_fs();
+        set_fs(KERNEL_DS);
+        while (1) {
+                unsigned long ending;
+                unsigned long start, tail;
+                long delta;
+
+                debug_daemon_state.daemon_event = 0;
+
+                ending = atomic_read(&debug_off_a);
+                start = atomic_read(&debug_daemon_next_write);
+
+                /* lctl cleared lctl_event (pause/stop/start): flush now */
+                force_flush = !debug_daemon_state.lctl_event;
+
+                delta = ending - start;
+                tail = debug_size - start;
+                size = (delta >= 0) ? delta : tail;
+                while (size && (force_flush || (delta < 0) ||
+                                (size >= DAEMON_SND_SIZE))) {
+                        if (daemon_file_size_limit) {
+                                int ssize = daemon_file_size_limit -
+                                        file->f_pos;
+                                if (size > ssize)
+                                        size = ssize;
+                        }
+
+                        rc = file->f_op->write(file, debug_buf+start,
+                                               size, &file->f_pos);
+                        if (rc < 0) {
+                                printk(KERN_ALERT
+                                           "Debug_daemon write error %d\n", rc);
+                                goto out;
+                        }
+                        start += rc;
+                        delta = ending - start;
+                        tail = debug_size - start;
+                        if (tail == 0)
+                                start = 0;
+                        if (delta >= 0)
+                                size = delta;
+                        else
+                                size = (tail == 0) ? ending : tail;
+                        if (daemon_file_size_limit == file->f_pos) {
+                                // file wrapped around
+                                file->f_pos = 0;
+                        }
+                }
+                atomic_set(&debug_daemon_next_write, start);
+                if (force_flush) {
+                        rc = file->f_op->fsync(file, file->f_dentry, 1);
+                        if (rc < 0) {
+                                printk(KERN_ALERT
+                                       "Debug_daemon sync error %d\n", rc);
+                                goto out;
+                        }
+                        if (debug_daemon_state.stopped)
+                                break;
+                        debug_daemon_state.lctl_event = 1;
+                        wake_up(&debug_daemon_state.lctl);
+                }
+                wait_event(debug_daemon_state.daemon,
+                           debug_daemon_state.daemon_event);
+        }
+out:
+        atomic_set(&debug_daemon_state.paused, 1);
+        debug_daemon_state.stopped = 1;
+        set_fs(oldfs);
+        filp_close(file, 0);
+        current->journal_info = journal_info;
+out1:
+        debug_daemon_state.lctl_event = 1;
+        wake_up(&debug_daemon_state.lctl);
+        return 0;
+}
+
+void portals_debug_print(void)
+{
+        unsigned long dumplen = 64 * 1024;
+        char *start1, *start2;
+        char *end1, *end2;
+        unsigned long debug_off = atomic_read(&debug_off_a);
+
+        start1 = debug_buf + debug_off - dumplen;
+        if (start1 < debug_buf) {
+                start1 += debug_size;
+                end1 = debug_buf + debug_size - 1;
+                start2 = debug_buf;
+                end2 = debug_buf + debug_off;
+        } else {
+                end1 = debug_buf + debug_off;
+                start2 = debug_buf + debug_off;
+                end2 = debug_buf + debug_off;
+        }
+
+        while (start1 < end1) {
+                int count = MIN(1024, end1 - start1);
+                printk("%.*s", count, start1);
+                start1 += 1024;
+        }
+        while (start2 < end2) {
+                int count = MIN(1024, end2 - start2);
+                printk("%.*s", count, start2);
+                start2 += 1024;
+        }
+}
+
+void portals_debug_dumplog(void)
+{
+        int rc;
+        ENTRY;
+
+        init_waitqueue_head(&debug_ctlwq);
+
+        rc = kernel_thread(portals_do_debug_dumplog,
+                           NULL, CLONE_VM | CLONE_FS | CLONE_FILES);
+        if (rc < 0) {
+                printk(KERN_ERR "cannot start dump thread\n");
+                return;
+        }
+        sleep_on(&debug_ctlwq);
+}
+
+int portals_debug_daemon_start(char *file, unsigned int size)
+{
+        int rc;
+
+        if (!debug_daemon_state.stopped)
+                return -EALREADY;
+
+        if (file != NULL)
+                strncpy(debug_daemon_file_path, file, 1024);
+
+        init_waitqueue_head(&debug_daemon_state.lctl);
+        init_waitqueue_head(&debug_daemon_state.daemon);
+
+        daemon_file_size_limit = (loff_t)size << 20;
+
+        debug_daemon_state.lctl_event = 0;
+        rc = kernel_thread(portals_debug_daemon, NULL, 0);
+        if (rc < 0) {
+                printk(KERN_ERR "cannot start debug daemon thread\n");
+                debug_daemon_file_path[0] = '\0';
+                return rc;
+        }
+        wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event);
+        return 0;
+}
+
+int portals_debug_daemon_pause(void)
+{
+        if (atomic_read(&debug_daemon_state.paused))
+                return -EALREADY;
+
+        atomic_set(&debug_daemon_state.paused, 1);
+        debug_daemon_state.lctl_event = 0;
+        debug_daemon_state.daemon_event = 1;
+        wake_up(&debug_daemon_state.daemon);
+        wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event);
+        return 0;
+}
+
+int portals_debug_daemon_continue(void)
+{
+        if (!atomic_read(&debug_daemon_state.paused))
+                return -EINVAL;
+        if (debug_daemon_state.stopped)
+                return -EINVAL;
+
+        debug_daemon_state.overlapped = 0;
+        atomic_set(&debug_daemon_next_write, atomic_read(&debug_off_a));
+        atomic_set(&debug_daemon_state.paused, 0);
+        return 0;
+}
+
+int portals_debug_daemon_stop(void)
+{
+        if (debug_daemon_state.stopped)
+                return -EALREADY;
+
+        if (!atomic_read(&debug_daemon_state.paused))
+                portals_debug_daemon_pause();
+
+        debug_daemon_state.lctl_event = 0;
+        debug_daemon_state.stopped = 1;
+
+        debug_daemon_state.daemon_event = 1;
+        wake_up(&debug_daemon_state.daemon);
+        wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event);
+
+        debug_daemon_file_path[0] = '\0';
+        return 0;
+}
+
+int portals_debug_set_daemon(unsigned int cmd, unsigned int length,
+                             char *filename, unsigned int size)
+{
+        int rc = -EINVAL;
+
+        down(&debug_daemon_semaphore);
+        switch (cmd) {
+                case DEBUG_DAEMON_START:
+                        if (length && (filename[length -1] != '\0')) {
+                                CERROR("Invalid filename for debug_daemon\n");
+                                rc = -EINVAL;
+                                break;
+                        }
+                        rc = portals_debug_daemon_start(filename, size);
+                        break;
+                case DEBUG_DAEMON_STOP:
+                        rc = portals_debug_daemon_stop();
+                        break;
+                case DEBUG_DAEMON_PAUSE:
+                        rc = portals_debug_daemon_pause();
+                        break;
+                case DEBUG_DAEMON_CONTINUE:
+                        rc = portals_debug_daemon_continue();
+                        break;
+                default:
+                        CERROR("unknown set_daemon cmd\n");
+        }
+        up(&debug_daemon_semaphore);
+        return rc;
+}
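For reference, a minimal sketch of how a caller might drive this interface, using only the constants and prototype above. The function name, the log path and the 100 MB cap are illustrative; in this patch the real caller is the IOC_PORTAL_SET_DAEMON case of kportal_ioctl() in module.c further down.

/* Hypothetical in-kernel caller; error handling trimmed.  The size
 * argument is in megabytes (it is shifted left by 20 above). */
static int example_capture_debug(void)
{
        static char path[] = "/tmp/lustre-debug.bin";  /* illustrative path */
        int rc;

        /* start the daemon, capped at a 100 MB trace file */
        rc = portals_debug_set_daemon(DEBUG_DAEMON_START, sizeof(path),
                                      path, 100);
        if (rc != 0)
                return rc;

        /* ... run the workload to be traced ... */

        /* flush and stop; the filename and size are ignored for STOP */
        return portals_debug_set_daemon(DEBUG_DAEMON_STOP, 0, NULL, 0);
}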
+
+static int panic_dumplog(struct notifier_block *self, unsigned long unused1,
+                         void *unused2)
+{
+        if (handled_panic)
+                return 0;
+        else
+                handled_panic = 1;
+
+        if (in_interrupt()) {
+                portals_debug_print();
+                return 0;
+        }
+
+        while (current->lock_depth >= 0)
+                unlock_kernel();
+        portals_debug_dumplog();
+        return 0;
+}
+
+static struct notifier_block lustre_panic_notifier = {
+        notifier_call :     panic_dumplog,
+        next :              NULL,
+        priority :          10000
+};
+
+int portals_debug_init(unsigned long bufsize)
+{
+        unsigned long debug_off = atomic_read(&debug_off_a);
+        if (debug_buf != NULL)
+                return -EALREADY;
+
+        atomic_set(&debug_daemon_state.paused, 1);
+        debug_daemon_state.stopped = 1;
+
+        debug_buf = vmalloc(bufsize + DEBUG_OVERFLOW);
+        if (debug_buf == NULL)
+                return -ENOMEM;
+        memset(debug_buf, 0, bufsize + DEBUG_OVERFLOW);
+        debug_wrapped = 0;
+
+        printk(KERN_INFO "Portals: allocated %lu byte debug buffer at %p.\n",
+               bufsize, debug_buf);
+        atomic_set(&debug_off_a, debug_off);
+        notifier_chain_register(&panic_notifier_list, &lustre_panic_notifier);
+        debug_size = bufsize;
+
+        return 0;
+}
+
+int portals_debug_cleanup(void)
+{
+        notifier_chain_unregister(&panic_notifier_list, &lustre_panic_notifier);
+        if (debug_buf == NULL)
+                return -EINVAL;
+
+        down(&debug_daemon_semaphore);
+        portals_debug_daemon_stop();
+
+        vfree(debug_buf);
+        debug_buf = NULL;
+        atomic_set(&debug_off_a, 0);
+        up(&debug_daemon_semaphore);
+
+        return 0;
+}
+
+int portals_debug_clear_buffer(void)
+{
+        unsigned long flags;
+        unsigned long state;
+
+        if (debug_buf == NULL)
+                return -EINVAL;
+
+        down(&debug_daemon_semaphore);
+        state = atomic_read(&debug_daemon_state.paused);
+        if (!state)
+                portals_debug_daemon_pause();
+        spin_lock_irqsave(&portals_debug_lock, flags);
+        atomic_set(&debug_off_a, 0);
+        debug_wrapped = 0;
+        atomic_set(&debug_daemon_next_write, 0);
+        debug_daemon_state.overlapped = 0;
+        spin_unlock_irqrestore(&portals_debug_lock, flags);
+
+        if (!state)
+                atomic_set(&debug_daemon_state.paused, 0);
+        up(&debug_daemon_semaphore);
+
+        return 0;
+}
+
+/* Debug markers, although printed by S_PORTALS,
+ * should not be marked as such.
+ */
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_UNDEFINED
+int portals_debug_mark_buffer(char *text)
+{
+        if (debug_buf == NULL)
+                return -EINVAL;
+
+        CDEBUG(0, "*******************************************************************************\n");
+        CDEBUG(0, "DEBUG MARKER: %s\n", text);
+        CDEBUG(0, "*******************************************************************************\n");
+
+        return 0;
+}
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_PORTALS
+
+__s32 portals_debug_copy_to_user(char *buf, unsigned long len)
+{
+        int rc;
+        unsigned long debug_off;
+        unsigned long flags;
+
+        if (len < debug_size)
+                return -ENOSPC;
+
+        debug_off = atomic_read(&debug_off_a);
+        spin_lock_irqsave(&portals_debug_lock, flags);
+        if (debug_wrapped) {
+                /* All of this juggling with the 1s is to keep the trailing nul
+                 * (which falls at debug_buf + debug_off) at the end of what we
+                 * copy into user space */
+                copy_to_user(buf, debug_buf + debug_off + 1,
+                             debug_size - debug_off - 1);
+                copy_to_user(buf + debug_size - debug_off - 1,
+                             debug_buf, debug_off + 1);
+                rc = debug_size;
+        } else {
+                copy_to_user(buf, debug_buf, debug_off);
+                rc = debug_off;
+        }
+        spin_unlock_irqrestore(&portals_debug_lock, flags);
+
+        return rc;
+}
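The "juggling with the 1s" above is easier to see with concrete numbers. Below is a stand-alone, user-space rendering of the same two-copy reconstruction, with memcpy standing in for copy_to_user; the eight-byte ring and its contents are made up purely for the example.

#include <assert.h>
#include <string.h>

int main(void)
{
        /* wrapped ring: the oldest text starts just past the nul */
        char ring[8] = { 'E', 'F', 'G', '\0', 'A', 'B', 'C', 'D' };
        unsigned long size = 8, off = 3;        /* off == index of the nul */
        char out[8];

        memcpy(out, ring + off + 1, size - off - 1);    /* "ABCD"         */
        memcpy(out + size - off - 1, ring, off + 1);    /* "EFG" + nul    */
        assert(memcmp(out, "ABCDEFG", 8) == 0);         /* nul lands last */
        return 0;
}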
+
+/* FIXME: I'm not very smart; someone smarter should make this better. */
+void
+portals_debug_msg (int subsys, int mask, char *file, char *fn, int line,
+                   unsigned long stack, const char *format, ...)
+{
+        va_list       ap;
+        unsigned long flags;
+        int           max_nob;
+        int           prefix_nob;
+        int           msg_nob;
+        struct timeval tv;
+        unsigned long base_offset;
+        unsigned long debug_off;
+
+        if (debug_buf == NULL) {
+                printk("portals_debug_msg: debug_buf is NULL!\n");
+                return;
+        }
+
+        spin_lock_irqsave(&portals_debug_lock, flags);
+        debug_off = atomic_read(&debug_off_a);
+        if (!atomic_read(&debug_daemon_state.paused)) {
+                unsigned long available;
+                long delta;
+                long v = atomic_read(&debug_daemon_next_write);
+
+                delta = debug_off - v;
+                available = (delta>=0) ? debug_size-delta : -delta;
+                // Check if we still have enough debug buffer for CDEBUG
+                if (available < DAEMON_SND_SIZE) {
+                        /* Drop CDEBUG packets until enough debug_buffer is
+                         * available */
+                        if (debug_daemon_state.overlapped)
+                                 goto out;
+                        /* If this is the first time, leave a marker in the
+                         * output */
+                        debug_daemon_state.overlapped = 1;
+                        ap = NULL;
+                        format = "DEBUG MARKER: Debug buffer overlapped\n";
+                } else  /* More space just became available */
+                        debug_daemon_state.overlapped = 0;
+        }
+
+        max_nob = debug_size - debug_off + DEBUG_OVERFLOW;
+        if (max_nob <= 0) {
+                spin_unlock_irqrestore(&portals_debug_lock, flags);
+                printk("logic error in portals_debug_msg: <0 bytes to write\n");
+                return;
+        }
+
+        /* NB since we pass a non-zero sized buffer (at least) on the first
+         * print, we can be assured that by the end of all the snprinting,
+         * we _do_ have a terminated buffer, even if our message got truncated.
+         */
+
+        do_gettimeofday(&tv);
+
+        prefix_nob = snprintf(debug_buf + debug_off, max_nob,
+                              "%02x:%06x:%d:%lu.%06lu ",
+                              subsys >> 24, mask, smp_processor_id(),
+                              tv.tv_sec, tv.tv_usec);
+        max_nob -= prefix_nob;
+
+#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20))
+        msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob,
+                           "(%s:%d:%s() %d | %d+%lu): ",
+                           file, line, fn, current->pid,
+                           current->thread.extern_pid, stack);
+#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob,
+                           "(%s:%d:%s() %d | %d+%lu): ",
+                           file, line, fn, current->pid,
+                           current->thread.mode.tt.extern_pid, stack);
+#else
+        msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob,
+                           "(%s:%d:%s() %d+%lu): ",
+                           file, line, fn, current->pid, stack);
+#endif
+        max_nob -= msg_nob;
+
+        va_start(ap, format);
+        msg_nob += vsnprintf(debug_buf + debug_off + prefix_nob + msg_nob,
+                            max_nob, format, ap);
+        max_nob -= msg_nob;
+        va_end(ap);
+
+        /* Print to console, while msg is contiguous in debug_buf */
+        /* NB safely terminated see above */
+        if ((mask & D_EMERG) != 0)
+                printk(KERN_EMERG "%s", debug_buf + debug_off + prefix_nob);
+        else if ((mask & D_ERROR) != 0)
+                printk(KERN_ERR   "%s", debug_buf + debug_off + prefix_nob);
+        else if (portal_printk)
+                printk("<%d>%s", portal_printk, debug_buf+debug_off+prefix_nob);
+        base_offset = debug_off & 0xFFFF;
+
+        debug_off += prefix_nob + msg_nob;
+        if (debug_off > debug_size) {
+                memcpy(debug_buf, debug_buf + debug_size,
+                       debug_off - debug_size + 1);
+                debug_off -= debug_size;
+                debug_wrapped = 1;
+        }
+
+        atomic_set(&debug_off_a, debug_off);
+        if (!atomic_read(&debug_daemon_state.paused) &&
+            ((base_offset+prefix_nob+msg_nob) >= DAEMON_SND_SIZE)) {
+                debug_daemon_state.daemon_event = 1;
+                wake_up(&debug_daemon_state.daemon);
+        }
+out:
+        spin_unlock_irqrestore(&portals_debug_lock, flags);
+}
+
+void portals_debug_set_level(unsigned int debug_level)
+{
+        printk("Setting portals debug level to %08x\n", debug_level);
+        portal_debug = debug_level;
+}
+
+void portals_run_lbug_upcall(char * file, char *fn, int line)
+{
+        char *argv[6];
+        char *envp[3];
+        char buf[32];
+        int rc;
+
+        ENTRY;
+        snprintf (buf, sizeof buf, "%d", line);
+
+        argv[0] = portals_upcall;
+        argv[1] = "LBUG";
+        argv[2] = file;
+        argv[3] = fn;
+        argv[4] = buf;
+        argv[5] = NULL;
+
+        envp[0] = "HOME=/";
+        envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+        envp[2] = NULL;
+
+        rc = call_usermodehelper(argv[0], argv, envp);
+        if (rc < 0) {
+                CERROR("Error invoking lbug upcall %s %s %s %s %s: %d; check "
+                       "/proc/sys/portals/upcall\n",
+                       argv[0], argv[1], argv[2], argv[3], argv[4], rc);
+        } else {
+                CERROR("Invoked upcall %s %s %s %s %s\n",
+                       argv[0], argv[1], argv[2], argv[3], argv[4]);
+        }
+}
+
+
+EXPORT_SYMBOL(portals_debug_dumplog);
+EXPORT_SYMBOL(portals_debug_msg);
+EXPORT_SYMBOL(portals_debug_set_level);
+EXPORT_SYMBOL(portals_run_lbug_upcall);
diff --git a/lnet/libcfs/module.c b/lnet/libcfs/module.c
new file mode 100644 (file)
index 0000000..5e3fcb5
--- /dev/null
@@ -0,0 +1,574 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+#include <linux/miscdevice.h>
+
+#include <portals/lib-p30.h>
+#include <portals/p30.h>
+#include <linux/kp30.h>
+#include <linux/portals_compat25.h>
+
+#define PORTAL_MINOR 240
+
+extern void (kping_client)(struct portal_ioctl_data *);
+
+struct nal_cmd_handler {
+        nal_cmd_handler_t nch_handler;
+        void * nch_private;
+};
+
+static struct nal_cmd_handler nal_cmd[NAL_MAX_NR + 1];
+struct semaphore nal_cmd_sem;
+
+#ifdef PORTAL_DEBUG
+void
+kportal_assertion_failed (char *expr, char *file, char *func, int line)
+{
+        portals_debug_msg(0, D_EMERG, file, func, line, CDEBUG_STACK(),
+                          "ASSERTION(%s) failed\n", expr);
+        LBUG_WITH_LOC(file, func, line);
+}
+#endif
+
+void
+kportal_daemonize (char *str) 
+{
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,63))
+        daemonize(str);
+#else
+        daemonize();
+        snprintf (current->comm, sizeof (current->comm), "%s", str);
+#endif
+}
+
+void
+kportal_blockallsigs ()
+{
+        unsigned long  flags;
+
+        SIGNAL_MASK_LOCK(current, flags);
+        sigfillset(&current->blocked);
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
+}
+
+/* called when opening /dev/device */
+static int kportal_psdev_open(struct inode * inode, struct file * file)
+{
+        ENTRY;
+
+        if (!inode)
+                RETURN(-EINVAL);
+        PORTAL_MODULE_USE;
+        RETURN(0);
+}
+
+/* called when closing /dev/device */
+static int kportal_psdev_release(struct inode * inode, struct file * file)
+{
+        ENTRY;
+
+        if (!inode)
+                RETURN(-EINVAL);
+
+        PORTAL_MODULE_UNUSE;
+        RETURN(0);
+}
+
+static inline void freedata(void *data, int len)
+{
+        PORTAL_FREE(data, len);
+}
+
+static int
+kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid,
+                  ptl_nid_t hi_nid)
+{
+        int rc;
+        kpr_control_interface_t *ci;
+
+        ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET (kpr_control_interface);
+        if (ci == NULL)
+                return (-ENODEV);
+
+        rc = ci->kprci_add_route (gateway_nalid, gateway_nid, lo_nid, hi_nid);
+
+        PORTAL_SYMBOL_PUT(kpr_control_interface);
+        return (rc);
+}
+
+static int
+kportal_del_route(ptl_nid_t target)
+{
+        int rc;
+        kpr_control_interface_t *ci;
+
+        ci = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface);
+        if (ci == NULL)
+                return (-ENODEV);
+
+        rc = ci->kprci_del_route (target);
+
+        PORTAL_SYMBOL_PUT(kpr_control_interface);
+        return (rc);
+}
+
+static int
+kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp,
+                  ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp)
+{
+        int       gateway_nalid;
+        ptl_nid_t gateway_nid;
+        ptl_nid_t lo_nid;
+        ptl_nid_t hi_nid;
+        int       rc;
+        kpr_control_interface_t *ci;
+
+        ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET(kpr_control_interface);
+        if (ci == NULL)
+                return (-ENODEV);
+
+        rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, &lo_nid,
+                                 &hi_nid);
+
+        if (rc == 0) {
+                CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64"\n",
+                       index, gateway_nalid, gateway_nid, lo_nid, hi_nid);
+
+                *gateway_nalidp = (__u32)gateway_nalid;
+                *gateway_nidp   = gateway_nid;
+                *lo_nidp        = lo_nid;
+                *hi_nidp        = hi_nid;
+        }
+
+        PORTAL_SYMBOL_PUT (kpr_control_interface);
+        return (rc);
+}
+
+static int
+kportal_nal_cmd(int nal, struct portal_ioctl_data *data)
+{
+        int rc = -EINVAL;
+
+        ENTRY;
+
+        down(&nal_cmd_sem);
+        if (nal > 0 && nal <= NAL_MAX_NR && nal_cmd[nal].nch_handler) {
+                CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, data->ioc_nal_cmd);
+                rc = nal_cmd[nal].nch_handler(data, nal_cmd[nal].nch_private);
+        }
+        up(&nal_cmd_sem);
+        RETURN(rc);
+}
+
+ptl_handle_ni_t *
+kportal_get_ni (int nal)
+{
+
+        switch (nal)
+        {
+        case QSWNAL:
+                return (PORTAL_SYMBOL_GET(kqswnal_ni));
+        case SOCKNAL:
+                return (PORTAL_SYMBOL_GET(ksocknal_ni));
+        case TOENAL:
+                return  (PORTAL_SYMBOL_GET(ktoenal_ni));
+        case GMNAL:
+                return  (PORTAL_SYMBOL_GET(kgmnal_ni));
+        case TCPNAL:
+                /* userspace NAL */
+                return (NULL);
+        case SCIMACNAL:
+                return  (PORTAL_SYMBOL_GET(kscimacnal_ni));
+        default:
+                /* A warning to a naive caller */
+                CERROR ("unknown nal: %d\n", nal);
+                return (NULL);
+        }
+}
+
+void
+kportal_put_ni (int nal)
+{
+
+        switch (nal)
+        {
+        case QSWNAL:
+                PORTAL_SYMBOL_PUT(kqswnal_ni);
+                break;
+        case SOCKNAL:
+                PORTAL_SYMBOL_PUT(ksocknal_ni);
+                break;
+        case TOENAL:
+                PORTAL_SYMBOL_PUT(ktoenal_ni);
+                break;
+        case GMNAL:
+                PORTAL_SYMBOL_PUT(kgmnal_ni);
+                break;
+        case TCPNAL:
+                /* A lesson to a malicious caller */
+                LBUG ();
+        case SCIMACNAL:
+                PORTAL_SYMBOL_PUT(kscimacnal_ni);
+                break;
+        default:
+                CERROR ("unknown nal: %d\n", nal);
+        }
+}
+
+int
+kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private)
+{
+        int rc = 0;
+
+        CDEBUG(D_IOCTL, "Register NAL %d, handler: %p\n", nal, handler);
+
+        if (nal > 0  && nal <= NAL_MAX_NR) {
+                down(&nal_cmd_sem);
+                if (nal_cmd[nal].nch_handler != NULL)
+                        rc = -EBUSY;
+                else {
+                        nal_cmd[nal].nch_handler = handler;
+                        nal_cmd[nal].nch_private = private;
+                }
+                up(&nal_cmd_sem);
+        }
+        return rc;
+}
+
+int
+kportal_nal_unregister(int nal)
+{
+        int rc = 0;
+
+        CDEBUG(D_IOCTL, "Unregister NAL %d\n", nal);
+
+        if (nal > 0  && nal <= NAL_MAX_NR) {
+                down(&nal_cmd_sem);
+                nal_cmd[nal].nch_handler = NULL;
+                nal_cmd[nal].nch_private = NULL;
+                up(&nal_cmd_sem);
+        }
+        return rc;
+}
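A sketch of how a NAL module would be expected to use this registration pair. The handler signature is inferred from the dispatch in kportal_nal_cmd() above (a struct portal_ioctl_data pointer plus the private cookie); the handler body and the choice of SOCKNAL as the slot are illustrative only.

/* Hypothetical NAL-side usage; not part of this patch. */
static int example_nal_cmd(struct portal_ioctl_data *data, void *private)
{
        CDEBUG(D_IOCTL, "example NAL cmd %d\n", data->ioc_nal_cmd);
        return 0;
}

static int __init example_nal_init(void)
{
        /* claim the SOCKNAL slot; returns -EBUSY if it is already taken */
        return kportal_nal_register(SOCKNAL, example_nal_cmd, NULL);
}

static void __exit example_nal_exit(void)
{
        kportal_nal_unregister(SOCKNAL);
}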
+
+
+static int kportal_ioctl(struct inode *inode, struct file *file,
+                         unsigned int cmd, unsigned long arg)
+{
+        int err = 0;
+        char buf[1024];
+        struct portal_ioctl_data *data;
+
+        ENTRY;
+
+        if ( _IOC_TYPE(cmd) != IOC_PORTAL_TYPE ||
+             _IOC_NR(cmd) < IOC_PORTAL_MIN_NR  ||
+             _IOC_NR(cmd) > IOC_PORTAL_MAX_NR ) {
+                CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n",
+                                _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
+                RETURN(-EINVAL);
+        }
+
+        if (portal_ioctl_getdata(buf, buf + 800, (void *)arg)) {
+                CERROR("PORTALS ioctl: data error\n");
+                RETURN(-EINVAL);
+        }
+
+        data = (struct portal_ioctl_data *)buf;
+
+        switch (cmd) {
+        case IOC_PORTAL_SET_DAEMON: 
+                RETURN (portals_debug_set_daemon ( 
+                                        (unsigned int) data->ioc_count,
+                                        (unsigned int) data->ioc_inllen1,
+                                        (char *) data->ioc_inlbuf1,
+                                        (unsigned int) data->ioc_misc)); 
+        case IOC_PORTAL_GET_DEBUG: {
+                __s32 size = portals_debug_copy_to_user(data->ioc_pbuf1,
+                                                        data->ioc_plen1);
+
+                if (size < 0)
+                        RETURN(size);
+
+                data->ioc_size = size;
+                if (copy_to_user((char *)arg, data, sizeof(*data)))
+                        err = -EFAULT;
+                RETURN(err);
+        }
+        case IOC_PORTAL_CLEAR_DEBUG:
+                portals_debug_clear_buffer();
+                RETURN(0);
+        case IOC_PORTAL_PANIC:
+                if (!capable (CAP_SYS_BOOT))
+                        RETURN (-EPERM);
+                panic("debugctl-invoked panic");
+                RETURN(0);
+        case IOC_PORTAL_MARK_DEBUG:
+                if (data->ioc_inlbuf1 == NULL ||
+                    data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0')
+                        RETURN(-EINVAL);
+                portals_debug_mark_buffer(data->ioc_inlbuf1);
+                RETURN(0);
+        case IOC_PORTAL_PING: {
+                void (*ping)(struct portal_ioctl_data *);
+
+                CDEBUG(D_IOCTL, "doing %d pings to nid "LPU64"\n",
+                       data->ioc_count, data->ioc_nid);
+                ping = PORTAL_SYMBOL_GET(kping_client);
+                if (!ping)
+                        CERROR("PORTAL_SYMBOL_GET failed\n");
+                else {
+                        ping(data);
+                        PORTAL_SYMBOL_PUT(kping_client);
+                }
+                RETURN(0);
+        }
+
+        case IOC_PORTAL_ADD_ROUTE:
+                CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n",
+                       data->ioc_nal, data->ioc_nid, data->ioc_nid2,
+                       data->ioc_nid3);
+                err = kportal_add_route(data->ioc_nal, data->ioc_nid,
+                                        MIN (data->ioc_nid2, data->ioc_nid3),
+                                        MAX (data->ioc_nid2, data->ioc_nid3));
+                break;
+
+        case IOC_PORTAL_DEL_ROUTE:
+                CDEBUG (D_IOCTL, "Removing route to "LPU64"\n", data->ioc_nid);
+                err = kportal_del_route (data->ioc_nid);
+                break;
+
+        case IOC_PORTAL_GET_ROUTE:
+                CDEBUG (D_IOCTL, "Getting route [%d]\n", data->ioc_count);
+                err = kportal_get_route(data->ioc_count, &data->ioc_nal,
+                                        &data->ioc_nid, &data->ioc_nid2,
+                                        &data->ioc_nid3);
+                if (err == 0)
+                        if (copy_to_user((char *)arg, data, sizeof (*data)))
+                                err = -EFAULT;
+                break;
+
+        case IOC_PORTAL_GET_NID: {
+                const ptl_handle_ni_t *nip;
+                ptl_process_id_t       pid;
+
+                CDEBUG (D_IOCTL, "Getting nid [%d]\n", data->ioc_nal);
+
+                nip = kportal_get_ni (data->ioc_nal);
+                if (nip == NULL)
+                        RETURN (-EINVAL);
+
+                err = PtlGetId (*nip, &pid);
+                LASSERT (err == PTL_OK);
+                kportal_put_ni (data->ioc_nal);
+
+                data->ioc_nid = pid.nid;
+                if (copy_to_user ((char *)arg, data, sizeof (*data)))
+                        err = -EFAULT;
+                break;
+        }
+
+        case IOC_PORTAL_NAL_CMD:
+                CDEBUG (D_IOCTL, "nal command nal %d cmd %d\n", data->ioc_nal,
+                        data->ioc_nal_cmd);
+                err = kportal_nal_cmd(data->ioc_nal, data);
+                if (err == 0)
+                        if (copy_to_user((char *)arg, data, sizeof (*data)))
+                                err = -EFAULT;
+                break;
+
+        case IOC_PORTAL_FAIL_NID: {
+                const ptl_handle_ni_t *nip;
+
+                CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n",
+                        data->ioc_nal, data->ioc_nid, data->ioc_count);
+
+                nip = kportal_get_ni (data->ioc_nal);
+                if (nip == NULL)
+                        RETURN(-EINVAL);
+
+                err = PtlFailNid (*nip, data->ioc_nid, data->ioc_count);
+                break;
+        }
+
+        default:
+                err = -EINVAL;
+                break;
+        }
+
+        RETURN(err);
+}
+
+
+static struct file_operations portalsdev_fops = {
+        ioctl:   kportal_ioctl,
+        open:    kportal_psdev_open,
+        release: kportal_psdev_release
+};
+
+
+static struct miscdevice portal_dev = {
+        PORTAL_MINOR,
+        "portals",
+        &portalsdev_fops
+};
+
+extern int insert_proc(void);
+extern void remove_proc(void);
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+static int init_kportals_module(void)
+{
+        int rc;
+
+        rc = portals_debug_init(5 * 1024 * 1024);
+        if (rc < 0) {
+                printk(KERN_ERR "portals_debug_init: %d\n", rc);
+                return (rc);
+        }
+
+        sema_init(&nal_cmd_sem, 1);
+
+        rc = misc_register(&portal_dev);
+        if (rc) {
+                CERROR("misc_register: error %d\n", rc);
+                goto cleanup_debug;
+        }
+
+        rc = PtlInit();
+        if (rc) {
+                CERROR("PtlInit: error %d\n", rc);
+                goto cleanup_deregister;
+        }
+
+        rc = insert_proc();
+        if (rc) {
+                CERROR("insert_proc: error %d\n", rc);
+                goto cleanup_fini;
+        }
+
+        CDEBUG (D_OTHER, "portals setup OK\n");
+        return (0);
+
+ cleanup_fini:
+        PtlFini();
+ cleanup_deregister:
+        misc_deregister(&portal_dev);
+ cleanup_debug:
+        portals_debug_cleanup();
+        return rc;
+}
+
+static void exit_kportals_module(void)
+{
+        int rc;
+
+        remove_proc();
+        PtlFini();
+
+        CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n",
+               atomic_read(&portal_kmemory));
+
+
+        rc = misc_deregister(&portal_dev);
+        if (rc)
+                CERROR("misc_deregister error %d\n", rc);
+
+        if (atomic_read(&portal_kmemory) != 0)
+                CERROR("Portals memory leaked: %d bytes\n",
+                       atomic_read(&portal_kmemory));
+
+        rc = portals_debug_cleanup();
+        if (rc)
+                printk(KERN_ERR "portals_debug_cleanup: %d\n", rc);
+}
+
+EXPORT_SYMBOL(lib_dispatch);
+EXPORT_SYMBOL(PtlMEAttach);
+EXPORT_SYMBOL(PtlMEInsert);
+EXPORT_SYMBOL(PtlMEUnlink);
+EXPORT_SYMBOL(PtlEQAlloc);
+EXPORT_SYMBOL(PtlMDAttach);
+EXPORT_SYMBOL(PtlMDUnlink);
+EXPORT_SYMBOL(PtlNIInit);
+EXPORT_SYMBOL(PtlNIFini);
+EXPORT_SYMBOL(PtlNIDebug);
+EXPORT_SYMBOL(PtlInit);
+EXPORT_SYMBOL(PtlFini);
+EXPORT_SYMBOL(PtlPut);
+EXPORT_SYMBOL(PtlGet);
+EXPORT_SYMBOL(ptl_err_str);
+EXPORT_SYMBOL(portal_subsystem_debug);
+EXPORT_SYMBOL(portal_debug);
+EXPORT_SYMBOL(portal_stack);
+EXPORT_SYMBOL(portal_printk);
+EXPORT_SYMBOL(PtlEQWait);
+EXPORT_SYMBOL(PtlEQFree);
+EXPORT_SYMBOL(PtlEQGet);
+EXPORT_SYMBOL(PtlGetId);
+EXPORT_SYMBOL(PtlMDBind);
+EXPORT_SYMBOL(lib_iov_nob);
+EXPORT_SYMBOL(lib_copy_iov2buf);
+EXPORT_SYMBOL(lib_copy_buf2iov);
+EXPORT_SYMBOL(lib_kiov_nob);
+EXPORT_SYMBOL(lib_copy_kiov2buf);
+EXPORT_SYMBOL(lib_copy_buf2kiov);
+EXPORT_SYMBOL(lib_finalize);
+EXPORT_SYMBOL(lib_parse);
+EXPORT_SYMBOL(lib_init);
+EXPORT_SYMBOL(lib_fini);
+EXPORT_SYMBOL(portal_kmemory);
+EXPORT_SYMBOL(kportal_daemonize);
+EXPORT_SYMBOL(kportal_blockallsigs);
+EXPORT_SYMBOL(kportal_nal_register);
+EXPORT_SYMBOL(kportal_nal_unregister);
+EXPORT_SYMBOL(kportal_assertion_failed);
+EXPORT_SYMBOL(dispatch_name);
+EXPORT_SYMBOL(kportal_get_ni);
+EXPORT_SYMBOL(kportal_put_ni);
+
+module_init(init_kportals_module);
+module_exit (exit_kportals_module);
diff --git a/lnet/libcfs/proc.c b/lnet/libcfs/proc.c
new file mode 100644 (file)
index 0000000..2fa739a
--- /dev/null
@@ -0,0 +1,290 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#include <linux/proc_fs.h>
+#include <linux/sysctl.h>
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <linux/kp30.h>
+#include <asm/div64.h>
+
+static struct ctl_table_header *portals_table_header = NULL;
+extern char debug_file_path[1024];
+extern char debug_daemon_file_path[1024];
+extern char portals_upcall[1024];
+
+#define PSDEV_PORTALS  (0x100)
+#define PSDEV_DEBUG           1   /* control debugging mask */
+#define PSDEV_SUBSYSTEM_DEBUG 2   /* control per-subsystem debug mask */
+#define PSDEV_PRINTK          3   /* force all errors to console */
+#define PSDEV_DEBUG_PATH      4   /* crashdump log location */
+#define PSDEV_DEBUG_DUMP_PATH 5   /* crashdump tracelog location */
+#define PSDEV_PORTALS_UPCALL  6   /* User mode upcall script  */
+
+#define PORTALS_PRIMARY_CTLCNT 6
+static struct ctl_table portals_table[PORTALS_PRIMARY_CTLCNT + 1] = {
+        {PSDEV_DEBUG, "debug", &portal_debug, sizeof(int), 0644, NULL,
+         &proc_dointvec},
+        {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &portal_subsystem_debug,
+         sizeof(int), 0644, NULL, &proc_dointvec},
+        {PSDEV_PRINTK, "printk", &portal_printk, sizeof(int), 0644, NULL,
+         &proc_dointvec},
+        {PSDEV_DEBUG_PATH, "debug_path", debug_file_path,
+         sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string},
+        {PSDEV_DEBUG_DUMP_PATH, "debug_daemon_path", debug_daemon_file_path,
+         sizeof(debug_daemon_file_path), 0644, NULL, &proc_dostring,
+         &sysctl_string},
+        {PSDEV_PORTALS_UPCALL, "upcall", portals_upcall,
+         sizeof(portals_upcall), 0644, NULL, &proc_dostring,
+         &sysctl_string},
+        {0}
+};
+
+static struct ctl_table top_table[2] = {
+        {PSDEV_PORTALS, "portals", NULL, 0, 0555, portals_table},
+        {0}
+};
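Once register_sysctl_table() runs in insert_proc() below, each row above surfaces as a file under /proc/sys/portals, named after its second column. A small user-space sketch of writing one of them follows; the path is assumed from the "portals" and "debug" names above, and -1 simply sets every debug bit.

#include <stdio.h>

int main(void)
{
        /* path assumed from top_table ("portals") + portals_table ("debug") */
        FILE *f = fopen("/proc/sys/portals/debug", "w");

        if (f == NULL) {
                perror("fopen /proc/sys/portals/debug");
                return 1;
        }
        fprintf(f, "%d\n", -1);         /* enable all debug bits */
        fclose(f);
        return 0;
}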
+
+
+#ifdef PORTALS_PROFILING
+/*
+ * profiling stuff.  we do this statically for now 'cause it's simple,
+ * but we could do some tricks with elf sections to have this array
+ * automatically built.
+ */
+#define def_prof(FOO) [PROF__##FOO] = {#FOO, 0, }
+
+struct prof_ent prof_ents[] = {
+        def_prof(our_recvmsg),
+        def_prof(our_sendmsg),
+        def_prof(socknal_recv),
+        def_prof(lib_parse),
+        def_prof(conn_list_walk),
+        def_prof(memcpy),
+        def_prof(lib_finalize),
+        def_prof(pingcli_time),
+        def_prof(gmnal_send),
+        def_prof(gmnal_recv),
+};
+
+EXPORT_SYMBOL(prof_ents);
+
+/*
+ * this function is as crazy as the proc filling api
+ * requires.
+ *
+ * buffer: page allocated for us to scribble in.  the
+ *  data returned to the user will be taken from here.
+ * *start: address of the pointer that will tell the 
+ *  caller where in buffer the data the user wants is.
+ * ppos: offset in the entire /proc file that the user
+ *  currently wants.
+ * wanted: the amount of data the user wants.
+ *
+ * while going, 'curpos' is the offset in the entire
+ * file where we currently are.  We only actually
+ * start filling buffer when we get to a place in
+ * the file that the user cares about.
+ *
+ * we take care to only sprintf when the user cares because
+ * we're holding a lock while we do this.
+ *
+ * we're smart and know that we generate fixed size lines.
+ * we only start writing to the buffer when the user cares.
+ * This is unpredictable because we don't snapshot the
+ * list between calls that are filling in a file from
+ * the list.  The list could change mid read and the
+ * output will look very weird indeed.  oh well.
+ */
+
+static int prof_read_proc(char *buffer, char **start, off_t ppos, int wanted,
+                          int *eof, void *data)
+{
+        int len = 0, i;
+        int curpos;
+        char *header = "Interval        Cycles_per (Starts Finishes Total)\n";
+        int header_len = strlen(header);
+        char *format = "%-15s %.12Ld (%.12d %.12d %.12Ld)";
+        int line_len = (15 + 1 + 12 + 2 + 12 + 1 + 12 + 1 + 12 + 1);
+
+        *start = buffer;
+
+        if (ppos < header_len) {
+                int diff = MIN(header_len, wanted);
+                memcpy(buffer, header + ppos, diff);
+                len += diff;
+                ppos += diff;
+        }
+
+        if (len >= wanted)
+                goto out;
+
+        curpos = header_len;
+
+        for ( i = 0; i < MAX_PROFS ; i++) {
+                int copied;
+                struct prof_ent *pe = &prof_ents[i];
+                long long cycles_per;
+                /*
+                 * find the part of the array that the buffer wants
+                 */
+                if (ppos >= (curpos + line_len))  {
+                        curpos += line_len;
+                        continue;
+                }
+                /* the clever caller split a line */
+                if (ppos > curpos) {
+                        *start = buffer + (ppos - curpos);
+                }
+
+                if (pe->finishes == 0)
+                        cycles_per = 0;
+                else
+                {
+                        cycles_per = pe->total_cycles;
+                        do_div (cycles_per, pe->finishes);
+                }
+
+                copied = sprintf(buffer + len, format, pe->str, cycles_per,
+                                 pe->starts, pe->finishes, pe->total_cycles);
+
+                len += copied;
+
+                /* pad to line len, -1 for \n */
+                if ((copied < line_len-1)) {
+                        int diff = (line_len-1) - copied;
+                        memset(buffer + len, ' ', diff);
+                        len += diff;
+                        copied += diff;
+                }
+
+                buffer[len++]= '\n';
+
+                /* bail if we have enough */
+                if (((buffer + len) - *start) >= wanted)
+                        break;
+
+                curpos += line_len;
+        }
+
+        /* lameness */
+        if (i == MAX_PROFS)
+                *eof = 1;
+ out:
+
+        return MIN(((buffer + len) - *start), wanted);
+}
+
+/*
+ * all kids love /proc :/
+ */
+static unsigned char basedir[]="net/portals";
+#endif /* PORTALS_PROFILING */
+
+int insert_proc(void)
+{
+#if PORTALS_PROFILING
+        unsigned char dir[128];
+        struct proc_dir_entry *ent;
+
+        if (ARRAY_SIZE(prof_ents) != MAX_PROFS) {
+                CERROR("profiling enum and array are out of sync.\n");
+                return -1;
+        }
+
+        /*
+         * This is pretty lame; we assume that failure just
+         * means the entry already existed.
+         */
+        dir[0] = '\0';
+        strcat(dir, basedir);
+        create_proc_entry(dir, S_IFDIR, 0);
+
+        strcat(dir, "/cycles");
+        ent = create_proc_entry(dir, 0, 0);
+        if (!ent) {
+                CERROR("couldn't register %s?\n", dir);
+                return -1;
+        }
+
+        ent->data = NULL;
+        ent->read_proc = prof_read_proc;
+#endif /* PORTALS_PROFILING */
+
+#ifdef CONFIG_SYSCTL
+        if (!portals_table_header)
+                portals_table_header = register_sysctl_table(top_table, 0);
+#endif
+
+        return 0;
+}
+
+void remove_proc(void)
+{
+#if PORTALS_PROFILING
+        unsigned char dir[128];
+        int end;
+
+        dir[0]='\0';
+        strcat(dir, basedir);
+
+        end = strlen(dir);
+
+        strcat(dir, "/cycles");
+        remove_proc_entry(dir,0);
+
+        dir[end] = '\0';
+        remove_proc_entry(dir,0);
+#endif /* PORTALS_PROFILING */
+
+#ifdef CONFIG_SYSCTL
+        if (portals_table_header)
+                unregister_sysctl_table(portals_table_header);
+        portals_table_header = NULL;
+#endif
+}
diff --git a/lnet/lnet/.cvsignore b/lnet/lnet/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lnet/lnet/Makefile.am b/lnet/lnet/Makefile.am
new file mode 100644 (file)
index 0000000..8c03749
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2002  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+
+CPPFLAGS=
+INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include
+lib_LIBRARIES= libportals.a
+libportals_a_SOURCES= api-eq.c api-init.c api-me.c api-errno.c api-ni.c api-wrap.c lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-eq.c lib-md.c lib-move.c lib-ni.c lib-pid.c
diff --git a/lnet/lnet/Makefile.mk b/lnet/lnet/Makefile.mk
new file mode 100644 (file)
index 0000000..5627ef7
--- /dev/null
@@ -0,0 +1,9 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Kernelenv
+
+obj-y += portals.o
+portals-objs    := lib-dispatch.o lib-eq.o lib-init.o lib-md.o lib-me.o lib-move.o lib-msg.o lib-ni.o lib-not-impl.o lib-pid.o api-eq.o api-errno.o api-init.o api-md.o api-me.o api-ni.o api-wrap.o
diff --git a/lnet/lnet/api-eq.c b/lnet/lnet/api-eq.c
new file mode 100644 (file)
index 0000000..e066619
--- /dev/null
@@ -0,0 +1,158 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-eq.c
+ * User-level event queue management routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+int ptl_eq_init(void)
+{
+        /* Nothing to do anymore... */
+        return PTL_OK;
+}
+
+void ptl_eq_fini(void)
+{
+        /* Nothing to do anymore... */
+}
+
+int ptl_eq_ni_init(nal_t * nal)
+{
+        /* Nothing to do anymore... */
+        return PTL_OK;
+}
+
+void ptl_eq_ni_fini(nal_t * nal)
+{
+        /* Nothing to do anymore... */
+}
+
+int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t * ev)
+{
+        ptl_eq_t *eq;
+        int rc, new_index;
+        unsigned long flags;
+        ptl_event_t *new_event;
+        nal_t *nal;
+        ENTRY;
+
+        if (!ptl_init)
+                RETURN(PTL_NOINIT);
+
+        nal = ptl_hndl2nal(&eventq);
+        if (!nal)
+                RETURN(PTL_INV_EQ);
+
+        eq = ptl_handle2usereq(&eventq);
+        nal->lock(nal, &flags);
+
+        /* size must be a power of 2 to handle a wrapped sequence # */
+        LASSERT (eq->size != 0 &&
+                 eq->size == LOWEST_BIT_SET (eq->size));
+
+        new_index = eq->sequence & (eq->size - 1);
+        new_event = &eq->base[new_index];
+        CDEBUG(D_INFO, "new_event: %p, sequence: %lu, eq->size: %u\n",
+               new_event, eq->sequence, eq->size);
+        if (PTL_SEQ_GT (eq->sequence, new_event->sequence)) {
+                nal->unlock(nal, &flags);
+                RETURN(PTL_EQ_EMPTY);
+        }
+
+        *ev = *new_event;
+
+        /* Set the unlinked_me interface number if there is one to pass
+         * back, since the NAL hasn't a clue what it is and therefore can't
+         * set it. */
+        if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE))
+                ev->unlinked_me.nal_idx = eventq.nal_idx;
+        
+        /* ensure event is delivered correctly despite possible 
+           races with lib_finalize */
+        if (eq->sequence != new_event->sequence) {
+                CERROR("DROPPING EVENT: eq seq %lu ev seq %lu\n",
+                       eq->sequence, new_event->sequence);
+                rc = PTL_EQ_DROPPED;
+        } else {
+                rc = PTL_OK;
+        }
+
+        eq->sequence = new_event->sequence + 1;
+        nal->unlock(nal, &flags);
+        RETURN(rc);
+}
+
+
+int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out)
+{
+        int rc;
+        
+        /* PtlEQGet does the handle checking */
+        while ((rc = PtlEQGet(eventq_in, event_out)) == PTL_EQ_EMPTY) {
+                nal_t *nal = ptl_hndl2nal(&eventq_in);
+                
+                if (nal->yield)
+                        nal->yield(nal);
+        }
+
+        return rc;
+}
+
+#ifndef __KERNEL__
+static jmp_buf eq_jumpbuf;
+
+static void eq_timeout(int signal)
+{
+        longjmp(eq_jumpbuf, -1);
+}
+
+int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
+                      int timeout)
+{
+        static void (*prev) (int);
+        static int left_over;
+        time_t time_at_start;
+        int rc;
+
+        if (setjmp(eq_jumpbuf)) {
+                signal(SIGALRM, prev);
+                if (left_over > timeout)
+                        alarm(left_over - timeout);
+                return PTL_EQ_EMPTY;
+        }
+
+        left_over = alarm(timeout);
+        prev = signal(SIGALRM, eq_timeout);
+        time_at_start = time(NULL);
+        if (left_over != 0 && left_over < timeout)
+                alarm(left_over);
+
+        rc = PtlEQWait(eventq_in, event_out);
+
+        signal(SIGALRM, prev);
+        alarm(left_over);       /* Should compute how long we waited */
+
+        return rc;
+}
+
+#endif
+
diff --git a/lnet/lnet/api-errno.c b/lnet/lnet/api-errno.c
new file mode 100644 (file)
index 0000000..026c93b
--- /dev/null
@@ -0,0 +1,55 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-errno.c
+ * Instantiate the string table of errors
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ */
+
+/* If you change these, you must update the number table in portals/errno.h */
+const char *ptl_err_str[] = {
+        "PTL_OK",
+        "PTL_SEGV",
+
+        "PTL_NOSPACE",
+        "PTL_INUSE",
+        "PTL_VAL_FAILED",
+
+        "PTL_NAL_FAILED",
+        "PTL_NOINIT",
+        "PTL_INIT_DUP",
+        "PTL_INIT_INV",
+        "PTL_AC_INV_INDEX",
+
+        "PTL_INV_ASIZE",
+        "PTL_INV_HANDLE",
+        "PTL_INV_MD",
+        "PTL_INV_ME",
+        "PTL_INV_NI",
+/* If you change these, you must update the number table in portals/errno.h */
+        "PTL_ILL_MD",
+        "PTL_INV_PROC",
+        "PTL_INV_PSIZE",
+        "PTL_INV_PTINDEX",
+        "PTL_INV_REG",
+
+        "PTL_INV_SR_INDX",
+        "PTL_ML_TOOLONG",
+        "PTL_ADDR_UNKNOWN",
+        "PTL_INV_EQ",
+        "PTL_EQ_DROPPED",
+
+        "PTL_EQ_EMPTY",
+        "PTL_NOUPDATE",
+        "PTL_FAIL",
+        "PTL_NOT_IMPLEMENTED",
+        "PTL_NO_ACK",
+
+        "PTL_IOV_TOO_MANY",
+        "PTL_IOV_TOO_SMALL",
+
+        "PTL_EQ_INUSE",
+        "PTL_MD_INUSE"
+};
+/* If you change these, you must update the number table in portals/errno.h */
diff --git a/lnet/lnet/api-init.c b/lnet/lnet/api-init.c
new file mode 100644 (file)
index 0000000..e59c922
--- /dev/null
@@ -0,0 +1,71 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-init.c
+ * Initialization and global data for the p30 user side library
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+int ptl_init;
+unsigned int portal_subsystem_debug = 0xfff7e3ff;
+unsigned int portal_debug = ~0;
+unsigned int portal_printk;
+unsigned int portal_stack;
+
+#ifdef __KERNEL__
+atomic_t portal_kmemory = ATOMIC_INIT(0);
+#endif
+
+int __p30_initialized;
+int __p30_myr_initialized;
+int __p30_ip_initialized;
+ptl_handle_ni_t __myr_ni_handle;
+ptl_handle_ni_t __ip_ni_handle;
+
+int __p30_myr_timeout = 10;
+int __p30_ip_timeout;
+
+int PtlInit(void)
+{
+
+        if (ptl_init)
+                return PTL_OK;
+
+        ptl_ni_init();
+        ptl_me_init();
+        ptl_eq_init();
+        ptl_init = 1;
+        __p30_initialized = 1;
+
+        return PTL_OK;
+}
+
+
+void PtlFini(void)
+{
+
+        /* Reverse order of initialization */
+        ptl_eq_fini();
+        ptl_me_fini();
+        ptl_ni_fini();
+        ptl_init = 0;
+}
diff --git a/lnet/lnet/api-me.c b/lnet/lnet/api-me.c
new file mode 100644 (file)
index 0000000..e724e58
--- /dev/null
@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-me.c
+ * Match Entry local operations.
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+int ptl_me_init(void)
+{
+        return PTL_OK;
+}
+void ptl_me_fini(void)
+{                                /* Nothing to do */
+}
+int ptl_me_ni_init(nal_t * nal)
+{
+        return PTL_OK;
+}
+
+void ptl_me_ni_fini(nal_t * nal)
+{                                /* Nothing to do... */
+}
diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c
new file mode 100644 (file)
index 0000000..b2e069e
--- /dev/null
@@ -0,0 +1,197 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-ni.c
+ * Network Interface code
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+/* Put some magic in the NI handle so uninitialised/zeroed handles are easy
+ * to spot */
+#define NI_HANDLE_MAGIC  0xebc0de00
+#define NI_HANDLE_MASK   0x000000ff
+#define MAX_NIS          8         
+static nal_t *ptl_interfaces[MAX_NIS];
+int ptl_num_interfaces = 0;
+
+nal_t *ptl_hndl2nal(ptl_handle_any_t *handle)
+{
+        unsigned int idx = handle->nal_idx;
+
+        /* XXX we really rely on the caller NOT racing with interface
+         * setup/teardown.  That ensures her NI handle can't get
+         * invalidated out from under her (or worse, swapped for a
+         * completely different interface!) */
+
+        if (((idx ^ NI_HANDLE_MAGIC) & ~NI_HANDLE_MASK) != 0)
+                return NULL;
+
+        idx &= NI_HANDLE_MASK;
+        if (idx < MAX_NIS)
+                return ptl_interfaces[idx];
+
+        return NULL;
+}
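+
+/* Worked example of the handle magic above: PtlNIInit() below stores
+ *      handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | idx
+ * so interface 3 gets nal_idx == 0xebc0de03.  ptl_hndl2nal() then checks
+ * ((0xebc0de03 ^ 0xebc0de00) & ~0xff) == 0 and recovers idx == 3, while a
+ * zeroed or random handle fails the magic check and maps to NULL. */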
+
+int ptl_ni_init(void)
+{
+        int i;
+
+        LASSERT (MAX_NIS <= (NI_HANDLE_MASK + 1));
+        
+        for (i = 0; i < MAX_NIS; i++)
+                ptl_interfaces[i] = NULL;
+
+        return PTL_OK;
+}
+
+void ptl_ni_fini(void)
+{
+        int i;
+
+        for (i = 0; i < MAX_NIS; i++) {
+                nal_t *nal = ptl_interfaces[i];
+                if (!nal)
+                        continue;
+
+                if (nal->shutdown)
+                        nal->shutdown(nal, i);
+        }
+}
+
+#ifdef __KERNEL__
+DECLARE_MUTEX(ptl_ni_init_mutex);
+
+static void ptl_ni_init_mutex_enter (void) 
+{
+        down (&ptl_ni_init_mutex);
+}
+
+static void ptl_ni_init_mutex_exit (void)
+{
+        up (&ptl_ni_init_mutex);
+}
+
+#else
+static void ptl_ni_init_mutex_enter (void)
+{
+}
+
+static void ptl_ni_init_mutex_exit (void) 
+{
+}
+
+#endif
+
+int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size,
+              ptl_ac_index_t acl_size, ptl_pid_t requested_pid,
+              ptl_handle_ni_t * handle)
+{
+        nal_t *nal;
+        int i;
+
+        if (!ptl_init)
+                return PTL_NOINIT;
+
+        ptl_ni_init_mutex_enter ();
+
+        nal = interface(ptl_num_interfaces, ptl_size, acl_size, requested_pid);
+
+        if (!nal) {
+                ptl_ni_init_mutex_exit ();
+                return PTL_NAL_FAILED;
+        }
+
+        for (i = 0; i < ptl_num_interfaces; i++) {
+                if (ptl_interfaces[i] == nal) {
+                        nal->refct++;
+                        handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | i;
+                        fprintf(stderr, "Returning existing NAL (%d)\n", i);
+                        ptl_ni_init_mutex_exit ();
+                        return PTL_OK;
+                }
+        }
+        nal->refct = 1;
+
+        if (ptl_num_interfaces >= MAX_NIS) {
+                if (nal->shutdown)
+                        nal->shutdown (nal, ptl_num_interfaces);
+                ptl_ni_init_mutex_exit ();
+                return PTL_NOSPACE;
+        }
+
+        handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | ptl_num_interfaces;
+        ptl_interfaces[ptl_num_interfaces++] = nal;
+
+        ptl_eq_ni_init(nal);
+        ptl_me_ni_init(nal);
+
+        ptl_ni_init_mutex_exit ();
+        return PTL_OK;
+}
+
+
+int PtlNIFini(ptl_handle_ni_t ni)
+{
+        nal_t *nal;
+        int idx;
+        int rc;
+
+        if (!ptl_init)
+                return PTL_NOINIT;
+
+        ptl_ni_init_mutex_enter ();
+
+        nal = ptl_hndl2nal (&ni);
+        if (nal == NULL) {
+                ptl_ni_init_mutex_exit ();
+                return PTL_INV_HANDLE;
+        }
+
+        idx = ni.nal_idx & NI_HANDLE_MASK;
+
+        nal->refct--;
+        if (nal->refct > 0) {
+                ptl_ni_init_mutex_exit ();
+                return PTL_OK;
+        }
+
+        ptl_me_ni_fini(nal);
+        ptl_eq_ni_fini(nal);
+
+        rc = PTL_OK;
+        if (nal->shutdown)
+                rc = nal->shutdown(nal, idx);
+
+        ptl_interfaces[idx] = NULL;
+        ptl_num_interfaces--;
+
+        ptl_ni_init_mutex_exit ();
+        return rc;
+}
+
+int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * ni_out)
+{
+        *ni_out = handle_in;
+
+        return PTL_OK;
+}
diff --git a/lnet/lnet/api-wrap.c b/lnet/lnet/api-wrap.c
new file mode 100644 (file)
index 0000000..e54707f
--- /dev/null
@@ -0,0 +1,599 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-wrap.c
+ * User-level wrappers that dispatch across the protection boundaries
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/api-support.h>
+
+static int do_forward(ptl_handle_any_t any_h, int cmd, void *argbuf,
+                      int argsize, void *retbuf, int retsize)
+{
+        nal_t *nal;
+
+        if (!ptl_init) {
+                fprintf(stderr, "do_forward: Portals not initialized\n");
+                return PTL_NOINIT;
+        }
+
+        nal = ptl_hndl2nal(&any_h);
+        if (!nal)
+                return PTL_INV_HANDLE;
+
+        nal->forward(nal, cmd, argbuf, argsize, retbuf, retsize);
+
+        return PTL_OK;
+}
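+
+/* All the wrappers below follow the same pattern: pack a PtlXXX_in block,
+ * do_forward() it across the protection boundary (where, judging by
+ * lib-dispatch.c, lib_dispatch() routes it to the matching do_PtlXXX()
+ * handler), then unpack the PtlXXX_out block.  The do_forward() return
+ * value reports dispatch failures (not initialised, bad handle); ret.rc
+ * carries the library-side status.  PtlGetId() below is the simplest case. */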
+
+int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id)
+{
+        PtlGetId_in args;
+        PtlGetId_out ret;
+        int rc;
+
+        args.handle_in = ni_handle;
+
+        rc = do_forward(ni_handle, PTL_GETID, &args, sizeof(args), &ret,
+                        sizeof(ret));
+        if (rc != PTL_OK)
+                return rc;
+        
+        if (id)
+                *id = ret.id_out;
+
+        return ret.rc;
+}
+
+int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold) 
+{
+        PtlFailNid_in  args;
+        PtlFailNid_out ret;
+        int            rc;
+        
+        args.interface = interface;
+        args.nid       = nid;
+        args.threshold = threshold;
+        
+        rc = do_forward (interface, PTL_FAILNID, 
+                         &args, sizeof(args), &ret, sizeof (ret));
+
+        return ((rc != PTL_OK) ? rc : ret.rc);
+}
+
+int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in,
+                ptl_sr_value_t * status_out)
+{
+        PtlNIStatus_in args;
+        PtlNIStatus_out ret;
+        int rc;
+
+        args.interface_in = interface_in;
+        args.register_in = register_in;
+
+        rc = do_forward(interface_in, PTL_NISTATUS, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        if (status_out)
+                *status_out = ret.status_out;
+
+        return ret.rc;
+}
+
+int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in,
+              unsigned long *distance_out)
+{
+        PtlNIDist_in args;
+        PtlNIDist_out ret;
+        int rc;
+
+        args.interface_in = interface_in;
+        args.process_in = process_in;
+
+        rc = do_forward(interface_in, PTL_NIDIST, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        if (distance_out)
+                *distance_out = ret.distance_out;
+
+        return ret.rc;
+}
+
+
+
+unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in)
+{
+        PtlNIDebug_in args;
+        PtlNIDebug_out ret;
+        int rc;
+
+        args.mask_in = mask_in;
+
+        rc = do_forward(ni, PTL_NIDEBUG, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        return ret.rc;
+}
+
+int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in,
+                ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in,
+                ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in,
+                ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out)
+{
+        PtlMEAttach_in args;
+        PtlMEAttach_out ret;
+        int rc;
+
+        args.interface_in = interface_in;
+        args.index_in = index_in;
+        args.match_id_in = match_id_in;
+        args.match_bits_in = match_bits_in;
+        args.ignore_bits_in = ignore_bits_in;
+        args.unlink_in = unlink_in;
+        args.position_in = pos_in;
+
+        rc = do_forward(interface_in, PTL_MEATTACH, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        if (handle_out) {
+                handle_out->nal_idx = interface_in.nal_idx;
+                handle_out->cookie = ret.handle_out.cookie;
+        }
+
+        return ret.rc;
+}
+
+int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in,
+                ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in,
+                ptl_unlink_t unlink_in, ptl_ins_pos_t position_in,
+                ptl_handle_me_t * handle_out)
+{
+        PtlMEInsert_in args;
+        PtlMEInsert_out ret;
+        int rc;
+
+        args.current_in = current_in;
+        args.match_id_in = match_id_in;
+        args.match_bits_in = match_bits_in;
+        args.ignore_bits_in = ignore_bits_in;
+        args.unlink_in = unlink_in;
+        args.position_in = position_in;
+
+        rc = do_forward(current_in, PTL_MEINSERT, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc;
+
+        if (handle_out) {
+                handle_out->nal_idx = current_in.nal_idx;
+                handle_out->cookie = ret.handle_out.cookie;
+        }
+        return ret.rc;
+}
+
+int PtlMEUnlink(ptl_handle_me_t current_in)
+{
+        PtlMEUnlink_in args;
+        PtlMEUnlink_out ret;
+        int rc;
+
+        args.current_in = current_in;
+        args.unlink_in = PTL_RETAIN;
+
+        rc = do_forward(current_in, PTL_MEUNLINK, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc;
+
+        return ret.rc;
+}
+
+int PtlTblDump(ptl_handle_ni_t ni, int index_in)
+{
+        PtlTblDump_in args;
+        PtlTblDump_out ret;
+        int rc;
+
+        args.index_in = index_in;
+
+        rc = do_forward(ni, PTL_TBLDUMP, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        return ret.rc;
+}
+
+int PtlMEDump(ptl_handle_me_t current_in)
+{
+        PtlMEDump_in args;
+        PtlMEDump_out ret;
+        int rc;
+
+        args.current_in = current_in;
+
+        rc = do_forward(current_in, PTL_MEDUMP, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc;
+
+        return ret.rc;
+}
+
+static int validate_md(ptl_handle_any_t current_in, ptl_md_t md_in)
+{
+        nal_t *nal;
+        int rc;
+        int i;
+
+        if (!ptl_init) {
+                fprintf(stderr, "PtlMDAttach/Bind/Update: Not initialized\n");
+                return PTL_NOINIT;
+        }
+
+        nal = ptl_hndl2nal(&current_in);
+        if (!nal)
+                return PTL_INV_HANDLE;
+
+        if (nal->validate != NULL)                /* nal->validate not a NOOP */
+        {
+                if ((md_in.options & PTL_MD_IOV) == 0)        /* contiguous */
+                {
+                        rc = nal->validate (nal, md_in.start, md_in.length);
+                        if (rc)
+                                return (PTL_SEGV);
+                }
+                else
+                {
+                        struct iovec *iov = (struct iovec *)md_in.start;
+
+                        for (i = 0; i < md_in.niov; i++, iov++)
+                        {
+                                rc = nal->validate (nal, iov->iov_base, iov->iov_len);
+                                if (rc)
+                                        return (PTL_SEGV);
+                        }
+                }
+        }
+
+        return 0;
+}
+
+static ptl_handle_eq_t md2eq (ptl_md_t *md)
+{
+        if (PtlHandleEqual (md->eventq, PTL_EQ_NONE))
+                return (PTL_EQ_NONE);
+        
+        return (ptl_handle2usereq (&md->eventq)->cb_eq_handle);
+}
+
+
+int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in,
+                ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out)
+{
+        PtlMDAttach_in args;
+        PtlMDAttach_out ret;
+        int rc;
+
+        rc = validate_md(me_in, md_in);
+        if (rc == PTL_OK) {
+                args.eq_in = md2eq(&md_in);
+                args.me_in = me_in;
+                args.md_in = md_in;
+                args.unlink_in = unlink_in;
+                
+                rc = do_forward(me_in, PTL_MDATTACH, 
+                                &args, sizeof(args), &ret, sizeof(ret));
+        }
+
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc;
+
+        if (handle_out) {
+                handle_out->nal_idx = me_in.nal_idx;
+                handle_out->cookie = ret.handle_out.cookie;
+        }
+        return ret.rc;
+}
+
+
+
+int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in,
+                       ptl_handle_md_t * handle_out)
+{
+        PtlMDBind_in args;
+        PtlMDBind_out ret;
+        int rc;
+
+        rc = validate_md(ni_in, md_in);
+        if (rc != PTL_OK)
+                return rc;
+
+        args.eq_in = md2eq(&md_in);
+        args.ni_in = ni_in;
+        args.md_in = md_in;
+
+        rc = do_forward(ni_in, PTL_MDBIND, 
+                        &args, sizeof(args), &ret, sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        if (handle_out) {
+                handle_out->nal_idx = ni_in.nal_idx;
+                handle_out->cookie = ret.handle_out.cookie;
+        }
+        return ret.rc;
+}
+
+int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout,
+                ptl_md_t *new_inout, ptl_handle_eq_t testq_in)
+{
+        PtlMDUpdate_internal_in args;
+        PtlMDUpdate_internal_out ret;
+        int rc;
+
+        args.md_in = md_in;
+
+        if (old_inout) {
+                args.old_inout = *old_inout;
+                args.old_inout_valid = 1;
+        } else
+                args.old_inout_valid = 0;
+
+        if (new_inout) {
+                rc = validate_md (md_in, *new_inout);
+                if (rc != PTL_OK)
+                        return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc;
+                args.new_inout = *new_inout;
+                args.new_inout_valid = 1;
+        } else
+                args.new_inout_valid = 0;
+
+        if (PtlHandleEqual (testq_in, PTL_EQ_NONE)) {
+                args.testq_in = PTL_EQ_NONE;
+                args.sequence_in = -1;
+        } else {
+                ptl_eq_t *eq = ptl_handle2usereq (&testq_in);
+                
+                args.testq_in = eq->cb_eq_handle;
+                args.sequence_in = eq->sequence;
+        }
+
+        rc = do_forward(md_in, PTL_MDUPDATE, &args, sizeof(args), &ret,
+                        sizeof(ret));
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc;
+
+        if (old_inout)
+                *old_inout = ret.old_inout;
+
+        return ret.rc;
+}
+
+int PtlMDUnlink(ptl_handle_md_t md_in)
+{
+        PtlMDUnlink_in args;
+        PtlMDUnlink_out ret;
+        int rc;
+
+        args.md_in = md_in;
+        rc = do_forward(md_in, PTL_MDUNLINK, &args, sizeof(args), &ret,
+                        sizeof(ret));
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc;
+
+        return ret.rc;
+}
+
+int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count,
+               int (*callback) (ptl_event_t * event),
+               ptl_handle_eq_t * handle_out)
+{
+        ptl_eq_t *eq = NULL;
+        ptl_event_t *ev = NULL;
+        PtlEQAlloc_in args;
+        PtlEQAlloc_out ret;
+        int rc, i;
+        nal_t *nal;
+
+        if (!ptl_init)
+                return PTL_NOINIT;
+        
+        nal = ptl_hndl2nal (&interface);
+        if (nal == NULL)
+                return PTL_INV_HANDLE;
+
+        if (count != LOWEST_BIT_SET(count)) {   /* not a power of 2 already */
+                do {                    /* knock off all but the top bit... */
+                        count &= ~LOWEST_BIT_SET (count);
+                } while (count != LOWEST_BIT_SET(count));
+
+                count <<= 1;                             /* ...and round up */
+        }
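+        /* e.g. count == 100: LOWEST_BIT_SET(100) == 4, so the loop above
+         * strips low bits (100 -> 96 -> 64) until only the top bit is
+         * left, and the final shift rounds up to 128; a count that is
+         * already a power of two skips the block entirely. */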
+
+        if (count == 0)        /* catch bad parameter / overflow on roundup */
+                return (PTL_VAL_FAILED);
+
+        PORTAL_ALLOC(ev, count * sizeof(ptl_event_t));
+        if (!ev)
+                return PTL_NOSPACE;
+
+        for (i = 0; i < count; i++)
+                ev[i].sequence = 0;
+
+        if (nal->validate != NULL) {
+                rc = nal->validate(nal, ev, count * sizeof(ptl_event_t));
+                if (rc != PTL_OK)
+                        goto fail;
+        }
+
+        args.ni_in = interface;
+        args.count_in = count;
+        args.base_in = ev;
+        args.len_in = count * sizeof(*ev);
+        args.callback_in = callback;
+
+        rc = do_forward(interface, PTL_EQALLOC, &args, sizeof(args), &ret,
+                        sizeof(ret));
+        if (rc != PTL_OK)
+                goto fail;
+        if (ret.rc)
+                GOTO(fail, rc = ret.rc);
+
+        PORTAL_ALLOC(eq, sizeof(*eq));
+        if (!eq) {
+                rc = PTL_NOSPACE;
+                goto fail;
+        }
+
+        eq->sequence = 1;
+        eq->size = count;
+        eq->base = ev;
+
+        /* EQ handles are a little weird.  PtlEQGet() just looks at the
+         * queued events in shared memory.  It doesn't want to do_forward()
+         * at all, so the cookie in the EQ handle we pass out of here is
+         * simply a pointer to the event queue we just set up.  We stash
+         * the handle returned by do_forward(), so we can pass it back via
+         * do_forward() when we need to. */
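+
+        /* Presumably ptl_handle2usereq() (defined elsewhere) just casts
+         * that cookie back to a ptl_eq_t pointer; that is how PtlEQFree()
+         * and PtlMDUpdate() above get back to the user-level queue. */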
+
+        eq->cb_eq_handle.nal_idx = interface.nal_idx;
+        eq->cb_eq_handle.cookie = ret.handle_out.cookie;
+
+        handle_out->nal_idx = interface.nal_idx;
+        handle_out->cookie = (__u64)((unsigned long)eq);
+        return PTL_OK;
+
+fail:
+        PORTAL_FREE(ev, count * sizeof(ptl_event_t));
+        return rc;
+}
+
+int PtlEQFree(ptl_handle_eq_t eventq)
+{
+        PtlEQFree_in args;
+        PtlEQFree_out ret;
+        ptl_eq_t *eq;
+        int rc;
+
+        eq = ptl_handle2usereq (&eventq);
+        args.eventq_in = eq->cb_eq_handle;
+
+        rc = do_forward(eq->cb_eq_handle, PTL_EQFREE, &args,
+                        sizeof(args), &ret, sizeof(ret));
+
+        /* XXX we're betting rc == PTL_OK here */
+        PORTAL_FREE(eq->base, eq->size * sizeof(ptl_event_t));
+        PORTAL_FREE(eq, sizeof(*eq));
+
+        return rc;
+}
+
+int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in,
+               ptl_process_id_t match_id_in, ptl_pt_index_t portal_in)
+{
+        PtlACEntry_in args;
+        PtlACEntry_out ret;
+        int rc;
+
+        /*
+         * Copy arguments into the argument block to
+         * hand to the forwarding object
+         */
+        args.ni_in = ni_in;
+        args.index_in = index_in;
+        args.match_id_in = match_id_in;
+        args.portal_in = portal_in;
+
+        rc = do_forward(ni_in, PTL_ACENTRY, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        return (rc != PTL_OK) ? rc : ret.rc;
+}
+
+int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in,
+           ptl_process_id_t target_in, ptl_pt_index_t portal_in,
+           ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in,
+           ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in)
+{
+        PtlPut_in args;
+        PtlPut_out ret;
+        int rc;
+
+        /*
+         * Copy arguments into the argument block to
+         * hand to the forwarding object
+         */
+        args.md_in = md_in;
+        args.ack_req_in = ack_req_in;
+        args.target_in = target_in;
+        args.portal_in = portal_in;
+        args.cookie_in = cookie_in;
+        args.match_bits_in = match_bits_in;
+        args.offset_in = offset_in;
+        args.hdr_data_in = hdr_data_in;
+
+        rc = do_forward(md_in, PTL_PUT, &args, sizeof(args), &ret, sizeof(ret));
+
+        return (rc != PTL_OK) ? rc : ret.rc;
+}
+
+int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in,
+           ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in,
+           ptl_match_bits_t match_bits_in, ptl_size_t offset_in)
+{
+        PtlGet_in args;
+        PtlGet_out ret;
+        int rc;
+
+        /*
+         * Copy arguments into the argument block to
+         * hand to the forwarding object
+         */
+        args.md_in = md_in;
+        args.target_in = target_in;
+        args.portal_in = portal_in;
+        args.cookie_in = cookie_in;
+        args.match_bits_in = match_bits_in;
+        args.offset_in = offset_in;
+
+        rc = do_forward(md_in, PTL_GET, &args, sizeof(args), &ret, sizeof(ret));
+
+        return (rc != PTL_OK) ? rc : ret.rc;
+}
diff --git a/lnet/lnet/lib-dispatch.c b/lnet/lnet/lib-dispatch.c
new file mode 100644 (file)
index 0000000..13036c7
--- /dev/null
@@ -0,0 +1,80 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-dispatch.c
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/lib-p30.h>
+#include <portals/lib-dispatch.h>
+
+typedef struct {
+        int (*fun) (nal_cb_t * nal, void *private, void *in, void *out);
+        char *name;
+} dispatch_table_t;
+
+static dispatch_table_t dispatch_table[] = {
+        [PTL_GETID] {do_PtlGetId, "PtlGetId"},
+        [PTL_NISTATUS] {do_PtlNIStatus, "PtlNIStatus"},
+        [PTL_NIDIST] {do_PtlNIDist, "PtlNIDist"},
+        [PTL_NIDEBUG] {do_PtlNIDebug, "PtlNIDebug"},
+        [PTL_MEATTACH] {do_PtlMEAttach, "PtlMEAttach"},
+        [PTL_MEINSERT] {do_PtlMEInsert, "PtlMEInsert"},
+        [PTL_MEUNLINK] {do_PtlMEUnlink, "PtlMEUnlink"},
+        [PTL_TBLDUMP] {do_PtlTblDump, "PtlTblDump"},
+        [PTL_MEDUMP] {do_PtlMEDump, "PtlMEDump"},
+        [PTL_MDATTACH] {do_PtlMDAttach, "PtlMDAttach"},
+        [PTL_MDBIND] {do_PtlMDBind, "PtlMDBind"},
+        [PTL_MDUPDATE] {do_PtlMDUpdate_internal, "PtlMDUpdate_internal"},
+        [PTL_MDUNLINK] {do_PtlMDUnlink, "PtlMDUnlink"},
+        [PTL_EQALLOC] {do_PtlEQAlloc_internal, "PtlEQAlloc_internal"},
+        [PTL_EQFREE] {do_PtlEQFree_internal, "PtlEQFree_internal"},
+        [PTL_PUT] {do_PtlPut, "PtlPut"},
+        [PTL_GET] {do_PtlGet, "PtlGet"},
+        [PTL_FAILNID] {do_PtlFailNid, "PtlFailNid"},
+        /*    */ {0, ""}
+};
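+
+/* The designated initialisers above index the table directly by the PTL_*
+ * command codes used by do_forward() in api-wrap.c, so a user-side
+ *      do_forward(handle, PTL_GETID, &args, sizeof(args), &ret, sizeof(ret))
+ * presumably ends up in do_PtlGetId() here once the NAL's forward() method
+ * hands the request to lib_dispatch() with the same index. */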
+
+/*
+ * This really should be elsewhere, but lib-p30/dispatch.c is
+ * an automatically generated file.
+ */
+void lib_dispatch(nal_cb_t * nal, void *private, int index, void *arg_block,
+                  void *ret_block)
+{
+        lib_ni_t *ni = &nal->ni;
+
+        if (index < 0 || index > LIB_MAX_DISPATCH ||
+            !dispatch_table[index].fun) {
+                CDEBUG(D_NET, LPU64": Invalid API call %d\n", ni->nid, index);
+                return;
+        }
+
+        CDEBUG(D_NET, LPU64": API call %s (%d)\n", ni->nid,
+               dispatch_table[index].name, index);
+
+        dispatch_table[index].fun(nal, private, arg_block, ret_block);
+}
+
+char *dispatch_name(int index)
+{
+        return dispatch_table[index].name;
+}
diff --git a/lnet/lnet/lib-eq.c b/lnet/lnet/lib-eq.c
new file mode 100644 (file)
index 0000000..ce343c1
--- /dev/null
@@ -0,0 +1,128 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-eq.c
+ * Library level Event queue management routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *v_args,
+                           void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_ni_t ni_in
+         *      ptl_size_t count_in
+         *      void                    * base_in
+         *
+         * Outgoing:
+         *      ptl_handle_eq_t         * handle_out
+         */
+
+        PtlEQAlloc_in *args = v_args;
+        PtlEQAlloc_out *ret = v_ret;
+
+        lib_eq_t *eq;
+        unsigned long flags;
+
+        /* api should have rounded up */
+        if (args->count_in != LOWEST_BIT_SET (args->count_in))
+                return ret->rc = PTL_VAL_FAILED;
+
+        eq = lib_eq_alloc (nal);
+        if (eq == NULL)
+                return (ret->rc = PTL_NOSPACE);
+
+        state_lock(nal, &flags);
+
+        if (nal->cb_map != NULL) {
+                struct iovec iov = {
+                        .iov_base = args->base_in,
+                        .iov_len = args->count_in * sizeof (ptl_event_t) };
+
+                ret->rc = nal->cb_map (nal, 1, &iov, &eq->eq_addrkey);
+                if (ret->rc != PTL_OK) {
+                        lib_eq_free (nal, eq);
+                        
+                        state_unlock (nal, &flags);
+                        return (ret->rc);
+                }
+        }
+
+        eq->sequence = 1;
+        eq->base = args->base_in;
+        eq->size = args->count_in;
+        eq->eq_refcount = 0;
+        eq->event_callback = args->callback_in;
+
+        lib_initialise_handle (nal, &eq->eq_lh, PTL_COOKIE_TYPE_EQ);
+        list_add (&eq->eq_list, &nal->ni.ni_active_eqs);
+
+        state_unlock(nal, &flags);
+
+        ptl_eq2handle(&ret->handle_out, eq);
+        return (ret->rc = PTL_OK);
+}
+
+int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *v_args,
+                          void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_eq_t eventq_in
+         *
+         * Outgoing:
+         */
+
+        PtlEQFree_in *args = v_args;
+        PtlEQFree_out *ret = v_ret;
+        lib_eq_t *eq;
+        long flags;
+
+        state_lock (nal, &flags);
+
+        eq = ptl_handle2eq(&args->eventq_in, nal);
+        if (eq == NULL) {
+                ret->rc = PTL_INV_EQ;
+        } else if (eq->eq_refcount != 0) {
+                ret->rc = PTL_EQ_INUSE;
+        } else {
+                if (nal->cb_unmap != NULL) {
+                        struct iovec iov = {
+                                .iov_base = eq->base,
+                                .iov_len = eq->size * sizeof (ptl_event_t) };
+                        
+                        nal->cb_unmap(nal, 1, &iov, &eq->eq_addrkey);
+                }
+
+                lib_invalidate_handle (nal, &eq->eq_lh);
+                list_del (&eq->eq_list);
+                lib_eq_free (nal, eq);
+                ret->rc = PTL_OK;
+        }
+
+        state_unlock (nal, &flags);
+
+        return (ret->rc);
+}
diff --git a/lnet/lnet/lib-init.c b/lnet/lnet/lib-init.c
new file mode 100644 (file)
index 0000000..99c4d32
--- /dev/null
@@ -0,0 +1,474 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-init.c
+ * Start up the internal library and clear all structures
+ * Called by the NAL when it initializes.  Safe to call multiple times.
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/lib-p30.h>
+
+#ifdef __KERNEL__
+# include <linux/string.h>      /* for memset() */
+# include <linux/kp30.h>
+# ifdef KERNEL_ADDR_CACHE
+#  include <compute/OS/addrCache/cache.h>
+# endif
+#else
+# include <string.h>
+# include <sys/time.h>
+#endif
+
+#ifdef PTL_USE_SLAB_CACHE
+static int ptl_slab_users;
+
+kmem_cache_t *ptl_md_slab;
+kmem_cache_t *ptl_msg_slab;
+kmem_cache_t *ptl_me_slab;
+kmem_cache_t *ptl_eq_slab;
+
+atomic_t md_in_use_count;
+atomic_t msg_in_use_count;
+atomic_t me_in_use_count;
+atomic_t eq_in_use_count;
+
+/* NB zeroing in the ctor and on freeing ensures that items which pass
+ * kmem_cache_validate() but haven't yet been initialised as an MD/ME/EQ
+ * can't carry valid handles.
+ */
+static void
+ptl_md_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags)
+{
+        memset (obj, 0, sizeof (lib_md_t));
+}
+
+static void
+ptl_me_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags)
+{
+        memset (obj, 0, sizeof (lib_me_t));
+}
+
+static void
+ptl_eq_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags)
+{
+        memset (obj, 0, sizeof (lib_eq_t));
+}
+
+int
+kportal_descriptor_setup (nal_cb_t *nal)
+{
+        /* NB on failure caller must still call kportal_descriptor_cleanup */
+        /*               ******                                            */
+
+        /* We'll have 1 set of slabs for ALL the nals :) */
+
+        if (ptl_slab_users++)
+                return 0;
+
+        ptl_md_slab = kmem_cache_create("portals_MD",
+                                        sizeof(lib_md_t), 0,
+                                        SLAB_HWCACHE_ALIGN,
+                                        ptl_md_slab_ctor, NULL);
+        if (!ptl_md_slab) {
+                CERROR("couldn't allocate ptl_md_t slab");
+                RETURN (PTL_NOSPACE);
+        }
+
+        /* NB no ctor for msgs; they don't need handle verification */
+        ptl_msg_slab = kmem_cache_create("portals_MSG",
+                                         sizeof(lib_msg_t), 0,
+                                         SLAB_HWCACHE_ALIGN,
+                                         NULL, NULL);
+        if (!ptl_msg_slab) {
+                CERROR("couldn't allocate ptl_msg_t slab");
+                RETURN (PTL_NOSPACE);
+        }
+
+        ptl_me_slab = kmem_cache_create("portals_ME",
+                                        sizeof(lib_me_t), 0,
+                                        SLAB_HWCACHE_ALIGN,
+                                        ptl_me_slab_ctor, NULL);
+        if (!ptl_me_slab) {
+                CERROR("couldn't allocate ptl_me_t slab");
+                RETURN (PTL_NOSPACE);
+        }
+
+        ptl_eq_slab = kmem_cache_create("portals_EQ",
+                                        sizeof(lib_eq_t), 0,
+                                        SLAB_HWCACHE_ALIGN,
+                                        ptl_eq_slab_ctor, NULL);
+        if (!ptl_eq_slab) {
+                CERROR("couldn't allocate ptl_eq_t slab");
+                RETURN (PTL_NOSPACE);
+        }
+
+        RETURN(PTL_OK);
+}
+
+void
+kportal_descriptor_cleanup (nal_cb_t *nal)
+{
+        if (--ptl_slab_users != 0)
+                return;
+
+        LASSERT (atomic_read (&md_in_use_count) == 0);
+        LASSERT (atomic_read (&me_in_use_count) == 0);
+        LASSERT (atomic_read (&eq_in_use_count) == 0);
+        LASSERT (atomic_read (&msg_in_use_count) == 0);
+
+        if (ptl_md_slab != NULL)
+                kmem_cache_destroy(ptl_md_slab);
+        if (ptl_msg_slab != NULL)
+                kmem_cache_destroy(ptl_msg_slab);
+        if (ptl_me_slab != NULL)
+                kmem_cache_destroy(ptl_me_slab);
+        if (ptl_eq_slab != NULL)
+                kmem_cache_destroy(ptl_eq_slab);
+}
+#else
+
+int
+lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size)
+{
+        char *space;
+
+        LASSERT (n > 0);
+
+        size += offsetof (lib_freeobj_t, fo_contents);
+
+        space = nal->cb_malloc (nal, n * size);
+        if (space == NULL)
+                return (PTL_NOSPACE);
+
+        INIT_LIST_HEAD (&fl->fl_list);
+        fl->fl_objs = space;
+        fl->fl_nobjs = n;
+        fl->fl_objsize = size;
+
+        do
+        {
+                memset (space, 0, size);
+                list_add ((struct list_head *)space, &fl->fl_list);
+                space += size;
+        } while (--n != 0);
+
+        return (PTL_OK);
+}
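+
+/* Each object in the block begins with a struct list_head (its payload
+ * lives at offsetof(lib_freeobj_t, fo_contents)), so the free list is
+ * threaded through the objects themselves; the freelist allocator used by
+ * lib_md_alloc() and friends presumably just pops from and pushes back
+ * onto fl_list. */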
+
+void
+lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl)
+{
+        struct list_head *el;
+        int               count;
+
+        if (fl->fl_nobjs == 0)
+                return;
+
+        count = 0;
+        for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next)
+                count++;
+
+        LASSERT (count == fl->fl_nobjs);
+
+        nal->cb_free (nal, fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
+        memset (fl, 0, sizeof (*fl));
+}
+
+int
+kportal_descriptor_setup (nal_cb_t *nal)
+{
+        /* NB on failure caller must still call kportal_descriptor_cleanup */
+        /*               ******                                            */
+        int rc;
+
+        memset (&nal->ni.ni_free_mes,  0, sizeof (nal->ni.ni_free_mes));
+        memset (&nal->ni.ni_free_msgs, 0, sizeof (nal->ni.ni_free_msgs));
+        memset (&nal->ni.ni_free_mds,  0, sizeof (nal->ni.ni_free_mds));
+        memset (&nal->ni.ni_free_eqs,  0, sizeof (nal->ni.ni_free_eqs));
+
+        rc = lib_freelist_init (nal, &nal->ni.ni_free_mes,
+                                MAX_MES, sizeof (lib_me_t));
+        if (rc != PTL_OK)
+                return (rc);
+
+        rc = lib_freelist_init (nal, &nal->ni.ni_free_msgs,
+                                MAX_MSGS, sizeof (lib_msg_t));
+        if (rc != PTL_OK)
+                return (rc);
+
+        rc = lib_freelist_init (nal, &nal->ni.ni_free_mds,
+                                MAX_MDS, sizeof (lib_md_t));
+        if (rc != PTL_OK)
+                return (rc);
+
+        rc = lib_freelist_init (nal, &nal->ni.ni_free_eqs,
+                                MAX_EQS, sizeof (lib_eq_t));
+        return (rc);
+}
+
+void
+kportal_descriptor_cleanup (nal_cb_t *nal)
+{
+        lib_freelist_fini (nal, &nal->ni.ni_free_mes);
+        lib_freelist_fini (nal, &nal->ni.ni_free_msgs);
+        lib_freelist_fini (nal, &nal->ni.ni_free_mds);
+        lib_freelist_fini (nal, &nal->ni.ni_free_eqs);
+}
+
+#endif
+
+__u64
+lib_create_interface_cookie (nal_cb_t *nal)
+{
+        /* NB the interface cookie in wire handles guards against delayed
+         * replies and ACKs appearing valid in a new instance of the same
+         * interface.  Initialisation time, even if it's only implemented
+         * to millisecond resolution, is probably easily good enough. */
+        struct timeval tv;
+        __u64          cookie;
+#ifndef __KERNEL__
+        int            rc = gettimeofday (&tv, NULL);
+        LASSERT (rc == 0);
+#else
+        do_gettimeofday(&tv);
+#endif
+        cookie = tv.tv_sec;
+        cookie *= 1000000;
+        cookie += tv.tv_usec;
+        return (cookie);
+}
+
+int
+lib_setup_handle_hash (nal_cb_t *nal) 
+{
+        lib_ni_t *ni = &nal->ni;
+        int       i;
+        
+        /* Arbitrary choice of hash table size */
+#ifdef __KERNEL__
+        ni->ni_lh_hash_size = PAGE_SIZE / sizeof (struct list_head);
+#else
+        ni->ni_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4;
+#endif
+        ni->ni_lh_hash_table = 
+                (struct list_head *)nal->cb_malloc (nal, ni->ni_lh_hash_size
+                                                    * sizeof (struct list_head));
+        if (ni->ni_lh_hash_table == NULL)
+                return (PTL_NOSPACE);
+        
+        for (i = 0; i < ni->ni_lh_hash_size; i++)
+                INIT_LIST_HEAD (&ni->ni_lh_hash_table[i]);
+
+        ni->ni_next_object_cookie = PTL_COOKIE_TYPES;
+        
+        return (PTL_OK);
+}
+
+void
+lib_cleanup_handle_hash (nal_cb_t *nal)
+{
+        lib_ni_t *ni = &nal->ni;
+
+        if (ni->ni_lh_hash_table == NULL)
+                return;
+        
+        nal->cb_free (nal, ni->ni_lh_hash_table,
+                      ni->ni_lh_hash_size * sizeof (struct list_head));
+}
+
+lib_handle_t *
+lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type) 
+{
+        /* ALWAYS called with statelock held */
+        lib_ni_t            *ni = &nal->ni;
+        struct list_head    *list;
+        struct list_head    *el;
+        unsigned int         hash;
+
+        if ((cookie & (PTL_COOKIE_TYPES - 1)) != type)
+                return (NULL);
+        
+        hash = ((unsigned int)cookie) % ni->ni_lh_hash_size;
+        list = &ni->ni_lh_hash_table[hash];
+        
+        list_for_each (el, list) {
+                lib_handle_t *lh = list_entry (el, lib_handle_t, lh_hash_chain);
+                
+                if (lh->lh_cookie == cookie)
+                        return (lh);
+        }
+        
+        return (NULL);
+}
+
+void
+lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type) 
+{
+        /* ALWAYS called with statelock held */
+        lib_ni_t       *ni = &nal->ni;
+        unsigned int    hash;
+
+        LASSERT (type >= 0 && type < PTL_COOKIE_TYPES);
+        lh->lh_cookie = ni->ni_next_object_cookie | type;
+        ni->ni_next_object_cookie += PTL_COOKIE_TYPES;
+        
+        hash = ((unsigned int)lh->lh_cookie) % ni->ni_lh_hash_size;
+        list_add (&lh->lh_hash_chain, &ni->ni_lh_hash_table[hash]);
+}
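+
+/* Cookies are handed out in strides of PTL_COOKIE_TYPES with the object
+ * type ORed into the low bits (which presumes PTL_COOKIE_TYPES is a power
+ * of two no smaller than the number of types), so lib_lookup_cookie()
+ * above can cheaply reject a cookie whose type bits don't match, e.g. an
+ * MD handle passed where an EQ handle was expected. */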
+
+void
+lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh)
+{
+        list_del (&lh->lh_hash_chain);
+}
+
+int
+lib_init(nal_cb_t * nal, ptl_nid_t nid, ptl_pid_t pid, int gsize,
+         ptl_pt_index_t ptl_size, ptl_ac_index_t acl_size)
+{
+        int       rc = PTL_OK;
+        lib_ni_t *ni = &nal->ni;
+        int i;
+        ENTRY;
+
+        /* NB serialised in PtlNIInit() */
+
+        if (ni->refcnt != 0) {                       /* already initialised */
+                ni->refcnt++;
+                goto out;
+        }
+
+        lib_assert_wire_constants ();
+        
+        /*
+         * Allocate the portal table for this interface
+         * and all per-interface objects.
+         */
+        memset(&ni->counters, 0, sizeof(lib_counters_t));
+
+        rc = kportal_descriptor_setup (nal);
+        if (rc != PTL_OK)
+                goto out;
+
+        INIT_LIST_HEAD (&ni->ni_active_msgs);
+        INIT_LIST_HEAD (&ni->ni_active_mds);
+        INIT_LIST_HEAD (&ni->ni_active_eqs);
+
+        INIT_LIST_HEAD (&ni->ni_test_peers);
+
+        ni->ni_interface_cookie = lib_create_interface_cookie (nal);
+        ni->ni_next_object_cookie = 0;
+        rc = lib_setup_handle_hash (nal);
+        if (rc != PTL_OK)
+                goto out;
+        
+        ni->nid = nid;
+        ni->pid = pid;
+
+        ni->num_nodes = gsize;
+        ni->tbl.size = ptl_size;
+
+        ni->tbl.tbl = nal->cb_malloc(nal, sizeof(struct list_head) * ptl_size);
+        if (ni->tbl.tbl == NULL) {
+                rc = PTL_NOSPACE;
+                goto out;
+        }
+
+        for (i = 0; i < ptl_size; i++)
+                INIT_LIST_HEAD(&(ni->tbl.tbl[i]));
+
+        ni->debug = PTL_DEBUG_NONE;
+        ni->up = 1;
+        ni->refcnt++;
+
+ out:
+        if (rc != PTL_OK) {
+                lib_cleanup_handle_hash (nal);
+                kportal_descriptor_cleanup (nal);
+        }
+
+        RETURN (rc);
+}
+
+int
+lib_fini(nal_cb_t * nal)
+{
+        lib_ni_t *ni = &nal->ni;
+        int       idx;
+
+        ni->refcnt--;
+
+        if (ni->refcnt != 0)
+                goto out;
+
+        /* NB no stat_lock() since this is the last reference.  The NAL
+         * should have shut down already, so it should be safe to unlink
+         * and free all descriptors, even those that appear committed to a
+         * network op (eg MD with non-zero pending count)
+         */
+
+        for (idx = 0; idx < ni->tbl.size; idx++)
+                while (!list_empty (&ni->tbl.tbl[idx])) {
+                        lib_me_t *me = list_entry (ni->tbl.tbl[idx].next,
+                                                   lib_me_t, me_list);
+
+                        CERROR ("Active me %p on exit\n", me);
+                        list_del (&me->me_list);
+                        lib_me_free (nal, me);
+                }
+
+        while (!list_empty (&ni->ni_active_mds)) {
+                lib_md_t *md = list_entry (ni->ni_active_mds.next,
+                                           lib_md_t, md_list);
+
+                CERROR ("Active md %p on exit\n", md);
+                list_del (&md->md_list);
+                lib_md_free (nal, md);
+        }
+
+        while (!list_empty (&ni->ni_active_eqs)) {
+                lib_eq_t *eq = list_entry (ni->ni_active_eqs.next,
+                                           lib_eq_t, eq_list);
+
+                CERROR ("Active eq %p on exit\n", eq);
+                list_del (&eq->eq_list);
+                lib_eq_free (nal, eq);
+        }
+
+        while (!list_empty (&ni->ni_active_msgs)) {
+                lib_msg_t *msg = list_entry (ni->ni_active_msgs.next,
+                                             lib_msg_t, msg_list);
+
+                CERROR ("Active msg %p on exit\n", msg);
+                list_del (&msg->msg_list);
+                lib_msg_free (nal, msg);
+        }
+
+        nal->cb_free(nal, ni->tbl.tbl, sizeof(struct list_head) * ni->tbl.size);
+        ni->up = 0;
+
+        lib_cleanup_handle_hash (nal);
+        kportal_descriptor_cleanup (nal);
+
+ out:
+        return (PTL_OK);
+}
diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c
new file mode 100644 (file)
index 0000000..a79e2be
--- /dev/null
@@ -0,0 +1,412 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-md.c
+ * Memory Descriptor management routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __KERNEL__
+# include <stdio.h>
+#else
+# define DEBUG_SUBSYSTEM S_PORTALS
+# include <linux/kp30.h>
+#endif
+
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+/*
+ * must be called with state lock held
+ */
+void lib_md_unlink(nal_cb_t * nal, lib_md_t * md)
+{
+        lib_me_t *me = md->me;
+
+        if (md->pending != 0) {
+                CDEBUG(D_NET, "Queueing unlink of md %p\n", md);
+                md->md_flags |= PTL_MD_FLAG_UNLINK;
+                return;
+        }
+
+        CDEBUG(D_NET, "Unlinking md %p\n", md);
+
+        if ((md->options & PTL_MD_KIOV) != 0) {
+                if (nal->cb_unmap_pages != NULL)
+                        nal->cb_unmap_pages (nal, md->md_niov, md->md_iov.kiov, 
+                                             &md->md_addrkey);
+        } else if (nal->cb_unmap != NULL)
+                nal->cb_unmap (nal, md->md_niov, md->md_iov.iov, 
+                               &md->md_addrkey);
+
+        if (me) {
+                me->md = NULL;
+                if (me->unlink == PTL_UNLINK)
+                        lib_me_unlink(nal, me);
+        }
+
+        if (md->eq != NULL)
+        {
+                md->eq->eq_refcount--;
+                LASSERT (md->eq->eq_refcount >= 0);
+        }
+
+        lib_invalidate_handle (nal, &md->md_lh);
+        list_del (&md->md_list);
+        lib_md_free(nal, md);
+}
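+
+/* e.g. an MD with pending != 0 (a network op still in flight) only gets
+ * PTL_MD_FLAG_UNLINK set above; presumably the message completion path
+ * calls lib_md_unlink() again once pending drops to zero, at which point
+ * the unmap / handle-invalidate / free path actually runs. */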
+
+/* must be called with state lock held */
+static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private,
+                        ptl_md_t *md, ptl_handle_eq_t *eqh, int unlink)
+{
+        const int     max_size_opts = PTL_MD_AUTO_UNLINK |
+                                      PTL_MD_MAX_SIZE;
+        lib_eq_t     *eq = NULL;
+        int           rc;
+        int           i;
+
+        /* NB we are passed an allocated, but uninitialised, inactive md.
+         * If we return success, the caller may lib_md_unlink() it;
+         * otherwise the caller may only lib_md_free() it.
+         */
+
+        if (!PtlHandleEqual (*eqh, PTL_EQ_NONE)) {
+                eq = ptl_handle2eq(eqh, nal);
+                if (eq == NULL)
+                        return PTL_INV_EQ;
+        }
+
+        if ((md->options & PTL_MD_IOV) != 0 &&  /* discontiguous MD */
+            md->niov > PTL_MD_MAX_IOV)          /* too many fragments */
+                return PTL_IOV_TOO_MANY;
+
+        if ((md->options & max_size_opts) != 0 && /* max size used */
+            (md->max_size < 0 || md->max_size > md->length)) /* illegal max_size */
+                return PTL_INV_MD;
+
+        new->me = NULL;
+        new->start = md->start;
+        new->length = md->length;
+        new->offset = 0;
+        new->max_size = md->max_size;
+        new->unlink = unlink;
+        new->options = md->options;
+        new->user_ptr = md->user_ptr;
+        new->eq = eq;
+        new->threshold = md->threshold;
+        new->pending = 0;
+        new->md_flags = 0;
+
+        if ((md->options & PTL_MD_IOV) != 0) {
+                int total_length = 0;
+
+                if ((md->options & PTL_MD_KIOV) != 0) /* Can't specify both */
+                        return PTL_INV_MD; 
+
+                new->md_niov = md->niov;
+                
+                if (nal->cb_read (nal, private, new->md_iov.iov, md->start,
+                                  md->niov * sizeof (new->md_iov.iov[0])))
+                        return PTL_SEGV;
+
+                for (i = 0; i < new->md_niov; i++) {
+                        /* We take the base address on trust */
+                        if (new->md_iov.iov[i].iov_len <= 0) /* invalid length */
+                                return PTL_VAL_FAILED;
+
+                        total_length += new->md_iov.iov[i].iov_len;
+                }
+
+                if (md->length > total_length)
+                        return PTL_IOV_TOO_SMALL;
+                
+                if (nal->cb_map != NULL) {
+                        rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, 
+                                          &new->md_addrkey);
+                        if (rc != PTL_OK)
+                                return (rc);
+                }
+        } else if ((md->options & PTL_MD_KIOV) != 0) {
+#ifndef __KERNEL__
+                return PTL_INV_MD;
+#else
+                int total_length = 0;
+                
+                /* Trap attempts to use paged I/O early if it's unsupported. */
+                if (nal->cb_send_pages == NULL ||
+                    nal->cb_recv_pages == NULL)
+                        return PTL_INV_MD;
+
+                new->md_niov = md->niov;
+
+                if (nal->cb_read (nal, private, new->md_iov.kiov, md->start,
+                                  md->niov * sizeof (new->md_iov.kiov[0])))
+                        return PTL_SEGV;
+                
+                for (i = 0; i < new->md_niov; i++) {
+                        /* We take the page pointer on trust */
+                        if (new->md_iov.kiov[i].kiov_offset + 
+                            new->md_iov.kiov[i].kiov_len > PAGE_SIZE )
+                                return PTL_VAL_FAILED; /* invalid length */
+
+                        total_length += new->md_iov.kiov[i].kiov_len;
+                }
+
+                if (md->length > total_length)
+                        return PTL_IOV_TOO_SMALL;
+
+                if (nal->cb_map_pages != NULL) {
+                        rc = nal->cb_map_pages (nal, new->md_niov, new->md_iov.kiov, 
+                                                &new->md_addrkey);
+                        if (rc != PTL_OK)
+                                return (rc);
+                }
+#endif
+        } else {   /* contiguous */
+                new->md_niov = 1;
+                new->md_iov.iov[0].iov_base = md->start;
+                new->md_iov.iov[0].iov_len = md->length;
+
+                if (nal->cb_map != NULL) {
+                        rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, 
+                                          &new->md_addrkey);
+                        if (rc != PTL_OK)
+                                return (rc);
+                }
+        } 
+
+        if (eq != NULL)
+                eq->eq_refcount++;
+
+        /* It's good; let handle2md succeed and add to active mds */
+        lib_initialise_handle (nal, &new->md_lh, PTL_COOKIE_TYPE_MD);
+        list_add (&new->md_list, &nal->ni.ni_active_mds);
+
+        return PTL_OK;
+}
+
+/* must be called with state lock held */
+void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md, ptl_md_t * new)
+{
+        /* NB this doesn't copy out all the iov entries so when a
+         * discontiguous MD is copied out, the target gets to know the
+         * original iov pointer (in start) and the number of entries it had
+         * and that's all.
+         */
+        new->start = md->start;
+        new->length = md->length;
+        new->threshold = md->threshold;
+        new->max_size = md->max_size;
+        new->options = md->options;
+        new->user_ptr = md->user_ptr;
+        ptl_eq2handle(&new->eventq, md->eq);
+        new->niov = ((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0) ? 0 : md->md_niov;
+}
+
+int do_PtlMDAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_me_t current_in
+         *      ptl_md_t md_in
+         *      ptl_unlink_t unlink_in
+         *
+         * Outgoing:
+         *      ptl_handle_md_t         * handle_out
+         */
+
+        PtlMDAttach_in *args = v_args;
+        PtlMDAttach_out *ret = v_ret;
+        lib_me_t *me;
+        lib_md_t *md;
+        unsigned long flags;
+
+        md = lib_md_alloc (nal);
+        if (md == NULL)
+                return (ret->rc = PTL_NOSPACE);
+
+        state_lock(nal, &flags);
+
+        me = ptl_handle2me(&args->me_in, nal);
+        if (me == NULL) {
+                ret->rc = PTL_INV_ME;
+        } else if (me->md != NULL) {
+                ret->rc = PTL_INUSE;
+        } else {
+                ret->rc = lib_md_build(nal, md, private, &args->md_in,
+                                       &args->eq_in, args->unlink_in);
+
+                if (ret->rc == PTL_OK) {
+                        me->md = md;
+                        md->me = me;
+
+                        ptl_md2handle(&ret->handle_out, md);
+
+                        state_unlock (nal, &flags);
+                        return (PTL_OK);
+                }
+        }
+
+        lib_md_free (nal, md);
+
+        state_unlock (nal, &flags);
+        return (ret->rc);
+}
+
+int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_ni_t ni_in
+         *      ptl_md_t md_in
+         *
+         * Outgoing:
+         *      ptl_handle_md_t         * handle_out
+         */
+
+        PtlMDBind_in *args = v_args;
+        PtlMDBind_out *ret = v_ret;
+        lib_md_t *md;
+        unsigned long flags;
+
+        md = lib_md_alloc (nal);
+        if (md == NULL)
+                return (ret->rc = PTL_NOSPACE);
+
+        state_lock(nal, &flags);
+
+        ret->rc = lib_md_build(nal, md, private,
+                               &args->md_in, &args->eq_in, PTL_UNLINK);
+
+        if (ret->rc == PTL_OK) {
+                ptl_md2handle(&ret->handle_out, md);
+
+                state_unlock(nal, &flags);
+                return (PTL_OK);
+        }
+
+        lib_md_free (nal, md);
+
+        state_unlock(nal, &flags);
+        return (ret->rc);
+}
+
+int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMDUnlink_in *args = v_args;
+        PtlMDUnlink_out *ret = v_ret;
+
+        lib_md_t *md;
+        unsigned long flags;
+
+        state_lock(nal, &flags);
+
+        md = ptl_handle2md(&args->md_in, nal);
+        if (md == NULL) {
+                ret->rc = PTL_INV_MD;
+        } else if (md->pending != 0) {           /* being filled/spilled */
+                ret->rc = PTL_MD_INUSE;
+        } else {
+                /* Callers attempting to unlink a busy MD which will get
+                 * unlinked once the net op completes should see INUSE
+                 * before completion and INV_MD thereafter.  LASSERT we've
+                 * got that right... */
+                LASSERT ((md->md_flags & PTL_MD_FLAG_UNLINK) == 0);
+
+                lib_md_deconstruct(nal, md, &ret->status_out);
+                lib_md_unlink(nal, md);
+                ret->rc = PTL_OK;
+        }
+
+        state_unlock(nal, &flags);
+
+        return (ret->rc);
+}
+
+int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args,
+                            void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_md_t md_in
+         *      ptl_md_t                * old_inout
+         *      ptl_md_t                * new_inout
+         *      ptl_handle_eq_t testq_in
+         *      ptl_seq_t               sequence_in
+         *
+         * Outgoing:
+         *      ptl_md_t                * old_inout
+         *      ptl_md_t                * new_inout
+         */
+        PtlMDUpdate_internal_in *args = v_args;
+        PtlMDUpdate_internal_out *ret = v_ret;
+        lib_md_t *md;
+        lib_eq_t *test_eq = NULL;
+        ptl_md_t *new = &args->new_inout;
+        unsigned long flags;
+
+        state_lock(nal, &flags);
+
+        md = ptl_handle2md(&args->md_in, nal);
+        if (md == NULL) {
+                 ret->rc = PTL_INV_MD;
+                 goto out;
+        }
+
+        if (args->old_inout_valid)
+                lib_md_deconstruct(nal, md, &ret->old_inout);
+
+        if (!args->new_inout_valid) {
+                ret->rc = PTL_OK;
+                goto out;
+        }
+
+        if (!PtlHandleEqual (args->testq_in, PTL_EQ_NONE)) {
+                test_eq = ptl_handle2eq(&args->testq_in, nal);
+                if (test_eq == NULL) {
+                        ret->rc = PTL_INV_EQ;
+                        goto out;
+                }
+        }
+
+        if (md->pending != 0) {
+                        ret->rc = PTL_NOUPDATE;
+                        goto out;
+        }
+
+        if (test_eq == NULL ||
+            test_eq->sequence == args->sequence_in) {
+                lib_me_t *me = md->me;
+
+#warning this does not track eq refcounts properly
+
+                ret->rc = lib_md_build(nal, md, private,
+                                       new, &new->eventq, md->unlink);
+
+                md->me = me;
+        } else {
+                ret->rc = PTL_NOUPDATE;
+        }
+
+ out:
+        state_unlock(nal, &flags);
+        return (ret->rc);
+}
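
As a minimal userspace sketch of the allocation discipline the MD handlers above follow (allocate before taking the state lock, validate and link under the lock, free on any failure path after dropping it); obj_alloc, obj_free, table_lock and attach_pattern are hypothetical stand-ins for the Portals internals, not part of this patch:

#include <assert.h>
#include <pthread.h>
#include <stdlib.h>

struct obj { int linked; };

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* hypothetical stand-ins for lib_md_alloc()/lib_md_free() */
static struct obj *obj_alloc(void) { return calloc(1, sizeof(struct obj)); }
static void obj_free(struct obj *o) { free(o); }

static int attach_pattern(int target_is_valid)
{
        struct obj *o = obj_alloc();            /* allocate outside the lock */

        if (o == NULL)
                return -1;                      /* cf. PTL_NOSPACE above */

        pthread_mutex_lock(&table_lock);
        if (target_is_valid) {                  /* cf. ptl_handle2me() != NULL */
                o->linked = 1;                  /* cf. me->md = md; md->me = me */
                pthread_mutex_unlock(&table_lock);
                return 0;                       /* cf. PTL_OK */
        }
        pthread_mutex_unlock(&table_lock);
        obj_free(o);                            /* failure: give the object back */
        return -1;
}

int main(void)
{
        assert(attach_pattern(1) == 0);
        assert(attach_pattern(0) == -1);
        return 0;
}
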
diff --git a/lnet/lnet/lib-me.c b/lnet/lnet/lib-me.c
new file mode 100644 (file)
index 0000000..bd1af5b
--- /dev/null
@@ -0,0 +1,227 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-me.c
+ * Match Entry management routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __KERNEL__
+# include <stdio.h>
+#else
+# define DEBUG_SUBSYSTEM S_PORTALS
+# include <linux/kp30.h>
+#endif
+
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+static void lib_me_dump(nal_cb_t * nal, lib_me_t * me);
+
+int do_PtlMEAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMEAttach_in *args = v_args;
+        PtlMEAttach_out *ret = v_ret;
+        lib_ni_t *ni = &nal->ni;
+        lib_ptl_t *tbl = &ni->tbl;
+        unsigned long flags;
+        lib_me_t *me;
+
+        if (args->index_in < 0 || args->index_in >= tbl->size)
+                return ret->rc = PTL_INV_PTINDEX;
+
+        /* Should check for valid matchid, but not yet */
+        if (0)
+                return ret->rc = PTL_INV_PROC;
+
+        me = lib_me_alloc (nal);
+        if (me == NULL)
+                return (ret->rc = PTL_NOSPACE);
+
+        state_lock(nal, &flags);
+
+        me->match_id = args->match_id_in;
+        me->match_bits = args->match_bits_in;
+        me->ignore_bits = args->ignore_bits_in;
+        me->unlink = args->unlink_in;
+        me->md = NULL;
+
+        lib_initialise_handle (nal, &me->me_lh, PTL_COOKIE_TYPE_ME);
+
+        if (args->position_in == PTL_INS_AFTER)
+                list_add_tail(&me->me_list, &(tbl->tbl[args->index_in]));
+        else
+                list_add(&me->me_list, &(tbl->tbl[args->index_in]));
+
+        ptl_me2handle(&ret->handle_out, me);
+
+        state_unlock(nal, &flags);
+
+        return ret->rc = PTL_OK;
+}
+
+int do_PtlMEInsert(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMEInsert_in *args = v_args;
+        PtlMEInsert_out *ret = v_ret;
+        unsigned long flags;
+        lib_me_t *me;
+        lib_me_t *new;
+
+        new = lib_me_alloc (nal);
+        if (new == NULL)
+                return (ret->rc = PTL_NOSPACE);
+
+        /* Should check for valid matchid, but not yet */
+
+        state_lock(nal, &flags);
+
+        me = ptl_handle2me(&args->current_in, nal);
+        if (me == NULL) {
+                lib_me_free (nal, new);
+
+                state_unlock (nal, &flags);
+                return (ret->rc = PTL_INV_ME);
+        }
+
+        new->match_id = args->match_id_in;
+        new->match_bits = args->match_bits_in;
+        new->ignore_bits = args->ignore_bits_in;
+        new->unlink = args->unlink_in;
+        new->md = NULL;
+
+        lib_initialise_handle (nal, &new->me_lh, PTL_COOKIE_TYPE_ME);
+
+        if (args->position_in == PTL_INS_AFTER)
+                list_add_tail(&new->me_list, &me->me_list);
+        else
+                list_add(&new->me_list, &me->me_list);
+
+        ptl_me2handle(&ret->handle_out, new);
+
+        state_unlock(nal, &flags);
+
+        return ret->rc = PTL_OK;
+}
+
+int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMEUnlink_in *args = v_args;
+        PtlMEUnlink_out *ret = v_ret;
+        unsigned long flags;
+        lib_me_t *me;
+
+        state_lock(nal, &flags);
+
+        me = ptl_handle2me(&args->current_in, nal);
+        if (me == NULL) {
+                ret->rc = PTL_INV_ME;
+        } else {
+                lib_me_unlink(nal, me);
+                ret->rc = PTL_OK;
+        }
+
+        state_unlock(nal, &flags);
+
+        return (ret->rc);
+}
+
+/* call with state_lock please */
+void lib_me_unlink(nal_cb_t *nal, lib_me_t *me)
+{
+        lib_ni_t *ni = &nal->ni;
+
+        if (ni->debug & PTL_DEBUG_UNLINK) {
+                ptl_handle_any_t handle;
+                ptl_me2handle(&handle, me);
+        }
+
+        list_del (&me->me_list);
+
+        if (me->md) {
+                me->md->me = NULL;
+                lib_md_unlink(nal, me->md);
+        }
+
+        lib_invalidate_handle (nal, &me->me_lh);
+        lib_me_free(nal, me);
+}
+
+int do_PtlTblDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlTblDump_in *args = v_args;
+        PtlTblDump_out *ret = v_ret;
+        lib_ptl_t *tbl = &nal->ni.tbl;
+        ptl_handle_any_t handle;
+        struct list_head *tmp;
+        unsigned long flags;
+
+        if (args->index_in < 0 || args->index_in >= tbl->size)
+                return ret->rc = PTL_INV_PTINDEX;
+
+        nal->cb_printf(nal, "Portal table index %d\n", args->index_in);
+
+        state_lock(nal, &flags);
+        list_for_each(tmp, &(tbl->tbl[args->index_in])) {
+                lib_me_t *me = list_entry(tmp, lib_me_t, me_list);
+                ptl_me2handle(&handle, me);
+                lib_me_dump(nal, me);
+        }
+        state_unlock(nal, &flags);
+
+        return ret->rc = PTL_OK;
+}
+
+int do_PtlMEDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMEDump_in *args = v_args;
+        PtlMEDump_out *ret = v_ret;
+        lib_me_t *me;
+        unsigned long flags;
+
+        state_lock(nal, &flags);
+
+        me = ptl_handle2me(&args->current_in, nal);
+        if (me == NULL) {
+                ret->rc = PTL_INV_ME;
+        } else {
+                lib_me_dump(nal, me);
+                ret->rc = PTL_OK;
+        }
+
+        state_unlock(nal, &flags);
+
+        return ret->rc;
+}
+
+static void lib_me_dump(nal_cb_t * nal, lib_me_t * me)
+{
+        nal->cb_printf(nal, "Match Entry %p ("LPX64")\n", me, 
+                       me->me_lh.lh_cookie);
+
+        nal->cb_printf(nal, "\tMatch/Ignore\t= %016lx / %016lx\n",
+                       me->match_bits, me->ignore_bits);
+
+        nal->cb_printf(nal, "\tMD\t= %p\n", me->md);
+        nal->cb_printf(nal, "\tprev\t= %p\n",
+                       list_entry(me->me_list.prev, lib_me_t, me_list));
+        nal->cb_printf(nal, "\tnext\t= %p\n",
+                       list_entry(me->me_list.next, lib_me_t, me_list));
+}
diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c
new file mode 100644 (file)
index 0000000..fde4f16
--- /dev/null
@@ -0,0 +1,1379 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-move.c
+ * Data movement routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __KERNEL__
+# include <stdio.h>
+#else
+# define DEBUG_SUBSYSTEM S_PORTALS
+# include <linux/kp30.h>
+#endif
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+/*
+ * Right now this code does not check access control lists.
+ *
+ * We only support one MD per ME, which is how the Portals 3.1 spec is written.
+ * All previous complication is removed.
+ */
+
+static lib_me_t *
+lib_find_me(nal_cb_t *nal, int index, int op_mask, ptl_nid_t src_nid,
+            ptl_pid_t src_pid, ptl_size_t rlength, ptl_size_t roffset,
+            ptl_match_bits_t match_bits, ptl_size_t *mlength_out,
+            ptl_size_t *offset_out, int *unlink_out)
+{
+        lib_ni_t         *ni = &nal->ni;
+        struct list_head *match_list = &ni->tbl.tbl[index];
+        struct list_head *tmp;
+        lib_me_t         *me;
+        lib_md_t         *md;
+        ptl_size_t        mlength;
+        ptl_size_t        offset;
+
+        ENTRY;
+
+        CDEBUG (D_NET, "Request from "LPU64".%d of length %d into portal %d "
+                "MB="LPX64"\n", src_nid, src_pid, rlength, index, match_bits);
+
+        if (index < 0 || index >= ni->tbl.size) {
+                CERROR("Invalid portal %d not in [0-%d]\n",
+                       index, ni->tbl.size);
+                goto failed;
+        }
+
+        list_for_each (tmp, match_list) {
+                me = list_entry(tmp, lib_me_t, me_list);
+                md = me->md;
+
+                /* ME attached but MD not attached yet */
+                if (md == NULL)
+                        continue;
+
+                LASSERT (me == md->me);
+
+                /* MD deactivated */
+                if (md->threshold == 0)
+                        continue;
+
+                /* mismatched MD op */
+                if ((md->options & op_mask) == 0)
+                        continue;
+
+                /* mismatched ME nid/pid? */
+                if (me->match_id.nid != PTL_NID_ANY &&
+                    me->match_id.nid != src_nid)
+                        continue;
+
+                if (me->match_id.pid != PTL_PID_ANY &&
+                    me->match_id.pid != src_pid)
+                        continue;
+
+                /* mismatched ME matchbits? */
+                if (((me->match_bits ^ match_bits) & ~me->ignore_bits) != 0)
+                        continue;
+
+                /* Hurrah! This _is_ a match; check it out... */
+
+                if ((md->options & PTL_MD_MANAGE_REMOTE) == 0)
+                        offset = md->offset;
+                else
+                        offset = roffset;
+
+                mlength = md->length - offset;
+                if ((md->options & PTL_MD_MAX_SIZE) != 0 &&
+                    mlength > md->max_size)
+                        mlength = md->max_size;
+
+                if (rlength <= mlength) {        /* fits in allowed space */
+                        mlength = rlength;
+                } else if ((md->options & PTL_MD_TRUNCATE) == 0) {
+                        /* this packet _really_ is too big */
+                        CERROR("Matching packet %d too big: %d left, "
+                               "%d allowed\n", rlength, md->length - offset,
+                               mlength);
+                        goto failed;
+                }
+
+                md->offset = offset + mlength;
+
+                *offset_out = offset;
+                *mlength_out = mlength;
+                *unlink_out = ((md->options & PTL_MD_AUTO_UNLINK) != 0 &&
+                               md->offset >= (md->length - md->max_size));
+                RETURN (me);
+        }
+
+ failed:
+        CERROR (LPU64": Dropping %s from "LPU64".%d portal %d match "LPX64
+                " offset %d length %d: no match\n",
+                ni->nid, (op_mask == PTL_MD_OP_GET) ? "GET" : "PUT",
+                src_nid, src_pid, index, match_bits, roffset, rlength);
+        RETURN(NULL);
+}
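
In isolation, the acceptance test above boils down to one expression over the match bits: every bit not covered by ignore_bits must agree between the ME and the incoming request. A standalone sketch, with plain uint64_t standing in for ptl_match_bits_t and bits_match() an invented helper name:

#include <assert.h>
#include <stdint.h>

static int bits_match(uint64_t me_match, uint64_t me_ignore, uint64_t incoming)
{
        /* same test as ((me->match_bits ^ match_bits) & ~me->ignore_bits) == 0 */
        return ((me_match ^ incoming) & ~me_ignore) == 0;
}

int main(void)
{
        /* exact match required when ignore_bits == 0 */
        assert( bits_match(0x1234, 0x0,  0x1234));
        assert(!bits_match(0x1234, 0x0,  0x1235));
        /* low 8 bits are "don't care" when set in ignore_bits */
        assert( bits_match(0x1200, 0xff, 0x1234));
        return 0;
}
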
+
+int do_PtlFailNid (nal_cb_t *nal, void *private, void *v_args, void *v_ret)
+{
+        PtlFailNid_in     *args = v_args;
+        PtlFailNid_out    *ret  = v_ret;
+        lib_test_peer_t   *tp;
+        unsigned long      flags;
+        struct list_head  *el;
+        struct list_head  *next;
+        struct list_head   cull;
+        
+        if (args->threshold != 0) {
+                /* Adding a new entry */
+                tp = (lib_test_peer_t *)nal->cb_malloc (nal, sizeof (*tp));
+                if (tp == NULL)
+                        return (ret->rc = PTL_FAIL);
+                
+                tp->tp_nid = args->nid;
+                tp->tp_threshold = args->threshold;
+                
+                state_lock (nal, &flags);
+                list_add (&tp->tp_list, &nal->ni.ni_test_peers);
+                state_unlock (nal, &flags);
+                return (ret->rc = PTL_OK);
+        }
+        
+        /* removing entries */
+        INIT_LIST_HEAD (&cull);
+        
+        state_lock (nal, &flags);
+
+        list_for_each_safe (el, next, &nal->ni.ni_test_peers) {
+                tp = list_entry (el, lib_test_peer_t, tp_list);
+                
+                if (tp->tp_threshold == 0 ||    /* needs culling anyway */
+                    args->nid == PTL_NID_ANY || /* removing all entries */
+                    tp->tp_nid == args->nid)    /* matched this one */
+                {
+                        list_del (&tp->tp_list);
+                        list_add (&tp->tp_list, &cull);
+                }
+        }
+        
+        state_unlock (nal, &flags);
+                
+        while (!list_empty (&cull)) {
+                tp = list_entry (cull.next, lib_test_peer_t, tp_list);
+
+                list_del (&tp->tp_list);
+                nal->cb_free (nal, tp, sizeof (*tp));
+        }
+        return (ret->rc = PTL_OK);
+}
+
+static int
+fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing) 
+{
+        lib_test_peer_t  *tp;
+        struct list_head *el;
+        struct list_head *next;
+        unsigned long     flags;
+        struct list_head  cull;
+        int               fail = 0;
+
+        INIT_LIST_HEAD (&cull);
+        
+        state_lock (nal, &flags);
+
+        list_for_each_safe (el, next, &nal->ni.ni_test_peers) {
+                tp = list_entry (el, lib_test_peer_t, tp_list);
+
+                if (tp->tp_threshold == 0) {
+                        /* zombie entry */
+                        if (outgoing) {
+                                /* only cull zombies on outgoing tests,
+                                 * since we may be at interrupt priority on
+                                 * incoming messages. */
+                                list_del (&tp->tp_list);
+                                list_add (&tp->tp_list, &cull);
+                        }
+                        continue;
+                }
+                        
+                if (tp->tp_nid == PTL_NID_ANY || /* fail every peer */
+                    nid == tp->tp_nid) {        /* fail this peer */
+                        fail = 1;
+                        
+                        if (tp->tp_threshold != PTL_MD_THRESH_INF) {
+                                tp->tp_threshold--;
+                                if (outgoing &&
+                                    tp->tp_threshold == 0) {
+                                        /* see above */
+                                        list_del (&tp->tp_list);
+                                        list_add (&tp->tp_list, &cull);
+                                }
+                        }
+                        break;
+                }
+        }
+        
+        state_unlock (nal, &flags);
+
+        while (!list_empty (&cull)) {
+                tp = list_entry (cull.next, lib_test_peer_t, tp_list);
+                list_del (&tp->tp_list);
+                
+                nal->cb_free (nal, tp, sizeof (*tp));
+        }
+
+        return (fail);
+}
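
Both do_PtlFailNid() and fail_peer() use the same deferred-free idiom: candidates are moved onto a private cull list while the state lock is held, and only freed after the lock is dropped. A simplified userspace sketch of that idiom, using an ad-hoc singly linked list and a pthread mutex instead of the kernel list.h and state_lock() used here (node and cull_matching are invented names):

#include <assert.h>
#include <pthread.h>
#include <stdlib.h>

struct node {
        int          key;
        struct node *next;
};

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;

/* Unlink matching nodes onto a private 'cull' list under the lock,
 * then free the culled nodes with no lock held. */
static void cull_matching(struct node **head, int key)
{
        struct node *cull = NULL;
        struct node **pp;

        pthread_mutex_lock(&state_lock);
        pp = head;
        while (*pp != NULL) {
                struct node *n = *pp;

                if (n->key == key) {
                        *pp = n->next;          /* unlink from the live list */
                        n->next = cull;         /* park on the private list */
                        cull = n;
                } else {
                        pp = &n->next;
                }
        }
        pthread_mutex_unlock(&state_lock);

        while (cull != NULL) {                  /* free outside the lock */
                struct node *n = cull;

                cull = n->next;
                free(n);
        }
}

int main(void)
{
        struct node *head = NULL;
        int i;

        for (i = 0; i < 4; i++) {
                struct node *n = malloc(sizeof(*n));

                assert(n != NULL);
                n->key = i & 1;
                n->next = head;
                head = n;
        }
        cull_matching(&head, 1);        /* drops the two odd-keyed nodes */
        assert(head != NULL && head->key == 0 && head->next->key == 0);
        assert(head->next->next == NULL);
        cull_matching(&head, 0);
        assert(head == NULL);
        return 0;
}
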
+
+ptl_size_t
+lib_iov_nob (int niov, struct iovec *iov)
+{
+        ptl_size_t nob = 0;
+        
+        while (niov-- > 0)
+                nob += (iov++)->iov_len;
+        
+        return (nob);
+}
+
+void
+lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len)
+{
+        ptl_size_t nob;
+
+        while (len > 0)
+        {
+                LASSERT (niov > 0);
+                nob = MIN (iov->iov_len, len);
+                memcpy (dest, iov->iov_base, nob);
+
+                len -= nob;
+                dest += nob;
+                niov--;
+                iov++;
+        }
+}
+
+void
+lib_copy_buf2iov (int niov, struct iovec *iov, char *src, ptl_size_t len)
+{
+        ptl_size_t nob;
+
+        while (len > 0)
+        {
+                LASSERT (niov > 0);
+                nob = MIN (iov->iov_len, len);
+                memcpy (iov->iov_base, src, nob);
+                
+                len -= nob;
+                src += nob;
+                niov--;
+                iov++;
+        }
+}
+
+static int
+lib_extract_iov (struct iovec *dst, lib_md_t *md,
+                 ptl_size_t offset, ptl_size_t len)
+{
+        /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+         * for exactly 'len' bytes, and return the number of entries.
+         * NB not destructive to 'src' */
+        int             src_niov = md->md_niov;  
+        struct iovec   *src = md->md_iov.iov;
+        ptl_size_t      frag_len;
+        int             dst_niov;
+
+        LASSERT (len >= 0);
+        LASSERT (offset >= 0);
+        LASSERT (offset + len <= md->length);
+        
+        if (len == 0)                           /* no data => */
+                return (0);                     /* no frags */
+
+        LASSERT (src_niov > 0);
+        while (offset >= src->iov_len) {      /* skip initial frags */
+                offset -= src->iov_len;
+                src_niov--;
+                src++;
+                LASSERT (src_niov > 0);
+        }
+
+        dst_niov = 1;
+        for (;;) {
+                LASSERT (src_niov > 0);
+                LASSERT (dst_niov <= PTL_MD_MAX_IOV);
+                
+                frag_len = src->iov_len - offset;
+                dst->iov_base = ((char *)src->iov_base) + offset;
+
+                if (len <= frag_len) {
+                        dst->iov_len = len;
+                        return (dst_niov);
+                }
+                
+                dst->iov_len = frag_len;
+
+                len -= frag_len;
+                dst++;
+                src++;
+                dst_niov++;
+                src_niov--;
+                offset = 0;
+        }
+}
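
A userspace sketch of what lib_extract_iov() computes, namely the sub-iovec covering bytes [offset, offset+len) of a source iovec array. It mirrors the loop above but drops the MD bookkeeping and the PTL_MD_MAX_IOV cap; iov_window() is an invented name, not part of the patch:

#include <assert.h>
#include <string.h>
#include <sys/uio.h>

static int iov_window(struct iovec *dst, const struct iovec *src, int src_niov,
                      size_t offset, size_t len)
{
        int dst_niov = 0;

        if (len == 0)
                return 0;

        while (offset >= src->iov_len) {        /* skip whole leading frags */
                offset -= src->iov_len;
                src++;
                src_niov--;
                assert(src_niov > 0);
        }

        for (;;) {
                size_t frag = src->iov_len - offset;

                dst[dst_niov].iov_base = (char *)src->iov_base + offset;
                dst[dst_niov].iov_len  = frag < len ? frag : len;
                dst_niov++;

                if (len <= frag)
                        return dst_niov;

                len -= frag;
                src++;
                src_niov--;
                offset = 0;
                assert(src_niov > 0);
        }
}

int main(void)
{
        char a[4] = "abcd", b[4] = "efgh";
        struct iovec src[2] = { { a, 4 }, { b, 4 } };
        struct iovec dst[2];
        int n = iov_window(dst, src, 2, 3, 3);  /* bytes "def" */

        assert(n == 2);
        assert(dst[0].iov_len == 1 && *(char *)dst[0].iov_base == 'd');
        assert(dst[1].iov_len == 2 && memcmp(dst[1].iov_base, "ef", 2) == 0);
        return 0;
}
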
+
+#ifndef __KERNEL__
+ptl_size_t
+lib_kiov_nob (int niov, ptl_kiov_t *kiov) 
+{
+        LASSERT (0);
+        return (0);
+}
+
+void
+lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
+{
+        LASSERT (0);
+}
+
+void
+lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *dest, ptl_size_t len)
+{
+        LASSERT (0);
+}
+
+static int
+lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
+                  ptl_size_t offset, ptl_size_t len)
+{
+        LASSERT (0);
+}
+
+#else
+
+ptl_size_t
+lib_kiov_nob (int niov, ptl_kiov_t *kiov) 
+{
+        ptl_size_t  nob = 0;
+
+        while (niov-- > 0)
+                nob += (kiov++)->kiov_len;
+
+        return (nob);
+}
+
+void
+lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
+{
+        ptl_size_t  nob;
+        char       *addr;
+        
+        LASSERT (!in_interrupt ());
+        while (len > 0)
+        {
+                LASSERT (niov > 0);
+                nob = MIN (kiov->kiov_len, len);
+                
+                addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                memcpy (dest, addr, nob);
+                kunmap (kiov->kiov_page);
+                
+                len -= nob;
+                dest += nob;
+                niov--;
+                kiov++;
+        }
+}
+
+void
+lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len)
+{
+        ptl_size_t  nob;
+        char       *addr;
+
+        LASSERT (!in_interrupt ());
+        while (len > 0)
+        {
+                LASSERT (niov > 0);
+                nob = MIN (kiov->kiov_len, len);
+                
+                addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                memcpy (addr, src, nob);
+                kunmap (kiov->kiov_page);
+                
+                len -= nob;
+                src += nob;
+                niov--;
+                kiov++;
+        }
+}
+
+static int
+lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
+                  ptl_size_t offset, ptl_size_t len)
+{
+        /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+         * for exactly 'len' bytes, and return the number of entries.
+         * NB not destructive to 'src' */
+        int             src_niov = md->md_niov;  
+        ptl_kiov_t     *src = md->md_iov.kiov;
+        ptl_size_t      frag_len;
+        int             dst_niov;
+
+        LASSERT (len >= 0);
+        LASSERT (offset >= 0);
+        LASSERT (offset + len <= md->length);
+        
+        if (len == 0)                           /* no data => */
+                return (0);                     /* no frags */
+
+        LASSERT (src_niov > 0);
+        while (offset >= src->kiov_len) {      /* skip initial frags */
+                offset -= src->kiov_len;
+                src_niov--;
+                src++;
+                LASSERT (src_niov > 0);
+        }
+
+        dst_niov = 1;
+        for (;;) {
+                LASSERT (src_niov > 0);
+                LASSERT (dst_niov <= PTL_MD_MAX_IOV);
+                
+                frag_len = src->kiov_len - offset;
+                dst->kiov_page = src->kiov_page;
+                dst->kiov_offset = src->kiov_offset + offset;
+
+                if (len <= frag_len) {
+                        dst->kiov_len = len;
+                        LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE);
+                        return (dst_niov);
+                }
+
+                dst->kiov_len = frag_len;
+                LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE);
+
+                len -= frag_len;
+                dst++;
+                src++;
+                dst_niov++;
+                src_niov--;
+                offset = 0;
+        }
+}
+#endif
+
+void
+lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+          ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen)
+{
+        int   niov;
+
+        if (mlen == 0)
+                nal->cb_recv (nal, private, msg, 0, NULL, 0, rlen);
+        else if ((md->options & PTL_MD_KIOV) == 0) {
+                niov = lib_extract_iov (msg->msg_iov.iov, md, offset, mlen);
+                nal->cb_recv (nal, private, msg,
+                              niov, msg->msg_iov.iov, mlen, rlen);
+        } else {
+                niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, mlen);
+                nal->cb_recv_pages (nal, private, msg, 
+                                    niov, msg->msg_iov.kiov, mlen, rlen);
+        }
+}
+
+int
+lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+          ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+          lib_md_t *md, ptl_size_t offset, ptl_size_t len) 
+{
+        int   niov;
+
+        if (len == 0)
+                return (nal->cb_send (nal, private, msg, 
+                                      hdr, type, nid, pid,
+                                      0, NULL, 0));
+        
+        if ((md->options & PTL_MD_KIOV) == 0) {
+                niov = lib_extract_iov (msg->msg_iov.iov, md, offset, len);
+                return (nal->cb_send (nal, private, msg, 
+                                      hdr, type, nid, pid,
+                                      niov, msg->msg_iov.iov, len));
+        }
+
+        niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, len);
+        return (nal->cb_send_pages (nal, private, msg, 
+                                    hdr, type, nid, pid,
+                                    niov, msg->msg_iov.kiov, len));
+}
+
+static lib_msg_t *
+get_new_msg (nal_cb_t *nal, lib_md_t *md)
+{
+        /* ALWAYS called holding the state_lock */
+        lib_counters_t *counters = &nal->ni.counters;
+        lib_msg_t      *msg      = lib_msg_alloc (nal);
+
+        if (msg == NULL)
+                return (NULL);
+
+        memset (msg, 0, sizeof (*msg));
+
+        msg->send_ack = 0;
+
+        msg->md = md;
+        msg->ev.arrival_time = get_cycles();
+        md->pending++;
+        if (md->threshold != PTL_MD_THRESH_INF) {
+                LASSERT (md->threshold > 0);
+                md->threshold--;
+        }
+
+        counters->msgs_alloc++;
+        if (counters->msgs_alloc > counters->msgs_max)
+                counters->msgs_max = counters->msgs_alloc;
+
+        list_add (&msg->msg_list, &nal->ni.ni_active_msgs);
+
+        return (msg);
+}
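
A tiny model of the MD accounting get_new_msg() performs: each matched message bumps pending and, unless the threshold is infinite, consumes one unit of threshold, after which lib_find_me() treats the MD as deactivated. THRESH_INF, md_model, md_consume and md_active below are illustrative stand-ins only:

#include <assert.h>

#define THRESH_INF (-1)          /* stand-in for PTL_MD_THRESH_INF */

struct md_model {
        int pending;             /* messages currently using this MD */
        int threshold;           /* matches left before the MD goes inactive */
};

/* mirrors the bookkeeping in get_new_msg() */
static void md_consume(struct md_model *md)
{
        md->pending++;
        if (md->threshold != THRESH_INF) {
                assert(md->threshold > 0);
                md->threshold--;
        }
}

/* mirrors the "MD deactivated" test in lib_find_me() */
static int md_active(const struct md_model *md)
{
        return md->threshold != 0;
}

int main(void)
{
        struct md_model md = { 0, 2 };
        struct md_model inf = { 0, THRESH_INF };

        assert(md_active(&md));
        md_consume(&md);
        md_consume(&md);
        assert(!md_active(&md));         /* a third match would be refused */

        md_consume(&inf);
        assert(md_active(&inf) && inf.pending == 1);
        return 0;
}
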
+
+
+/*
+ * Incoming messages have a lib_msg_t object associated with them
+ * by the library.  This object encapsulates the state of the
+ * message and allows the NAL to do non-blocking receives or sends
+ * of long messages.
+ */
+static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        lib_ni_t        *ni = &nal->ni;
+        ptl_size_t       mlength = 0;
+        ptl_size_t       offset = 0;
+        int              unlink = 0;
+        lib_me_t        *me;
+        lib_md_t        *md;
+        lib_msg_t       *msg;
+        unsigned long    flags;
+
+        /* Convert put fields to host byte order */
+        hdr->msg.put.match_bits = NTOH__u64 (hdr->msg.put.match_bits);
+        hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index);
+        hdr->msg.put.offset = NTOH__u32 (hdr->msg.put.offset);
+
+        state_lock(nal, &flags);
+
+        me = lib_find_me(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT,
+                         hdr->src_nid, hdr->src_pid,
+                         PTL_HDR_LENGTH (hdr), hdr->msg.put.offset,
+                         hdr->msg.put.match_bits,
+                         &mlength, &offset, &unlink);
+        if (me == NULL)
+                goto drop;
+
+        md = me->md;
+        CDEBUG(D_NET, "Incoming put index %x from "LPU64"/%u of length %d/%d "
+               "into md "LPX64" [%d] + %d\n", hdr->msg.put.ptl_index,
+               hdr->src_nid, hdr->src_pid, mlength, PTL_HDR_LENGTH(hdr), 
+               md->md_lh.lh_cookie, md->md_niov, offset);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping PUT from "LPU64": can't allocate msg\n",
+                       ni->nid, hdr->src_nid);
+                goto drop;
+        }
+
+        if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
+            !(md->options & PTL_MD_ACK_DISABLE)) {
+                msg->send_ack = 1;
+                msg->ack_wmd = hdr->msg.put.ack_wmd;
+                msg->nid = hdr->src_nid;
+                msg->pid = hdr->src_pid;
+                msg->ev.match_bits = hdr->msg.put.match_bits;
+        }
+
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_PUT;
+                msg->ev.initiator.nid = hdr->src_nid;
+                msg->ev.initiator.pid = hdr->src_pid;
+                msg->ev.portal = hdr->msg.put.ptl_index;
+                msg->ev.match_bits = hdr->msg.put.match_bits;
+                msg->ev.rlength = PTL_HDR_LENGTH(hdr);
+                msg->ev.mlength = mlength;
+                msg->ev.offset = offset;
+                msg->ev.hdr_data = hdr->msg.put.hdr_data;
+
+                /* NB if this match has exhausted the MD, we can't be sure
+                 * that this event will be the last one associated with
+                 * this MD in the event queue (another message already
+                 * matching this ME/MD could end up being last).  So we
+                 * remember the ME handle anyway and check again when we're
+                 * allocating our slot in the event queue.
+                 */
+                ptl_me2handle (&msg->ev.unlinked_me, me);
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        ni->counters.recv_count++;
+        ni->counters.recv_length += mlength;
+
+        /* only unlink after MD's pending count has been bumped
+         * in get_new_msg() otherwise lib_me_unlink() will nuke it */
+        if (unlink) {
+                md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
+                lib_me_unlink (nal, me);
+        }
+
+        state_unlock(nal, &flags);
+
+        lib_recv (nal, private, msg, md, offset, mlength, PTL_HDR_LENGTH (hdr));
+        return 0;
+
+ drop:
+        nal->ni.counters.drop_count++;
+        nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr);
+        state_unlock (nal, &flags);
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return -1;
+}
+
+static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        lib_ni_t        *ni = &nal->ni;
+        ptl_size_t       mlength = 0;
+        ptl_size_t       offset = 0;
+        int              unlink = 0;
+        lib_me_t        *me;
+        lib_md_t        *md;
+        lib_msg_t       *msg;
+        ptl_hdr_t        reply;
+        unsigned long    flags;
+        int              rc;
+
+        /* Convert get fields to host byte order */
+        hdr->msg.get.match_bits = NTOH__u64 (hdr->msg.get.match_bits);
+        hdr->msg.get.ptl_index = NTOH__u32 (hdr->msg.get.ptl_index);
+        hdr->msg.get.sink_length = NTOH__u32 (hdr->msg.get.sink_length);
+        hdr->msg.get.src_offset = NTOH__u32 (hdr->msg.get.src_offset);
+
+        /* compatibility check until field is deleted */
+        if (hdr->msg.get.return_offset != 0)
+                CERROR("Unexpected non-zero get.return_offset %x from "
+                       LPU64"\n", hdr->msg.get.return_offset, hdr->src_nid);
+
+        state_lock(nal, &flags);
+
+        me = lib_find_me(nal, hdr->msg.get.ptl_index, PTL_MD_OP_GET,
+                         hdr->src_nid, hdr->src_pid,
+                         hdr->msg.get.sink_length, hdr->msg.get.src_offset,
+                         hdr->msg.get.match_bits,
+                         &mlength, &offset, &unlink);
+        if (me == NULL)
+                goto drop;
+
+        md = me->md;
+        CDEBUG(D_NET, "Incoming get index %d from "LPU64".%u of length %d/%d "
+               "from md "LPX64" [%d] + %d\n", hdr->msg.get.ptl_index,
+               hdr->src_nid, hdr->src_pid, mlength, PTL_HDR_LENGTH(hdr), 
+               md->md_lh.lh_cookie, md->md_niov, offset);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping GET from "LPU64": can't allocate msg\n",
+                       ni->nid, hdr->src_nid);
+                goto drop;
+        }
+
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_GET;
+                msg->ev.initiator.nid = hdr->src_nid;
+                msg->ev.initiator.pid = hdr->src_pid;
+                msg->ev.portal = hdr->msg.get.ptl_index;
+                msg->ev.match_bits = hdr->msg.get.match_bits;
+                msg->ev.rlength = PTL_HDR_LENGTH(hdr);
+                msg->ev.mlength = mlength;
+                msg->ev.offset = offset;
+                msg->ev.hdr_data = 0;
+
+                /* NB if this match has exhausted the MD, we can't be sure
+                 * that this event will the the last one associated with
+                 * this MD in the event queue (another message already
+                 * matching this ME/MD could end up being last).  So we
+                 * remember the ME handle anyway and check again when we're
+                 * allocating our slot in the event queue.
+                 */
+                ptl_me2handle (&msg->ev.unlinked_me, me);
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        ni->counters.send_count++;
+        ni->counters.send_length += mlength;
+
+        /* only unlink after MD's refcount has been bumped
+         * in get_new_msg() otherwise lib_me_unlink() will nuke it */
+        if (unlink) {
+                md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
+                lib_me_unlink (nal, me);
+        }
+
+        state_unlock(nal, &flags);
+
+        memset (&reply, 0, sizeof (reply));
+        reply.type     = HTON__u32 (PTL_MSG_REPLY);
+        reply.dest_nid = HTON__u64 (hdr->src_nid);
+        reply.src_nid  = HTON__u64 (ni->nid);
+        reply.dest_pid = HTON__u32 (hdr->src_pid);
+        reply.src_pid  = HTON__u32 (ni->pid);
+        PTL_HDR_LENGTH(&reply) = HTON__u32 (mlength);
+
+        reply.msg.reply.dst_wmd = hdr->msg.get.return_wmd;
+
+        rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY, 
+                       hdr->src_nid, hdr->src_pid, md, offset, mlength);
+        if (rc != 0) {
+                CERROR(LPU64": Dropping GET from "LPU64": send REPLY failed\n",
+                       ni->nid, hdr->src_nid);
+                state_lock (nal, &flags);
+                goto drop;
+        }
+
+        /* Complete the incoming message */
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return (rc);
+ drop:
+        ni->counters.drop_count++;
+        ni->counters.drop_length += hdr->msg.get.sink_length;
+        state_unlock(nal, &flags);
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return -1;
+}
+
+static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        lib_ni_t        *ni = &nal->ni;
+        lib_md_t        *md;
+        int              rlength;
+        int              length;
+        lib_msg_t       *msg;
+        unsigned long    flags;
+
+        /* compatibility check until field is deleted */
+        if (hdr->msg.reply.dst_offset != 0)
+                CERROR("Unexpected non-zero reply.dst_offset %x from "LPU64"\n",
+                       hdr->msg.reply.dst_offset, hdr->src_nid);
+
+        state_lock(nal, &flags);
+
+        /* NB handles only looked up by creator (no flips) */
+        md = ptl_wire_handle2md(&hdr->msg.reply.dst_wmd, nal);
+        if (md == NULL || md->threshold == 0) {
+                CERROR (LPU64": Dropping REPLY from "LPU64" for %s MD "LPX64"."LPX64"\n",
+                        ni->nid, hdr->src_nid,
+                        md == NULL ? "invalid" : "inactive",
+                        hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                        hdr->msg.reply.dst_wmd.wh_object_cookie);
+                goto drop;
+        }
+
+        LASSERT (md->offset == 0);
+
+        length = rlength = PTL_HDR_LENGTH(hdr);
+
+        if (length > md->length) {
+                if ((md->options & PTL_MD_TRUNCATE) == 0) {
+                        CERROR (LPU64": Dropping REPLY from "LPU64
+                                " length %d for MD "LPX64" would overflow (%d)\n",
+                                ni->nid, hdr->src_nid, length,
+                                hdr->msg.reply.dst_wmd.wh_object_cookie,
+                                md->length);
+                        goto drop;
+                }
+                length = md->length;
+        }
+
+        CDEBUG(D_NET, "Reply from "LPU64" of length %d/%d into md "LPX64"\n",
+               hdr->src_nid, length, rlength, 
+               hdr->msg.reply.dst_wmd.wh_object_cookie);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping REPLY from "LPU64": can't "
+                       "allocate msg\n", ni->nid, hdr->src_nid);
+                goto drop;
+        }
+
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_REPLY;
+                msg->ev.initiator.nid = hdr->src_nid;
+                msg->ev.initiator.pid = hdr->src_pid;
+                msg->ev.rlength = rlength;
+                msg->ev.mlength = length;
+                msg->ev.offset = 0;
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        ni->counters.recv_count++;
+        ni->counters.recv_length += length;
+
+        state_unlock(nal, &flags);
+
+        lib_recv (nal, private, msg, md, 0, length, rlength);
+        return 0;
+
+ drop:
+        nal->ni.counters.drop_count++;
+        nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr);
+        state_unlock (nal, &flags);
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return -1;
+}
+
+static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        lib_ni_t *ni = &nal->ni;
+        lib_md_t *md;
+        lib_msg_t *msg = NULL;
+        unsigned long flags;
+
+        /* Convert ack fields to host byte order */
+        hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits);
+        hdr->msg.ack.mlength = NTOH__u32 (hdr->msg.ack.mlength);
+
+        state_lock(nal, &flags);
+
+        /* NB handles only looked up by creator (no flips) */
+        md = ptl_wire_handle2md(&hdr->msg.ack.dst_wmd, nal);
+        if (md == NULL || md->threshold == 0) {
+                CDEBUG(D_INFO, LPU64": Dropping ACK from "LPU64" to %s MD "
+                       LPX64"."LPX64"\n", ni->nid, hdr->src_nid, 
+                       (md == NULL) ? "invalid" : "inactive",
+                       hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                       hdr->msg.ack.dst_wmd.wh_object_cookie);
+                goto drop;
+        }
+
+        CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n",
+               ni->nid, hdr->src_nid, 
+               hdr->msg.ack.dst_wmd.wh_object_cookie);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping ACK from "LPU64": can't allocate msg\n",
+                       ni->nid, hdr->src_nid);
+                goto drop;
+        }
+
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_ACK;
+                msg->ev.initiator.nid = hdr->src_nid;
+                msg->ev.initiator.pid = hdr->src_pid;
+                msg->ev.mlength = hdr->msg.ack.mlength;
+                msg->ev.match_bits = hdr->msg.ack.match_bits;
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        ni->counters.recv_count++;
+        state_unlock(nal, &flags);
+        lib_recv (nal, private, msg, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return 0;
+
+ drop:
+        nal->ni.counters.drop_count++;
+        state_unlock (nal, &flags);
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return -1;
+}
+
+static char *
+hdr_type_string (ptl_hdr_t *hdr)
+{
+        switch (hdr->type) {
+        case PTL_MSG_ACK:
+                return ("ACK");
+        case PTL_MSG_PUT:
+                return ("PUT");
+        case PTL_MSG_GET:
+                return ("GET");
+        case PTL_MSG_REPLY:
+                return ("REPLY");
+        case PTL_MSG_HELLO:
+                return ("HELLO");
+        default:
+                return ("<UNKNOWN>");
+        }
+}
+
+void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr)
+{
+        char *type_str = hdr_type_string (hdr);
+
+        nal->cb_printf(nal, "P3 Header at %p of type %s\n", hdr, type_str);
+        nal->cb_printf(nal, "    From nid/pid %Lu/%Lu", hdr->src_nid,
+                       hdr->src_pid);
+        nal->cb_printf(nal, "    To nid/pid %Lu/%Lu\n", hdr->dest_nid,
+                       hdr->dest_pid);
+
+        switch (hdr->type) {
+        default:
+                break;
+
+        case PTL_MSG_PUT:
+                nal->cb_printf(nal,
+                               "    Ptl index %d, ack md "LPX64"."LPX64", "
+                               "match bits "LPX64"\n",
+                               hdr->msg.put.ptl_index,
+                               hdr->msg.put.ack_wmd.wh_interface_cookie,
+                               hdr->msg.put.ack_wmd.wh_object_cookie,
+                               hdr->msg.put.match_bits);
+                nal->cb_printf(nal,
+                               "    Length %d, offset %d, hdr data "LPX64"\n",
+                               PTL_HDR_LENGTH(hdr), hdr->msg.put.offset,
+                               hdr->msg.put.hdr_data);
+                break;
+
+        case PTL_MSG_GET:
+                nal->cb_printf(nal,
+                               "    Ptl index %d, return md "LPX64"."LPX64", "
+                               "match bits "LPX64"\n", hdr->msg.get.ptl_index,
+                               hdr->msg.get.return_wmd.wh_interface_cookie,
+                               hdr->msg.get.return_wmd.wh_object_cookie,
+                               hdr->msg.get.match_bits);
+                nal->cb_printf(nal,
+                               "    Length %d, src offset %d\n",
+                               hdr->msg.get.sink_length,
+                               hdr->msg.get.src_offset);
+                break;
+
+        case PTL_MSG_ACK:
+                nal->cb_printf(nal, "    dst md "LPX64"."LPX64", "
+                               "manipulated length %d\n",
+                               hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                               hdr->msg.ack.dst_wmd.wh_object_cookie,
+                               hdr->msg.ack.mlength);
+                break;
+
+        case PTL_MSG_REPLY:
+                nal->cb_printf(nal, "    dst md "LPX64"."LPX64", "
+                               "length %d\n",
+                               hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                               hdr->msg.reply.dst_wmd.wh_object_cookie,
+                               PTL_HDR_LENGTH(hdr));
+        }
+
+}                               /* end of print_hdr() */
+
+
+int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        unsigned long  flags;
+
+        /* NB static check; optimizer will elide this if it's right */
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.length) ==
+                 offsetof (ptl_hdr_t, msg.put.length));
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.length) ==
+                 offsetof (ptl_hdr_t, msg.get.length));
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.length) ==
+                 offsetof (ptl_hdr_t, msg.reply.length));
+
+        /* convert common fields to host byte order */
+        hdr->dest_nid = NTOH__u64 (hdr->dest_nid);
+        hdr->src_nid = NTOH__u64 (hdr->src_nid);
+        hdr->dest_pid = NTOH__u32 (hdr->dest_pid);
+        hdr->src_pid = NTOH__u32 (hdr->src_pid);
+        hdr->type = NTOH__u32 (hdr->type);
+        PTL_HDR_LENGTH(hdr) = NTOH__u32 (PTL_HDR_LENGTH(hdr));
+#if 0
+        nal->cb_printf(nal, "%d: lib_parse: nal=%p hdr=%p type=%d\n",
+                       nal->ni.nid, nal, hdr, hdr->type);
+        print_hdr(nal, hdr);
+#endif
+        if (hdr->type == PTL_MSG_HELLO) {
+                /* dest_nid is really ptl_magicversion_t */
+                ptl_magicversion_t *mv = (ptl_magicversion_t *)&hdr->dest_nid;
+
+                CERROR (LPU64": Dropping unexpected HELLO message: "
+                        "magic %d, version %d.%d from "LPD64"\n",
+                        nal->ni.nid, mv->magic, 
+                        mv->version_major, mv->version_minor,
+                        hdr->src_nid);
+                lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+                return (-1);
+        }
+        
+        if (hdr->dest_nid != nal->ni.nid) {
+                CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64
+                       " (not me)\n", nal->ni.nid, hdr_type_string (hdr),
+                       hdr->src_nid, hdr->dest_nid);
+
+                state_lock (nal, &flags);
+                nal->ni.counters.drop_count++;
+                nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr);
+                state_unlock (nal, &flags);
+
+                lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+                return (-1);
+        }
+
+        if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
+            fail_peer (nal, hdr->src_nid, 0))      /* shall we now? */
+        {
+                CERROR(LPU64": Dropping incoming %s from "LPU64
+                       ": simulated failure\n",
+                       nal->ni.nid, hdr_type_string (hdr), 
+                       hdr->src_nid);
+                return (-1);
+        }
+        
+        switch (hdr->type) {
+        case PTL_MSG_ACK:
+                return (parse_ack(nal, hdr, private));
+        case PTL_MSG_PUT:
+                return (parse_put(nal, hdr, private));
+        case PTL_MSG_GET:
+                return (parse_get(nal, hdr, private));
+        case PTL_MSG_REPLY:
+                return (parse_reply(nal, hdr, private));
+        default:
+                CERROR(LPU64": Dropping <unknown> message from "LPU64
+                       ": Bad type=0x%x\n",  nal->ni.nid, hdr->src_nid,
+                       hdr->type);
+
+                lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+                return (-1);
+        }
+}
+
+
+int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_md_t md_in
+         *      ptl_ack_req_t ack_req_in
+         *      ptl_process_id_t target_in
+         *      ptl_pt_index_t portal_in
+         *      ptl_ac_index_t cookie_in
+         *      ptl_match_bits_t match_bits_in
+         *      ptl_size_t offset_in
+         *
+         * Outgoing:
+         */
+
+        PtlPut_in *args = v_args;
+        PtlPut_out *ret = v_ret;
+        ptl_hdr_t hdr;
+
+        lib_ni_t *ni = &nal->ni;
+        lib_md_t *md;
+        lib_msg_t *msg = NULL;
+        ptl_process_id_t *id = &args->target_in;
+        unsigned long flags;
+
+        if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
+            fail_peer (nal, id->nid, 1))           /* shall we now? */
+        {
+                CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n",
+                       nal->ni.nid, id->nid);
+                return (ret->rc = PTL_INV_PROC);
+        }
+        
+        ret->rc = PTL_OK;
+        state_lock(nal, &flags);
+        md = ptl_handle2md(&args->md_in, nal);
+        if (md == NULL || !md->threshold) {
+                state_unlock(nal, &flags);
+                return ret->rc = PTL_INV_MD;
+        }
+
+        CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid,
+               (unsigned long)id->pid);
+
+        memset (&hdr, 0, sizeof (hdr));
+        hdr.type     = HTON__u32 (PTL_MSG_PUT);
+        hdr.dest_nid = HTON__u64 (id->nid);
+        hdr.src_nid  = HTON__u64 (ni->nid);
+        hdr.dest_pid = HTON__u32 (id->pid);
+        hdr.src_pid  = HTON__u32 (ni->pid);
+        PTL_HDR_LENGTH(&hdr) = HTON__u32 (md->length);
+
+        /* NB handles only looked up by creator (no flips) */
+        if (args->ack_req_in == PTL_ACK_REQ) {
+                hdr.msg.put.ack_wmd.wh_interface_cookie = ni->ni_interface_cookie;
+                hdr.msg.put.ack_wmd.wh_object_cookie = md->md_lh.lh_cookie;
+        } else {
+                hdr.msg.put.ack_wmd = PTL_WIRE_HANDLE_NONE;
+        }
+
+        hdr.msg.put.match_bits = HTON__u64 (args->match_bits_in);
+        hdr.msg.put.ptl_index = HTON__u32 (args->portal_in);
+        hdr.msg.put.offset = HTON__u32 (args->offset_in);
+        hdr.msg.put.hdr_data = args->hdr_data_in;
+
+        ni->counters.send_count++;
+        ni->counters.send_length += md->length;
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR("BAD: could not allocate msg!\n");
+                state_unlock(nal, &flags);
+                return ret->rc = PTL_NOSPACE;
+        }
+
+        /*
+         * If this memory descriptor has an event queue associated with
+         * it, we need to allocate a message state object and record the
+         * information about this operation that will be posted to the
+         * event queue once the message has been completed.
+         *
+         * NB. We're now committed to the PUT, since we just marked the MD
+         * busy.  Callers who observe this (by getting PTL_MD_INUSE from
+         * PtlMDUnlink()) expect a completion event to tell them when the
+         * MD becomes idle.
+         */
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_SENT;
+                msg->ev.initiator.nid = ni->nid;
+                msg->ev.initiator.pid = ni->pid;
+                msg->ev.portal = args->portal_in;
+                msg->ev.match_bits = args->match_bits_in;
+                msg->ev.rlength = md->length;
+                msg->ev.mlength = md->length;
+                msg->ev.offset = args->offset_in;
+                msg->ev.hdr_data = args->hdr_data_in;
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        state_unlock(nal, &flags);
+        
+        lib_send (nal, private, msg, &hdr, PTL_MSG_PUT,
+                  id->nid, id->pid, md, 0, md->length);
+
+        return ret->rc = PTL_OK;
+}
+
+
+int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_md_t md_in
+         *      ptl_process_id_t target_in
+         *      ptl_pt_index_t portal_in
+         *      ptl_ac_index_t cookie_in
+         *      ptl_match_bits_t match_bits_in
+         *      ptl_size_t offset_in
+         *
+         * Outgoing:
+         */
+
+        PtlGet_in *args = v_args;
+        PtlGet_out *ret = v_ret;
+        ptl_hdr_t hdr;
+        lib_msg_t *msg = NULL;
+        lib_ni_t *ni = &nal->ni;
+        ptl_process_id_t *id = &args->target_in;
+        lib_md_t *md;
+        unsigned long flags;
+
+        if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
+            fail_peer (nal, id->nid, 1))           /* shall we now? */
+        {
+                CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n",
+                       nal->ni.nid, id->nid);
+                return (ret->rc = PTL_INV_PROC);
+        }
+        
+        state_lock(nal, &flags);
+        md = ptl_handle2md(&args->md_in, nal);
+        if (md == NULL || !md->threshold) {
+                state_unlock(nal, &flags);
+                return ret->rc = PTL_INV_MD;
+        }
+
+        LASSERT (md->offset == 0);
+
+        CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid,
+               (unsigned long)id->pid);
+
+        memset (&hdr, 0, sizeof (hdr));
+        hdr.type     = HTON__u32 (PTL_MSG_GET);
+        hdr.dest_nid = HTON__u64 (id->nid);
+        hdr.src_nid  = HTON__u64 (ni->nid);
+        hdr.dest_pid = HTON__u32 (id->pid);
+        hdr.src_pid  = HTON__u32 (ni->pid);
+        PTL_HDR_LENGTH(&hdr) = 0;
+
+        /* NB handles only looked up by creator (no flips) */
+        hdr.msg.get.return_wmd.wh_interface_cookie = ni->ni_interface_cookie;
+        hdr.msg.get.return_wmd.wh_object_cookie = md->md_lh.lh_cookie;
+
+        hdr.msg.get.match_bits = HTON__u64 (args->match_bits_in);
+        hdr.msg.get.ptl_index = HTON__u32 (args->portal_in);
+        hdr.msg.get.src_offset = HTON__u32 (args->offset_in);
+        hdr.msg.get.sink_length = HTON__u32 (md->length);
+
+        ni->counters.send_count++;
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR("do_PtlGet: BAD - could not allocate cookie!\n");
+                state_unlock(nal, &flags);
+                return ret->rc = PTL_NOSPACE;
+        }
+
+        /*
+         * If this memory descriptor has an event queue associated with
+         * it we must allocate a message state object that will record
+         * the information to be filled in once the message has been
+         * completed.  More information is in the do_PtlPut() comments.
+         *
+         * NB. We're now committed to the GET, since we just marked the MD
+         * busy.  Callers who observe this (by getting PTL_MD_INUSE from
+         * PtlMDUnlink()) expect a completion event to tell them when the
+         * MD becomes idle. 
+         */
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_SENT;
+                msg->ev.initiator.nid = ni->nid;
+                msg->ev.initiator.pid = ni->pid;
+                msg->ev.portal = args->portal_in;
+                msg->ev.match_bits = args->match_bits_in;
+                msg->ev.rlength = md->length;
+                msg->ev.mlength = md->length;
+                msg->ev.offset = args->offset_in;
+                msg->ev.hdr_data = 0;
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        state_unlock(nal, &flags);
+
+        lib_send (nal, private, msg, &hdr, PTL_MSG_GET,
+                  id->nid, id->pid, NULL, 0, 0);
+
+        return ret->rc = PTL_OK;
+}
+
+void lib_assert_wire_constants (void)
+{
+        /* Wire protocol assertions generated by 'wirecheck' */
+
+        /* Constants... */
+        LASSERT (PORTALS_PROTO_MAGIC == 0xeebc0ded);
+        LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0);
+        LASSERT (PORTALS_PROTO_VERSION_MINOR == 1);
+        LASSERT (PTL_MSG_ACK == 0);
+        LASSERT (PTL_MSG_PUT == 1);
+        LASSERT (PTL_MSG_GET == 2);
+        LASSERT (PTL_MSG_REPLY == 3);
+        LASSERT (PTL_MSG_HELLO == 4);
+
+        /* Checks for struct ptl_handle_wire_t */
+        LASSERT (sizeof (ptl_handle_wire_t) == 16);
+        LASSERT (offsetof (ptl_handle_wire_t, wh_interface_cookie) == 0);
+        LASSERT (sizeof (((ptl_handle_wire_t *)0)->wh_interface_cookie) == 8);
+        LASSERT (offsetof (ptl_handle_wire_t, wh_object_cookie) == 8);
+        LASSERT (sizeof (((ptl_handle_wire_t *)0)->wh_object_cookie) == 8);
+
+        /* Checks for struct ptl_magicversion_t */
+        LASSERT (sizeof (ptl_magicversion_t) == 8);
+        LASSERT (offsetof (ptl_magicversion_t, magic) == 0);
+        LASSERT (sizeof (((ptl_magicversion_t *)0)->magic) == 4);
+        LASSERT (offsetof (ptl_magicversion_t, version_major) == 4);
+        LASSERT (sizeof (((ptl_magicversion_t *)0)->version_major) == 2);
+        LASSERT (offsetof (ptl_magicversion_t, version_minor) == 6);
+        LASSERT (sizeof (((ptl_magicversion_t *)0)->version_minor) == 2);
+
+        /* Checks for struct ptl_hdr_t */
+        LASSERT (sizeof (ptl_hdr_t) == 72);
+        LASSERT (offsetof (ptl_hdr_t, dest_nid) == 0);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->dest_nid) == 8);
+        LASSERT (offsetof (ptl_hdr_t, src_nid) == 8);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->src_nid) == 8);
+        LASSERT (offsetof (ptl_hdr_t, dest_pid) == 16);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->dest_pid) == 4);
+        LASSERT (offsetof (ptl_hdr_t, src_pid) == 20);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->src_pid) == 4);
+        LASSERT (offsetof (ptl_hdr_t, type) == 24);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->type) == 4);
+
+        /* Ack */
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.mlength) == 28);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.ack.mlength) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.dst_wmd) == 32);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.ack.dst_wmd) == 16);
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.match_bits) == 48);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.ack.match_bits) == 8);
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.length) == 56);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.ack.length) == 4);
+
+        /* Put */
+        LASSERT (offsetof (ptl_hdr_t, msg.put.ptl_index) == 28);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.ptl_index) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.put.ack_wmd) == 32);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.ack_wmd) == 16);
+        LASSERT (offsetof (ptl_hdr_t, msg.put.match_bits) == 48);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.match_bits) == 8);
+        LASSERT (offsetof (ptl_hdr_t, msg.put.length) == 56);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.length) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.put.offset) == 60);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.offset) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.put.hdr_data) == 64);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.hdr_data) == 8);
+
+        /* Get */
+        LASSERT (offsetof (ptl_hdr_t, msg.get.ptl_index) == 28);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.ptl_index) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.get.return_wmd) == 32);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.return_wmd) == 16);
+        LASSERT (offsetof (ptl_hdr_t, msg.get.match_bits) == 48);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.match_bits) == 8);
+        LASSERT (offsetof (ptl_hdr_t, msg.get.length) == 56);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.length) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.get.src_offset) == 60);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.src_offset) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.get.return_offset) == 64);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.return_offset) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.get.sink_length) == 68);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.sink_length) == 4);
+
+        /* Reply */
+        LASSERT (offsetof (ptl_hdr_t, msg.reply.dst_wmd) == 32);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.reply.dst_wmd) == 16);
+        LASSERT (offsetof (ptl_hdr_t, msg.reply.dst_offset) == 48);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.reply.dst_offset) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.reply.length) == 56);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.reply.length) == 4);
+}
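
lib_assert_wire_constants() hard-codes the offsets and sizes generated by 'wirecheck' so that a compiler or ABI that lays ptl_hdr_t out differently trips an assertion instead of silently exchanging garbled headers. Below is a stand-alone sketch of the same technique; it uses a hypothetical example_hdr_t for illustration, not the real ptl_hdr_t definition.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical wire header used only to illustrate the technique;
 * the real ptl_hdr_t has more fields and a message-specific union. */
typedef struct {
        uint64_t dest_nid;
        uint64_t src_nid;
        uint32_t dest_pid;
        uint32_t src_pid;
        uint32_t type;
} example_hdr_t;

/* Runtime layout checks in the spirit of lib_assert_wire_constants(). */
static void example_assert_wire_constants(void)
{
        assert(offsetof(example_hdr_t, dest_nid) == 0);
        assert(sizeof(((example_hdr_t *)0)->dest_nid) == 8);
        assert(offsetof(example_hdr_t, src_nid) == 8);
        assert(offsetof(example_hdr_t, dest_pid) == 16);
        assert(offsetof(example_hdr_t, src_pid) == 20);
        assert(offsetof(example_hdr_t, type) == 24);
}

int main(void)
{
        example_assert_wire_constants();
        return 0;
}
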
diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c
new file mode 100644 (file)
index 0000000..f10892c
--- /dev/null
@@ -0,0 +1,163 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-msg.c
+ * Message decoding, parsing and finalizing routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __KERNEL__
+# include <stdio.h>
+#else
+# define DEBUG_SUBSYSTEM S_PORTALS
+# include <linux/kp30.h>
+#endif
+
+#include <portals/lib-p30.h>
+
+int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg)
+{
+        lib_md_t     *md;
+        lib_eq_t     *eq;
+        int           rc;
+        unsigned long flags;
+
+        /* ni went down while processing this message */
+        if (nal->ni.up == 0) {
+                return -1;
+        }
+
+        if (msg == NULL)
+                return 0;
+
+        rc = 0;
+        if (msg->send_ack) {
+                ptl_hdr_t ack;
+
+                LASSERT (!ptl_is_wire_handle_none (&msg->ack_wmd));
+
+                memset (&ack, 0, sizeof (ack));
+                ack.type     = HTON__u32 (PTL_MSG_ACK);
+                ack.dest_nid = HTON__u64 (msg->nid);
+                ack.src_nid  = HTON__u64 (nal->ni.nid);
+                ack.dest_pid = HTON__u32 (msg->pid);
+                ack.src_pid  = HTON__u32 (nal->ni.pid);
+                PTL_HDR_LENGTH(&ack) = 0;
+
+                ack.msg.ack.dst_wmd = msg->ack_wmd;
+                ack.msg.ack.match_bits = msg->ev.match_bits;
+                ack.msg.ack.mlength = HTON__u32 (msg->ev.mlength);
+
+                rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK,
+                               msg->nid, msg->pid, NULL, 0, 0);
+        }
+
+        md = msg->md;
+        LASSERT (md->pending > 0);  /* I've not dropped my ref yet */
+        eq = md->eq;
+
+        state_lock(nal, &flags);
+
+        if (eq != NULL) {
+                ptl_event_t  *ev = &msg->ev;
+                ptl_event_t  *eq_slot;
+
+                /* I have to hold the lock while I bump the sequence number
+                 * and copy the event into the queue.  If not, and I was
+                 * interrupted after bumping the sequence number, other
+                 * events could fill the queue, including the slot I just
+                 * allocated to this event.  On resuming, I would overwrite
+                 * a more 'recent' event with old event state, and
+                 * processes taking events off the queue would not detect
+                 * overflow correctly.
+                 */
+
+                ev->sequence = eq->sequence++;/* Allocate the next queue slot */
+
+                /* size must be a power of 2 to handle a wrapped sequence # */
+                LASSERT (eq->size != 0 &&
+                         eq->size == LOWEST_BIT_SET (eq->size));
+                eq_slot = eq->base + (ev->sequence & (eq->size - 1));
+
+                /* Invalidate unlinked_me unless this is the last
+                 * event for an auto-unlinked MD.  Note that if md was
+                 * auto-unlinked, md->pending can only decrease
+                 */
+                if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || /* not auto-unlinked */
+                    md->pending != 1)                       /* not last ref */
+                        ev->unlinked_me = PTL_HANDLE_NONE;
+
+                /* Copy the event into the allocated slot, ensuring all the
+                 * rest of the event's contents have been copied _before_
+                 * the sequence number gets updated.  A process 'getting'
+                 * an event waits on the next queue slot's sequence to be
+                 * 'new'.  When it is, _all_ other event fields had better
+                 * be consistent.  I assert 'sequence' is the last member,
+                 * so I only need a 2 stage copy.
+                 */
+                LASSERT(sizeof (ptl_event_t) ==
+                        offsetof(ptl_event_t, sequence) + sizeof(ev->sequence));
+
+                rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev,
+                                    offsetof (ptl_event_t, sequence));
+                LASSERT (rc == 0);
+
+#ifdef __KERNEL__
+                barrier();
+#endif
+                /* Updating the sequence number is what makes the event 'new' */
+
+                /* cb_write is not necessarily atomic, so this could
+                   cause a race with PtlEQGet */
+                rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence,
+                                   (void *)&ev->sequence,sizeof (ev->sequence));
+                LASSERT (rc == 0);
+
+#ifdef __KERNEL__
+                barrier();
+#endif
+
+                /* I must also ensure that (a) callbacks are made in the
+                 * same order as the events land in the queue, and (b) the
+                 * callback occurs before the event can be removed from the
+                 * queue, so I can't drop the lock during the callback. */
+                if (nal->cb_callback != NULL)
+                        nal->cb_callback(nal, private, eq, ev);
+                else  if (eq->event_callback != NULL)
+                        (void)((eq->event_callback) (ev));
+        }
+
+        LASSERT ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 ||
+                 (md->md_flags & PTL_MD_FLAG_UNLINK) != 0);
+
+        md->pending--;
+        if (md->pending == 0 && /* no more outstanding operations on this md */
+            (md->threshold == 0 ||              /* done its business */
+             (md->md_flags & PTL_MD_FLAG_UNLINK) != 0)) /* marked for death */
+                lib_md_unlink(nal, md);
+
+        list_del (&msg->msg_list);
+        nal->ni.counters.msgs_alloc--;
+        lib_msg_free(nal, msg);
+
+        state_unlock(nal, &flags);
+
+        return rc;
+}
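
lib_finalize() treats each event queue as a power-of-2 ring indexed by (sequence & (size - 1)) and makes the sequence number the very last field written, behind a barrier, because a consumer decides whether a slot holds a new event purely by inspecting that field. The user-space sketch below shows that publish order; the example_event_t type, the fixed ring, and __sync_synchronize() standing in for barrier()/cb_write() are assumptions for illustration only.

#include <stdint.h>

#define EQ_SIZE 64                      /* must be a power of 2 */

typedef struct {
        uint64_t match_bits;
        uint32_t mlength;
        uint64_t sequence;              /* written last: marks the slot new */
} example_event_t;

static example_event_t eq_base[EQ_SIZE];
static uint64_t        eq_sequence = 1; /* next sequence to allocate */

/* Producer: fill the slot body first, then publish it by writing the
 * sequence number behind a barrier. */
static void publish_event(uint64_t match_bits, uint32_t mlength)
{
        uint64_t         seq  = eq_sequence++;
        example_event_t *slot = &eq_base[seq & (EQ_SIZE - 1)];

        slot->match_bits = match_bits;  /* body first...   */
        slot->mlength    = mlength;

        __sync_synchronize();           /* ...barrier...   */

        slot->sequence   = seq;         /* ...sequence last */
}

/* Consumer: the slot is valid once its sequence equals the expected one.
 * (A real consumer also needs atomic/volatile reads and wrap handling.) */
static int try_get_event(uint64_t expected, example_event_t *ev_out)
{
        example_event_t *slot = &eq_base[expected & (EQ_SIZE - 1)];

        if (slot->sequence != expected)
                return 0;               /* not published yet */

        __sync_synchronize();
        *ev_out = *slot;
        return 1;
}

int main(void)
{
        example_event_t ev;

        publish_event(0x1234, 64);
        return try_get_event(1, &ev) ? 0 : 1;
}
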
diff --git a/lnet/lnet/lib-ni.c b/lnet/lnet/lib-ni.c
new file mode 100644 (file)
index 0000000..aa30329
--- /dev/null
@@ -0,0 +1,128 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-ni.c
+ * Network status registers and distance functions.
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+#define MAX_DIST 18446744073709551615UL
+
+int do_PtlNIDebug(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlNIDebug_in *args = v_args;
+        PtlNIDebug_out *ret = v_ret;
+        lib_ni_t *ni = &nal->ni;
+
+        ret->rc = ni->debug;
+        ni->debug = args->mask_in;
+
+        return 0;
+}
+
+int do_PtlNIStatus(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_ni_t interface_in
+         *      ptl_sr_index_t register_in
+         *
+         * Outgoing:
+         *      ptl_sr_value_t          * status_out
+         */
+
+        PtlNIStatus_in *args = v_args;
+        PtlNIStatus_out *ret = v_ret;
+        lib_ni_t *ni = &nal->ni;
+        lib_counters_t *count = &ni->counters;
+
+        if (!args)
+                return ret->rc = PTL_SEGV;
+
+        ret->rc = PTL_OK;
+        ret->status_out = 0;
+
+        /*
+         * I hate this sort of code....  Hash tables, offset lists?
+         * Treat the counters as an array of ints?
+         */
+        if (args->register_in == PTL_SR_DROP_COUNT)
+                ret->status_out = count->drop_count;
+
+        else if (args->register_in == PTL_SR_DROP_LENGTH)
+                ret->status_out = count->drop_length;
+
+        else if (args->register_in == PTL_SR_RECV_COUNT)
+                ret->status_out = count->recv_count;
+
+        else if (args->register_in == PTL_SR_RECV_LENGTH)
+                ret->status_out = count->recv_length;
+
+        else if (args->register_in == PTL_SR_SEND_COUNT)
+                ret->status_out = count->send_count;
+
+        else if (args->register_in == PTL_SR_SEND_LENGTH)
+                ret->status_out = count->send_length;
+
+        else if (args->register_in == PTL_SR_MSGS_MAX)
+                ret->status_out = count->msgs_max;
+        else
+                ret->rc = PTL_INV_SR_INDX;
+
+        return ret->rc;
+}
+
+
+int do_PtlNIDist(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_ni_t interface_in
+         *      ptl_process_id_t process_in
+         *
+         * Outgoing:
+         *      unsigned long   * distance_out
+         */
+
+        PtlNIDist_in *args = v_args;
+        PtlNIDist_out *ret = v_ret;
+
+        unsigned long dist;
+        ptl_process_id_t id_in = args->process_in;
+        ptl_nid_t nid;
+        int rc;
+
+        nid = id_in.nid;
+
+        if ((rc = nal->cb_dist(nal, nid, &dist)) != 0) {
+                ret->distance_out = (unsigned long) MAX_DIST;
+                return ret->rc = PTL_INV_PROC;
+        }
+
+        ret->distance_out = dist;
+
+        return ret->rc = PTL_OK;
+}
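
The if/else ladder in do_PtlNIStatus() above is exactly the code its own comment grumbles about; one alternative the comment floats is treating the counters as an indexable table. A hedged sketch of that offset-table approach follows, with hypothetical names (the real lib_counters_t fields and PTL_SR_* values may differ).

#include <stddef.h>

enum {
        EX_SR_DROP_COUNT = 0,
        EX_SR_DROP_LENGTH,
        EX_SR_RECV_COUNT,
        EX_SR_RECV_LENGTH,
        EX_SR_SEND_COUNT,
        EX_SR_SEND_LENGTH,
        EX_SR_MSGS_MAX,
        EX_SR_NREGS
};

typedef struct {
        unsigned long drop_count;
        unsigned long drop_length;
        unsigned long recv_count;
        unsigned long recv_length;
        unsigned long send_count;
        unsigned long send_length;
        unsigned long msgs_max;
} example_counters_t;

/* Offset table: one entry per status-register index. */
static const size_t counter_offset[EX_SR_NREGS] = {
        [EX_SR_DROP_COUNT]  = offsetof(example_counters_t, drop_count),
        [EX_SR_DROP_LENGTH] = offsetof(example_counters_t, drop_length),
        [EX_SR_RECV_COUNT]  = offsetof(example_counters_t, recv_count),
        [EX_SR_RECV_LENGTH] = offsetof(example_counters_t, recv_length),
        [EX_SR_SEND_COUNT]  = offsetof(example_counters_t, send_count),
        [EX_SR_SEND_LENGTH] = offsetof(example_counters_t, send_length),
        [EX_SR_MSGS_MAX]    = offsetof(example_counters_t, msgs_max),
};

/* Returns 0 on success, -1 for an invalid register index. */
static int read_counter(const example_counters_t *c, int reg,
                        unsigned long *value_out)
{
        if (reg < 0 || reg >= EX_SR_NREGS)
                return -1;

        *value_out = *(const unsigned long *)((const char *)c +
                                              counter_offset[reg]);
        return 0;
}

int main(void)
{
        example_counters_t c = { .recv_count = 42 };
        unsigned long v;

        return (read_counter(&c, EX_SR_RECV_COUNT, &v) == 0 && v == 42) ? 0 : 1;
}
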
diff --git a/lnet/lnet/lib-pid.c b/lnet/lnet/lib-pid.c
new file mode 100644 (file)
index 0000000..12eebb5
--- /dev/null
@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-pid.c
+ *
+ * Process identification routines
+ * Copyright (C) 2001-2003 Cluster File Systems, Inc.
+ *
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* This should be removed.  The NAL should have the PID information */
+#define DEBUG_SUBSYSTEM S_PORTALS
+
+#if defined (__KERNEL__)
+#       include <linux/kernel.h>
+extern int getpid(void);
+#else
+#       include <stdio.h>
+#       include <unistd.h>
+#endif
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+int do_PtlGetId(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_ni_t handle_in
+         *
+         * Outgoing:
+         *      ptl_process_id_t        * id_out
+         *      ptl_id_t                * gsize_out
+         */
+
+        PtlGetId_out *ret = v_ret;
+        lib_ni_t *ni = &nal->ni;
+
+        ret->id_out.nid = ni->nid;
+        ret->id_out.pid = ni->pid;
+
+        return ret->rc = PTL_OK;
+}
diff --git a/lnet/packaging/.cvsignore b/lnet/packaging/.cvsignore
new file mode 100644 (file)
index 0000000..fd1d56a
--- /dev/null
@@ -0,0 +1,8 @@
+Makefile
+Makefile.in
+aclocal.m4
+config.log
+config.status
+config.cache
+configure
+portals.spec
diff --git a/lnet/packaging/Makefile.am b/lnet/packaging/Makefile.am
new file mode 100644 (file)
index 0000000..126bc69
--- /dev/null
@@ -0,0 +1,6 @@
+# Copyright (C) 2002  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+EXTRA_DIST = portals.spec
\ No newline at end of file
diff --git a/lnet/packaging/portals.spec.in b/lnet/packaging/portals.spec.in
new file mode 100644 (file)
index 0000000..e196b3f
--- /dev/null
@@ -0,0 +1,116 @@
+%define kversion @RELEASE@
+%define linuxdir @LINUX@
+%define version HEAD
+
+Summary: Sandia Portals Message Passing - utilities 
+Name: portals
+Version: %{version}
+Release: 0210101748uml
+Copyright: LGPL
+Group: Utilities/System
+BuildRoot: /var/tmp/portals-%{version}-root
+Source: http://sandiaportals.org/portals-%{version}.tar.gz
+
+%description
+Sandia Portals message passing package.  Contains kernel modules, libraries and utilities. 
+
+%package -n portals-modules
+Summary: Kernel modules and NALs for portals
+Group: Development/Kernel
+
+%description -n portals-modules
+Kernel modules and NALs for Sandia Portals on Linux %{kversion}.
+
+%package -n portals-source
+Summary: Portals kernel source for rebuilding with other kernels
+Group: Development/Kernel
+
+%description -n portals-source
+Portals kernel source for rebuilding with other kernels
+
+%prep
+%setup -n portals-%{version}
+
+%build
+rm -rf $RPM_BUILD_ROOT
+
+# Create the pristine source directory.
+srcdir=$RPM_BUILD_ROOT/usr/src/portals-%{version}
+mkdir -p $srcdir
+find . -name CVS -prune -o -print | cpio -ap $srcdir
+
+# Set an explicit path to our Linux tree, if we can.
+conf_flag=
+linuxdir=%{linuxdir}
+test -d $linuxdir && conf_flag=--with-linux=$linuxdir
+./configure $conf_flag
+make 
+
+%install
+make install prefix=$RPM_BUILD_ROOT
+
+%ifarch alpha
+# this hurts me
+  conf_flag=
+  linuxdir=%{linuxdir}
+  test -d $linuxdir && conf_flag=--with-linux=$linuxdir
+  make clean
+  ./configure --enable-rtscts-myrinet $conf_flag
+  make
+  cp linux/rtscts/rtscts.o $RPM_BUILD_ROOT/lib/modules/%{kversion}/kernel/net/portals/rtscts_myrinet.o
+  cp user/myrinet_utils/mcpload $RPM_BUILD_ROOT/usr/sbin/mcpload
+%endif
+
+
+%files
+%attr(-, root, root) %doc COPYING
+%attr(-, root, root) /usr/sbin/acceptor
+%attr(-, root, root) /usr/sbin/ptlctl
+%attr(-, root, root) /usr/sbin/debugctl
+%ifarch alpha
+%attr(-, root, root) /usr/sbin/mcpload
+%endif
+%attr(-, root, root) /lib/libmyrnal.a
+%attr(-, root, root) /lib/libptlapi.a
+%attr(-, root, root) /lib/libptlctl.a
+%attr(-, root, root) /lib/libprocbridge.a
+%attr(-, root, root) /lib/libptllib.a
+%attr(-, root, root) /lib/libtcpnal.a 
+%attr(-, root, root) /lib/libtcpnalutil.a
+%attr(-, root, root) /usr/include/portals/*.h
+%attr(-, root, root) /usr/include/portals/base/*.h
+%attr(-, root, root) /usr/include/linux/*.h
+
+%files -n portals-modules
+%attr(-, root, root) %doc COPYING
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/portals.o
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptlrouter.o
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptrxtx.o
+%ifarch alpha
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/p3mod.o
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/rtscts.o
+%endif
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/*nal.o
+
+%files -n portals-source
+%attr(-, root, root) /usr/src/portals-%{version}
+
+%post
+if [ ! -e /dev/portals ]; then
+   mknod /dev/portals c 10 240
+fi
+depmod -ae || exit 0
+
+grep -q portals /etc/modules.conf || \
+       echo 'alias char-major-10-240 portals' >> /etc/modules.conf
+
+grep -q '/dev/portals' /etc/modules.conf || \
+       echo 'alias /dev/portals portals' >> /etc/modules.conf
+
+%postun
+depmod -ae || exit 0
+
+%clean
+#rm -rf $RPM_BUILD_ROOT
+
+# end of file
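
The %post scriptlet above makes the portals control device available by creating character node 10/240 and teaching modules.conf to autoload the module for it. The same node creation expressed in C is sketched below; it is only an illustration of what the scriptlet's mknod invocation does, and the 0600 mode is an arbitrary choice here.

#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
        /* Roughly "mknod /dev/portals c 10 240" from the %post scriptlet. */
        if (mknod("/dev/portals", S_IFCHR | 0600, makedev(10, 240)) != 0 &&
            errno != EEXIST) {
                perror("mknod /dev/portals");
                return 1;
        }
        return 0;
}
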
diff --git a/lnet/router/.cvsignore b/lnet/router/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lnet/router/Makefile.am b/lnet/router/Makefile.am
new file mode 100644 (file)
index 0000000..1c8087b
--- /dev/null
@@ -0,0 +1,16 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Rules.linux
+
+MODULE = kptlrouter
+modulenet_DATA = kptlrouter.o
+EXTRA_PROGRAMS = kptlrouter
+
+
+#CFLAGS:= @KCFLAGS@ 
+#CPPFLAGS:=@KCPPFLAGS@
+DEFS =
+kptlrouter_SOURCES = router.c proc.c router.h
diff --git a/lnet/router/Makefile.mk b/lnet/router/Makefile.mk
new file mode 100644 (file)
index 0000000..64bd09b
--- /dev/null
@@ -0,0 +1,9 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Kernelenv
+
+obj-y += kptlrouter.o
+kptlrouter-objs    := router.o proc.o
diff --git a/lnet/router/proc.c b/lnet/router/proc.c
new file mode 100644 (file)
index 0000000..dd65b34
--- /dev/null
@@ -0,0 +1,78 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "router.h"
+
+#define KPR_PROC_ROUTER "sys/portals/router"
+
+int
+kpr_proc_read (char *page, char **start, off_t off, int count, int *eof, void *data)
+{
+       unsigned long long bytes = kpr_fwd_bytes;
+       unsigned long      packets = kpr_fwd_packets;
+       unsigned long      errors = kpr_fwd_errors;
+        unsigned int       qdepth = atomic_read (&kpr_queue_depth);
+       int                len;
+       
+       *eof = 1;
+       if (off != 0)
+               return (0);
+       
+       len = sprintf (page, "%Ld %ld %ld %d\n", bytes, packets, errors, qdepth);
+       
+       *start = page;
+       return (len);
+}
+
+int
+kpr_proc_write (struct file *file, const char *ubuffer, unsigned long count, void *data)
+{
+       /* Ignore what we've been asked to write, and just zero the stats counters */
+       kpr_fwd_bytes = 0;
+       kpr_fwd_packets = 0;
+       kpr_fwd_errors = 0;
+
+       return (count);
+}
+
+void
+kpr_proc_init(void)
+{
+        struct proc_dir_entry *entry = create_proc_entry (KPR_PROC_ROUTER, S_IFREG | S_IRUGO | S_IWUSR, NULL);
+
+        if (entry == NULL) 
+       {
+                CERROR("couldn't create proc entry %s\n", KPR_PROC_ROUTER);
+                return;
+        }
+
+        entry->data = NULL;
+        entry->read_proc = kpr_proc_read;
+       entry->write_proc = kpr_proc_write;
+}
+
+void 
+kpr_proc_fini(void)
+{
+        remove_proc_entry(KPR_PROC_ROUTER, 0);
+}
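
kpr_proc_read() exports the forwarding statistics as a single line, "bytes packets errors queue-depth", and any write to the file simply zeroes the first three counters. Assuming the entry appears as /proc/sys/portals/router (per KPR_PROC_ROUTER with a NULL parent), a user-space reader could look like this sketch.

#include <stdio.h>

int main(void)
{
        unsigned long long bytes;
        unsigned long      packets, errors;
        unsigned int       qdepth;
        FILE *f = fopen("/proc/sys/portals/router", "r");

        if (f == NULL) {
                perror("fopen");
                return 1;
        }

        /* One line: "<bytes> <packets> <errors> <queue depth>" */
        if (fscanf(f, "%llu %lu %lu %u", &bytes, &packets, &errors,
                   &qdepth) != 4) {
                fprintf(stderr, "unexpected format\n");
                fclose(f);
                return 1;
        }
        fclose(f);

        printf("forwarded %llu bytes in %lu packets, %lu errors, %u queued\n",
               bytes, packets, errors, qdepth);
        return 0;
}
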
diff --git a/lnet/router/router.c b/lnet/router/router.c
new file mode 100644 (file)
index 0000000..6074c3c
--- /dev/null
@@ -0,0 +1,449 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "router.h"
+
+struct list_head kpr_routes;
+struct list_head kpr_nals;
+
+unsigned long long kpr_fwd_bytes;
+unsigned long      kpr_fwd_packets;
+unsigned long      kpr_fwd_errors;
+atomic_t           kpr_queue_depth;
+
+/* Mostly the tables are read-only (thread and interrupt context)
+ *
+ * Once in a blue moon we register/deregister NALs and add/remove routing
+ * entries (thread context only)... */
+rwlock_t         kpr_rwlock;
+
+kpr_router_interface_t kpr_router_interface = {
+       kprri_register:         kpr_register_nal,
+       kprri_lookup:           kpr_lookup_target,
+       kprri_fwd_start:        kpr_forward_packet,
+       kprri_fwd_done:         kpr_complete_packet,
+       kprri_shutdown:         kpr_shutdown_nal,
+       kprri_deregister:       kpr_deregister_nal,
+};
+
+kpr_control_interface_t kpr_control_interface = {
+       kprci_add_route:        kpr_add_route,
+       kprci_del_route:        kpr_del_route,
+       kprci_get_route:        kpr_get_route,
+};
+
+int
+kpr_register_nal (kpr_nal_interface_t *nalif, void **argp)
+{
+       long               flags;
+       struct list_head  *e;
+       kpr_nal_entry_t   *ne;
+
+        CDEBUG (D_OTHER, "Registering NAL %d\n", nalif->kprni_nalid);
+
+       PORTAL_ALLOC (ne, sizeof (*ne));
+       if (ne == NULL)
+               return (-ENOMEM);
+
+       memset (ne, 0, sizeof (*ne));
+        memcpy ((void *)&ne->kpne_interface, (void *)nalif, sizeof (*nalif));
+
+       LASSERT (!in_interrupt());
+       write_lock_irqsave (&kpr_rwlock, flags);
+
+       for (e = kpr_nals.next; e != &kpr_nals; e = e->next)
+       {
+               kpr_nal_entry_t *ne2 = list_entry (e, kpr_nal_entry_t, kpne_list);
+
+               if (ne2->kpne_interface.kprni_nalid == ne->kpne_interface.kprni_nalid)
+               {
+                       write_unlock_irqrestore (&kpr_rwlock, flags);
+
+                       CERROR ("Attempt to register same NAL %d twice\n", ne->kpne_interface.kprni_nalid);
+
+                       PORTAL_FREE (ne, sizeof (*ne));
+                       return (-EEXIST);
+               }
+       }
+
+        list_add (&ne->kpne_list, &kpr_nals);
+
+       write_unlock_irqrestore (&kpr_rwlock, flags);
+
+       *argp = ne;
+       PORTAL_MODULE_USE;
+        return (0);
+}
+
+void
+kpr_shutdown_nal (void *arg)
+{
+       long             flags;
+       kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
+
+        CDEBUG (D_OTHER, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid);
+
+       LASSERT (!ne->kpne_shutdown);
+       LASSERT (!in_interrupt());
+
+       write_lock_irqsave (&kpr_rwlock, flags); /* locking a bit spurious... */
+       ne->kpne_shutdown = 1;
+       write_unlock_irqrestore (&kpr_rwlock, flags); /* except it's a memory barrier */
+
+       while (atomic_read (&ne->kpne_refcount) != 0)
+       {
+               CDEBUG (D_NET, "Waiting for refcount on NAL %d to reach zero (%d)\n",
+                       ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount));
+
+               set_current_state (TASK_UNINTERRUPTIBLE);
+               schedule_timeout (HZ);
+       }
+}
+
+void
+kpr_deregister_nal (void *arg)
+{
+       long              flags;
+       kpr_nal_entry_t  *ne = (kpr_nal_entry_t *)arg;
+
+        CDEBUG (D_OTHER, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid);
+
+       LASSERT (ne->kpne_shutdown);            /* caller must have issued shutdown already */
+       LASSERT (atomic_read (&ne->kpne_refcount) == 0); /* can't be busy */
+       LASSERT (!in_interrupt());
+
+       write_lock_irqsave (&kpr_rwlock, flags);
+
+       list_del (&ne->kpne_list);
+
+       write_unlock_irqrestore (&kpr_rwlock, flags);
+
+       PORTAL_FREE (ne, sizeof (*ne));
+        PORTAL_MODULE_UNUSE;
+}
+
+
+int
+kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp)
+{
+       kpr_nal_entry_t  *ne = (kpr_nal_entry_t *)arg;
+       struct list_head *e;
+       int               rc = -ENOENT;
+
+        CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d\n", target_nid, ne->kpne_interface.kprni_nalid);
+
+       if (ne->kpne_shutdown)          /* caller is shutting down */
+               return (-ENOENT);
+
+       read_lock (&kpr_rwlock);
+
+       /* Search routes for one that has a gateway to target_nid on the callers network */
+
+       for (e = kpr_routes.next; e != &kpr_routes; e = e->next)
+       {
+               kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list);
+
+               if (re->kpre_lo_nid > target_nid ||
+                    re->kpre_hi_nid < target_nid)
+                       continue;
+
+               /* found table entry */
+
+               if (re->kpre_gateway_nalid != ne->kpne_interface.kprni_nalid) /* different NAL */
+                       rc = -EHOSTUNREACH;
+               else
+               {
+                       rc = 0;
+                       *gateway_nidp = re->kpre_gateway_nid;
+               }
+               break;
+       }
+
+       read_unlock (&kpr_rwlock);
+
+        CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d: %d ("LPX64")\n",
+                target_nid, ne->kpne_interface.kprni_nalid, rc,
+                (rc == 0) ? *gateway_nidp : (ptl_nid_t)0);
+       return (rc);
+}
+
+void
+kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+       kpr_nal_entry_t  *src_ne = (kpr_nal_entry_t *)arg;
+       ptl_nid_t         target_nid = fwd->kprfd_target_nid;
+        int               nob = fwd->kprfd_nob;
+       struct list_head *e;
+
+        CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d\n", fwd,
+                target_nid, src_ne->kpne_interface.kprni_nalid);
+
+        LASSERT (nob >= sizeof (ptl_hdr_t)); /* at least got a packet header */
+        LASSERT (nob == lib_iov_nob (fwd->kprfd_niov, fwd->kprfd_iov));
+        
+        atomic_inc (&kpr_queue_depth);
+       atomic_inc (&src_ne->kpne_refcount); /* source nal is busy until fwd completes */
+
+        kpr_fwd_packets++;                   /* (loose) stats accounting */
+        kpr_fwd_bytes += nob;
+
+       if (src_ne->kpne_shutdown)           /* caller is shutting down */
+               goto out;
+
+       fwd->kprfd_router_arg = src_ne;      /* stash caller's nal entry */
+
+       read_lock (&kpr_rwlock);
+
+       /* Search routes for one that has a gateway to target_nid NOT on the caller's network */
+
+       for (e = kpr_routes.next; e != &kpr_routes; e = e->next)
+       {
+               kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list);
+
+               if (re->kpre_lo_nid > target_nid || /* no match */
+                    re->kpre_hi_nid < target_nid)
+                       continue;
+
+                CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: match "LPX64" on NAL %d\n", fwd,
+                        target_nid, src_ne->kpne_interface.kprni_nalid,
+                        re->kpre_gateway_nid, re->kpre_gateway_nalid);
+
+               if (re->kpre_gateway_nalid == src_ne->kpne_interface.kprni_nalid)
+                       break;                  /* don't route to same NAL */
+
+               /* Search for gateway's NAL's entry */
+
+               for (e = kpr_nals.next; e != &kpr_nals; e = e->next)
+               {
+                       kpr_nal_entry_t *dst_ne = list_entry (e, kpr_nal_entry_t, kpne_list);
+
+                       if (re->kpre_gateway_nalid != dst_ne->kpne_interface.kprni_nalid) /* no match */
+                               continue;
+
+                       if (dst_ne->kpne_shutdown) /* don't route if NAL is shutting down */
+                               break;
+
+                       fwd->kprfd_gateway_nid = re->kpre_gateway_nid;
+                       atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */
+
+                       read_unlock (&kpr_rwlock);
+
+                        CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: "LPX64" on NAL %d\n", fwd,
+                                target_nid, src_ne->kpne_interface.kprni_nalid,
+                                fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid);
+
+                       dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd);
+                       return;
+               }
+               break;
+       }
+
+       read_unlock (&kpr_rwlock);
+ out:
+        kpr_fwd_errors++;
+
+        CDEBUG (D_OTHER, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd,
+                target_nid, src_ne->kpne_interface.kprni_nalid);
+
+       /* Can't find anywhere to forward to */
+       (fwd->kprfd_callback)(fwd->kprfd_callback_arg, -EHOSTUNREACH);
+
+        atomic_dec (&kpr_queue_depth);
+       atomic_dec (&src_ne->kpne_refcount);
+}
+
+void
+kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error)
+{
+       kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg;
+       kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg;
+
+        CDEBUG (D_OTHER, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd,
+                src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error);
+
+       atomic_dec (&dst_ne->kpne_refcount);    /* CAVEAT EMPTOR dst_ne can disappear now!!! */
+
+       (fwd->kprfd_callback)(fwd->kprfd_callback_arg, error);
+
+        CDEBUG (D_OTHER, "complete(2) [%p] from NAL %d: %d\n", fwd,
+                src_ne->kpne_interface.kprni_nalid, error);
+
+        atomic_dec (&kpr_queue_depth);
+       atomic_dec (&src_ne->kpne_refcount);    /* CAVEAT EMPTOR src_ne can disappear now!!! */
+}
+
+int
+kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid,
+               ptl_nid_t hi_nid)
+{
+       long               flags;
+       struct list_head  *e;
+       kpr_route_entry_t *re;
+
+        CDEBUG(D_OTHER, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n",
+               gateway_nalid, gateway_nid, lo_nid, hi_nid);
+
+        LASSERT(lo_nid <= hi_nid);
+
+        PORTAL_ALLOC (re, sizeof (*re));
+        if (re == NULL)
+                return (-ENOMEM);
+
+        re->kpre_gateway_nalid = gateway_nalid;
+        re->kpre_gateway_nid = gateway_nid;
+        re->kpre_lo_nid = lo_nid;
+        re->kpre_hi_nid = hi_nid;
+
+        LASSERT(!in_interrupt());
+       write_lock_irqsave (&kpr_rwlock, flags);
+
+        for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
+                kpr_route_entry_t *re2 = list_entry(e, kpr_route_entry_t,
+                                                    kpre_list);
+
+                if (re->kpre_lo_nid > re2->kpre_hi_nid ||
+                    re->kpre_hi_nid < re2->kpre_lo_nid)
+                        continue;
+
+                CERROR ("Attempt to add duplicate routes ["LPX64" - "LPX64"]"
+                        "to ["LPX64" - "LPX64"]\n",
+                        re->kpre_lo_nid, re->kpre_hi_nid,
+                        re2->kpre_lo_nid, re2->kpre_hi_nid);
+
+                write_unlock_irqrestore (&kpr_rwlock, flags);
+
+                PORTAL_FREE (re, sizeof (*re));
+                return (-EINVAL);
+        }
+
+        list_add (&re->kpre_list, &kpr_routes);
+
+        write_unlock_irqrestore (&kpr_rwlock, flags);
+        return (0);
+}
+
+int
+kpr_del_route (ptl_nid_t nid)
+{
+       long               flags;
+       struct list_head  *e;
+
+        CDEBUG(D_OTHER, "Del route "LPX64"\n", nid);
+
+        LASSERT(!in_interrupt());
+       write_lock_irqsave(&kpr_rwlock, flags);
+
+        for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
+                kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t,
+                                                   kpre_list);
+
+                if (re->kpre_lo_nid > nid || re->kpre_hi_nid < nid)
+                        continue;
+
+                list_del (&re->kpre_list);
+                write_unlock_irqrestore(&kpr_rwlock, flags);
+
+                PORTAL_FREE(re, sizeof (*re));
+                return (0);
+        }
+
+        write_unlock_irqrestore(&kpr_rwlock, flags);
+        return (-ENOENT);
+}
+
+int
+kpr_get_route(int idx, int *gateway_nalid, ptl_nid_t *gateway_nid,
+              ptl_nid_t *lo_nid, ptl_nid_t *hi_nid)
+{
+       struct list_head  *e;
+
+       read_lock(&kpr_rwlock);
+
+        for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
+                kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t,
+                                                   kpre_list);
+
+                if (idx-- == 0) {
+                        *gateway_nalid = re->kpre_gateway_nalid;
+                        *gateway_nid = re->kpre_gateway_nid;
+                        *lo_nid = re->kpre_lo_nid;
+                        *hi_nid = re->kpre_hi_nid;
+
+                        read_unlock(&kpr_rwlock);
+                        return (0);
+                }
+        }
+
+        read_unlock (&kpr_rwlock);
+        return (-ENOENT);
+}
+
+static void __exit
+kpr_finalise (void)
+{
+        LASSERT (list_empty (&kpr_nals));
+
+        while (!list_empty (&kpr_routes)) {
+                kpr_route_entry_t *re = list_entry(kpr_routes.next,
+                                                   kpr_route_entry_t,
+                                                   kpre_list);
+
+                list_del(&re->kpre_list);
+                PORTAL_FREE(re, sizeof (*re));
+        }
+
+        kpr_proc_fini();
+
+        PORTAL_SYMBOL_UNREGISTER(kpr_router_interface);
+        PORTAL_SYMBOL_UNREGISTER(kpr_control_interface);
+
+        CDEBUG(D_MALLOC, "kpr_finalise: kmem back to %d\n",
+               atomic_read(&portal_kmemory));
+}
+
+static int __init
+kpr_initialise (void)
+{
+        CDEBUG(D_MALLOC, "kpr_initialise: kmem %d\n",
+               atomic_read(&portal_kmemory));
+
+       rwlock_init(&kpr_rwlock);
+       INIT_LIST_HEAD(&kpr_routes);
+       INIT_LIST_HEAD(&kpr_nals);
+
+        kpr_proc_init();
+
+        PORTAL_SYMBOL_REGISTER(kpr_router_interface);
+        PORTAL_SYMBOL_REGISTER(kpr_control_interface);
+        return (0);
+}
+
+MODULE_AUTHOR("Eric Barton");
+MODULE_DESCRIPTION("Kernel Portals Router v0.01");
+MODULE_LICENSE("GPL");
+
+module_init (kpr_initialise);
+module_exit (kpr_finalise);
+
+EXPORT_SYMBOL (kpr_control_interface);
+EXPORT_SYMBOL (kpr_router_interface);
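
kpr_add_route() above rejects a new entry whose inclusive NID range overlaps an existing one, using the usual interval test: two ranges are disjoint exactly when one ends before the other begins. A tiny stand-alone illustration of that check, with ptl_nid_t replaced by a hypothetical typedef:

#include <stdio.h>

typedef unsigned long long example_nid_t;   /* stand-in for ptl_nid_t */

/* Inclusive ranges [lo1,hi1] and [lo2,hi2] overlap unless one ends
 * before the other starts. */
static int ranges_overlap(example_nid_t lo1, example_nid_t hi1,
                          example_nid_t lo2, example_nid_t hi2)
{
        return !(lo1 > hi2 || hi1 < lo2);
}

int main(void)
{
        /* [0x10,0x1f] vs [0x20,0x2f]: disjoint, so both may be added. */
        printf("%d\n", ranges_overlap(0x10, 0x1f, 0x20, 0x2f));   /* 0 */

        /* [0x10,0x1f] vs [0x18,0x40]: overlap; kpr_add_route() returns -EINVAL. */
        printf("%d\n", ranges_overlap(0x10, 0x1f, 0x18, 0x40));   /* 1 */
        return 0;
}
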
diff --git a/lnet/router/router.h b/lnet/router/router.h
new file mode 100644 (file)
index 0000000..b8c3bec
--- /dev/null
@@ -0,0 +1,81 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef _KPTLROUTER_H
+#define _KPTLROUTER_H
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+
+#define DEBUG_SUBSYSTEM S_PTLROUTER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+typedef struct
+{
+       struct list_head        kpne_list;
+       kpr_nal_interface_t     kpne_interface;
+       atomic_t                kpne_refcount;
+       int                     kpne_shutdown;
+} kpr_nal_entry_t;
+
+typedef struct
+{
+       struct list_head        kpre_list;
+       int                     kpre_gateway_nalid;
+       ptl_nid_t               kpre_gateway_nid;
+       ptl_nid_t               kpre_lo_nid;
+        ptl_nid_t               kpre_hi_nid;
+} kpr_route_entry_t;
+
+extern int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp);
+extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp);
+extern void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd);
+extern void kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error);
+extern void kpr_shutdown_nal (void *arg);
+extern void kpr_deregister_nal (void *arg);
+
+extern void kpr_proc_init (void);
+extern void kpr_proc_fini (void);
+
+extern int kpr_add_route (int gateway_nal, ptl_nid_t gateway_nid, 
+                          ptl_nid_t lo_nid, ptl_nid_t hi_nid);
+extern int kpr_del_route (ptl_nid_t nid);
+extern int kpr_get_route (int idx, int *gateway_nal, ptl_nid_t *gateway_nid, 
+                          ptl_nid_t *lo_nid, ptl_nid_t *hi_nid);
+
+extern unsigned long long kpr_fwd_bytes;
+extern unsigned long      kpr_fwd_packets;
+extern unsigned long      kpr_fwd_errors;
+extern atomic_t           kpr_queue_depth;
+
+#endif /* _KPTLROUTER_H */
diff --git a/lnet/tests/.cvsignore b/lnet/tests/.cvsignore
new file mode 100644 (file)
index 0000000..051d1bd
--- /dev/null
@@ -0,0 +1,3 @@
+Makefile
+Makefile.in
+.deps
diff --git a/lnet/tests/Makefile.am b/lnet/tests/Makefile.am
new file mode 100644 (file)
index 0000000..7b47ae0
--- /dev/null
@@ -0,0 +1,23 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Rules.linux
+
+LDFLAGS = -m "`$(LD) --help | awk '/supported emulations/ {print $$4}'`" -r
+LINK = $(LD) $(LDFLAGS) -o $@
+DEFS =
+LIBS =
+MODULE = $(basename)
+EXTRA_DIST = startserver.sh startclient.sh stopserver.sh stopclient.sh
+
+noinst_PROGRAMS = pingsrv.o pingcli.o spingsrv.o spingcli.o 
+
+pingsrv_o_SOURCES = ping_srv.c ping.h
+
+pingcli_o_SOURCES = ping_cli.c ping.h
+
+spingsrv_o_SOURCES = sping_srv.c ping.h
+
+spingcli_o_SOURCES = sping_cli.c ping.h
diff --git a/lnet/tests/ping.h b/lnet/tests/ping.h
new file mode 100644 (file)
index 0000000..f07444b
--- /dev/null
@@ -0,0 +1,80 @@
+#ifndef _KPING_INCLUDED
+#define _KPING_INCLUDED
+
+#include <portals/p30.h>
+
+
+#define PTL_PING_IN_SIZE               256     // n packets per buffer
+#define PTL_PING_IN_BUFFERS            2       // n fallback buffers
+
+#define PTL_PING_CLIENT                        4
+#define PTL_PING_SERVER                        5
+
+#define PING_HEADER_MAGIC              0xDEADBEEF
+#define PING_BULK_MAGIC                        0xCAFEBABE
+
+#define PING_HEAD_BITS                 0x00000001
+#define PING_BULK_BITS                 0x00000002
+#define PING_IGNORE_BITS               0xFFFFFFFC
+
+#define PTL_PING_ACK                   0x01
+#define PTL_PING_VERBOSE               0x02
+#define PTL_PING_VERIFY                        0x04
+#define PTL_PING_PREALLOC              0x08
+
+
+#define NEXT_PRIMARY_BUFFER(index)             \
+       (((index + 1) >= PTL_PING_IN_BUFFERS) ? 0 : (index + 1))
+
+#define PDEBUG(str, err)                       \
+       CERROR ("%s: error=%s (%d)\n", str, ptl_err_str[err], err)
+
+
+/* Ping data to be passed via the ioctl to kernel space */
+
+#if __KERNEL__
+
+
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+#include <linux/workqueue.h>
+#else
+#include <linux/tqueue.h>
+#endif
+struct pingsrv_data {
+        
+        ptl_handle_ni_t         ni;
+        ptl_handle_me_t         me;
+        ptl_handle_eq_t         eq;
+        void                   *in_buf;
+        ptl_process_id_t        my_id;
+        ptl_process_id_t        id_local;
+        ptl_md_t                mdin;
+        ptl_md_t                mdout;
+        ptl_handle_md_t         mdin_h;
+        ptl_handle_md_t         mdout_h;
+        ptl_event_t             evnt;
+        struct task_struct     *tsk;
+}; /* struct pingsrv_data */
+struct pingcli_data {
+        
+        struct portal_ioctl_data *args;
+        ptl_handle_me_t        me;
+        ptl_handle_eq_t                eq;
+        char                          *inbuf;    
+        char                   *outbuf;   
+        ptl_process_id_t       myid; 
+        ptl_process_id_t       id_local; 
+        ptl_process_id_t       id_remote;
+        ptl_md_t               md_in_head;
+        ptl_md_t               md_out_head;
+        ptl_handle_md_t        md_in_head_h;
+        ptl_handle_md_t        md_out_head_h;
+        ptl_event_t            ev;
+        struct task_struct     *tsk;
+}; /* struct pingcli_data */
+
+
+#endif /* __KERNEL__ */
+
+#endif /* _KPING_INCLUDED */
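
The client and server that follow both treat a ping packet as a small header, a 32-bit magic, a 32-bit sequence number and a struct timeval timestamp (which is what the STDSIZE macro in ping_cli.c and ping_srv.c adds up to), followed by ioc_size bytes of payload. The real code packs those fields with memcpy() at byte offsets; the sketch below spells the same layout out as a struct purely for illustration.

#include <string.h>
#include <sys/time.h>

#define EX_PING_HEADER_MAGIC 0xDEADBEEF

struct example_ping_hdr {
        unsigned int   magic;     /* PING_HEADER_MAGIC on requests        */
        unsigned int   seq;       /* message number, echoed by the server */
        struct timeval sent;      /* client send time, for RTT reporting  */
        /* payload of ioc_size bytes follows on the wire */
};

/* Fill a buffer the way pingcli_start() does with memcpy(). */
static void example_pack(char *buf, unsigned int seq)
{
        unsigned int magic = EX_PING_HEADER_MAGIC;
        struct timeval now;

        gettimeofday(&now, NULL);
        memcpy(buf, &magic, sizeof(magic));
        memcpy(buf + sizeof(unsigned int), &seq, sizeof(seq));
        memcpy(buf + 2 * sizeof(unsigned int), &now, sizeof(now));
}

int main(void)
{
        char buf[sizeof(struct example_ping_hdr)];

        example_pack(buf, 0);
        return 0;
}
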
diff --git a/lnet/tests/ping_cli.c b/lnet/tests/ping_cli.c
new file mode 100644 (file)
index 0000000..389ffbb
--- /dev/null
@@ -0,0 +1,300 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf <behlendorf1@llnl.gov>
+ *         Kedar Sovani (kedar@calsoftinc.com)
+ *         Amey Inamdar (amey@calsoftinc.com)
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include "ping.h"
+/* int portal_debug = D_PING_CLI;  */
+
+
+#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval))
+
+#define MAX_TIME 100000
+
+/* This should be enclosed in a structure */
+
+static struct pingcli_data *client = NULL;
+
+static int count = 0;
+
+static void
+pingcli_shutdown(int err)
+{
+        int rc;
+
+        /* Yes, we are intentionally allowing us to fall through each
+         * case into the next.  This allows us to pass an error
+         * code to just clean up the right stuff.
+         */
+        switch (err) {
+                case 1:
+                        /* Unlink any memory descriptors we may have used */
+                        if ((rc = PtlMDUnlink (client->md_out_head_h)))
+                                PDEBUG ("PtlMDUnlink", rc);
+                case 2:
+                        if ((rc = PtlMDUnlink (client->md_in_head_h)))
+                                PDEBUG ("PtlMDUnlink", rc);
+
+                        /* Free the event queue */
+                        if ((rc = PtlEQFree (client->eq)))
+                                PDEBUG ("PtlEQFree", rc);
+
+                        if ((rc = PtlMEUnlink (client->me)))
+                                PDEBUG ("PtlMEUnlink", rc);
+                case 3:
+                        kportal_put_ni (client->args->ioc_nal);
+
+                case 4:
+                        /* Free our buffers */
+
+                        if (client != NULL)
+                                PORTAL_FREE (client,
+                                                sizeof(struct pingcli_data));
+        }
+
+
+        CDEBUG (D_OTHER, "ping client released resources\n");
+} /* pingcli_shutdown() */
+
+static int pingcli_callback(ptl_event_t *ev)
+{
+        int i, magic;
+        i = *(int *)(ev->mem_desc.start + ev->offset + sizeof(unsigned));
+        magic = *(int *)(ev->mem_desc.start + ev->offset);
+
+        if(magic != 0xcafebabe) {
+                printk ("Unexpected response \n");
+                return 1;
+        }
+
+        if((i == count) || !count)
+                wake_up_process (client->tsk);
+        else
+                printk ("Received response after timeout for %d\n",i);
+        return 1;
+}
+
+
+static struct pingcli_data *
+pingcli_start(struct portal_ioctl_data *args)
+{
+        ptl_handle_ni_t *nip;
+        unsigned ping_head_magic = PING_HEADER_MAGIC;
+        unsigned ping_bulk_magic = PING_BULK_MAGIC;
+        int rc;
+        struct timeval tv1, tv2;
+        client->tsk = current;
+        client->args = args;
+        CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64",  \
+                        nal %d, size %u, count: %u, timeout: %u\n",
+                        args->ioc_nid, args->ioc_nal, args->ioc_size,
+                        args->ioc_count, args->ioc_timeout);
+
+
+        PORTAL_ALLOC (client->outbuf, STDSIZE + args->ioc_size) ;
+        if (client->outbuf == NULL)
+        {
+                CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        PORTAL_ALLOC (client->inbuf,
+                        (args->ioc_size + STDSIZE) * args->ioc_count);
+        if (client->inbuf == NULL)
+        {
+                CERROR ("Unable to allocate in_buf ("LPSZ" bytes)\n", STDSIZE);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        /* Acquire and initialize the proper nal for portals. */
+        if ((nip = kportal_get_ni (args->ioc_nal)) == NULL)
+        {
+                CERROR ("NAL %d not loaded\n", args->ioc_nal);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        /* Based on the initialization, acquire our unique portal ID. */
+        if ((rc = PtlGetId (*nip, &client->myid)))
+        {
+                CERROR ("PtlGetId error %d\n", rc);
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+
+        /* Setup the local match entries */
+        client->id_local.nid = PTL_NID_ANY;
+        client->id_local.pid = PTL_PID_ANY;
+
+        /* Setup the remote match entries */
+        client->id_remote.nid = args->ioc_nid;
+        client->id_remote.pid = 0;
+
+        if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT,
+                   client->id_local, 0, ~0, PTL_RETAIN,
+                   PTL_INS_AFTER, &client->me)))
+        {
+                CERROR ("PtlMEAttach error %d\n", rc);
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+
+        /* Allocate the event queue for this network interface */
+        if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq)))
+        {
+                CERROR ("PtlEQAlloc error %d\n", rc);
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+
+        count = args->ioc_count;
+
+        client->md_in_head.start     = client->inbuf;
+        client->md_in_head.length    = (args->ioc_size + STDSIZE)
+                                                * count;
+        client->md_in_head.threshold = PTL_MD_THRESH_INF;
+        client->md_in_head.options   = PTL_MD_OP_PUT;
+        client->md_in_head.user_ptr  = NULL;
+        client->md_in_head.eventq    = client->eq;
+        memset (client->inbuf, 0, (args->ioc_size + STDSIZE) * count);
+
+        /* Attach the incoming buffer */
+        if ((rc = PtlMDAttach (client->me, client->md_in_head,
+                              PTL_UNLINK, &client->md_in_head_h))) {
+                CERROR ("PtlMDAttach error %d\n", rc);
+                pingcli_shutdown (1);
+                return (NULL);
+        }
+        /* Setup the outgoing ping header */
+        client->md_out_head.start     = client->outbuf;
+        client->md_out_head.length    = STDSIZE + args->ioc_size;
+        client->md_out_head.threshold = args->ioc_count;
+        client->md_out_head.options   = PTL_MD_OP_PUT;
+        client->md_out_head.user_ptr  = NULL;
+        client->md_out_head.eventq    = PTL_EQ_NONE;
+
+        memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic));
+
+        count = 0;
+
+        /* Bind the outgoing ping header */
+        if ((rc=PtlMDBind (*nip, client->md_out_head,
+                                        &client->md_out_head_h))) {
+                CERROR ("PtlMDBind error %d\n", rc);
+                pingcli_shutdown (1);
+                return NULL;
+        }
+        while ((args->ioc_count - count)) {
+                memcpy (client->outbuf + sizeof(unsigned),
+                       &(count), sizeof(unsigned));
+                 /* Put the ping packet */
+                do_gettimeofday (&tv1);
+
+                memcpy(client->outbuf+sizeof(unsigned)+sizeof(unsigned),&tv1,
+                       sizeof(struct timeval));
+
+                if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ,
+                          client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) {
+                         PDEBUG ("PtlPut (header)", rc);
+                         pingcli_shutdown (1);
+                         return NULL;
+                }
+                printk ("sent msg no %d", count);
+
+                set_current_state (TASK_INTERRUPTIBLE);
+                rc = schedule_timeout (20 * args->ioc_timeout);
+                if (rc == 0) {
+                        printk ("   ::  timeout .....\n");
+                } else {
+                        do_gettimeofday (&tv2);
+                        printk("   ::  Reply in %u usec\n",
+                                (unsigned)((tv2.tv_sec - tv1.tv_sec)
+                                 * 1000000 +  (tv2.tv_usec - tv1.tv_usec)));
+                }
+                count++;
+        }
+
+        if (client->outbuf != NULL)
+                PORTAL_FREE (client->outbuf, STDSIZE + args->ioc_size);
+
+        if (client->inbuf != NULL)
+                PORTAL_FREE (client->inbuf,
+                               (args->ioc_size + STDSIZE) * args->ioc_count);
+
+        pingcli_shutdown (2);
+
+        /* Success! */
+        return NULL;
+} /* pingcli_start() */
+
+
+
+/* called by the portals_ioctl for ping requests */
+static int kping_client(struct portal_ioctl_data *args)
+{
+        PORTAL_ALLOC (client, sizeof(struct pingcli_data));
+        if (client == NULL)
+        {
+                CERROR ("Unable to allocate client structure\n");
+                return (0);
+        }
+        memset (client, 0, sizeof(struct pingcli_data));
+        pingcli_start (args);
+
+        return 0;
+} /* kping_client() */
+
+
+static int __init pingcli_init(void)
+{
+        PORTAL_SYMBOL_REGISTER(kping_client);
+        return 0;
+} /* pingcli_init() */
+
+
+static void __exit pingcli_cleanup(void)
+{
+        PORTAL_SYMBOL_UNREGISTER (kping_client);
+} /* pingcli_cleanup() */
+
+
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A simple kernel space ping client for portals testing");
+MODULE_LICENSE("GPL");
+
+module_init(pingcli_init);
+module_exit(pingcli_cleanup);
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+EXPORT_SYMBOL (kping_client);
+#endif
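
pingcli_shutdown() above and pingsrv_shutdown() in the server below rely on deliberate switch fall-through so that the error code selects how far back to unwind: entering at case 1 releases everything, while a larger value skips the resources that were never acquired. A generic sketch of that staged-cleanup idiom, with purely hypothetical resources:

#include <stdlib.h>

struct example_state {
        void *late_resource;     /* acquired last, released first */
        void *early_resource;    /* acquired first, released last */
};

/* Intentional fall-through: the stage says where to start unwinding. */
static void example_shutdown(struct example_state *s, int stage)
{
        switch (stage) {
        case 1:                          /* everything was acquired */
                free(s->late_resource);
                /* fall through */
        case 2:
                free(s->early_resource);
                /* fall through */
        case 3:                          /* only the state itself exists */
                free(s);
        }
}

int main(void)
{
        struct example_state *s = calloc(1, sizeof(*s));

        if (s == NULL)
                return 1;
        s->early_resource = malloc(16);
        if (s->early_resource == NULL) {
                example_shutdown(s, 3);  /* unwind only what exists */
                return 1;
        }
        s->late_resource = malloc(16);
        if (s->late_resource == NULL) {
                example_shutdown(s, 2);
                return 1;
        }
        example_shutdown(s, 1);          /* full teardown */
        return 0;
}
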
diff --git a/lnet/tests/ping_srv.c b/lnet/tests/ping_srv.c
new file mode 100644 (file)
index 0000000..1037d09
--- /dev/null
@@ -0,0 +1,308 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf <behlendorf1@llnl.gov>
+ *        Amey Inamdar     <amey@calsoftinc.com>
+ *        Kedar Sovani     <kedar@calsoftinc.com>
+ *
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include "ping.h"
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <linux/workqueue.h>
+#else
+#include <linux/tqueue.h>
+#endif
+#include <linux/wait.h>
+#include <linux/smp_lock.h>
+
+#include <asm/unistd.h>
+#include <asm/semaphore.h>
+
+#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval))
+#define MAXSIZE (16*1024*1024)
+
+static unsigned ping_head_magic;
+static unsigned ping_bulk_magic;
+static int nal  = 0;                            // NAL to use (module parameter)
+static unsigned long packets_valid = 0;         // Valid packets 
+static int running = 1;
+atomic_t pkt;
+       
+static struct pingsrv_data *server=NULL;             // Our ping server
+
+static void *pingsrv_shutdown(int err)
+{
+        int rc;
+
+        /* Yes, we intentionally fall through from each case into the
+         * next.  This lets the caller pass an error code that cleans
+         * up exactly the right state.
+         */
+        switch (err) {
+                case 1:
+                        /* Unlink any memory descriptors we may have used */
+                        if ((rc = PtlMDUnlink (server->mdin_h)))
+                                PDEBUG ("PtlMDUnlink (out head buffer)", rc);
+                case 2:
+                        /* Free the event queue */
+                        if ((rc = PtlEQFree (server->eq)))
+                                PDEBUG ("PtlEQFree", rc);
+
+                        /* Unlink the client portal from the ME list */
+                        if ((rc = PtlMEUnlink (server->me)))
+                                        PDEBUG ("PtlMEUnlink", rc);
+
+                case 3:
+                        kportal_put_ni (nal);
+
+                case 4:
+                        
+                case 5:
+                        if (server->in_buf != NULL)
+                                PORTAL_FREE (server->in_buf, MAXSIZE);
+                        
+                        if (server != NULL)
+                                PORTAL_FREE (server, 
+                                             sizeof (struct pingsrv_data));
+                        
+        }
+
+        CDEBUG (D_OTHER, "ping server resources released\n");
+        return NULL;
+} /* pingsrv_shutdown() */
+
+
+int pingsrv_thread(void *arg)
+{
+        int rc;
+        unsigned long magic;
+        unsigned long ping_bulk_magic = 0xcafebabe;
+        
+        kportal_daemonize ("pingsrv");
+        server->tsk =  current;
+        
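+        /* Service loop: sleep until pingsrv_callback() signals an
+         * incoming ping, then bounce the received data back to the
+         * client and re-arm the incoming buffer. */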
+        while (running) {
+                set_current_state (TASK_INTERRUPTIBLE);
+                if (atomic_read (&pkt) == 0) {
+                        schedule_timeout (MAX_SCHEDULE_TIMEOUT);
+                        continue;
+                }
+               
+                magic =  *((int *)(server->evnt.mem_desc.start 
+                                        + server->evnt.offset));
+                
+                
+                if(magic != 0xdeadbeef) {
+                        printk("Unexpected Packet to the server\n");
+                        
+                } 
+                memcpy (server->in_buf, &ping_bulk_magic, sizeof(ping_bulk_magic));
+                                
+                server->mdout.length    = server->evnt.rlength;
+                server->mdout.start     = server->in_buf;
+                server->mdout.threshold = 1; 
+                server->mdout.options   = PTL_MD_OP_PUT;
+                server->mdout.user_ptr  = NULL;
+                server->mdout.eventq    = PTL_EQ_NONE;
+       
+                /* Bind the outgoing buffer */
+                if ((rc = PtlMDBind (server->ni, server->mdout,
+                                     &server->mdout_h))) {
+                        PDEBUG ("PtlMDBind", rc);
+                        pingsrv_shutdown (1);
+                        return 1;
+                }
+         
+                
+                server->mdin.start     = server->in_buf;
+                server->mdin.length    = MAXSIZE;
+                server->mdin.threshold = 1; 
+                server->mdin.options   = PTL_MD_OP_PUT;
+                server->mdin.user_ptr  = NULL;
+                server->mdin.eventq    = server->eq;
+        
+                if ((rc = PtlMDAttach (server->me, server->mdin,
+                        PTL_UNLINK, &server->mdin_h))) {
+                        PDEBUG ("PtlMDAttach (bulk)", rc);
+                }
+                
+                if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ,
+                         server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0)))
+                         PDEBUG ("PtlPut", rc);
+                
+                atomic_dec (&pkt);
+                
+        }
+        pingsrv_shutdown (1);
+        running = 1;
+        return 0;    
+}
+
+static int pingsrv_packet(ptl_event_t *ev)
+{
+        atomic_inc (&pkt);
+        wake_up_process (server->tsk);
+        return 1;
+} /* pingsrv_packet() */
+
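+/* Event queue callback: record the event and wake the service thread,
+ * which sends the reply from process context. */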
+static int pingsrv_callback(ptl_event_t *ev)
+{
+        
+        if (ev == NULL) {
+                CERROR ("null in callback, ev=%p\n", ev);
+                return 0;
+        }
+        server->evnt = *ev;
+        
+        printk ("received ping from nid "LPX64" "
+               "(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n",
+               ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
+               *((int *)(ev->mem_desc.start + ev->offset)),
+               *((int *)(ev->mem_desc.start + ev->offset + sizeof(unsigned))),
+               *((int *)(ev->mem_desc.start + ev->offset + 2 * 
+                               sizeof(unsigned))));
+        
+        packets_valid++;
+
+        return pingsrv_packet(ev);
+        
+} /* pingsrv_callback() */
+
+
+static struct pingsrv_data *pingsrv_setup(void)
+{
+        ptl_handle_ni_t *nip;
+        int rc;
+
+        /* Acquire and initialize the proper NAL for portals. */
+        if ((nip = kportal_get_ni (nal)) == NULL) {
+                CDEBUG (D_OTHER, "NAL %d not loaded\n", nal);
+                return pingsrv_shutdown (4);
+        }
+
+        server->ni= *nip;
+
+        /* Based on the initialization acquire our unique portal ID. */
+        if ((rc = PtlGetId (server->ni, &server->my_id))) {
+                PDEBUG ("PtlGetId", rc);
+                return pingsrv_shutdown (2);
+        }
+
+        server->id_local.nid = PTL_NID_ANY;
+        server->id_local.pid = PTL_PID_ANY;
+
+        /* Attach a match entry for header packets */
+        if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER,
+            server->id_local,0, ~0,
+            PTL_RETAIN, PTL_INS_AFTER, &server->me))) {
+                PDEBUG ("PtlMEAttach", rc);
+                return pingsrv_shutdown (2);
+        }
+
+
+        if ((rc = PtlEQAlloc (server->ni, 1024, pingsrv_callback,
+                                        &server->eq))) {
+                PDEBUG ("PtlEQAlloc (callback)", rc);
+                return pingsrv_shutdown (2);
+        }
+        
+        PORTAL_ALLOC (server->in_buf, MAXSIZE);
+        if(!server->in_buf){
+                CDEBUG (D_OTHER,"Allocation error\n");
+                return pingsrv_shutdown(2);
+        }
+        
+        /* Setup the incoming buffer */
+        server->mdin.start     = server->in_buf;
+        server->mdin.length    = MAXSIZE;
+        server->mdin.threshold = 1; 
+        server->mdin.options   = PTL_MD_OP_PUT;
+        server->mdin.user_ptr  = NULL;
+        server->mdin.eventq    = server->eq;
+        memset (server->in_buf, 0, STDSIZE);
+        
+        if ((rc = PtlMDAttach (server->me, server->mdin,
+                               PTL_UNLINK, &server->mdin_h))) {
+                PDEBUG ("PtlMDAttach (bulk)", rc);
+                return pingsrv_shutdown (2);
+        }
+
+        CDEBUG (D_OTHER, "ping server resources allocated\n");
+        /* Success! */
+        return server; 
+} /* pingsrv_setup() */
+
+static int pingsrv_start(void)
+{
+        /* Setup our server */
+        if (!pingsrv_setup()) {
+                CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n");
+                return -ENOMEM;
+        }
+        kernel_thread (pingsrv_thread,NULL,0);
+        return 0;
+} /* pingsrv_start() */
+
+
+
+static int __init pingsrv_init(void)
+{
+        ping_head_magic = PING_HEADER_MAGIC;
+        ping_bulk_magic = PING_BULK_MAGIC;
+        PORTAL_ALLOC (server, sizeof(struct pingsrv_data));
+        if (server == NULL) {
+                CERROR ("Unable to allocate server structure\n");
+                return -ENOMEM;
+        }
+        return pingsrv_start ();
+} /* pingsrv_init() */
+
+
+static void __exit pingsrv_cleanup(void)
+{
+        remove_proc_entry ("net/pingsrv", NULL);
+        
+        running = 0;
+        wake_up_process (server->tsk);
+        while (running != 1) {
+                set_current_state (TASK_UNINTERRUPTIBLE);
+                schedule_timeout (HZ);
+        }
+        
+} /* pingsrv_cleanup() */
+
+
+MODULE_PARM(nal, "i");
+MODULE_PARM_DESC(nal, "Use the specified NAL "
+                "(6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal)");
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A kernel space ping server for portals testing");
+MODULE_LICENSE("GPL");
+
+module_init(pingsrv_init);
+module_exit(pingsrv_cleanup);
diff --git a/lnet/tests/sping_cli.c b/lnet/tests/sping_cli.c
new file mode 100644 (file)
index 0000000..4cef08b
--- /dev/null
@@ -0,0 +1,276 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf <behlendorf1@llnl.gov>
+ *         Kedar Sovani (kedar@calsoftinc.com)
+ *         Amey Inamdar (amey@calsoftinc.com)
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+/* This is a stripped-down version of the pinger.  It follows a single
+ * request-response protocol: it does not do bulk data pinging, nor does
+ * it send multiple packets in a single ioctl.
+ */
+
+
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include "ping.h"
+/* int portal_debug = D_PING_CLI;  */
+
+
+#define STDSIZE (sizeof(int) + sizeof(int) + 4) /* the data payload is
+                                                   assumed to be 4 bytes */
+
+/* This should be enclosed in a structure */
+
+static struct pingcli_data *client = NULL;
+
+static int count = 0;
+
+static void
+pingcli_shutdown(int err)
+{
+        int rc;
+
+        /* Yes, we intentionally fall through from each case into the
+         * next.  This lets the caller pass an error code that cleans
+         * up exactly the right state.
+         */
+        switch (err) {
+                case 1:
+                        /* Unlink any memory descriptors we may have used */
+                        if ((rc = PtlMDUnlink (client->md_out_head_h)))
+                                PDEBUG ("PtlMDUnlink", rc);
+                case 2:
+                        /* Free the event queue */
+                        if ((rc = PtlEQFree (client->eq)))
+                                PDEBUG ("PtlEQFree", rc);
+
+                        if ((rc = PtlMEUnlink (client->me)))
+                                PDEBUG ("PtlMEUnlink", rc);
+                case 3:
+                        kportal_put_ni (client->args->ioc_nal);
+
+                case 4:
+                        /* Free our buffers */
+                        if (client->outbuf != NULL)
+                                PORTAL_FREE (client->outbuf, STDSIZE);
+
+                        if (client->inbuf != NULL)
+                                PORTAL_FREE (client->inbuf, STDSIZE);
+
+
+                        if (client != NULL)
+                                PORTAL_FREE (client,
+                                                sizeof(struct pingcli_data));
+        }
+
+
+        CDEBUG (D_OTHER, "ping client released resources\n");
+} /* pingcli_shutdown() */
+
+static int pingcli_callback(ptl_event_t *ev)
+{
+        wake_up_process (client->tsk);
+        return 1;
+}
+
+
+static struct pingcli_data *
+pingcli_start(struct portal_ioctl_data *args)
+{
+        const ptl_handle_ni_t *nip;
+        unsigned ping_head_magic = PING_HEADER_MAGIC;
+        int rc;
+
+        client->tsk = current;
+        client->args = args;
+
+        CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64", "
+                "nal %d, size %u, count: %u, timeout: %u\n",
+                args->ioc_nid, args->ioc_nal, args->ioc_size,
+                args->ioc_count, args->ioc_timeout);
+
+
+        PORTAL_ALLOC (client->outbuf, STDSIZE) ;
+        if (client->outbuf == NULL)
+        {
+                CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        PORTAL_ALLOC (client->inbuf,  STDSIZE);
+
+        if (client->inbuf == NULL)
+        {
+                CERROR ("Unable to allocate in_buf ("LPSZ" bytes)\n", STDSIZE);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        /* Acquire and initialize the proper NAL for portals. */
+        if ((nip = kportal_get_ni (args->ioc_nal)) == NULL)
+        {
+                CERROR ("NAL %d not loaded.\n", args->ioc_nal);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        /* Based on the initialization acquire our unique portal ID. */
+        if ((rc = PtlGetId (*nip, &client->myid)))
+        {
+                CERROR ("PtlGetId error %d\n", rc);
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+
+        /* Setup the local match entries */
+        client->id_local.nid = PTL_NID_ANY;
+        client->id_local.pid = PTL_PID_ANY;
+
+        /* Setup the remote match entries */
+        client->id_remote.nid = args->ioc_nid;
+        client->id_remote.pid = 0;
+
+        if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT,
+                   client->id_local, 0, ~0, PTL_RETAIN,
+                   PTL_INS_AFTER, &client->me)))
+        {
+                CERROR ("PtlMEAttach error %d\n", rc);
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+
+        /* Allocate the event queue for this network interface */
+        if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq)))
+        {
+                CERROR ("PtlEQAlloc error %d\n", rc);
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+
+
+        client->md_in_head.start     = client->inbuf;
+        client->md_in_head.length    = STDSIZE;
+        client->md_in_head.threshold = 1;
+        client->md_in_head.options   = PTL_MD_OP_PUT;
+        client->md_in_head.user_ptr  = NULL;
+        client->md_in_head.eventq    = client->eq;
+        memset (client->inbuf, 0, STDSIZE);
+
+        /* Attach the incoming buffer */
+        if ((rc = PtlMDAttach (client->me, client->md_in_head,
+                              PTL_UNLINK, &client->md_in_head_h))) {
+                CERROR ("PtlMDAttach error %d\n", rc);
+                pingcli_shutdown (1);
+                return (NULL);
+        }
+
+        /* Setup the outgoing ping header */
+        client->md_out_head.start     = client->outbuf;
+        client->md_out_head.length    = STDSIZE;
+        client->md_out_head.threshold = 1;
+        client->md_out_head.options   = PTL_MD_OP_PUT;
+        client->md_out_head.user_ptr  = NULL;
+        client->md_out_head.eventq    = PTL_EQ_NONE;
+
+        memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic));
+
+        /* Bind the outgoing ping header */
+        if ((rc=PtlMDBind (*nip, client->md_out_head,
+                                        &client->md_out_head_h))) {
+                CERROR ("PtlMDBind error %d\n", rc);
+                pingcli_shutdown (1);
+                return (NULL);
+        }
+        /* Put the ping packet */
+        if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ,
+                         client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) {
+                PDEBUG ("PtlPut (header)", rc);
+                pingcli_shutdown (1);
+                return NULL;
+        }
+
+        count = 0;
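+        /* Wait for the reply; pingcli_callback() wakes us when the
+         * response arrives in the incoming buffer attached above. */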
+        set_current_state (TASK_INTERRUPTIBLE);
+        rc = schedule_timeout (20 * args->ioc_timeout);
+        if (rc == 0) {
+                printk ("Timed out waiting for the server\n");
+                pingcli_shutdown (2);
+                return NULL;
+        } else
+                printk ("Received response from the server\n");
+
+
+        pingcli_shutdown (2);
+
+        /* Success! */
+        return NULL;
+} /* pingcli_start() */
+
+
+
+/* called by the portals_ioctl for ping requests */
+static int kping_client(struct portal_ioctl_data *args)
+{
+
+        PORTAL_ALLOC (client, sizeof(struct pingcli_data));
+        if (client == NULL)
+        {
+                CERROR ("Unable to allocate client structure\n");
+                return (0);
+        }
+        memset (client, 0, sizeof(struct pingcli_data));
+        pingcli_start (args);
+
+        return 0;
+} /* kping_client() */
+
+
+static int __init pingcli_init(void)
+{
+        PORTAL_SYMBOL_REGISTER(kping_client);
+        return 0;
+} /* pingcli_init() */
+
+
+static void __exit pingcli_cleanup(void)
+{
+        PORTAL_SYMBOL_UNREGISTER (kping_client);
+} /* pingcli_cleanup() */
+
+
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A simple kernel space ping client for portals testing");
+MODULE_LICENSE("GPL");
+
+module_init(pingcli_init);
+module_exit(pingcli_cleanup);
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+EXPORT_SYMBOL (kping_client);
+#endif
diff --git a/lnet/tests/sping_srv.c b/lnet/tests/sping_srv.c
new file mode 100644 (file)
index 0000000..a18ea35
--- /dev/null
@@ -0,0 +1,295 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf <behlendorf1@llnl.gov>
+ *        Amey Inamdar     <amey@calsoftinc.com>
+ *        Kedar Sovani     <kedar@calsoftinc.com>
+ *
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* This is a stripped-down version of the pinger.  It follows a single
+ * request-response protocol: it does not do bulk data pinging, nor does
+ * it send multiple packets in a single ioctl.
+ */
+
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include "ping.h"
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <linux/workqueue.h>
+#else
+#include <linux/tqueue.h>
+#endif
+#include <linux/wait.h>
+#include <linux/smp_lock.h>
+
+#include <asm/unistd.h>
+#include <asm/semaphore.h>
+
+#define STDSIZE (sizeof(int) + sizeof(int) + 4)
+
+static int nal  = 0;                            // NAL to use (module parameter)
+static unsigned long packets_valid = 0;         // Valid packets 
+static int running = 1;
+atomic_t pkt;
+       
+static struct pingsrv_data *server=NULL;             // Our ping server
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#endif
+
+static void *pingsrv_shutdown(int err)
+{
+        int rc;
+
+        /* Yes, we intentionally fall through from each case into the
+         * next.  This lets the caller pass an error code that cleans
+         * up exactly the right state.
+         */
+        switch (err) {
+                case 1:
+                        /* Unlink any memory descriptors we may have used */
+                        if ((rc = PtlMDUnlink (server->mdin_h)))
+                                PDEBUG ("PtlMDUnlink (out head buffer)", rc);
+                case 2:
+                        /* Free the event queue */
+                        if ((rc = PtlEQFree (server->eq)))
+                                PDEBUG ("PtlEQFree", rc);
+
+                        /* Unlink the client portal from the ME list */
+                        if ((rc = PtlMEUnlink (server->me)))
+                                        PDEBUG ("PtlMEUnlink", rc);
+
+                case 3:
+                        kportal_put_ni (nal);
+
+                case 4:
+                        
+                        if (server->in_buf != NULL)
+                                PORTAL_FREE (server->in_buf, STDSIZE);
+                        
+                        if (server != NULL)
+                                PORTAL_FREE (server, 
+                                             sizeof (struct pingsrv_data));
+                        
+        }
+
+        CDEBUG (D_OTHER, "ping server resources released\n");
+        return NULL;
+} /* pingsrv_shutdown() */
+
+
+int pingsrv_thread(void *arg)
+{
+        int rc;
+        
+        kportal_daemonize ("pingsrv");
+        server->tsk = current;
+        
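+        /* Service loop: sleep until pingsrv_callback() signals an
+         * incoming ping, then send the fixed-size reply back to the
+         * client and re-arm the incoming buffer. */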
+        while (running) {
+                set_current_state (TASK_INTERRUPTIBLE);
+                if (atomic_read (&pkt) == 0) {
+                        schedule_timeout (MAX_SCHEDULE_TIMEOUT);
+                        continue;
+                }
+                               
+                server->mdout.start     = server->in_buf;
+                server->mdout.length    = STDSIZE;
+                server->mdout.threshold = 1; 
+                server->mdout.options   = PTL_MD_OP_PUT;
+                server->mdout.user_ptr  = NULL;
+                server->mdout.eventq    = PTL_EQ_NONE;
+       
+                /* Bind the outgoing buffer */
+                if ((rc = PtlMDBind (server->ni, server->mdout,
+                                     &server->mdout_h))) {
+                        PDEBUG ("PtlMDBind", rc);
+                        pingsrv_shutdown (1);
+                        return 1;
+                }
+         
+                
+                server->mdin.start     = server->in_buf;
+                server->mdin.length    = STDSIZE;
+                server->mdin.threshold = 1; 
+                server->mdin.options   = PTL_MD_OP_PUT;
+                server->mdin.user_ptr  = NULL;
+                server->mdin.eventq    = server->eq;
+        
+                if ((rc = PtlMDAttach (server->me, server->mdin,
+                        PTL_UNLINK, &server->mdin_h))) {
+                        PDEBUG ("PtlMDAttach (bulk)", rc);
+                }
+                
+                if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ,
+                         server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0)))
+                         PDEBUG ("PtlPut", rc);
+                
+                atomic_dec (&pkt);
+                
+        }
+        pingsrv_shutdown (1);
+        running = 1;
+        return 0;    
+}
+
+static int pingsrv_packet(ptl_event_t *ev)
+{
+        atomic_inc (&pkt);
+        wake_up_process (server->tsk);
+        return 1;
+} /* pingsrv_packet() */
+
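+/* Event queue callback: record the event and wake the service thread,
+ * which sends the reply from process context. */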
+static int pingsrv_callback(ptl_event_t *ev)
+{
+        
+        if (ev == NULL) {
+                CERROR ("null in callback, ev=%p\n", ev);
+                return 0;
+        }
+        server->evnt = *ev;
+        
+        printk ("received ping from nid "LPX64" "
+               "(off=%u rlen=%u mlen=%u head=%x)\n",
+               ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
+               *((int *)(ev->mem_desc.start + ev->offset)));
+        
+        packets_valid++;
+
+        return pingsrv_packet(ev);
+        
+} /* pingsrv_callback() */
+
+
+static struct pingsrv_data *pingsrv_setup(void)
+{
+        ptl_handle_ni_t *nip;
+        int rc;
+
+        /* Acquire and initialize the proper NAL for portals. */
+        if ((nip = kportal_get_ni (nal)) == NULL) {
+                CDEBUG (D_OTHER, "Nal %d not loaded.\n", nal);
+                return pingsrv_shutdown (4);
+        }
+
+        server->ni= *nip;
+
+        /* Based on the initialization acquire our unique portal ID. */
+        if ((rc = PtlGetId (server->ni, &server->my_id))) {
+                PDEBUG ("PtlGetId", rc);
+                return pingsrv_shutdown (2);
+        }
+
+        server->id_local.nid = PTL_NID_ANY;
+        server->id_local.pid = PTL_PID_ANY;
+
+        /* Attach a match entry for header packets */
+        if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER,
+            server->id_local,0, ~0,
+            PTL_RETAIN, PTL_INS_AFTER, &server->me))) {
+                PDEBUG ("PtlMEAttach", rc);
+                return pingsrv_shutdown (2);
+        }
+
+
+        if ((rc = PtlEQAlloc (server->ni, 64, pingsrv_callback,
+                                        &server->eq))) {
+                PDEBUG ("PtlEQAlloc (callback)", rc);
+                return pingsrv_shutdown (2);
+        }
+        
+        PORTAL_ALLOC (server->in_buf, STDSIZE);
+        if(!server->in_buf){
+                CDEBUG (D_OTHER,"Allocation error\n");
+                return pingsrv_shutdown(2);
+        }
+        
+        /* Setup the incoming buffer */
+        server->mdin.start     = server->in_buf;
+        server->mdin.length    = STDSIZE;
+        server->mdin.threshold = 1; 
+        server->mdin.options   = PTL_MD_OP_PUT;
+        server->mdin.user_ptr  = NULL;
+        server->mdin.eventq    = server->eq;
+        memset (server->in_buf, 0, STDSIZE);
+        
+        if ((rc = PtlMDAttach (server->me, server->mdin,
+                               PTL_UNLINK, &server->mdin_h))) {
+                PDEBUG ("PtlMDAttach (bulk)", rc);
+                return pingsrv_shutdown (2);
+        }
+
+        CDEBUG (D_OTHER, "ping server resources allocated\n");
+        /* Success! */
+        return server; 
+} /* pingsrv_setup() */
+
+static int pingsrv_start(void)
+{
+        /* Setup our server */
+        if (!pingsrv_setup()) {
+                CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n");
+                return -ENOMEM;
+        }
+        kernel_thread (pingsrv_thread,NULL,0);
+        return 0;
+} /* pingsrv_start() */
+
+
+
+static int __init pingsrv_init(void)
+{
+        PORTAL_ALLOC (server, sizeof(struct pingsrv_data));
+        if (server == NULL) {
+                CERROR ("Unable to allocate server structure\n");
+                return -ENOMEM;
+        }
+        return pingsrv_start ();
+} /* pingsrv_init() */
+
+
+static void __exit pingsrv_cleanup(void)
+{
+        remove_proc_entry ("net/pingsrv", NULL);
+        
+        running = 0;
+        wake_up_process (server->tsk);
+        while (running != 1) {
+                set_current_state (TASK_UNINTERRUPTIBLE);
+                schedule_timeout (HZ);
+        }
+        
+} /* pingsrv_cleanup() */
+
+
+MODULE_PARM(nal, "i");
+MODULE_PARM_DESC(nal, "Use the specified NAL "
+                "(6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal)");
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A kernel space ping server for portals testing");
+MODULE_LICENSE("GPL");
+
+module_init(pingsrv_init);
+module_exit(pingsrv_cleanup);
diff --git a/lnet/tests/startclient.sh b/lnet/tests/startclient.sh
new file mode 100644 (file)
index 0000000..c9b7c16
--- /dev/null
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-0}
+
+if [ $SIMPLE -eq 0 ]; then
+       PING=pingcli.o
+else
+       PING=spingcli.o
+fi
+
+case "$1" in
+       toe)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../toenal/ktoenal.o
+               /sbin/insmod ./$PING
+               echo ktoenal > /tmp/nal
+       ;;
+       
+       tcp)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../socknal/ksocknal.o
+               /sbin/insmod ./$PING 
+               echo ksocknal > /tmp/nal
+       ;;
+       
+       elan)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../qswnal/kqswnal.o
+               /sbin/insmod ./$PING
+               echo kqswnal > /tmp/nal
+       ;;
+       
+       *)
+               echo "Usage : ${0} < tcp | toe | elan >"
+               exit 1;
+esac
+exit 0;
diff --git a/lnet/tests/startserver.sh b/lnet/tests/startserver.sh
new file mode 100644 (file)
index 0000000..942300e
--- /dev/null
@@ -0,0 +1,38 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-0}
+
+if [ $SIMPLE -eq 0 ]; then
+       PING=pingsrv.o
+else
+       PING=spingsrv.o
+fi
+
+case "$1" in
+       toe)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../toenal/ktoenal.o
+               /sbin/insmod ./$PING nal=4
+               echo ktoenal > /tmp/nal
+       ;;
+       
+       tcp)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../socknal/ksocknal.o
+               /sbin/insmod ./$PING nal=2
+               echo ksocknal > /tmp/nal
+       ;;
+       
+       elan)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../qswnal/kqswnal.o
+               /sbin/insmod ./$PING nal=1
+               echo kqswnal > /tmp/nal
+       ;;
+       
+       *)
+               echo "Usage : ${0} < tcp | toe | elan >"
+               exit 1;
+esac
+../utils/acceptor 9999&
+exit 0;
diff --git a/lnet/tests/stopclient.sh b/lnet/tests/stopclient.sh
new file mode 100644 (file)
index 0000000..f7e3aa1
--- /dev/null
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-0}
+
+if [ $SIMPLE -eq 0 ]; then
+       PING=pingcli
+else
+       PING=spingcli
+fi
+
+rmmod $PING
+NAL=`cat /tmp/nal`;
+rmmod $NAL
+rmmod portals
diff --git a/lnet/tests/stopserver.sh b/lnet/tests/stopserver.sh
new file mode 100644 (file)
index 0000000..3e81831
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-0}
+
+if [ $SIMPLE -eq 0 ]; then
+       PING=pingsrv
+else
+       PING=spingsrv
+fi
+
+rmmod $PING
+NAL=`cat /tmp/nal`;
+rmmod $NAL
+killall -9 acceptor
+rm -f /var/run/acceptor-9999.pid
+rmmod portals
diff --git a/lnet/ulnds/.cvsignore b/lnet/ulnds/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lnet/ulnds/Makefile.am b/lnet/ulnds/Makefile.am
new file mode 100644 (file)
index 0000000..dc427b0
--- /dev/null
@@ -0,0 +1,5 @@
+CPPFLAGS=
+INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir)
+lib_LIBRARIES = libtcpnal.a
+pkginclude_HEADERS =  pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
+libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
diff --git a/lnet/ulnds/README b/lnet/ulnds/README
new file mode 100644 (file)
index 0000000..6cb93d9
--- /dev/null
@@ -0,0 +1,53 @@
+This library implements two NAL interfaces, both running over IP.
+The first, tcpnal, creates TCP connections between participating
+processes in order to transport the portals requests. The second,
+ernal, provides a simple transport protocol which runs over
+UDP datagrams.
+
+The interface functions return both of these values in host order for
+convenience and readability.  However, this means that addresses
+exchanged in messages between hosts of different byte orderings will
+not function properly.
+
+Both NALs use the same support functions in order to schedule events
+and communicate with the generic portals implementation.
+
+            -------------------------
+            |         api           |
+            |_______________________|
+            |         lib           |
+            |_______________________|
+            | ernal  |   |tcpnal    |
+            |--------|   |----------|
+            | udpsock|   |connection|
+            |-----------------------|
+            |     timer/select      |
+            -------------------------
+
+
+  These NALs use the framework from fdnal: a pipe between the api and
+library sides.  This is wrapped up in the select on the library side,
+and blocks on the api side.  Performance could be significantly
+improved by collapsing this artificial barrier, by using shared memory
+queues, or by wiring the api layer directly to the library.
+
+
+nid is defined as the low order 24-bits of the IP address of the
+physical node left shifted by 8 plus a virtual node number of 0
+through 255 (really only 239).  The virtual node number of a tcpnal
+application should be specified using the environment variable
+PTL_VIRTNODE.  pid is now a completely arbitrary number in the
+range of 0 to 255.  The IP interface used can be overridden by
+specifying the appropriate hostid by setting the PTL_HOSTID
+environment variable.  The value can be either dotted decimal
+(n.n.n.n) or hex starting with "0x".
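+
+As a rough worked example of the virtual-node mapping (i.e. when
+DIRECT_IP_MODE is not in effect; see ipmap.h), take a node with the
+hypothetical IP address 192.168.1.10 (0xc0a8010a), PTL_VIRTNODE=3 and
+pid 5:
+
+    nid  = ((0xc0a8010a & 0x00ffffff) << 8) + 3          = 0xa8010a03
+    port = ((3 << PNAL_VNODE_SHIFT) + 5) + PNAL_BASE_PORT
+         = (3 * 256 + 5) + 4096                          = 4869
+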
+TCPNAL:
+  As the NAL needs to try to send to a particular nid/pid pair, it
+  will open up connections on demand. Because the port associated with
+  the connecting socket is different from the bound port, two
+  connections will normally be established between a pair of peers, with
+  data flowing from the anonymous connect (active) port to the advertised
+  or well-known bound (passive) port of each peer.
+
+  Should the connection fail to open, an error is reported to the
+  library component, which causes the api request to fail.
diff --git a/lnet/ulnds/address.c b/lnet/ulnds/address.c
new file mode 100644 (file)
index 0000000..b422c3f
--- /dev/null
@@ -0,0 +1,146 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* address.c:
+ * this file provides functions to acquire the IP address of the node
+ * and translate it into a NID/PID pair which supports a static
+ * mapping of virtual nodes into the port range of an IP socket.
+ */
+
+#include <stdlib.h>
+#include <netdb.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <portals/p30.h>
+#include <bridge.h>
+#include <ipmap.h>
+
+
+/* Function:  get_node_id
+ * Returns: a 32 bit id for this node, actually a big-endian IP address
+ *
+ * get_node_id() determines the host name and uses the resolver to
+ *  find out its ip address. This is fairly fragile and inflexible, but
+ *  explicitly asking about interfaces and their addresses is very
+ *  complicated and nonportable.
+ */
+static unsigned int get_node_id(void)
+{
+    char buffer[255];
+    unsigned int x;
+    struct hostent *he;
+    char * host_envp;
+
+    if (!(host_envp = getenv("PTL_HOSTID")))
+        {
+            gethostname(buffer,sizeof(buffer));
+            he=gethostbyname(buffer);
+            if (he)
+                    x=*(unsigned int *)he->h_addr_list[0];
+            else
+                    x = 0;
+            return(ntohl(x));
+        }
+    else 
+        {
+            if (host_envp[1] != 'x')
+                {
+                    int a, b, c, d;
+                    sscanf(host_envp, "%d.%d.%d.%d", &a, &b, &c, &d);
+                    return ((a<<24) | (b<<16) | (c<<8) | d);
+                }
+            else
+                {
+                    long long hostid = strtoll(host_envp, 0, 0);
+                    return((unsigned int) hostid);
+                }
+        }
+}
+
+
+/* Function:  set_address
+ * Arguments: t: a procnal structure to populate with the request
+ *
+ * set_address performs the bit manipulations to set the nid, pid, and
+ *    iptop8 fields of the procnal structures.
+ *
+ * TODO: fix pidrequest to try to do dynamic binding if PTL_ID_ANY
+ */
+
+#ifdef DIRECT_IP_MODE
+void set_address(bridge t,ptl_pid_t pidrequest)
+{
+    int port;
+    if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0;
+    else port=pidrequest;
+    t->nal_cb->ni.nid=get_node_id();
+    t->nal_cb->ni.pid=port;
+}
+#else
+
+void set_address(bridge t,ptl_pid_t pidrequest)
+{
+    int virtnode, in_addr, port; 
+    ptl_pid_t pid;
+
+    /* get and remember my node id*/
+    if (!getenv("PTL_VIRTNODE"))
+        virtnode = 0;
+    else 
+        {
+            int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT 
+                                              >> PNAL_VNODE_SHIFT);
+            virtnode = atoi(getenv("PTL_VIRTNODE"));
+            if (virtnode > maxvnode)
+                {
+                    fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n",
+                            virtnode, maxvnode);
+                    return;
+                }
+        }
+    
+    in_addr = get_node_id();
+
+    t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */
+    t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK) 
+                            << PNAL_VNODE_SHIFT)
+        + virtnode;
+
+    pid=pidrequest;
+    /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */
+#ifdef notyet
+    if (pid==(unsigned short)PTL_PID_ANY) port = 0;
+#endif
+    if (pid==(unsigned short)PTL_PID_ANY) 
+        {
+            fprintf(stderr, "portal pid PTL_PID_ANY is not currently supported\n");
+            return;
+        }
+    else if (pid > PNAL_PID_MASK)
+        {
+            fprintf(stderr, "portal pid of %d is too large - max %d\n",
+                    pid, PNAL_PID_MASK);
+            return;
+        }
+    else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT;
+    t->nal_cb->ni.pid=pid;
+}
+#endif
diff --git a/lnet/ulnds/bridge.h b/lnet/ulnds/bridge.h
new file mode 100644 (file)
index 0000000..0b4940f
--- /dev/null
@@ -0,0 +1,29 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#include <portals/lib-p30.h>
+
+typedef struct bridge {
+    int alive;
+    nal_cb_t *nal_cb;
+    void *lower;
+    void *local;
+    void (*shutdown)(struct bridge *);
+    /* this doesn't really belong here */
+    unsigned char iptop8;
+} *bridge;
+
+
+nal_t *bridge_init(ptl_interface_t nal,
+                   ptl_pid_t pid_request,
+                   ptl_ni_limits_t *desired,
+                   ptl_ni_limits_t *actual,
+                   int *rc);
+
+typedef int (*nal_initialize)(bridge);
+extern nal_initialize nal_table[PTL_IFACE_MAX];
diff --git a/lnet/ulnds/connection.c b/lnet/ulnds/connection.c
new file mode 100644 (file)
index 0000000..310e899
--- /dev/null
@@ -0,0 +1,294 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* connection.c:
+   This file provides a simple stateful connection manager which
+   builds tcp connections on demand and leaves them open for
+   future use. It also provides the machinery to allow peers
+   to connect to it
+*/
+
+#include <stdlib.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <table.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <connection.h>
+#include <errno.h>
+
+
+/* global variable: acceptor port */
+unsigned short tcpnal_acceptor_port = 988;
+
+
+/* Function:  compare_connection
+ * Arguments: connection c:      a connection in the hash table
+ *            ptl_process_id_t:  an id to verify against
+ * Returns: 1 if the connection is the one requested, 0 otherwise
+ *
+ *    compare_connection() tests for collisions in the hash table
+ */
+static int compare_connection(void *arg1, void *arg2)
+{
+        connection c = arg1;
+        unsigned int * id = arg2;
+        return((c->ip==id[0]) && (c->port==id[1]));
+}
+
+
+/* Function:  connection_key
+ * Arguments: ptl_process_id_t id:  an id to hash
+ * Returns: a not-particularly-well-distributed hash
+ *          of the id
+ */
+static unsigned int connection_key(unsigned int *id)
+{
+    return(id[0]^id[1]);
+}
+
+
+/* Function:  remove_connection
+ * Arguments: c: the connection to remove
+ */
+void remove_connection(void *arg)
+{
+        connection c = arg;
+        unsigned int id[2];
+        
+        id[0]=c->ip;
+        id[1]=c->port;
+        hash_table_remove(c->m->connections,id);
+        close(c->fd);
+        free(c);
+}
+
+
+/* Function:  read_connection: 
+ * Arguments: c:    the connection to read from 
+ *            dest: the buffer to read into
+ *            len:  the number of bytes to read   
+ * Returns: success as 1, or failure as 0
+ *
+ *   read_connection() reads data from the connection, continuing
+ *   to read partial results until the request is satisfied or
+ *   it errors. TODO: this read should be covered by signal protection.
+ */
+int read_connection(connection c,
+                    unsigned char *dest,
+                    int len)
+{
+    int offset=0,rc;
+
+    if (len){
+        do {
+            if((rc=syscall(SYS_read, c->fd, dest+offset, len-offset))<=0){
+                if (errno==EINTR) {
+                    rc=0;
+                } else {
+                    remove_connection(c);
+                    return(0);
+                }
+            }
+            offset+=rc;
+        } while (offset<len);
+    }
+    return(1);
+}
+
+static int connection_input(void *d)
+{
+        connection c = d;
+        return((*c->m->handler)(c->m->handler_arg,c));
+}
+
+
+/* Function:  allocate_connection
+ * Arguments: m:        the manager the allocation is occurring in the context of
+ *            ip, port: portal endpoint address for this connection
+ *            fd:       open file descriptor for the socket
+ * Returns: an allocated connection structure
+ *
+ * just encompasses the action common to active and passive
+ *  connections of allocation and placement in the global table
+ */
+static connection allocate_connection(manager m,
+                               unsigned int ip,
+                               unsigned short port,
+                               int fd)
+{
+    connection c=malloc(sizeof(struct connection));
+    unsigned int id[2];
+    c->m=m;
+    c->fd=fd;
+    c->ip=ip;
+    c->port=port;
+    id[0]=ip;
+    id[1]=port;
+    register_io_handler(fd,READ_HANDLER,connection_input,c);
+    hash_table_insert(m->connections,c,id);
+    return(c);
+}
+
+
+/* Function:  new_connection
+ * Arguments: z: opaque argument holding the connection manager
+ * Returns: 1 in order to reregister for new connection requests
+ *
+ *  called when the bound service socket receives
+ *     a new connection request, it always accepts and
+ *     installs a new connection
+ */
+static int new_connection(void *z)
+{
+    manager m=z;
+    struct sockaddr_in s;
+    int len=sizeof(struct sockaddr_in);
+    int fd=accept(m->bound,(struct sockaddr *)&s,&len);
+    unsigned int nid=*((unsigned int *)&s.sin_addr);
+    /* cfs specific hack */
+    //unsigned short pid=s.sin_port;
+    allocate_connection(m,htonl(nid),0/*pid*/,fd);
+    return(1);
+}
+
+
+/* Function:  force_tcp_connection
+ * Arguments: t: tcpnal
+ *            dest: portals endpoint for the connection
+ * Returns: an allocated connection structure, either
+ *          a pre-existing one, or a new connection
+ */
+connection force_tcp_connection(manager m,
+                                unsigned int ip,
+                                unsigned short port)
+{
+    connection c;
+    struct sockaddr_in addr;
+    unsigned int id[2];
+
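+    /* NB: the requested port is ignored; outgoing connections always
+     * target the well-known acceptor port. */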
+    port = tcpnal_acceptor_port;
+
+    id[0]=ip;
+    id[1]=port;
+
+    if (!(c=hash_table_find(m->connections,id))){
+        int fd;
+
+        bzero((char *) &addr, sizeof(addr));
+        addr.sin_family      = AF_INET;
+        addr.sin_addr.s_addr = htonl(ip);
+        addr.sin_port        = htons(port);
+
+        if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { 
+            perror("tcpnal socket failed");
+            exit(-1);
+        }
+        if (connect(fd,
+                    (struct sockaddr *)&addr,
+                    sizeof(struct sockaddr_in)))
+            {
+                perror("tcpnal connect");
+                return(0);
+            }
+        return(allocate_connection(m,ip,port,fd));
+    }
+    return(c);
+}
+
+
+/* Function:  bind_socket
+ * Arguments: t: the nal state for this interface
+ *            port: the port to attempt to bind to
+ * Returns: 1 on success, or 0 on error
+ *
+ * bind_socket() attempts to allocate and bind a socket to the requested
+ *  port, or dynamically assign one from the kernel should the port be
+ *  zero. Sets the bound and bound_handler elements of m.
+ *
+ *  TODO: The port should be an explicitly sized type.
+ */
+static int bind_socket(manager m,unsigned short port)
+{
+    struct sockaddr_in addr;
+    int alen=sizeof(struct sockaddr_in);
+    
+    if ((m->bound = socket(AF_INET, SOCK_STREAM, 0)) < 0)  
+        return(0);
+    
+    bzero((char *) &addr, sizeof(addr));
+    addr.sin_family      = AF_INET;
+    addr.sin_addr.s_addr = 0;
+    addr.sin_port        = port; 
+    
+    if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){
+        perror ("tcpnal bind"); 
+        return(0);
+    }
+    
+    getsockname(m->bound,(struct sockaddr *)&addr, &alen);
+
+    m->bound_handler=register_io_handler(m->bound,READ_HANDLER,
+                                         new_connection,m);
+    listen(m->bound,5); 
+    m->port=addr.sin_port;
+    return(1);
+}
+
+
+/* Function:  shutdown_connections
+ * Arguments: m: the manager structure
+ *
+ * close all connections and reclaim resources
+ */
+void shutdown_connections(manager m)
+{
+    close(m->bound);
+    remove_io_handler(m->bound_handler);
+    hash_destroy_table(m->connections,remove_connection);
+    free(m);
+}
+
+
+/* Function:  init_connections
+ * Arguments: t: the nal state for this interface
+ *            port: the port to attempt to bind to
+ * Returns: a newly allocated manager structure, or
+ *          zero if the fixed port could not be bound
+ */
+manager init_connections(unsigned short pid,
+                         int (*input)(void *, void *),
+                         void *a)
+{
+    manager m=(manager)malloc(sizeof(struct manager));
+    m->connections=hash_create_table(compare_connection,connection_key);
+    m->handler=input;
+    m->handler_arg=a;
+    if (bind_socket(m,pid)) return(m);
+    free(m);
+    return(0);
+}
diff --git a/lnet/ulnds/connection.h b/lnet/ulnds/connection.h
new file mode 100644 (file)
index 0000000..6f57287
--- /dev/null
@@ -0,0 +1,32 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#include <table.h>
+
+typedef struct manager {
+    table connections;
+    int bound;
+    io_handler bound_handler;
+    int (*handler)(void *, void *);
+    void *handler_arg;
+    unsigned short port;
+} *manager;
+
+
+typedef struct connection {
+    unsigned int ip;
+    unsigned short port;
+    int fd;
+    manager m;
+} *connection;
+
+connection force_tcp_connection(manager m, unsigned int ip, unsigned short port);
+manager init_connections(unsigned short, int (*f)(void *, void *), void *);
+void remove_connection(void *arg);
+void shutdown_connections(manager m);
+int read_connection(connection c, unsigned char *dest, int len);
diff --git a/lnet/ulnds/debug.c b/lnet/ulnds/debug.c
new file mode 100644 (file)
index 0000000..529bb2d
--- /dev/null
@@ -0,0 +1,119 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <sys/time.h>
+
+int smp_processor_id = 1;
+char debug_file_path[1024] = "/tmp/lustre-log";
+char debug_file_name[1024];
+FILE *debug_file_fd;
+
+int portals_do_debug_dumplog(void *arg)
+{
+        printf("Look in %s\n", debug_file_name);
+        return 0;
+}
+
+
+void portals_debug_print(void)
+{
+        return;
+}
+
+
+void portals_debug_dumplog(void)
+{
+        printf("Look in %s\n", debug_file_name);
+        return;
+}
+
+
+int portals_debug_init(unsigned long bufsize)
+{ 
+        debug_file_fd = stdout;
+        return 0;
+}
+
+int portals_debug_cleanup(void)
+{
+        return 0; //close(portals_debug_fd);
+}
+
+int portals_debug_clear_buffer(void)
+{
+        return 0;
+}
+
+int portals_debug_mark_buffer(char *text)
+{
+
+        fprintf(debug_file_fd, "*******************************************************************************\n");
+        fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text);
+        fprintf(debug_file_fd, "*******************************************************************************\n");
+
+        return 0;
+}
+
+int portals_debug_copy_to_user(char *buf, unsigned long len)
+{
+        return 0;
+}
+
+/* FIXME: I'm not very smart; someone smarter should make this better. */
+void
+portals_debug_msg (int subsys, int mask, char *file, char *fn, int line,
+                   const char *format, ...)
+{
+        va_list       ap;
+        unsigned long flags;
+        struct timeval tv;
+        int nob = 0;
+
+
+        /* NB since we pass a non-zero sized buffer (at least) on the first
+         * print, we can be assured that by the end of all the snprinting,
+         * we _do_ have a terminated buffer, even if our message got truncated.
+         */
+
+        gettimeofday(&tv, NULL);
+
+        nob += fprintf(debug_file_fd,
+                              "%02x:%06x:%d:%lu.%06lu ",
+                              subsys >> 24, mask, smp_processor_id,
+                              tv.tv_sec, tv.tv_usec);
+
+        nob += fprintf(debug_file_fd,
+                            "(%s:%d:%s() %d+%ld): ",
+                            file, line, fn, 0,
+                            8192 - ((unsigned long)&flags & 8191UL));
+
+        va_start (ap, format);
+        nob += vfprintf(debug_file_fd, format, ap);
+        va_end (ap);
+
+
+}
+
diff --git a/lnet/ulnds/dispatch.h b/lnet/ulnds/dispatch.h
new file mode 100644 (file)
index 0000000..34dd070
--- /dev/null
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* this file is only called dispatch.h to prevent it
+   from colliding with /usr/include/sys/select.h */
+
+typedef struct io_handler *io_handler;
+
+struct io_handler{
+  io_handler *last;
+  io_handler next;
+  int fd;
+  int type;
+  int (*function)(void *);
+  void *argument;
+  int disabled;
+};
+
+
+#define READ_HANDLER 1
+#define WRITE_HANDLER 2
+#define EXCEPTION_HANDLER 4
+#define ALL_HANDLER (READ_HANDLER | WRITE_HANDLER | EXCEPTION_HANDLER)
+
+io_handler register_io_handler(int fd,
+                               int type,
+                               int (*function)(void *),
+                               void *arg);
+
+void remove_io_handler (io_handler i);
+void init_unix_timer(void);
+void select_timer_block(when until);
+when now(void);
diff --git a/lnet/ulnds/ipmap.h b/lnet/ulnds/ipmap.h
new file mode 100644 (file)
index 0000000..85b1e18
--- /dev/null
@@ -0,0 +1,38 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
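+/* Mapping between portals (nid, pid) and IP (address, port).  With
+ * DIRECT_IP_MODE the IP address and port are used as the nid and pid
+ * directly; otherwise the nid packs the low 24 bits of the host
+ * address together with an 8-bit virtual node number, and the port
+ * encodes (virtual node, pid) offset by PNAL_BASE_PORT. */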
+#define DIRECT_IP_MODE
+#ifdef DIRECT_IP_MODE
+#define PNAL_NID(in_addr, port) (in_addr)
+#define PNAL_PID(pid) (pid)
+#define PNAL_IP(in_addr, port) (in_addr)
+#define PNAL_PORT(nid, pid) (pid)
+#else
+
+#define PNAL_BASE_PORT 4096
+#define PNAL_HOSTID_SHIFT 24
+#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1)
+#define PNAL_VNODE_SHIFT 8
+#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1)
+#define PNAL_PID_SHIFT 8
+#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1)
+
+#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \
+                                    << PNAL_VNODE_SHIFT) \
+                                   | (((ntohs(port)-PNAL_BASE_PORT) >>\
+                                       PNAL_PID_SHIFT)))
+#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT)  & PNAL_PID_MASK)
+
+#define PNAL_IP(nid,t)  (htonl((((unsigned)(nid))\
+                                >> PNAL_VNODE_SHIFT)\
+                               | (t->iptop8 << PNAL_HOSTID_SHIFT)))
+#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \
+                                 << PNAL_VNODE_SHIFT) \
+                                | ((pid) & PNAL_PID_MASK)) \
+                               + PNAL_BASE_PORT))
+#endif
diff --git a/lnet/ulnds/pqtimer.c b/lnet/ulnds/pqtimer.c
new file mode 100644 (file)
index 0000000..fa2fb4f
--- /dev/null
@@ -0,0 +1,226 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* timer.c:
+ *   This file implements a simple priority-queue based timer system. When
+ * combined with a file which implements now() and block(), it can
+ * be used to provide coarse-grained time-based callbacks.
+ */
+
+#include <pqtimer.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct timer {
+  void (*function)(void *);
+  void *arg;
+  when w;
+  int interval;
+  int disable;
+};
+
+typedef struct thunk *thunk;
+struct thunk {
+    void (*f)(void *);
+    void *a;
+    thunk next;
+};
+
+extern when now(void);
+
+static thunk thunks;
+static int internal;
+static void (*block_function)(when);
+static int number_of_timers;
+static int size_of_pqueue;
+static timer *timers;
+
+
+static void heal(int where)
+{
+    int left=(where<<1);
+    int right=(where<<1)+1;
+    int min=where;
+    timer temp;
+  
+    if (left <= number_of_timers)
+       if (timers[left]->w < timers[min]->w) min=left;
+    if (right <= number_of_timers)
+       if (timers[right]->w < timers[min]->w) min=right;
+    if (min != where){
+       temp=timers[where];
+       timers[where]=timers[min];
+       timers[min]=temp;
+       heal(min);
+    }
+}
+
+static void add_pqueue(int i)
+{
+    timer temp;
+    int parent=(i>>1);
+    if ((i>1) && (timers[i]->w< timers[parent]->w)){
+       temp=timers[i];
+       timers[i]=timers[parent];
+       timers[parent]=temp;
+       add_pqueue(parent);
+    }
+}
+
+static void add_timer(timer t)
+{
+    if (size_of_pqueue<(number_of_timers+2)){
+       int oldsize=size_of_pqueue;
+       timer *new=(void *)malloc(sizeof(struct timer)*(size_of_pqueue+=10));
+       memcpy(new,timers,sizeof(timer)*oldsize);
+       timers=new;
+    }
+    timers[++number_of_timers]=t;
+    add_pqueue(number_of_timers);
+}
+
+/* Function: register_timer
+ * Arguments: interval: the time interval from the current time when
+ *                      the timer function should be called
+ *            function: the function to call when the time has expired
+ *            argument: the argument to call it with.
+ * Returns: a pointer to a timer structure
+ */
+timer register_timer(when interval,
+                    void (*function)(void *),
+                    void *argument)
+{
+    timer t=(timer)malloc(sizeof(struct timer));
+
+    t->arg=argument;
+    t->function=function;
+    t->interval=interval;
+    t->disable=0;
+    t->w=now()+interval;
+    add_timer(t);
+    if (!internal && (number_of_timers==1))
+        block_function(t->w);
+    return(t);
+}
+
+/* Function: remove_timer
+ * Arguments: t: the timer to remove
+ * Returns: nothing
+ *
+ * remove_timer removes a timer from the system, ensuring
+ * that it will never be called. It does not actually
+ * free the timer due to reentrancy issues.
+ */
+
+void remove_timer(timer t)
+{
+    t->disable=1;
+}
+
+
+
+void timer_fire()
+{
+    timer current;
+
+    current=timers[1];
+    timers[1]=timers[number_of_timers--];
+    heal(1);
+    if (!current->disable) {
+        (*current->function)(current->arg);
+    }
+    free(current);
+}
+
+when next_timer(void)
+{
+    when here=now();
+
+    while (number_of_timers && (timers[1]->w <= here)) timer_fire();
+    if (number_of_timers) return(timers[1]->w);
+    return(0);
+}
+
+/* Function: timer_loop
+ * Arguments: none
+ * Returns: never
+ * 
+ * timer_loop() is the blocking dispatch function for the timer.
+ * It calls the block() function registered with initialize_timer(),
+ * and runs the handlers associated with timers that have expired.
+ */
+void timer_loop()
+{
+    when here;
+
+    while (1){
+       thunk z;
+       here=now();
+
+       for (z=thunks;z;z=z->next) (*z->f)(z->a);
+
+       if (number_of_timers){
+           if (timers[1]->w > here){
+               (*block_function)(timers[1]->w);
+           } else {
+                timer_fire();
+           }
+       } else {
+           thunk z;
+           for (z=thunks;z;z=z->next) (*z->f)(z->a);
+           (*block_function)(0);
+       }
+    }
+}
+
+
+/* Function: register_thunk
+ * Arguments: f: the function to call
+ *            a: the single argument to call it with
+ *
+ * Thunk functions get called at irregular intervals; they
+ * should not assume when, or take a particularly long
+ * amount of time. Thunks are for background cleanup tasks.
+ */
+void register_thunk(void (*f)(void *),void *a)
+{
+    thunk t=(void *)malloc(sizeof(struct thunk));
+    t->f=f;
+    t->a=a;
+    t->next=thunks;
+    thunks=t;
+}
+
+/* Function: initialize_timer
+ * Arguments: block: the function to call to block for the specified interval 
+ *
+ * initialize_timer() must be called before any other timer function,
+ * including timer_loop.
+ */
+void initialize_timer(void (*block)(when))
+{
+    block_function=block;
+    number_of_timers=0;
+    size_of_pqueue=10;
+    timers=(timer *)malloc(sizeof(timer)*size_of_pqueue);
+    thunks=0;
+}
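
A minimal usage sketch of the timer API documented above (initialize_timer,
register_timer, register_thunk, timer_loop), assuming the select.c
implementation of now()/select_timer_block() is installed via
init_unix_timer(); the callback names and the five second interval are made
up for illustration:

#include <stdio.h>
#include <pqtimer.h>
#include <dispatch.h>

/* register_thunk() is defined in pqtimer.c but not declared in pqtimer.h */
extern void register_thunk(void (*f)(void *), void *a);

/* hypothetical timer callback: runs once, about five seconds after
 * registration, from inside timer_loop() */
static void hello(void *arg)
{
        printf("timer fired\n");
}

/* hypothetical thunk: run opportunistically on every pass of timer_loop() */
static void background(void *arg)
{
        /* reap finished work, flush logs, ... */
}

int main(void)
{
        init_unix_timer();      /* calls initialize_timer(select_timer_block) */

        /* HZ is one second in 'when' units; note that the very first
         * registration invokes the block function and so may wait until
         * the deadline (or until io) before returning */
        register_timer(5 * HZ, hello, NULL);
        register_thunk(background, NULL);

        timer_loop();           /* dispatches timers and thunks; never returns */
        return 0;
}
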
diff --git a/lnet/ulnds/pqtimer.h b/lnet/ulnds/pqtimer.h
new file mode 100644 (file)
index 0000000..11efb0e
--- /dev/null
@@ -0,0 +1,25 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+typedef unsigned long long when;
+when now(void);
+typedef struct timer *timer;
+timer register_timer(when interval,
+                    void (*function)(void *),
+                    void *argument);
+timer register_timer_wait(void);
+void remove_timer(timer);
+void timer_loop(void);
+void initialize_timer(void (*block)(when));
+void timer_fire(void);
+
+
+#define HZ 0x100000000ull
+
+
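
The when type above is a 32.32 fixed-point timestamp: the high 32 bits count
whole seconds, so HZ (0x100000000) is one second. A small sketch of the
conversions this implies; the helper names are illustrative and not part of
the header:

#include <pqtimer.h>

/* milliseconds -> 'when' units (32.32 fixed point) */
static when ms_to_when(unsigned long ms)
{
        return ((when)ms * HZ) / 1000;
}

/* split a 'when' value into whole seconds and microseconds */
static void when_to_sec_usec(when w, unsigned long *sec, unsigned long *usec)
{
        *sec  = (unsigned long)(w >> 32);
        *usec = (unsigned long)(((w & 0xffffffffull) * 1000000) >> 32);
}
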
diff --git a/lnet/ulnds/procapi.c b/lnet/ulnds/procapi.c
new file mode 100644 (file)
index 0000000..6da3210
--- /dev/null
@@ -0,0 +1,283 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* api.c:
+ *  This file provides the 'api' side for the process-based nals.
+ *  it is responsible for creating the 'library' side thread,
+ *  and passing wrapped portals transactions to it.
+ *
+ *  Along with initialization, shutdown, and transport to the library
+ *  side, this file contains some stubs to satisfy the nal definition.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syscall.h>
+#include <procbridge.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <errno.h>
+
+
+/* Function: forward
+ * Arguments: nal_t *nal: pointer to my top-side nal structure
+ *            id: the command to pass to the lower layer
+ *            args, args_len:pointer to and length of the request
+ *            ret, ret_len:  pointer to and size of the result
+ * Returns: a portals status code
+ *
+ * forwards a packaged api call from the 'api' side to the 'library'
+ *   side, and collects the result
+ */
+#define forward_failure(operand,fd,buffer,length)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          lib_fini(b->nal_cb);\
+          return(PTL_SEGV);\
+       }
+static int procbridge_forward(nal_t *n, int id, void *args, ptl_size_t args_len,
+                             void *ret, ptl_size_t ret_len)
+{
+    bridge b=(bridge)n->nal_data;
+    procbridge p=(procbridge)b->local;
+    int lib=p->to_lib[1];
+    int k;
+
+    forward_failure(write,lib, &id, sizeof(id));
+    forward_failure(write,lib,&args_len, sizeof(args_len));
+    forward_failure(write,lib,&ret_len, sizeof(ret_len));
+    forward_failure(write,lib,args, args_len);
+
+    do {
+        k=syscall(SYS_read, p->from_lib[0], ret, ret_len);
+    } while ((k!=ret_len) && (errno == EINTR));
+
+    if(k!=ret_len){
+        perror("nal: read return block");
+        return PTL_SEGV;
+    }
+    return (PTL_OK);
+}
+#undef forward_failure
+
+
+/* Function: shutdown
+ * Arguments: nal: a pointer to my top side nal structure
+ *            ni: my network interface index
+ *
+ * cleanup nal state, reclaim the lower side thread and
+ *   its state using PTL_FINI codepoint
+ */
+static int procbridge_shutdown(nal_t *n, int ni)
+{
+    bridge b=(bridge)n->nal_data;
+    procbridge p=(procbridge)b->local;
+    int code=PTL_FINI;
+
+    syscall(SYS_write, p->to_lib[1],&code,sizeof(code));
+    syscall(SYS_read, p->from_lib[0],&code,sizeof(code));
+
+    syscall(SYS_close, p->to_lib[0]);
+    syscall(SYS_close, p->to_lib[1]);
+    syscall(SYS_close, p->from_lib[0]);
+    syscall(SYS_close, p->from_lib[1]);
+
+    free(p);
+    return(0);
+}
+
+
+/* Function: validate
+ *    useless stub
+ */
+static int procbridge_validate(nal_t *nal, void *base, ptl_size_t extent)
+{
+    return(0);
+}
+
+
+/* Function: yield
+ * Arguments:  pid:
+ *
+ *  this function was originally intended to allow the
+ *   lower half thread to be scheduled to allow progress. we
+ *   overload it to explicitly block until signalled by the
+ *   lower half.
+ */
+static void procbridge_yield(nal_t *n)
+{
+    bridge b=(bridge)n->nal_data;
+    procbridge p=(procbridge)b->local;
+
+    pthread_mutex_lock(&p->mutex);
+    pthread_cond_wait(&p->cond,&p->mutex);
+    pthread_mutex_unlock(&p->mutex);
+}
+
+
+static void procbridge_lock(nal_t * nal, unsigned long *flags){}
+static void procbridge_unlock(nal_t * nal, unsigned long *flags){}
+/* api_nal
+ *  the interface vector to allow the generic code to access
+ *  this nal. This is separate from the library side nal_cb.
+ *  TODO: should be dynamically allocated
+ */
+static nal_t api_nal = {
+    ni:       {0},
+    nal_data: NULL,
+    forward:  procbridge_forward,
+    shutdown: procbridge_shutdown,
+    validate: procbridge_validate,
+    yield:    procbridge_yield,
+    lock:     procbridge_lock,
+    unlock:   procbridge_unlock
+};
+
+/* Function: bridge_init
+ *
+ * Arguments:  pid: requested process id (port offset)
+ *                  PTL_ID_ANY not supported.
+ *             desired: limits passed from the application
+ *                      and effectively ignored
+ *             actual:  limits actually allocated and returned
+ *
+ * Returns: a pointer to my statically allocated top side NAL
+ *          structure
+ *
+ * initializes the tcp nal. we define unix_failure as an
+ * error wrapper to cut down clutter.
+ */
+#define unix_failure(operand,fd,buffer,length,text)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          perror(text);\
+          return(NULL);\
+       }
+#if 0
+static nal_t *bridge_init(ptl_interface_t nal,
+                          ptl_pid_t pid_request,
+                          ptl_ni_limits_t *desired,
+                          ptl_ni_limits_t *actual,
+                          int *rc)
+{
+    procbridge p;
+    bridge b;
+    static int initialized=0;
+    ptl_ni_limits_t limits = {-1,-1,-1,-1,-1};
+
+    if(initialized) return (&api_nal);
+
+    init_unix_timer();
+
+    b=(bridge)malloc(sizeof(struct bridge));
+    p=(procbridge)malloc(sizeof(struct procbridge));
+    api_nal.nal_data=b;
+    b->local=p;
+
+    if(pipe(p->to_lib) || pipe(p->from_lib)) {
+        perror("nal_init: pipe");
+        return(NULL);
+    }
+
+    if (desired) limits = *desired;
+    unix_failure(write,p->to_lib[1], &pid_request, sizeof(pid_request),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &nal, sizeof(ptl_interface_t),
+                       "nal_init: write");
+
+    if(pthread_create(&p->t, NULL, nal_thread, b)) {
+        perror("nal_init: pthread_create");
+        return(NULL);
+    }
+
+    unix_failure(read,p->from_lib[0], actual, sizeof(ptl_ni_limits_t),
+                 "tcp_init: read");
+    unix_failure(read,p->from_lib[0], rc, sizeof(rc),
+                 "nal_init: read");
+
+    if(*rc) return(NULL);
+
+    initialized = 1;
+    pthread_mutex_init(&p->mutex,0);
+    pthread_cond_init(&p->cond, 0);
+
+    return (&api_nal);
+}
+#endif
+
+ptl_nid_t tcpnal_mynid;
+
+nal_t *procbridge_interface(int num_interface,
+                            ptl_pt_index_t ptl_size,
+                            ptl_ac_index_t acl_size,
+                            ptl_pid_t requested_pid)
+{
+    procbridge p;
+    bridge b;
+    static int initialized=0;
+    ptl_ni_limits_t limits = {-1,-1,-1,-1,-1};
+    int rc, nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */
+
+    if(initialized) return (&api_nal);
+
+    init_unix_timer();
+
+    b=(bridge)malloc(sizeof(struct bridge));
+    p=(procbridge)malloc(sizeof(struct procbridge));
+    api_nal.nal_data=b;
+    b->local=p;
+
+    if(pipe(p->to_lib) || pipe(p->from_lib)) {
+        perror("nal_init: pipe");
+        return(NULL);
+    }
+
+    if (ptl_size)
+           limits.max_ptable_index = ptl_size;
+    if (acl_size)
+           limits.max_atable_index = acl_size;
+
+    unix_failure(write,p->to_lib[1], &requested_pid, sizeof(requested_pid),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &nal_type, sizeof(nal_type),
+                       "nal_init: write");
+
+    if(pthread_create(&p->t, NULL, nal_thread, b)) {
+        perror("nal_init: pthread_create");
+        return(NULL);
+    }
+
+    unix_failure(read,p->from_lib[0], &rc, sizeof(rc),
+                 "nal_init: read");
+
+    if(rc) return(NULL);
+
+    b->nal_cb->ni.nid = tcpnal_mynid;
+    initialized = 1;
+    pthread_mutex_init(&p->mutex,0);
+    pthread_cond_init(&p->cond, 0);
+
+    return (&api_nal);
+}
+#undef unix_failure
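
As the forward() comment above describes, each wrapped api call travels over
the to_lib pipe as a command id, the argument and result lengths, and then the
raw argument block; the library side replies with ret_len bytes on from_lib.
A sketch of that framing; the struct is purely illustrative (the real code
writes each field separately with syscall(SYS_write, ...) and applies no
padding or byte swapping, since both ends are threads of one process):

#include <portals/p30.h>        /* ptl_size_t, as included elsewhere in this tree */

/* illustrative only: the logical layout of one request on p->to_lib,
 * as written by procbridge_forward() and read back by data_from_api() */
struct procbridge_request_header {
        int        id;          /* dispatch index, or PTL_FINI to shut the thread down */
        ptl_size_t args_len;    /* length of the wrapped argument block that follows */
        ptl_size_t ret_len;     /* length of the reply the caller then reads on from_lib */
};
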
diff --git a/lnet/ulnds/procbridge.h b/lnet/ulnds/procbridge.h
new file mode 100644 (file)
index 0000000..060ae7b
--- /dev/null
@@ -0,0 +1,40 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#ifndef _PROCBRIDGE_H_
+#define _PROCBRIDGE_H_
+
+#include <pthread.h>
+#include <bridge.h>
+#include <ipmap.h>
+
+
+typedef struct procbridge {
+    pthread_t t;
+    pthread_cond_t cond;
+    pthread_mutex_t mutex;
+    int to_lib[2];
+    int from_lib[2];
+} *procbridge;
+
+extern void *nal_thread(void *);
+
+
+#define PTL_INIT        (LIB_MAX_DISPATCH+1)
+#define PTL_FINI        (LIB_MAX_DISPATCH+2)
+
+#define MAX_ACLS        1
+#define MAX_PTLS        128
+
+extern void set_address(bridge t,ptl_pid_t pidrequest);
+extern nal_t *procbridge_interface(int num_interface,
+                            ptl_pt_index_t ptl_size,
+                            ptl_ac_index_t acl_size,
+                            ptl_pid_t requested_pid);
+
+#endif
diff --git a/lnet/ulnds/proclib.c b/lnet/ulnds/proclib.c
new file mode 100644 (file)
index 0000000..c3ee103
--- /dev/null
@@ -0,0 +1,270 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* lib.c:
+ *  This file provides the 'library' side for the process-based nals.
+ *  it is responsible for communication with the 'api' side and
+ *  providing service to the generic portals 'library'
+ *  implementation. 'library' might be better termed 'communication'
+ *  or 'kernel'.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <procbridge.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <errno.h>
+#include <timer.h>
+//#include <util/pqtimer.h>
+#include <dispatch.h>
+
+/* the following functions are stubs to satisfy the nal definition
+   without doing anything particularly useful */
+
+static int nal_write(nal_cb_t *nal,
+                     void *private,
+                     user_ptr dst_addr,
+                     void *src_addr,
+                     ptl_size_t len)
+{
+    memcpy(dst_addr, src_addr, len);
+    return 0;
+}
+
+static int nal_read(nal_cb_t * nal,
+                    void *private,
+                   void *dst_addr,
+                   user_ptr src_addr,
+                   size_t len)
+{
+       memcpy(dst_addr, src_addr, len);
+       return 0;
+}
+
+static void *nal_malloc(nal_cb_t *nal,
+                        ptl_size_t len)
+{
+    void *buf =  malloc(len);
+    return buf;
+}
+
+static void nal_free(nal_cb_t *nal,
+                     void *buf,
+                     ptl_size_t len)
+{
+    free(buf);
+}
+
+static void nal_printf(nal_cb_t *nal,
+                       const char *fmt,
+                       ...)
+{
+    va_list        ap;
+
+    va_start(ap, fmt);
+    vprintf(fmt, ap);
+    va_end(ap);
+}
+
+
+static void nal_cli(nal_cb_t *nal,
+                    unsigned long *flags)
+{
+}
+
+
+static void nal_sti(nal_cb_t *nal,
+                    unsigned long *flags)
+{
+}
+
+
+static int nal_dist(nal_cb_t *nal,
+                    ptl_nid_t nid,
+                    unsigned long *dist)
+{
+    return 0;
+}
+    
+
+
+/* Function:  data_from_api
+ * Arguments: t: the nal state for this interface
+ * Returns: whether to continue reading from the pipe
+ *
+ *   data_from_api() reads data from the api side in response
+ *   to a select.
+ *
+ *   We define data_failure() for syntactic convenience
+ *   of unix error reporting.
+ */
+
+#define data_failure(operand,fd,buffer,length)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          lib_fini(b->nal_cb);\
+          return(0);\
+       }
+static int data_from_api(void *arg)
+{
+    bridge b = arg;
+    procbridge p=(procbridge)b->local;
+    /* where are these two sizes derived from ??*/
+    char arg_block[ 256 ];
+    char ret_block[ 128 ];
+    ptl_size_t arg_len,ret_len;
+    int fd=p->to_lib[0];
+    int index;
+
+    data_failure(read,fd, &index, sizeof(index));
+
+    if (index==PTL_FINI) {
+        lib_fini(b->nal_cb);
+        if (b->shutdown) (*b->shutdown)(b);
+        syscall(SYS_write, p->from_lib[1],&b->alive,sizeof(b->alive));
+
+        /* a heavy-handed but convenient way of shutting down
+           the lower side thread */
+        pthread_exit(0);
+    }
+
+    data_failure(read,fd, &arg_len, sizeof(arg_len));
+    data_failure(read,fd, &ret_len, sizeof(ret_len));
+    data_failure(read,fd, arg_block, arg_len);
+
+    lib_dispatch(b->nal_cb, NULL, index, arg_block, ret_block);
+
+    data_failure(write,p->from_lib[1],ret_block, ret_len);
+    return(1);
+}
+#undef data_failure
+
+
+
+static void wakeup_topside(void *z)
+{
+    bridge b=z;
+    procbridge p=b->local;
+
+    pthread_mutex_lock(&p->mutex);
+    pthread_cond_broadcast(&p->cond);
+    pthread_mutex_unlock(&p->mutex);
+}
+
+
+/* Function:  nal_thread
+ * Arguments: z: an opaque reference to a nal control structure
+ *               allocated and partially populated by the api level code
+ * Returns: nothing; it returns only on error or explicit shutdown
+ *
+ *  This function is the entry point of the pthread initiated on 
+ *  the api side of the interface. This thread is used to handle
+ *  asynchronous delivery to the application.
+ * 
+ *  We define a limit macro to place a ceiling on limits
+ *   for syntactic convenience
+ */
+#define LIMIT(x,y,max)\
+     if ((unsigned int)x > max) y = max;
+
+extern int tcpnal_init(bridge);
+
+nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0};
+
+void *nal_thread(void *z)
+{
+    bridge b=z;
+    procbridge p=b->local;
+    int rc;
+    ptl_pid_t pid_request;
+    int nal_type;
+    ptl_ni_limits_t desired;
+    ptl_ni_limits_t actual;
+    
+    b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t));
+    b->nal_cb->nal_data=b;
+    b->nal_cb->cb_read=nal_read;
+    b->nal_cb->cb_write=nal_write;
+    b->nal_cb->cb_malloc=nal_malloc;
+    b->nal_cb->cb_free=nal_free;
+    b->nal_cb->cb_map=NULL;
+    b->nal_cb->cb_unmap=NULL;
+    b->nal_cb->cb_printf=nal_printf;
+    b->nal_cb->cb_cli=nal_cli;
+    b->nal_cb->cb_sti=nal_sti;
+    b->nal_cb->cb_dist=nal_dist;
+
+
+    register_io_handler(p->to_lib[0],READ_HANDLER,data_from_api,(void *)b);
+
+    if(!(rc = syscall(SYS_read, p->to_lib[0], &pid_request, sizeof(pid_request))))
+        perror("procbridge read from api");
+    if(!(rc = syscall(SYS_read, p->to_lib[0], &desired, sizeof(ptl_ni_limits_t))))
+        perror("procbridge read from api");
+    if(!(rc = syscall(SYS_read, p->to_lib[0], &nal_type, sizeof(nal_type))))
+        perror("procbridge read from api");
+
+    actual = desired;
+    LIMIT(desired.max_match_entries,actual.max_match_entries,MAX_MES);
+    LIMIT(desired.max_mem_descriptors,actual.max_mem_descriptors,MAX_MDS);
+    LIMIT(desired.max_event_queues,actual.max_event_queues,MAX_EQS);
+    LIMIT(desired.max_atable_index,actual.max_atable_index,MAX_ACLS);
+    LIMIT(desired.max_ptable_index,actual.max_ptable_index,MAX_PTLS);
+
+    set_address(b,pid_request);
+
+    if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b);
+    /* initialize the generic 'library' level code */
+
+    rc = lib_init(b->nal_cb, 
+                  b->nal_cb->ni.nid,
+                  b->nal_cb->ni.pid,
+                 10,
+                 actual.max_ptable_index,
+                 actual.max_atable_index);
+
+    /*
+     * Whatever the initialization returned is passed back to the
+     * user level code for further interpretation.  We just exit if
+     * it is non-zero since something went wrong.
+     */
+    /* this should perform error checking */
+#if 0
+    write(p->from_lib[1], &actual, sizeof(ptl_ni_limits_t));
+#endif
+    syscall(SYS_write, p->from_lib[1], &rc, sizeof(rc));
+    
+    if(!rc) {
+        /* the thunk function is called each time the timer loop
+           performs an operation and returns to blocking mode. we
+           overload this function to inform the api side that
+           it may be interested in looking at the event queue */
+        register_thunk(wakeup_topside,b);
+        timer_loop();
+    }
+    return(0);
+}
+#undef LIMIT
+
diff --git a/lnet/ulnds/select.c b/lnet/ulnds/select.c
new file mode 100644 (file)
index 0000000..c4f84f4
--- /dev/null
@@ -0,0 +1,165 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* select.c:
+ *  Provides a general mechanism for registering and dispatching
+ *  io events through the select system call.
+ */
+
+#ifdef sun
+#include <sys/filio.h>
+#else
+#include <sys/ioctl.h>
+#endif
+
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdlib.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+
+
+static struct timeval beginning_of_epoch;
+static io_handler io_handlers;
+
+/* Function: now
+ *
+ * Return: the current time in canonical units: a 64 bit number
+ *   where the most significant 32 bits contain the number
+ *   of seconds, and the least significant 32 bits a count of (1/(2^32))ths
+ *   of a second.
+ */
+when now()
+{
+    struct timeval result;
+  
+    gettimeofday(&result,0);
+    return((((unsigned long long)result.tv_sec)<<32)|
+           (((unsigned long long)result.tv_usec)<<32)/1000000);
+}
+
+
+/* Function: register_io_handler
+ * Arguments: fd: the file descriptor of interest
+ *            type: a mask of READ_HANDLER, WRITE_HANDLER, EXCEPTION_HANDLER
+ *            function: a function to call when io is available on fd
+ *            arg: an opaque correlator to return to the handler
+ * Returns: a pointer to the io_handler structure
+ */
+io_handler register_io_handler(int fd,
+                               int type,
+                               int (*function)(void *),
+                               void *arg)
+{
+    io_handler i=(io_handler)malloc(sizeof(struct io_handler));
+    if ((i->fd=fd)>=0){
+        i->type=type;
+        i->function=function;
+        i->argument=arg;
+        i->disabled=0;
+        i->last=&io_handlers;
+        if ((i->next=io_handlers)) i->next->last=&i->next;
+        io_handlers=i;
+    }
+    return(i);
+}
+
+/* Function: remove_io_handler
+ * Arguments: i: a pointer to the handler to stop servicing
+ *
+ * remove_io_handler() doesn't actually free the handler, due
+ * to reentrancy problems. it just marks the handler for 
+ * later cleanup by the blocking function.
+ */
+void remove_io_handler (io_handler i)
+{
+    i->disabled=1;
+}
+
+static void set_flag(io_handler n,fd_set *fds)
+{
+    if (n->type & READ_HANDLER) FD_SET(n->fd,fds);
+    if (n->type & WRITE_HANDLER) FD_SET(n->fd,fds+1);
+    if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd,fds+2);
+}
+
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ * 
+ *   This function dispatches the various file descriptors' handler
+ *   functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+    fd_set fds[3];
+    struct timeval timeout;
+    struct timeval *timeout_pointer;
+    int result;
+    io_handler j;
+    io_handler *k;
+
+    /* TODO: loop until the entire interval is expired*/
+    if (until){
+       when interval=until-now();
+        timeout.tv_sec=(interval>>32);
+        timeout.tv_usec=(((interval&0xffffffffull)*1000000)>>32);
+        timeout_pointer=&timeout;
+    } else timeout_pointer=0;
+
+    FD_ZERO(fds);
+    FD_ZERO(fds+1);
+    FD_ZERO(fds+2);
+    for (k=&io_handlers;*k;){
+        if ((*k)->disabled){
+            j=*k;
+            *k=(*k)->next;
+            free(j);
+        }
+        if (*k) {
+           set_flag(*k,fds);
+           k=&(*k)->next;
+       }
+    }
+    result=select(FD_SETSIZE,fds,fds+1,fds+2,timeout_pointer);
+
+    if (result > 0)
+        for (j=io_handlers;j;j=j->next){
+            if (!(j->disabled) && 
+                ((FD_ISSET(j->fd,fds) && (j->type & READ_HANDLER)) ||
+                 (FD_ISSET(j->fd,fds+1) && (j->type & WRITE_HANDLER)) ||
+                 (FD_ISSET(j->fd,fds+2) && (j->type & EXCEPTION_HANDLER)))){
+                if (!(*j->function)(j->argument))
+                    j->disabled=1;
+            }
+        }
+}
+
+/* Function: init_unix_timer()
+ *   is called to initialize the library 
+ */
+void init_unix_timer()
+{
+    io_handlers=0;
+    gettimeofday(&beginning_of_epoch, 0);
+    initialize_timer(select_timer_block);
+}
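
A minimal sketch of how this dispatcher is used: install the select-based
block function, register a READ_HANDLER on a file descriptor, and let
timer_loop() drive select(). The echo callback and the use of stdin are made
up for illustration; real callers register sockets instead:

#include <stdio.h>
#include <unistd.h>
#include <pqtimer.h>
#include <dispatch.h>

/* hypothetical handler: echo whatever arrives on the fd; returning 0
 * tells the dispatcher to disable this handler, 1 keeps it registered */
static int echo_input(void *arg)
{
        int fd = *(int *)arg;
        char buf[256];
        int n = read(fd, buf, sizeof(buf));

        if (n <= 0)
                return 0;
        fwrite(buf, 1, n, stdout);
        return 1;
}

int main(void)
{
        static int fd = 0;              /* stdin, for illustration */

        init_unix_timer();              /* installs select_timer_block() */
        register_io_handler(fd, READ_HANDLER, echo_input, &fd);
        timer_loop();                   /* select()s and dispatches forever */
        return 0;
}
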
diff --git a/lnet/ulnds/socklnd/Makefile.am b/lnet/ulnds/socklnd/Makefile.am
new file mode 100644 (file)
index 0000000..dc427b0
--- /dev/null
@@ -0,0 +1,5 @@
+CPPFLAGS=
+INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir)
+lib_LIBRARIES = libtcpnal.a
+pkginclude_HEADERS =  pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
+libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
diff --git a/lnet/ulnds/socklnd/README b/lnet/ulnds/socklnd/README
new file mode 100644 (file)
index 0000000..6cb93d9
--- /dev/null
@@ -0,0 +1,53 @@
+This library implements two NAL interfaces, both running over IP.
+The first, tcpnal, creates TCP connections between participating
+processes in order to transport the portals requests. The second,
+ernal, provides a simple transport protocol which runs over
+UDP datagrams.
+
+The interface functions return both of these values in host order for
+convenience and readability. However, this means that addresses
+exchanged in messages between hosts of different byte orderings will
+not function properly.
+
+Both NALs use the same support functions in order to schedule events
+and communicate with the generic portals implementation.
+
+            -------------------------
+            |         api           |
+            |_______________________|
+            |         lib           |
+            |_______________________|
+            | ernal  |   |tcpnal    |
+            |--------|   |----------|
+            | udpsock|   |connection|
+            |-----------------------|
+            |     timer/select      |
+            -------------------------
+
+
+  These NALs use the framework from fdnal of a pipe between the api
+and library sides. This is wrapped up in the select on the library
+side, and blocks on the api side. Performance could be significantly
+improved by collapsing this artificial barrier, by using shared
+memory queues, or by wiring the api layer directly to the library.
+
+
+nid is defined as the low order 24-bits of the IP address of the
+physical node left shifted by 8 plus a virtual node number of 0
+through 255 (really only 239).  The virtual node number of a tcpnal
+application should be specified using the environment variable
+PTL_VIRTNODE.  pid is now a completely arbitrary number in the
+range of 0 to 255.  The IP interface used can be overridden by
+specifying the appropriate hostid by setting the PTL_HOSTID
+environment variable.  The value can be either dotted decimal
+(n.n.n.n) or hex starting with "0x".
+TCPNAL:
+  As the NAL needs to try to send to a particular nid/pid pair, it
+  will open up connections on demand. Because the port associated with
+  the connecting socket is different from the bound port, two
+  connections will normally be established between a pair of peers, with
+  data flowing from the anonymous connect (active) port to the advertised
+  or well-known bound (passive) port of each peer.
+
+  Should the connection fail to open, an error is reported to the
+  library component, which causes the api request to fail.
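
A worked sketch of the nid/port arithmetic described above, mirroring what
set_address() in address.c does when DIRECT_IP_MODE is not defined (as
shipped, ipmap.h defines DIRECT_IP_MODE, in which case the nid is simply the
32-bit IP address and the pid is used as the port). The host address, virtual
node and pid below are made-up values:

#include <stdio.h>

int main(void)
{
        /* made-up host 10.0.5.7, PTL_VIRTNODE=3, pid 42 */
        unsigned int ip       = (10u << 24) | (0u << 16) | (5u << 8) | 7u;
        unsigned int virtnode = 3;
        unsigned int pid      = 42;

        /* low 24 bits of the IP, shifted left by 8, plus the virtual node */
        unsigned int nid  = ((ip & 0xffffffu) << 8) + virtnode;
        /* the bound port packs vnode and pid above PNAL_BASE_PORT (4096) */
        unsigned int port = ((virtnode << 8) + pid) + 4096;

        printf("nid=0x%08x port=%u\n", nid, port);
        return 0;
}
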
diff --git a/lnet/ulnds/socklnd/address.c b/lnet/ulnds/socklnd/address.c
new file mode 100644 (file)
index 0000000..b422c3f
--- /dev/null
@@ -0,0 +1,146 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* address.c:
+ * This file provides functions to acquire the IP address of the node
+ * and translate it into a NID/PID pair which supports a static
+ * mapping of virtual nodes into the port range of an IP socket.
+ */
+
+#include <stdlib.h>
+#include <netdb.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <portals/p30.h>
+#include <bridge.h>
+#include <ipmap.h>
+
+
+/* Function:  get_node_id
+ * Returns: a 32 bit id for this node, actually a big-endian IP address
+ *
+ * get_node_id() determines the host name and uses the resolver to
+ *  find out its ip address. This is fairly fragile and inflexible, but
+ *  explicitly asking about interfaces and their addresses is very
+ *  complicated and nonportable.
+ */
+static unsigned int get_node_id(void)
+{
+    char buffer[255];
+    unsigned int x;
+    struct hostent *he;
+    char * host_envp;
+
+    if (!(host_envp = getenv("PTL_HOSTID")))
+        {
+            gethostname(buffer,sizeof(buffer));
+            he=gethostbyname(buffer);
+            if (he)
+                    x=*(unsigned int *)he->h_addr_list[0];
+            else
+                    x = 0;
+            return(ntohl(x));
+        }
+    else 
+        {
+            if (host_envp[1] != 'x')
+                {
+                    int a, b, c, d;
+                    sscanf(host_envp, "%d.%d.%d.%d", &a, &b, &c, &d);
+                    return ((a<<24) | (b<<16) | (c<<8) | d);
+                }
+            else
+                {
+                    long long hostid = strtoll(host_envp, 0, 0);
+                    return((unsigned int) hostid);
+                }
+        }
+}
+
+
+/* Function:  set_address
+ * Arguments: t: a bridge structure to populate with the request
+ *
+ * set_address performs the bit manipulations to set the nid, pid, and
+ *    iptop8 fields of the bridge structure.
+ *
+ * TODO: fix pidrequest to try to do dynamic binding if PTL_ID_ANY
+ */
+
+#ifdef DIRECT_IP_MODE
+void set_address(bridge t,ptl_pid_t pidrequest)
+{
+    int port;
+    if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0;
+    else port=pidrequest;
+    t->nal_cb->ni.nid=get_node_id();
+    t->nal_cb->ni.pid=port;
+}
+#else
+
+void set_address(bridge t,ptl_pid_t pidrequest)
+{
+    int virtnode, in_addr, port; 
+    ptl_pid_t pid;
+
+    /* get and remember my node id*/
+    if (!getenv("PTL_VIRTNODE"))
+        virtnode = 0;
+    else 
+        {
+            int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT 
+                                              >> PNAL_VNODE_SHIFT);
+            virtnode = atoi(getenv("PTL_VIRTNODE"));
+            if (virtnode > maxvnode)
+                {
+                    fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n",
+                            virtnode, maxvnode);
+                    return;
+                }
+        }
+    
+    in_addr = get_node_id();
+
+    t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */
+    t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK) 
+                            << PNAL_VNODE_SHIFT)
+        + virtnode;
+
+    pid=pidrequest;
+    /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */
+#ifdef notyet
+    if (pid==(unsigned short)PTL_PID_ANY) port = 0;
+#endif
+    if (pid==(unsigned short)PTL_PID_ANY) 
+        {
+            fprintf(stderr, "portal pid PTL_ID_ANY is not currently supported\n");
+            return;
+        }
+    else if (pid > PNAL_PID_MASK)
+        {
+            fprintf(stderr, "portal pid of %d is too large - max %d\n",
+                    pid, PNAL_PID_MASK);
+            return;
+        }
+    else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT;
+    t->nal_cb->ni.pid=pid;
+}
+#endif
diff --git a/lnet/ulnds/socklnd/bridge.h b/lnet/ulnds/socklnd/bridge.h
new file mode 100644 (file)
index 0000000..0b4940f
--- /dev/null
@@ -0,0 +1,29 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#include <portals/lib-p30.h>
+
+typedef struct bridge {
+    int alive;
+    nal_cb_t *nal_cb;
+    void *lower;
+    void *local;
+    void (*shutdown)(struct bridge *);
+    /* this doesn't really belong here */
+    unsigned char iptop8;
+} *bridge;
+
+
+nal_t *bridge_init(ptl_interface_t nal,
+                   ptl_pid_t pid_request,
+                   ptl_ni_limits_t *desired,
+                   ptl_ni_limits_t *actual,
+                   int *rc);
+
+typedef int (*nal_initialize)(bridge);
+extern nal_initialize nal_table[PTL_IFACE_MAX];
diff --git a/lnet/ulnds/socklnd/connection.c b/lnet/ulnds/socklnd/connection.c
new file mode 100644 (file)
index 0000000..310e899
--- /dev/null
@@ -0,0 +1,294 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* connection.c:
+   This file provides a simple stateful connection manager which
+   builds tcp connections on demand and leaves them open for
+   future use. It also provides the machinery to allow peers
+   to connect to it
+*/
+
+#include <stdlib.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <table.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <connection.h>
+#include <errno.h>
+
+
+/* global variable: acceptor port */
+unsigned short tcpnal_acceptor_port = 988;
+
+
+/* Function:  compare_connection
+ * Arguments: connection c:      a connection in the hash table
+ *            ptl_process_id_t:  an id to verify against
+ * Returns: 1 if the connection is the one requested, 0 otherwise
+ *
+ *    compare_connection() tests for collisions in the hash table
+ */
+static int compare_connection(void *arg1, void *arg2)
+{
+        connection c = arg1;
+        unsigned int * id = arg2;
+        return((c->ip==id[0]) && (c->port==id[1]));
+}
+
+
+/* Function:  connection_key
+ * Arguments: ptl_process_id_t id:  an id to hash
+ * Returns: a not-particularly-well-distributed hash
+ *          of the id
+ */
+static unsigned int connection_key(unsigned int *id)
+{
+    return(id[0]^id[1]);
+}
+
+
+/* Function:  remove_connection
+ * Arguments: c: the connection to remove
+ */
+void remove_connection(void *arg)
+{
+        connection c = arg;
+        unsigned int id[2];
+        
+        id[0]=c->ip;
+        id[1]=c->port;
+        hash_table_remove(c->m->connections,id);
+        close(c->fd);
+        free(c);
+}
+
+
+/* Function:  read_connection: 
+ * Arguments: c:    the connection to read from 
+ *            dest: the buffer to read into
+ *            len:  the number of bytes to read   
+ * Returns: success as 1, or failure as 0
+ *
+ *   read_connection() reads data from the connection, continuing
+ *   to read partial results until the request is satisfied or
+ *   it errors. TODO: this read should be covered by signal protection.
+ */
+int read_connection(connection c,
+                    unsigned char *dest,
+                    int len)
+{
+    int offset=0,rc;
+
+    if (len){
+        do {
+            if((rc=syscall(SYS_read, c->fd, dest+offset, len-offset))<=0){
+                if (errno==EINTR) {
+                    rc=0;
+                } else {
+                    remove_connection(c);
+                    return(0);
+                }
+            }
+            offset+=rc;
+        } while (offset<len);
+    }
+    return(1);
+}
+
+static int connection_input(void *d)
+{
+        connection c = d;
+        return((*c->m->handler)(c->m->handler_arg,c));
+}
+
+
+/* Function:  allocate_connection
+ * Arguments: m:        the manager the allocation is occurring in the context of
+ *            ip, port: portals endpoint address for this connection
+ *            fd:       open file descriptor for the socket
+ * Returns: an allocated connection structure
+ *
+ * just encompasses the action common to active and passive
+ *  connections of allocation and placement in the global table
+ */
+static connection allocate_connection(manager m,
+                               unsigned int ip,
+                               unsigned short port,
+                               int fd)
+{
+    connection c=malloc(sizeof(struct connection));
+    unsigned int id[2];
+    c->m=m;
+    c->fd=fd;
+    c->ip=ip;
+    c->port=port;
+    id[0]=ip;
+    id[1]=port;
+    register_io_handler(fd,READ_HANDLER,connection_input,c);
+    hash_table_insert(m->connections,c,id);
+    return(c);
+}
+
+
+/* Function:  new_connection
+ * Arguments: z: opaque argument holding the connection manager
+ * Returns: 1 in order to reregister for new connection requests
+ *
+ *  called when the bound service socket receives
+ *     a new connection request; it always accepts and
+ *     installs a new connection
+ */
+static int new_connection(void *z)
+{
+    manager m=z;
+    struct sockaddr_in s;
+    int len=sizeof(struct sockaddr_in);
+    int fd=accept(m->bound,(struct sockaddr *)&s,&len);
+    unsigned int nid=*((unsigned int *)&s.sin_addr);
+    /* cfs specific hack */
+    //unsigned short pid=s.sin_port;
+    allocate_connection(m,htonl(nid),0/*pid*/,fd);
+    return(1);
+}
+
+
+/* Function:  force_tcp_connection
+ * Arguments: t: tcpnal
+ *            dest: portals endpoint for the connection
+ * Returns: an allocated connection structure, either
+ *          a pre-existing one, or a new connection
+ */
+connection force_tcp_connection(manager m,
+                                unsigned int ip,
+                                unsigned short port)
+{
+    connection c;
+    struct sockaddr_in addr;
+    unsigned int id[2];
+
+    port = tcpnal_acceptor_port;
+
+    id[0]=ip;
+    id[1]=port;
+
+    if (!(c=hash_table_find(m->connections,id))){
+        int fd;
+
+        bzero((char *) &addr, sizeof(addr));
+        addr.sin_family      = AF_INET;
+        addr.sin_addr.s_addr = htonl(ip);
+        addr.sin_port        = htons(port);
+
+        if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { 
+            perror("tcpnal socket failed");
+            exit(-1);
+        }
+        if (connect(fd,
+                    (struct sockaddr *)&addr,
+                    sizeof(struct sockaddr_in)))
+            {
+                perror("tcpnal connect");
+                return(0);
+            }
+        return(allocate_connection(m,ip,port,fd));
+    }
+    return(c);
+}
+
+
+/* Function:  bind_socket
+ * Arguments: t: the nal state for this interface
+ *            port: the port to attempt to bind to
+ * Returns: 1 on success, or 0 on error
+ *
+ * bind_socket() attempts to allocate and bind a socket to the requested
+ *  port, or dynamically assign one from the kernel should the port be
+ *  zero. Sets the bound and bound_handler elements of m.
+ *
+ *  TODO: The port should be an explicitly sized type.
+ */
+static int bind_socket(manager m,unsigned short port)
+{
+    struct sockaddr_in addr;
+    int alen=sizeof(struct sockaddr_in);
+    
+    if ((m->bound = socket(AF_INET, SOCK_STREAM, 0)) < 0)  
+        return(0);
+    
+    bzero((char *) &addr, sizeof(addr));
+    addr.sin_family      = AF_INET;
+    addr.sin_addr.s_addr = 0;
+    addr.sin_port        = port; 
+    
+    if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){
+        perror ("tcpnal bind"); 
+        return(0);
+    }
+    
+    getsockname(m->bound,(struct sockaddr *)&addr, &alen);
+
+    m->bound_handler=register_io_handler(m->bound,READ_HANDLER,
+                                         new_connection,m);
+    listen(m->bound,5); 
+    m->port=addr.sin_port;
+    return(1);
+}
+
+
+/* Function:  shutdown_connections
+ * Arguments: m: the manager structure
+ *
+ * close all connections and reclaim resources
+ */
+void shutdown_connections(manager m)
+{
+    close(m->bound);
+    remove_io_handler(m->bound_handler);
+    hash_destroy_table(m->connections,remove_connection);
+    free(m);
+}
+
+
+/* Function:  init_connections
+ * Arguments: t: the nal state for this interface
+ *            port: the port to attempt to bind to
+ * Returns: a newly allocated manager structure, or
+ *          zero if the fixed port could not be bound
+ */
+manager init_connections(unsigned short pid,
+                         int (*input)(void *, void *),
+                         void *a)
+{
+    manager m=(manager)malloc(sizeof(struct manager));
+    m->connections=hash_create_table(compare_connection,connection_key);
+    m->handler=input;
+    m->handler_arg=a;
+    if (bind_socket(m,pid)) return(m);
+    free(m);
+    return(0);
+}
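
A minimal sketch of how the connection manager above could be driven: bind a
listening socket, register an input handler, force an outgoing connection, and
let the select loop dispatch. The handler, peer address and port are
hypothetical; the real user of this API is the tcpnal code, which passes its
own state as the handler argument:

#include <stdio.h>
#include <pqtimer.h>
#include <dispatch.h>
#include <connection.h>

/* hypothetical per-connection handler, called from the select loop when
 * data is available; returning 0 disables the handler for this fd */
static int handle_input(void *arg, void *conn)
{
        connection c = conn;
        unsigned char byte;

        if (!read_connection(c, &byte, 1))
                return 0;
        printf("got byte 0x%02x from %08x\n", byte, c->ip);
        return 1;
}

int main(void)
{
        manager m;
        connection c;

        init_unix_timer();
        m = init_connections(0, handle_input, NULL);    /* port 0: kernel picks */
        if (!m)
                return 1;

        /* connect out to a made-up peer 10.0.5.7 (host byte order); note that
         * force_tcp_connection() overrides the port with tcpnal_acceptor_port */
        c = force_tcp_connection(m, (10u << 24) | (5u << 8) | 7u, 0);
        if (c)
                printf("connected, fd=%d\n", c->fd);

        timer_loop();   /* dispatch io until shutdown */
        return 0;
}
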
diff --git a/lnet/ulnds/socklnd/connection.h b/lnet/ulnds/socklnd/connection.h
new file mode 100644 (file)
index 0000000..6f57287
--- /dev/null
@@ -0,0 +1,32 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#include <table.h>
+
+typedef struct manager {
+    table connections;
+    int bound;
+    io_handler bound_handler;
+    int (*handler)(void *, void *);
+    void *handler_arg;
+    unsigned short port;
+} *manager;
+
+
+typedef struct connection {
+    unsigned int ip;
+    unsigned short port;
+    int fd;
+    manager m;
+} *connection;
+
+connection force_tcp_connection(manager m, unsigned int ip, unsigned short port);
+manager init_connections(unsigned short, int (*f)(void *, void *), void *);
+void remove_connection(void *arg);
+void shutdown_connections(manager m);
+int read_connection(connection c, unsigned char *dest, int len);
diff --git a/lnet/ulnds/socklnd/debug.c b/lnet/ulnds/socklnd/debug.c
new file mode 100644 (file)
index 0000000..529bb2d
--- /dev/null
@@ -0,0 +1,119 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <sys/time.h>
+
+int smp_processor_id = 1;
+char debug_file_path[1024] = "/tmp/lustre-log";
+char debug_file_name[1024];
+FILE *debug_file_fd;
+
+int portals_do_debug_dumplog(void *arg)
+{
+        printf("Look in %s\n", debug_file_name);
+        return 0;
+}
+
+
+void portals_debug_print(void)
+{
+        return;
+}
+
+
+void portals_debug_dumplog(void)
+{
+        printf("Look in %s\n", debug_file_name);
+        return;
+}
+
+
+int portals_debug_init(unsigned long bufsize)
+{ 
+        debug_file_fd = stdout;
+        return 0;
+}
+
+int portals_debug_cleanup(void)
+{
+        return 0; //close(portals_debug_fd);
+}
+
+int portals_debug_clear_buffer(void)
+{
+        return 0;
+}
+
+int portals_debug_mark_buffer(char *text)
+{
+
+        fprintf(debug_file_fd, "*******************************************************************************\n");
+        fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text);
+        fprintf(debug_file_fd, "*******************************************************************************\n");
+
+        return 0;
+}
+
+int portals_debug_copy_to_user(char *buf, unsigned long len)
+{
+        return 0;
+}
+
+/* FIXME: I'm not very smart; someone smarter should make this better. */
+void
+portals_debug_msg (int subsys, int mask, char *file, char *fn, int line,
+                   const char *format, ...)
+{
+        va_list       ap;
+        unsigned long flags;
+        struct timeval tv;
+        int nob = 0;
+
+
+        /* NB since we pass a non-zero sized buffer (at least) on the first
+         * print, we can be assured that by the end of all the snprinting,
+         * we _do_ have a terminated buffer, even if our message got truncated.
+         */
+
+        gettimeofday(&tv, NULL);
+
+        nob += fprintf(debug_file_fd,
+                              "%02x:%06x:%d:%lu.%06lu ",
+                              subsys >> 24, mask, smp_processor_id,
+                              tv.tv_sec, tv.tv_usec);
+
+        nob += fprintf(debug_file_fd,
+                            "(%s:%d:%s() %d+%ld): ",
+                            file, line, fn, 0,
+                            8192 - ((unsigned long)&flags & 8191UL));
+
+        va_start (ap, format);
+        nob += vfprintf(debug_file_fd, format, ap);
+        va_end (ap);
+
+
+}
+
diff --git a/lnet/ulnds/socklnd/dispatch.h b/lnet/ulnds/socklnd/dispatch.h
new file mode 100644 (file)
index 0000000..34dd070
--- /dev/null
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* this file is only called dispatch.h to prevent it
+   from colliding with /usr/include/sys/select.h */
+
+typedef struct io_handler *io_handler;
+
+struct io_handler{
+  io_handler *last;
+  io_handler next;
+  int fd;
+  int type;
+  int (*function)(void *);
+  void *argument;
+  int disabled;
+};
+
+
+#define READ_HANDLER 1
+#define WRITE_HANDLER 2
+#define EXCEPTION_HANDLER 4
+#define ALL_HANDLER (READ_HANDLER | WRITE_HANDLER | EXCEPTION_HANDLER)
+
+io_handler register_io_handler(int fd,
+                               int type,
+                               int (*function)(void *),
+                               void *arg);
+
+void remove_io_handler (io_handler i);
+void init_unix_timer(void);
+void select_timer_block(when until);
+when now(void);
diff --git a/lnet/ulnds/socklnd/ipmap.h b/lnet/ulnds/socklnd/ipmap.h
new file mode 100644 (file)
index 0000000..85b1e18
--- /dev/null
@@ -0,0 +1,38 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#define DIRECT_IP_MODE
+#ifdef DIRECT_IP_MODE
+#define PNAL_NID(in_addr, port) (in_addr)
+#define PNAL_PID(pid) (pid)
+#define PNAL_IP(in_addr, port) (in_addr)
+#define PNAL_PORT(nid, pid) (pid)
+#else
+
+#define PNAL_BASE_PORT 4096
+#define PNAL_HOSTID_SHIFT 24
+#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1)
+#define PNAL_VNODE_SHIFT 8
+#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1)
+#define PNAL_PID_SHIFT 8
+#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1)
+
+#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \
+                                    << PNAL_VNODE_SHIFT) \
+                                   | (((ntohs(port)-PNAL_BASE_PORT) >>\
+                                       PNAL_PID_SHIFT)))
+#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT)  & PNAL_PID_MASK)
+
+#define PNAL_IP(nid,t)  (htonl((((unsigned)(nid))\
+                                >> PNAL_VNODE_SHIFT)\
+                               | (t->iptop8 << PNAL_HOSTID_SHIFT)))
+#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \
+                                 << PNAL_VNODE_SHIFT) \
+                                | ((pid) & PNAL_PID_MASK)) \
+                               + PNAL_BASE_PORT))
+#endif
diff --git a/lnet/ulnds/socklnd/pqtimer.c b/lnet/ulnds/socklnd/pqtimer.c
new file mode 100644 (file)
index 0000000..fa2fb4f
--- /dev/null
@@ -0,0 +1,226 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* timer.c:
+ *   this file implements a simple priority-queue based timer system. when
+ * combined with a file which implements now() and block(), it can
+ * be used to provide coarse-grained time-based callbacks.
+ */
+
+#include <pqtimer.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct timer {
+  void (*function)(void *);
+  void *arg;
+  when w;
+  int interval;
+  int disable;
+};
+
+typedef struct thunk *thunk;
+struct thunk {
+    void (*f)(void *);
+    void *a;
+    thunk next;
+};
+
+extern when now(void);
+
+static thunk thunks;
+static int internal;
+static void (*block_function)(when);
+static int number_of_timers;
+static int size_of_pqueue;
+static timer *timers;
+
+
+static void heal(int where)
+{
+    int left=(where<<1);
+    int right=(where<<1)+1;
+    int min=where;
+    timer temp;
+  
+    if (left <= number_of_timers)
+       if (timers[left]->w < timers[min]->w) min=left;
+    if (right <= number_of_timers)
+       if (timers[right]->w < timers[min]->w) min=right;
+    if (min != where){
+       temp=timers[where];
+       timers[where]=timers[min];
+       timers[min]=temp;
+       heal(min);
+    }
+}
+
+static void add_pqueue(int i)
+{
+    timer temp;
+    int parent=(i>>1);
+    if ((i>1) && (timers[i]->w< timers[parent]->w)){
+       temp=timers[i];
+       timers[i]=timers[parent];
+       timers[parent]=temp;
+       add_pqueue(parent);
+    }
+}
+
+static void add_timer(timer t)
+{
+    if (size_of_pqueue<(number_of_timers+2)){
+       int oldsize=size_of_pqueue;
+       timer *new=(void *)malloc(sizeof(struct timer)*(size_of_pqueue+=10));
+       memcpy(new,timers,sizeof(timer)*oldsize);
+       timers=new;
+    }
+    timers[++number_of_timers]=t;
+    add_pqueue(number_of_timers);
+}
+
+/* Function: register_timer
+ * Arguments: interval: the time interval from the current time when
+ *                      the timer function should be called
+ *            function: the function to call when the time has expired
+ *            argument: the argument to call it with.
+ * Returns: a pointer to a timer structure
+ */
+timer register_timer(when interval,
+                    void (*function)(void *),
+                    void *argument)
+{
+    timer t=(timer)malloc(sizeof(struct timer));
+
+    t->arg=argument;
+    t->function=function;
+    t->interval=interval;
+    t->disable=0;
+    t->w=now()+interval;
+    add_timer(t);
+    if (!internal && (number_of_timers==1))
+        block_function(t->w);
+    return(t);
+}
+
+/* Function: remove_timer
+ * Arguments: t: the timer to remove
+ * Returns: nothing
+ *
+ * remove_timer removes a timer from the system, ensuring
+ * that it will never be called. It does not actually
+ * free the timer due to reentrancy issues.
+ */
+
+void remove_timer(timer t)
+{
+    t->disable=1;
+}
+
+
+
+void timer_fire()
+{
+    timer current;
+
+    current=timers[1];
+    timers[1]=timers[number_of_timers--];
+    heal(1);
+    if (!current->disable) {
+        (*current->function)(current->arg);
+    }
+    free(current);
+}
+
+when next_timer(void)
+{
+    when here=now();
+
+    while (number_of_timers && (timers[1]->w <= here)) timer_fire();
+    if (number_of_timers) return(timers[1]->w);
+    return(0);
+}
+
+/* Function: timer_loop
+ * Arguments: none
+ * Returns: never
+ * 
+ * timer_loop() is the blocking dispatch function for the timer.
+ * It calls the block() function registered with initialize_timer(),
+ * and dispatches the handlers associated with timers that have expired.
+ */
+void timer_loop()
+{
+    when here;
+
+    while (1){
+       thunk z;
+       here=now();
+
+       for (z=thunks;z;z=z->next) (*z->f)(z->a);
+
+       if (number_of_timers){
+           if (timers[1]->w > here){
+               (*block_function)(timers[1]->w);
+           } else {
+                timer_fire();
+           }
+       } else {
+           thunk z;
+           for (z=thunks;z;z=z->next) (*z->f)(z->a);
+           (*block_function)(0);
+       }
+    }
+}
+
+
+/* Function: register_thunk
+ * Arguments: f: the function to call
+ *            a: the single argument to call it with
+ *
+ * Thunk functions get called at irregular intervals; they
+ * should not assume when they will run, nor take a particularly long
+ * amount of time. Thunks are for background cleanup tasks.
+ */
+void register_thunk(void (*f)(void *),void *a)
+{
+    thunk t=(void *)malloc(sizeof(struct thunk));
+    t->f=f;
+    t->a=a;
+    t->next=thunks;
+    thunks=t;
+}
+
+/* Function: initialize_timer
+ * Arguments: block: the function to call to block for the specified interval 
+ *
+ * initialize_timer() must be called before any other timer function,
+ * including timer_loop.
+ */
+void initialize_timer(void (*block)(when))
+{
+    block_function=block;
+    number_of_timers=0;
+    size_of_pqueue=10;
+    timers=(timer *)malloc(sizeof(timer)*size_of_pqueue);
+    thunks=0;
+}
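
The pending timers above are kept in a 1-based binary min-heap ordered on
expiry time; a sketch of the index arithmetic used by add_pqueue() and heal():

    /* 1-based heap layout over timers[]:
     *   parent(i) = i >> 1,   left(i) = i << 1,   right(i) = (i << 1) + 1
     * invariant: timers[parent(i)]->w <= timers[i]->w for every i > 1, so
     * timers[1] is always the next timer to expire (popped by timer_fire()
     * and inspected by next_timer()). */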
diff --git a/lnet/ulnds/socklnd/pqtimer.h b/lnet/ulnds/socklnd/pqtimer.h
new file mode 100644 (file)
index 0000000..11efb0e
--- /dev/null
@@ -0,0 +1,25 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+typedef unsigned long long when;
+when now(void);
+typedef struct timer *timer;
+timer register_timer(when interval,
+                    void (*function)(void *),
+                    void *argument);
+timer register_timer_wait(void);
+void remove_timer(timer);
+void timer_loop(void);
+void initialize_timer(void (*block)(when));
+void timer_fire(void);
+
+
+#define HZ 0x100000000ull
+
+
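
A minimal sketch of driving the timer interface above with the select-based
blocker from this patch (init_unix_timer() is declared in dispatch.h and
defined in select.c; the tick() callback is hypothetical):

    static void tick(void *arg)
    {
            /* timers fire once, so a periodic callback re-registers itself */
            register_timer(HZ, tick, arg);
            /* ... periodic work ... */
    }

    int main(void)
    {
            init_unix_timer();            /* installs select_timer_block() */
            register_timer(HZ, tick, 0);  /* HZ is one second in 'when' units */
            timer_loop();                 /* dispatches timers; never returns */
            return 0;
    }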
diff --git a/lnet/ulnds/socklnd/procapi.c b/lnet/ulnds/socklnd/procapi.c
new file mode 100644 (file)
index 0000000..6da3210
--- /dev/null
@@ -0,0 +1,283 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* procapi.c:
+ *  This file provides the 'api' side for the process-based nals.
+ *  it is responsible for creating the 'library' side thread,
+ *  and passing wrapped portals transactions to it.
+ *
+ *  Along with initialization, shutdown, and transport to the library
+ *  side, this file contains some stubs to satisfy the nal definition.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syscall.h>
+#include <procbridge.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <errno.h>
+
+
+/* Function: forward
+ * Arguments: nal_t *nal: pointer to my top-side nal structure
+ *            id: the command to pass to the lower layer
+ *            args, args_len:pointer to and length of the request
+ *            ret, ret_len:  pointer to and size of the result
+ * Returns: a portals status code
+ *
+ * forwards a packaged api call from the 'api' side to the 'library'
+ *   side, and collects the result
+ */
+#define forward_failure(operand,fd,buffer,length)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          lib_fini(b->nal_cb);\
+          return(PTL_SEGV);\
+       }
+static int procbridge_forward(nal_t *n, int id, void *args, ptl_size_t args_len,
+                             void *ret, ptl_size_t ret_len)
+{
+    bridge b=(bridge)n->nal_data;
+    procbridge p=(procbridge)b->local;
+    int lib=p->to_lib[1];
+    int k;
+
+    forward_failure(write,lib, &id, sizeof(id));
+    forward_failure(write,lib,&args_len, sizeof(args_len));
+    forward_failure(write,lib,&ret_len, sizeof(ret_len));
+    forward_failure(write,lib,args, args_len);
+
+    do {
+        k=syscall(SYS_read, p->from_lib[0], ret, ret_len);
+    } while ((k!=ret_len) && (errno == EINTR)); /* retry interrupted reads */
+
+    if(k!=ret_len){
+        perror("nal: read return block");
+        return PTL_SEGV;
+    }
+    return (PTL_OK);
+}
+#undef forward_failure
+
+
+/* Function: shutdown
+ * Arguments: nal: a pointer to my top side nal structure
+ *            ni: my network interface index
+ *
+ * cleanup nal state, reclaim the lower side thread and
+ *   its state using PTL_FINI codepoint
+ */
+static int procbridge_shutdown(nal_t *n, int ni)
+{
+    bridge b=(bridge)n->nal_data;
+    procbridge p=(procbridge)b->local;
+    int code=PTL_FINI;
+
+    syscall(SYS_write, p->to_lib[1],&code,sizeof(code));
+    syscall(SYS_read, p->from_lib[0],&code,sizeof(code));
+
+    syscall(SYS_close, p->to_lib[0]);
+    syscall(SYS_close, p->to_lib[1]);
+    syscall(SYS_close, p->from_lib[0]);
+    syscall(SYS_close, p->from_lib[1]);
+
+    free(p);
+    return(0);
+}
+
+
+/* Function: validate
+ *    useless stub
+ */
+static int procbridge_validate(nal_t *nal, void *base, ptl_size_t extent)
+{
+    return(0);
+}
+
+
+/* Function: yield
+ * Arguments:  n: pointer to my top-side nal structure
+ *
+ *  this function was originally intended to allow the
+ *   lower half thread to be scheduled to allow progress. we
+ *   overload it to explicitly block until signalled by the
+ *   lower half.
+ */
+static void procbridge_yield(nal_t *n)
+{
+    bridge b=(bridge)n->nal_data;
+    procbridge p=(procbridge)b->local;
+
+    pthread_mutex_lock(&p->mutex);
+    pthread_cond_wait(&p->cond,&p->mutex);
+    pthread_mutex_unlock(&p->mutex);
+}
+
+
+static void procbridge_lock(nal_t * nal, unsigned long *flags){}
+static void procbridge_unlock(nal_t * nal, unsigned long *flags){}
+/* api_nal
+ *  the interface vector to allow the generic code to access
+ *  this nal. this is separate from the library side nal_cb.
+ *  TODO: should be dynamically allocated
+ */
+static nal_t api_nal = {
+    ni:       {0},
+    nal_data: NULL,
+    forward:  procbridge_forward,
+    shutdown: procbridge_shutdown,
+    validate: procbridge_validate,
+    yield:    procbridge_yield,
+    lock:     procbridge_lock,
+    unlock:   procbridge_unlock
+};
+
+/* Function: bridge_init
+ *
+ * Arguments:  pid: requested process id (port offset)
+ *                  PTL_ID_ANY not supported.
+ *             desired: limits passed from the application
+ *                      and effectively ignored
+ *             actual:  limits actually allocated and returned
+ *
+ * Returns: a pointer to my statically allocated top side NAL
+ *          structure
+ *
+ * initializes the tcp nal. we define unix_failure as an
+ * error wrapper to cut down clutter.
+ */
+#define unix_failure(operand,fd,buffer,length,text)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          perror(text);\
+          return(NULL);\
+       }
+#if 0
+static nal_t *bridge_init(ptl_interface_t nal,
+                          ptl_pid_t pid_request,
+                          ptl_ni_limits_t *desired,
+                          ptl_ni_limits_t *actual,
+                          int *rc)
+{
+    procbridge p;
+    bridge b;
+    static int initialized=0;
+    ptl_ni_limits_t limits = {-1,-1,-1,-1,-1};
+
+    if(initialized) return (&api_nal);
+
+    init_unix_timer();
+
+    b=(bridge)malloc(sizeof(struct bridge));
+    p=(procbridge)malloc(sizeof(struct procbridge));
+    api_nal.nal_data=b;
+    b->local=p;
+
+    if(pipe(p->to_lib) || pipe(p->from_lib)) {
+        perror("nal_init: pipe");
+        return(NULL);
+    }
+
+    if (desired) limits = *desired;
+    unix_failure(write,p->to_lib[1], &pid_request, sizeof(pid_request),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &nal, sizeof(ptl_interface_t),
+                       "nal_init: write");
+
+    if(pthread_create(&p->t, NULL, nal_thread, b)) {
+        perror("nal_init: pthread_create");
+        return(NULL);
+    }
+
+    unix_failure(read,p->from_lib[0], actual, sizeof(ptl_ni_limits_t),
+                 "tcp_init: read");
+    unix_failure(read,p->from_lib[0], rc, sizeof(rc),
+                 "nal_init: read");
+
+    if(*rc) return(NULL);
+
+    initialized = 1;
+    pthread_mutex_init(&p->mutex,0);
+    pthread_cond_init(&p->cond, 0);
+
+    return (&api_nal);
+}
+#endif
+
+ptl_nid_t tcpnal_mynid;
+
+nal_t *procbridge_interface(int num_interface,
+                            ptl_pt_index_t ptl_size,
+                            ptl_ac_index_t acl_size,
+                            ptl_pid_t requested_pid)
+{
+    procbridge p;
+    bridge b;
+    static int initialized=0;
+    ptl_ni_limits_t limits = {-1,-1,-1,-1,-1};
+    int rc, nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */
+
+    if(initialized) return (&api_nal);
+
+    init_unix_timer();
+
+    b=(bridge)malloc(sizeof(struct bridge));
+    p=(procbridge)malloc(sizeof(struct procbridge));
+    api_nal.nal_data=b;
+    b->local=p;
+
+    if(pipe(p->to_lib) || pipe(p->from_lib)) {
+        perror("nal_init: pipe");
+        return(NULL);
+    }
+
+    if (ptl_size)
+           limits.max_ptable_index = ptl_size;
+    if (acl_size)
+           limits.max_atable_index = acl_size;
+
+    unix_failure(write,p->to_lib[1], &requested_pid, sizeof(requested_pid),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &nal_type, sizeof(nal_type),
+                       "nal_init: write");
+
+    if(pthread_create(&p->t, NULL, nal_thread, b)) {
+        perror("nal_init: pthread_create");
+        return(NULL);
+    }
+
+    unix_failure(read,p->from_lib[0], &rc, sizeof(rc),
+                 "nal_init: read");
+
+    if(rc) return(NULL);
+
+    b->nal_cb->ni.nid = tcpnal_mynid;
+    initialized = 1;
+    pthread_mutex_init(&p->mutex,0);
+    pthread_cond_init(&p->cond, 0);
+
+    return (&api_nal);
+}
+#undef unix_failure
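
The api-to-library traffic above rides on a simple framing over the to_lib and
from_lib pipes; summarized below (this describes the fields written by
procbridge_forward() and read back by data_from_api() in proclib.c, it is not
a struct that exists in the source):

    /* request, written to p->to_lib[1] in this order:
     *     int        id;         dispatch index, or PTL_FINI on shutdown
     *     ptl_size_t args_len;   length of the argument block that follows
     *     ptl_size_t ret_len;    length of the reply the caller expects
     *     char       args[args_len];
     *
     * reply, read back from p->from_lib[0]:
     *     char       ret[ret_len];
     */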
diff --git a/lnet/ulnds/socklnd/procbridge.h b/lnet/ulnds/socklnd/procbridge.h
new file mode 100644 (file)
index 0000000..060ae7b
--- /dev/null
@@ -0,0 +1,40 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#ifndef _PROCBRIDGE_H_
+#define _PROCBRIDGE_H_
+
+#include <pthread.h>
+#include <bridge.h>
+#include <ipmap.h>
+
+
+typedef struct procbridge {
+    pthread_t t;
+    pthread_cond_t cond;
+    pthread_mutex_t mutex;
+    int to_lib[2];
+    int from_lib[2];
+} *procbridge;
+
+extern void *nal_thread(void *);
+
+
+#define PTL_INIT        (LIB_MAX_DISPATCH+1)
+#define PTL_FINI        (LIB_MAX_DISPATCH+2)
+
+#define MAX_ACLS        1
+#define MAX_PTLS        128
+
+extern void set_address(bridge t,ptl_pid_t pidrequest);
+extern nal_t *procbridge_interface(int num_interface,
+                            ptl_pt_index_t ptl_size,
+                            ptl_ac_index_t acl_size,
+                            ptl_pid_t requested_pid);
+
+#endif
diff --git a/lnet/ulnds/socklnd/proclib.c b/lnet/ulnds/socklnd/proclib.c
new file mode 100644 (file)
index 0000000..c3ee103
--- /dev/null
@@ -0,0 +1,270 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* proclib.c:
+ *  This file provides the 'library' side for the process-based nals.
+ *  it is responsible for communication with the 'api' side and
+ *  providing service to the generic portals 'library'
+ *  implementation. 'library' might be better termed 'communication'
+ *  or 'kernel'.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <procbridge.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <errno.h>
+#include <timer.h>
+//#include <util/pqtimer.h>
+#include <dispatch.h>
+
+/* the following functions are stubs to satisfy the nal definition
+   without doing anything particularly useful */
+
+static int nal_write(nal_cb_t *nal,
+                     void *private,
+                     user_ptr dst_addr,
+                     void *src_addr,
+                     ptl_size_t len)
+{
+    memcpy(dst_addr, src_addr, len);
+    return 0;
+}
+
+static int nal_read(nal_cb_t * nal,
+                    void *private,
+                   void *dst_addr,
+                   user_ptr src_addr,
+                   size_t len)
+{
+       memcpy(dst_addr, src_addr, len);
+       return 0;
+}
+
+static void *nal_malloc(nal_cb_t *nal,
+                        ptl_size_t len)
+{
+    void *buf =  malloc(len);
+    return buf;
+}
+
+static void nal_free(nal_cb_t *nal,
+                     void *buf,
+                     ptl_size_t len)
+{
+    free(buf);
+}
+
+static void nal_printf(nal_cb_t *nal,
+                       const char *fmt,
+                       ...)
+{
+    va_list        ap;
+
+    va_start(ap, fmt);
+    vprintf(fmt, ap);
+    va_end(ap);
+}
+
+
+static void nal_cli(nal_cb_t *nal,
+                    unsigned long *flags)
+{
+}
+
+
+static void nal_sti(nal_cb_t *nal,
+                    unsigned long *flags)
+{
+}
+
+
+static int nal_dist(nal_cb_t *nal,
+                    ptl_nid_t nid,
+                    unsigned long *dist)
+{
+    return 0;
+}
+    
+
+
+/* Function:  data_from_api
+ * Arguments: t: the nal state for this interface
+ * Returns: whether to continue reading from the pipe
+ *
+ *   data_from_api() reads data from the api side in response
+ *   to a select.
+ *
+ *   We define data_failure() for syntactic convenience
+ *   of unix error reporting.
+ */
+
+#define data_failure(operand,fd,buffer,length)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          lib_fini(b->nal_cb);\
+          return(0);\
+       }
+static int data_from_api(void *arg)
+{
+        bridge b = arg;
+    procbridge p=(procbridge)b->local;
+    /* where are these two sizes derived from ??*/
+    char arg_block[ 256 ];
+    char ret_block[ 128 ];
+    ptl_size_t arg_len,ret_len;
+    int fd=p->to_lib[0];
+    int index;
+
+    data_failure(read,fd, &index, sizeof(index));
+
+    if (index==PTL_FINI) {
+        lib_fini(b->nal_cb);
+        if (b->shutdown) (*b->shutdown)(b);
+        syscall(SYS_write, p->from_lib[1],&b->alive,sizeof(b->alive));
+
+        /* a heavy-handed but convenient way of shutting down
+           the lower side thread */
+        pthread_exit(0);
+    }
+
+    data_failure(read,fd, &arg_len, sizeof(arg_len));
+    data_failure(read,fd, &ret_len, sizeof(ret_len));
+    data_failure(read,fd, arg_block, arg_len);
+
+    lib_dispatch(b->nal_cb, NULL, index, arg_block, ret_block);
+
+    data_failure(write,p->from_lib[1],ret_block, ret_len);
+    return(1);
+}
+#undef data_failure
+
+
+
+static void wakeup_topside(void *z)
+{
+    bridge b=z;
+    procbridge p=b->local;
+
+    pthread_mutex_lock(&p->mutex);
+    pthread_cond_broadcast(&p->cond);
+    pthread_mutex_unlock(&p->mutex);
+}
+
+
+/* Function:  nal_thread
+ * Arguments: z: an opaque reference to a nal control structure
+ *               allocated and partially populated by the api level code
+ * Returns: nothing; it returns only on error or explicit shutdown
+ *
+ *  This function is the entry point of the pthread initiated on 
+ *  the api side of the interface. This thread is used to handle
+ *  asynchronous delivery to the application.
+ * 
+ *  We define a limit macro to place a ceiling on limits
+ *   for syntactic convenience
+ */
+#define LIMIT(x,y,max)\
+     if ((unsigned int)x > max) y = max;
+
+extern int tcpnal_init(bridge);
+
+nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0};
+
+void *nal_thread(void *z)
+{
+    bridge b=z;
+    procbridge p=b->local;
+    int rc;
+    ptl_pid_t pid_request;
+    int nal_type;
+    ptl_ni_limits_t desired;
+    ptl_ni_limits_t actual;
+    
+    b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t));
+    b->nal_cb->nal_data=b;
+    b->nal_cb->cb_read=nal_read;
+    b->nal_cb->cb_write=nal_write;
+    b->nal_cb->cb_malloc=nal_malloc;
+    b->nal_cb->cb_free=nal_free;
+    b->nal_cb->cb_map=NULL;
+    b->nal_cb->cb_unmap=NULL;
+    b->nal_cb->cb_printf=nal_printf;
+    b->nal_cb->cb_cli=nal_cli;
+    b->nal_cb->cb_sti=nal_sti;
+    b->nal_cb->cb_dist=nal_dist;
+
+
+    register_io_handler(p->to_lib[0],READ_HANDLER,data_from_api,(void *)b);
+
+    if(!(rc = syscall(SYS_read, p->to_lib[0], &pid_request, sizeof(pid_request))))
+        perror("procbridge read from api");
+    if(!(rc = syscall(SYS_read, p->to_lib[0], &desired, sizeof(ptl_ni_limits_t))))
+        perror("procbridge read from api");
+    if(!(rc = syscall(SYS_read, p->to_lib[0], &nal_type, sizeof(nal_type))))
+        perror("procbridge read from api");
+
+    actual = desired;
+    LIMIT(desired.max_match_entries,actual.max_match_entries,MAX_MES);
+    LIMIT(desired.max_mem_descriptors,actual.max_mem_descriptors,MAX_MDS);
+    LIMIT(desired.max_event_queues,actual.max_event_queues,MAX_EQS);
+    LIMIT(desired.max_atable_index,actual.max_atable_index,MAX_ACLS);
+    LIMIT(desired.max_ptable_index,actual.max_ptable_index,MAX_PTLS);
+
+    set_address(b,pid_request);
+
+    if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b);
+    /* initialize the generic 'library' level code */
+
+    rc = lib_init(b->nal_cb, 
+                  b->nal_cb->ni.nid,
+                  b->nal_cb->ni.pid,
+                 10,
+                 actual.max_ptable_index,
+                 actual.max_atable_index);
+
+    /*
+     * Whatever the initialization returned is passed back to the
+     * user level code for further interpretation.  We just exit if
+     * it is non-zero since something went wrong.
+     */
+    /* this should perform error checking */
+#if 0
+    write(p->from_lib[1], &actual, sizeof(ptl_ni_limits_t));
+#endif
+    syscall(SYS_write, p->from_lib[1], &rc, sizeof(rc));
+    
+    if(!rc) {
+        /* the thunk function is called each time the timer loop
+           performs an operation and returns to blocking mode. we
+           overload this function to inform the api side that
+           it may be interested in looking at the event queue */
+        register_thunk(wakeup_topside,b);
+        timer_loop();
+    }
+    return(0);
+}
+#undef LIMIT
+
diff --git a/lnet/ulnds/socklnd/select.c b/lnet/ulnds/socklnd/select.c
new file mode 100644 (file)
index 0000000..c4f84f4
--- /dev/null
@@ -0,0 +1,165 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* select.c:
+ *  Provides a general mechanism for registering and dispatching
+ *  io events through the select system call.
+ */
+
+#ifdef sun
+#include <sys/filio.h>
+#else
+#include <sys/ioctl.h>
+#endif
+
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdlib.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+
+
+static struct timeval beginning_of_epoch;
+static io_handler io_handlers;
+
+/* Function: now
+ *
+ * Return: the current time in canonical units: a 64 bit number
+ *   where the most significant 32 bits contain the number
+ *   of seconds, and the least significant 32 bits a count of (1/(2^32))ths
+ *   of a second.
+ */
+when now()
+{
+    struct timeval result;
+  
+    gettimeofday(&result,0);
+    return((((unsigned long long)result.tv_sec)<<32)|
+           (((unsigned long long)result.tv_usec)<<32)/1000000);
+}
+
+
+/* Function: register_io_handler
+ * Arguments: fd: the file descriptor of interest
+ *            type: a mask of READ_HANDLER, WRITE_HANDLER, EXCEPTION_HANDLER
+ *            function: a function to call when io is available on fd
+ *            arg: an opaque correlator to return to the handler
+ * Returns: a pointer to the io_handler structure
+ */
+io_handler register_io_handler(int fd,
+                               int type,
+                               int (*function)(void *),
+                               void *arg)
+{
+    io_handler i=(io_handler)malloc(sizeof(struct io_handler));
+    if ((i->fd=fd)>=0){
+        i->type=type;
+        i->function=function;
+        i->argument=arg;
+        i->disabled=0;
+        i->last=&io_handlers;
+        if ((i->next=io_handlers)) i->next->last=&i->next;
+        io_handlers=i;
+    }
+    return(i);
+}
+
+/* Function: remove_io_handler
+ * Arguments: i: a pointer to the handler to stop servicing
+ *
+ * remove_io_handler() doesn't actually free the handler, due
+ * to reentrancy problems. it just marks the handler for 
+ * later cleanup by the blocking function.
+ */
+void remove_io_handler (io_handler i)
+{
+    i->disabled=1;
+}
+
+static void set_flag(io_handler n,fd_set *fds)
+{
+    if (n->type & READ_HANDLER) FD_SET(n->fd,fds);
+    if (n->type & WRITE_HANDLER) FD_SET(n->fd,fds+1);
+    if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd,fds+2);
+}
+
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ * 
+ *   This function dispatches the various file descriptors' handler
+ *   functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+    fd_set fds[3];
+    struct timeval timeout;
+    struct timeval *timeout_pointer;
+    int result;
+    io_handler j;
+    io_handler *k;
+
+    /* TODO: loop until the entire interval is expired*/
+    if (until){
+       when interval=until-now();
+        timeout.tv_sec=(interval>>32);
+        timeout.tv_usec=((interval & 0xffffffffull)*1000000)>>32;
+        timeout_pointer=&timeout;
+    } else timeout_pointer=0;
+
+    FD_ZERO(fds);
+    FD_ZERO(fds+1);
+    FD_ZERO(fds+2);
+    for (k=&io_handlers;*k;){
+        if ((*k)->disabled){
+            j=*k;
+            *k=(*k)->next;
+            free(j);
+        }
+        if (*k) {
+           set_flag(*k,fds);
+           k=&(*k)->next;
+       }
+    }
+    result=select(FD_SETSIZE,fds,fds+1,fds+2,timeout_pointer);
+
+    if (result > 0)
+        for (j=io_handlers;j;j=j->next){
+            if (!(j->disabled) && 
+                ((FD_ISSET(j->fd,fds) && (j->type & READ_HANDLER)) ||
+                 (FD_ISSET(j->fd,fds+1) && (j->type & WRITE_HANDLER)) ||
+                 (FD_ISSET(j->fd,fds+2) && (j->type & EXCEPTION_HANDLER)))){
+                if (!(*j->function)(j->argument))
+                    j->disabled=1;
+            }
+        }
+}
+
+/* Function: init_unix_timer()
+ *   is called to initialize the library 
+ */
+void init_unix_timer()
+{
+    io_handlers=0;
+    gettimeofday(&beginning_of_epoch, 0);
+    initialize_timer(select_timer_block);
+}
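
now() above packs time as 32.32 fixed point: seconds in the high 32 bits and
1/(2^32)-second ticks in the low 32 bits. A small sketch of converting between
that representation and struct timeval (helper names are illustrative):

    #include <sys/time.h>

    typedef unsigned long long when;

    static when timeval_to_when(struct timeval tv)   /* what now() computes */
    {
            return (((when)tv.tv_sec) << 32) |
                   ((((when)tv.tv_usec) << 32) / 1000000);
    }

    static struct timeval when_to_timeval(when w)    /* the reverse mapping */
    {
            struct timeval tv;

            tv.tv_sec  = w >> 32;
            tv.tv_usec = ((w & 0xffffffffull) * 1000000) >> 32;
            return tv;
    }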
diff --git a/lnet/ulnds/socklnd/table.c b/lnet/ulnds/socklnd/table.c
new file mode 100644 (file)
index 0000000..bef13c5
--- /dev/null
@@ -0,0 +1,264 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <table.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+/* table.c:
+ * a very simple hash table implementation with parameterizable
+ * comparison and key generation functions. it does resize
+ * in order to accommodate more entries, but never collapses
+ * the table 
+ */
+
+static table_entry *table_lookup (table t,void *comparator,
+                                  unsigned int k,
+                                  int (*compare_function)(void *, void *),
+                                  int *success)
+{
+    unsigned int key=k%t->size;
+    table_entry *i;
+
+    for (i=&(t->entries[key]);*i;i=&((*i)->next)){
+        if (compare_function && ((*i)->key==k))
+            if ((*t->compare_function)((*i)->value,comparator)){
+                *success=1;
+                return(i);
+            }
+    }
+    *success=0;
+    return(&(t->entries[key]));
+}
+
+
+static void resize_table(table t, int size)
+{
+    int old_size=t->size;
+    table_entry *old_entries=t->entries;
+    int i; 
+    table_entry j,n;
+    table_entry *position;
+    int success;
+  
+    t->size=size;
+    t->entries=(table_entry *)malloc(sizeof(table_entry)*t->size);
+    memset(t->entries,0,sizeof(table_entry)*t->size);
+
+    for (i=0;i<old_size;i++)
+        for (j=old_entries[i];j;j=n){
+            n=j->next;
+            position=table_lookup(t,0,j->key,0,&success);
+            j->next= *position;
+            *position=j;
+        }
+    free(old_entries);
+}
+
+
+/* Function: key_from_int
+ * Arguments: int i: value to compute the key of
+ * Returns: the key 
+ */
+unsigned int key_from_int(int i)
+{
+    return(i);
+}
+
+
+/* Function: key_from_string
+ * Arguments: char *s: the null terminated string
+ *                     to compute the key of
+ * Returns: the key 
+ */
+unsigned int key_from_string(char *s)
+{
+    unsigned int result=0;
+    unsigned char *n;
+    int i;
+    if (!s) return(1);
+    for (n=s,i=0;*n;n++,i++) result^=(*n*57)^*n*i;
+    return(result);
+}
+
+
+/* Function: hash_create_table
+ * Arguments: compare_function: a function to compare
+ *                              a table instance with a correlator
+ *            key_function: a function to generate a 32 bit 
+ *                          hash key from a correlator
+ * Returns: a pointer to the new table
+ */
+table hash_create_table (int (*compare_function)(void *, void *),
+                    unsigned int (*key_function)(unsigned int *))
+{
+    table new=(table)malloc(sizeof(struct table));
+    memset(new, 0, sizeof(struct table));
+
+    new->compare_function=compare_function;
+    new->key_function=key_function;
+    new->number_of_entries=0;
+    new->size=4;
+    new->entries=(table_entry *)malloc(sizeof(table_entry)*new->size);
+    memset(new->entries,0,sizeof(table_entry)*new->size);
+    return(new);
+}
+
+
+/* Function: hash_table_find
+ * Arguments: t: a table to look in
+ *            comparator: a value to access the table entry
+ * Returns: the element referred to by comparator, or null
+ */
+void *hash_table_find (table t, void *comparator)
+{
+    int success;
+    table_entry* entry=table_lookup(t,comparator,
+                                    (*t->key_function)(comparator),
+                                    t->compare_function,
+                                    &success);
+    if (success)  return((*entry)->value);
+    return(0);
+}
+
+
+/* Function: hash_table_insert
+ * Arguments: t: a table to insert the object
+ *            value: the object to put in the table
+ *            comparator: the value by which the object 
+ *                        will be addressed
+ * Returns: nothing
+ */
+void hash_table_insert (table t, void *value, void *comparator)
+{
+    int success;
+    unsigned int k=(*t->key_function)(comparator);
+    table_entry *position=table_lookup(t,comparator,k,
+                                       t->compare_function,&success);
+    table_entry entry;
+
+    if (success) {
+        entry = *position;
+    } else {
+        entry = (table_entry)malloc(sizeof(struct table_entry));
+        memset(entry, 0, sizeof(struct table_entry));
+        entry->next= *position;
+        *position=entry;
+        t->number_of_entries++;
+    }
+    entry->value=value;
+    entry->key=k;
+    if (t->number_of_entries > t->size) resize_table(t,t->size*2);
+}
+
+/* Function: hash_table_remove
+ * Arguments: t: the table to remove the object from
+ *            comparator: the index value of the object to remove
+ * Returns: nothing
+ */
+void hash_table_remove (table t, void *comparator)
+{
+    int success;
+    table_entry temp;
+    table_entry *position=table_lookup(t,comparator,
+                                       (*t->key_function)(comparator),
+                                       t->compare_function,&success);
+    if(success) {
+        temp=*position;
+        *position=(*position)->next;
+        free(temp); /* the value? */
+        t->number_of_entries--;
+    }
+}
+
+/* Function: hash_iterate_table_entries
+ * Arguments: t: the table to iterate over
+ *            handler: a function to call with each element
+ *                     of the table, along with arg
+ *            arg: the opaque object to pass to handler
+ * Returns: nothing
+ */
+void hash_iterate_table_entries(table t,
+                           void (*handler)(void *,void *), 
+                           void *arg)
+{
+    int i;
+    table_entry *j,*next;
+  
+    for (i=0;i<t->size;i++)
+        for (j=t->entries+i;*j;j=next){
+            next=&((*j)->next);
+            (*handler)(arg,(*j)->value);
+        }
+}
+
+/* Function: hash_filter_table_entries
+ * Arguments: t: the table to iterate over
+ *            handler: a function to call with each element
+ *                     of the table, along with arg
+ *            arg: the opaque object to pass to handler
+ * Returns: nothing
+ * Notes: operations on the table inside handler are not safe
+ *
+ * hash_filter_table_entries() calls the handler function for each
+ *   item in the table, passing it and arg. The handler function
+ *   returns 1 if it is to be retained in the table, and 0
+ *   if it is to be removed.
+ */
+void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg)
+{
+    int i;
+    table_entry *j,*next,v;
+  
+    for (i=0;i<t->size;i++)
+        for (j=t->entries+i;*j;j=next){
+            next=&((*j)->next);
+            if (!(*handler)(arg,(*j)->value)){
+                next=j;
+                v=*j;
+                *j=(*j)->next;
+                free(v);
+                t->number_of_entries--;
+            }
+        }
+}
+
+/* Function: hash_destroy_table
+ * Arguments: t: the table to free
+ *            thunk: a function to call with each element,
+ *                   most likely free()
+ * Returns: nothing
+ */
+void hash_destroy_table(table t,void (*thunk)(void *))
+{
+    table_entry j,next;
+    int i;
+    for (i=0;i<t->size;i++)
+        for (j=t->entries[i];j;j=next){
+            next=j->next;
+            if (thunk) (*thunk)(j->value);
+            free(j);
+        }
+    free(t->entries);
+    free(t);
+}
diff --git a/lnet/ulnds/socklnd/table.h b/lnet/ulnds/socklnd/table.h
new file mode 100644 (file)
index 0000000..7fab586
--- /dev/null
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#ifndef E_TABLE
+#define E_TABLE
+
+typedef struct table_entry {
+  unsigned int key;
+  void *value;
+  struct table_entry *next;
+} *table_entry;
+
+
+typedef struct table {
+  unsigned int size;
+  int number_of_entries;
+  table_entry *entries;
+  int (*compare_function)(void *, void *);
+  unsigned int (*key_function)(unsigned int *);
+} *table;
+
+/* table.c */
+unsigned int key_from_int(int i);
+unsigned int key_from_string(char *s);
+table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *));
+void *hash_table_find(table t, void *comparator);
+void hash_table_insert(table t, void *value, void *comparator);
+void hash_table_remove(table t, void *comparator);
+void hash_iterate_table_entries(table t, void (*handler)(void *, void *), void *arg);
+void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg);
+void hash_destroy_table(table t, void (*thunk)(void *));
+
+#endif
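
A short usage sketch of the table interface above, keyed by an integer id; the
peer struct, compare_id() and id_key() are hypothetical stand-ins for a real
comparator and key function:

    #include <stdlib.h>
    #include <table.h>

    struct peer { unsigned int id; /* ... */ };

    static int compare_id(void *value, void *comparator)
    {
            return ((struct peer *)value)->id == *(unsigned int *)comparator;
    }

    static unsigned int id_key(unsigned int *comparator)
    {
            return key_from_int(*comparator);
    }

    void example(void)
    {
            table t = hash_create_table(compare_id, id_key);
            unsigned int id = 42;
            struct peer *p = malloc(sizeof(struct peer));

            p->id = id;
            hash_table_insert(t, p, &id);          /* file p under key 42 */
            if (hash_table_find(t, &id) == p) {
                    hash_table_remove(t, &id);     /* drops the entry only */
                    free(p);                       /* values are caller-owned */
            }
            hash_destroy_table(t, free);           /* free remaining values */
    }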
diff --git a/lnet/ulnds/socklnd/tcplnd.c b/lnet/ulnds/socklnd/tcplnd.c
new file mode 100644 (file)
index 0000000..534fc17
--- /dev/null
@@ -0,0 +1,198 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* tcplnd.c:
+   This file implements the TCP-based nal by providing glue
+   between the connection service and the generic NAL implementation */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <bridge.h>
+#include <ipmap.h>
+#include <connection.h>
+
+/* Function:  tcpnal_send
+ * Arguments: nal:     pointer to my nal control block
+ *            private: unused
+ *            cookie:  passed back to the portals library
+ *            hdr:     pointer to the portals header
+ *            nid:     destination node
+ *            pid:     destination process
+ *            data:    body of the message
+ *            len:     length of the body
+ * Returns: zero on success
+ *
+ * sends a packet to the peer, after ensuring that a connection exists
+ */
+#warning FIXME: "param 'type' is newly added, make use of it!!"
+int tcpnal_send(nal_cb_t *n,
+               void *private,
+               lib_msg_t *cookie,
+               ptl_hdr_t *hdr,
+               int type,
+               ptl_nid_t nid,
+               ptl_pid_t pid,
+                unsigned int niov,
+                struct iovec *iov,
+               size_t len)
+{
+    connection c;
+    bridge b=(bridge)n->nal_data;
+    struct iovec tiov[2];
+    int count = 1;
+
+    if (!(c=force_tcp_connection((manager)b->lower,
+                                 PNAL_IP(nid,b),
+                                 PNAL_PORT(nid,pid)))) 
+        return(1);
+
+#if 0
+    /* TODO: these results should be checked. furthermore, provision
+       must be made for the SIGPIPE which is delivered when
+       writing on a tcp socket which has closed underneath
+       the application. there is a linux flag in the sendmsg
+       call which turns off the signalling behaviour, but it's
+       nonstandard */
+    syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t));
+    LASSERT (niov <= 1);
+    if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len);
+#else
+    LASSERT (niov <= 1);
+
+    tiov[0].iov_base = hdr;
+    tiov[0].iov_len = sizeof(ptl_hdr_t);
+
+    if (len) {
+            tiov[1].iov_base = iov[0].iov_base;
+            tiov[1].iov_len = len;
+            count++;
+    }
+
+    syscall(SYS_writev, c->fd, tiov, count);
+#endif
+    lib_finalize(n, private, cookie);
+        
+    return(0);
+}
+
+
+/* Function:  tcpnal_recv
+ * Arguments: nal_cb_t *nal:     pointer to my nal control block
+ *            void *private:     connection pointer passed through
+ *                               lib_parse()
+ *            lib_msg_t *cookie: passed back to portals library
+ *            user_ptr data:     pointer to the destination buffer
+ *            size_t mlen:       length of the body
+ *            size_t rlen:       length of data in the network
+ * Returns: zero on success
+ *
+ * blocking read of the requested data. must drain out the
+ * difference between the manipulated and requested lengths from the network
+ */
+int tcpnal_recv(nal_cb_t *n,
+               void *private,
+               lib_msg_t *cookie,
+                unsigned int niov,
+                struct iovec *iov,
+               ptl_size_t mlen,
+               ptl_size_t rlen)
+
+{
+    if (mlen) {
+        LASSERT (niov <= 1);
+        read_connection(private,iov[0].iov_base,mlen);
+        lib_finalize(n, private, cookie);
+    }
+
+    if (mlen!=rlen){
+        char *trash=malloc(rlen-mlen);
+        
+        /*TODO: check error status*/
+        read_connection(private,trash,rlen-mlen);
+        free(trash);
+    }
+
+    return(rlen);
+}
+
+
+/* Function:  from_connection: 
+ * Arguments: c: the connection to read from 
+ * Returns: whether or not to continue reading from this connection,
+ *          expressed as a 1 to continue, and a 0 to not
+ *
+ *  from_connection() is called from the select loop when i/o is 
+ *  available. It attempts to read the portals header and 
+ *  pass it to the generic library for processing.
+ */
+static int from_connection(void *a, void *d)
+{
+        connection c = d;
+        bridge b=a;
+        ptl_hdr_t hdr;
+
+        if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){
+                lib_parse(b->nal_cb, &hdr, c);
+                return(1);
+        }
+        return(0);
+}
+
+
+static void tcpnal_shutdown(bridge b)
+{
+    shutdown_connections(b->lower);
+}
+
+/* Function:  tcpnal_init
+ * Arguments: b: the nal bridge to attach this TCP transport to
+ * Returns: PTL_OK on success, or PTL_NAL_FAILED if the connection
+ *          manager could not be initialized
+ */
+int tcpnal_init(bridge b)
+{
+    manager m;
+        
+    b->nal_cb->cb_send=tcpnal_send;
+    b->nal_cb->cb_recv=tcpnal_recv;
+    b->shutdown=tcpnal_shutdown;
+    
+    if (!(m=init_connections(PNAL_PORT(b->nal_cb->ni.nid,
+                                       b->nal_cb->ni.pid),
+                             from_connection,b))){
+        /* TODO: this needs to shut down the
+           newly created junk */
+        return(PTL_NAL_FAILED);
+    }
+    /* XXX cfs hack */
+    b->nal_cb->ni.pid=0;
+    b->lower=m;
+    return(PTL_OK);
+}
diff --git a/lnet/ulnds/socklnd/timer.h b/lnet/ulnds/socklnd/timer.h
new file mode 100644 (file)
index 0000000..aaf39d2
--- /dev/null
@@ -0,0 +1,30 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* TODO: make this an explicit type when they become available */
+typedef unsigned long long when;
+
+typedef struct timer {
+  void (*function)(void *);
+  void *arg;
+  when w;
+  int interval;
+  int disable;
+} *timer;
+
+timer register_timer(when, void (*f)(void *), void *a);
+void remove_timer(timer t);
+void timer_loop(void);
+void initialize_timer(void);
+void register_thunk(void (*f)(void *),void *a);
+
+
+#define HZ 0x100000000ull
+
+
diff --git a/lnet/ulnds/socklnd/utypes.h b/lnet/ulnds/socklnd/utypes.h
new file mode 100644 (file)
index 0000000..7eca959
--- /dev/null
@@ -0,0 +1,12 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+typedef unsigned short uint16;
+typedef unsigned long uint32;
+typedef unsigned long long uint64;
+typedef unsigned char uint8;
diff --git a/lnet/ulnds/table.c b/lnet/ulnds/table.c
new file mode 100644 (file)
index 0000000..bef13c5
--- /dev/null
@@ -0,0 +1,264 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <table.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+/* table.c:
+ * a very simple hash table implementation with parameterizable
+ * comparison and key generation functions. it does resize
+ * in order to accommodate more entries, but never collapses
+ * the table 
+ */
+
+static table_entry *table_lookup (table t,void *comparator,
+                                  unsigned int k,
+                                  int (*compare_function)(void *, void *),
+                                  int *success)
+{
+    unsigned int key=k%t->size;
+    table_entry *i;
+
+    for (i=&(t->entries[key]);*i;i=&((*i)->next)){
+        if (compare_function && ((*i)->key==k))
+            if ((*t->compare_function)((*i)->value,comparator)){
+                *success=1;
+                return(i);
+            }
+    }
+    *success=0;
+    return(&(t->entries[key]));
+}
+
+
+static void resize_table(table t, int size)
+{
+    int old_size=t->size;
+    table_entry *old_entries=t->entries;
+    int i; 
+    table_entry j,n;
+    table_entry *position;
+    int success;
+  
+    t->size=size;
+    t->entries=(table_entry *)malloc(sizeof(table_entry)*t->size);
+    memset(t->entries,0,sizeof(table_entry)*t->size);
+
+    for (i=0;i<old_size;i++)
+        for (j=old_entries[i];j;j=n){
+            n=j->next;
+            position=table_lookup(t,0,j->key,0,&success);
+            j->next= *position;
+            *position=j;
+        }
+    free(old_entries);
+}
+
+
+/* Function: key_from_int
+ * Arguments: int i: value to compute the key of
+ * Returns: the key 
+ */
+unsigned int key_from_int(int i)
+{
+    return(i);
+}
+
+
+/* Function: key_from_string
+ * Arguments: char *s: the null terminated string
+ *                     to compute the key of
+ * Returns: the key 
+ */
+unsigned int key_from_string(char *s)
+{
+    unsigned int result=0;
+    unsigned char *n;
+    int i;
+    if (!s) return(1);
+    for (n=s,i=0;*n;n++,i++) result^=(*n*57)^*n*i;
+    return(result);
+}
+
+
+/* Function: hash_create_table
+ * Arguments: compare_function: a function to compare
+ *                              a table instance with a correlator
+ *            key_function: a function to generate a 32 bit 
+ *                          hash key from a correlator
+ * Returns: a pointer to the new table
+ */
+table hash_create_table (int (*compare_function)(void *, void *),
+                    unsigned int (*key_function)(unsigned int *))
+{
+    table new=(table)malloc(sizeof(struct table));
+    memset(new, 0, sizeof(struct table));
+
+    new->compare_function=compare_function;
+    new->key_function=key_function;
+    new->number_of_entries=0;
+    new->size=4;
+    new->entries=(table_entry *)malloc(sizeof(table_entry)*new->size);
+    memset(new->entries,0,sizeof(table_entry)*new->size);
+    return(new);
+}
+
+
+/* Function: hash_table_find
+ * Arguments: t: a table to look in
+ *            comparator: a value to access the table entry
+ * Returns: the element referred to by comparator, or null
+ */
+void *hash_table_find (table t, void *comparator)
+{
+    int success;
+    table_entry* entry=table_lookup(t,comparator,
+                                    (*t->key_function)(comparator),
+                                    t->compare_function,
+                                    &success);
+    if (success)  return((*entry)->value);
+    return(0);
+}
+
+
+/* Function: hash_table_insert
+ * Arguments: t: a table to insert the object
+ *            value: the object to put in the table
+ *            comparator: the value by which the object 
+ *                        will be addressed
+ * Returns: nothing
+ */
+void hash_table_insert (table t, void *value, void *comparator)
+{
+    int success;
+    unsigned int k=(*t->key_function)(comparator);
+    table_entry *position=table_lookup(t,comparator,k,
+                                       t->compare_function,&success);
+    table_entry entry;
+
+    if (success) {
+        entry = *position;
+    } else {
+        entry = (table_entry)malloc(sizeof(struct table_entry));
+        memset(entry, 0, sizeof(struct table_entry));
+        entry->next= *position;
+        *position=entry;
+        t->number_of_entries++;
+    }
+    entry->value=value;
+    entry->key=k;
+    if (t->number_of_entries > t->size) resize_table(t,t->size*2);
+}
+
+/* Function: hash_table_remove
+ * Arguments: t: the table to remove the object from
+ *            comparator: the index value of the object to remove
+ * Returns: nothing
+ */
+void hash_table_remove (table t, void *comparator)
+{
+    int success;
+    table_entry temp;
+    table_entry *position=table_lookup(t,comparator,
+                                       (*t->key_function)(comparator),
+                                       t->compare_function,&success);
+    if(success) {
+        temp=*position;
+        *position=(*position)->next;
+        free(temp); /* the value? */
+        t->number_of_entries--;
+    }
+}
+
+/* Function: hash_iterate_table_entries
+ * Arguments: t: the table to iterate over
+ *            handler: a function to call with each element
+ *                     of the table, along with arg
+ *            arg: the opaque object to pass to handler
+ * Returns: nothing
+ */
+void hash_iterate_table_entries(table t,
+                           void (*handler)(void *,void *), 
+                           void *arg)
+{
+    int i;
+    table_entry *j,*next;
+  
+    for (i=0;i<t->size;i++)
+        for (j=t->entries+i;*j;j=next){
+            next=&((*j)->next);
+            (*handler)(arg,(*j)->value);
+        }
+}
+
+/* Function: hash_filter_table_entries
+ * Arguments: t: the table to iterate over
+ *            handler: a function to call with each element
+ *                     of the table, along with arg
+ *            arg: the opaque object to pass to handler
+ * Returns: nothing
+ * Notes: operations on the table inside handler are not safe
+ *
+ * hash_filter_table_entries() calls the handler function for each
+ *   item in the table, passing it and arg. The handler function
+ *   returns 1 if it is to be retained in the table, and 0
+ *   if it is to be removed.
+ */
+void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg)
+{
+    int i;
+    table_entry *j,*next,v;
+  
+    for (i=0;i<t->size;i++)
+        for (j=t->entries+i;*j;j=next){
+            next=&((*j)->next);
+            if (!(*handler)(arg,(*j)->value)){
+                next=j;
+                v=*j;
+                *j=(*j)->next;
+                free(v);
+                t->number_of_entries--;
+            }
+        }
+}
+
+/* Function: hash_destroy_table
+ * Arguments: t: the table to free
+ *            thunk: a function to call with each element,
+ *                   most likely free()
+ * Returns: nothing
+ */
+void hash_destroy_table(table t,void (*thunk)(void *))
+{
+    table_entry j,next;
+    int i;
+    for (i=0;i<t->size;i++)
+        for (j=t->entries[i];j;j=next){
+            next=j->next;
+            if (thunk) (*thunk)(j->value);
+            free(j);
+        }
+    free(t->entries);
+    free(t);
+}
diff --git a/lnet/ulnds/table.h b/lnet/ulnds/table.h
new file mode 100644 (file)
index 0000000..7fab586
--- /dev/null
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#ifndef E_TABLE
+#define E_TABLE
+
+typedef struct table_entry {
+  unsigned int key;
+  void *value;
+  struct table_entry *next;
+} *table_entry;
+
+
+typedef struct table {
+  unsigned int size;
+  int number_of_entries;
+  table_entry *entries;
+  int (*compare_function)(void *, void *);
+  unsigned int (*key_function)(unsigned int *);
+} *table;
+
+/* table.c */
+unsigned int key_from_int(int i);
+unsigned int key_from_string(char *s);
+table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *));
+void *hash_table_find(table t, void *comparator);
+void hash_table_insert(table t, void *value, void *comparator);
+void hash_table_remove(table t, void *comparator);
+void hash_iterate_table_entries(table t, void (*handler)(void *, void *), void *arg);
+void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg);
+void hash_destroy_table(table t, void (*thunk)(void *));
+
+#endif
diff --git a/lnet/ulnds/tcplnd.c b/lnet/ulnds/tcplnd.c
new file mode 100644 (file)
index 0000000..534fc17
--- /dev/null
@@ -0,0 +1,198 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* tcplnd.c:
+   This file implements the TCP-based nal by providing glue
+   between the connection service and the generic NAL implementation */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <bridge.h>
+#include <ipmap.h>
+#include <connection.h>
+
+/* Function:  tcpnal_send
+ * Arguments: nal:     pointer to my nal control block
+ *            private: unused
+ *            cookie:  passed back to the portals library
+ *            hdr:     pointer to the portals header
+ *            nid:     destination node
+ *            pid:     destination process
+ *            data:    body of the message
+ *            len:     length of the body
+ * Returns: zero on success
+ *
+ * sends a packet to the peer, after ensuring that a connection exists
+ */
+#warning FIXME: "param 'type' is newly added, make use of it!!"
+int tcpnal_send(nal_cb_t *n,
+               void *private,
+               lib_msg_t *cookie,
+               ptl_hdr_t *hdr,
+               int type,
+               ptl_nid_t nid,
+               ptl_pid_t pid,
+                unsigned int niov,
+                struct iovec *iov,
+               size_t len)
+{
+    connection c;
+    bridge b=(bridge)n->nal_data;
+    struct iovec tiov[2];
+    int count = 1;
+
+    if (!(c=force_tcp_connection((manager)b->lower,
+                                 PNAL_IP(nid,b),
+                                 PNAL_PORT(nid,pid)))) 
+        return(1);
+
+#if 0
+    /* TODO: these results should be checked. furthermore, provision
+       must be made for the SIGPIPE which is delivered when
+       writing on a tcp socket which has closed underneath
+       the application. there is a linux flag in the sendmsg
+       call (MSG_NOSIGNAL) which turns off the signalling
+       behaviour, but it's nonstandard */
+    syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t));
+    LASSERT (niov <= 1);
+    if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len);
+#else
+    LASSERT (niov <= 1);
+
+    tiov[0].iov_base = hdr;
+    tiov[0].iov_len = sizeof(ptl_hdr_t);
+
+    if (len) {
+            tiov[1].iov_base = iov[0].iov_base;
+            tiov[1].iov_len = len;
+            count++;
+    }
+
+    syscall(SYS_writev, c->fd, tiov, count);
+#endif
+    lib_finalize(n, private, cookie);
+        
+    return(0);
+}
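
One possible shape for the result checking that the TODO above asks for (a
sketch only, not part of the patch): loop on sendmsg(), which reports short
writes and accepts the Linux-specific MSG_NOSIGNAL flag the comment alludes to,
so a peer that closes the socket yields an error return instead of SIGPIPE.
tcpnal_send() could then call something like this in place of the unchecked
syscall(SYS_writev, ...) and propagate the failure; note that the sketch
advances the caller's iovec array in place.

    #include <errno.h>
    #include <string.h>
    #include <sys/types.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    /* send everything described by iov[0..count-1], retrying short writes;
     * returns 0 on success or -errno on failure */
    static int checked_send(int fd, struct iovec *iov, int count)
    {
            struct msghdr msg;

            memset(&msg, 0, sizeof(msg));
            msg.msg_iov = iov;
            msg.msg_iovlen = count;

            while (msg.msg_iovlen > 0) {
                    ssize_t rc = sendmsg(fd, &msg, MSG_NOSIGNAL);

                    if (rc < 0) {
                            if (errno == EINTR)
                                    continue;
                            return -errno;
                    }
                    /* drop the iovec entries that were sent completely... */
                    while (rc > 0 && (size_t)rc >= msg.msg_iov->iov_len) {
                            rc -= msg.msg_iov->iov_len;
                            msg.msg_iov++;
                            msg.msg_iovlen--;
                    }
                    /* ...and advance part way into the next one */
                    if (rc > 0) {
                            msg.msg_iov->iov_base =
                                    (char *)msg.msg_iov->iov_base + rc;
                            msg.msg_iov->iov_len -= rc;
                    }
            }
            return 0;
    }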
+
+
+/* Function:  tcpnal_recv
+ * Arguments: nal_cb_t *nal:     pointer to my nal control block
+ *            void *private:     connection pointer passed through
+ *                               lib_parse()
+ *            lib_msg_t *cookie: passed back to portals library
+ *            unsigned int niov: number of entries in the destination iovec
+ *            struct iovec *iov: destination buffer(s) for the body
+ *            ptl_size_t mlen:   length of the body to deliver
+ *            ptl_size_t rlen:   length of the data on the network
+ * Returns: the on-the-wire length (rlen)
+ *
+ * blocking read of the requested data. must drain the difference
+ * between the delivered (mlen) and on-the-wire (rlen) lengths from
+ * the network
+ */
+int tcpnal_recv(nal_cb_t *n,
+               void *private,
+               lib_msg_t *cookie,
+                unsigned int niov,
+                struct iovec *iov,
+               ptl_size_t mlen,
+               ptl_size_t rlen)
+
+{
+    if (mlen) {
+        LASSERT (niov <= 1);
+        read_connection(private,iov[0].iov_base,mlen);
+        lib_finalize(n, private, cookie);
+    }
+
+    if (mlen!=rlen){
+        char *trash=malloc(rlen-mlen);
+        
+        /*TODO: check error status*/
+        read_connection(private,trash,rlen-mlen);
+        free(trash);
+    }
+
+    return(rlen);
+}
+
+
+/* Function:  from_connection
+ * Arguments: a: the bridge
+ *            d: the connection to read from
+ * Returns: 1 to continue reading from this connection, 0 to stop
+ *
+ *  from_connection() is called from the select loop when i/o is
+ *  available. It attempts to read the portals header and
+ *  pass it to the generic library for processing.
+ */
+static int from_connection(void *a, void *d)
+{
+        connection c = d;
+        bridge b=a;
+        ptl_hdr_t hdr;
+
+        if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){
+                lib_parse(b->nal_cb, &hdr, c);
+                return(1);
+        }
+        return(0);
+}
+
+
+static void tcpnal_shutdown(bridge b)
+{
+    shutdown_connections(b->lower);
+}
+
+/* Function:  tcpnal_init
+ * Arguments: b: the bridge to initialize
+ * Returns: PTL_OK on success, or PTL_NAL_FAILED if the connection
+ *          manager cannot be started
+ */
+int tcpnal_init(bridge b)
+{
+    manager m;
+        
+    b->nal_cb->cb_send=tcpnal_send;
+    b->nal_cb->cb_recv=tcpnal_recv;
+    b->shutdown=tcpnal_shutdown;
+    
+    if (!(m=init_connections(PNAL_PORT(b->nal_cb->ni.nid,
+                                       b->nal_cb->ni.pid),
+                             from_connection,b))){
+        /* TODO: this needs to shut down the
+           newly created junk */
+        return(PTL_NAL_FAILED);
+    }
+    /* XXX cfs hack */
+    b->nal_cb->ni.pid=0;
+    b->lower=m;
+    return(PTL_OK);
+}
diff --git a/lnet/ulnds/timer.h b/lnet/ulnds/timer.h
new file mode 100644 (file)
index 0000000..aaf39d2
--- /dev/null
@@ -0,0 +1,30 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* TODO: make this an explicit type when they become available */
+typedef unsigned long long when;
+
+typedef struct timer {
+  void (*function)(void *);
+  void *arg;
+  when w;
+  int interval;
+  int disable;
+} *timer;
+
+timer register_timer(when, void (*f)(void *), void *a);
+void remove_timer(timer t);
+void timer_loop(void);
+void initialize_timer(void);
+void register_thunk(void (*f)(void *),void *a);
+
+
+#define HZ 0x100000000ull
+
+
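
A minimal sketch of how this interface might be driven (not part of the patch).
It assumes the when argument to register_timer() is a delay expressed in the HZ
units defined above (2^32 ticks per second) and that timer_loop() runs the
dispatch loop without returning; the actual conventions live in timer.c, which
is not shown here.

    #include <stdio.h>
    #include "timer.h"

    static void tick(void *arg)
    {
            printf("timer fired: %s\n", (const char *)arg);
    }

    int main(void)
    {
            initialize_timer();
            /* one second in the units of HZ above (assumed relative delay) */
            register_timer(1 * HZ, tick, "hello");
            timer_loop();           /* assumed to run the event loop forever */
            return 0;
    }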
diff --git a/lnet/ulnds/utypes.h b/lnet/ulnds/utypes.h
new file mode 100644 (file)
index 0000000..7eca959
--- /dev/null
@@ -0,0 +1,12 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+typedef unsigned short uint16;
+typedef unsigned int uint32;
+typedef unsigned long long uint64;
+typedef unsigned char uint8;
diff --git a/lnet/utils/.cvsignore b/lnet/utils/.cvsignore
new file mode 100644 (file)
index 0000000..148310a
--- /dev/null
@@ -0,0 +1,8 @@
+Makefile
+Makefile.in
+acceptor
+debugctl
+ptlctl
+.deps
+routerstat
+wirecheck
\ No newline at end of file
diff --git a/lnet/utils/Makefile.am b/lnet/utils/Makefile.am
new file mode 100644 (file)
index 0000000..05af598
--- /dev/null
@@ -0,0 +1,27 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+
+COMPILE = gcc -Wall -g -I$(srcdir)/../include 
+LINK = gcc -o $@
+
+sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck
+lib_LIBRARIES = libptlctl.a
+
+acceptor_SOURCES = acceptor.c # -lefence
+
+wirecheck_SOURCES = wirecheck.c
+
+libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h
+
+ptlctl_SOURCES = ptlctl.c
+ptlctl_LDADD =  -L. -lptlctl -lncurses # -lefence
+ptlctl_DEPENDENCIES = libptlctl.a
+
+debugctl_SOURCES = debugctl.c
+debugctl_LDADD = -L. -lptlctl -lncurses # -lefence
+debugctl_DEPENDENCIES = libptlctl.a
+
+routerstat_SOURCES = routerstat.c
diff --git a/lnet/utils/acceptor.c b/lnet/utils/acceptor.c
new file mode 100644 (file)
index 0000000..c6590db
--- /dev/null
@@ -0,0 +1,466 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <asm/byteorder.h>
+#include <syslog.h>
+
+#include <errno.h>
+
+#include <portals/api-support.h>
+#include <portals/list.h>
+#include <portals/lib-types.h>
+
+/* should get this from autoconf somehow */
+#ifndef PIDFILE_DIR
+#define PIDFILE_DIR "/var/run"
+#endif 
+
+#define PROGNAME "acceptor"
+
+void create_pidfile(char *name, int port)
+{
+        char pidfile[1024];
+        FILE *fp;
+
+        snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", 
+                 PIDFILE_DIR, name, port);
+        
+        if ((fp = fopen(pidfile, "w"))) {
+                fprintf(fp, "%d\n", getpid());
+                fclose(fp);
+        } else {
+                syslog(LOG_ERR, "%s: %s\n", pidfile, 
+                       strerror(errno));
+        }
+}
+
+int pidfile_exists(char *name, int port)
+{
+        char pidfile[1024];
+
+        snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", 
+                 PIDFILE_DIR, name, port);
+        
+        if (!access(pidfile, F_OK)) {
+                fprintf(stderr, "%s: exists, acceptor already running.\n", 
+                        pidfile);
+                return (1);
+        } 
+        return (0);
+}
+
+int
+parse_size (int *sizep, char *str)
+{
+        int             size;
+        char            mod[32];
+
+        switch (sscanf (str, "%d%1[gGmMkK]", &size, mod))
+        {
+        default:
+                return (-1);
+
+        case 1:
+                *sizep = size;
+                return (0);
+
+        case 2:
+                switch (*mod)
+                {
+                case 'g':
+                case 'G':
+                        *sizep = size << 30;
+                        return (0);
+
+                case 'm':
+                case 'M':
+                        *sizep = size << 20;
+                        return (0);
+
+                case 'k':
+                case 'K':
+                        *sizep = size << 10;
+                        return (0);
+
+                default:
+                        *sizep = size;
+                        return (0);
+                }
+        }
+}
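
To make the accepted forms concrete, a small illustration (hypothetical helper,
not part of the patch):

    static void parse_size_examples(void)
    {
            int size;

            parse_size(&size, "64");        /* size == 64        */
            parse_size(&size, "64k");       /* size == 64 << 10  */
            parse_size(&size, "2M");        /* size == 2 << 20   */
            parse_size(&size, "1G");        /* size == 1 << 30   */
    }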
+
+void
+show_connection (int fd, __u32 net_ip, ptl_nid_t nid)
+{
+        struct hostent *h = gethostbyaddr ((char *)&net_ip, sizeof net_ip, AF_INET);
+        __u32 host_ip = ntohl (net_ip);
+        int  rxmem = 0;
+        int  txmem = 0;
+        int  nonagle = 0;
+        int  len;
+        char host[1024];
+        
+        len = sizeof (txmem);
+        if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &len) != 0)
+                perror ("Cannot get write buffer size");
+        
+        len = sizeof (rxmem);
+        if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &len) != 0)
+                perror ("Cannot get read buffer size");
+        
+        len = sizeof (nonagle);
+        if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &len) != 0)
+                perror ("Cannot get nagle");
+
+        if (h == NULL)
+                snprintf (host, sizeof(host), "%d.%d.%d.%d", (host_ip >> 24) & 0xff,
+                                    (host_ip >> 16) & 0xff, (host_ip >> 8) & 0xff, host_ip & 0xff);
+        else
+                snprintf (host, sizeof(host), "%s", h->h_name);
+                
+        syslog (LOG_INFO, "Accepted host: %s NID: "LPX64" snd: %d rcv %d nagle: %s\n", 
+                 host, nid, txmem, rxmem, nonagle ? "disabled" : "enabled");
+}
+
+int
+sock_write (int cfd, void *buffer, int nob)
+{
+        while (nob > 0)
+        {
+                int rc = write (cfd, buffer, nob);
+
+                if (rc < 0)
+                {
+                        if (errno == EINTR)
+                                continue;
+                        
+                        return (rc);
+                }
+
+                if (rc == 0)
+                {
+                        fprintf (stderr, "Unexpected zero sock_write\n");
+                        abort();
+                }
+
+                nob -= rc;
+                buffer = (char *)buffer + rc;
+        }
+        
+        return (0);
+}
+
+int
+sock_read (int cfd, void *buffer, int nob)
+{
+        while (nob > 0)
+        {
+                int rc = read (cfd, buffer, nob);
+                
+                if (rc < 0)
+                {
+                        if (errno == EINTR)
+                                continue;
+                        
+                        return (rc);
+                }
+                
+                if (rc == 0)                    /* EOF */
+                {
+                        errno = ECONNABORTED;
+                        return (-1);
+                }
+                
+                nob -= rc;
+                buffer = (char *)buffer + rc;
+        }
+        
+        return (0);
+}
+
+int
+exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid)
+{
+        int                      rc;
+        ptl_hdr_t                hdr;
+        ptl_magicversion_t      *hmv = (ptl_magicversion_t *)&hdr.dest_nid;
+
+        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
+
+        memset (&hdr, 0, sizeof (hdr));
+        
+        hmv->magic          = __cpu_to_le32 (PORTALS_PROTO_MAGIC);
+        hmv->version_major  = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR);
+        hmv->version_minor  = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR);
+
+        hdr.src_nid = __cpu_to_le64 (my_nid);
+        hdr.type = __cpu_to_le32 (PTL_MSG_HELLO);
+        
+        /* Assume there's sufficient socket buffering for a portals HELLO header */
+        rc = sock_write (cfd, &hdr, sizeof (hdr));
+        if (rc != 0) {
+                perror ("Can't send initial HELLO");
+                return (-1);
+        }
+
+        /* First few bytes down the wire are the portals protocol magic and
+         * version, no matter what protocol version we're running. */
+
+        rc = sock_read (cfd, hmv, sizeof (*hmv));
+        if (rc != 0) {
+                perror ("Can't read from peer");
+                return (-1);
+        }
+
+        if (__cpu_to_le32 (hmv->magic) != PORTALS_PROTO_MAGIC) {
+                fprintf (stderr, "Bad magic %#08x (%#08x expected)\n", 
+                         __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC);
+                return (-1);
+        }
+
+        if (__cpu_to_le16 (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR ||
+            __cpu_to_le16 (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) {
+                fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n",
+                         __cpu_to_le16 (hmv->version_major),
+                         __cpu_to_le16 (hmv->version_minor),
+                         PORTALS_PROTO_VERSION_MAJOR,
+                         PORTALS_PROTO_VERSION_MINOR);
+                return (-1);
+        }
+
+        /* version 0 sends magic/version as the dest_nid of a 'hello' header,
+         * so read the rest of it in now... */
+        LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0);
+        rc = sock_read (cfd, hmv + 1, sizeof (hdr) - sizeof (*hmv));
+        if (rc != 0) {
+                perror ("Can't read rest of HELLO hdr");
+                return (-1);
+        }
+
+        /* ...and check we got what we expected */
+        if (__cpu_to_le32 (hdr.type) != PTL_MSG_HELLO ||
+            __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)) != 0) {
+                fprintf (stderr, "Expecting a HELLO hdr with 0 payload,"
+                         " but got type %d with %d payload\n",
+                         __cpu_to_le32 (hdr.type),
+                         __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)));
+                return (-1);
+        }
+        
+        *peer_nid = __le64_to_cpu (hdr.src_nid);
+        return (0);
+}
+
+void
+usage (char *myname)
+{
+        fprintf (stderr, "Usage: %s [-r recv_mem] [-s send_mem] [-n] [-N nal_id] port\n", myname);
+        exit (1);
+}
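
As a concrete illustration (the port number is only an example), "acceptor -s
64k -r 64k -x 988" would listen on TCP port 988 with 64 KB send and receive
socket buffers and exchange NIDs with each connecting peer.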
+
+int main(int argc, char **argv)
+{
+        int o, fd, rc, port, pfd;
+        struct sockaddr_in srvaddr;
+        int c;
+        int rxmem = 0;
+        int txmem = 0;
+        int noclose = 0;
+        int nonagle = 1;
+        int nal = SOCKNAL;
+        int xchg_nids = 0;
+        int bind_irq = 0;
+        
+        while ((c = getopt (argc, argv, "N:r:s:nlxi")) != -1)
+                switch (c)
+                {
+                case 'r':
+                        if (parse_size (&rxmem, optarg) != 0 || rxmem < 0)
+                                usage (argv[0]);
+                        break;
+                        
+                case 's':
+                        if (parse_size (&txmem, optarg) != 0 || txmem < 0)
+                                usage (argv[0]);
+                        break;
+
+                case 'n':
+                        nonagle = 0;
+                        break;
+
+                case 'l':
+                        noclose = 1;
+                        break;
+
+                case 'x':
+                        xchg_nids = 1;
+                        break;
+
+                case 'i':
+                        bind_irq = 1;
+                        break;
+                        
+                case 'N':
+                        if (parse_size(&nal, optarg) != 0 || 
+                            nal < 0 || nal > NAL_MAX_NR)
+                                usage(argv[0]);
+                        break;
+                        
+                default:
+                        usage (argv[0]);
+                        break;
+                }
+
+        if (optind >= argc)
+                usage (argv[0]);
+
+        port = atol(argv[optind++]);
+
+        if (pidfile_exists(PROGNAME, port))
+                exit(1);
+
+        memset(&srvaddr, 0, sizeof(srvaddr));
+        srvaddr.sin_family = AF_INET;
+        srvaddr.sin_port = htons(port);
+        srvaddr.sin_addr.s_addr = INADDR_ANY;
+
+        fd = socket(PF_INET, SOCK_STREAM, 0);
+        if (fd < 0) {
+                perror("opening socket");
+                exit(1);
+        }
+
+        o = 1;
+        if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &o, sizeof(o))) {
+                perror("Cannot set REUSEADDR socket opt");
+                exit(1);
+        }
+
+        if (nonagle)
+        {
+                o = 1;
+                rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o));
+                if (rc != 0) 
+                { 
+                        perror ("Cannot disable nagle");
+                        exit (1);
+                }
+        }
+
+        if (txmem != 0)
+        {
+                rc = setsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, sizeof (txmem));
+                if (rc != 0)
+                {
+                        perror ("Cannot set write buffer size");
+                        exit (1);
+                }
+        }
+        
+        if (rxmem != 0)
+        {
+                rc = setsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, sizeof (rxmem));
+                if (rc != 0)
+                {
+                        perror ("Cannot set read buffer size");
+                        exit (1);
+                }
+        }
+                
+        rc = bind(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
+        if ( rc == -1 ) {
+                perror("bind: ");
+                exit(1);
+        }
+
+        if (listen(fd, 127)) {
+                perror("listen: ");
+                exit(1);
+        }
+        fprintf(stderr, "listening on port %d\n", port);
+
+        pfd = open("/dev/portals", O_RDWR);
+        if ( pfd < 0 ) {
+                perror("opening portals device");
+                exit(1);
+        }
+
+        rc = daemon(1, noclose);
+        if (rc < 0) {
+                perror("daemon(): ");
+                exit(1);
+        }
+
+        openlog(PROGNAME, LOG_PID, LOG_DAEMON);
+        syslog(LOG_INFO, "started, listening on port %d\n", port);
+        create_pidfile(PROGNAME, port);
+
+        while (1) {
+                struct sockaddr_in clntaddr;
+                int len = sizeof(clntaddr);
+                int cfd;
+                struct portal_ioctl_data data;
+                ptl_nid_t peer_nid;
+                
+                cfd = accept(fd, (struct sockaddr *)&clntaddr, &len);
+                if ( cfd < 0 ) {
+                        perror("accept");
+                        continue;
+                }
+
+                if (!xchg_nids)
+                        peer_nid = ntohl (clntaddr.sin_addr.s_addr); /* HOST byte order */
+                else
+                {
+                        PORTAL_IOC_INIT (data);
+                        data.ioc_nal = nal;
+                        rc = ioctl (pfd, IOC_PORTAL_GET_NID, &data);
+                        if (rc < 0)
+                        {
+                                perror ("Can't get my NID");
+                                close (cfd);
+                                continue;
+                        }
+                        
+                        rc = exchange_nids (cfd, data.ioc_nid, &peer_nid);
+                        if (rc != 0)
+                        {
+                                close (cfd);
+                                continue;
+                        }
+                }
+
+                show_connection (cfd, clntaddr.sin_addr.s_addr, peer_nid);
+                
+                PORTAL_IOC_INIT(data);
+                data.ioc_fd = cfd;
+                data.ioc_nal = nal;
+                data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD;
+                data.ioc_nid = peer_nid;
+                data.ioc_flags = bind_irq;
+                
+                if (ioctl(pfd, IOC_PORTAL_NAL_CMD, &data) < 0) {
+                        perror("ioctl failed");
+
+                } else {
+                        printf("client registered\n");
+                }
+                rc = close(cfd);
+                if (rc)
+                        perror ("close failed");
+        }
+
+        closelog();
+        exit(0);
+
+}
diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c
new file mode 100644 (file)
index 0000000..9ab1c73
--- /dev/null
@@ -0,0 +1,618 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Some day I'll split all of this functionality into a cfs_debug module
+ * of its own.  That day is not today.
+ *
+ */
+
+#include <stdio.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <time.h>
+#include <syscall.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#define BUG()                            /* workaround for module.h includes */
+#include <linux/version.h>
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#include <linux/module.h>
+#endif
+
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+#include "parser.h"
+
+static char rawbuf[8192];
+static char *buf = rawbuf;
+static int max = 8192;
+//static int g_pfd = -1;
+static int subsystem_array[1 << 8];
+static int debug_mask = ~0;
+
+static const char *portal_debug_subsystems[] =
+        {"undefined", "mdc", "mds", "osc", "ost", "class", "obdfs", "llite",
+         "rpc", "ext2obd", "portals", "socknal", "qswnal", "pinger", "filter",
+         "obdtrace", "echo", "ldlm", "lov", "gmnal", "router", "ptldb", NULL};
+static const char *portal_debug_masks[] =
+        {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl",
+         "blocks", "net", "warning", "buffs", "other", "dentry", "portals",
+         "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace", NULL};
+
+struct debug_daemon_cmd {
+        char *cmd;
+        unsigned int cmdv;
+};
+
+static const struct debug_daemon_cmd portal_debug_daemon_cmd[] = {
+        {"start", DEBUG_DAEMON_START},
+        {"stop", DEBUG_DAEMON_STOP},
+        {"pause", DEBUG_DAEMON_PAUSE},
+        {"continue", DEBUG_DAEMON_CONTINUE},
+        {0, 0}
+};
+
+static int do_debug_mask(char *name, int enable)
+{
+        int found = 0, i;
+
+        for (i = 0; portal_debug_subsystems[i] != NULL; i++) {
+                if (strcasecmp(name, portal_debug_subsystems[i]) == 0 ||
+                    strcasecmp(name, "all_subs") == 0) {
+                        printf("%s output from subsystem \"%s\"\n",
+                                enable ? "Enabling" : "Disabling",
+                                portal_debug_subsystems[i]);
+                        subsystem_array[i] = enable;
+                        found = 1;
+                }
+        }
+        for (i = 0; portal_debug_masks[i] != NULL; i++) {
+                if (strcasecmp(name, portal_debug_masks[i]) == 0 ||
+                    strcasecmp(name, "all_types") == 0) {
+                        printf("%s output of type \"%s\"\n",
+                                enable ? "Enabling" : "Disabling",
+                                portal_debug_masks[i]);
+                        if (enable)
+                                debug_mask |= (1 << i);
+                        else
+                                debug_mask &= ~(1 << i);
+                        found = 1;
+                }
+        }
+
+        return found;
+}
+
+int dbg_initialize(int argc, char **argv)
+{
+        memset(subsystem_array, 1, sizeof(subsystem_array));
+        return 0;
+}
+
+int jt_dbg_filter(int argc, char **argv)
+{
+        int   i;
+
+        if (argc < 2) {
+                fprintf(stderr, "usage: %s <subsystem ID or debug mask>\n",
+                        argv[0]);
+                return 0;
+        }
+
+        for (i = 1; i < argc; i++)
+                if (!do_debug_mask(argv[i], 0))
+                        fprintf(stderr, "Unknown subsystem or debug type: %s\n",
+                                argv[i]);
+        return 0;
+}
+
+int jt_dbg_show(int argc, char **argv)
+{
+        int    i;
+
+        if (argc < 2) {
+                fprintf(stderr, "usage: %s <subsystem ID or debug mask>\n",
+                        argv[0]);
+                return 0;
+        }
+
+        for (i = 1; i < argc; i++)
+                if (!do_debug_mask(argv[i], 1))
+                        fprintf(stderr, "Unknown subsystem or debug type: %s\n",
+                                argv[i]);
+
+        return 0;
+}
+
+static int applymask(char* procpath, int value)
+{
+        int rc;
+        char buf[64];
+        int len = snprintf(buf, 64, "%d", value);
+
+        int fd = open(procpath, O_WRONLY);
+        if (fd == -1) {
+                fprintf(stderr, "Unable to open %s: %s\n",
+                        procpath, strerror(errno));
+                return fd;
+        }
+        rc = write(fd, buf, len+1);
+        if (rc < 0) {
+                fprintf(stderr, "Write to %s failed: %s\n",
+                        procpath, strerror(errno));
+                close(fd);
+                return rc;
+        }
+        close(fd);
+        return 0;
+}
+
+extern char *dump_filename;
+extern int dump(int dev_id, int opc, void *buf);
+
+static void applymask_all(unsigned int subs_mask, unsigned int debug_mask)
+{
+        if (!dump_filename) {
+                applymask("/proc/sys/portals/subsystem_debug", subs_mask);
+                applymask("/proc/sys/portals/debug", debug_mask);
+        } else {
+                struct portals_debug_ioctl_data data;
+
+                data.hdr.ioc_len = sizeof(data);
+                data.hdr.ioc_version = 0;
+                data.subs = subs_mask;
+                data.debug = debug_mask;
+
+                dump(OBD_DEV_ID, PTL_IOC_DEBUG_MASK, &data);
+        }
+        printf("Applied subsystem_debug=%d, debug=%d to /proc/sys/portals\n",
+               subs_mask, debug_mask);
+}
+
+int jt_dbg_list(int argc, char **argv)
+{
+        int i;
+
+        if (argc != 2) {
+                fprintf(stderr, "usage: %s <subs || types>\n", argv[0]);
+                return 0;
+        }
+
+        if (strcasecmp(argv[1], "subs") == 0) {
+                printf("Subsystems: all_subs");
+                for (i = 0; portal_debug_subsystems[i] != NULL; i++)
+                        printf(", %s", portal_debug_subsystems[i]);
+                printf("\n");
+        } else if (strcasecmp(argv[1], "types") == 0) {
+                printf("Types: all_types");
+                for (i = 0; portal_debug_masks[i] != NULL; i++)
+                        printf(", %s", portal_debug_masks[i]);
+                printf("\n");
+        }
+        else if (strcasecmp(argv[1], "applymasks") == 0) {
+                unsigned int subsystem_mask = 0;
+                for (i = 0; portal_debug_subsystems[i] != NULL; i++) {
+                        if (subsystem_array[i]) subsystem_mask |= (1 << i);
+                }
+                applymask_all(subsystem_mask, debug_mask);
+        }
+        return 0;
+}
+
+/* if 'raw' is true, don't strip the debug information from the front of the
+ * lines */
+static void dump_buffer(FILE *fd, char *buf, int size, int raw)
+{
+        char *p, *z;
+        unsigned long subsystem, debug, dropped = 0, kept = 0;
+        int max_sub, max_type;
+
+        for (max_sub = 0; portal_debug_subsystems[max_sub] != NULL; max_sub++)
+                ;
+        for (max_type = 0; portal_debug_masks[max_type] != NULL; max_type++)
+                ;
+
+        while (size) {
+                p = memchr(buf, '\n', size);
+                if (!p)
+                        break;
+                subsystem = strtoul(buf, &z, 16);
+                debug = strtoul(z + 1, &z, 16);
+
+                z++;
+                /* for some reason %*s isn't working. */
+                *p = '\0';
+                if (subsystem < max_sub &&
+                    subsystem_array[subsystem] &&
+                    (!debug || (debug_mask & debug))) {
+                        if (raw)
+                                fprintf(fd, "%s\n", buf);
+                        else
+                                fprintf(fd, "%s\n", z);
+                        //printf("%s\n", buf);
+                        kept++;
+                } else {
+                        //fprintf(stderr, "dropping line (%lx:%lx): %s\n", subsystem, debug, buf);
+                        dropped++;
+                }
+                *p = '\n';
+                p++;
+                size -= (p - buf);
+                buf = p;
+        }
+
+        printf("Debug log: %lu lines, %lu kept, %lu dropped.\n",
+                dropped + kept, kept, dropped);
+}
+
+int jt_dbg_debug_kernel(int argc, char **argv)
+{
+        int rc, raw = 1;
+        FILE *fd = stdout;
+        const int databuf_size = (6 << 20);
+        struct portal_ioctl_data data, *newdata;
+        char *databuf = NULL;
+
+        if (argc > 3) {
+                fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]);
+                return 0;
+        }
+
+        if (argc > 1) {
+                fd = fopen(argv[1], "w");
+                if (fd == NULL) {
+                        fprintf(stderr, "fopen(%s) failed: %s\n", argv[1],
+                                strerror(errno));
+                        return -1;
+                }
+        }
+        if (argc > 2)
+                raw = atoi(argv[2]);
+
+        databuf = malloc(databuf_size);
+        if (!databuf) {
+                fprintf(stderr, "No memory for buffer.\n");
+                goto out;
+        }
+
+        memset(&data, 0, sizeof(data));
+        data.ioc_plen1 = databuf_size;
+        data.ioc_pbuf1 = databuf;
+
+        if (portal_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                goto out;
+        }
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_DEBUG, buf);
+        if (rc) {
+                fprintf(stderr, "IOC_PORTAL_GET_DEBUG failed: %s\n",
+                        strerror(errno));
+                goto out;
+        }
+
+        newdata = (struct portal_ioctl_data *)buf;
+        if (newdata->ioc_size > 0)
+                dump_buffer(fd, databuf, newdata->ioc_size, raw);
+
+ out:
+        if (databuf)
+                free(databuf);
+        if (fd != stdout)
+                fclose(fd);
+        return 0;
+}
+
+int jt_dbg_debug_daemon(int argc, char **argv)
+{
+        int i, rc;
+        unsigned int cmd = 0;
+        FILE *fd = stdout;
+        struct portal_ioctl_data data;
+
+        if (argc <= 1) {
+                fprintf(stderr, "usage: %s [start file <#MB>|stop|pause|"
+                        "continue]\n", argv[0]);
+                return 0;
+        }
+        for (i = 0; portal_debug_daemon_cmd[i].cmd != NULL; i++) {
+                if (strcasecmp(argv[1], portal_debug_daemon_cmd[i].cmd) == 0) {
+                        cmd = portal_debug_daemon_cmd[i].cmdv;
+                        break;
+                }
+        }
+        if (portal_debug_daemon_cmd[i].cmd == NULL) {
+                fprintf(stderr, "usage: %s [start file <#MB>|stop|pause|"
+                        "continue]\n", argv[0]);
+                return 0;
+        }
+        memset(&data, 0, sizeof(data));
+        if (cmd == DEBUG_DAEMON_START) {
+                if (argc < 3) {
+                        fprintf(stderr, "usage: %s [start file <#MB>|stop|"
+                                "pause|continue]\n", argv[0]);
+                        return 0;
+                }
+                if (access(argv[2], F_OK) != 0) {
+                        fd = fopen(argv[2], "w");
+                        if (fd != NULL) {
+                                fclose(fd);
+                                remove(argv[2]);
+                                goto ok;
+                        }
+                }
+                if (access(argv[2], W_OK) == 0)
+                        goto ok;
+                fprintf(stderr, "fopen(%s) failed: %s\n", argv[2],
+                        strerror(errno));
+                return -1;
+ok:
+                data.ioc_inllen1 = strlen(argv[2]) + 1;
+                data.ioc_inlbuf1 = argv[2];
+                data.ioc_misc = 0;
+                if (argc == 4) {
+                        unsigned long size;
+                        errno = 0;
+                        size = strtoul(argv[3], NULL, 0);
+                        if (errno) {
+                                fprintf(stderr, "file size(%s): error %s\n",
+                                        argv[3], strerror(errno));
+                                return -1;
+                        }
+                        data.ioc_misc = size;
+                }
+        }
+        data.ioc_count = cmd;
+        if (portal_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                return -1;
+        }
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_SET_DAEMON, buf);
+        if (rc < 0) {
+                fprintf(stderr, "IOC_PORTAL_SET_DEMON failed: %s\n",
+                                strerror(errno));
+                return rc;
+        }
+        return 0;
+}
+
+int jt_dbg_debug_file(int argc, char **argv)
+{
+        int rc, fd = -1, raw = 1;
+        FILE *output = stdout;
+        char *databuf = NULL;
+        struct stat statbuf;
+
+        if (argc > 4 || argc < 2) {
+                fprintf(stderr, "usage: %s <input> [output] [raw]\n", argv[0]);
+                return 0;
+        }
+
+        fd = open(argv[1], O_RDONLY);
+        if (fd < 0) {
+                fprintf(stderr, "fopen(%s) failed: %s\n", argv[1],
+                        strerror(errno));
+                return -1;
+        }
+#warning FIXME: cleanup fstat issue here
+#ifndef SYS_fstat64
+#define __SYS_fstat__ SYS_fstat
+#else
+#define __SYS_fstat__ SYS_fstat64
+#endif
+        rc = syscall(__SYS_fstat__, fd, &statbuf);
+        if (rc < 0) {
+                fprintf(stderr, "fstat failed: %s\n", strerror(errno));
+                goto out;
+        }
+
+        if (argc >= 3) {
+                output = fopen(argv[2], "w");
+                if (output == NULL) {
+                        fprintf(stderr, "fopen(%s) failed: %s\n", argv[2],
+                                strerror(errno));
+                        goto out;
+                }
+        }
+
+        if (argc == 4)
+                raw = atoi(argv[3]);
+
+        databuf = mmap(NULL, statbuf.st_size, PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE, fd, 0);
+        if (databuf == MAP_FAILED) {
+                fprintf(stderr, "mmap failed: %s\n", strerror(errno));
+                databuf = NULL;
+                goto out;
+        }
+
+        dump_buffer(output, databuf, statbuf.st_size, raw);
+
+ out:
+        if (databuf)
+                munmap(databuf, statbuf.st_size);
+        if (output != stdout)
+                fclose(output);
+        if (fd > 0)
+                close(fd);
+        return 0;
+}
+
+int jt_dbg_clear_debug_buf(int argc, char **argv)
+{
+        int rc;
+        struct portal_ioctl_data data;
+
+        if (argc != 1) {
+                fprintf(stderr, "usage: %s\n", argv[0]);
+                return 0;
+        }
+
+        memset(&data, 0, sizeof(data));
+        if (portal_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                return -1;
+        }
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_CLEAR_DEBUG, buf);
+        if (rc) {
+                fprintf(stderr, "IOC_PORTAL_CLEAR_DEBUG failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+        return 0;
+}
+
+int jt_dbg_mark_debug_buf(int argc, char **argv)
+{
+        int rc;
+        struct portal_ioctl_data data;
+        char *text;
+        time_t now = time(NULL);
+
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [marker text]\n", argv[0]);
+                return 0;
+        }
+
+        if (argc == 2) {
+                text = argv[1];
+        } else {
+                text = ctime(&now);
+                text[strlen(text) - 1] = '\0'; /* stupid \n */
+        }
+
+        memset(&data, 0, sizeof(data));
+        data.ioc_inllen1 = strlen(text) + 1;
+        data.ioc_inlbuf1 = text;
+        if (portal_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                return -1;
+        }
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_MARK_DEBUG, buf);
+        if (rc) {
+                fprintf(stderr, "IOC_PORTAL_MARK_DEBUG failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+        return 0;
+}
+
+
+int jt_dbg_modules(int argc, char **argv)
+{
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        struct mod_paths {
+                char *name, *path;
+        } *mp, mod_paths[] = {
+                {"portals", "lustre/portals/libcfs"},
+                {"ksocknal", "lustre/portals/knals/socknal"},
+                {"obdclass", "lustre/obdclass"},
+                {"ptlrpc", "lustre/ptlrpc"},
+                {"obdext2", "lustre/obdext2"},
+                {"ost", "lustre/ost"},
+                {"osc", "lustre/osc"},
+                {"mds", "lustre/mds"},
+                {"mdc", "lustre/mdc"},
+                {"llite", "lustre/llite"},
+                {"obdecho", "lustre/obdecho"},
+                {"ldlm", "lustre/ldlm"},
+                {"obdfilter", "lustre/obdfilter"},
+                {"extN", "lustre/extN"},
+                {"lov", "lustre/lov"},
+                {"fsfilt_ext3", "lustre/obdclass"},
+                {"fsfilt_extN", "lustre/obdclass"},
+                {"mds_ext2", "lustre/mds"},
+                {"mds_ext3", "lustre/mds"},
+                {"mds_extN", "lustre/mds"},
+                {"ptlbd", "lustre/ptlbd"},
+                {NULL, NULL}
+        };
+        char *path = "..";
+        char *kernel = "linux";
+
+        if (argc >= 2)
+                path = argv[1];
+        if (argc == 3)
+                kernel = argv[2];
+        if (argc > 3) {
+                printf("%s [path] [kernel]\n", argv[0]);
+                return 0;
+        }
+
+        for (mp = mod_paths; mp->name != NULL; mp++) {
+                struct module_info info;
+                int rc;
+                size_t crap;
+                int query_module(const char *name, int which, void *buf,
+                                 size_t bufsize, size_t *ret);
+
+                rc = query_module(mp->name, QM_INFO, &info, sizeof(info),
+                                  &crap);
+                if (rc < 0) {
+                        if (errno != ENOENT)
+                                printf("query_module(%s) failed: %s\n",
+                                       mp->name, strerror(errno));
+                } else {
+                        printf("add-symbol-file %s/%s/%s.o 0x%0lx\n", path,
+                               mp->path, mp->name,
+                               info.addr + sizeof(struct module));
+                }
+        }
+
+        return 0;
+#else
+        printf("jt_dbg_module is not yet implemented for Linux 2.5\n");
+        return 0;
+#endif /* linux 2.5 */
+}
+
+int jt_dbg_panic(int argc, char **argv)
+{
+        int rc;
+        struct portal_ioctl_data data;
+
+        if (argc != 1) {
+                fprintf(stderr, "usage: %s\n", argv[0]);
+                return 0;
+        }
+
+        memset(&data, 0, sizeof(data));
+        if (portal_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                return -1;
+        }
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PANIC, buf);
+        if (rc) {
+                fprintf(stderr, "IOC_PORTAL_PANIC failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+        return 0;
+}
diff --git a/lnet/utils/debugctl.c b/lnet/utils/debugctl.c
new file mode 100644 (file)
index 0000000..02cb9b4
--- /dev/null
@@ -0,0 +1,66 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Some day I'll split all of this functionality into a cfs_debug module
+ * of its own.  That day is not today.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+#include "parser.h"
+
+
+command_t list[] = {
+        {"debug_kernel", jt_dbg_debug_kernel, 0, "usage: debug_kernel [file] [raw], get debug buffer and print it [to a file]"},
+        {"debug_daemon", jt_dbg_debug_daemon, 0, "usage: debug_daemon [start file [#MB]|stop|pause|continue], control debug daemon to dump debug buffer to a file"}, 
+        {"debug_file", jt_dbg_debug_file, 0, "usage: debug_file <input> [output] [raw], read debug buffer from input and print it [to output]"},
+        {"clear", jt_dbg_clear_debug_buf, 0, "clear kernel debug buffer"},
+        {"mark", jt_dbg_mark_debug_buf, 0, "insert a marker into the kernel debug buffer (args: [marker text])"},
+        {"filter", jt_dbg_filter, 0, "filter certain messages (args: subsystem/debug ID)\n"},
+        {"show", jt_dbg_show, 0, "enable certain messages (args: subsystem/debug ID)\n"},
+        {"list", jt_dbg_list, 0, "list subsystem and debug types (args: subs or types)\n"},
+        {"modules", jt_dbg_modules, 0, "provide gdb-friendly module info (arg: <path>)"},
+        {"panic", jt_dbg_panic, 0, "cause the kernel to panic"},
+        {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"},
+        {"help", Parser_help, 0, "help"},
+        {"exit", Parser_quit, 0, "quit"},
+        {"quit", Parser_quit, 0, "quit"},
+        { 0, 0, 0, NULL }
+};
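
A short interactive session (hypothetical; the log file name is only an
example) might look like:

    debugctl > filter all_subs
    debugctl > show portals
    debugctl > show all_types
    debugctl > debug_kernel /tmp/portals-debug.log 1
    debugctl > quit

that is: disable every subsystem, re-enable the portals subsystem and all
message types, then dump the kernel debug buffer in raw form to
/tmp/portals-debug.log.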
+
+int main(int argc, char **argv)
+{
+        if (dbg_initialize(argc, argv) < 0)
+                exit(2);
+
+        register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH);
+
+        Parser_init("debugctl > ", list);
+        if (argc > 1)
+                return Parser_execarg(argc - 1, &argv[1], list);
+
+        Parser_commands();
+
+        unregister_ioc_dev(PORTALS_DEV_ID);
+        return 0;
+}
diff --git a/lnet/utils/l_ioctl.c b/lnet/utils/l_ioctl.c
new file mode 100644 (file)
index 0000000..722bb57
--- /dev/null
@@ -0,0 +1,281 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+
+struct ioc_dev {
+       const char * dev_name;
+       int dev_fd;
+};
+
+static struct ioc_dev ioc_dev_list[10];
+
+struct dump_hdr {
+       int magic;
+       int dev_id;
+       int opc;
+};
+
+char * dump_filename;
+
+static int
+open_ioc_dev(int dev_id) 
+{
+       const char * dev_name;
+
+       if (dev_id < 0 ||
+           dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0]))
+               return -EINVAL;
+
+       dev_name = ioc_dev_list[dev_id].dev_name;
+       if (dev_name == NULL) {
+                fprintf(stderr, "unknown device id: %d\n", dev_id);
+               return -EINVAL;
+       }
+
+       if (ioc_dev_list[dev_id].dev_fd < 0) {
+               int fd = open(dev_name, O_RDWR);
+               
+               if (fd < 0) {
+                       fprintf(stderr, "opening %s failed: %s\n"
+                               "hint: the kernel modules may not be loaded\n",
+                               dev_name, strerror(errno));
+                       return fd;
+               }
+               ioc_dev_list[dev_id].dev_fd = fd;
+       }
+
+       return ioc_dev_list[dev_id].dev_fd;
+}
+
+
+static int 
+do_ioctl(int dev_id, int opc, void *buf)
+{
+       int fd, rc;
+       
+       fd = open_ioc_dev(dev_id);
+       if (fd < 0) 
+               return fd;
+
+       rc = ioctl(fd, opc, buf);
+       return rc;
+       
+}
+
+static FILE *
+get_dump_file() 
+{
+       FILE *fp = NULL;
+       
+       if (!dump_filename) {
+               fprintf(stderr, "no dump filename\n");
+       } else 
+               fp = fopen(dump_filename, "a");
+       return fp;
+}
+
+/*
+ * The dump file should start with a description of which devices are
+ * used, but for now it is assumed that whatever app reads the file
+ * will know what to do. */
+int 
+dump(int dev_id, int opc, void *buf)
+{
+       FILE *fp;
+       struct dump_hdr dump_hdr;
+       struct portal_ioctl_hdr * ioc_hdr = (struct  portal_ioctl_hdr *) buf;
+       int rc;
+       
+       printf("dumping opc %x to %s\n", opc, dump_filename);
+       
+
+       dump_hdr.magic = 0xdeadbeef;
+       dump_hdr.dev_id = dev_id;
+       dump_hdr.opc = opc;
+
+       fp = get_dump_file();
+       if (fp == NULL) {
+               fprintf(stderr, "%s: %s\n", dump_filename, 
+                       strerror(errno));
+               return -EINVAL;
+       }
+       
+       rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp);
+       if (rc == 1)
+               rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp);
+       fclose(fp);
+       if (rc != 1) {
+               fprintf(stderr, "%s: %s\n", dump_filename, 
+                       strerror(errno));
+               return -EINVAL;
+       }
+       
+       return 0;
+}
+
+/* register a device to send ioctls to.  */
+int 
+register_ioc_dev(int dev_id, const char * dev_name) 
+{
+
+       if (dev_id < 0 ||
+           dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0]))
+               return -EINVAL;
+
+       unregister_ioc_dev(dev_id);
+
+       ioc_dev_list[dev_id].dev_name = dev_name;
+       ioc_dev_list[dev_id].dev_fd = -1;
+
+       return dev_id;
+}
+
+void
+unregister_ioc_dev(int dev_id) 
+{
+
+       if (dev_id < 0 ||
+           dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0]))
+               return;
+       if (ioc_dev_list[dev_id].dev_name != NULL &&
+           ioc_dev_list[dev_id].dev_fd >= 0) 
+               close(ioc_dev_list[dev_id].dev_fd);
+
+       ioc_dev_list[dev_id].dev_name = NULL;
+       ioc_dev_list[dev_id].dev_fd = -1;
+}
+
+/* If this file is set, then all ioctl buffers will be 
+   appended to the file. */
+int
+set_ioctl_dump(char * file)
+{
+       if (dump_filename)
+               free(dump_filename);
+       
+       dump_filename = strdup(file);
+       return 0;
+}
+
+int
+l_ioctl(int dev_id, int opc, void *buf)
+{
+       if (dump_filename) 
+               return dump(dev_id, opc, buf);
+       else 
+               return do_ioctl(dev_id, opc, buf);
+}
+
+/* Read an ioctl dump file, and call the ioc_func for each ioctl buffer
+ * in the file.  For example:
+ *
+ * parse_dump("lctl.dump", l_ioctl);
+ *
+ * Note: if using l_ioctl, then you also need to register_ioc_dev() for 
+ * each device used in the dump.
+ */
+int 
+parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *))
+{
+       int fd, line =0;
+       struct stat st;
+       char *buf, *end;
+       
+       fd = syscall(SYS_open, dump_file, O_RDONLY);
+       if (fd < 0) {
+               perror(dump_file);
+               exit(1);
+       }
+
+#warning FIXME: cleanup fstat issue here
+#ifndef SYS_fstat64
+#define __SYS_fstat__ SYS_fstat
+#else
+#define __SYS_fstat__ SYS_fstat64
+#endif
+       if (syscall(__SYS_fstat__, fd, &st)) { 
+               perror("stat fails");
+               exit(1);
+       }
+
+       if (st.st_size < 1) {
+               fprintf(stderr, "KML is empty\n");
+               exit(1);
+       }
+
+       buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+       if (buf == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+       end = buf + st.st_size;
+       close(fd);
+       while (buf < end) {
+               struct dump_hdr *dump_hdr = (struct dump_hdr *) buf;
+               struct portal_ioctl_hdr * data;
+               char tmp[8096];
+               int rc;
+               
+               line++;
+
+               data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr));
+               if (buf + data->ioc_len > end ) {
+                       fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf,
+                               data->ioc_len, end);
+                       return -1;
+               }
+#if 0
+               printf ("dump_hdr: %lx data: %lx\n",
+                       (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf);
+               
+               printf("%d: opcode %x len: %d  ver: %x ", line, dump_hdr->opc,
+                      data->ioc_len, data->ioc_version);
+#endif
+
+               memcpy(tmp, data, data->ioc_len);
+
+               rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp);
+               if (rc) {
+                       printf("failed: %d\n", rc);
+                       exit(1);
+               }
+
+               buf += data->ioc_len + sizeof(*dump_hdr);
+       }
+       return 0;
+}
+
+int 
+jt_ioc_dump(int argc, char **argv)
+{
+        if (argc != 2) {
+                fprintf(stderr, "usage: %s <filename>\n", argv[0]);
+                return 0;
+        }
+       printf("setting dumpfile to: %s\n", argv[1]);
+       
+       set_ioctl_dump(argv[1]);
+       return 0;
+}
diff --git a/lnet/utils/parser.c b/lnet/utils/parser.c
new file mode 100644 (file)
index 0000000..4d93645
--- /dev/null
@@ -0,0 +1,703 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <sys/param.h>
+#include <assert.h>
+
+#include <config.h>
+#ifdef HAVE_LIBREADLINE
+#define        READLINE_LIBRARY
+#include <readline/readline.h>
+#endif
+//extern char **completion_matches __P((char *, rl_compentry_func_t *));
+extern void using_history(void);
+extern void stifle_history(int);
+extern void add_history(char *);
+
+#include "parser.h"
+
+static command_t * top_level;      /* Top level of commands, initialized by
+                                    * Parser_init                           */
+static char * parser_prompt = NULL;/* Parser prompt, set by Parser_init     */
+static int done;                  /* Set to 1 if user types exit or quit   */
+
+
+/* static functions */
+static char *skipwhitespace(char *s);
+static char *skiptowhitespace(char *s);
+static command_t *find_cmd(char *name, command_t cmds[], char **next);
+static int process(char *s, char **next, command_t *lookup, command_t **result,
+                   char **prev);
+static void print_commands(char *str, command_t *table);
+
+static char * skipwhitespace(char * s)
+{
+    char * t;
+    int    len;
+
+    len = (int)strlen(s);
+    for (t = s; t <= s + len && isspace(*t); t++);
+    return(t);
+}
+
+
+static char * skiptowhitespace(char * s)
+{
+    char * t;
+
+    for (t = s; *t && !isspace(*t); t++);
+    return(t);
+}
+
+static int line2args(char *line, char **argv, int maxargs)
+{
+    char *arg;
+    int i = 0;
+
+    arg = strtok(line, " \t");
+    if ( arg ) {
+       argv[i] = arg;
+       i++;
+    } else
+       return 0;
+
+    while( (arg = strtok(NULL, " \t")) && (i <= maxargs)) {
+       argv[i] = arg;
+       i++;
+    }
+    return i;
+}
+
+/* find a command -- return it if unique otherwise print alternatives */
+static command_t *Parser_findargcmd(char *name, command_t cmds[])
+{
+       command_t *cmd;
+
+       for (cmd = cmds; cmd->pc_name; cmd++) {
+               if (strcmp(name, cmd->pc_name) == 0)
+                       return cmd;
+       }
+       return NULL;
+}
+
+int Parser_execarg(int argc, char **argv, command_t cmds[])
+{
+       command_t *cmd;
+
+        cmd = Parser_findargcmd(argv[0], cmds);
+       if ( cmd ) {
+               return (cmd->pc_func)(argc, argv);
+       } else {
+               printf("Try interactive use without arguments or use one of:\n");
+               for (cmd = cmds; cmd->pc_name; cmd++)
+                       printf("\"%s\" ", cmd->pc_name);
+               printf("\nas argument.\n");
+       }
+       return -1;
+}
+
+/* returns the command_t * (NULL if not found) corresponding to a
+   _partial_ match with the first token in name.  It sets *next to
+   point to the following token. Does not modify *name. */
+static command_t * find_cmd(char * name, command_t cmds[], char ** next)
+{
+        int    i, len;
+    
+        if (!cmds || !name ) 
+                return NULL;
+    
+        /* This sets name to point to the first non-white space character,
+           and next to the first whitespace after name, len to the length: do
+           this with strtok*/
+        name = skipwhitespace(name);
+        *next = skiptowhitespace(name);
+        len = *next - name;
+        if (len == 0) 
+                return NULL;
+
+        for (i = 0; cmds[i].pc_name; i++) {
+                if (strncasecmp(name, cmds[i].pc_name, len) == 0) {
+                        *next = skipwhitespace(*next);
+                        return(&cmds[i]);
+                }
+        }
+        return NULL;
+}
+
+/* Recursively process a command line string s and find the command
+   corresponding to it. This can be ambiguous, full, incomplete,
+   non-existent. */
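+/* Example (hypothetical table): a top-level entry "network" whose
+   pc_sub_cmd table holds "connect" resolves the line "net con foo" to the
+   "connect" entry (CMD_COMPLETE), leaving *next pointing at "foo". */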
+static int process(char *s, char ** next, command_t *lookup,
+                  command_t **result, char **prev)
+{
+    *result = find_cmd(s, lookup, next);
+    *prev = s;
+
+        /* non-existent */
+        if ( ! *result ) 
+                return CMD_NONE;
+
+        /* found entry: is it ambiguous, i.e. not an exact command name and
+           more than one command in the list matches.  Note that find_cmd
+           points to the first ambiguous entry */
+        if ( strncasecmp(s, (*result)->pc_name, strlen((*result)->pc_name)) &&
+             find_cmd(s, (*result) + 1, next)) 
+                return CMD_AMBIG;
+
+        /* found a unique command: component or full? */
+        if ( (*result)->pc_func ) {
+                return CMD_COMPLETE;
+        } else {
+                if ( **next == '\0' ) {
+                        return CMD_INCOMPLETE;
+                } else {
+                        return process(*next, next, (*result)->pc_sub_cmd, result, prev);
+                }
+        }
+}
+
+#ifdef HAVE_LIBREADLINE
+static command_t * match_tbl;   /* Command completion against this table */
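+/* readline completion generator: called repeatedly for a single word;
+   state == 0 on the first call, and each call returns a malloc()ed
+   candidate (which readline frees) or NULL when there are no more. */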
+static char * command_generator(const char * text, int state)
+{
+        static int index,
+                len;
+        char       *name;
+
+        /* Do we have a match table? */
+        if (!match_tbl)
+                return NULL;
+
+        /* If this is the first time called on this word, state is 0 */
+        if (!state) {
+                index = 0;
+                len = (int)strlen(text);
+        }
+
+        /* Return the next name in the command list that partially matches text */
+        while ( (name = (match_tbl + index)->pc_name) ) {
+                index++;
+
+                if (strncasecmp(name, text, len) == 0) {
+                        return(strdup(name));
+                }
+        }
+
+    /* No more matches */
+    return NULL;
+}
+
+/* probably called by readline */
+static char **command_completion(char * text, int start, int end)
+{
+    command_t  * table;
+    char       * pos;
+
+    match_tbl = top_level;
+    for (table = find_cmd(rl_line_buffer, match_tbl, &pos);
+        table;
+        table = find_cmd(pos, match_tbl, &pos)) {
+
+       if (*(pos - 1) == ' ') match_tbl = table->pc_sub_cmd;
+    }
+
+    return(completion_matches(text, command_generator));
+}
+#endif
+
+/* take a string and execute the function or print help */
+int execute_line(char * line)
+{
+        command_t         *cmd, *ambig;
+        char *prev;
+        char *next, *tmp;
+        char *argv[MAXARGS];
+        int         i;
+        int rc = 0;
+
+        switch( process(line, &next, top_level, &cmd, &prev) ) {
+        case CMD_AMBIG:
+                fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line);
+                while( (ambig = find_cmd(prev, cmd, &tmp)) ) {
+                        fprintf(stderr, "%s ", ambig->pc_name);
+                        cmd = ambig + 1;
+                }
+                fprintf(stderr, "\n");
+                break;
+        case CMD_NONE:
+                fprintf(stderr, "No such command, type help\n");
+                break;
+        case CMD_INCOMPLETE:
+                fprintf(stderr,
+                        "'%s' incomplete command.  Use '%s x' where x is one of:\n",
+                        line, line);
+                fprintf(stderr, "\t");
+                for (i = 0; cmd->pc_sub_cmd[i].pc_name; i++) {
+                        fprintf(stderr, "%s ", cmd->pc_sub_cmd[i].pc_name);
+                }
+                fprintf(stderr, "\n");
+                break;
+        case CMD_COMPLETE:
+                i = line2args(line, argv, MAXARGS);
+                rc = (cmd->pc_func)(i, argv);
+
+                if (rc == CMD_HELP)
+                        fprintf(stderr, "%s\n", cmd->pc_help);
+
+                break;
+        }
+
+        return rc;
+}
+
+int
+noop_fn ()
+{
+        return (0);
+}
+
+/* just in case you're ever in an airplane and discover you 
+   forgot to install readline-dev. :) */
+int init_input() 
+{
+        int   interactive = isatty (fileno (stdin));
+
+#ifdef HAVE_LIBREADLINE
+        using_history();
+        stifle_history(HISTORY);
+
+        if (!interactive)
+        {
+                rl_prep_term_function = (rl_vintfunc_t *)noop_fn;
+                rl_deprep_term_function = (rl_voidfunc_t *)noop_fn;
+        }
+
+        rl_attempted_completion_function = (CPPFunction *)command_completion;
+        rl_completion_entry_function = (void *)command_generator;
+#endif 
+        return interactive;
+}
+
+#ifndef HAVE_LIBREADLINE
+#define add_history(s)
+char * readline(char * prompt) 
+{
+        char line[2048];
+        int n = 0;
+        if (prompt)
+                printf ("%s", prompt);
+        if (fgets(line, sizeof(line), stdin) == NULL)
+                return (NULL);
+        n = strlen(line);
+        if (n && line[n-1] == '\n')
+                line[n-1] = '\0';
+        return strdup(line);
+}
+#endif
+
+/* this is the command execution machine */
+int Parser_commands(void)
+{
+        char *line, *s;
+        int rc = 0;
+        int interactive;
+        
+        interactive = init_input();
+
+        while(!done) {
+                line = readline(interactive ? parser_prompt : NULL);
+
+                if (!line) break;
+
+                s = skipwhitespace(line);
+
+                if (*s) {
+                        add_history(s);
+                        rc = execute_line(s);
+                }
+                
+                free(line);
+        }
+        return rc;
+}
+
+
+/* sets the parser prompt */
+void Parser_init(char * prompt, command_t * cmds)
+{
+    done = 0;
+    top_level = cmds;
+    if (parser_prompt) free(parser_prompt);
+    parser_prompt = strdup(prompt);
+}
+
+/* frees the parser prompt */
+void Parser_exit(int argc, char *argv[])
+{
+    done = 1;
+    free(parser_prompt);
+    parser_prompt = NULL;
+}
+
+/* convert a string to an integer */
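+/* Accepts decimal ("17" -> 17), octal ("017" -> 15) and hex ("0x17" -> 23). */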
+int Parser_int(char *s, int *val)
+{
+    int ret;
+
+    if (*s != '0')
+       ret = sscanf(s, "%d", val);
+    else if (*(s+1) != 'x')
+       ret = sscanf(s, "%o", val);
+    else {
+       s++;
+       ret = sscanf(++s, "%x", val);
+    }
+
+    return(ret);
+}
+
+
+void Parser_qhelp(int argc, char *argv[]) {
+
+    printf("Available commands are:\n");
+
+    print_commands(NULL, top_level);
+    printf("For more help type: help command-name\n");
+}
+
+int Parser_help(int argc, char **argv) 
+{
+        char line[1024];
+        char *next, *prev, *tmp;
+        command_t *result, *ambig;
+        int i;
+
+        if ( argc == 1 ) {
+                Parser_qhelp(argc, argv);
+                return 0;
+        }
+
+        line[0]='\0';
+        for ( i = 1 ;  i < argc ; i++ ) {
+                if (i > 1)
+                        strcat(line, " ");
+                strcat(line, argv[i]);
+        }
+
+        switch ( process(line, &next, top_level, &result, &prev) ) {
+        case CMD_COMPLETE:
+                fprintf(stderr, "%s: %s\n",line, result->pc_help);
+                break;
+        case CMD_NONE:
+                fprintf(stderr, "%s: Unknown command.\n", line);
+                break;
+        case CMD_INCOMPLETE:
+                fprintf(stderr,
+                        "'%s' incomplete command.  Use '%s x' where x is one of:\n",
+                        line, line);
+                fprintf(stderr, "\t");
+                for (i = 0; result->pc_sub_cmd[i].pc_name; i++) {
+                        fprintf(stderr, "%s ", result->pc_sub_cmd[i].pc_name);
+                }
+                fprintf(stderr, "\n");
+                break;
+        case CMD_AMBIG:
+                fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line);
+                while( (ambig = find_cmd(prev, result, &tmp)) ) {
+                        fprintf(stderr, "%s ", ambig->pc_name);
+                        result = ambig + 1;
+                }
+                fprintf(stderr, "\n");
+                break;
+        }
+        return 0;
+}  
+
+
+void Parser_printhelp(char *cmd)
+{
+        char *argv[] = { "help", cmd }; 
+        Parser_help(2, argv);
+}
+
+/*************************************************************************
+ * COMMANDS                                                             *
+ *************************************************************************/
+
+
+static void print_commands(char * str, command_t * table) {
+    command_t * cmds;
+    char       buf[80];
+
+    for (cmds = table; cmds->pc_name; cmds++) {
+       if (cmds->pc_func) {
+           if (str) printf("\t%s %s\n", str, cmds->pc_name);
+           else printf("\t%s\n", cmds->pc_name);
+       }
+       if (cmds->pc_sub_cmd) {
+           if (str) {
+               sprintf(buf, "%s %s", str, cmds->pc_name);
+               print_commands(buf, cmds->pc_sub_cmd);
+           } else {
+               print_commands(cmds->pc_name, cmds->pc_sub_cmd);
+           }
+       }
+    }
+}
+
+char *Parser_getstr(const char *prompt, const char *deft, char *res,
+                   size_t len)
+{
+    char *line = NULL;
+    int size = strlen(prompt) + strlen(deft) + 8;
+    char *theprompt;
+    theprompt = malloc(size);
+    assert(theprompt);
+
+    sprintf(theprompt, "%s [%s]: ", prompt, deft);
+
+    line  = readline(theprompt);
+    free(theprompt);
+
+    if ( line == NULL || *line == '\0' ) {
+        strncpy(res, deft, len);
+    } else {
+        strncpy(res, line, len);
+    }
+    if (len > 0)
+        res[len - 1] = '\0';    /* strncpy() does not guarantee NUL termination */
+
+    if ( line ) {
+       free(line);
+       return res;
+    } else {
+       return NULL;
+    }
+}
+
+/* get integer from prompt, loop forever to get it */
+int Parser_getint(const char *prompt, long min, long max, long deft, int base)
+{
+    int rc;
+    long result;
+    char *line;
+    int size = strlen(prompt) + 40;
+    char *theprompt = malloc(size);
+    assert(theprompt);
+    sprintf(theprompt,"%s [%ld, (0x%lx)]: ", prompt, deft, deft);
+
+    fflush(stdout);
+
+    do {
+       line = NULL;
+       line = readline(theprompt);
+       if ( !line ) {
+           fprintf(stdout, "Please enter an integer.\n");
+           fflush(stdout);
+           continue;
+       }
+       if ( *line == '\0' ) {
+           free(line);
+           result =  deft;
+           break;
+       }
+       rc = Parser_arg2int(line, &result, base);
+       free(line);
+       if ( rc != 0 ) {
+           fprintf(stdout, "Invalid string.\n");
+           fflush(stdout);
+       } else if ( result > max || result < min ) {
+           fprintf(stdout, "Error: response must lie between %ld and %ld.\n",
+                   min, max);
+           fflush(stdout);
+       } else {
+           break;
+       }
+    } while ( 1 ) ;
+
+    if (theprompt)
+       free(theprompt);
+    return result;
+
+}
+
+/* get boolean (starting with YyNn); loop forever */
+int Parser_getbool(const char *prompt, int deft)
+{
+    int result = 0;
+    char *line;
+    int size = strlen(prompt) + 8;
+    char *theprompt = malloc(size);
+    assert(theprompt);
+
+    fflush(stdout);
+
+    if ( deft != 0 && deft != 1 ) {
+       fprintf(stderr, "Error: Parser_getbool given bad default (%d).\n",
+               deft);
+       assert ( 0 );
+    }
+    sprintf(theprompt, "%s [%s]: ", prompt, (deft==0)? "N" : "Y");
+
+    do {
+       line = NULL;
+       line = readline(theprompt);
+       if ( line == NULL ) {
+           result = deft;
+           break;
+       }
+       if ( *line == '\0' ) {
+           result = deft;
+           break;
+       }
+       if ( *line == 'y' || *line == 'Y' ) {
+           result = 1;
+           break;
+       }
+       if ( *line == 'n' || *line == 'N' ) {
+           result = 0;
+           break;
+       }
+       if ( line )
+           free(line);
+       fprintf(stdout, "Invalid string. Must start with yY or nN\n");
+       fflush(stdout);
+    } while ( 1 );
+
+    if ( line )
+       free(line);
+    if ( theprompt )
+       free(theprompt);
+    return result;
+}
+
+/* parse int out of a string or prompt for it */
+long Parser_intarg(const char *inp, const char *prompt, int deft,
+                 int min, int max, int base)
+{
+    long result;
+    int rc;
+
+    rc = Parser_arg2int(inp, &result, base);
+
+    if ( rc == 0 ) {
+       return result;
+    } else {
+        return Parser_getint(prompt, min, max, deft, base);
+    }
+}
+
+/* parse a string argument out of the input or prompt for it */
+char *Parser_strarg(char *inp, const char *prompt, const char *deft,
+                   char *answer, int len)
+{
+    if ( inp == NULL || *inp == '\0' ) {
+       return Parser_getstr(prompt, deft, answer, len);
+    } else
+       return inp;
+}
+
+/* change a string into a number: return 0 on success. No invalid characters
+   allowed. The processing of base and validity follows strtol(3)*/
+int Parser_arg2int(const char *inp, long *result, int base)
+{
+    char *endptr;
+
+    if ( (base !=0) && (base < 2 || base > 36) )
+       return 1;
+
+    *result = strtol(inp, &endptr, base);
+
+        if ( *inp != '\0' && *endptr == '\0' )
+                return 0;
+        else 
+                return 1;
+}
+
+/* Convert a human-readable size string to an int; "1k" -> 1024 */
+int Parser_size (int *sizep, char *str) {
+        int size;
+        char mod[32];
+
+        switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) {
+        default:
+                return (-1);
+
+        case 1:
+                *sizep = size;
+                return (0);
+
+        case 2:
+                switch (*mod) {
+                case 'g':
+                case 'G':
+                        *sizep = size << 30;
+                        return (0);
+
+                case 'm':
+                case 'M':
+                        *sizep = size << 20;
+                        return (0);
+
+                case 'k':
+                case 'K':
+                        *sizep = size << 10;
+                        return (0);
+
+                default:
+                        *sizep = size;
+                        return (0);
+                }
+        }
+}
+
+/* Convert a string boolean to an int; "enable" -> 1 */
+int Parser_bool (int *b, char *str) {
+        if (!strcasecmp (str, "no") ||
+            !strcasecmp (str, "n") ||
+            !strcasecmp (str, "off") ||
+            !strcasecmp (str, "disable"))
+        {
+                *b = 0;
+                return (0);
+        }
+        
+        if (!strcasecmp (str, "yes") ||
+            !strcasecmp (str, "y") ||
+            !strcasecmp (str, "on") ||
+            !strcasecmp (str, "enable"))
+        {
+                *b = 1;
+                return (0);
+        }
+        
+        return (-1);
+}
+
+int Parser_quit(int argc, char **argv)
+{
+        argc = argc;
+        argv = argv;
+        done = 1;
+        return 0;
+}
diff --git a/lnet/utils/parser.h b/lnet/utils/parser.h
new file mode 100644 (file)
index 0000000..dead9f5
--- /dev/null
@@ -0,0 +1,73 @@
+#ifndef _PARSER_H_
+#define _PARSER_H_
+
+#define HISTORY        100             /* Don't let history grow unbounded    */
+#define MAXARGS 100
+
+#define CMD_COMPLETE   0
+#define CMD_INCOMPLETE 1
+#define CMD_NONE       2
+#define CMD_AMBIG      3
+#define CMD_HELP       4
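+
+/* Lookup results: CMD_COMPLETE means a handler was found, CMD_INCOMPLETE
+   means more sub-command words are needed, CMD_NONE means no match, and
+   CMD_AMBIG means several commands share the typed prefix.  A handler may
+   return CMD_HELP to have its help text printed. */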
+
+typedef struct parser_cmd {
+       char    *pc_name;
+       int     (* pc_func)(int, char **);
+       struct parser_cmd * pc_sub_cmd;
+       char *pc_help;
+} command_t;
+
+typedef struct argcmd {
+       char    *ac_name;
+       int      (*ac_func)(int, char **);
+       char     *ac_help;
+} argcmd_t;
+
+typedef struct network {
+       char    *type;
+       char    *server;
+       int     port;
+} network_t;
+
+int  Parser_quit(int argc, char **argv);
+void Parser_init(char *, command_t *); /* Set prompt and load command list */
+int Parser_commands(void);                     /* Start the command parser */
+void Parser_qhelp(int, char **);       /* Quick help routine */
+int Parser_help(int, char **);         /* Detailed help routine */
+void Parser_printhelp(char *);         /* Detailed help routine */
+void Parser_exit(int, char **);                /* Shuts down command parser */
+int Parser_execarg(int argc, char **argv, command_t cmds[]);
+int execute_line(char * line);
+
+/* Converts a string to an integer */
+int Parser_int(char *, int *);
+
+/* Prompts for a string, with default values and a maximum length */
+char *Parser_getstr(const char *prompt, const char *deft, char *res, 
+                   size_t len);
+
+/* Prompts for an integer, with minimum, maximum and default values and base */
+int Parser_getint(const char *prompt, long min, long max, long deft,
+                 int base);
+
+/* Prompts for a yes/no, with default */
+int Parser_getbool(const char *prompt, int deft);
+
+/* Extracts an integer from a string, or prompts if it cannot get one */
+long Parser_intarg(const char *inp, const char *prompt, int deft,
+                  int min, int max, int base);
+
+/* Extracts a word from the input, or prompts if it cannot get one */
+char *Parser_strarg(char *inp, const char *prompt, const char *deft,
+                   char *answer, int len);
+
+/* Extracts an integer from a string  with a base */
+int Parser_arg2int(const char *inp, long *result, int base);
+
+/* Convert a human-readable size string to an int; "1k" -> 1024 */
+int Parser_size(int *sizep, char *str);
+
+/* Convert a string boolean to an int; "enable" -> 1 */
+int Parser_bool(int *b, char *str);
+
+#endif
diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c
new file mode 100644 (file)
index 0000000..90d66f5
--- /dev/null
@@ -0,0 +1,985 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <time.h>
+#include <asm/byteorder.h>
+
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+#include <portals/list.h>
+#include <portals/lib-types.h>
+#include "parser.h"
+
+unsigned int portal_debug;
+unsigned int portal_printk;
+unsigned int portal_stack;
+
+
+static ptl_nid_t g_nid = 0;
+static unsigned int g_nal = 0;
+static unsigned short g_port = 0;
+
+static int g_socket_txmem = 0;
+static int g_socket_rxmem = 0;
+static int g_socket_nonagle = 1;
+
+typedef struct
+{
+        char *name;
+        int   num;
+} name2num_t;
+
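+/* Map the NAL names accepted on the command line to their numeric ids. */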
+static name2num_t nalnames[] = {
+        {"tcp",                SOCKNAL},
+        {"toe",                TOENAL},
+        {"elan",       QSWNAL},
+        {"gm",         GMNAL},
+        {"scimac",      SCIMACNAL},
+        {NULL,         -1}
+};
+
+static name2num_t *
+name2num_lookup_name (name2num_t *table, char *str)
+{
+        while (table->name != NULL)
+                if (!strcmp (str, table->name))
+                        return (table);
+                else
+                        table++;
+        return (NULL);
+}
+
+static name2num_t *
+name2num_lookup_num (name2num_t *table, int num)
+{
+        while (table->name != NULL)
+                if (num == table->num)
+                        return (table);
+                else
+                        table++;
+        return (NULL);
+}
+
+int
+ptl_name2nal (char *str)
+{
+        name2num_t *e = name2num_lookup_name (nalnames, str);
+
+        return ((e == NULL) ? 0 : e->num);
+}
+
+static char *
+nal2name (int nal)
+{
+        name2num_t *e = name2num_lookup_num (nalnames, nal);
+
+        return ((e == NULL) ? "???" : e->name);
+}
+
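+/* Parse a NID given as a dotted-quad IP address, a resolvable hostname,
+ * or a plain decimal/hex number.  Returns 0 on success, -1 on failure. */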
+int
+ptl_parse_nid (ptl_nid_t *nidp, char *str)
+{
+        struct hostent *he;
+        int             a;
+        int             b;
+        int             c;
+        int             d;
+        
+        if (sscanf (str, "%d.%d.%d.%d", &a, &b, &c, &d) == 4 &&
+            (a & ~0xff) == 0 && (b & ~0xff) == 0 &&
+            (c & ~0xff) == 0 && (d & ~0xff) == 0)
+        {
+                __u32 addr = (a<<24)|(b<<16)|(c<<8)|d;
+
+                *nidp = (ptl_nid_t)addr;
+                return (0);
+        }
+        
+        if ((('a' <= str[0] && str[0] <= 'z') ||
+             ('A' <= str[0] && str[0] <= 'Z')) &&
+             (he = gethostbyname (str)) != NULL)
+        {
+                __u32 addr = *(__u32 *)he->h_addr;
+
+                *nidp = (ptl_nid_t)ntohl(addr);  /* HOST byte order */
+                return (0);
+        }
+
+        if (sscanf (str, "%i", &a) == 1)
+        {
+                *nidp = (ptl_nid_t)a;
+                return (0);
+        }
+
+        if (sscanf (str, "%x", &a) == 1)
+        {
+                *nidp = (ptl_nid_t) a;
+                return (0);
+        }
+
+        return (-1);
+}
+
+char *
+ptl_nid2str (char *buffer, ptl_nid_t nid)
+{
+        __u32           addr = htonl((__u32)nid); /* back to NETWORK byte order */
+        struct hostent *he = gethostbyaddr ((const char *)&addr, sizeof (addr), AF_INET);
+
+        if (he != NULL)
+                strcpy (buffer, he->h_name);
+        else
+                sprintf (buffer, "0x"LPX64, nid);
+        
+        return (buffer);
+}
+
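+/* Write exactly 'nob' bytes to 'cfd', retrying on EINTR and on short
+ * writes; returns 0 on success or the failing write()'s return value. */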
+int
+sock_write (int cfd, void *buffer, int nob)
+{
+        while (nob > 0)
+        {
+                int rc = write (cfd, buffer, nob);
+
+                if (rc < 0)
+                {
+                        if (errno == EINTR)
+                                continue;
+                        
+                        return (rc);
+                }
+
+                if (rc == 0)
+                {
+                        fprintf (stderr, "Unexpected zero sock_write\n");
+                        abort();
+                }
+
+                nob -= rc;
+                buffer = (char *)buffer + rc;
+        }
+        
+        return (0);
+}
+
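+/* Read exactly 'nob' bytes from 'cfd', retrying on EINTR; EOF before all
+ * bytes arrive is reported as -1 with errno set to ECONNABORTED. */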
+int
+sock_read (int cfd, void *buffer, int nob)
+{
+        while (nob > 0)
+        {
+                int rc = read (cfd, buffer, nob);
+                
+                if (rc < 0)
+                {
+                        if (errno == EINTR)
+                                continue;
+                        
+                        return (rc);
+                }
+                
+                if (rc == 0)                    /* EOF */
+                {
+                        errno = ECONNABORTED;
+                        return (-1);
+                }
+                
+                nob -= rc;
+                buffer = (char *)buffer + rc;
+        }
+        
+        return (0);
+}
+
+int ptl_initialize(int argc, char **argv) 
+{
+        register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH);
+        return 0;
+}
+
+
+int jt_ptl_network(int argc, char **argv)
+{
+        int  nal;
+        
+        if (argc != 2 ||
+            (nal = ptl_name2nal (argv[1])) == 0)
+        {
+                name2num_t *entry;
+                
+                fprintf(stderr, "usage: %s ", argv[0]);
+                for (entry = nalnames; entry->name != NULL; entry++)
+                        fprintf (stderr, "%s%s", entry == nalnames ? "<" : "|", entry->name);
+                fprintf(stderr, ">\n");
+        }
+        else
+                g_nal = nal;
+
+        return (0);
+}
+
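+/* Exchange HELLO headers with the peer over a freshly connected socket:
+ * send our own header, then read back and validate the peer's magic,
+ * protocol version and message type before returning its NID. */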
+int
+exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid)
+{
+        int                      rc;
+        ptl_hdr_t                hdr;
+        ptl_magicversion_t      *hmv = (ptl_magicversion_t *)&hdr.dest_nid;
+
+        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
+
+        memset (&hdr, 0, sizeof (hdr));
+        
+        hmv->magic          = __cpu_to_le32 (PORTALS_PROTO_MAGIC);
+        hmv->version_major  = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR);
+        hmv->version_minor  = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR);
+
+        hdr.src_nid = __cpu_to_le64 (my_nid);
+        hdr.type = __cpu_to_le32 (PTL_MSG_HELLO);
+        
+        /* Assume there's sufficient socket buffering for a portals HELLO header */
+        rc = sock_write (cfd, &hdr, sizeof (hdr));
+        if (rc != 0) {
+                perror ("Can't send initial HELLO");
+                return (-1);
+        }
+
+        /* First few bytes down the wire are the portals protocol magic and
+         * version, no matter what protocol version we're running. */
+
+        rc = sock_read (cfd, hmv, sizeof (*hmv));
+        if (rc != 0) {
+                perror ("Can't read from peer");
+                return (-1);
+        }
+
+        if (__cpu_to_le32 (hmv->magic) != PORTALS_PROTO_MAGIC) {
+                fprintf (stderr, "Bad magic %#08x (%#08x expected)\n", 
+                         __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC);
+                return (-1);
+        }
+
+        if (__cpu_to_le16 (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR ||
+            __cpu_to_le16 (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) {
+                fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n",
+                         __cpu_to_le16 (hmv->version_major),
+                         __cpu_to_le16 (hmv->version_minor),
+                         PORTALS_PROTO_VERSION_MAJOR,
+                         PORTALS_PROTO_VERSION_MINOR);
+                return (-1);
+        }
+
+        /* version 0 sends magic/version as the dest_nid of a 'hello' header,
+         * so read the rest of it in now... */
+        LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0);
+        rc = sock_read (cfd, hmv + 1, sizeof (hdr) - sizeof (*hmv));
+        if (rc != 0) {
+                perror ("Can't read rest of HELLO hdr");
+                return (-1);
+        }
+
+        /* ...and check we got what we expected */
+        if (__cpu_to_le32 (hdr.type) != PTL_MSG_HELLO ||
+            __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)) != 0) {
+                fprintf (stderr, "Expecting a HELLO hdr with 0 payload,"
+                         " but got type %d with %d payload\n",
+                         __cpu_to_le32 (hdr.type),
+                         __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)));
+                return (-1);
+        }
+        
+        *peer_nid = __le64_to_cpu (hdr.src_nid);
+        return (0);
+}
+
+int jt_ptl_connect(int argc, char **argv)
+{
+        if (argc < 2) {
+        usage:
+                fprintf(stderr, "usage: %s <hostname port [xi]> or <elan ID>\n",
+                        argv[0]);
+                return 0;
+        }
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+        if (g_nal == SOCKNAL || g_nal == TOENAL) {
+                ptl_nid_t peer_nid;
+                struct hostent *he;
+                struct portal_ioctl_data data;
+                struct sockaddr_in srvaddr;
+                char *flag;
+                int fd, rc;
+                int nonagle = 0;
+                int rxmem = 0;
+                int txmem = 0;
+                int bind_irq = 0;
+                int xchange_nids = 0;
+                int o;
+                int olen;
+                
+                if (argc < 3) {
+                        goto usage;
+                }
+
+                he = gethostbyname(argv[1]);
+                if (!he) {
+                        fprintf(stderr, "gethostbyname error: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+
+                g_port = atol(argv[2]);
+
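+                /* The optional third argument is a string of single-letter
+                 * flags: 'i' sets bind_irq (passed to the kernel in
+                 * ioc_flags) and 'x' exchanges NIDs with the peer before
+                 * the socket is registered. */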
+                if (argc > 3)
+                        for (flag = argv[3]; *flag != 0; flag++)
+                                switch (*flag)
+                                {
+                                case 'i':
+                                        bind_irq = 1;
+                                        break;
+                                        
+                                case 'x':
+                                        xchange_nids = 1;
+                                        break;
+
+                                default:
+                                        fprintf (stderr, "unrecognised flag '%c'\n",
+                                                 *flag);
+                                        return (-1);
+                                }
+                
+                memset(&srvaddr, 0, sizeof(srvaddr));
+                srvaddr.sin_family = AF_INET;
+                srvaddr.sin_port = htons(g_port);
+                srvaddr.sin_addr.s_addr = *(__u32 *)he->h_addr;
+        
+                fd = socket(PF_INET, SOCK_STREAM, 0);
+                if ( fd < 0 ) {
+                        fprintf(stderr, "socket() failed: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+
+                if (g_socket_nonagle)
+                {
+                        o = 1;
+                        if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)) != 0)
+                        { 
+                                fprintf(stderr, "cannot disable nagle: %s\n", strerror(errno));
+                                return (-1);
+                        }
+                }
+
+                if (g_socket_rxmem != 0)
+                {
+                        o = g_socket_rxmem;
+                        if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &o, sizeof (o)) != 0)
+                        { 
+                                fprintf(stderr, "cannot set receive buffer size: %s\n", strerror(errno));
+                                return (-1);
+                        }
+                }
+
+                if (g_socket_txmem != 0)
+                {
+                        o = g_socket_txmem;
+                        if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &o, sizeof (o)) != 0)
+                        { 
+                                fprintf(stderr, "cannot set send buffer size: %s\n", strerror(errno));
+                                return (-1);
+                        }
+                }
+
+                rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
+                if ( rc == -1 ) { 
+                        fprintf(stderr, "connect() failed: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+
+                olen = sizeof (txmem);
+                if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &olen) != 0)
+                        fprintf (stderr, "Can't get send buffer size: %s\n", strerror (errno));
+                olen = sizeof (rxmem);
+                if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &olen) != 0)
+                        fprintf (stderr, "Can't get receive buffer size: %s\n", strerror (errno));
+                olen = sizeof (nonagle);
+                if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &olen) != 0)
+                        fprintf (stderr, "Can't get nagle: %s\n", strerror (errno));
+
+                if (xchange_nids) {
+                        
+                        PORTAL_IOC_INIT (data);
+                        data.ioc_nal = g_nal;
+                        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data);
+                        if (rc != 0)
+                        {
+                                fprintf (stderr, "failed to get my nid: %s\n",
+                                         strerror (errno));
+                                close (fd);
+                                return (-1);
+                        }
+                        
+                        rc = exchange_nids (fd, data.ioc_nid, &peer_nid);
+                        if (rc != 0)
+                        {
+                                close (fd);
+                                return (-1);
+                        }
+                }
+                else
+                        peer_nid = ntohl (srvaddr.sin_addr.s_addr); /* HOST byte order */
+
+                printf("Connected host: %s NID "LPX64" snd: %d rcv: %d nagle: %s\n", argv[1],
+                       peer_nid, txmem, rxmem, nonagle ? "Disabled" : "Enabled");
+
+                PORTAL_IOC_INIT(data);
+                data.ioc_fd = fd;
+                data.ioc_nal = g_nal;
+                data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD;
+                data.ioc_nid = peer_nid;
+                data.ioc_flags = bind_irq;
+                
+                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+                if (rc) {
+                        fprintf(stderr, "failed to register fd with portals: "
+                                "%s\n", strerror(errno));
+                        close (fd);
+                        return -1;
+                }
+
+                g_nid = peer_nid;
+                printf("Connection to "LPX64" registered with socknal\n", g_nid);
+
+                rc = close(fd);
+                if (rc) {
+                        fprintf(stderr, "close failed: %d\n", rc);
+                }
+        } else if (g_nal == QSWNAL) {
+                g_nid = atoi(argv[1]);
+        } else if (g_nal == GMNAL) {
+                g_nid = atoi(argv[1]);
+        } else if (g_nal == SCIMACNAL) {
+                unsigned int    tmpnid;
+                if(sscanf(argv[1], "%x", &tmpnid) == 1) {
+                        g_nid=tmpnid;
+                }
+                else {
+                        fprintf(stderr, "nid %s invalid for SCI nal\n", argv[1]);
+                }
+
+
+        } else {
+                fprintf(stderr, "This should never happen.  Also it is very "
+                        "bad.\n");
+        }
+
+        return 0;
+}
+
+int jt_ptl_disconnect(int argc, char **argv)
+{
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [hostname]\n", argv[0]);
+                return 0;
+        }
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+        if (g_nal == SOCKNAL || g_nal == TOENAL) {
+                struct hostent *he;
+                struct portal_ioctl_data data;
+                int rc;
+
+                PORTAL_IOC_INIT(data);
+                if (argc == 2) {
+                        he = gethostbyname(argv[1]);
+                        if (!he) {
+                                fprintf(stderr, "gethostbyname error: %s\n",
+                                        strerror(errno));
+                                return -1;
+                        }
+                        
+                        data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */
+
+                } else {
+                        printf("Disconnecting ALL connections.\n");
+                        /* leave ioc_nid zeroed == disconnect all */
+                }
+                data.ioc_nal = g_nal;
+                data.ioc_nal_cmd = NAL_CMD_CLOSE_CONNECTION;
+                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+                if (rc) {
+                        fprintf(stderr, "failed to remove connection: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+        } else if (g_nal == QSWNAL) {
+                printf("'disconnect' doesn't make any sense for "
+                        "elan.\n");
+        } else if (g_nal == GMNAL) {
+                printf("'disconnect' doesn't make any sense for "
+                        "GM.\n");
+        } else if (g_nal == SCIMACNAL) {
+                printf("'disconnect' doesn't make any sense for "
+                        "SCI.\n");
+        } else {
+                fprintf(stderr, "This should never happen.  Also it is very "
+                        "bad.\n");
+                return -1;
+        }
+
+        return 0;
+}
+
+int jt_ptl_push_connection (int argc, char **argv)
+{
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [hostname]\n", argv[0]);
+                return 0;
+        }
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+        if (g_nal == SOCKNAL || g_nal == TOENAL) {
+                struct hostent *he;
+                struct portal_ioctl_data data;
+                int rc;
+
+                PORTAL_IOC_INIT(data);
+                if (argc == 2) {
+                        he = gethostbyname(argv[1]);
+                        if (!he) {
+                                fprintf(stderr, "gethostbyname error: %s\n",
+                                        strerror(errno));
+                                return -1;
+                        }
+                        
+                        data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */
+
+                } else {
+                        printf("Pushing ALL connections.\n");
+                        /* leave ioc_nid zeroed == disconnect all */
+                }
+                data.ioc_nal = g_nal;
+                data.ioc_nal_cmd = NAL_CMD_PUSH_CONNECTION;
+                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+                if (rc) {
+                        fprintf(stderr, "failed to push connection: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+        } else if (g_nal == QSWNAL) {
+                printf("'push' doesn't make any sense for elan.\n");
+        } else if (g_nal == GMNAL) {
+                printf("'push' doesn't make any sense for GM.\n");
+        } else if (g_nal == SCIMACNAL) {
+                printf("'push' doesn't make any sense for SCI.\n");
+        } else {
+                fprintf(stderr, "This should never happen.  Also it is very "
+                        "bad.\n");
+                return -1;
+        }
+
+        return 0;
+}
+
+int jt_ptl_ping(int argc, char **argv)
+{
+        int       rc;
+        ptl_nid_t nid;
+        long      count   = 1;
+        long      size    = 4;
+        long      timeout = 1;
+        struct portal_ioctl_data data;
+
+        if (argc < 2) {
+                fprintf(stderr, "usage: %s nid [count] [size] [timeout (secs)]\n", argv[0]);
+                return 0;
+        }
+
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+
+        if (ptl_parse_nid (&nid, argv[1]) != 0)
+        {
+                fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]);
+                return (-1);
+        }
+        
+        if (argc > 2)
+        {
+                count = atol(argv[2]);
+
+                if (count < 0 || count > 20000) 
+                {
+                        fprintf(stderr, "are you insane?  %ld is a crazy count.\n", count);
+                        return -1;
+                }
+        }
+        
+        if (argc > 3)
+                size= atol(argv[3]);
+
+        if (argc > 4)
+                timeout = atol (argv[4]);
+        
+        PORTAL_IOC_INIT (data);
+        data.ioc_count   = count;
+        data.ioc_size    = size;
+        data.ioc_nid     = nid;
+        data.ioc_nal     = g_nal;
+        data.ioc_timeout = timeout;
+        
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PING, &data);
+        if (rc) {
+                fprintf(stderr, "failed to start pinger: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+        return 0;
+}
+
+int jt_ptl_shownid(int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        int                      rc;
+        
+        if (argc > 1) {
+                fprintf(stderr, "usage: %s\n", argv[0]);
+                return 0;
+        }
+        
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command first\n");
+                return -1;
+        }
+        
+        PORTAL_IOC_INIT (data);
+        data.ioc_nal = g_nal;
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data);
+        if (rc < 0)
+                fprintf(stderr, "getting my NID failed: %s\n",
+                        strerror (errno));
+        else
+                printf(LPX64"\n", data.ioc_nid);
+        return 0;
+}
+
+int jt_ptl_mynid(int argc, char **argv)
+{
+        int rc;
+        char hostname[1024];
+        char *nidstr;
+        struct portal_ioctl_data data;
+        ptl_nid_t mynid;
+        
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [NID]\n", argv[0]);
+                fprintf(stderr, "NID defaults to the primary IP address of the machine.\n");
+                return 0;
+        }
+
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+
+        if (argc >= 2)
+                nidstr = argv[1];
+        else if (gethostname(hostname, sizeof(hostname)) != 0) {
+                fprintf(stderr, "gethostname failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+        else
+                nidstr = hostname;
+
+        rc = ptl_parse_nid (&mynid, nidstr);
+        if (rc != 0) {
+                fprintf (stderr, "Can't convert '%s' into a NID\n", nidstr);
+                return -1;
+        }
+        
+        PORTAL_IOC_INIT(data);
+        data.ioc_nid = mynid;
+        data.ioc_nal = g_nal;
+        data.ioc_nal_cmd = NAL_CMD_REGISTER_MYNID;
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+        if (rc < 0)
+                fprintf(stderr, "setting my NID failed: %s\n",
+                       strerror(errno));
+        else
+                printf("registered my nid "LPX64" (%s)\n", mynid, nidstr);
+        return 0;
+}
+
+int
+jt_ptl_fail_nid (int argc, char **argv)
+{
+        int                      rc;
+        ptl_nid_t                nid;
+        unsigned int             threshold;
+        struct portal_ioctl_data data;
+
+        if (argc < 2 || argc > 3)
+        {
+                fprintf (stderr, "usage: %s nid|\"_all_\" [count (0 == mend)]\n", argv[0]);
+                return (0);
+        }
+        
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return (-1);
+        }
+
+        if (!strcmp (argv[1], "_all_"))
+                nid = PTL_NID_ANY;
+        else if (ptl_parse_nid (&nid, argv[1]) != 0)
+        {
+                fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]);
+                return (-1);
+        }
+
+        if (argc < 3)
+                threshold = PTL_MD_THRESH_INF;
+        else if (sscanf (argv[2], "%i", &threshold) != 1) {
+                fprintf (stderr, "Can't parse count \"%s\"\n", argv[2]);
+                return (-1);
+        }
+        
+        PORTAL_IOC_INIT (data);
+        data.ioc_nal = g_nal;
+        data.ioc_nid = nid;
+        data.ioc_count = threshold;
+        
+        rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_FAIL_NID, &data);
+        if (rc < 0)
+                fprintf (stderr, "IOC_PORTAL_FAIL_NID failed: %s\n",
+                         strerror (errno));
+        else
+                printf ("%s %s\n", threshold == 0 ? "Unfailing" : "Failing", argv[1]);
+        
+        return (0);
+}
+
+int
+jt_ptl_rxmem (int argc, char **argv)
+{
+        int   size;
+        
+        if (argc > 1)
+        {
+                if (Parser_size (&size, argv[1]) != 0 || size < 0)
+                {
+                        fprintf (stderr, "Can't parse size %s\n", argv[1]);
+                        return (0);
+                }
+
+                g_socket_rxmem = size;
+        }
+        printf ("Socket rmem = %d\n", g_socket_rxmem);        
+        return (0);
+}
+
+int
+jt_ptl_txmem (int argc, char **argv)
+{
+        int   size;
+        
+        if (argc > 1)
+        {
+                if (Parser_size (&size, argv[1]) != 0 || size < 0)
+                {
+                        fprintf (stderr, "Can't parse size %s\n", argv[1]);
+                        return (0);
+                }
+                g_socket_txmem = size;
+        }
+        printf ("Socket txmem = %d\n", g_socket_txmem);
+        return (0);
+}
+
+int
+jt_ptl_nagle (int argc, char **argv)
+{
+        int enable;
+
+        if (argc > 1)
+        {
+                if (Parser_bool (&enable, argv[1]) != 0)
+                {
+                        fprintf (stderr, "Can't parse boolean %s\n", argv[1]);
+                        return (0);
+                }
+                g_socket_nonagle = !enable;
+        }
+        printf ("Nagle %s\n", g_socket_nonagle ? "disabled" : "enabled");
+        return (0);
+}
+
+int
+jt_ptl_add_route (int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        ptl_nid_t                nid1;
+        ptl_nid_t                nid2;
+        ptl_nid_t                gateway_nid;
+        int                      rc;
+        
+        if (argc < 3)
+        {
+                fprintf (stderr, "usage: %s gateway target [target]\n", argv[0]);
+                return (0);
+        }
+
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return (-1);
+        }
+
+        if (ptl_parse_nid (&gateway_nid, argv[1]) != 0)
+        {
+                fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]);
+                return (-1);
+        }
+
+        if (ptl_parse_nid (&nid1, argv[2]) != 0)
+        {
+                fprintf (stderr, "Can't parse first target NID \"%s\"\n", argv[2]);
+                return (-1);
+        }
+
+        if (argc < 4)
+                nid2 = nid1;
+        else if (ptl_parse_nid (&nid2, argv[3]) != 0)
+        {
+                fprintf (stderr, "Can't parse second target NID \"%s\"\n", argv[3]);
+                return (-1);
+        }
+
+        PORTAL_IOC_INIT(data);
+        data.ioc_nid = gateway_nid;
+        data.ioc_nal = g_nal;
+        data.ioc_nid2 = MIN (nid1, nid2);
+        data.ioc_nid3 = MAX (nid1, nid2);
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_ADD_ROUTE, &data);
+        if (rc != 0) 
+        {
+                fprintf (stderr, "IOC_PORTAL_ADD_ROUTE failed: %s\n", strerror (errno));
+                return (-1);
+        }
+        
+        return (0);
+}
+
+int
+jt_ptl_del_route (int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        ptl_nid_t                nid;
+        int                      rc;
+        
+        if (argc < 2)
+        {
+                fprintf (stderr, "usage: %s targetNID\n", argv[0]);
+                return (0);
+        }
+
+        if (ptl_parse_nid (&nid, argv[1]) != 0)
+        {
+                fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]);
+                return (-1);
+        }
+
+        PORTAL_IOC_INIT(data);
+        data.ioc_nid = nid;
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_DEL_ROUTE, &data);
+        if (rc != 0) 
+        {
+                fprintf (stderr, "IOC_PORTAL_DEL_ROUTE ("LPX64") failed: %s\n", nid, strerror (errno));
+                return (-1);
+        }
+        
+        return (0);
+}
+
+int
+jt_ptl_print_routes (int argc, char **argv)
+{
+        char                      buffer[3][128];
+        struct portal_ioctl_data  data;
+        int                       rc;
+        int                       index;
+        int                      gateway_nal;
+        ptl_nid_t                gateway_nid;
+        ptl_nid_t                nid1;
+        ptl_nid_t                nid2;
+        
+        
+        for (index = 0;;index++)
+        {
+                PORTAL_IOC_INIT(data);
+                data.ioc_count = index;
+                
+                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_ROUTE, &data);
+                if (rc != 0)
+                        break;
+
+                gateway_nal = data.ioc_nal;
+                gateway_nid = data.ioc_nid;
+                nid1 = data.ioc_nid2;
+                nid2 = data.ioc_nid3;
+                
+                printf ("%8s %18s : %s - %s\n", 
+                        nal2name (gateway_nal), 
+                        ptl_nid2str (buffer[0], gateway_nid),
+                        ptl_nid2str (buffer[1], nid1),
+                        ptl_nid2str (buffer[2], nid2));
+        }
+        return (0);
+}
+
diff --git a/lnet/utils/ptlctl.c b/lnet/utils/ptlctl.c
new file mode 100644 (file)
index 0000000..8c56d93
--- /dev/null
@@ -0,0 +1,65 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+
+#include "parser.h"
+
+
+command_t list[] = {
+        {"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"},
+        {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: <hostname port> | <id> for tcp/elan respectively)"},
+        {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [hostname])"},
+        {"push", jt_ptl_push_connection, 0, "flush connection to a remote nid (args: [hostname])"},
+        {"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"},
+        {"shownid", jt_ptl_shownid, 0, "print the local NID"},
+        {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"},
+        {"add_route", jt_ptl_add_route, 0, "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"},
+        {"del_route", jt_ptl_del_route, 0, "delete an entry from the routing table (args: targetNID)"},
+        {"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"},
+        {"recv_mem", jt_ptl_rxmem, 0, "Set socket receive buffer size (args: [size])"},
+        {"send_mem", jt_ptl_txmem, 0, "Set socket send buffer size (args: [size])"},
+        {"nagle", jt_ptl_nagle, 0, "Enable/Disable Nagle (args: [on/off])"},
+        {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"},
+        {"fail", jt_ptl_fail_nid, 0, "usage: fail nid|_all_ [count]"},
+        {"help", Parser_help, 0, "help"},
+        {"exit", Parser_quit, 0, "quit"},
+        {"quit", Parser_quit, 0, "quit"},
+        { 0, 0, 0, NULL }
+};
+
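+/* With command-line arguments a single command is executed directly
+ * (e.g. "ptlctl network tcp"); with none, an interactive "ptlctl > "
+ * shell is started. */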
+int main(int argc, char **argv)
+{
+        if (ptl_initialize(argc, argv) < 0)
+                exit(1);
+
+        Parser_init("ptlctl > ", list);
+        if (argc > 1)
+                return Parser_execarg(argc - 1, &argv[1], list);
+
+        Parser_commands();
+
+        return 0;
+}
diff --git a/lnet/utils/routerstat.c b/lnet/utils/routerstat.c
new file mode 100644 (file)
index 0000000..37da12c
--- /dev/null
@@ -0,0 +1,99 @@
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
+double
+timenow ()
+{
+   struct timeval tv;
+   
+   gettimeofday (&tv, NULL);
+   return (tv.tv_sec + tv.tv_usec / 1000000.0);
+}
+
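+/* Sample the router statfile: the first call prints running totals and
+ * later calls print per-interval rates.  The file is expected to hold
+ * "bytes packets errors [depth]"; the trailing write() presumably asks
+ * the router to start a new accounting interval. */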
+void
+do_stat (int fd)
+{
+   static char  buffer[1024];
+   static double last = 0.0;
+   double now;
+   double t;
+   long long bytes;
+   long      packets;
+   long      errors;
+   long      depth;
+   int    n;
+   
+   lseek (fd, 0, SEEK_SET);
+   now = timenow();
+   n = read (fd, buffer, sizeof (buffer) - 1);
+   if (n < 0)
+   {
+      fprintf (stderr, "Can't read statfile\n");
+      exit (1);
+   }    
+   buffer[n] = 0;
+   
+   n = sscanf (buffer, "%Ld %ld %ld %ld", &bytes, &packets, &errors, &depth);
+   
+   if (n < 3)
+   {
+      fprintf (stderr, "Can't parse statfile\n");
+      exit (1);
+   }
+   
+   if (last == 0.0)
+      printf ("%Ld bytes, %ld packets (sz %Ld) %ld errors", 
+             bytes, packets, (long long)((packets == 0) ? 0LL : bytes/packets), errors);
+   else
+   {
+      t = now - last;
+
+      printf ("%9Ld (%7.2fMb/s), %7ld packets (sz %5Ld, %5ld/s) %ld errors (%ld/s)", 
+             bytes, ((double)bytes)/((1<<20) * t),
+             packets, (long long)((packets == 0) ? 0LL : bytes/packets), (long)(packets/t),
+             errors, (long)(errors/t));
+   }
+
+   if (n == 4)
+      printf (" (%ld)\n", depth);
+   else
+      printf ("\n");
+
+   fflush (stdout);
+   
+   lseek (fd, 0, SEEK_SET);
+   write (fd, "\n", 1);
+   last = timenow();
+}
+
+int main (int argc, char **argv)
+{
+   int  interval = 0;
+   int  fd;
+   
+   if (argc > 1)
+      interval = atoi (argv[1]);
+
+   fd = open ("/proc/sys/portals/router", O_RDWR);
+   if (fd < 0)
+   {
+      fprintf (stderr, "Can't open stat: %s\n", strerror (errno));
+      return (1);
+   }
+   
+   do_stat (fd);
+   if (interval == 0)
+      return (0);
+   
+   for (;;)
+   {
+      sleep (interval);
+      do_stat (fd);
+   }
+}
diff --git a/lnet/utils/wirecheck.c b/lnet/utils/wirecheck.c
new file mode 100644 (file)
index 0000000..6a4377b
--- /dev/null
@@ -0,0 +1,141 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#include <stdio.h>
+#include <portals/api-support.h>
+#include <portals/list.h>
+#include <portals/lib-types.h>
+
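+/* This program emits C source: a lib_assert_wire_constants() function made
+ * of LASSERT()s that pin the size and member offsets of the on-the-wire
+ * portals structures, presumably so the generated code can verify the wire
+ * layout on whatever architecture it is built for. */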
+#define BLANK_LINE()                           \
+do {                                           \
+       printf ("\n");                          \
+} while (0)
+
+#define COMMENT(c)                             \
+do {                                           \
+       printf ("        /* "c" */\n");         \
+} while (0)
+
+#define STRINGIFY(a) #a
+
+#define CHECK_DEFINE(a)                                                \
+do {                                                           \
+       printf ("        LASSERT ("#a" == "STRINGIFY(a)");\n"); \
+} while (0)
+
+#define CHECK_VALUE(a)                                 \
+do {                                                   \
+       printf ("        LASSERT ("#a" == %d);\n", a);  \
+} while (0)
+
+#define CHECK_MEMBER_OFFSET(s,m)               \
+do {                                           \
+       CHECK_VALUE(offsetof(s, m));            \
+} while (0)
+
+#define CHECK_MEMBER_SIZEOF(s,m)               \
+do {                                           \
+       CHECK_VALUE((int)sizeof(((s *)0)->m));  \
+} while (0)
+
+#define CHECK_MEMBER(s,m)                      \
+do {                                           \
+       CHECK_MEMBER_OFFSET(s, m);              \
+       CHECK_MEMBER_SIZEOF(s, m);              \
+} while (0)
+
+#define CHECK_STRUCT(s)                         \
+do {                                            \
+        BLANK_LINE ();                          \
+        COMMENT ("Checks for struct "#s);       \
+       CHECK_VALUE((int)sizeof(s));            \
+} while (0)
+
+void
+check_ptl_handle_wire (void)
+{
+       CHECK_STRUCT (ptl_handle_wire_t);
+       CHECK_MEMBER (ptl_handle_wire_t, wh_interface_cookie);
+       CHECK_MEMBER (ptl_handle_wire_t, wh_object_cookie);
+}
+
+void
+check_ptl_magicversion (void)
+{
+       CHECK_STRUCT (ptl_magicversion_t);
+       CHECK_MEMBER (ptl_magicversion_t, magic);
+       CHECK_MEMBER (ptl_magicversion_t, version_major);
+       CHECK_MEMBER (ptl_magicversion_t, version_minor);
+}
+
+void
+check_ptl_hdr (void)
+{
+       CHECK_STRUCT (ptl_hdr_t);
+       CHECK_MEMBER (ptl_hdr_t, dest_nid);
+       CHECK_MEMBER (ptl_hdr_t, src_nid);
+       CHECK_MEMBER (ptl_hdr_t, dest_pid);
+       CHECK_MEMBER (ptl_hdr_t, src_pid);
+       CHECK_MEMBER (ptl_hdr_t, type);
+
+        BLANK_LINE ();
+        COMMENT ("Ack");
+        CHECK_MEMBER (ptl_hdr_t, msg.ack.mlength);
+        CHECK_MEMBER (ptl_hdr_t, msg.ack.dst_wmd);
+        CHECK_MEMBER (ptl_hdr_t, msg.ack.match_bits);
+        CHECK_MEMBER (ptl_hdr_t, msg.ack.length);
+
+        BLANK_LINE ();
+        COMMENT ("Put");
+       CHECK_MEMBER (ptl_hdr_t, msg.put.ptl_index);
+       CHECK_MEMBER (ptl_hdr_t, msg.put.ack_wmd);
+       CHECK_MEMBER (ptl_hdr_t, msg.put.match_bits);
+       CHECK_MEMBER (ptl_hdr_t, msg.put.length);
+       CHECK_MEMBER (ptl_hdr_t, msg.put.offset);
+       CHECK_MEMBER (ptl_hdr_t, msg.put.hdr_data);
+
+        BLANK_LINE ();
+        COMMENT ("Get");
+       CHECK_MEMBER (ptl_hdr_t, msg.get.ptl_index);
+       CHECK_MEMBER (ptl_hdr_t, msg.get.return_wmd);
+       CHECK_MEMBER (ptl_hdr_t, msg.get.match_bits);
+       CHECK_MEMBER (ptl_hdr_t, msg.get.length);
+       CHECK_MEMBER (ptl_hdr_t, msg.get.src_offset);
+       CHECK_MEMBER (ptl_hdr_t, msg.get.return_offset);
+       CHECK_MEMBER (ptl_hdr_t, msg.get.sink_length);
+
+        BLANK_LINE ();
+        COMMENT ("Reply");
+       CHECK_MEMBER (ptl_hdr_t, msg.reply.dst_wmd);
+       CHECK_MEMBER (ptl_hdr_t, msg.reply.dst_offset);
+       CHECK_MEMBER (ptl_hdr_t, msg.reply.length);
+}
+
+int
+main (int argc, char **argv)
+{
+       printf ("void lib_assert_wire_constants (void)\n"
+               "{\n");
+
+       COMMENT ("Wire protocol assertions generated by 'wirecheck'");
+       BLANK_LINE ();
+       
+       COMMENT ("Constants...");
+       CHECK_DEFINE (PORTALS_PROTO_MAGIC);
+       CHECK_DEFINE (PORTALS_PROTO_VERSION_MAJOR);
+       CHECK_DEFINE (PORTALS_PROTO_VERSION_MINOR);
+
+       CHECK_VALUE (PTL_MSG_ACK);
+       CHECK_VALUE (PTL_MSG_PUT);
+       CHECK_VALUE (PTL_MSG_GET);
+       CHECK_VALUE (PTL_MSG_REPLY);
+       CHECK_VALUE (PTL_MSG_HELLO);
+
+       check_ptl_handle_wire ();
+       check_ptl_magicversion ();
+       check_ptl_hdr ();
+       
+       printf ("}\n\n");
+       
+       return (0);
+}
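
wirecheck is a code generator: running it prints a C function, lib_assert_wire_constants(), made of LASSERTs that pin down the wire-protocol constants plus the size and offset of every checked struct member, so a mismatched build trips an assertion instead of silently corrupting the wire format. CHECK_DEFINE relies on a preprocessor subtlety: the argument is macro-expanded before STRINGIFY stringifies it, while #a keeps the unexpanded name, so the emitted assertion compares the symbolic constant against its literal value. A standalone demonstration of that trick, using a made-up FAKE_MAGIC rather than any real Portals constant:

/* FAKE_MAGIC and its value are invented for illustration only. */
#include <stdio.h>

#define FAKE_MAGIC 0x12345678

#define STRINGIFY(a) #a

#define CHECK_DEFINE(a)                                                \
do {                                                                   \
       printf ("        LASSERT ("#a" == "STRINGIFY(a)");\n");         \
} while (0)

int main (void)
{
       /* prints:         LASSERT (FAKE_MAGIC == 0x12345678); */
       CHECK_DEFINE (FAKE_MAGIC);
       return 0;
}
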
index 34373dd..776ef36 100644 (file)
@@ -1,4 +1,5 @@
 .Xrefs
+.Xrefs-2.5
 aclocal.m4
 config.log
 config.status
@@ -13,3 +14,5 @@ lustre*.tar.gz
 cscope.files
 cscope.out
 autom4te-2.53.cache
+autom4te.cache
+
index 97789a8..89eaef7 100644 (file)
@@ -1,3 +1,45 @@
+tbd
+       * version v0_7
+       * bug fixes
+       - imports and exports cleanup too early, need refcounts (349, 879, 1045)
+       - per-import/export recovery handling (958, 931, 959)
+       - multiple last-rcvd slots, for serving multiple FSes (949)
+       - connections are again shared between multiple imp/exports (963, 964)
+       - "umount -f" would hang if any requests needed to be sent (393, 978)
+       - avoid pinning large req buffer by copying for queued messages (989)
+       - add "uuid" to "lctl device" command to help upcalls (991)
+       - "open" RPCs with transnos would confuse recovery counters (1037)
+       - do proper endian conversion of all wire messages (288, 340, 891)
+       - remove OST bulk get LBUGs, fix ost_brw_write cleanup (1126)
+       - call waiting locks callback from LDLM recovery thread (1127, 1151)
+       - fix ptlrpc_connection leak in target_handle_connect (1174)
+       - fix import refcounting bug in OST and MDS cleanup (1134)
+       - if an invalid-at-open-time OSC returned before close(), LBUG (1150)
+       - fix very unlikely obd_types race condition (501)
+       - remove osc_open hack for echo_client (1187)
+       - we leaked exports/dlmimps for forcibly disconnected clients (1143)
+       - a failure in read_inode2 leads to deadlock (1139)
+       - cancel ack-locks as soon as transaction is committed (1072)
+       - fix major leaks and crashes in the bulk I/O path (937, 1057)
+       - make sure to commitrw after any preprw to avoid deadlock (1162)
+       - failing to execute a file in a lustre FS would lock inode (1203)
+       - small DEBUG_REQ fix to avoid dereferencing a NULL (1227)
+       - don't ASSERT while cleaning up an incompletely-setup obd (1248)
+       - obd_uuid2tgt would walk off the end of the list (1255)
+       - on IA64 the osc would give portals incorrect bulk size (1258)
+       - fix debug daemon ioctl interface; allows daemon on ia64 (1274)
+       - fix lock inversion caused by new llite matching code (1282)
+       - limit the number of dirty pages on a client to 10MB (1286)
+       - timed out locks were not being correctly cancelled (1289)
+       - fix O_DIRECT above 4GB on IA-32 (1292)
+       * major user-visible changes
+       - fail out/fail over policy now controlled by the upcall (993)
+       * protocol changes
+       - add OBD_PING to check server availability and failure (954)
+       - lustre messages are now sent in sending host order (288, 340, 891)
+       - add eadatalen to MDS getattr reply (340)
+       - OST read replies may contain second buffer, with per-page status (593)
+
 2003-03-11  Phil Schwan  <phil@clusterfs.com>
        * version v0_6
        * bug fixes
index 7ad7358..47d3c28 100644 (file)
@@ -8,22 +8,21 @@ AUTOMAKE_OPTIONS = foreign
 if LINUX25
 DIRS24 = 
 else
-DIRS24 = extN ptlbd
+DIRS24 = ptlbd
 endif
 
 if LIBLUSTRE
-#SUBDIRS = lov obdclass ptlrpc obdecho ldlm osc liblustre utils
-SUBDIRS = lov obdclass ptlrpc obdecho ldlm osc utils
+SUBDIRS = portals lov obdclass ptlrpc obdecho ldlm osc utils mdc #liblustre
 else
 # NOTE: keep extN before obdclass, mds, and obdfilter.  Keep obdclass as early
 # as possible, to have the best chance at stopping with "wrong kernel version"
 # instead of some related build failure.
-SUBDIRS = $(DIRS24) obdclass mds utils ptlrpc ldlm lib obdfilter mdc osc ost
-SUBDIRS+= llite obdecho lov cobd tests doc scripts conf
+SUBDIRS = portals $(DIRS24) obdclass mds utils ldlm obdfilter mdc osc ost
+SUBDIRS+= llite obdecho lov cobd tests doc scripts conf ptlrpc
 endif
 
 DIST_SUBDIRS = $(SUBDIRS) liblustre
-EXTRA_DIST = BUGS FDL Rules include archdep.m4 kernel_patches
+EXTRA_DIST = BUGS FDL Rules include kernel_patches
 
 # We get the version from the spec file.
 CONFIGURE_DEPENDENCIES = scripts/lustre.spec.in
@@ -36,3 +35,4 @@ include $(top_srcdir)/Rules
 
 rpms: dist Makefile
        rpmbuild -ta $(distdir).tar.gz
+
diff --git a/lustre/Makefile.mk b/lustre/Makefile.mk
new file mode 100644 (file)
index 0000000..e540148
--- /dev/null
@@ -0,0 +1,4 @@
+include fs/lustre/portals/Kernelenv
+
+obj-y += portals/
+obj-y += mds/
index a7b7240..1a80657 100644 (file)
@@ -1,4 +1,4 @@
-Instructions for building, configuring, and running Lustre can be found in
+Instructions for building, configuring and running Lustre can be found in
 the file doc/lustre-HOWTO.txt.
 
 If you have checked lustre directly out of CVS, then you either need to
index d4e5ed7..0d92246 100644 (file)
 #  name_SOURCES = my.c files.c
 #  include $(top_srcdir)/Rules
 
+if LINUX25
+
+# We still need to link each module with vermagic.o to get rid of "kernel tainted" warnings.
+basename=$(shell echo $< | sed -e 's/\.c//g' | sed -e 's/-//g' | sed -e 's/\.o//g')
+AM_CPPFLAGS=-I$(top_builddir)/include -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -mpreferred-stack-boundary=2  -DKBUILD_MODNAME=$(MODULE) -DKBUILD_BASENAME=$(basename)
+
+else
+
+AM_CPPFLAGS=-I$(top_builddir)/include
+
+endif
 
 $(MODULE).o: $($(MODULE)_OBJECTS)
        $(LD) -m "`$(LD) --help | awk '/supported emulations/ {print $$4}'`" -r -o $(MODULE).o $($(MODULE)_OBJECTS)
@@ -17,9 +28,6 @@ $(MODULE).o: $($(MODULE)_OBJECTS)
 tags:
        rm -f $(top_srcdir)/TAGS
        rm -f $(top_srcdir)/tags
-       find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs etags -a
        find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs etags -a
-       find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs ctags -a
        find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs ctags -a
 
-AM_CPPFLAGS=-I$(top_builddir)/include
diff --git a/lustre/archdep.m4 b/lustre/archdep.m4
deleted file mode 100644 (file)
index 2bdd785..0000000
+++ /dev/null
@@ -1,127 +0,0 @@
-AC_ARG_WITH(lib, [  --with-lib compile lustre library], host_cpu="lib")
-
-AC_MSG_CHECKING(if you are running user mode linux for $host_cpu ...)
-if test $host_cpu = "lib" ; then 
-        host_cpu="lib"
-       AC_MSG_RESULT(no building Lustre library)
-else
-if test -e $LINUX/include/asm-um ; then
-if test  X`ls -id $LINUX/include/asm/ | awk '{print $1}'` = X`ls -id $LINUX/include/asm-um | awk '{print $1}'` ; then
-       host_cpu="um";
-       AC_MSG_RESULT(yes)
-else
-       AC_MSG_RESULT(no (asm doesn't point at asm-um))
-fi
-
-else 
-        AC_MSG_RESULT(no (asm-um missing))
-fi
-fi
-
-AC_MSG_CHECKING(setting make flags system architecture: )
-case ${host_cpu} in
-       lib )
-       AC_MSG_RESULT($host_cpu)
-       KCFLAGS='-g -Wall '
-       KCPPFLAGS='-D__arch_lib__ '
-        MOD_LINK=elf_i386
-;;
-       um )
-       AC_MSG_RESULT($host_cpu)
-       KCFLAGS='-g -Wall -pipe -Wno-trigraphs -Wstrict-prototypes -fno-strict-aliasing -fno-common '
-        case ${linux25} in
-                yes )
-                KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE  -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/include -I$(LINUX)/arch/um/kernel/tt/include -O2 -nostdinc -iwithprefix include -DKBUILD_BASENAME=$(MODULE) -DKBUILD_MODNAME=$(MODULE) '
-        ;;
-                * )
-               KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE  -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/include '
-       ;;
-       esac
-
-        MOD_LINK=elf_i386
-;;
-       i*86 )
-       AC_MSG_RESULT($host_cpu)
-        KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -pipe'
-        case ${linux25} in
-                yes )
-               KCPPFLAGS='-D__KERNEL__ -DMODULE -march=i686 -I$(LINUX)/include/asm-i386/mach-default -nostdinc -iwithprefix include ' 
-        ;;
-                * )
-               KCPPFLAGS='-D__KERNEL__ -DMODULE '
-       ;;
-       esac
-        MOD_LINK=elf_i386
-;;
-
-       alphaev6 )
-       AC_MSG_RESULT($host_cpu)
-        KCFLAGS='-g -O2  -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6'
-        KCPPFLAGS='-D__KERNEL__ -DMODULE '
-        MOD_LINK=elf64alpha
-;;
-
-       alphaev67 )
-       AC_MSG_RESULT($host_cpu)
-        KCFLAGS='-g -O2  -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6'
-        KCPPFLAGS='-D__KERNEL__ -DMODULE '
-        MOD_LINK=elf64alpha
-;;
-
-       alpha* )
-       AC_MSG_RESULT($host_cpu)
-        KCFLAGS='-g -O2  -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev5'
-        KCPPFLAGS='-D__KERNEL__ -DMODULE '
-        MOD_LINK=elf64alpha
-;;
-
-       ia64 )
-       AC_MSG_RESULT($host_cpu)
-        KCFLAGS='-gstabs -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -ffixed-r13 -mfixed-range=f10-f15,f32-f127 -falign-functions=32 -mb-step'
-       KCPPFLAGS='-D__KERNEL__ -DMODULE'
-        MOD_LINK=elf64_ia64
-;;
-
-       sparc64 )
-       AC_MSG_RESULT($host_cpu)
-        KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -Wno-unused -m64 -pipe -mno-fpu -mcpu=ultrasparc -mcmodel=medlow -ffixed-g4 -fcall-used-g5 -fcall-used-g7 -Wno-sign-compare -Wa,--undeclared-regs'
-        KCPPFLAGS='-D__KERNEL__'
-        MOD_LINK=elf64_sparc
-
-;;
-
-       powerpc )
-       AC_MSG_RESULT($host_cpu)
-        KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -D__powerpc__ -fsigned-char -msoft-float -pipe -ffixed-r2 -Wno-uninitialized -mmultiple -mstring'
-        KCPPFLAGS='-D__KERNEL__'
-        MOD_LINK=elf32ppclinux
-;;
-
-        *)
-       AC_ERROR("Unknown Linux Platform: $host_cpu")
-;;
-esac
-
-if test $host_cpu != lib ; then 
-AC_MSG_CHECKING(for MODVERSIONS)
-if egrep -e 'MODVERSIONS.*1' $LINUX/include/linux/autoconf.h >/dev/null 2>&1;
-then
-       MFLAGS="-DMODULE -DMODVERSIONS -include $LINUX/include/linux/modversions.h -DEXPORT_SYMTAB"
-       AC_MSG_RESULT(yes)
-else
-       MFLAGS=
-       AC_MSG_RESULT(no)
-fi
-
-AC_MSG_CHECKING(for SMP)
-if egrep -e SMP=y $LINUX/.config >/dev/null 2>&1; then
-       SMPFLAG=
-       AC_MSG_RESULT(yes)
-else
-       SMPFLAG=
-       AC_MSG_RESULT(no)
-fi
-fi
-
-CFLAGS="$KCFLAGS $MFLAGS"
-ARCHCPPFLAGS="$KCPPFLAGS"
index 087ff09..9deed73 100644 (file)
@@ -1,6 +1,5 @@
 #!/bin/sh
 
-find . -type d -name .deps | xargs rm -rf
 aclocal &&
-${AUTOMAKE:-automake} --add-missing &&
+automake --add-missing &&
 ${AUTOCONF:-autoconf}
index 67b4e62..c96b2ad 100644 (file)
@@ -71,23 +71,23 @@ cobd_setup (struct obd_device *dev, obd_count len, void *buf)
 
         /* don't bother checking attached/setup;
          * obd_connect() should, and it can change underneath us */
-        rc = obd_connect (&cobd->cobd_target, target, &target_uuid, NULL, NULL);
+        rc = obd_connect (&cobd->cobd_target, target, &target_uuid);
         if (rc != 0)
                 return (rc);
 
-        rc = obd_connect (&cobd->cobd_cache, cache, &cache_uuid, NULL, NULL);
+        rc = obd_connect (&cobd->cobd_cache, cache, &cache_uuid);
         if (rc != 0)
                 goto fail_0;
 
         return (0);
 
  fail_0:
-        obd_disconnect (&cobd->cobd_target);
+        obd_disconnect (&cobd->cobd_target, 0 );
         return (rc);
 }
 
 static int
-cobd_cleanup (struct obd_device *dev)
+cobd_cleanup (struct obd_device *dev, int force, int failover)
 {
         struct cache_obd  *cobd = &dev->u.cobd;
         int                rc;
@@ -95,11 +95,11 @@ cobd_cleanup (struct obd_device *dev)
         if (!list_empty (&dev->obd_exports))
                 return (-EBUSY);
 
-        rc = obd_disconnect (&cobd->cobd_cache);
+        rc = obd_disconnect (&cobd->cobd_cache, failover);
         if (rc != 0)
                 CERROR ("error %d disconnecting cache\n", rc);
 
-        rc = obd_disconnect (&cobd->cobd_target);
+        rc = obd_disconnect (&cobd->cobd_target, failover);
         if (rc != 0)
                 CERROR ("error %d disconnecting target\n", rc);
 
@@ -108,8 +108,7 @@ cobd_cleanup (struct obd_device *dev)
 
 static int
 cobd_connect (struct lustre_handle *conn, struct obd_device *obd,
-              struct obd_uuid *cluuid, struct recovd_obd *recovd,
-              ptlrpc_recovery_cb_t recover)
+              struct obd_uuid *cluuid)
 {
         int rc = class_connect (conn, obd, cluuid);
 
@@ -118,9 +117,9 @@ cobd_connect (struct lustre_handle *conn, struct obd_device *obd,
 }
 
 static int
-cobd_disconnect (struct lustre_handle *conn)
+cobd_disconnect (struct lustre_handle *conn, int failover)
 {
-       int rc = class_disconnect (conn);
+       int rc = class_disconnect (conn, failover);
 
         CERROR ("rc %d\n", rc);
        return (rc);
@@ -128,13 +127,13 @@ cobd_disconnect (struct lustre_handle *conn)
 
 static int
 cobd_get_info(struct lustre_handle *conn, obd_count keylen,
-              void *key, obd_count *vallen, void **val)
+              void *key, __u32 *vallen, void *val)
 {
         struct obd_device *obd = class_conn2obd(conn);
         struct cache_obd  *cobd;
 
         if (obd == NULL) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+                CERROR("invalid client cookie "LPX64"\n", conn->cookie);
                 return -EINVAL;
         }
 
@@ -142,8 +141,7 @@ cobd_get_info(struct lustre_handle *conn, obd_count keylen,
 
         /* intercept cache utilisation info? */
 
-        return (obd_get_info (&cobd->cobd_target,
-                              keylen, key, vallen, val));
+        return obd_get_info(&cobd->cobd_target, keylen, key, vallen, val);
 }
 
 static int
@@ -153,7 +151,7 @@ cobd_statfs(struct lustre_handle *conn, struct obd_statfs *osfs)
         struct cache_obd  *cobd;
 
         if (obd == NULL) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+                CERROR("invalid client cookie "LPX64"\n", conn->cookie);
                 return -EINVAL;
         }
 
@@ -169,7 +167,7 @@ cobd_getattr(struct lustre_handle *conn, struct obdo *oa,
         struct cache_obd  *cobd;
 
         if (obd == NULL) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+                CERROR("invalid client cookie "LPX64"\n", conn->cookie);
                 return -EINVAL;
         }
 
@@ -179,18 +177,19 @@ cobd_getattr(struct lustre_handle *conn, struct obdo *oa,
 
 static int
 cobd_open(struct lustre_handle *conn, struct obdo *oa,
-          struct lov_stripe_md *lsm, struct obd_trans_info *oti)
+          struct lov_stripe_md *lsm, struct obd_trans_info *oti,
+          struct obd_client_handle *och)
 {
         struct obd_device *obd = class_conn2obd(conn);
         struct cache_obd  *cobd;
 
         if (obd == NULL) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+                CERROR("invalid client cookie "LPX64"\n", conn->cookie);
                 return -EINVAL;
         }
 
         cobd = &obd->u.cobd;
-        return (obd_open (&cobd->cobd_target, oa, lsm, oti));
+        return (obd_open (&cobd->cobd_target, oa, lsm, oti, och));
 }
 
 static int
@@ -201,7 +200,7 @@ cobd_close(struct lustre_handle *conn, struct obdo *oa,
         struct cache_obd  *cobd;
 
         if (obd == NULL) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+                CERROR("invalid client cookie "LPX64"\n", conn->cookie);
                 return -EINVAL;
         }
 
@@ -209,66 +208,59 @@ cobd_close(struct lustre_handle *conn, struct obdo *oa,
         return (obd_close (&cobd->cobd_target, oa, lsm, oti));
 }
 
-static int
-cobd_preprw(int cmd, struct lustre_handle *conn,
-            int objcount, struct obd_ioobj *obj,
-            int niocount, struct niobuf_remote *nb,
-            struct niobuf_local *res, void **desc_private, 
-            struct obd_trans_info *oti)
+static int cobd_preprw(int cmd, struct obd_export *exp,
+                       int objcount, struct obd_ioobj *obj,
+                       int niocount, struct niobuf_remote *nb,
+                       struct niobuf_local *res, void **desc_private,
+                       struct obd_trans_info *oti)
 {
-        struct obd_device *obd = class_conn2obd(conn);
-        struct cache_obd  *cobd;
+        struct obd_export *cobd_exp;
+        int rc;
 
-        if (obd == NULL) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+        if (exp->exp_obd == NULL)
                 return -EINVAL;
-        }
 
         if ((cmd & OBD_BRW_WRITE) != 0)
                 return -EOPNOTSUPP;
 
-        cobd = &obd->u.cobd;
-        return (obd_preprw (cmd, &cobd->cobd_target,
-                            objcount, obj,
-                            niocount, nb,
-                            res, desc_private, oti));
+        cobd_exp = class_conn2export(&exp->exp_obd->u.cobd.cobd_target);
+        rc = obd_preprw(cmd, cobd_exp, objcount, obj, niocount, nb, res,
+                        desc_private, oti);
+        class_export_put(cobd_exp);
+        return rc;
 }
 
-static int
-cobd_commitrw(int cmd, struct lustre_handle *conn,
-              int objcount, struct obd_ioobj *obj,
-              int niocount, struct niobuf_local *local,
-              void *desc_private, struct obd_trans_info *oti)
+static int cobd_commitrw(int cmd, struct obd_export *exp,
+                         int objcount, struct obd_ioobj *obj,
+                         int niocount, struct niobuf_local *local,
+                         void *desc_private, struct obd_trans_info *oti)
 {
-        struct obd_device *obd = class_conn2obd(conn);
-        struct cache_obd  *cobd;
+        struct obd_export *cobd_exp;
+        int rc;
 
-        if (obd == NULL) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+        if (exp->exp_obd == NULL)
                 return -EINVAL;
-        }
 
         if ((cmd & OBD_BRW_WRITE) != 0)
                 return -EOPNOTSUPP;
 
-        cobd = &obd->u.cobd;
-        return (obd_commitrw (cmd, &cobd->cobd_target,
-                              objcount, obj,
-                              niocount, local,
-                              desc_private, oti));
+        cobd_exp = class_conn2export(&exp->exp_obd->u.cobd.cobd_target);
+        rc = obd_commitrw(cmd, cobd_exp, objcount, obj, niocount, local,
+                          desc_private, oti);
+        class_export_put(cobd_exp);
+        return rc;
 }
 
 static inline int
 cobd_brw(int cmd, struct lustre_handle *conn,
          struct lov_stripe_md *lsm, obd_count oa_bufs,
-         struct brw_page *pga, struct obd_brw_set *set, 
-         struct obd_trans_info *oti)
+         struct brw_page *pga, struct obd_trans_info *oti)
 {
         struct obd_device *obd = class_conn2obd(conn);
         struct cache_obd  *cobd;
 
         if (obd == NULL) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+                CERROR("invalid client cookie "LPX64"\n", conn->cookie);
                 return -EINVAL;
         }
 
@@ -277,7 +269,7 @@ cobd_brw(int cmd, struct lustre_handle *conn,
 
         cobd = &obd->u.cobd;
         return (obd_brw (cmd, &cobd->cobd_target,
-                         lsm, oa_bufs, pga, set, oti));
+                         lsm, oa_bufs, pga, oti));
 }
 
 static int
@@ -288,7 +280,7 @@ cobd_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
         struct cache_obd  *cobd;
 
         if (obd == NULL) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+                CERROR("invalid client cookie "LPX64"\n", conn->cookie);
                 return -EINVAL;
         }
 
index 7e5c267..fd7474b 100644 (file)
@@ -40,13 +40,14 @@ static int rd_target(char *page, char **start, off_t off, int count,
         LASSERT(dev != NULL);
         conn = &dev->u.cobd.cobd_target;
 
-       if ((dev->obd_flags & OBD_SET_UP) == 0)
+       if (!dev->obd_set_up) {
                rc = snprintf (page, count, "not set up\n");
-       else {
-               exp = class_conn2export (conn);
+       } else {
+               exp = class_conn2export(conn);
                LASSERT(exp != NULL);
                rc = snprintf(page, count, "%s\n", 
                               exp->exp_obd->obd_uuid.uuid);
+                class_export_put(exp);
        }
        return (rc);
 }
@@ -62,13 +63,14 @@ static int rd_cache(char *page, char **start, off_t off, int count,
         LASSERT(dev != NULL);
         conn = &dev->u.cobd.cobd_cache;
 
-       if ((dev->obd_flags & OBD_SET_UP) == 0)
+       if (!dev->obd_set_up) {
                rc = snprintf (page, count, "not set up\n");
-       else {
-               exp = class_conn2export (conn);
+        } else {
+               exp = class_conn2export(conn);
                LASSERT (exp != NULL);
                rc = snprintf(page, count, "%s\n", 
                               exp->exp_obd->obd_uuid.uuid);
+                class_export_put(exp);
        }
        return (rc);
 }
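
The converted cobd_preprw()/cobd_commitrw() hunks above, and these /proc read handlers, all follow the same discipline introduced by this merge: class_conn2export() resolves a lustre_handle to its export and takes a reference, the call is forwarded (or the UUID read out), and class_export_put() drops the reference on every path. A minimal sketch of that get/forward/put pattern, with deliberately fake stand-in types rather than the real obdclass API:

/* All names here are stand-ins; only the reference discipline is the point. */
struct export_sketch {
        int es_refcount;
};

struct export_sketch *conn2export_sketch (struct export_sketch *exp)
{
        exp->es_refcount++;             /* like class_conn2export(): take a ref */
        return exp;
}

void export_put_sketch (struct export_sketch *exp)
{
        exp->es_refcount--;             /* like class_export_put(): drop it */
}

int forward_sketch (struct export_sketch *target,
                    int (*op)(struct export_sketch *))
{
        struct export_sketch *exp = conn2export_sketch (target);
        int rc = op (exp);              /* export stays pinned across the call */

        export_put_sketch (exp);        /* dropped on success and failure alike */
        return rc;
}
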
index 8d575a6..51d1d1a 100644 (file)
 <!-- main elements -->
 <!ELEMENT lustre (node | profile | mountpoint | ldlm | ptlrpc |echoclient |
                   mds | mdsdev| ost | osd | lov | lovconfig)*>
+<!ATTLIST lustre version CDATA #REQUIRED>
 
-<!ELEMENT node (network | routetbl | profile_ref)*>
+<!ELEMENT node (network | routetbl | profile_ref | timeout | 
+                lustreUpcall | portalsUpcall)*>
 <!ATTLIST node %object.attr;
                router CDATA #IMPLIED>
                
 <!ELEMENT network (nid | port | route_tbl | sendmem | recvmem)*>
 <!ATTLIST network %object.attr;
-                  nettype (tcp | elan | gm) 'tcp'>
+                  nettype (tcp | elan | gm | scimac) 'tcp'>
 
 <!ELEMENT routetbl (route)*>
 <!ATTLIST routetbl %object.attr;>
 <!ELEMENT route %object.content;>
-<!ATTLIST route type (elan | tcp | gm) #REQUIRED
+<!ATTLIST route type (elan | tcp | gm | scimac) #REQUIRED
                 gw CDATA #REQUIRED
                 lo CDATA #REQUIRED
                 hi CDATA #IMPLIED >
 <!ATTLIST ptlrpc %object.attr;>
 
 <!ELEMENT osd (fstype | devpath | devsize | autoformat | 
-               target_ref | node_ref)*>
+               target_ref | node_ref | journalsize )*>
 <!ATTLIST osd %object.attr; 
               osdtype (obdfilter | obdecho) 'obdfilter'>
 
-<!ELEMENT ost (active_ref)*>
-<!ATTLIST ost %object.attr;>
+<!ELEMENT ost (active_ref | group)*>
+<!ATTLIST ost %object.attr;
+              failover ( 1 | 0 ) #IMPLIED>
 
-<!ELEMENT mds (active_ref | lovconfig_ref)*>
-<!ATTLIST mds %object.attr;>
+<!ELEMENT mds (active_ref | lovconfig_ref | group)*>
+<!ATTLIST mds %object.attr;
+              failover ( 1 | 0 ) #IMPLIED>
 
 <!ELEMENT mdsdev (fstype | devpath | devsize | autoformat | 
-                  target_ref | node_ref )*>
+                  target_ref | node_ref | journalsize )*>
 <!ATTLIST mdsdev %object.attr;>
 
 <!ELEMENT lov (mds_ref |(obd_ref)+)*>
 <!ATTLIST lovconfig %object.attr;>
 
 <!-- basic elements -->
+<!ELEMENT recoveryUpcall %object.content;>
+<!ELEMENT timeout       %object.content;>
+<!ELEMENT journalsize   %object.content;>
 <!ELEMENT fstype        %object.content;>
 <!ELEMENT nid           %object.content;>
 <!ELEMENT port          %object.content;>
-<!ELEMENT sendmem      %object.content;>
-<!ELEMENT recvmem      %object.content;>
+<!ELEMENT sendmem       %object.content;>
+<!ELEMENT recvmem       %object.content;>
 <!ELEMENT autoformat    %object.content;>
 <!ELEMENT activetarget  %object.content;>
 <!ELEMENT devpath       %object.content;>
index f3c1364..5fc6f9c 100644 (file)
@@ -10,6 +10,7 @@ dn: <value-of select="$basedn"/>
 uuid: CONFIG_UUID
 objectClass: LUSTRECONFIG
 config: <value-of select="$config"/>
+version: <value-of select="@version"/>
 <text>
 </text><apply-templates/>
 </template>
@@ -23,8 +24,30 @@ networkRef: <value-of select="network/@uuid"/>
 <for-each select="profile_ref">
 profileRef: <value-of select="@uuidref"/>
 </for-each>
+<if test="timeout">
+timeout: <value-of select="timeout"/>
+</if>
+<if test="lustreUpcall">
+lustreUpcall: <value-of select="lustreUpcall"/>
+</if>
+<if test="portalsUpcall">
+portalsUpcall: <value-of select="portalsUpcall"/>
+</if>
 <text>
-</text><apply-templates/>
+</text>
+<for-each select="network">
+dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
+objectClass: NETWORK
+lustreName: <value-of select="@name"/>
+uuid: <value-of select="@uuid"/>
+nettype: <value-of select="@nettype"/>
+nid: <value-of select="nid"/>
+<if test="port">
+port: <value-of select="port"/>
+</if>
+<text>
+</text>
+</for-each>
 </template>
 
 <template match="profile">
@@ -50,11 +73,25 @@ port: <value-of select="port"/>
 </text>
 </template>
 
+
 <template match="mds">
 dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
 objectClass: MDS
 lustreName: <value-of select="@name"/>
-uuid: <value-of select="@uuid"/><apply-templates/>
+uuid: <value-of select="@uuid"/>
+activeRef: <value-of select="active_ref/@uuidref"/>
+<if test="lovconfig_ref">
+lovconfigRef: <value-of select="lovconfig_ref/@uuidref"/>
+</if>
+<if test="filesystem_ref">
+filesystemRef: <value-of select="filesystem_ref/@uuidref"/>
+</if>
+<if test="@failover">
+failover: <value-of select="@failover"/>
+</if>
+<if test="group">
+group: <value-of select="group"/>
+</if>
 <text>
 </text>
 </template>
@@ -76,6 +113,9 @@ devpath: <value-of select="devpath"/>
 <if test="devsize">
 devsize: <value-of select="devsize"/>
 </if>
+<if test="journalsize">
+journalsize: <value-of select="journalsize"/>
+</if>
 nodeRef: <value-of select="node_ref/@uuidref"/>
 targetRef: <value-of select="target_ref/@uuidref"/>
 <text>
@@ -124,6 +164,9 @@ devpath: <value-of select="devpath"/>
 <if test="devsize">
 devsize: <value-of select="devsize"/>
 </if>
+<if test="journalsize">
+journalsize: <value-of select="journalsize"/>
+</if>
 <text>
 </text>
 </template>
@@ -132,6 +175,22 @@ devsize: <value-of select="devsize"/>
 dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
 objectClass: OST
 lustreName: <value-of select="@name"/>
+uuid: <value-of select="@uuid"/>
+activeRef: <value-of select="active_ref/@uuidref"/>
+<if test="@failover">
+failover: <value-of select="@failover"/>
+</if>
+<if test="group">
+group: <value-of select="group"/>
+</if>
+<text>
+</text>
+</template>
+
+<template match="filesystem">
+dn: uuid=<value-of select="@uuid"/>,<value-of select="$basedn"/>
+objectClass: FILESYSTEM
+lustreName: <value-of select="@name"/>
 uuid: <value-of select="@uuid"/><apply-templates/>
 <text>
 </text>
@@ -209,6 +268,10 @@ mdsdevRef: <value-of select="@uuidref"/>
 mountpointRef: <value-of select="@uuidref"/>
 </template>
 
+<template match="filesystem_ref">
+filesystemRef: <value-of select="@uuidref"/>
+</template>
+
 <template match="echoclient_ref">
 echoclientRef: <value-of select="@uuidref"/>
 </template>
@@ -217,17 +280,8 @@ echoclientRef: <value-of select="@uuidref"/>
 lovRef: <value-of select="@uuidref"/>
 </template>
 
-<template match="lovconfig_ref">
-lovconfigRef: <value-of select="@uuidref"/>
-</template>
-
 <template match="path">
 path: <value-of select="."/>
 </template>
 
-<template match="active_ref">
-activeRef: <value-of select="@uuidref"/>
-</template>
 </stylesheet>
-
-
index 7906908..8558f64 100644 (file)
@@ -1,6 +1,5 @@
 #######################################################################
 # lustre ldap config database
-# $Id: slapd-lustre.conf,v 1.3 2003/03/11 23:36:45 pschwan Exp $
 #######################################################################
 
 database       ldbm
index 5c5f438..0850115 100644 (file)
-AC_INIT
-AC_CANONICAL_SYSTEM
-
 # Copyright (C) 2001-2003 Cluster File Systems, Inc.
 #
 # This code is issued under the GNU General Public License.
 # See the file COPYING in this distribution
 
+AC_INIT
+AC_CANONICAL_SYSTEM
+
 # Automake variables.  Steal the version number from lustre.spec.in.
 AM_INIT_AUTOMAKE(lustre, builtin([esyscmd], [sed -ne '/^%define version /{ s/.*version //; p; q; }' scripts/lustre.spec.in]))
 #AM_MAINTAINER_MODE
 
-AC_PROG_CC
-AC_MSG_CHECKING(for buggy compiler)
-CC_VERSION=`$CC -v 2>&1 | grep "^gcc version"`
-bad_cc() {
-       echo
-       echo "   '$CC_VERSION'"
-       echo "  has been known to generate bad code, "
-       echo "  please get an updated compiler."
-       AC_MSG_ERROR(sorry)
-}
-TMP_VERSION=`echo $CC_VERSION | cut -c 1-16`
-if test "$TMP_VERSION" = "gcc version 2.95"; then
-        bad_cc
-fi
-case "$CC_VERSION" in 
-       # ost_pack_niobuf putting 64bit NTOH temporaries on the stack
-       # without "sub    $0xc,%esp" to protect the stack from being
-       # stomped on by interrupts (bug 606)
-       "gcc version 2.96 20000731 (Red Hat Linux 7.1 2.96-98)")
-               bad_cc
-               ;;
-       *)
-               AC_MSG_RESULT(no known problems)
-               ;;
-esac
-
-AC_PROG_RANLIB
-
-# 
-# Check for required packages
-
-# this doesn't seem to work on older autoconf
-# AC_CHECK_LIB(readline, readline,,)
-
-AC_ARG_ENABLE(readline,        [  --enable-readline  use readline library],,
-                       enable_readline="yes")
-if test "$enable_readline" = "yes" ; then
-   LIBREADLINE="-lreadline -lncurses"
-   HAVE_LIBREADLINE="-DHAVE_LIBREADLINE=1"
-else 
-   LIBREADLINE=""
-   HAVE_LIBREADLINE=""
-fi
-AC_SUBST(LIBREADLINE)
-AC_SUBST(HAVE_LIBREADLINE)
-
-AC_ARG_ENABLE(efence,  [  --enable-efence  use efence library],,
-                       enable_efence="no")
-if test "$enable_efence" = "yes" ; then
-   LIBEFENCE="-lefence"
-   HAVE_LIBEFENCE="-DHAVE_LIBEFENCE=1"
-else 
-   LIBEFENCE=""
-   HAVE_LIBEFENCE=""
-fi
-AC_SUBST(LIBEFENCE)
-AC_SUBST(HAVE_LIBEFENCE)
-
-# XXX this should be a runtime option
-AC_MSG_CHECKING(if you are enabling OST recovery...)
-AC_ARG_ENABLE(ost_recovery, [  --enable-ost-recovery: enable support for ost recovery],,
-             enable_ost_recovery="no")
-if test "$enable_ost_recovery" = "yes" ; then
-   ENABLE_OST_RECOVERY="-DOST_RECOVERY=1"
-   AC_MSG_RESULT(yes)
-else 
-   ENABLE_OST_RECOVERY=""
-   AC_MSG_RESULT(no)
-fi
-AC_SUBST(ENABLE_OST_RECOVERY)
-
-
-# Kernel build environment.
-ac_default_prefix=
-bindir='${exec_prefix}/usr/bin'
-sbindir='${exec_prefix}/usr/sbin'
-
-linuxdir_def=/usr/src/linux
-AC_ARG_WITH(linux, [  --with-linux=[path] set path to Linux source (default=/usr/src/linux)], enable_linuxdir=$withval)
-AC_ARG_ENABLE(linuxdir, [  --enable-linuxdir=[path] (deprecated) set path to Linux source (default=/usr/src/linux)],, enable_linuxdir=$linuxdir_def)
-
-LINUX=$enable_linuxdir
-AC_SUBST(LINUX)
-
-AC_MSG_CHECKING(if you are running linux 2.5...)
-if test -e $LINUX/include/linux/namei.h ; then
-       linux25="yes"
-       AC_MSG_RESULT(yes)
-else
-       linux25="no"
-       AC_MSG_RESULT(no)
-fi
-AM_CONDITIONAL(LINUX25, test x$linux25 = xyes)
-
-sinclude(archdep.m4)
-
-
-portalsdir_def='$(top_srcdir)/../portals'
-AC_ARG_WITH(portals, [  --with-portals=[path] set path to Portals source (default=../portals)], enable_portalsdir=$withval)
-AC_ARG_ENABLE(portalsdir, [  --enable-portalsdir=[path] (deprecated) set path to Portals source (default=$portalsdir_def],, enable_portalsdir=$portalsdir_def)
-PORTALS=$enable_portalsdir
-
-if test $PORTALS = $portalsdir_def; then
-       PORTALSLOC='../portals'
-else
-       PORTALSLOC=$PORTALS
-fi
-
-AC_SUBST(PORTALS)
-AC_SUBST(PORTALSLOC)
-
-portalslib_def=$enable_portalsdir/linux/utils
-AC_ARG_WITH(portalslib, [  --with-portalslib=[path] set path to Portals library (default=../portals/linux/utils)], enable_portalslib=$withval)
-AC_ARG_ENABLE(portalslib, [  --enable-portalslib=[path] (deprecated) set path to Portals lib (default=../portals/linux/utils)],, enable_portalslib=$portalslib_def)
-
-
-if ! test -z "$enable_portalslib"; then
-       PORTALSLIB=${enable_portalslib}
-fi
-AC_SUBST(PORTALSLIB)
-
-AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib)
-AC_MSG_CHECKING(if you are building lib lustre)
-if test "$host_cpu" = "lib"; then
-   AC_MSG_RESULT(yes)
-   libdir='${exec_prefix}/lib/lustre'
-else
-   AC_MSG_RESULT(no)
-fi
-
-if test $host_cpu != "lib" ; then 
-KINCFLAGS='-I$(top_srcdir)/include -I$(PORTALS)/include -I$(LINUX)/include'
-else
-KINCFLAGS='-I$(top_srcdir)/include -I$(PORTALS)/include'
-fi
-CPPFLAGS="$KINCFLAGS $ARCHCPPFLAGS $ENABLE_OST_RECOVERY"
-
-if test $host_cpu != "lib" ; then 
-AC_MSG_CHECKING(if make dep has been run in kernel source (host $host_cpu) )
-if  test -f $LINUX/include/linux/config.h ; then
-       AC_MSG_RESULT(yes)
-else
-       AC_MSG_ERROR(** cannot find $LINUX/include/linux/config.h. Run make dep in $LINUX.)
-fi
-
-AC_MSG_CHECKING(if autoconf.h is in kernel source)
-if test -f $LINUX/include/linux/autoconf.h ; then
-       AC_MSG_RESULT(yes)
-else
-       AC_MSG_ERROR(** cannot find $LINUX/include/linux/autoconf.h. Run make config in $LINUX.)
-fi
-
-AC_MSG_CHECKING(for Linux release)
-
-dnl We need to rid ourselves of the nasty [ ] quotes.
-changequote(, )
-dnl Get release from version.h
-RELEASE="`sed -ne 's/.*UTS_RELEASE[ \"]*\([0-9.a-zA-Z_-]*\).*/\1/p' $LINUX/include/linux/version.h`"
-changequote([, ])
-
-moduledir='$(libdir)/modules/'$RELEASE/kernel
-AC_SUBST(moduledir)
-
-modulefsdir='$(moduledir)/fs/$(PACKAGE)'
-AC_SUBST(modulefsdir)
-
-AC_MSG_RESULT($RELEASE)
-AC_SUBST(RELEASE)
-
-fi
-# Directories for documentation and demos.
-docdir='${prefix}/usr/share/doc/$(PACKAGE)'
-AC_SUBST(docdir)
-
-demodir='$(docdir)/demo'
-AC_SUBST(demodir)
-
-# not needed until the AC_CHECK_LIB(readline) above works
-# AM_CONFIG_HEADER(include/config.h)
-
-AC_OUTPUT(Makefile lib/Makefile ldlm/Makefile obdecho/Makefile ptlrpc/Makefile \
-       liblustre/Makefile \
-       lov/Makefile osc/Makefile mdc/Makefile mds/Makefile ost/Makefile \
-       cobd/Makefile ptlbd/Makefile conf/Makefile \
-       utils/Makefile utils/lconf tests/Makefile obdfilter/Makefile \
-        obdclass/Makefile llite/Makefile doc/Makefile scripts/Makefile \
-       scripts/lustre.spec extN/Makefile, chmod +x utils/lconf)
+# LLNL patches their ext3 and calls it extN
+AC_ARG_ENABLE(extN, [  --enable-extN use extN instead of ext3 for lustre backend])
+AM_CONDITIONAL(EXTN, test x$enable_extN = xyes)
+
+AC_ARG_WITH(obd-buffer-size, [  --with-obd-buffer-size=[size] set lctl ioctl maximum (default=8K)],OBD_BUFFER_SIZE=$with_obd_buffer_size,OBD_BUFFER_SIZE=8192)
+AC_SUBST(OBD_BUFFER_SIZE)
+
+sinclude(portals/build.m4)
+sinclude(portals/archdep.m4)
+
+if test x$enable_inkernel = xyes ; then
+cp Makefile.mk Makefile.in
+cp mds/Makefile.mk mds/Makefile.in
+cp portals/Kernelenv.mk portals/Kernelenv.in
+cp portals/Makefile.mk portals/Makefile.in
+cp portals/libcfs/Makefile.mk portals/libcfs/Makefile.in
+cp portals/portals/Makefile.mk portals/portals/Makefile.in
+cp portals/knals/Makefile.mk portals/knals/Makefile.in
+cp portals/knals/socknal/Makefile.mk portals/knals/socknal/Makefile.in
+cp portals/router/Makefile.mk portals/router/Makefile.in
+fi
+
+AM_CONFIG_HEADER(portals/include/config.h)
+
+AC_OUTPUT([Makefile portals/Makefile portals/Kernelenv \
+          portals/libcfs/Makefile portals/portals/Makefile \
+          portals/unals/Makefile portals/knals/Makefile \
+          portals/router/Makefile portals/knals/socknal/Makefile \
+          portals/knals/gmnal/Makefile portals/knals/qswnal/Makefile \
+         portals/knals/scimacnal/Makefile portals/knals/toenal/Makefile \
+          portals/utils/Makefile portals/tests/Makefile portals/doc/Makefile \
+          ldlm/Makefile obdecho/Makefile ptlrpc/Makefile liblustre/Makefile \
+         lov/Makefile osc/Makefile mdc/Makefile mds/Makefile ost/Makefile \
+         cobd/Makefile ptlbd/Makefile conf/Makefile  tests/Makefile \
+         utils/Makefile utils/Lustre/Makefile obdfilter/Makefile \
+          obdclass/Makefile llite/Makefile doc/Makefile scripts/Makefile \
+         scripts/lustre.spec])
index 3bfecbd..85c670b 100644 (file)
@@ -48,7 +48,24 @@ lconf\SpecialChar ~
 DESCRIPTION
 \layout Standard
 
-This program configures a node following directives in the <XML-config-file>.
+This program configures a node following directives in the <XML-config-file>.
+ There will be a single configuration file for all the nodes in a single cluster.
+ This file should be distributed to all the nodes in the cluster or kept
+ in a location accessible to all the nodes.
+ One option is to store the cluster configuration information in LDAP format
+ on an LDAP server that can be reached from all the cluster nodes.
+\layout Description
+
+--ldapurl\SpecialChar ~
+<arg> LDAP server URL 
+\layout Description
+
+--config\SpecialChar ~
+<arg> Cluster configuration name used for LDAP query
+\layout Description
+
+--select\SpecialChar ~
+<arg> Select a particular node for a service 
 \layout Description
 
 --node\SpecialChar ~
@@ -67,7 +84,7 @@ node_name
  error.
 \layout Description
 
---cleanup Unconfigure a node.
+-d|--cleanup Unconfigure a node.
  The same config and 
 \emph on 
 --node
@@ -77,6 +94,21 @@ node_name
  including unloading the kernel modules.
 \layout Description
 
+--force Forced unmounting and/or obd detach during cleanup.
+ Default is 0.
+\layout Description
+
+--mds_ost_conn Open connections to OSTs on MDS.
+\layout Description
+
+--failover Used to shut down without saving state.
+ Default is 0.
+ This will allow the node to give up service to another node for failover
+ purposes.
+ This will not be a clean shutdown.
+\layout Description
+
 --noexec Print, but don't execute, the steps lconf will perform.
  This is useful for debugging a configuration, and when used with 
 \emph on 
@@ -90,9 +122,27 @@ node_name
  module script is always created, however).
 \layout Description
 
+--gdb_script\SpecialChar ~
+<arg> Full name of gdb debug script.
+ Default is /tmp/ogdb.
+\layout Description
+
+--dump_path\SpecialChar ~
+<arg> Path to save debug dumps.
+ Default is /tmp/lustre_log.
+\layout Description
+
+--recover\SpecialChar ~
+<arg> Recover a device.
+\layout Description
+
 --nosetup Only load modules, do not configure devices or services.
 \layout Description
 
+--group\SpecialChar ~
+<arg> The group of devices to clean up/configure.
+\layout Description
+
 --nomod Only setup devices and services, do not load modules.
 \layout Description
 
@@ -102,15 +152,44 @@ node_name
 --verbose,-v Be verbose and show actions while going along.
 \layout Description
 
---reformat Reformat all the devices
+--timeout\SpecialChar ~
+<arg> Set the recovery timeout period.
+\layout Description
+
+--lustre_upcall\SpecialChar ~
+<path> Set the location of the Lustre upcall scripts used
+ by the client for recovery
+\layout Description
+
+--portals_upcall\SpecialChar ~
+<path> Specify the location of the Portals upcall scripts
+ used by the client for recovery
+\layout Description
+
+--upcall\SpecialChar ~
+<path> Set the location of both Lustre and Portals upcall scripts
+ used by the client for recovery
+\layout Description
+
+--lctl-dump\SpecialChar ~
+<arg> Dump all ioctls to the specified file
+\layout Description
+
+--dump\SpecialChar ~
+<file> Dump the kernel debug log to the specified file before portals
+ is unloaded during cleanup.
+\layout Description
+
+--reformat Reformat all the devices.
+ This is essential the first time the file system is brought up.
 \layout Description
 
 -h,--help Print help.
 \layout Description
 
 --maxlevel\SpecialChar ~
-<level> [NOT IMPLEMENTED] Perform configuration of devices and
services up to level given.
+<level> Perform configuration of devices and services up to level
+ given.
  
 \emph on 
 level
@@ -122,6 +201,32 @@ net, dev, svc, fs.
 \series default 
 When used in conjunction with cleanup, services are torn down up to a certain
  level.
+ Default is 100.
+\layout Description
+
+--minlevel\SpecialChar ~
+<level> Specify the minimum level of services to configure/cleanup.
+ Default is 0.
+\layout Description
+
+--lustre=src_dir Specify the base directory for Lustre sources; this parameter
+ will cause lconf to load the Lustre modules from this source tree.
+\layout Description
+
+--portals=src_dir Portals source directory.
+ If this is a relative path, it is assumed to be relative to the Lustre source
+ tree location.
+\layout Description
+
+--ptldebug\SpecialChar ~
+debug\SpecialChar ~
+level This option can be used to set the required debug
+ level 
+\layout Description
+
+--subsystem\SpecialChar ~
+<arg> Set the portals debug subsystem 
 \layout Subsection
 
 EXAMPLES
@@ -136,6 +241,47 @@ lconf --node client config.xml
 \layout Standard
 
 in order to give clients, regardless of hostname, a single configuration.
+\layout Standard
+
+Required debug levels can be set like this:
+\layout LyX-Code
+
+
+\size small 
+   ## Everything but these
+\layout LyX-Code
+
+
+\size small 
+lconf --ptldebug 
+\begin_inset Quotes eld
+\end_inset 
+
+~(portals | malloc | trace)
+\begin_inset Quotes erd
+\end_inset 
+
+
+\layout LyX-Code
+
+\layout LyX-Code
+
+
+\size small 
+## Only these debug types
+\layout LyX-Code
+
+
+\size small 
+lconf --ptldebug 
+\begin_inset Quotes eld
+\end_inset 
+
+ldlm|ha
+\begin_inset Quotes erd
+\end_inset 
+
+
 \layout Subsection
 
 BUGS
index 33b40b2..b3f3f3e 100644 (file)
@@ -179,8 +179,14 @@ on a device name.
 devno 
 \emph default 
 option is used as above.
-\layout LyX-Code
+\layout Description
+
+--ignore_errors\SpecialChar ~
+|\SpecialChar ~
+ignore_errors Ignore errors during script processing
+\layout Description
 
+dump Save ioctls to a file 
 \layout LyX-Code
 
 \layout Description
@@ -276,6 +282,15 @@ send_mem\SpecialChar ~
 nagle\SpecialChar ~
 [on/off] Enable/disable nagle; omitting the argument will cause the
  default value to be printed.
+\layout Description
+
+fail\SpecialChar ~
+nid|all\SpecialChar ~
+[count] Fail/restore communications.
+ Omitting the count implies an indefinite failure; a count of zero indicates that
+ communication should be restored.
+ A non-zero count indicates the number of portals messages to be dropped
+ after which the communication is restored.
 \end_deeper 
 \layout Description
 
@@ -297,6 +312,9 @@ device This will select the specified OBD device.
 \layout Description
 
 device_list Show all the devices.
+\layout Description
+
+lustre_build_version Print the Lustre build version.
 \end_deeper 
 \layout Description
 
@@ -340,13 +358,13 @@ detach Remove driver (and name and UUID) from the current device.
 
 lov_setconfig\SpecialChar ~
 lov-uuid\SpecialChar ~
-default-stripe-count\SpecialChar ~
+stripe-count\SpecialChar ~
 default-stripe-size\SpecialChar ~
 offset\SpecialChar ~
 pattern\SpecialChar ~
 UUID1\SpecialChar ~
-[U
-UID2...] Write LOV configuration to an MDS device.
+[UUID2...]
+ Write LOV configuration to an MDS device.
 \layout Description
 
 lov_getconfig\SpecialChar ~
@@ -371,6 +389,12 @@ probe\SpecialChar ~
 close
 \emph on 
  
+\emph default 
+Close the
+\emph on 
+\emph default 
+connection handle
 \layout Description
 
 getattr\SpecialChar ~
@@ -405,7 +429,18 @@ create\SpecialChar ~
 \layout Description
 
 destroy\SpecialChar ~
-<objid> Destroy an OST object.
+<num>\SpecialChar ~
+starting\SpecialChar ~
+at\SpecialChar ~
+<objid> Destroy <
+\emph on 
+num
+\emph default 
+> objects starting from the object with object id <
+\emph on 
+objid
+\emph default 
+>.
 \layout Description
 
 test_getattr\SpecialChar ~
@@ -476,9 +511,45 @@ ldlm_regress_stop Stop lock manager stress test.
 dump_ldlm Dump all lock manager state, this is very useful for debugging
 \layout Description
 
-newconn\SpecialChar ~
-<olduuid>\SpecialChar ~
-[newuuid]
+activate Activate an import
+\layout Description
+
+deactivate De-activate an import
+\layout Description
+
+recover\SpecialChar ~
+<connection UUID> 
+\layout Description
+
+lookup\SpecialChar ~
+<directory>\SpecialChar ~
+<file>
+\layout Description
+
+notransno Disable sending of committed transnumber updates
+\layout Description
+
+readonly Disable writes to the underlying device
+\layout Description
+
+abort_recovery Abort recovery on MDS device
+\layout Description
+
+mount_option Dump mount options to a file
+\layout Description
+
+get_stripe show stripe info for an echo client object.
+\layout Description
+
+set_stripe\SpecialChar ~
+<objid>[\SpecialChar ~
+width!count[@offset]\SpecialChar ~
+[:id:id....] set stripe info for an echo
+ client
+\layout Description
+
+unset_stripe\SpecialChar ~
+<objid> unset stripe info for an echo client object.
 \end_deeper 
 \layout Description
 
@@ -486,6 +557,9 @@ Debug
 \begin_deeper 
 \layout Description
 
+debug_daemon Control the debug daemon and dump the debug log to a file
+\layout Description
+
 debug_kernel\SpecialChar ~
 [file]\SpecialChar ~
 [raw] Get debug buffer and dump to a 
index 2cbcdc0..7a90023 100644 (file)
@@ -142,6 +142,44 @@ To generate configuration data associated with systems in a Lustre cluster:
 -
 \emph default 
 -add\SpecialChar ~
+node Adds a new node to the cluster configuration.
+\begin_deeper 
+\layout Standard
+
+The arguments required are:
+\layout Description
+
+--node\SpecialChar ~
+''node_name'' This will create a new node with the given name if not
+ already present.
+\layout Description
+
+--timeout\SpecialChar ~
+<num> Timeout before going into recovery
+\layout Description
+
+--lustre_upcall\SpecialChar ~
+<path> Set the location of the Lustre upcall scripts used
+ by the client for recovery
+\layout Description
+
+--portals_upcall\SpecialChar ~
+<path> Specify the location of the Portals upcall scripts
+ used by the client for recovery
+\layout Description
+
+--upcall\SpecialChar ~
+<path> Specify the location of both (Lustre and Portals) upcall
+ scripts used by the client for recovery
+\end_deeper 
+\layout Description
+
+
+\emph on 
+-
+\emph default 
+-add\SpecialChar ~
 net Adds a network device descriptor for the given node, with parameters
  as indicated.
 \begin_deeper 
@@ -159,7 +197,7 @@ The arguments required are:
 --nettype\SpecialChar ~
 <type> This can be 
 \series bold 
-tcp, elan, gm.
+tcp, elan, gm, scimac.
 \layout Description
 
 --nid\SpecialChar ~
@@ -193,6 +231,9 @@ client
  configuration.
 \layout Description
 
+--hostaddr addr
+\layout Description
+
 --router Optional flag to mark this node as a router
 \layout Description
 
@@ -210,13 +251,26 @@ profiles
 \layout Description
 
 --port\SpecialChar ~
-[port] Optional argument to indicate the tcp port.
+[port] Optional argument to indicate the TCP port.
  The default is 988.
  
 \layout Description
 
 --tcpbuf\SpecialChar ~
-<size> Optional argument.
+<size> Optional argument.
+ The default TCP buffer size is 1MB.
+\layout Description
+
+--irq_affinity\SpecialChar ~
+0|1 Optional argument.
+ Default is 0.
+\layout Description
+
+--nid_exchange\SpecialChar ~
+0|1 Optional argument since some OSTs might not have the
+ required support.
+ This is turned off by default; a value of 1 will turn it on.
 \end_deeper 
 \layout Description
 
@@ -225,6 +279,11 @@ mds
 \begin_deeper 
 \layout Description
 
+--node\SpecialChar ~
+<node\SpecialChar ~
+name> Name of the node on which the MDS resides
+\layout Description
+
 --mds\SpecialChar ~
 <mds_name> 
 \layout Description
@@ -235,8 +294,8 @@ mds
 \layout Description
 
 --size\SpecialChar ~
-<size> Optional argument indicating the size of the device to be created
- (used typically for loop devices).
+<size> Optional argument indicating the size of the device to be
created (used typically for loop devices).
 \layout Description
 
 --node\SpecialChar ~
@@ -246,6 +305,23 @@ mds
 --node
 \emph default 
  argument, and it must not be a profile node.
+\layout Description
+
+--fstype\SpecialChar ~
+extN|ext3 Optional argument used to specify the file system type.
+ Default is ext3.
+\layout Description
+
+--journal_size\SpecialChar ~
+<size> Optional argument to specify the journal size for
+ the ext2/ext3 file system.
+ The size should be in the units expected by 
+\series bold 
+mkfs
+\series default 
+, so for ext3 it should be in MB.
+ If this option is not used, the ext2/ext3 filesystem will be configured
+ with the default journal size.
 \end_deeper 
 \layout Description
 
@@ -272,11 +348,13 @@ mds_name
 \layout Description
 
 --stripe_cnt\SpecialChar ~
-<count>
+<count> A value of 0 means striping over all available
+ OSTs.
+ Default is 0.
 \layout Description
 
 --stripe_pattern\SpecialChar ~
-<pattern> Pattern can be 0.
+<pattern> Only pattern 0 (RAID 0) is currently supported.
 \end_deeper 
 \layout Description
 
@@ -286,8 +364,8 @@ ost Creates an OBD, OST, and OSC.
 \begin_deeper 
 \layout Description
 
---obd\SpecialChar ~
-<name> Assign a name to the OBD device.
+--ost\SpecialChar ~
+<name> Assign a name to the OST device.
 \layout Description
 
 --node\SpecialChar ~
@@ -305,18 +383,36 @@ ost Creates an OBD, OST, and OSC.
 [size]
 \layout Description
 
+--osdtype\SpecialChar ~
+obdfilter|obdecho 
+\layout Description
+
 --lov\SpecialChar ~
-<name> Name of LOV to which this OSC will be attached.
+<name> Optional argument.
+ Name of LOV to which this OSC will be attached.
  
 \layout Description
 
---obduuid\SpecialChar ~
-UUID Specify the UUID of the OBD device.
- The default value is 
-\emph on 
-OBD_nodename_UUID
-\emph default 
-.
+--ostuuid\SpecialChar ~
+UUID Specify the UUID of the OST device.
+\layout Description
+
+--fstype\SpecialChar ~
+extN|ext3 Optional argument used to specify the file system type.
+ Default is ext3.
+\layout Description
+
+--journal_size\SpecialChar ~
+<size> Optional argument to specify the journal size for
+ the ext2/ext3 file system.
+ The size should be in the units expected by 
+\series bold 
+mkfs
+\series default 
+, so for ext3 it should be in MB.
+ If this option is not used, the ext2/ext3 filesystem will be configured
+ with the default journal size.
 \end_deeper 
 \layout Description
 
@@ -343,8 +439,8 @@ mtpt
 mds_name 
 \layout Description
 
---obd\SpecialChar ~
-obd_name\SpecialChar ~
+--ost\SpecialChar ~
+ost_name\SpecialChar ~
 |\SpecialChar ~
 --lov\SpecialChar ~
 lov_name
@@ -396,7 +492,8 @@ nid.
 \layout Description
 
 --add\SpecialChar ~
-echo-client Used for testing purpose only
+echo-client Used for testing purposes only.
 \begin_deeper 
 \layout Description
 
diff --git a/lustre/extN/Makefile.am b/lustre/extN/Makefile.am
deleted file mode 100644 (file)
index d1de59b..0000000
+++ /dev/null
@@ -1,144 +0,0 @@
-# Copyright (C) 2001  Cluster File Systems, Inc.
-#
-# This code is issued under the GNU General Public License.
-# See the file COPYING in this distribution
-
-DEFS=-DEXPORT_SYMTAB
-MODULE = extN
-modulefs_DATA = extN.o
-EXTRA_PROGRAMS = extN
-
-# NOTE: If you are not using a RedHat 12.5 or later kernel, then you need to
-#       apply the "fixes" patch first, as it fixes a number of bugs in ext3.
-#       It will be applied automatically by the extN build process, or you
-#       can apply it to the source kernel tree and fix ext3 also.  For chaos22
-#       (or other RH < 12.5 kernels) use the "chaos22" patch instead.
-EXTN_FIXES = patch-2.4.18-chaos22
-#EXTN_FIXES = ext3-2.4.18-fixes.diff
-EXTNP = htree-ext3-2.4.18.diff linux-2.4.18ea-0.8.26.diff ext3-2.4-ino_t.diff
-EXTNP+= ext3-2.4.18-ino_sb_macro.diff extN-misc-fixup.diff extN-noread.diff
-EXTNP+= extN-wantedi.diff extN-san.diff extN-2.4.18-ino_sb_fixup.diff
-#EXTNP+= extN-iget-debug.diff
-EXTNC = balloc.c bitmap.c dir.c file.c fsync.c ialloc.c inode.c ioctl.c
-EXTNC+= namei.c super.c symlink.c
-EXTNI = extN_fs.h extN_fs_i.h extN_fs_sb.h extN_jbd.h quotaops.h
-EXTN_EXTRA = include/linux/xattr.h include/linux/extN_xattr.h fs/extN/xattr.c
-EXTN_EXTRA += include/linux/quotaops.h
-extN_SOURCES = $(EXTNC) xattr.c # punch.c
-extN_DEPENDENCIES = patch-stamp
-EXTRA_DIST = $(EXTNP) $(EXTN_FIXES) \
-       ext3-largefile.diff extN-2.4.18-exports.diff \
-       ext3-use-after-free.diff ext3-unmount_sync.diff $(wildcard extN.patch-*)
-DISTCLEANFILES = -r $(extN_SOURCES) sed-stamp patch-stamp *.orig *.rej
-SUB=-e "s/ext3/extN/g" -e "s/EXT3/EXTN/g" -e "s/extern __inline__/static inline/"
-
-distclean:
-       cd .. && rm -f $(EXTN_EXTRA)
-
-include $(top_srcdir)/Rules
-
-# Following 2 vars are for buildind outside the source tree.
-extN_orig = $(top_builddir)/$(subdir)/extN.orig
-extN_include_orig = $(top_builddir)/$(subdir)/extN-include.orig
-
-# Create a fresh extN patch.
-# This is for when the patch-stamp target fails for your kernel.
-# Just edit the files until you like them, then do `make diff', and
-# it will create a specialized patch for your particular kernel.
-# Check it in, and the build should work for you without disrupting
-# the other developers.
-# Of course, the ideal is to merge changes so that the default patch
-# set works for nearly everybody.  This is mainly for damage control.
-
-diff:
-       $(RM) extN.patchT
-       l='$(EXTNC)'; for f in $$l; do                                        \
-          echo "$$f";                                                         \
-          (diff -u $(extN_orig)/$$f extN/$$f) >> extN.patchT;                 \
-          test $$? -le 1 || exit 1;                                           \
-       done
-       l='$(EXTNI)'; for f in $$l; do                                        \
-          echo "$$f";                                                         \
-          (diff -u $(extN_include_orig)/$$f $(top_srcdir)/include/linux/$$f)>>extN.patchT;\
-          test $$? -le 1 || exit 1;                                           \
-       done
-       l='$(EXTN_EXTRA)'; for f in $$l; do                                   \
-          f=`echo "$$f" | sed 's%^fs/%%'`;                                    \
-          echo "$$f";                                                         \
-          (cd $(top_srcdir) &&                                                \
-            diff -u /dev/null $$f) >> extN.patchT;                            \
-          test $$? -le 1 || exit 1;                                           \
-       done
-       mv -f extN.patchT $(top_builddir)/$(subdir)/extN.patch-$(RELEASE)
-       echo "Don't forget to add $(srcdir)/extN.patch-$(RELEASE) to CVS!"
-
-.PHONY: diff
-
-# Just do the SUB transformation on all our source files.
-sed-stamp:
-       $(RM) $@
-       rm -rf $(extN_orig) $(extN_include_orig)
-       mkdir $(extN_orig) $(extN_include_orig)
-       list='$(EXTNC)'; for f in $$list; do                                  \
-          echo "creating $(extN_orig)/$$f";                                   \
-          sed $(SUB) $(LINUX)/fs/ext3/$$f > $(extN_orig)/$$f;                 \
-       done
-       list='$(EXTNI)'; for i in $$list; do                                  \
-          s=`echo $$i | sed "s/extN/ext3/"`;                                  \
-          echo "creating $(extN_include_orig)/$$i";                           \
-          sed $(SUB) $(LINUX)/include/linux/$$s > $(extN_include_orig)/$$i;   \
-       done
-       echo timestamp > $@
-
-
-# Patch the kernel files with our ext3 patches.  We need to go through some
-# extra hoops because the include files are in a different tree and because
-# patch likes to make local copies of files with (sym)links when it is patching
-# them.  To avoid this, we copy/patch in the source dir instead of the build
-# dir (if they are different).
-# We also want to preserve the pristine transformed files for the diff target.
-
-
-
-patch-stamp: sed-stamp $(EXTNP)
-       test -e $(top_builddir)/include/linux || mkdir -p $(top_builddir)/include/linux 
-       cp -a $(extN_orig)/* $(top_builddir)/$(subdir)
-       cp -a $(extN_include_orig)/* $(top_builddir)/include/linux
-       test -e $(top_builddir)/fs || ln -s . $(top_builddir)/fs
-       list='$(EXTN_EXTRA)'; for f in $$list; do $(RM) $(top_builddir)/$$f; done
-       if [ -f $(srcdir)/extN.patch-$(RELEASE) ]; then                       \
-         echo "applying patch $(srcdir)/extN.patch-$(RELEASE)";              \
-         (cd $(top_builddir) && patch -p0) < $(srcdir)/extN.patch-$(RELEASE);\
-       else                                                                  \
-         list='$(EXTNP)'; \
-         grep -q "err = extN_mark_inode_dirty" $(extN_orig)/namei.c ||       \
-           list="ext3-use-after-free.diff $$list";                           \
-         sed '/i_version/q' $(extN_orig)/namei.c | tail -2 |                 \
-           grep -q extN_mark_inode_dirty && list="$(EXTN_FIXES) $$list";     \
-         grep -q "if (do_sync_supers)" $(extN_orig)/super.c &&               \
-           list="ext3-unmount_sync.diff $$list";                             \
-         grep -q "ext3_journal_start(inode, 2)" $(extN_orig)/inode.c ||      \
-           list="ext3-largefile.diff $$list";                                \
-         grep -q "EXPORT_SYMBOL(extN_bread)" $(extN_orig)/super.c ||         \
-           list="$$list extN-2.4.18-exports.diff";                           \
-         for p in $$list; do                                                 \
-           echo "applying patch $$p";                                        \
-           sed $(SUB) $(srcdir)/$$p |                                        \
-             (cd $(top_builddir) && patch -p1) || exit $$?;                  \
-         done;                                                               \
-       fi
-       echo timestamp > $@
-
-
-
-
-$(extN_SOURCES) $(EXTNI) $(EXTN_EXTRA): patch-stamp
-
-# Don't distribute any patched files.
-dist-hook:
-       $(RM) $(top_srcdir)/fs
-       list='$(EXTNC)'; for f in $$list; do $(RM) $(distdir)/$$f; done
-       list='$(EXTNI)'; for i in $$list; do                                  \
-         $(RM) $(distdir)/../include/linux/$$i;                              \
-       done
-       list='$(EXTN_EXTRA)'; for f in $$list; do $(RM) $(distdir)/../$$f; done
diff --git a/lustre/extN/ext3-largefile.diff b/lustre/extN/ext3-largefile.diff
deleted file mode 100644 (file)
index db41aab..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-Under rare conditions (filesystem corruption, really) it is possible
-for ext3_dirty_inode() to require _two_ blocks for the transaction: one
-for the inode and one to update the superblock - to set
-EXT3_FEATURE_RO_COMPAT_LARGE_FILE.  This causes the filesystem to go
-BUG.
-
-So reserve an additional block for that eventuality.
-
-
- fs/ext3/inode.c |    2 +-
- 1 files changed, 1 insertion(+), 1 deletion(-)
-
---- 25/fs/ext3/inode.c~ext3-transaction-reserved-blocks        Sat Dec 14 18:28:21 2002
-+++ 25-akpm/fs/ext3/inode.c    Sat Dec 14 18:28:21 2002
-@@ -2698,7 +2698,7 @@ void ext3_dirty_inode(struct inode *inod
-       handle_t *handle;
-       lock_kernel();
--      handle = ext3_journal_start(inode, 1);
-+      handle = ext3_journal_start(inode, 2);
-       if (IS_ERR(handle))
-               goto out;
-       if (current_handle &&
diff --git a/lustre/extN/ext3-unmount_sync.diff b/lustre/extN/ext3-unmount_sync.diff
deleted file mode 100644 (file)
index 1f9b796..0000000
+++ /dev/null
@@ -1,59 +0,0 @@
-From adilger@clusterfs.com Mon Dec  2 10:26:44 2002
-Date: Mon, 2 Dec 2002 10:26:44 -0700
-From: Andreas Dilger <adilger@clusterfs.com>
-To: Lustre LLNL Mailing list <lc-lustre@llnl.gov>,
-       Lustre Development Mailing List <lustre-devel@lists.sourceforge.net>
-Subject: Re: data corrupting bug in 2.4.20 ext3, data=journal
-Message-ID: <20021202102644.H1422@schatzie.adilger.int>
-Mail-Followup-To: Lustre LLNL Mailing list <lc-lustre@llnl.gov>,
-       Lustre Development Mailing List <lustre-devel@lists.sourceforge.net>
-Mime-Version: 1.0
-Content-Type: text/plain; charset=us-ascii
-Content-Disposition: inline
-User-Agent: Mutt/1.2.5.1i
-X-GPG-Key: 1024D/0D35BED6
-X-GPG-Fingerprint: 7A37 5D79 BF1B CECA D44F  8A29 A488 39F5 0D35 BED6
-Status: RO
-Content-Length: 1160
-Lines: 39
-
-Here is the new-improved fix for the ext3 discarding data at umount bug
-discovered late last week.  To be used instead of the previous ext3 fix.
-
-Sadly, this is completely unrelated to the problems Mike is having with
-ext3 under UML, since it is an unmount-time problem.
-
------ Forwarded message from "Stephen C. Tweedie" <sct@redhat.com> -----
-The attached patch seems to fix things for me.
-
-Cheers,
- Stephen
-
-
---- linux-2.4-ext3merge/fs/ext3/super.c.=K0027=.orig   2002-12-02 15:35:13.000000000 +0000
-+++ linux-2.4-ext3merge/fs/ext3/super.c        2002-12-02 15:35:14.000000000 +0000
-@@ -1640,7 +1640,12 @@
-       sb->s_dirt = 0;
-       target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
--      if (do_sync_supers) {
-+      /*
-+       * Tricky --- if we are unmounting, the write really does need
-+       * to be synchronous.  We can detect that by looking for NULL in
-+       * sb->s_root.
-+       */
-+      if (do_sync_supers || !sb->s_root) {
-               unlock_super(sb);
-               log_wait_commit(EXT3_SB(sb)->s_journal, target);
-               lock_super(sb);
-
-
------ End forwarded message -----
-
-Cheers, Andreas
---
-Andreas Dilger
-http://sourceforge.net/projects/ext2resize/
-http://www-mddsp.enel.ucalgary.ca/People/adilger/
-
-
diff --git a/lustre/extN/extN-2.4.18-exports.diff b/lustre/extN/extN-2.4.18-exports.diff
deleted file mode 100644 (file)
index 8780209..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
---- linux-2.4.17/fs/extN/super.c.orig  Fri Dec 21 10:41:55 2001
-+++ linux-2.4.17/fs/extN/super.c       Fri Mar 22 11:00:41 2002
-@@ -1742,7 +1742,7 @@
-       unregister_filesystem(&extN_fs_type);
- }
--EXPORT_NO_SYMBOLS;
-+EXPORT_SYMBOL(extN_bread);
- MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
- MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
diff --git a/lustre/extN/extN-2.4.18-ino_sb_fixup.diff b/lustre/extN/extN-2.4.18-ino_sb_fixup.diff
deleted file mode 100644 (file)
index 37fd692..0000000
+++ /dev/null
@@ -1,33 +0,0 @@
---- ./include/linux/extN_fs.h.orig     Tue May  7 17:06:03 2002
-+++ ./include/linux/extN_fs.h  Tue May  7 17:07:11 2002
-@@ -17,6 +17,8 @@
- #define _LINUX_EXTN_FS_H
- #include <linux/types.h>
-+#include <linux/extN_fs_sb.h>
-+#include <linux/extN_fs_i.h>
- /*
-  * The second extended filesystem constants/structures
-@@ -86,8 +88,8 @@
- #define EXTN_MIN_BLOCK_LOG_SIZE                 10
- #ifdef __KERNEL__
--#define EXTN_SB(sb)   (&((sb)->u.extN_sb))
--#define EXTN_I(inode) (&((inode)->u.extN_i))
-+#define EXTN_SB(sb)   ((struct extN_sb_info *)&((sb)->u.generic_sbp))
-+#define EXTN_I(inode) ((struct extN_inode_info *)&((inode)->u.generic_ip))
- #define EXTN_BLOCK_SIZE(s)            ((s)->s_blocksize)
- #define EXTN_BLOCK_SIZE_BITS(s)               ((s)->s_blocksize_bits)
-@@ -447,7 +447,9 @@
- #define NEXT_ORPHAN(inode) EXTN_I(inode)->i_dtime
- static inline struct inode *orphan_list_entry(struct list_head *l)
- {
--      return list_entry(l, struct inode, u.extN_i.i_orphan);
-+      return ((struct inode *)((char *)l -
-+              (unsigned long)(offsetof(struct inode, u.generic_ip) +
-+                              offsetof(struct extN_inode_info, i_orphan))));
- }
- /*
diff --git a/lustre/extN/extN-san.diff b/lustre/extN/extN-san.diff
deleted file mode 100644 (file)
index 4d0f277..0000000
+++ /dev/null
@@ -1,88 +0,0 @@
---- lustre/extN/inode.orig.c   2002-12-29 18:48:56.000000000 +0800
-+++ lustre/extN/inode.c        2002-12-29 19:17:24.000000000 +0800
-@@ -2728,3 +2728,85 @@
-  * here, in extN_aops_journal_start() to ensure that the forthcoming "see if we
-  * need to extend" test in extN_prepare_write() succeeds.  
-  */
-+
-+/* for each block: 1 ind + 1 dind + 1 tind
-+ * for each block: 3 bitmap blocks
-+ * for each block: 3 group descriptor blocks
-+ * i inode block
-+ * 1 superblock
-+ * 2 * EXTN_SINGLEDATA_TRANS_BLOCKS for the quote files
-+ * ((1+1+1) * 3 * nblocks) + 1 + 1 + 2 * EXTN_SINGLEDATA_TRANS_BLOCKS
-+ *
-+ * XXX assuming:
-+ * (1) fs logic block size == page size
-+ * (2) extN in writeback mode
-+ */
-+static inline int extN_san_write_trans_blocks(int nblocks)
-+{
-+      int ret;
-+      
-+      ret = (1 + 1 + 1) * 3 * nblocks + 1 + 1;
-+
-+#ifdef CONFIG_QUOTA
-+      ret += 2 * EXTN_SINGLEDATA_TRANS_BLOCKS;
-+#endif
-+
-+      return ret;
-+}
-+
-+/* Alloc blocks for an inode, while don't create any buffer/page
-+ * for data I/O; set the inode size if file is extended.
-+ *
-+ * @inode:    target inode
-+ * @blocks:   array of logic block number
-+ * @nblocks:  how many blocks need be alloced
-+ * @newsize:  new filesize we should set
-+ *
-+ * return:    0 success, otherwise failed
-+ *            (*blocks) contains physical block number alloced
-+ *
-+ * XXX this assume the fs block size == page size
-+ */
-+int extN_prep_san_write(struct inode *inode, long *blocks,
-+                      int nblocks, loff_t newsize)
-+{
-+      handle_t *handle;
-+      struct buffer_head bh_tmp;
-+      int needed_blocks;
-+      int i, ret = 0, ret2;
-+
-+      needed_blocks = extN_san_write_trans_blocks(nblocks);
-+
-+      lock_kernel();
-+      handle = extN_journal_start(inode, needed_blocks);
-+      if (IS_ERR(handle)) {
-+              unlock_kernel();
-+              return PTR_ERR(handle);
-+      }
-+      unlock_kernel();
-+
-+      /* alloc blocks one by one */
-+      for (i = 0; i < nblocks; i++) {
-+              ret = extN_get_block_handle(handle, inode, blocks[i],
-+                                              &bh_tmp, 1);
-+              if (ret)
-+                      break;
-+
-+              blocks[i] = bh_tmp.b_blocknr;
-+      }
-+
-+      /* set inode size if needed */
-+      if (!ret && (newsize > inode->i_size)) {
-+              inode->i_size = newsize;
-+              extN_mark_inode_dirty(handle, inode);
-+      }
-+
-+      lock_kernel();
-+      ret2 = extN_journal_stop(handle, inode);
-+      unlock_kernel();
-+
-+      if (!ret)
-+              ret = ret2;
-+      return ret;
-+}
-+EXPORT_SYMBOL(extN_prep_san_write);
diff --git a/lustre/extN/extN-wantedi.diff b/lustre/extN/extN-wantedi.diff
deleted file mode 100644 (file)
index a55aec0..0000000
+++ /dev/null
@@ -1,163 +0,0 @@
---- lustre/extN-clean/namei.c  2002-12-30 05:56:09.000000000 -0500
-+++ lustre/extN/namei.c        2002-12-30 06:29:39.000000000 -0500
-@@ -1224,7 +1224,8 @@
-       if (IS_SYNC(dir))
-               handle->h_sync = 1;
--      inode = extN_new_inode (handle, dir, mode);
-+      inode = extN_new_inode (handle, dir, mode,
-+                              (unsigned long)dentry->d_fsdata);
-       err = PTR_ERR(inode);
-       if (!IS_ERR(inode)) {
-               inode->i_op = &extN_file_inode_operations;
-@@ -1254,7 +1254,8 @@
-       if (IS_SYNC(dir))
-               handle->h_sync = 1;
--      inode = extN_new_inode (handle, dir, mode);
-+      inode = extN_new_inode (handle, dir, mode,
-+                              (unsigned long)dentry->d_fsdata);
-       err = PTR_ERR(inode);
-       if (!IS_ERR(inode)) {
-               init_special_inode(inode, mode, rdev);
-@@ -1286,7 +1286,8 @@
-       if (IS_SYNC(dir))
-               handle->h_sync = 1;
--      inode = extN_new_inode (handle, dir, S_IFDIR | mode);
-+      inode = extN_new_inode (handle, dir, S_IFDIR | mode,
-+                              (unsigned long)dentry->d_fsdata);
-       err = PTR_ERR(inode);
-       if (IS_ERR(inode))
-               goto out_stop;
-@@ -1680,7 +1681,8 @@
-       if (IS_SYNC(dir))
-               handle->h_sync = 1;
--      inode = extN_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
-+      inode = extN_new_inode (handle, dir, S_IFLNK|S_IRWXUGO,
-+                              (unsigned long)dentry->d_fsdata);
-       err = PTR_ERR(inode);
-       if (IS_ERR(inode))
-               goto out_stop;
---- lustre/extN-clean/ialloc.c 2002-12-28 23:56:42.000000000 -0500
-+++ lustre/extN/ialloc.c       2002-12-30 06:29:39.000000000 -0500
-@@ -329,8 +329,8 @@
-  * For other inodes, search forward from the parent directory's block
-  * group to find a free inode.
-  */
--struct inode * extN_new_inode (handle_t *handle,
--                              const struct inode * dir, int mode)
-+struct inode *extN_new_inode(handle_t *handle, const struct inode *dir,
-+                           int mode, unsigned long goal)
- {
-       struct super_block * sb;
-       struct buffer_head * bh;
-@@ -360,6 +361,38 @@
-       lock_super (sb);
-       es = sbi->s_es;
-+
-+      if (goal) {
-+              i = (goal - 1) / EXTN_INODES_PER_GROUP(sb);
-+              j = (goal - 1) % EXTN_INODES_PER_GROUP(sb);
-+              gdp = extN_get_group_desc(sb, i, &bh2);
-+
-+              bitmap_nr = load_inode_bitmap (sb, i);
-+              if (bitmap_nr < 0)
-+                      goto fail;
-+
-+              bh = sbi->s_inode_bitmap[bitmap_nr];
-+
-+              BUFFER_TRACE(bh, "get_write_access");
-+              err = extN_journal_get_write_access(handle, bh);
-+              if (err) goto fail;
-+
-+              if (extN_set_bit(j, bh->b_data)) {
-+                      printk(KERN_ERR "goal inode %lu unavailable\n", goal);
-+                      /* Oh well, we tried. */
-+                      goto repeat;
-+              }
-+
-+              BUFFER_TRACE(bh, "call extN_journal_dirty_metadata");
-+              err = extN_journal_dirty_metadata(handle, bh);
-+              if (err) goto fail;
-+
-+              /* We've shortcircuited the allocation system successfully,
-+               * now finish filling in the inode.
-+               */
-+              goto have_bit_and_group;
-+      }
-+
- repeat:
-       gdp = NULL;
-       i = 0;
-@@ -474,6 +509,7 @@
-               }
-               goto repeat;
-       }
-+have_bit_and_group:
-       j += i * sbi->s_inodes_per_group + 1;
-       if (j < sbi->s_first_ino || j > le32_to_cpu(es->s_inodes_count)) {
-               extN_error (sb, "extN_new_inode",
---- lustre/extN-clean/ioctl.c  2002-12-28 23:56:42.000000000 -0500
-+++ lustre/extN/ioctl.c        2002-12-30 06:29:39.000000000 -0500
-@@ -24,6 +24,31 @@
-       extN_debug ("cmd = %u, arg = %lu\n", cmd, arg);
-       switch (cmd) {
-+      case EXTN_IOC_CREATE_INUM: {
-+              char name[32];
-+              struct dentry *dchild, *dparent;
-+              int rc = 0;
-+
-+              dparent = list_entry(inode->i_dentry.next, struct dentry,
-+                                   d_alias);
-+              snprintf(name, sizeof name, "%lu", arg);
-+              dchild = lookup_one_len(name, dparent, strlen(name));
-+              if (dchild->d_inode) {
-+                      printk(KERN_ERR "%*s/%lu already exists (ino %lu)\n",
-+                             dparent->d_name.len, dparent->d_name.name, arg,
-+                             dchild->d_inode->i_ino);
-+                      rc = -EEXIST;
-+              } else {
-+                      dchild->d_fsdata = (void *)arg;
-+                      rc = vfs_create(inode, dchild, 0644);
-+                      if (rc)
-+                              printk(KERN_ERR "vfs_create: %d\n", rc);
-+                      else if (dchild->d_inode->i_ino != arg)
-+                              rc = -EEXIST;
-+              }
-+              dput(dchild);
-+              return rc;
-+      }
-       case EXTN_IOC_GETFLAGS:
-               flags = ei->i_flags & EXTN_FL_USER_VISIBLE;
-               return put_user(flags, (int *) arg);
---- lustre/include/linux/extN_fs.h~    2002-12-30 06:01:43.000000000 -0500
-+++ lustre/include/linux/extN_fs.h     2002-12-30 06:02:51.000000000 -0500
-@@ -200,6 +200,7 @@
- #define       EXTN_IOC_SETFLAGS               _IOW('f', 2, long)
- #define       EXTN_IOC_GETVERSION             _IOR('f', 3, long)
- #define       EXTN_IOC_SETVERSION             _IOW('f', 4, long)
-+/* EXTN_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
- #define       EXTN_IOC_GETVERSION_OLD         _IOR('v', 1, long)
- #define       EXTN_IOC_SETVERSION_OLD         _IOW('v', 2, long)
- #ifdef CONFIG_JBD_DEBUG
-@@ -632,7 +633,8 @@
- extern int extN_sync_file (struct file *, struct dentry *, int);
- /* ialloc.c */
--extern struct inode * extN_new_inode (handle_t *, const struct inode *, int);
-+extern struct inode * extN_new_inode (handle_t *, const struct inode *, int,
-+                                    unsigned long);
- extern void extN_free_inode (handle_t *, struct inode *);
- extern struct inode * extN_orphan_get (struct super_block *, ino_t);
- extern unsigned long extN_count_free_inodes (struct super_block *);
-@@ -714,4 +716,6 @@
- #endif        /* __KERNEL__ */
-+#define EXTN_IOC_CREATE_INUM                  _IOW('f', 5, long)
-+
- #endif        /* _LINUX_EXTN_FS_H */
index 864df96..7b78c04 100644 (file)
@@ -4,6 +4,7 @@ config.status
 configure
 config.h
 stamp-h
+stamp-h1
 stamp-h.in
 Makefile
 Makefile.in
diff --git a/lustre/include/ioctl.h b/lustre/include/ioctl.h
new file mode 100644 (file)
index 0000000..a4ec8a5
--- /dev/null
@@ -0,0 +1,64 @@
+#ifndef _ASMI386_IOCTL_H
+#define _ASMI386_IOCTL_H
+
+/* ioctl command encoding: 32 bits total, command in lower 16 bits,
+ * size of the parameter structure in the lower 14 bits of the
+ * upper 16 bits.
+ * Encoding the size of the parameter structure in the ioctl request
+ * The highest 2 bits are reserved for indicating the ``access mode''.
+ * NOTE: This limits the max parameter size to 16kB -1 !
+ */
+
+/*
+ * The following is for compatibility across the various Linux
+ * platforms.  The i386 ioctl numbering scheme doesn't really enforce
+ * a type field.  De facto, however, the top 8 bits of the lower 16
+ * bits are indeed used as a type field, so we might just as well make
+ * this explicit here.  Please be sure to use the decoding macros
+ * below from now on.
+ */
+#define _IOC_NRBITS     8
+#define _IOC_TYPEBITS   8
+#define _IOC_SIZEBITS   14
+#define _IOC_DIRBITS    2
+
+#define _IOC_NRMASK     ((1 << _IOC_NRBITS)-1)
+#define _IOC_TYPEMASK   ((1 << _IOC_TYPEBITS)-1)
+#define _IOC_SIZEMASK   ((1 << _IOC_SIZEBITS)-1)
+#define _IOC_DIRMASK    ((1 << _IOC_DIRBITS)-1)
+
+#define _IOC_NRSHIFT    0
+#define _IOC_TYPESHIFT  (_IOC_NRSHIFT+_IOC_NRBITS)
+#define _IOC_SIZESHIFT  (_IOC_TYPESHIFT+_IOC_TYPEBITS)
+#define _IOC_DIRSHIFT   (_IOC_SIZESHIFT+_IOC_SIZEBITS)
+
+/*
+ * Direction bits.
+ */
+#define _IOC_NONE       0U
+#define _IOC_WRITE      1U
+#define _IOC_READ       2U
+
+#define _IOC(dir,type,nr,size) (((dir)  << _IOC_DIRSHIFT) | ((type) << _IOC_TYPESHIFT) | ((nr)   << _IOC_NRSHIFT) | ((size) << _IOC_SIZESHIFT))
+
+/* used to create numbers */
+#define _IO(type,nr)            _IOC(_IOC_NONE,(type),(nr),0)
+#define _IOR(type,nr,size)      _IOC(_IOC_READ,(type),(nr),sizeof(size))
+#define _IOW(type,nr,size)      _IOC(_IOC_WRITE,(type),(nr),sizeof(size))
+#define _IOWR(type,nr,size)     _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size))
+
+/* used to decode ioctl numbers.. */
+#define _IOC_DIR(nr)            (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK)
+#define _IOC_TYPE(nr)           (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK)
+#define _IOC_NR(nr)             (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK)
+#define _IOC_SIZE(nr)           (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK)
+
+/* ...and for the drivers/sound files... */
+
+#define IOC_IN          (_IOC_WRITE << _IOC_DIRSHIFT)
+#define IOC_OUT         (_IOC_READ << _IOC_DIRSHIFT)
+#define IOC_INOUT       ((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT)
+#define IOCSIZE_MASK    (_IOC_SIZEMASK << _IOC_SIZESHIFT)
+#define IOCSIZE_SHIFT   (_IOC_SIZESHIFT)
+
+#endif /* _ASMI386_IOCTL_H */
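
The comment block above describes how an ioctl command word packs the direction, size, type, and number fields into 32 bits. As a quick illustration, the following standalone userspace sketch re-declares the same macros (copied from the header above rather than including it, since its install path is not fixed here) and decodes EXTN_IOC_CREATE_INUM, the ioctl added by the extN-wantedi patch earlier in this commit:

/* Userspace sketch of the _IOC encoding defined above.  The macro
 * definitions are copied from the new ioctl.h; EXTN_IOC_CREATE_INUM is
 * the ioctl introduced by the extN-wantedi patch. */
#include <stdio.h>

#define _IOC_NRBITS     8
#define _IOC_TYPEBITS   8
#define _IOC_SIZEBITS   14
#define _IOC_NRSHIFT    0
#define _IOC_TYPESHIFT  (_IOC_NRSHIFT + _IOC_NRBITS)
#define _IOC_SIZESHIFT  (_IOC_TYPESHIFT + _IOC_TYPEBITS)
#define _IOC_DIRSHIFT   (_IOC_SIZESHIFT + _IOC_SIZEBITS)
#define _IOC_WRITE      1U
#define _IOC(dir,type,nr,size) \
        (((dir) << _IOC_DIRSHIFT) | ((type) << _IOC_TYPESHIFT) | \
         ((nr) << _IOC_NRSHIFT) | ((size) << _IOC_SIZESHIFT))
#define _IOW(type,nr,size)      _IOC(_IOC_WRITE,(type),(nr),sizeof(size))

#define _IOC_NRMASK     ((1 << _IOC_NRBITS) - 1)
#define _IOC_TYPEMASK   ((1 << _IOC_TYPEBITS) - 1)
#define _IOC_SIZEMASK   ((1 << _IOC_SIZEBITS) - 1)
#define _IOC_TYPE(nr)   (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK)
#define _IOC_NR(nr)     (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK)
#define _IOC_SIZE(nr)   (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK)

#define EXTN_IOC_CREATE_INUM    _IOW('f', 5, long)

int main(void)
{
        unsigned int cmd = EXTN_IOC_CREATE_INUM;

        /* Decode the fields back out of the packed command word. */
        printf("cmd=0x%08x type='%c' nr=%u size=%u\n",
               cmd, (char)_IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
        return 0;
}

On a 64-bit build this prints type 'f', number 5, and a size of 8 bytes (sizeof(long)).
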
index 1e57ea4..017d5b6 100644 (file)
 #define LIBLUSTRE_H__
 
 #include <sys/mman.h>
+#ifndef  __CYGWIN__
+#include <stdint.h>
 #include <asm/page.h>
+#else
+#include <sys/types.h>
+#include "ioctl.h"
+#endif
 #include <stdio.h>
 #include <sys/ioctl.h>
-#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
 
 #include <portals/list.h>
 #include <portals/p30.h>
+#include <linux/kp30.h>
 
 /* definitions for liblustre */
 
+#ifdef __CYGWIN__
+
+#define PAGE_SHIFT 12
+#define PAGE_SIZE (1UL << PAGE_SHIFT)
+#define PAGE_MASK (~(PAGE_SIZE-1))
+#define loff_t __u64
+#define ERESTART 2001
+typedef unsigned short umode_t;
+
+#endif
+
 /* always adopt 2.5 definitions */
-#define LINUX_VERSION_CODE 1
-#define KERNEL_VERSION(a,b,c) 0
+#define KERNEL_VERSION(a,b,c) ((a)*100+(b)*10+c)
+#define LINUX_VERSION_CODE (2*200+5*10+0)
 
 static inline void inter_module_put(void *a)
 {
@@ -51,16 +68,7 @@ static inline void inter_module_put(void *a)
 
 extern ptl_handle_ni_t         tcpnal_ni;
 
-static inline void *inter_module_get(char *arg)
-{
-
-        if (strcmp(arg, "tcpnal_ni") == 0 )
-                return &tcpnal_ni;
-        else
-                return NULL;
-
-}
-
+void *inter_module_get(char *arg);
 
 /* cheats for now */
 
@@ -108,6 +116,93 @@ typedef void *read_proc_t;
 typedef void *write_proc_t;
 
 
+/* byteorder */
+#define __swab16(x) \
+({ \
+       __u16 __x = (x); \
+       ((__u16)( \
+               (((__u16)(__x) & (__u16)0x00ffU) << 8) | \
+               (((__u16)(__x) & (__u16)0xff00U) >> 8) )); \
+})
+
+#define __swab32(x) \
+({ \
+       __u32 __x = (x); \
+       ((__u32)( \
+               (((__u32)(__x) & (__u32)0x000000ffUL) << 24) | \
+               (((__u32)(__x) & (__u32)0x0000ff00UL) <<  8) | \
+               (((__u32)(__x) & (__u32)0x00ff0000UL) >>  8) | \
+               (((__u32)(__x) & (__u32)0xff000000UL) >> 24) )); \
+})
+
+#define __swab64(x) \
+({ \
+       __u64 __x = (x); \
+       ((__u64)( \
+               (__u64)(((__u64)(__x) & (__u64)0x00000000000000ffULL) << 56) | \
+               (__u64)(((__u64)(__x) & (__u64)0x000000000000ff00ULL) << 40) | \
+               (__u64)(((__u64)(__x) & (__u64)0x0000000000ff0000ULL) << 24) | \
+               (__u64)(((__u64)(__x) & (__u64)0x00000000ff000000ULL) <<  8) | \
+               (__u64)(((__u64)(__x) & (__u64)0x000000ff00000000ULL) >>  8) | \
+               (__u64)(((__u64)(__x) & (__u64)0x0000ff0000000000ULL) >> 24) | \
+               (__u64)(((__u64)(__x) & (__u64)0x00ff000000000000ULL) >> 40) | \
+               (__u64)(((__u64)(__x) & (__u64)0xff00000000000000ULL) >> 56) )); \
+})
+
+#define __swab16s(x)    __swab16(*(x))
+#define __swab32s(x)    __swab32(*(x))
+#define __swab64s(x)    __swab64(*(x))
+
+#define __LITTLE_ENDIAN__
+#ifdef  __LITTLE_ENDIAN__
+# define le16_to_cpu(x) ((__u16)(x))
+# define cpu_to_le16(x) ((__u16)(x))
+# define le32_to_cpu(x) ((__u32)(x))
+# define cpu_to_le32(x) ((__u32)(x))
+# define le64_to_cpu(x) ((__u64)(x))
+# define cpu_to_le64(x) ((__u64)(x))
+#else
+# define le16_to_cpu(x) __swab16(x)
+# define cpu_to_le16(x) __swab16(x)
+# define le32_to_cpu(x) __swab32(x)
+# define cpu_to_le32(x) __swab32(x)
+# define le64_to_cpu(x) __swab64(x)
+# define cpu_to_le64(x) __swab64(x)
+# error "do more check here!!!"
+#endif
+
+/* bits ops */
+static __inline__ int set_bit(int nr,long * addr)
+{
+       int     mask, retval;
+
+       addr += nr >> 5;
+       mask = 1 << (nr & 0x1f);
+       retval = (mask & *addr) != 0;
+       *addr |= mask;
+       return retval;
+}
+
+static __inline__ int clear_bit(int nr, long * addr)
+{
+       int     mask, retval;
+
+       addr += nr >> 5;
+       mask = 1 << (nr & 0x1f);
+       retval = (mask & *addr) != 0;
+       *addr &= ~mask;
+       return retval;
+}
+
+static __inline__ int test_bit(int nr, long * addr)
+{
+       int     mask;
+
+       addr += nr >> 5;
+       mask = 1 << (nr & 0x1f);
+       return ((mask & *addr) != 0);
+}
+
 /* modules */
 
 struct module {
@@ -144,6 +239,7 @@ extern int ptlrpc_init(void);
 extern int ldlm_init(void);
 extern int osc_init(void);
 extern int lov_init(void);
+extern int mdc_init(void);
 extern int echo_client_init(void);
 
 
@@ -168,21 +264,20 @@ static inline void spin_unlock_bh(spinlock_t *l)
 {
         return;
 }
-static inline void spin_lock_irqrestore(a,b)
+static inline void spin_unlock_irqrestore(spinlock_t *a, long b)
 {
         return;
 }
-static inline void spin_unlock_irqrestore(a,b)
-{
-        return;
-}
-static inline void spin_lock_irqsave(a,b)
+static inline void spin_lock_irqsave(spinlock_t *a, long b)
 {
         return;
 }
 
 #define barrier() do {int a= 1; a++; } while (0)
 
+#define min(x,y) ((x)<(y) ? (x) : (y))
+#define max(x,y) ((x)>(y) ? (x) : (y))
+
 /* registering symbols */
 
 #define ERESTARTSYS ERESTART
@@ -192,18 +287,18 @@ static inline void spin_lock_irqsave(a,b)
 
 static inline void get_random_bytes(void *ptr, int size)
 {
-        static int r;
         int *p = (int *)ptr;
-        int *end = p + (size / sizeof(int));
-        r = rand();
-        while ( p + sizeof(int) < end ) {
-                *p = r;
-                p++;
-        }
+        int i, count = size/sizeof(int);
+
+        for (i = 0; i< count; i++)
+                *p++ = rand();
 }
 
 /* memory */
 
+/* FIXME */
+#define num_physpages (16 * 1024)
+
 static inline int copy_from_user(void *a,void *b, int c)
 {
         memcpy(a,b,c);
@@ -222,26 +317,35 @@ typedef struct {
          int size;
 } kmem_cache_t;
 #define SLAB_HWCACHE_ALIGN 0
-static inline kmem_cache_t *kmem_cache_create(name,objsize,cdum,d,e,f)
+static inline kmem_cache_t *
+kmem_cache_create(const char *name, size_t objsize, size_t cdum,
+                  unsigned long d,
+                  void (*e)(void *, kmem_cache_t *, unsigned long),
+                  void (*f)(void *, kmem_cache_t *, unsigned long))
 {
         kmem_cache_t *c;
         c = malloc(sizeof(*c));
         if (!c)
                 return NULL;
         c->size = objsize;
+        CDEBUG(D_MALLOC, "alloc slab cache %s at %p, objsize %d\n",
+               name, c, (int)objsize);
         return c;
 };
 
 static inline int kmem_cache_destroy(kmem_cache_t *a)
 {
+        CDEBUG(D_MALLOC, "destroy slab cache %p, objsize %u\n", a, a->size);
         free(a);
         return 0;
 }
 #define kmem_cache_validate(a,b) 1
 #define kmem_cache_alloc(cache, prio) malloc(cache->size)
-#define kmem_cache_free(cache, obj) OBD_FREE(obj, cache->size)
-#define PORTAL_SLAB_ALLOC(lock,cache,size) do { lock = kmem_cache_alloc(cache,prio); } while (0)
-#define PORTAL_SLAB_FREE(lock,cache,size) do { lock = kmem_cache_alloc(cache,prio); } while (0)
+#define kmem_cache_free(cache, obj) free(obj)
+
+#define PAGE_CACHE_SIZE PAGE_SIZE
+#define PAGE_CACHE_SHIFT 12
+#define PAGE_CACHE_MASK PAGE_MASK
 
 struct page {
         void *addr;
@@ -251,7 +355,7 @@ struct page {
 #define kmap(page) (page)->addr
 #define kunmap(a) do { int foo = 1; foo++; } while (0)
 
-static inline struct page *alloc_pages(mask,foo)
+static inline struct page *alloc_pages(int mask, unsigned long foo)
 {
         struct page *pg = malloc(sizeof(*pg));
 
@@ -280,29 +384,82 @@ static inline void __free_pages(struct page *pg, int what)
         free(pg);
 }
 
+static inline struct page* __grab_cache_page(int index)
+{
+        struct page *pg = alloc_pages(0, 0);
+
+        if (pg)
+                pg->index = index;
+        return pg;
+}
+
+#define grab_cache_page(index) __grab_cache_page(index)
+#define page_cache_release(page) __free_pages(page, 0)
+
 /* arithmetic */
-#define do_div(a,b) (a)/(b)
+#define do_div(a,b)                     \
+        ({                              \
+                unsigned long ret;      \
+                ret = (a)%(b);          \
+                (a) = (a)/(b);          \
+                (ret);                  \
+        })
+
+/* VFS stuff */
+#define ATTR_MODE       1
+#define ATTR_UID        2
+#define ATTR_GID        4
+#define ATTR_SIZE       8
+#define ATTR_ATIME      16
+#define ATTR_MTIME      32
+#define ATTR_CTIME      64
+#define ATTR_ATIME_SET  128
+#define ATTR_MTIME_SET  256
+#define ATTR_FORCE      512     /* Not a change, but a change it */
+#define ATTR_ATTR_FLAG  1024
+#define ATTR_RAW        2048    /* file system, not vfs will massage attrs */
+#define ATTR_FROM_OPEN  4096    /* called from open path, ie O_TRUNC */
 
-/* dentries / intents */
-struct lookup_intent {
-        void *it_iattr;
+struct iattr {
+        unsigned int    ia_valid;
+        umode_t         ia_mode;
+        uid_t           ia_uid;
+        gid_t           ia_gid;
+        loff_t          ia_size;
+        time_t          ia_atime;
+        time_t          ia_mtime;
+        time_t          ia_ctime;
+        unsigned int    ia_attr_flags;
 };
 
-struct iattr {
-        int mode;
+/* copy from kernel header */
+#define IT_OPEN     (1)
+#define IT_CREAT    (1<<1)
+#define IT_READDIR  (1<<2)
+#define IT_GETATTR  (1<<3)
+#define IT_LOOKUP   (1<<4)
+#define IT_UNLINK   (1<<5)
+
+struct lookup_intent {
+        int it_op;
+        int it_mode;
+        int it_flags;
+        int it_disposition;
+        int it_status;
+        struct iattr *it_iattr;
+        __u64 it_lock_handle[2];
+        int it_lock_mode;
+        void *it_data;
 };
 
 struct dentry {
         int d_count;
 };
-struct file {
-        struct dentry *f_dentry;
-        void *private_data;
-} ;
 
 struct vfsmount {
         void *pwd;
 };
+
 #define cpu_to_le32(x) ((__u32)(x))
 
 /* semaphores */
@@ -327,16 +484,24 @@ struct signal {
         int signal;
 };
 
+struct fs_struct {
+        int umask;
+};
+
 struct task_struct {
+        struct fs_struct *fs;
         int state;
         struct signal pending;
         char comm[32];
         int pid;
+        int fsuid;
+        int fsgid;
+        __u32 cap_effective;
 };
 
 extern struct task_struct *current;
 
-
+#define in_group_p(a) 0 /* FIXME */
 
 #define set_current_state(foo) do { current->state = foo; } while (0)
 
@@ -351,9 +516,10 @@ extern struct task_struct *current;
 #define TASK_UNINTERRUPTIBLE 1
 #define TASK_RUNNING 2
 
+#define in_interrupt() (0)
 
 #define schedule() do { int a; a++; } while (0)
-static inline int schedule_timeout(t)
+static inline int schedule_timeout(signed long t)
 {
         return 0;
 }
@@ -364,7 +530,7 @@ static inline int schedule_timeout(t)
 #define recalc_sigpending(l) do { int a; a++; } while (0)
 #define kernel_thread(l,m,n)
 
-static inline int call_usermodehelper(char *prog, char **argv, char **evnp)
+static inline int call_usermodehelper(char *prog, char **argv, char **evnp, int unknown)
 {
         return 0;
 }
@@ -416,7 +582,11 @@ typedef struct { volatile int counter; } atomic_t;
 #define atomic_add(b,a)  do {(a)->counter += b;} while (0)
 #define atomic_sub(b,a)  do {(a)->counter -= b;} while (0)
 
-#define LBUG() do { sleep(1000000); } while (0)
+#define LBUG()                                                          \
+        do {                                                            \
+                printf("!!!LBUG at %s:%d\n", __FILE__, __LINE__);       \
+                sleep(1000000);                                         \
+        } while (0)
 
 #include <linux/obd_support.h>
 #include <linux/lustre_idl.h>
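
Among the userspace substitutes liblustre.h gains above are the __swab16/32/64 byte-order helpers and a do_div() that now updates its dividend in place and returns the remainder (the previous definition returned only the quotient). The sketch below copies the __swab32() and do_div() definitions verbatim from the diff and exercises them; the sample values are arbitrary, and GCC is assumed because the macros use statement expressions:

/* Standalone sketch of two of the userspace helpers defined above:
 * __swab32() for byte-order conversion and do_div(), which divides in
 * place and hands back the remainder. */
#include <stdio.h>

typedef unsigned int __u32;
typedef unsigned long long __u64;

#define __swab32(x) \
({ \
        __u32 __x = (x); \
        ((__u32)( \
                (((__u32)(__x) & (__u32)0x000000ffUL) << 24) | \
                (((__u32)(__x) & (__u32)0x0000ff00UL) <<  8) | \
                (((__u32)(__x) & (__u32)0x00ff0000UL) >>  8) | \
                (((__u32)(__x) & (__u32)0xff000000UL) >> 24) )); \
})

#define do_div(a,b)                     \
        ({                              \
                unsigned long ret;      \
                ret = (a)%(b);          \
                (a) = (a)/(b);          \
                (ret);                  \
        })

int main(void)
{
        __u32 v = 0x12345678;
        __u64 bytes = 1000003;          /* arbitrary example value */
        unsigned long rem;

        printf("swab32(0x%08x) = 0x%08x\n", v, __swab32(v));

        /* do_div() leaves the quotient in 'bytes' and returns the
         * remainder, matching the kernel calling convention. */
        rem = do_div(bytes, 4096);
        printf("quotient=%llu remainder=%lu\n", bytes, rem);
        return 0;
}
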
index d0060fc..5ce5e98 100644 (file)
@@ -36,7 +36,7 @@
 #endif
 
 struct lprocfs_vars {
-        char *name;
+        const char   *name;
         read_proc_t *read_fptr;
         write_proc_t *write_fptr;
         void *data;
@@ -47,11 +47,121 @@ struct lprocfs_static_vars {
         struct lprocfs_vars *obd_vars;
 };
 
+/* Lprocfs counters can be configured using the enum bit masks below.
+ *
+ * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already
+ * protects this counter from concurrent updates. If not specified,
+ * lprocfs uses an internal per-counter lock variable. External locks are
+ * not used to protect counter increments, but are used to protect
+ * counter readout and resets.
+ *
+ * LPROCFS_CNTR_AVGMINMAX indicates multi-valued counter samples
+ * (i.e. the counter can be incremented by more than "1"). When specified,
+ * the counter maintains min, max and sum in addition to a simple
+ * invocation count. This allows averages to be computed.
+ * If not specified, the counter is an increment-by-1 counter.
+ * min, max, sum, etc. are not maintained.
+ *
+ * LPROCFS_CNTR_STDDEV indicates that the counter should track sum of
+ * squares (for multi-valued counter samples only). This allows
+ * external computation of standard deviation, but involves a 64-bit
+ * multiply per counter increment.
+ */
+
+enum {
+        LPROCFS_CNTR_EXTERNALLOCK = 1,
+        LPROCFS_CNTR_AVGMINMAX    = 2,
+        LPROCFS_CNTR_STDDEV       = 4,
+};
+
+struct lprocfs_counter {
+        union {
+                spinlock_t    internal; /* when there is no external lock */
+                spinlock_t   *external; /* external lock, when available */
+        } l;
+        unsigned int  config;
+        __u64         count;
+        __u64         sum;
+        __u64         min;
+        __u64         max;
+        __u64         sumsquare;
+        const char    *name;   /* must be static */
+        const char    *units;  /* must be static */
+};
+
+
+struct lprocfs_counters {
+        unsigned int           num;
+        unsigned int           padto8byteboundary;
+        struct lprocfs_counter cntr[0];
+};
+
+
 /* class_obd.c */
 extern struct proc_dir_entry *proc_lustre_root;
-
+struct obd_device;
 
 #ifdef LPROCFS
+
+/* Two optimized LPROCFS counter increment macros are provided:
+ *     LPROCFS_COUNTER_INCR(cntr, value) - use for multi-valued counters
+ *     LPROCFS_COUNTER_INCBY1(cntr) - optimized for by-one counters
+ * Counter data layout allows config flag, counter lock and the
+ * count itself to reside within a single cache line.
+ */
+
+#define LPROCFS_COUNTER_INCR(cntr, value)                         \
+        do {                                                      \
+               struct lprocfs_counter *c = (cntr);                \
+               LASSERT(c != NULL);                                \
+               if (!(c->config & LPROCFS_CNTR_EXTERNALLOCK))      \
+                     spin_lock(&c->l.internal);                   \
+               c->count++;                                        \
+               if (c->config & LPROCFS_CNTR_AVGMINMAX) {          \
+                      __u64 val = (__u64) (value);                \
+                      c->sum += val;                              \
+                      if (c->config & LPROCFS_CNTR_STDDEV)        \
+                         c->sumsquare += (val*val);               \
+                      if (val < c->min) c->min = val;             \
+                      if (val > c->max) c->max = val;             \
+               }                                                  \
+               if (!(c->config & LPROCFS_CNTR_EXTERNALLOCK))      \
+                      spin_unlock(&c->l.internal);                \
+      } while (0)
+
+#define LPROCFS_COUNTER_INCBY1(cntr)                              \
+        do {                                                      \
+               struct lprocfs_counter *c = (cntr);                \
+               LASSERT(c != NULL);                                \
+               if (!(c->config & LPROCFS_CNTR_EXTERNALLOCK))      \
+                     spin_lock(&c->l.internal);                   \
+               c->count++;                                        \
+               if (!(c->config & LPROCFS_CNTR_EXTERNALLOCK))      \
+                      spin_unlock(&c->l.internal);                \
+      } while (0)
+
+#define LPROCFS_COUNTER_INIT(cntr, conf, lck, nam, un)                 \
+        do {                                                           \
+               struct lprocfs_counter *c = (cntr);                     \
+               LASSERT(c != NULL);                                     \
+               memset(c, 0, sizeof(struct lprocfs_counter));           \
+               if (conf & LPROCFS_CNTR_EXTERNALLOCK) c->l.external = (lck); \
+               else spin_lock_init(&c->l.internal);                    \
+               c->config = conf;                                       \
+               c->min = (~(__u64)0);                                   \
+               c->name = (nam);                                        \
+               c->units = (un);                                        \
+        } while (0)
+
+extern struct lprocfs_counters* lprocfs_alloc_counters(unsigned int num);
+extern void lprocfs_free_counters(struct lprocfs_counters* cntrs);
+extern int lprocfs_alloc_obd_counters(struct obd_device *obddev,
+                                      unsigned int num_private_counters);
+extern void lprocfs_free_obd_counters(struct obd_device *obddev);
+extern int lprocfs_register_counters(struct proc_dir_entry *root,
+                                     const char* name,
+                                     struct lprocfs_counters *cntrs);
+
 #define LPROCFS_INIT_MULTI_VARS(array, size)                              \
 void lprocfs_init_multi_vars(unsigned int idx,                            \
                              struct lprocfs_static_vars *x)               \
@@ -71,7 +181,7 @@ void lprocfs_init_vars(struct lprocfs_static_vars *x)  \
 }                                                      \
 
 extern void lprocfs_init_vars(struct lprocfs_static_vars *var);
-extern void lprocfs_init_multi_vars(unsigned int idx, 
+extern void lprocfs_init_multi_vars(unsigned int idx,
                                     struct lprocfs_static_vars *var);
 /* lprocfs_status.c */
 extern int lprocfs_add_vars(struct proc_dir_entry *root,
@@ -85,7 +195,6 @@ extern struct proc_dir_entry *lprocfs_register(const char *name,
 
 extern void lprocfs_remove(struct proc_dir_entry *root);
 
-struct obd_device;
 extern int lprocfs_obd_attach(struct obd_device *dev, struct lprocfs_vars *list);
 extern int lprocfs_obd_detach(struct obd_device *dev);
 
@@ -119,18 +228,44 @@ extern int lprocfs_rd_filesfree(char *page, char **start, off_t off,
 extern int lprocfs_rd_filegroups(char *page, char **start, off_t off,
                                  int count, int *eof, struct statfs *sfs);
 
-#define DEFINE_LPROCFS_STATFS_FCT(fct_name, get_statfs_fct)      \
-int fct_name(char *page, char **start, off_t off,                \
-             int count, int *eof, void *data)                    \
-{                                                                \
-        struct statfs sfs;                                       \
-        int rc = get_statfs_fct((struct obd_device*)data, &sfs); \
-        return (rc==0                                            \
-                ? lprocfs_##fct_name (page, start, off, count, eof, &sfs) \
-                : rc);                                       \
+/* lprocfs_status.c: counter read/write functions */
+struct file;
+extern int lprocfs_counter_read(char *page, char **start, off_t off,
+                                int count, int *eof, void *data);
+extern int lprocfs_counter_write(struct file *file, const char *buffer,
+                                 unsigned long count, void *data);
+
+#define DEFINE_LPROCFS_STATFS_FCT(fct_name, get_statfs_fct)               \
+int fct_name(char *page, char **start, off_t off,                         \
+             int count, int *eof, void *data)                             \
+{                                                                         \
+        struct statfs sfs;                                                \
+        int rc = get_statfs_fct((struct obd_device*)data, &sfs);          \
+        return (rc == 0 ?                                                 \
+                lprocfs_##fct_name (page, start, off, count, eof, &sfs) : \
+                rc);                                                      \
 }
 
 #else
+/* LPROCFS is not defined */
+#define LPROCFS_COUNTER_INCR(cntr, value)
+#define LPROCFS_COUNTER_INCBY1(cntr)
+#define LPROCFS_COUNTER_INIT(cntr, conf, lock, nam, un)
+
+static inline struct lprocfs_counters* lprocfs_alloc_counters(unsigned int num)
+{ return NULL; }
+static inline void lprocfs_free_counters(struct lprocfs_counters* cntrs)
+{ return; }
+
+static inline int lprocfs_register_counters(struct proc_dir_entry *root,
+                                            const char* name,
+                                            struct lprocfs_counters *cntrs)
+{ return 0; }
+static inline int lprocfs_alloc_obd_counters(struct obd_device *obddev,
+                                             unsigned int num_private_counters)
+{ return 0; }
+static inline void lprocfs_free_obd_counters(struct obd_device *obddev)
+{ return; }
 
 static inline struct proc_dir_entry *
 lprocfs_register(const char *name, struct proc_dir_entry *parent,
@@ -181,6 +316,13 @@ int lprocfs_rd_filesfree(char *page, char **start, off_t off,
 static inline
 int lprocfs_rd_filegroups(char *page, char **start, off_t off,
                           int count, int *eof, struct statfs *sfs) { return 0; }
+static inline
+int lprocfs_counter_read(char *page, char **start, off_t off,
+                         int count, int *eof, void *data) { return 0; }
+struct file;
+static inline
+int lprocfs_counter_write(struct file *file, const char *buffer,
+                          unsigned long count, void *data) { return 0; }
 
 #define DEFINE_LPROCFS_STATFS_FCT(fct_name, get_statfs_fct)  \
 int fct_name(char *page, char **start, off_t off,            \
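
The lprocfs counter support added above records, per counter, an invocation count plus (for LPROCFS_CNTR_AVGMINMAX counters) sum, min, max, and optionally a sum of squares when LPROCFS_CNTR_STDDEV is set. As a minimal userspace sketch of that bookkeeping, the code below mirrors what LPROCFS_COUNTER_INIT and LPROCFS_COUNTER_INCR maintain; the counter name, units, and sample values are invented for illustration, and the per-counter/external spinlock handling is omitted:

/* Userspace sketch of the statistics LPROCFS_COUNTER_INCR() keeps for
 * an AVGMINMAX counter.  Locking (internal or external spinlock) is
 * left out; only the count/sum/min/max/sumsquare updates are shown. */
#include <stdio.h>

typedef unsigned long long __u64;

enum {
        LPROCFS_CNTR_AVGMINMAX = 2,
        LPROCFS_CNTR_STDDEV    = 4,
};

struct demo_counter {
        unsigned int config;
        __u64 count, sum, min, max, sumsquare;
        const char *name, *units;
};

static void demo_counter_init(struct demo_counter *c, unsigned int conf,
                              const char *name, const char *units)
{
        /* Mirrors LPROCFS_COUNTER_INIT: zero everything, min starts at ~0. */
        *c = (struct demo_counter){ .config = conf, .min = ~(__u64)0,
                                    .name = name, .units = units };
}

static void demo_counter_incr(struct demo_counter *c, __u64 value)
{
        c->count++;
        if (c->config & LPROCFS_CNTR_AVGMINMAX) {
                c->sum += value;
                if (c->config & LPROCFS_CNTR_STDDEV)
                        c->sumsquare += value * value;
                if (value < c->min) c->min = value;
                if (value > c->max) c->max = value;
        }
}

int main(void)
{
        struct demo_counter c;
        __u64 samples[] = { 4096, 65536, 8192 };   /* hypothetical I/O sizes */
        unsigned int i;

        demo_counter_init(&c, LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
                          "write_bytes", "bytes");
        for (i = 0; i < 3; i++)
                demo_counter_incr(&c, samples[i]);

        printf("%s: count=%llu sum=%llu min=%llu max=%llu avg=%llu %s\n",
               c.name, c.count, c.sum, c.min, c.max, c.sum / c.count, c.units);
        return 0;
}
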
diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h
new file mode 100644 (file)
index 0000000..4275a10
--- /dev/null
@@ -0,0 +1,76 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef _COMPAT25_H
+#define _COMPAT25_H
+
+#include <linux/portals_compat25.h>
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
+#define KDEVT_VAL(dev, val)         dev.value = 0               
+#else
+#define KDEVT_VAL(dev, val)         dev = 0               
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
+# define PGCACHE_WRLOCK(mapping)          write_lock(&mapping->page_lock)
+# define PGCACHE_WRUNLOCK(mapping)        write_unlock(&mapping->page_lock)
+#else
+# define PGCACHE_WRLOCK(mapping)          spin_lock(&pagecache_lock)
+# define PGCACHE_WRUNLOCK(mapping)        spin_unlock(&pagecache_lock)
+#endif
+
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
+# define filemap_fdatasync(mapping)       filemap_fdatawrite(mapping)
+#endif
+
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
+# define TryLockPage(page)                TestSetPageLocked(page)
+#endif
+
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
+# define Page_Uptodate(page)              PageUptodate(page)
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
+# define USERMODEHELPER(path, argv, envp) call_usermodehelper(path, argv, envp, 0)
+#else
+# define USERMODEHELPER(path, argv, envp) call_usermodehelper(path, argv, envp)
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
+# define LL_CHECK_DIRTY(sb)              do { }while(0)
+#else
+# define LL_CHECK_DIRTY(sb)              ll_check_dirty(sb)
+#endif
+
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+#define  rb_node_s rb_node
+#define  rb_root_s rb_root
+typedef struct rb_root_s rb_root_t;
+typedef struct rb_node_s rb_node_t;
+#endif
+
+#endif /* _COMPAT25_H */
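
lustre_compat25.h above selects macro bodies at preprocessing time by comparing LINUX_VERSION_CODE against KERNEL_VERSION(2,5,0), for example USERMODEHELPER() papering over the extra argument that call_usermodehelper() takes on 2.5. A minimal sketch of the pattern follows, with the version values hard-coded for illustration (in a real kernel build they come from <linux/version.h>):

/* Sketch of the version-gating pattern used by lustre_compat25.h:
 * pick one macro body or another based on LINUX_VERSION_CODE. */
#include <stdio.h>

#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))
#define LINUX_VERSION_CODE    KERNEL_VERSION(2,5,0)   /* pretend 2.5 kernel */

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
# define USERMODEHELPER_DESC "4-argument call_usermodehelper (2.5 API)"
#else
# define USERMODEHELPER_DESC "3-argument call_usermodehelper (2.4 API)"
#endif

int main(void)
{
        printf("Selected: %s\n", USERMODEHELPER_DESC);
        return 0;
}
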
index 70e7e87..c2a54b9 100644 (file)
@@ -7,13 +7,14 @@
 #define _LUSTRE_DLM_H__
 
 #ifdef __KERNEL__
-#include <linux/proc_fs.h>
+# include <linux/proc_fs.h>
 #endif 
 
 #include <linux/lustre_lib.h>
 #include <linux/lustre_net.h>
 #include <linux/lustre_import.h>
 #include <linux/lustre_handles.h>
+#include <linux/lustre_export.h> /* for obd_export, for LDLM_DEBUG */
 
 struct obd_ops;
 struct obd_device;
@@ -26,11 +27,9 @@ typedef enum {
         ELDLM_LOCK_CHANGED = 300,
         ELDLM_LOCK_ABORTED = 301,
         ELDLM_LOCK_REPLACED = 302,
-        ELDLM_LOCK_MATCHED = 303,
 
         ELDLM_NAMESPACE_EXISTS = 400,
-        ELDLM_BAD_NAMESPACE    = 401,
-        ELDLM_GETATTR_ERROR    = 402
+        ELDLM_BAD_NAMESPACE    = 401
 } ldlm_error_t;
 
 #define LDLM_NAMESPACE_SERVER 0
@@ -56,10 +55,14 @@ typedef enum {
 
 #define LDLM_FL_INTENT_ONLY    (1 << 9) /* don't grant lock, just do intent */
 #define LDLM_FL_LOCAL_ONLY     (1 << 10) /* see ldlm_cli_cancel_unused */
-#define LDLM_FL_NO_CALLBACK    (1 << 11) /* see ldlm_cli_cancel_unused */
+
+/* don't run the cancel callback under ldlm_cli_cancel_unused */
+#define LDLM_FL_NO_CALLBACK    (1 << 11)
+
 #define LDLM_FL_HAS_INTENT     (1 << 12) /* lock request has intent */
 #define LDLM_FL_CANCELING      (1 << 13) /* lock cancel has already been sent */
 #define LDLM_FL_LOCAL          (1 << 14) // a local lock (ie, no srv/cli split)
+#define LDLM_FL_WARN           (1 << 15) /* see ldlm_cli_cancel_unused */
 
 /* The blocking callback is overloaded to perform two functions.  These flags
  * indicate which operation should be performed. */
@@ -146,9 +149,8 @@ struct ldlm_lock;
 typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock,
                                       struct ldlm_lock_desc *new, void *data,
                                       int flag);
-typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, int flags, void *data);
-typedef int (*ldlm_granted_callback)(struct ldlm_lock *,
-                                     struct lustre_msg *, int offset);
+typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, int flags,
+                                        void *data);
 
 struct ldlm_lock {
         struct portals_handle l_handle; // must be first in the structure
@@ -168,14 +170,12 @@ struct ldlm_lock {
 
         ldlm_completion_callback l_completion_ast;
         ldlm_blocking_callback   l_blocking_ast;
-        ldlm_granted_callback l_granted_cb;
 
         struct obd_export    *l_export;
         struct lustre_handle *l_connh;
         __u32                 l_flags;
         struct lustre_handle  l_remote_handle;
         void                 *l_data;
-        void                 *l_cp_data;
         struct ldlm_extent    l_extent;
         __u32                 l_version[RES_VERSION_SIZE];
 
@@ -233,12 +233,6 @@ struct ldlm_ast_work {
         int w_datalen;
 };
 
-/* Per-export ldlm state. */
-struct ldlm_export_data {
-        struct list_head       led_held_locks; /* protected by namespace lock */
-        struct obd_import      led_import;
-};
-
 extern struct obd_ops ldlm_obd_ops;
 
 extern char *ldlm_lockname[];
@@ -250,8 +244,8 @@ do {                                                                          \
         if (lock->l_resource == NULL) {                                       \
                 CDEBUG(level, "### " format                                   \
                        " ns: \?\? lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "\
-                       "res: \?\? rrc=\?\? type: \?\?\? remote: "LPX64")\n"   \
-                       , ## a, lock, lock->l_handle.h_cookie,                 \
+                       "res: \?\? rrc=\?\? type: \?\?\? remote: "             \
+                       LPX64"\n" , ## a, lock, lock->l_handle.h_cookie,       \
                        atomic_read(&lock->l_refc),                            \
                        lock->l_readers, lock->l_writers,                      \
                        ldlm_lockname[lock->l_granted_mode],                   \
@@ -281,7 +275,8 @@ do {                                                                          \
                 CDEBUG(level, "### " format                                   \
                        " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "  \
                        "res: "LPU64"/"LPU64" rrc: %d type: %s remote: "LPX64  \
-                       "\n" , ## a, lock->l_resource->lr_namespace->ns_name,  \
+                       "\n" , ## a,                                           \
+                       lock->l_resource->lr_namespace->ns_name,               \
                        lock, lock->l_handle.h_cookie,                         \
                        atomic_read (&lock->l_refc),                           \
                        lock->l_readers, lock->l_writers,                      \
@@ -342,7 +337,7 @@ void ldlm_unregister_intent(void);
 void ldlm_lock2handle(struct ldlm_lock *lock, struct lustre_handle *lockh);
 struct ldlm_lock *__ldlm_handle2lock(struct lustre_handle *, int flags);
 void ldlm_cancel_callback(struct ldlm_lock *);
-int ldlm_lock_set_data(struct lustre_handle *, void *data, void *cp_data);
+int ldlm_lock_set_data(struct lustre_handle *, void *data);
 void ldlm_lock_remove_from_lru(struct ldlm_lock *);
 struct ldlm_lock *ldlm_handle2lock_ns(struct ldlm_namespace *,
                                       struct lustre_handle *);
@@ -380,11 +375,11 @@ int ldlm_lock_match(struct ldlm_namespace *ns, int flags, struct ldlm_res_id *,
 struct ldlm_lock *
 ldlm_lock_create(struct ldlm_namespace *ns,
                  struct lustre_handle *parent_lock_handle, struct ldlm_res_id,
-                 __u32 type, ldlm_mode_t mode, void *data, void *cp_data);
+                 __u32 type, ldlm_mode_t, ldlm_blocking_callback,
+                 void *data);
 ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **,
                                void *cookie, int cookie_len, int *flags,
-                               ldlm_completion_callback completion,
-                               ldlm_blocking_callback blocking);
+                               ldlm_completion_callback completion);
 struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
                                         int *flags);
 void ldlm_lock_cancel(struct ldlm_lock *lock);
@@ -444,7 +439,6 @@ int ldlm_cli_enqueue(struct lustre_handle *conn,
                      ldlm_completion_callback completion,
                      ldlm_blocking_callback callback,
                      void *data,
-                     void *cp_data,
                      struct lustre_handle *lockh);
 int ldlm_match_or_enqueue(struct lustre_handle *connh,
                           struct ptlrpc_request *req,
@@ -458,15 +452,13 @@ int ldlm_match_or_enqueue(struct lustre_handle *connh,
                           ldlm_completion_callback completion,
                           ldlm_blocking_callback callback,
                           void *data,
-                          void *cp_data,
                           struct lustre_handle *lockh);
 int ldlm_server_ast(struct lustre_handle *lockh, struct ldlm_lock_desc *new,
                     void *data, __u32 data_len);
 int ldlm_cli_convert(struct lustre_handle *, int new_mode, int *flags);
 int ldlm_cli_cancel(struct lustre_handle *lockh);
 int ldlm_cli_cancel_unused(struct ldlm_namespace *, struct ldlm_res_id *,
-                           int flags);
-int ldlm_cancel_lru(struct ldlm_namespace *ns);
+                           int flags, void *opaque);
 
 /* mds/handler.c */
 /* This has to be here because recurisve inclusion sucks. */
index 694bd3e..6939a95 100644 (file)
 #define __EXPORT_H
 
 #include <linux/lustre_idl.h>
-#include <linux/lustre_dlm.h>
-#include <linux/lustre_mds.h>
 #include <linux/obd_filter.h>
 
+struct mds_client_data;
+
+struct mds_export_data {
+        struct list_head        med_open_head;
+        spinlock_t              med_open_lock;
+        struct mds_client_data *med_mcd;
+        int                     med_off;
+};
+
+struct ldlm_export_data {
+        struct list_head       led_held_locks; /* protected by namespace lock */
+        struct obd_import     *led_import;
+};
+
 struct lov_export_data {
         spinlock_t       led_lock;
         struct list_head led_open_head;
@@ -26,13 +38,17 @@ struct ec_export_data { /* echo client */
 };
 
 struct obd_export {
-        __u64                     exp_cookie;
+        struct portals_handle     exp_handle;
+        atomic_t                  exp_refcount;
         struct obd_uuid           exp_client_uuid;
         struct list_head          exp_obd_chain;
-        struct list_head          exp_conn_chain;
         struct obd_device        *exp_obd;
         struct ptlrpc_connection *exp_connection;
         struct ldlm_export_data   exp_ldlm_data;
+        struct ptlrpc_request    *exp_outstanding_reply;
+        time_t                    exp_last_request_time;
+        spinlock_t                exp_lock; /* protects flags int below */
+        int                       exp_failed:1, exp_failover:1;
         union {
                 struct mds_export_data    eu_mds_data;
                 struct filter_export_data eu_filter_data;
index 6b0cbfa..f736d4b 100644 (file)
@@ -44,7 +44,7 @@ struct fsfilt_operations {
         void   *(* fs_start)(struct inode *inode, int op);
         void   *(* fs_brw_start)(int objcount, struct fsfilt_objinfo *fso,
                                  int niocount, struct niobuf_remote *nb);
-        int     (* fs_commit)(struct inode *inode, void *handle);
+        int     (* fs_commit)(struct inode *inode, void *handle,int force_sync);
         int     (* fs_setattr)(struct dentry *dentry, void *handle,
                                struct iattr *iattr);
         int     (* fs_set_md)(struct inode *inode, void *handle, void *md,
@@ -79,22 +79,25 @@ extern void fsfilt_put_ops(struct fsfilt_operations *fs_ops);
 static inline void *fsfilt_start(struct obd_device *obd,
                                  struct inode *inode, int op)
 {
-        ENTRY;
-        return obd->obd_fsops->fs_start(inode, op);
+        void *handle = obd->obd_fsops->fs_start(inode, op);
+        CDEBUG(D_HA, "starting handle %p\n", handle);
+        return handle;
 }
 
 static inline void *fsfilt_brw_start(struct obd_device *obd, int objcount,
                                      struct fsfilt_objinfo *fso, int niocount,
                                      struct niobuf_remote *nb)
 {
-        return obd->obd_fsops->fs_brw_start(objcount, fso, niocount, nb);
+        void *handle = obd->obd_fsops->fs_brw_start(objcount, fso, niocount,nb);
+        CDEBUG(D_HA, "starting handle %p\n", handle);
+        return handle;
 }
 
 static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode,
-                                void *handle)
+                                void *handle, int force_sync)
 {
-        return obd->obd_fsops->fs_commit(inode, handle);
-        EXIT;
+        CDEBUG(D_HA, "committing handle %p\n", handle);
+        return obd->obd_fsops->fs_commit(inode, handle, force_sync);
 }
 
 static inline int fsfilt_setattr(struct obd_device *obd, struct dentry *dentry,
index 87b0bf3..fffbd60 100644 (file)
@@ -5,60 +5,21 @@
 #ifndef _LUSTRE_HA_H
 #define _LUSTRE_HA_H
 
-#define LUSTRE_HA_NAME "ptlrpc"
-
-struct recovd_data;
-struct recovd_obd;
 struct obd_import;
-struct ptlrpc_connection;
-
-/* rd_phase/rd_next_phase values */
-#define RD_IDLE              0
-#define RD_TROUBLED          1
-#define RD_PREPARING         2
-#define RD_PREPARED          3
-#define RD_RECOVERING        4
-#define RD_RECOVERED         5
-#define RD_FAILED            6
-
-/* recovd_state values */
-#define RECOVD_READY             1
-#define RECOVD_STOPPING          2  /* how cleanup tells recovd to quit */
-#define RECOVD_STOPPED           4  /* after recovd has stopped */
-
-#define PTLRPC_RECOVD_PHASE_PREPARE  1
-#define PTLRPC_RECOVD_PHASE_RECOVER  2
-#define PTLRPC_RECOVD_PHASE_FAILURE  3
-#define PTLRPC_RECOVD_PHASE_NOTCONN  4
-
-typedef int (*ptlrpc_recovery_cb_t)(struct recovd_data *, int);
-
-struct recovd_data {
-        /* you must hold recovd->recovd_lock when touching rd_managed_chain */
-        struct list_head     rd_managed_chain;
-        ptlrpc_recovery_cb_t rd_recover;
-        struct recovd_obd   *rd_recovd;
-        __u32                rd_phase;
-        __u32                rd_next_phase;
-        __u32                rd_flags;
-};
-
-void recovd_conn_fail(struct ptlrpc_connection *conn);
-void recovd_conn_manage(struct ptlrpc_connection *conn, struct recovd_obd *mgr,
-                        ptlrpc_recovery_cb_t recover);
-void recovd_conn_unmanage(struct ptlrpc_connection *conn);
-void recovd_conn_fixed(struct ptlrpc_connection *conn);
-int recovd_setup(struct recovd_obd *mgr);
-int recovd_cleanup(struct recovd_obd *mgr);
-
-extern struct recovd_obd *ptlrpc_recovd;
+struct obd_export;
+struct obd_device;
 struct ptlrpc_request;
 
-int ptlrpc_run_recovery_upcall(struct ptlrpc_connection *conn);
-int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc,
+void ptlrpc_run_failed_import_upcall(struct obd_import *imp);
+void ptlrpc_run_recovery_over_upcall(struct obd_device *obd);
+int ptlrpc_reconnect_import(struct obd_import *imp,
                             struct ptlrpc_request **reqptr);
 int ptlrpc_replay(struct obd_import *imp);
 int ptlrpc_resend(struct obd_import *imp);
 void ptlrpc_free_committed(struct obd_import *imp);
 void ptlrpc_wake_delayed(struct obd_import *imp);
+int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid);
+int ptlrpc_set_import_active(struct obd_import *imp, int active);
+void ptlrpc_fail_import(struct obd_import *imp, int generation);
+void ptlrpc_fail_export(struct obd_export *exp);
 #endif
index b99d996..b3acada 100644 (file)
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
  * (Un)packing of OST requests
+ *
+ * We assume all nodes are either little-endian or big-endian, and we
+ * always send messages in the sender's native format.  The receiver
+ * detects the message format by checking the 'magic' field of the message
+ * (see lustre_msg_swabbed() below).
+ *
+ * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines,
+ * implemented either here, inline (trivial implementations) or in
+ * ptlrpc/pack_generic.c.  These 'swabbers' convert the type from "other"
+ * endian, in-place in the message buffer.
+ * 
+ * A swabber takes a single pointer argument.  The caller must already have
+ * verified that the length of the message buffer >= sizeof (type).  
+ *
+ * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine
+ * may be defined that swabs just the variable part, after the caller has
+ * verified that the message buffer is large enough.
  */
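/*
 * Editor's sketch (not part of this patch): one way the convention described
 * above might be used on the receive side.  lustre_msg_swabbed() and
 * lustre_swab_ost_body() are declared later in this header; the wrapper
 * below is illustrative only and assumes the buffer length has already been
 * checked against sizeof(struct ost_body).
 */
#if 0   /* illustration only */
static void example_unpack_ost_body(struct lustre_msg *msg, struct ost_body *b)
{
        if (lustre_msg_swabbed(msg))            /* peer has the other endianness */
                lustre_swab_ost_body(b);        /* swab the fields in place */
        /* otherwise the body already arrived in our native byte order */
}
#endif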
 
 #ifndef _LUSTRE_IDL_H_
 # include <linux/types.h>
 # include <linux/list.h>
 # include <linux/string.h> /* for strncpy, below */
+# include <asm/byteorder.h>
+#else
+#ifdef __CYGWIN__
+# include <sys/types.h>
 #else
-# define __KERNEL__
 # include <asm/types.h>
-# include <linux/list.h>
-# undef __KERNEL__
 # include <stdint.h>
 #endif
+# include <portals/list.h>
+#endif
 /*
  * this file contains all data structures used in Lustre interfaces:
  * - obdo and obd_request records
@@ -52,12 +72,19 @@ struct obd_uuid {
         __u8 uuid[37];
 };
 
+static inline int obd_uuid_equals(struct obd_uuid *u1, struct obd_uuid *u2)
+{
+        return strcmp(u1->uuid, u2->uuid) == 0;
+}
+
 static inline void obd_str2uuid(struct obd_uuid *uuid, char *tmp)
 {
         strncpy(uuid->uuid, tmp, sizeof(*uuid));
         uuid->uuid[sizeof(*uuid) - 1] = '\0';
 }
 
+extern struct obd_uuid lctl_fake_uuid;
+
 /* FOO_REQUEST_PORTAL is for incoming requests on the FOO
  * FOO_REPLY_PORTAL   is for incoming replies on the FOO
  * FOO_BULK_PORTAL    is for incoming bulk on the FOO
@@ -67,7 +94,7 @@ static inline void obd_str2uuid(struct obd_uuid *uuid, char *tmp)
 #define CONNMGR_REPLY_PORTAL    2
 //#define OSC_REQUEST_PORTAL      3
 #define OSC_REPLY_PORTAL        4
-#define OSC_BULK_PORTAL         5
+//#define OSC_BULK_PORTAL         5
 #define OST_REQUEST_PORTAL      6
 //#define OST_REPLY_PORTAL        7
 #define OST_BULK_PORTAL         8
@@ -96,32 +123,27 @@ static inline void obd_str2uuid(struct obd_uuid *uuid, char *tmp)
 
 #define LUSTRE_CONN_NEW          1
 #define LUSTRE_CONN_CON          2
-#define LUSTRE_CONN_RECOVD       3
-#define LUSTRE_CONN_FULL         4
+#define LUSTRE_CONN_NOTCONN      3
+#define LUSTRE_CONN_RECOVD       4
+#define LUSTRE_CONN_FULL         5
 
 /* packet types */
 #define PTL_RPC_MSG_REQUEST 4711
 #define PTL_RPC_MSG_ERR     4712
 #define PTL_RPC_MSG_REPLY   4713
 
-#define PTLRPC_MSG_MAGIC (cpu_to_le32(0x0BD00BD0))
-#define PTLRPC_MSG_VERSION (cpu_to_le32(0x00040001))
+#define PTLRPC_MSG_MAGIC    0x0BD00BD0
+#define PTLRPC_MSG_VERSION  0x00040002
 
 struct lustre_handle {
-        __u64 addr;
         __u64 cookie;
 };
 #define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabe
 
-static inline void ptlrpc_invalidate_handle(struct lustre_handle *hdl)
-{
-        hdl->addr = hdl->cookie = 0; /* XXX invalid enough? */
-}
-
 /* we depend on this structure to be 8-byte aligned */
+/* this type is only endian-adjusted in lustre_unpack_msg() */
 struct lustre_msg {
-        __u64 addr;
-        __u64 cookie; /* security token */
+        struct lustre_handle handle;
         __u32 magic;
         __u32 type;
         __u32 version;
@@ -130,11 +152,16 @@ struct lustre_msg {
         __u64 last_committed;
         __u64 transno;
         __u32 status;
-        __u32 bufcount;
         __u32 flags;
+        __u32 bufcount;
         __u32 buflens[0];
 };
 
+static inline int lustre_msg_swabbed (struct lustre_msg *msg)
+{
+        return (msg->magic == __swab32 (PTLRPC_MSG_MAGIC));
+}
+
 /* Flags that are operation-specific go in the top 16 bits. */
 #define MSG_OP_FLAG_MASK   0xffff0000
 #define MSG_OP_FLAG_SHIFT  16
@@ -206,6 +233,10 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags)
 #define OST_SAN_READ   14
 #define OST_SAN_WRITE  15
 #define OST_SYNCFS     16
+/* When adding OST RPC opcodes, please update 
+ * LAST/FIRST macros used in ptlrpc/ptlrpc_internals.h */
+#define OST_LAST_OPC   (OST_SYNCFS+1)
+#define OST_FIRST_OPC  OST_REPLY
 
 
 typedef uint64_t        obd_id;
@@ -226,10 +257,7 @@ typedef uint32_t        obd_count;
 #define OBD_FL_OBDMDEXISTS      (0x00000002)
 
 #define OBD_INLINESZ    60
-#define FD_OSTDATA_SIZE 32
-#if (FD_OSTDATA_SIZE > OBD_INLINESZ)
-# error FD_OSTDATA_SIZE must be smaller than OBD_INLINESZ
-#endif
+#define FD_OSTDATA_SIZE sizeof(struct obd_client_handle)
 
 /* Note: 64-bit types are 64-bit aligned in structure */
 struct obdo {
@@ -241,7 +269,7 @@ struct obdo {
         obd_size                o_size;
         obd_blocks              o_blocks;
         obd_rdev                o_rdev;
-        obd_blksize             o_blksize;
+        obd_blksize             o_blksize;      /* optimal IO blocksize */
         obd_mode                o_mode;
         obd_uid                 o_uid;
         obd_gid                 o_gid;
@@ -254,6 +282,8 @@ struct obdo {
         char                    o_inline[OBD_INLINESZ];
 };
 
+extern void lustre_swab_obdo (struct obdo *o);
+
 struct lov_object_id { /* per-child structure */
         __u64 l_object_id;
 };
@@ -305,16 +335,20 @@ struct obd_statfs {
         __u8            os_fsid[40];
         __u32           os_bsize;
         __u32           os_namelen;
-        __u32           os_spare[12];
+        __u64           os_maxbytes;
+        __u32           os_spare[10];
 };
 
+extern void lustre_swab_obd_statfs (struct obd_statfs *os);
+
 /* ost_body.data values for OST_BRW */
 
-#define OBD_BRW_READ   0x1
-#define OBD_BRW_WRITE  0x2
-#define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE)
-#define OBD_BRW_CREATE 0x4
-#define OBD_BRW_SYNC   0x8
+#define OBD_BRW_READ    0x01
+#define OBD_BRW_WRITE   0x02
+#define OBD_BRW_RWMASK  (OBD_BRW_READ | OBD_BRW_WRITE)
+#define OBD_BRW_CREATE  0x04
+#define OBD_BRW_SYNC    0x08
+#define OBD_BRW_CHECK   0x10
 
 #define OBD_OBJECT_EOF 0xffffffffffffffffULL
 
@@ -325,13 +359,17 @@ struct obd_ioobj {
         __u32                ioo_bufcnt;
 } __attribute__((packed));
 
+extern void lustre_swab_obd_ioobj (struct obd_ioobj *ioo);
+
+/* multiple of 8 bytes => can array */
 struct niobuf_remote {
         __u64 offset;
         __u32 len;
-        __u32 xid;
         __u32 flags;
 } __attribute__((packed));
 
+extern void lustre_swab_niobuf_remote (struct niobuf_remote *nbr);
+
 /* request structure for OST's */
 
 #define OST_REQ_HAS_OA1  0x1
@@ -340,6 +378,8 @@ struct ost_body {
         struct  obdo oa;
 };
 
+extern void lustre_swab_ost_body (struct ost_body *b);
+
 /*
  *   MDS REQ RECORDS
  */
@@ -355,6 +395,10 @@ struct ost_body {
 #define MDS_GETSTATUS    40
 #define MDS_STATFS       41
 #define MDS_GETLOVINFO   42
+/* When adding MDS RPC opcodes, please update 
+ * LAST/FIRST macros used in ptlrpc/ptlrpc_internals.h */
+#define MDS_LAST_OPC     (MDS_GETLOVINFO+1)
+#define MDS_FIRST_OPC    MDS_GETATTR
 /*
  * Do not exceed 63 
  */
@@ -374,15 +418,13 @@ struct ost_body {
 #define IT_OPEN_CREATE  (1 << 4)
 #define IT_OPEN_OPEN    (1 << 5)
 
-#define REINT_OPCODE_MASK 0xff /* opcodes must fit into this mask */
-#define REINT_REPLAYING 0x1000 /* masked into the opcode to indicate replay */
-
 struct ll_fid {
         __u64 id;
         __u32 generation;
         __u32 f_type;
 };
 
+extern void lustre_swab_ll_fid (struct ll_fid *fid);
 
 #define MDS_STATUS_CONN 1
 #define MDS_STATUS_LOV 2
@@ -392,24 +434,20 @@ struct mds_status_req {
         __u32  repbuf;
 };
 
+extern void lustre_swab_mds_status_req (struct mds_status_req *r);
+
 struct mds_fileh_body {
         struct ll_fid f_fid;
         struct lustre_handle f_handle;
 };
 
-struct mds_conn_status {
-        struct ll_fid rootfid;
-        __u64          xid;
-        __u64          last_committed;
-        __u64          last_rcvd;
-        /* XXX preallocated quota & obj fields here */
-};
+extern void lustre_swab_mds_fileh_body (struct mds_fileh_body *f);
 
 struct mds_body {
         struct ll_fid  fid1;
         struct ll_fid  fid2;
         struct lustre_handle handle;
-        __u64          size;
+        __u64          size;   /* Offset, in the case of MDS_READPAGE */
         __u64          blocks; /* XID, in the case of MDS_READPAGE */
         __u32          ino;   /* make this a __u64 */
         __u32          valid;
@@ -424,17 +462,19 @@ struct mds_body {
         __u32          atime;
         __u32          flags;
         __u32          rdev;
-        __u32          nlink;
+        __u32          nlink; /* #bytes to read in the case of MDS_READPAGE */
         __u32          generation;
         __u32          suppgid;
+        __u32          eadatasize;
 };
 
+extern void lustre_swab_mds_body (struct mds_body *b);
+
 /* This is probably redundant with OBD_MD_FLEASIZE, but we need an audit */
 #define MDS_OPEN_HAS_EA 1 /* this open has an EA, for a delayed create*/
 
 /* MDS update records */
 
-
 //struct mds_update_record_hdr {
 //        __u32 ur_opcode;
 //};
@@ -458,6 +498,8 @@ struct mds_rec_setattr {
         __u32           sa_suppgid;
 };
 
+extern void lustre_swab_mds_rec_setattr (struct mds_rec_setattr *sa);
+
 struct mds_rec_create {
         __u32           cr_opcode;
         __u32           cr_fsuid;
@@ -474,16 +516,21 @@ struct mds_rec_create {
         __u32           cr_suppgid;
 };
 
+extern void lustre_swab_mds_rec_create (struct mds_rec_create *cr);
+
 struct mds_rec_link {
         __u32           lk_opcode;
         __u32           lk_fsuid;
         __u32           lk_fsgid;
         __u32           lk_cap;
-        __u32           lk_suppgid;
+        __u32           lk_suppgid1;
+        __u32           lk_suppgid2;
         struct ll_fid   lk_fid1;
         struct ll_fid   lk_fid2;
 };
 
+extern void lustre_swab_mds_rec_link (struct mds_rec_link *lk);
+
 struct mds_rec_unlink {
         __u32           ul_opcode;
         __u32           ul_fsuid;
@@ -496,6 +543,8 @@ struct mds_rec_unlink {
         struct ll_fid   ul_fid2;
 };
 
+extern void lustre_swab_mds_rec_unlink (struct mds_rec_unlink *ul);
+
 struct mds_rec_rename {
         __u32           rn_opcode;
         __u32           rn_fsuid;
@@ -507,6 +556,7 @@ struct mds_rec_rename {
         struct ll_fid   rn_fid2;
 };
 
+extern void lustre_swab_mds_rec_rename (struct mds_rec_rename *rn);
 
 /*
  *  LOV data structures
@@ -515,6 +565,11 @@ struct mds_rec_rename {
 #define LOV_RAID0   0
 #define LOV_RAIDRR  1
 
+#define LOV_MAX_UUID_BUFFER_SIZE  8192
+/* The size of the buffer the lov/mdc reserves for the 
+ * array of UUIDs returned by the MDS.  With the current
+ * protocol, this will limit the max number of OSTs per LOV */
+
 struct lov_desc {
         __u32 ld_tgt_count;                /* how many OBD's */
         __u32 ld_active_tgt_count;         /* how many active */
@@ -525,6 +580,8 @@ struct lov_desc {
         struct obd_uuid ld_uuid;
 };
 
+extern void lustre_swab_lov_desc (struct lov_desc *ld);
+
 /*
  *   LDLM requests:
  */
@@ -534,6 +591,10 @@ struct lov_desc {
 #define LDLM_CANCEL        103
 #define LDLM_BL_CALLBACK   104
 #define LDLM_CP_CALLBACK   105
+/* When adding LDLM RPC opcodes, please update 
+ * LAST/FIRST macros used in ptlrpc/ptlrpc_internals.h */
+#define LDLM_LAST_OPC      (LDLM_CP_CALLBACK+1)
+#define LDLM_FIRST_OPC     LDLM_ENQUEUE
 
 #define RES_NAME_SIZE 3
 #define RES_VERSION_SIZE 4
@@ -542,6 +603,8 @@ struct ldlm_res_id {
         __u64 name[RES_NAME_SIZE];
 };
 
+extern void lustre_swab_ldlm_res_id (struct ldlm_res_id *id);
+
 /* lock types */
 typedef enum {
         LCK_EX = 1,
@@ -557,10 +620,14 @@ struct ldlm_extent {
         __u64 end;
 };
 
+extern void lustre_swab_ldlm_extent (struct ldlm_extent *e);
+
 struct ldlm_intent {
         __u64 opc;
 };
 
+extern void lustre_swab_ldlm_intent (struct ldlm_intent *i);
+
 /* Note this unaligned structure; as long as it's only used in ldlm_request
  * below, we're probably fine. */
 struct ldlm_resource_desc {
@@ -569,6 +636,8 @@ struct ldlm_resource_desc {
         __u32 lr_version[RES_VERSION_SIZE];
 };
 
+extern void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r);
+
 struct ldlm_lock_desc {
         struct ldlm_resource_desc l_resource;
         ldlm_mode_t l_req_mode;
@@ -577,6 +646,8 @@ struct ldlm_lock_desc {
         __u32 l_version[RES_VERSION_SIZE];
 };
 
+extern void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l);
+
 struct ldlm_request {
         __u32 lock_flags;
         struct ldlm_lock_desc lock_desc;
@@ -584,6 +655,8 @@ struct ldlm_request {
         struct lustre_handle lock_handle2;
 };
 
+extern void lustre_swab_ldlm_request (struct ldlm_request *rq);
+
 struct ldlm_reply {
         __u32 lock_flags;
         __u32 lock_mode;
@@ -594,6 +667,8 @@ struct ldlm_reply {
         __u64  lock_policy_res2;
 };
 
+extern void lustre_swab_ldlm_reply (struct ldlm_reply *r);
+
 /*
  * ptlbd, portal block device requests
  */
@@ -601,7 +676,14 @@ typedef enum {
         PTLBD_QUERY = 200,
         PTLBD_READ = 201,
         PTLBD_WRITE = 202,
+        PTLBD_FLUSH = 203,
+        PTLBD_CONNECT = 204,
+        PTLBD_DISCONNECT = 205,
 } ptlbd_cmd_t;
+/* When adding PTLBD RPC opcodes, please update 
+ * LAST/FIRST macros used in ptlrpc/ptlrpc_internals.h */
+#define PTLBD_LAST_OPC  (PTLBD_FLUSH+1)
+#define PTLBD_FIRST_OPC PTLBD_QUERY
 
 struct ptlbd_op {
         __u16 op_cmd;
@@ -611,6 +693,8 @@ struct ptlbd_op {
         __u32 op_block_cnt;
 };
 
+extern void lustre_swab_ptlbd_op (struct ptlbd_op *op);
+
 struct ptlbd_niob {
         __u64 n_xid;
         __u64 n_block_nr;
@@ -618,8 +702,19 @@ struct ptlbd_niob {
         __u32 n_length;
 };
 
+extern void lustre_swab_ptlbd_niob (struct ptlbd_niob *n);
+
 struct ptlbd_rsp {
         __u16 r_status;
         __u16 r_error_cnt;
 };
+
+extern void lustre_swab_ptlbd_rsp (struct ptlbd_rsp *r);
+
+/*
+ * Opcodes for multiple servers.
+ */
+
+#define OBD_PING 400
+
 #endif
index 4fc2581..c1af641 100644 (file)
 #ifndef __IMPORT_H
 #define __IMPORT_H
 
-
-#define IMP_INVALID       1
-#define IMP_REPLAYABLE    2
-
-
-struct obd_import;
-typedef int (*import_recover_t)(struct obd_import *imp, int phase);
+#include <linux/lustre_handles.h>
 #include <linux/lustre_idl.h>
 
 struct obd_import {
-        import_recover_t          imp_recover;
+        struct portals_handle     imp_handle;
+        atomic_t                  imp_refcount;
+        struct lustre_handle      imp_dlm_handle; /* client's ldlm export */
         struct ptlrpc_connection *imp_connection;
         struct ptlrpc_client     *imp_client;
-        struct lustre_handle      imp_handle;
-        struct list_head          imp_chain;
+        struct list_head          imp_observers;
+        struct list_head          imp_pinger_chain;
 
         /* Lists of requests that are retained for replay, waiting for a reply,
          * or waiting for recovery to complete, respectively.
@@ -34,17 +30,43 @@ struct obd_import {
         struct list_head          imp_delayed_list;
 
         struct obd_device        *imp_obd;
-        int                       imp_flags;
         int                       imp_level;
+        int                       imp_generation;
         __u64                     imp_max_transno;
         __u64                     imp_peer_committed_transno;
+        struct obd_uuid           imp_target_uuid; /* XXX -> lustre_name */
+        struct lustre_handle      imp_remote_handle;
 
-        /* Protects flags, level, *_list */
+        /* Protects flags, level, generation, *_list */
         spinlock_t                imp_lock;
+
+        /* flags */
+        int                       imp_invalid:1, imp_replayable:1,
+                                  imp_dlm_fake:1;
+        __u32                     imp_connect_op;
+};
+
+typedef void (*obd_import_callback)(struct obd_import *imp, void *closure,
+                                    int event, void *event_arg, void *cb_data);
+
+struct obd_import_observer {
+        struct list_head     oio_chain;
+        obd_import_callback  oio_cb;
+        void                *oio_cb_data;
 };
 
+void class_observe_import(struct obd_import *imp, obd_import_callback cb,
+                          void *cb_data);
+void class_unobserve_import(struct obd_import *imp, obd_import_callback cb,
+                            void *cb_data);
+void class_notify_import_observers(struct obd_import *imp, int event,
+                                   void *event_arg);
+
+#define IMP_EVENT_ACTIVE   1
+#define IMP_EVENT_INACTIVE 2
+
+/* genops.c */
 extern struct obd_import *class_conn2cliimp(struct lustre_handle *);
 extern struct obd_import *class_conn2ldlmimp(struct lustre_handle *);
 
-
 #endif /* __IMPORT_H */
index 6f38be0..c43cf95 100644 (file)
 
 #ifndef __KERNEL__
 # include <string.h>
+# include <sys/types.h>
 #else
 # include <asm/semaphore.h>
 # include <linux/sched.h>
 # include <linux/signal.h>
+# include <linux/types.h>
 #endif
-#include <linux/types.h>
 #include <linux/portals_lib.h>
 #include <linux/kp30.h> /* XXX just for LASSERT! */
 #include <linux/lustre_idl.h>
 
 /* target.c */
 struct ptlrpc_request;
-struct obd_device;
 struct recovd_data;
 struct recovd_obd;
 struct obd_export;
 #include <linux/lustre_ha.h>
 #include <linux/lustre_net.h>
-
+#include <linux/lustre_compat25.h>
 
 int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler);
 int target_handle_disconnect(struct ptlrpc_request *req);
 int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp,
                             struct obd_uuid *cluuid);
-int target_revoke_connection(struct recovd_data *rd, int phase);
+int target_handle_ping(struct ptlrpc_request *req);
+void target_cancel_recovery_timer(struct obd_device *obd);
 
 #define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */
 void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler);
@@ -71,18 +72,26 @@ void target_abort_recovery(void *data);
 int target_queue_recovery_request(struct ptlrpc_request *req,
                                   struct obd_device *obd);
 int target_queue_final_reply(struct ptlrpc_request *req, int rc);
+void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id);
 
 /* client.c */
-int client_obd_connect(struct lustre_handle *conn, struct obd_device *obd,
-                       struct obd_uuid *cluuid, struct recovd_obd *recovd,
-                       ptlrpc_recovery_cb_t recover);
-int client_obd_disconnect(struct lustre_handle *conn);
+
 int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf);
 int client_sanobd_setup(struct obd_device *obddev, obd_count len, void *buf);
-int client_obd_cleanup(struct obd_device * obddev);
+int client_obd_cleanup(struct obd_device * obddev, int force, int failover);
 struct client_obd *client_conn2cli(struct lustre_handle *conn);
 struct obd_device *client_tgtuuid2obd(struct obd_uuid *tgtuuid);
 
+/* It is important that och_fh remain the first item in this structure: that
+ * way, we don't have to re-pack the obdo's inline data before we send it to
+ * the server, we can just send the whole struct unaltered. */
+struct obd_client_handle {
+        struct lustre_handle och_fh;
+        struct ptlrpc_request *och_req;
+        __u32 och_magic;
+};
+#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed
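/*
 * Editor's sketch (not part of this patch): the two layout assumptions above
 * (och_fh first, and the whole struct fitting in the obdo inline area, cf.
 * FD_OSTDATA_SIZE/OBD_INLINESZ in lustre_idl.h) could be checked at compile
 * time with the usual negative-array-size trick; <linux/stddef.h> is assumed
 * for offsetof().
 */
#if 0   /* illustration only */
static char __och_fh_is_first[(offsetof(struct obd_client_handle, och_fh) == 0) ? 1 : -1];
static char __och_fits_inline[(sizeof(struct obd_client_handle) <= OBD_INLINESZ) ? 1 : -1];
#endif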
+
 /* statfs_pack.c */
 int obd_self_statfs(struct obd_device *dev, struct statfs *sfs);
 
@@ -99,45 +108,74 @@ void l_lock(struct lustre_lock *);
 void l_unlock(struct lustre_lock *);
 int l_has_lock(struct lustre_lock *);
 
-#define CB_PHASE_START   12
-#define CB_PHASE_FINISH  13
-
-/* This list head doesn't need to be locked, because it's only manipulated by
- * one thread at a time. */
-struct obd_brw_set {
-        struct list_head brw_desc_head; /* list of ptlrpc_bulk_desc */
-        wait_queue_head_t brw_waitq;
-        atomic_t brw_refcount;
-        atomic_t brw_desc_count;
-        int brw_flags;
+/* simple.c */
+struct obd_ucred {
+        __u32 ouc_fsuid;
+        __u32 ouc_fsgid;
+        __u32 ouc_cap;
+        __u32 ouc_suppgid1;
+        __u32 ouc_suppgid2;
+};
 
-        int (*brw_callback)(struct obd_brw_set *, int phase);
+#define OBD_RUN_CTXT_MAGIC      0xC0FFEEAA
+#define OBD_CTXT_DEBUG          /* development-only debugging */
+struct obd_run_ctxt {
+        struct vfsmount *pwdmnt;
+        struct dentry   *pwd;
+        mm_segment_t     fs;
+        struct obd_ucred ouc;
+        int              ngroups;
+#ifdef OBD_CTXT_DEBUG
+        __u32            magic;
+#endif
 };
 
-/* simple.c */
-struct obd_run_ctxt;
-struct obd_ucred;
+
+#ifdef OBD_CTXT_DEBUG
+#define OBD_SET_CTXT_MAGIC(ctxt) (ctxt)->magic = OBD_RUN_CTXT_MAGIC
+#else
+#define OBD_SET_CTXT_MAGIC(ctxt) do {} while(0)
+#endif
+
+#ifdef __KERNEL__
+
 void push_ctxt(struct obd_run_ctxt *save, struct obd_run_ctxt *new_ctx,
                struct obd_ucred *cred);
 void pop_ctxt(struct obd_run_ctxt *saved, struct obd_run_ctxt *new_ctx,
               struct obd_ucred *cred);
 struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode);
 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode);
-int lustre_fread(struct file *file, char *str, int len, loff_t *off);
-int lustre_fwrite(struct file *file, const char *str, int len, loff_t *off);
+int lustre_fread(struct file *file, void *buf, int len, loff_t *off);
+int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off);
 int lustre_fsync(struct file *file);
 
-#ifdef __KERNEL__
-
 static inline void l_dput(struct dentry *de)
 {
         if (!de || IS_ERR(de))
                 return;
-        shrink_dcache_parent(de);
+        //shrink_dcache_parent(de);
         LASSERT(atomic_read(&de->d_count) > 0);
         dput(de);
 }
 
+/* We need to hold the inode semaphore over the dcache lookup itself, or we
+ * run the risk of entering the filesystem lookup path concurrently on SMP
+ * systems, and instantiating two inodes for the same entry.  We still
+ * protect against concurrent addition/removal races with the DLM locking.
+ */
+static inline struct dentry *ll_lookup_one_len(char *fid_name,
+                                               struct dentry *dparent,
+                                               int fid_namelen)
+{
+        struct dentry *dchild;
+
+        down(&dparent->d_inode->i_sem);
+        dchild = lookup_one_len(fid_name, dparent, fid_namelen);
+        up(&dparent->d_inode->i_sem);
+
+        return dchild;
+}
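/* Editor's sketch (not part of this patch): callers treat the result exactly
 * like lookup_one_len(), i.e. a dentry or an ERR_PTR() value:
 *
 *      dchild = ll_lookup_one_len(fid_name, dparent, fid_namelen);
 *      if (IS_ERR(dchild))
 *              return PTR_ERR(dchild);
 */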
+
 static inline void ll_sleep(int t)
 {
         set_current_state(TASK_INTERRUPTIBLE);
@@ -146,17 +184,10 @@ static inline void ll_sleep(int t)
 }
 #endif
 
-/* FIXME: This needs to validate pointers and cookies */
-static inline void *lustre_handle2object(struct lustre_handle *handle)
-{
-        if (handle)
-                return (void *)(unsigned long)(handle->addr);
-        return NULL;
-}
-
-static inline void ldlm_object2handle(void *object, struct lustre_handle *handle)
+#define LL_FID_NAMELEN         (16 + 1 + 8 + 1)
+static inline int ll_fid2str(char *str, __u64 id, __u32 generation)
 {
-        handle->addr = (__u64)(unsigned long)object;
+        return sprintf(str, "%llx:%08x", (unsigned long long)id, generation);
 }
 
 #include <linux/portals_lib.h>
@@ -170,7 +201,6 @@ struct obd_ioctl_data {
         uint32_t ioc_len;
         uint32_t ioc_version;
 
-        uint64_t ioc_addr;
         uint64_t ioc_cookie;
         uint32_t ioc_conn1;
         uint32_t ioc_conn2;
@@ -368,6 +398,8 @@ static inline int obd_ioctl_unpack(struct obd_ioctl_data *data, char *pbuf,
 
 #include <linux/obd_support.h>
 
+#define OBD_MAX_IOCTL_BUFFER 8192
+
 /* buffer MUST be at least the size of obd_ioctl_hdr */
 static inline int obd_ioctl_getdata(char **buf, int *len, void *arg)
 {
@@ -383,12 +415,13 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg)
         }
 
         if (hdr.ioc_version != OBD_IOCTL_VERSION) {
-                printk("OBD: version mismatch kernel vs application\n");
+                CERROR("Version mismatch kernel vs application\n");
                 return -EINVAL;
         }
 
-        if (hdr.ioc_len > 8192) {
-                printk("OBD: user buffer exceeds 8192 max buffer\n");
+        if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) {
+                CERROR("User buffer len %d exceeds %d max buffer\n",
+                       hdr.ioc_len, OBD_MAX_IOCTL_BUFFER);
                 return -EINVAL;
         }
 
@@ -397,8 +430,10 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg)
                 return -EINVAL;
         }
 
-        OBD_ALLOC(*buf, hdr.ioc_len);
-        if (!*buf) {
+        /* XXX allocate this more intelligently, using kmalloc when
+         * appropriate */
+        OBD_VMALLOC(*buf, hdr.ioc_len);
+        if (*buf == NULL) {
                 CERROR("Cannot allocate control buffer of len %d\n",
                        hdr.ioc_len);
                 RETURN(-EINVAL);
@@ -413,7 +448,7 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg)
         }
 
         if (obd_ioctl_is_invalid(data)) {
-                printk("OBD: ioctl not correctly formatted\n");
+                CERROR("ioctl not correctly formatted\n");
                 return -EINVAL;
         }
 
@@ -436,6 +471,15 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg)
         return 0;
 }
 
+static inline void obd_ioctl_freedata(char *buf, int len)
+{
+        ENTRY;
+
+        OBD_VFREE(buf, len);
+        EXIT;
+        return;
+}
+
 #define OBD_IOC_CREATE                 _IOR ('f', 101, long)
 #define OBD_IOC_SETUP                  _IOW ('f', 102, long)
 #define OBD_IOC_CLEANUP                _IO  ('f', 103      )
@@ -467,19 +511,18 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg)
 #define OBD_IOC_LIST                   _IOWR('f', 129, long)
 #define OBD_IOC_UUID2DEV               _IOWR('f', 130, long)
 
-#define OBD_IOC_RECOVD_NEWCONN         _IOWR('f', 131, long)
-#define OBD_IOC_LOV_SET_CONFIG         _IOWR('f', 132, long)
-#define OBD_IOC_LOV_GET_CONFIG         _IOWR('f', 133, long)
+#define OBD_IOC_LOV_SET_CONFIG         _IOWR('f', 131, long)
+#define OBD_IOC_LOV_GET_CONFIG         _IOWR('f', 132, long)
 #define OBD_IOC_LOV_CONFIG             OBD_IOC_LOV_SET_CONFIG
+#define OBD_IOC_CLIENT_RECOVER         _IOW ('f', 133, long)
 
 #define OBD_IOC_OPEN                   _IOWR('f', 134, long)
 #define OBD_IOC_CLOSE                  _IOWR('f', 135, long)
 
-#define OBD_IOC_RECOVD_FAILCONN        _IOWR('f', 136, long)
-
 #define OBD_IOC_DEC_FS_USE_COUNT       _IO  ('f', 139      )
 #define OBD_IOC_NO_TRANSNO             _IOW ('f', 140, long)
 #define OBD_IOC_SET_READONLY           _IOW ('f', 141, long)
+#define OBD_IOC_ABORT_RECOVERY         _IOR ('f', 142, long)
 
 #define OBD_GET_VERSION                _IOWR ('f', 144, long)
 
@@ -487,11 +530,20 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg)
 #define OBD_IOC_DEL_UUID               _IOWR ('f', 146, long)
 #define OBD_IOC_CLOSE_UUID             _IOWR ('f', 147, long)
 
+#define OBD_IOC_MOUNTOPT               _IOWR('f', 170, long)
+
 #define ECHO_IOC_GET_STRIPE            _IOWR('f', 200, long)
 #define ECHO_IOC_SET_STRIPE            _IOWR('f', 201, long)
 #define ECHO_IOC_ENQUEUE               _IOWR('f', 202, long)
 #define ECHO_IOC_CANCEL                _IOWR('f', 203, long)
 
+/* XXX _IOWR('f', 250, long) has been defined in
+ * portals/include/linux/kp30.h for debug, don't use it
+ */
+
+/* Until such time as we get_info the per-stripe maximum from the OST,
+ * we define this to be 2T - 4k, which is the ext3 maxbytes. */
+#define LUSTRE_STRIPE_MAXBYTES 0x1fffffff000ULL
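/* Editor's note: 0x1fffffff000ULL == (1ULL << 41) - 4096, i.e. 2TiB minus
 * one 4KiB block, matching the ext3 limit quoted above. */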
 
 #define CHECKSUM_BULK 0
 
@@ -507,8 +559,6 @@ static inline void ost_checksum(__u64 *cksum, void *addr, int len)
 
         *cksum = (*cksum << 2) + sum;
 }
-#else
-#define ost_checksum(cksum, addr, len) do {} while (0)
 #endif
 
 /*
@@ -551,7 +601,7 @@ struct l_wait_info {
         long   lwi_timeout;
         int  (*lwi_on_timeout)(void *);
         long   lwi_signals;
-        int  (*lwi_on_signal)(void *); /* XXX return is ignored for now */
+        void (*lwi_on_signal)(void *);
         void  *lwi_cb_data;
 };
 
@@ -587,11 +637,11 @@ static inline sigset_t l_w_e_set_sigs(int sigs)
         sigset_t old;
         unsigned long irqflags;
 
-        spin_lock_irqsave(&current->sigmask_lock, irqflags);
+        SIGNAL_MASK_LOCK(current, irqflags);
         old = current->blocked;
         siginitsetinv(&current->blocked, sigs);
-        recalc_sigpending(current);
-        spin_unlock_irqrestore(&current->sigmask_lock, irqflags);
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, irqflags);
 
         return old;
 }
@@ -639,10 +689,10 @@ do {                                                                           \
             }                                                                  \
         }                                                                      \
                                                                                \
-        spin_lock_irqsave(&current->sigmask_lock, irqflags);                   \
+        SIGNAL_MASK_LOCK(current, irqflags);                                   \
         current->blocked = blocked;                                            \
-        recalc_sigpending(current);                                            \
-        spin_unlock_irqrestore(&current->sigmask_lock, irqflags);              \
+        RECALC_SIGPENDING;                                                     \
+        SIGNAL_MASK_UNLOCK(current, irqflags);                                 \
                                                                                \
         current->state = TASK_RUNNING;                                         \
         remove_wait_queue(&wq, &__wait);                                       \
@@ -656,6 +706,11 @@ do {                                                                           \
                 __l_wait_event(wq, condition, __info, __ret);                  \
         __ret;                                                                 \
 })
+#else
+#define l_wait_event(wq, condition, info)       \
+({                                              \
+        0;                                      \
+})
 #endif /* __KERNEL__ */
 
 #endif /* _LUSTRE_LIB_H */
index 9657f24..35d4994 100644 (file)
 #include <linux/lustre_mds.h>
 #include <linux/lustre_ha.h>
 
+#include <linux/rbtree.h>
+#include <linux/lustre_compat25.h>
+#include <linux/pagemap.h>
+
+/* careful, this is easy to screw up */
+#define PAGE_CACHE_MAXBYTES ((__u64)(~0UL) << PAGE_CACHE_SHIFT)
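/* Editor's note: on a 32-bit kernel with 4KiB pages this works out to
 * (2^32 - 1) << 12, i.e. just under 16TiB -- the largest byte range a
 * 32-bit page cache index can address. */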
 
 extern kmem_cache_t *ll_file_data_slab;
 struct ll_file_data {
-        struct lustre_handle fd_mdshandle;
-        struct ptlrpc_request *fd_req;
-        char fd_ostdata[FD_OSTDATA_SIZE];
+        struct obd_client_handle fd_mds_och;
+        struct obd_client_handle fd_ost_och;
         __u32 fd_flags;
 };
 
@@ -47,30 +52,34 @@ struct ll_dentry_data {
 
 #define ll_d2d(dentry) ((struct ll_dentry_data*) dentry->d_fsdata)
 
-struct ll_read_inode2_cookie {
-        struct mds_body *lic_body;
-        struct lov_mds_md *lic_lmm;
+struct ll_dirty_offsets {
+        rb_root_t       do_root;
+        spinlock_t      do_lock;
+        unsigned long   do_num_dirty;
 };
 
+void ll_lldo_init(struct ll_dirty_offsets *lldo);
+void ll_record_dirty(struct inode *inode, unsigned long offset);
+void ll_remove_dirty(struct inode *inode, unsigned long start,
+                     unsigned long end);
+int ll_find_dirty(struct ll_dirty_offsets *lldo, unsigned long *start,
+                  unsigned long *end);
+int ll_farthest_dirty(struct ll_dirty_offsets *lldo, unsigned long *farthest);
+extern struct file_operations ll_pgcache_seq_fops;
+
 struct ll_inode_info {
-        struct lov_stripe_md *lli_smd;
-        char                 *lli_symlink_name;
-        struct semaphore      lli_open_sem;
-        atomic_t              lli_open_count; /* see ll_file_release */
-        /*
-         * the VALID flag and valid_sem are temporary measures to serialize
-         * the manual getattrs that we're doing at lock acquisition.  in
-         * the future the OST will always return its notion of the file
-         * size with the granted locks.
-         */
-        unsigned long         lli_flags;
-#define LLI_F_DID_GETATTR      0
-        struct semaphore      lli_getattr_sem;
-        struct list_head      lli_read_extents;
-        spinlock_t            lli_read_extent_lock;
+        struct lov_stripe_md   *lli_smd;
+        char                   *lli_symlink_name;
+        struct semaphore        lli_open_sem;
+        struct list_head        lli_read_extents;
+        loff_t                  lli_maxbytes;
+        spinlock_t              lli_read_extent_lock;
+        struct ll_dirty_offsets lli_dirty;
+        unsigned long           lli_flags;
+#define LLI_F_HAVE_SIZE_LOCK    0
 
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-        struct inode          lli_vfs_inode;
+        struct inode            lli_vfs_inode;
 #endif
 };
 
@@ -89,6 +98,32 @@ struct ll_read_extent {
 int ll_check_dirty( struct super_block *sb );
 int ll_batch_writepage( struct inode *inode, struct page *page );
 
+struct file_io_stats {
+        spinlock_t     fis_lock;
+        __u64   fis_dirty_pages;
+        __u64   fis_dirty_hits;
+        __u64   fis_dirty_misses;
+        __u64   fis_forced_pages;
+        __u64   fis_writepage_pages;
+        __u64   fis_wb_ok;
+        __u64   fis_wb_fail;
+        __u64   fis_wb_from_writepage;
+        __u64   fis_wb_from_pressure;
+};
+
+#define IO_STAT_ADD(FIS, STAT, VAL) do {        \
+        struct file_io_stats *_fis_ = (FIS);    \
+        spin_lock(&_fis_->fis_lock);            \
+        _fis_->fis_##STAT += VAL;               \
+        spin_unlock(&_fis_->fis_lock);          \
+} while (0)
+
+#define INODE_IO_STAT_ADD(INODE, STAT, VAL)        \
+        IO_STAT_ADD(&ll_i2sbi(INODE)->ll_iostats, STAT, VAL)
+
+#define PAGE_IO_STAT_ADD(PAGE, STAT, VAL)               \
+        INODE_IO_STAT_ADD((PAGE)->mapping, STAT, VAL)
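/* Editor's sketch (not part of this patch): typical use of the helpers above
 * from an I/O path; "inode" stands for whatever inode the caller holds and
 * fis_dirty_pages is one of the counters defined in struct file_io_stats. */
#if 0   /* illustration only */
        INODE_IO_STAT_ADD(inode, dirty_pages, 1);   /* bump fis_dirty_pages under fis_lock */
#endif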
+
 /* interpret return codes from intent lookup */
 #define LL_LOOKUP_POSITIVE 1
 #define LL_LOOKUP_NEGATIVE 2
@@ -119,6 +154,8 @@ struct ll_sb_info {
         struct list_head          ll_conn_chain; /* per-conn chain of SBs */
 
         struct list_head          ll_orphan_dentry_list; /*please don't ask -p*/
+
+        struct  file_io_stats     ll_iostats;
 };
 
 static inline struct ll_sb_info *ll_s2sbi(struct super_block *sb)
@@ -189,12 +226,7 @@ static inline struct lustre_handle *ll_i2obdconn(struct inode *inode)
 }
 
 static inline void ll_ino2fid(struct ll_fid *fid, obd_id ino, __u32 generation,
-                              int type)
-{
-        fid->id = ino;
-        fid->generation = generation;
-        fid->f_type = type;
-}
+                              int type);
 
 static inline void ll_inode2fid(struct ll_fid *fid, struct inode *inode)
 {
@@ -207,16 +239,28 @@ static inline int ll_mds_max_easize(struct super_block *sb)
         return sbi2mdc(ll_s2sbi(sb))->cl_max_mds_easize;
 }
 
+static inline loff_t ll_file_maxbytes(struct inode *inode)
+{
+        return ll_i2info(inode)->lli_maxbytes;
+}
+
 /* namei.c */
 int ll_lock(struct inode *dir, struct dentry *dentry,
             struct lookup_intent *it, struct lustre_handle *lockh);
 int ll_unlock(__u32 mode, struct lustre_handle *lockh);
 
 typedef int (*intent_finish_cb)(int flag, struct ptlrpc_request *,
-                                struct dentry **, struct lookup_intent *,
-                                int offset, obd_id ino);
+                                struct inode *parent, struct dentry **, 
+                                struct lookup_intent *, int offset, obd_id ino);
 int ll_intent_lock(struct inode *parent, struct dentry **,
                    struct lookup_intent *, intent_finish_cb);
+int ll_mdc_blocking_ast(struct ldlm_lock *lock,
+                        struct ldlm_lock_desc *desc,
+                        void *data, int flag);
+void ll_mdc_lock_set_inode(struct lustre_handle *lock, struct inode *inode);
+void ll_prepare_mdc_op_data(struct mdc_op_data *data,
+                            struct inode *i1, struct inode *i2,
+                            const char *name, int namelen, int mode);
 
 /* dcache.c */
 void ll_intent_release(struct dentry *, struct lookup_intent *);
@@ -260,6 +304,8 @@ do {                                                                           \
         up(&ll_d2d(de)->lld_it_sem);                                           \
 } while(0)
 
+#define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0")
+
 /* dcache.c */
 int ll_have_md_lock(struct dentry *de);
 
@@ -285,6 +331,9 @@ int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
                      struct lustre_handle *lockh);
 int ll_create_objects(struct super_block *sb, obd_id id, uid_t uid,
                       gid_t gid, struct lov_stripe_md **lsmp);
+int ll_file_open(struct inode *inode, struct file *file);
+int ll_file_release(struct inode *inode, struct file *file);
+
 
 /* rw.c */
 struct page *ll_getpage(struct inode *inode, unsigned long offset,
@@ -292,7 +341,7 @@ struct page *ll_getpage(struct inode *inode, unsigned long offset,
 void ll_truncate(struct inode *inode);
 
 /* super.c */
-void ll_update_inode(struct inode *, struct mds_body *, struct lov_mds_md *);
+void ll_update_inode(struct inode *, struct mds_body *, struct lov_stripe_md *);
 int ll_setattr_raw(struct inode *inode, struct iattr *attr);
 
 /* symlink.c */
@@ -303,8 +352,25 @@ extern struct inode_operations ll_symlink_inode_operations;
 void ll_sysctl_init(void);
 void ll_sysctl_clean(void);
 
+#else
+#include <linux/lustre_idl.h>
 #endif /* __KERNEL__ */
 
+static inline void ll_ino2fid(struct ll_fid *fid,
+                              obd_id ino,
+                              __u32 generation,
+                              int type)
+{
+        fid->id = ino;
+        fid->generation = generation;
+        fid->f_type = type;
+}
+
+struct ll_read_inode2_cookie {
+        struct mds_body      *lic_body;
+        struct lov_stripe_md *lic_lsm;
+};
+
 #include <asm/types.h>
 
 #define LL_IOC_GETFLAGS                 _IOR ('f', 151, long)
index c951637..683d78d 100644 (file)
 #define _LUSTRE_MDS_H
 
 #ifdef __KERNEL__
-#include <linux/fs.h>
+# include <linux/fs.h>
+# include <linux/dcache.h>
 #endif
+#include <linux/lustre_handles.h>
 #include <linux/kp30.h>
 #include <linux/lustre_idl.h>
+#include <linux/lustre_lib.h>
+#include <linux/lustre_dlm.h>
 
 struct ldlm_lock_desc;
 struct mds_obd;
@@ -58,6 +62,36 @@ static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck)
         lck->rpcl_it = NULL;
 }
 
+#ifdef __KERNEL__
+/* Compat code for kernel patch v18 users, can be removed when everyone has
+ * upgraded --phik 02 June 2003 */
+#ifdef IT_FL_LOCKED
+static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck, 
+                                    struct lookup_intent *it)
+{
+        down(&lck->rpcl_sem);
+        if (it) { 
+                lck->rpcl_it = it;
+                it->it_int_flags |= IT_FL_LOCKED;
+        }
+}
+
+static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, 
+                                    struct lookup_intent *it)
+{
+        if (it == NULL) {
+                LASSERT(it == lck->rpcl_it);
+                up(&lck->rpcl_sem);
+                return;
+        }
+        if (it != NULL && (it->it_int_flags & IT_FL_LOCKED)) {
+                it->it_int_flags &= ~IT_FL_LOCKED;
+                LASSERT(it == lck->rpcl_it);
+                lck->rpcl_it = NULL;
+                up(&lck->rpcl_sem);
+        }
+}
+#else
 static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck, 
                                     struct lookup_intent *it)
 {
@@ -83,18 +117,24 @@ static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck,
                 up(&lck->rpcl_sem);
         }
 }
-struct  mdc_unlink_data {
-        struct inode *unl_dir;
-        struct inode *unl_de;
-        int unl_mode;
-        const char *unl_name;
-        int unl_len;
+#endif
+#endif
+
+struct mdc_op_data {
+        __u64   ino1;
+        __u32   gen1;
+        __u32   typ1;
+        __u32   gid1;
+        __u64   ino2;
+        __u32   gen2;
+        __u32   typ2;
+        __u32   gid2;
+        const char *name;
+        int     namelen;
+        int     mode;
 };
 
 struct mds_update_record {
-        __u32 ur_fsuid;
-        __u32 ur_fsgid;
-        __u32 ur_cap;
         __u32 ur_opcode;
         struct ll_fid *ur_fid1;
         struct ll_fid *ur_fid2;
@@ -102,17 +142,24 @@ struct mds_update_record {
         char *ur_name;
         int ur_tgtlen;
         char *ur_tgt;
+        int ur_eadatalen;
+        void *ur_eadata;
         struct iattr ur_iattr;
+        struct obd_ucred ur_uc;
         __u64 ur_rdev;
         __u32 ur_mode;
         __u32 ur_uid;
         __u32 ur_gid;
         __u64 ur_time;
         __u32 ur_flags;
-        __u32 ur_suppgid1;
-        __u32 ur_suppgid2;
 };
 
+#define ur_fsuid    ur_uc.ouc_fsuid
+#define ur_fsgid    ur_uc.ouc_fsgid
+#define ur_cap      ur_uc.ouc_cap
+#define ur_suppgid1 ur_uc.ouc_suppgid1
+#define ur_suppgid2 ur_uc.ouc_suppgid2
+
 #define MDS_LR_CLIENT  8192
 #define MDS_LR_SIZE     128
 
@@ -141,21 +188,14 @@ struct mds_client_data {
         __u8 padding[MDS_LR_SIZE - 74];
 };
 
-/* In-memory access to client data from MDS struct */
-struct mds_export_data {
-        struct list_head        med_open_head;
-        spinlock_t              med_open_lock;
-        struct mds_client_data *med_mcd;
-        int                     med_off;
-        struct ptlrpc_request  *med_outstanding_reply;
-};
-
 /* file data for open files on MDS */
 struct mds_file_data {
-        struct list_head     mfd_list;
-        __u64                mfd_servercookie;
-        __u64                mfd_xid;
-        struct file         *mfd_file;
+        struct portals_handle mfd_handle; /* must be first */
+        atomic_t              mfd_refcount;
+        struct list_head      mfd_list;
+        __u64                 mfd_xid;
+        int                   mfd_mode;
+        struct dentry        *mfd_dentry;
 };
 
 /* mds/mds_reint.c  */
@@ -166,41 +206,8 @@ int mds_reint_rec(struct mds_update_record *r, int offset,
 int mds_open(struct mds_update_record *rec, int offset,
              struct ptlrpc_request *req, struct lustre_handle *);
 
-/* lib/mds_updates.c */
-void mds_unpack_body(struct mds_body *b);
-void mds_unpack_fid(struct ll_fid *fid);
-void mds_pack_fid(struct ll_fid *fid);
-void mds_pack_req_body(struct ptlrpc_request *);
-void mds_pack_rep_body(struct ptlrpc_request *);
-int mds_update_unpack(struct ptlrpc_request *, int offset,
-                      struct mds_update_record *);
-
-void mds_readdir_pack(struct ptlrpc_request *req, __u64 offset, obd_id ino,
-                      int type, __u64 xid);
-void mds_getattr_pack(struct ptlrpc_request *req, int valid, int offset, int fl,
-                      struct inode *inode, const char *name, int namelen);
-void mds_setattr_pack(struct ptlrpc_request *, struct inode *,
-                      struct iattr *, void *ea, int ealen);
-void mds_create_pack(struct ptlrpc_request *, int offset, struct inode *dir,
-                     __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time,
-                     const char *name, int namelen, const void *data,
-                     int datalen);
-void mds_open_pack(struct ptlrpc_request *, int offset, struct inode *dir,
-                     __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time,
-                     __u32 flags, const char *name, int namelen,
-                     const void *data, int datalen);
-void mds_unlink_pack(struct ptlrpc_request *, int offset, struct inode *inode,
-                     struct inode *child, __u32 mode, const char *name,
-                     int namelen);
-void mds_link_pack(struct ptlrpc_request *, int offset, struct inode *ino,
-                   struct inode *dir, const char *name, int namelen);
-void mds_rename_pack(struct ptlrpc_request *, int offset, struct inode *srcdir,
-                     struct inode *tgtdir, const char *name, int namelen,
-                     const char *tgt, int tgtlen);
-void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode);
-void mds_pack_inode2body(struct mds_body *body, struct inode *inode);
-
 /* mds/handler.c */
+#ifdef __KERNEL__
 struct dentry *mds_name2locked_dentry(struct obd_device *, struct dentry *dir,
                                       struct vfsmount **mnt, char *name,
                                       int namelen, int lock_mode,
@@ -214,64 +221,60 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid,
 int mds_reint(struct ptlrpc_request *req, int offset, struct lustre_handle *);
 int mds_pack_md(struct obd_device *mds, struct lustre_msg *msg,
                 int offset, struct mds_body *body, struct inode *inode);
-void mds_steal_ack_locks(struct mds_export_data *med,
+void mds_steal_ack_locks(struct obd_export *exp,
                          struct ptlrpc_request *req);
 
 /* mds/mds_fs.c */
 int mds_fs_setup(struct obd_device *obddev, struct vfsmount *mnt);
-int mds_fs_cleanup(struct obd_device *obddev);
+int mds_fs_cleanup(struct obd_device *obddev, int failover);
+#endif
 
 /* mdc/mdc_request.c */
 int mdc_enqueue(struct lustre_handle *conn, int lock_type,
-                struct lookup_intent *it, int lock_mode, struct inode *dir,
-                struct dentry *de, struct lustre_handle *lockh, char *tgt,
-                int tgtlen, void *data, int datalen);
-int mdc_cancel_unused(struct lustre_handle *conn, struct inode *, int flags);
+                struct lookup_intent *it, int lock_mode,
+                struct mdc_op_data *enq_data,
+                struct lustre_handle *lockh, char *tgt, int tgtlen,
+                ldlm_completion_callback cb_completion,
+                ldlm_blocking_callback cb_blocking,
+                void *cb_data);
 int mdc_getlovinfo(struct obd_device *obd, struct lustre_handle *mdc_connh,
                    struct ptlrpc_request **request);
 int mdc_getstatus(struct lustre_handle *conn, struct ll_fid *rootfid);
-int mdc_getattr(struct lustre_handle *conn,
-                obd_id ino, int type, unsigned long valid, unsigned int ea_size,
+int mdc_getattr(struct lustre_handle *conn, struct ll_fid *fid,
+                unsigned long valid, unsigned int ea_size,
                 struct ptlrpc_request **request);
-int mdc_getattr_name(struct lustre_handle *conn, struct inode *parent,
+int mdc_getattr_name(struct lustre_handle *conn, struct ll_fid *fid,
                      char *filename, int namelen, unsigned long valid,
                      unsigned int ea_size, struct ptlrpc_request **request);
 int mdc_setattr(struct lustre_handle *conn,
-                struct inode *, struct iattr *iattr,
-                void *ea, int ealen, struct ptlrpc_request **);
+                struct mdc_op_data *data,
+                struct iattr *iattr, void *ea, int ealen,
+                struct ptlrpc_request **request);
 int mdc_open(struct lustre_handle *conn, obd_id ino, int type, int flags,
              struct lov_mds_md *lmm, int lmm_size, struct lustre_handle *fh,
              struct ptlrpc_request **);
-void mdc_set_open_replay_data(struct ll_file_data *fd);
+struct obd_client_handle;
+void mdc_set_open_replay_data(struct obd_client_handle *och);
 int mdc_close(struct lustre_handle *conn, obd_id ino, int type,
               struct lustre_handle *fh,  struct ptlrpc_request **req);
-int mdc_readpage(struct lustre_handle *conn, obd_id ino,
-                 int type, __u64 offset, char *addr, struct ptlrpc_request **);
-int mdc_create(struct lustre_handle *conn,
-               struct inode *dir, const char *name, int namelen,
+int mdc_readpage(struct lustre_handle *conn, obd_id ino, int type, __u64 offset,
+                 struct page *, struct ptlrpc_request **);
+int mdc_create(struct lustre_handle *conn, struct mdc_op_data *op_data,
                const void *data, int datalen, int mode, __u32 uid, __u32 gid,
-               __u64 time, __u64 rdev, struct ptlrpc_request **);
-int mdc_unlink(struct lustre_handle *, struct inode *dir, struct inode *child,
-               __u32 mode, const char *name, int namelen,
-               struct ptlrpc_request **);
-int mdc_link(struct lustre_handle *conn,
-             struct inode *src, struct inode *dir, const char *name,
-             int namelen, struct ptlrpc_request **);
-int mdc_rename(struct lustre_handle *conn,
-               struct inode *src, struct inode *tgt, const char *old,
-               int oldlen, const char *new, int newlen,
-               struct ptlrpc_request **);
+               __u64 time, __u64 rdev, struct ptlrpc_request **request);
+int mdc_unlink(struct lustre_handle *conn, struct mdc_op_data *data,
+               struct ptlrpc_request **request);
+int mdc_link(struct lustre_handle *conn, struct mdc_op_data *data,
+             struct ptlrpc_request **);
+int mdc_rename(struct lustre_handle *conn, struct mdc_op_data *data,
+               const char *old, int oldlen, const char *new, int newlen,
+               struct ptlrpc_request **request);
 int mdc_create_client(struct obd_uuid uuid, struct ptlrpc_client *cl);
-void mdc_lock_set_inode(struct lustre_handle *lock, struct inode *inode);
 
 /* Store the generation of a newly-created inode in |req| for replay. */
 void mdc_store_inode_generation(struct ptlrpc_request *req, int reqoff,
                                 int repoff);
 
-int mds_client_add(struct mds_obd *mds, struct mds_export_data *med,
-                   int cl_off);
-int mds_client_free(struct obd_export *exp);
-
 
 /* ioctls for trying requests */
 #define IOC_REQUEST_TYPE                   'f'
index 6966424..ed5db88 100644 (file)
@@ -1,7 +1,7 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
  *
  *   This file is part of Lustre, http://www.lustre.org.
  *
@@ -30,7 +30,7 @@
 #else
 #include <linux/workqueue.h>
 #endif
-#endif 
+#endif
 
 #include <linux/kp30.h>
 // #include <linux/obd.h>
@@ -38,6 +38,7 @@
 #include <linux/lustre_idl.h>
 #include <linux/lustre_ha.h>
 #include <linux/lustre_import.h>
+#include <linux/lprocfs_status.h>
 
 /* The following constants determine how much memory is devoted to
  * buffering in the lustre services.
@@ -62,7 +63,8 @@
 #define LDLM_BUFSIZE    (8 * 1024)
 #define LDLM_MAXREQSIZE 1024
 
-#define MDT_NUM_THREADS 8
+#define MDT_MAX_THREADS 32UL
+#define MDT_NUM_THREADS min(num_physpages / 8192, MDT_MAX_THREADS)
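/* Editor's note: with 4KiB pages, num_physpages / 8192 gives one MDT thread
 * per 32MiB of RAM, e.g. 16 threads on a 512MiB node, capped at
 * MDT_MAX_THREADS (32) for anything above 1GiB. */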
 #define MDS_NEVENT_MAX  8192UL
 #define MDS_NEVENTS     min(num_physpages / 64, MDS_NEVENT_MAX)
 #define MDS_NBUF_MAX    512UL
  */
 #define MDS_MAXREQSIZE  (5 * 1024)
 
-#define OST_NUM_THREADS 6
+#define OST_MAX_THREADS 36UL
+#define OST_NUM_THREADS min(num_physpages / 8192, OST_MAX_THREADS)
 #define OST_NEVENT_MAX  32768UL
 #define OST_NEVENTS     min(num_physpages / 16, OST_NEVENT_MAX)
 #define OST_NBUF_MAX    1280UL
 #define OST_NBUFS       min(OST_NEVENTS / 64, OST_NBUF_MAX)
 #define OST_BUFSIZE     (8 * 1024)
-/* OST_MAXREQSIZE ~= 1896 bytes =
+/* OST_MAXREQSIZE ~= 1640 bytes =
  * lustre_msg + obdo + 16 * obd_ioobj + 64 * niobuf_remote
  *
- * single object with 16 pages is 576 bytes
+ * single object with 16 pages is 512 bytes
  */
 #define OST_MAXREQSIZE  (2 * 1024)
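For scale (assuming 4 KB pages, which the patch does not state): a node with 512 MB of RAM has num_physpages = 131072, so both MDT_NUM_THREADS and OST_NUM_THREADS evaluate to min(131072 / 8192, cap) = 16; the 32- and 36-thread caps only take effect above roughly 1 GB of RAM.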
 
@@ -120,19 +123,13 @@ struct ptlrpc_connection {
         __u32                   c_epoch;       /* changes when peer changes */
         __u32                   c_bootcount;   /* peer's boot count */
 
-        spinlock_t              c_lock;        /* also protects req->rq_list */
+        spinlock_t              c_lock;
 
         atomic_t                c_refcount;
         __u64                   c_token;
         __u64                   c_remote_conn;
         __u64                   c_remote_token;
 
-        struct list_head        c_delayed_head;/* delayed until post-recovery XXX imp? */
-        struct recovd_data      c_recovd_data;
-
-        struct list_head        c_imports;
-        struct list_head        c_exports;
-        struct list_head        c_sb_chain;
         __u32                   c_flags; // can we indicate INVALID elsewhere?
 };
 
@@ -147,28 +144,76 @@ struct ptlrpc_client {
 };
 
 /* state flags of requests */
+/* XXX only ones left are those used by the bulk descs as well! */
 #define PTL_RPC_FL_INTR      (1 << 0)  /* reply wait was interrupted by user */
-#define PTL_RPC_FL_REPLIED   (1 << 1)  /* reply was received */
-#define PTL_RPC_FL_SENT      (1 << 2)  /* request was sent */
-#define PTL_RPC_FL_WANT_ACK  (1 << 3)  /* reply is awaiting an ACK */
-#define PTL_BULK_FL_SENT     (1 << 4)  /* outgoing bulk was sent */
-#define PTL_BULK_FL_RCVD     (1 << 5)  /* incoming bulk was received */
-#define PTL_RPC_FL_ERR       (1 << 6)  /* request failed due to RPC error */
 #define PTL_RPC_FL_TIMEOUT   (1 << 7)  /* request timed out waiting for reply */
-#define PTL_RPC_FL_RESEND    (1 << 8)  /* retransmit the request */
-#define PTL_RPC_FL_RESTART   (1 << 9)  /* operation must be restarted */
-#define PTL_RPC_FL_RETAIN    (1 << 10) /* retain for replay after reply */
-#define PTL_RPC_FL_REPLAY    (1 << 11) /* replay upon recovery */
-#define PTL_RPC_FL_ALLOCREP  (1 << 12) /* reply buffer allocated */
-#define PTL_RPC_FL_NO_RESEND (1 << 13) /* don't automatically resend this req */
-#define PTL_RPC_FL_RESENT    (1 << 14) /* server rcvd resend of this req */
+
+#define REQ_MAX_ACK_LOCKS 4
+
+#define SWAB_PARANOIA 1
+#if SWAB_PARANOIA
+/* unpacking: assert idx not unpacked already */
+#define LASSERT_REQSWAB(rq, idx)                                \
+do {                                                            \
+        LASSERT ((idx) < sizeof ((rq)->rq_req_swab_mask) * 8);  \
+        LASSERT (((rq)->rq_req_swab_mask & (1 << (idx))) == 0); \
+        (rq)->rq_req_swab_mask |= (1 << (idx));                 \
+} while (0)
+
+#define LASSERT_REPSWAB(rq, idx)                                \
+do {                                                            \
+        LASSERT ((idx) < sizeof ((rq)->rq_rep_swab_mask) * 8);  \
+        LASSERT (((rq)->rq_rep_swab_mask & (1 << (idx))) == 0); \
+        (rq)->rq_rep_swab_mask |= (1 << (idx));                 \
+} while (0)
+
+/* just looking: assert idx already unpacked */
+#define LASSERT_REQSWABBED(rq, idx)                     \
+LASSERT ((idx) < sizeof ((rq)->rq_req_swab_mask) * 8 && \
+         ((rq)->rq_req_swab_mask & (1 << (idx))) != 0)
+
+#define LASSERT_REPSWABBED(rq, idx)                     \
+LASSERT ((idx) < sizeof ((rq)->rq_rep_swab_mask) * 8 && \
+         ((rq)->rq_rep_swab_mask & (1 << (idx))) != 0)
+#else
+#define LASSERT_REQSWAB(rq, idx)
+#define LASSERT_REPSWAB(rq, idx)
+#define LASSERT_REQSWABBED(rq, idx)
+#define LASSERT_REPSWABBED(rq, idx)
+#endif
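The paranoia macros above record, per request, which message buffers have already been byte-swapped. A minimal usage sketch, with buffer index 0 chosen purely for illustration:

static void example_unpack_buf0(struct ptlrpc_request *req)
{
        /* assert buffer 0 has not been unpacked yet, then mark it unpacked */
        LASSERT_REQSWAB(req, 0);

        /* ... byte-swap the fields of request buffer 0 here ... */
}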
+
+union ptlrpc_async_args {
+        /* Scratchpad for passing args to completion interpreter. Users
+         * cast to the struct of their choosing, and LASSERT that this is
+         * big enough.  For _tons_ of context, OBD_ALLOC a struct and store
+         * a pointer to it here.  The pointer_arg ensures this struct is at
+         * least big enough for that. */
+        void      *pointer_arg[4];
+        __u64      space[4];
+};
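A sketch of the scratchpad pattern the comment describes; struct my_getattr_args and my_save_args are illustrative names, not part of the patch:

struct my_getattr_args {
        struct obdo *aa_oa;
};

static void my_save_args(struct ptlrpc_request *req, struct obdo *oa)
{
        struct my_getattr_args *aa;

        /* per the comment above, the context must fit in the union */
        LASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
        aa = (struct my_getattr_args *)&req->rq_async_args;
        aa->aa_oa = oa;
}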
+
+struct ptlrpc_request_set {
+        int               set_remaining; /* # uncompleted requests */
+        wait_queue_head_t set_waitq;
+        struct list_head  set_requests;
+        void             *set_interpret; /* completion callback */
+        union ptlrpc_async_args set_args; /* completion context */
+};
+
+struct ptlrpc_bulk_desc;
 
 struct ptlrpc_request {
         int rq_type; /* one of PTL_RPC_MSG_* */
         struct list_head rq_list;
         struct obd_device *rq_obd;
         int rq_status;
-        int rq_flags;
+        spinlock_t rq_lock;
+        unsigned int rq_intr:1, rq_replied:1, rq_want_ack:1, rq_err:1,
+            rq_timedout:1, rq_resend:1, rq_restart:1, rq_replay:1,
+            rq_no_resend:1, rq_resent:1, rq_no_recov:1, rq_waiting:1,
+            rq_receiving_reply:1;
+        int rq_phase;
+
         atomic_t rq_refcount;
 
         int rq_request_portal; /* XXX FIXME bug 249 */
@@ -183,12 +228,18 @@ struct ptlrpc_request {
         __u64 rq_transno;
         __u64 rq_xid;
 
+#if SWAB_PARANOIA
+        __u32 rq_req_swab_mask;
+        __u32 rq_rep_swab_mask;
+#endif
+
+        int rq_import_generation;
         int rq_level;
         wait_queue_head_t rq_wait_for_rep; /* XXX also _for_ack */
 
         /* incoming reply */
         ptl_md_t rq_reply_md;
-        ptl_handle_me_t rq_reply_me_h;
+        ptl_handle_md_t rq_reply_md_h;
 
         /* outgoing req/rep */
         ptl_md_t rq_req_md;
@@ -202,26 +253,60 @@ struct ptlrpc_request {
         void (*rq_replay_cb)(struct ptlrpc_request *);
         void  *rq_replay_data;
 
+        struct ptlrpc_bulk_desc *rq_bulk;       /* client side bulk */
+        time_t rq_sent;                         /* when the request was sent */
+
+        /* Multi-rpc bits */
+        struct list_head rq_set_chain;
+        struct ptlrpc_request_set *rq_set;
+        void *rq_interpret_reply;               /* Async completion handler */
+        union ptlrpc_async_args rq_async_args;  /* Async completion context */
+
         /* Only used on the server side for tracking acks. */
         struct ptlrpc_req_ack_lock {
                 struct lustre_handle lock;
                 __u32                mode;
-        } rq_ack_locks[4];
+        } rq_ack_locks[REQ_MAX_ACK_LOCKS];
 };
 
+#define RQ_PHASE_NEW           0xebc0de00
+#define RQ_PHASE_RPC           0xebc0de01
+#define RQ_PHASE_BULK          0xebc0de02
+#define RQ_PHASE_INTERPRET     0xebc0de03
+#define RQ_PHASE_COMPLETE      0xebc0de04
+
+/* Spare the preprocessor, spoil the bugs. */
+#define FLAG(field, str) (field ? str : "")
+
+#define DEBUG_REQ_FLAGS(req)                                                    \
+        ((req->rq_phase == RQ_PHASE_NEW) ? "New" :                              \
+         (req->rq_phase == RQ_PHASE_RPC) ? "Rpc" :                              \
+         (req->rq_phase == RQ_PHASE_BULK) ? "Bulk" :                            \
+         (req->rq_phase == RQ_PHASE_INTERPRET) ? "Interpret" :                  \
+         (req->rq_phase == RQ_PHASE_COMPLETE) ? "Complete" : "?phase?"),        \
+        FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"),                    \
+        FLAG(req->rq_want_ack, "A"), FLAG(req->rq_err, "E"),                    \
+        FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"),   \
+        FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"),                  \
+        FLAG(req->rq_no_resend, "N"), FLAG(req->rq_resent, "s"),                \
+        FLAG(req->rq_no_recov, "n"), FLAG(req->rq_waiting, "W")
+
+#define REQ_FLAGS_FMT "%s%s%s%s%s%s%s%s%s%s%s%s%s"
+
 #define DEBUG_REQ(level, req, fmt, args...)                                    \
 do {                                                                           \
-CDEBUG(level,                                                                  \
-       "@@@ " fmt " req@%p x"LPD64"/t"LPD64" o%d->%s:%d lens %d/%d ref %d fl " \
-       "%x/%x/%x rc %x\n" ,  ## args, req, req->rq_xid,                        \
+CDEBUG(level, "@@@ " fmt                                                       \
+       " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d lens %d/%d ref %d fl "         \
+       REQ_FLAGS_FMT"/%x/%x rc %x\n" ,  ## args, req, req->rq_xid,             \
        req->rq_reqmsg ? req->rq_reqmsg->transno : -1,                          \
        req->rq_reqmsg ? req->rq_reqmsg->opc : -1,                              \
+       req->rq_import ? (char *)req->rq_import->imp_target_uuid.uuid : "<?>",  \
        req->rq_connection ?                                                    \
           (char *)req->rq_connection->c_remote_uuid.uuid : "<?>",              \
        (req->rq_import && req->rq_import->imp_client) ?                        \
            req->rq_import->imp_client->cli_request_portal : -1,                \
        req->rq_reqlen, req->rq_replen,                                         \
-       atomic_read (&req->rq_refcount), req->rq_flags,                         \
+       atomic_read(&req->rq_refcount),                                         \
+       DEBUG_REQ_FLAGS(req),                                                   \
        req->rq_reqmsg ? req->rq_reqmsg->flags : 0,                             \
        req->rq_repmsg ? req->rq_repmsg->flags : 0,                             \
        req->rq_status);                                                        \
@@ -230,45 +315,43 @@ CDEBUG(level,                                                                  \
 struct ptlrpc_bulk_page {
         struct ptlrpc_bulk_desc *bp_desc;
         struct list_head bp_link;
-        void *bp_buf;
         int bp_buflen;
+        int bp_pageoffset;                      /* offset within a page */
         struct page *bp_page;
-        __u32 bp_xid;
-        __u32 bp_flags;
-        struct dentry *bp_dentry;
-        int (*bp_cb)(struct ptlrpc_bulk_page *);
 };
 
+#define BULK_GET_SOURCE   0
+#define BULK_PUT_SINK     1
+#define BULK_GET_SINK     2
+#define BULK_PUT_SOURCE   3
 
 struct ptlrpc_bulk_desc {
-        struct list_head bd_set_chain; /* entry in obd_brw_set */
-        struct obd_brw_set *bd_brw_set;
-        int bd_flags;
-        struct ptlrpc_connection *bd_connection;
-        struct ptlrpc_client *bd_client;
+        unsigned int bd_complete:1;
+        unsigned int bd_network_rw:1;           /* accessible to the network */
+        unsigned int bd_type:2;                 /* {put,get}{source,sink} */
+        unsigned int bd_registered:1;           /* client side */
+        spinlock_t   bd_lock;                   /* serialise with callback */
+        int bd_import_generation;
+        struct obd_export *bd_export;
+        struct obd_import *bd_import;
         __u32 bd_portal;
-        struct lustre_handle bd_conn;
-        void (*bd_ptl_ev_hdlr)(struct ptlrpc_bulk_desc *);
-
-        wait_queue_head_t bd_waitq;
+        struct ptlrpc_request *bd_req;          /* associated request */
+        wait_queue_head_t bd_waitq;             /* server side only WQ */
         struct list_head bd_page_list;
         __u32 bd_page_count;
-        atomic_t bd_refcount;
-        void *bd_desc_private;
-
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-        struct work_struct bd_queue;
-#else
-        struct tq_struct bd_queue;
-#endif
-
+        __u32 bd_last_xid;
+
         ptl_md_t bd_md;
         ptl_handle_md_t bd_md_h;
         ptl_handle_me_t bd_me_h;
 
-        atomic_t bd_source_callback_count;
+        int bd_callback_count;                  /* server side callbacks */
 
+#ifdef __KERNEL__
+        ptl_kiov_t bd_iov[16];    /* self-sized pre-allocated iov */
+#else
         struct iovec bd_iov[16];    /* self-sized pre-allocated iov */
+#endif
 };
 
 struct ptlrpc_thread {
@@ -289,6 +372,7 @@ struct ptlrpc_request_buffer_desc {
 struct ptlrpc_ni {
         /* Generic interface state */
         char                   *pni_name;
+        int                     pni_number;
         ptl_handle_ni_t         pni_ni_h;
         ptl_handle_eq_t         pni_request_out_eq_h;
         ptl_handle_eq_t         pni_reply_in_eq_h;
@@ -328,29 +412,23 @@ struct ptlrpc_service {
         struct list_head srv_threads;
         int (*srv_handler)(struct ptlrpc_request *req);
         char *srv_name;  /* only statically allocated strings here; we don't clean them */
+        struct proc_dir_entry   *svc_procroot;    
+        struct lprocfs_counters *svc_counters;
 
         int                  srv_interface_rover;
         struct ptlrpc_srv_ni srv_interfaces[0];
 };
 
-static inline void ptlrpc_hdl2req(struct ptlrpc_request *req,
-                                  struct lustre_handle *h)
-{
-        req->rq_reqmsg->addr = h->addr;
-        req->rq_reqmsg->cookie = h->cookie;
-}
-
-typedef void (*bulk_callback_t)(struct ptlrpc_bulk_desc *, void *);
-
 typedef int (*svc_handler_t)(struct ptlrpc_request *req);
 
-/* rpc/events.c */
+/* ptlrpc/events.c */
 extern struct ptlrpc_ni ptlrpc_interfaces[];
 extern int              ptlrpc_ninterfaces;
-extern int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer);
+extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, struct ptlrpc_peer *peer);
 
-/* rpc/connection.c */
-void ptlrpc_readdress_connection(struct ptlrpc_connection *, struct obd_uuid *uuid);
+/* ptlrpc/connection.c */
+void ptlrpc_dump_connections(void);
+void ptlrpc_readdress_connection(struct ptlrpc_connection *, struct obd_uuid *);
 struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer,
                                                 struct obd_uuid *uuid);
 int ptlrpc_put_connection(struct ptlrpc_connection *c);
@@ -358,58 +436,74 @@ struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *);
 void ptlrpc_init_connection(void);
 void ptlrpc_cleanup_connection(void);
 
-/* rpc/niobuf.c */
-int ptlrpc_check_bulk_sent(struct ptlrpc_bulk_desc *bulk);
-int ptlrpc_check_bulk_received(struct ptlrpc_bulk_desc *bulk);
+/* ptlrpc/niobuf.c */
 int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *);
 int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *);
-int ptlrpc_register_bulk_put(struct ptlrpc_bulk_desc *);
-int ptlrpc_register_bulk_get(struct ptlrpc_bulk_desc *);
-int ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *bulk);
-struct obd_brw_set *obd_brw_set_new(void);
-void obd_brw_set_add(struct obd_brw_set *, struct ptlrpc_bulk_desc *);
-void obd_brw_set_del(struct ptlrpc_bulk_desc *);
-void obd_brw_set_decref(struct obd_brw_set *set);
-void obd_brw_set_addref(struct obd_brw_set *set);
-
-int ptlrpc_reply(struct ptlrpc_service *svc, struct ptlrpc_request *req);
-int ptlrpc_error(struct ptlrpc_service *svc, struct ptlrpc_request *req);
+void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *bulk);
+int ptlrpc_register_bulk(struct ptlrpc_request *req);
+void ptlrpc_unregister_bulk (struct ptlrpc_request *req);
+
+static inline int ptlrpc_bulk_complete (struct ptlrpc_bulk_desc *desc) 
+{
+        unsigned long flags;
+        int           rc;
+
+        spin_lock_irqsave (&desc->bd_lock, flags);
+        rc = desc->bd_complete;
+        spin_unlock_irqrestore (&desc->bd_lock, flags);
+        return (rc);
+}
+
+int ptlrpc_reply(struct ptlrpc_request *req);
+int ptlrpc_error(struct ptlrpc_request *req);
 void ptlrpc_resend_req(struct ptlrpc_request *request);
 int ptl_send_rpc(struct ptlrpc_request *request);
 void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd);
 
-/* rpc/client.c */
+/* ptlrpc/client.c */
 void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
                         struct ptlrpc_client *);
 void ptlrpc_cleanup_client(struct obd_import *imp);
 struct obd_uuid *ptlrpc_req_to_uuid(struct ptlrpc_request *req);
 struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid);
 
-int ll_brw_sync_wait(struct obd_brw_set *, int phase);
-
 int ptlrpc_queue_wait(struct ptlrpc_request *req);
-void ptlrpc_continue_req(struct ptlrpc_request *req);
 int ptlrpc_replay_req(struct ptlrpc_request *req);
-int ptlrpc_abort(struct ptlrpc_request *req);
+void ptlrpc_unregister_reply(struct ptlrpc_request *req);
 void ptlrpc_restart_req(struct ptlrpc_request *req);
-void ptlrpc_abort_inflight(struct obd_import *imp, int dying_import);
+void ptlrpc_abort_inflight(struct obd_import *imp);
+
+struct ptlrpc_request_set *ptlrpc_prep_set(void);
+int ptlrpc_set_wait(struct ptlrpc_request_set *);
+void ptlrpc_set_destroy(struct ptlrpc_request_set *);
+void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *);
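A sketch of the expected life cycle of a request set, based only on the declarations above; the NULL-on-failure convention for ptlrpc_prep_set() is an assumption:

static int example_send_set(struct ptlrpc_request **reqs, int nreqs)
{
        struct ptlrpc_request_set *set;
        int i, rc;

        set = ptlrpc_prep_set();
        if (set == NULL)                /* assumed failure convention */
                return -ENOMEM;

        for (i = 0; i < nreqs; i++)
                ptlrpc_set_add_req(set, reqs[i]);

        rc = ptlrpc_set_wait(set);      /* returns when set_remaining hits 0 */
        ptlrpc_set_destroy(set);
        return rc;
}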
 
 struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode,
                                        int count, int *lengths, char **bufs);
 void ptlrpc_free_req(struct ptlrpc_request *request);
 void ptlrpc_req_finished(struct ptlrpc_request *request);
 struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req);
-struct ptlrpc_bulk_desc *ptlrpc_prep_bulk(struct ptlrpc_connection *);
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req,
+                                               int type, int portal);
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req,
+                                              int type, int portal);
 void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk);
-struct ptlrpc_bulk_page *ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc);
+int ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+                          struct page *page, int pageoffset, int len);
 void ptlrpc_free_bulk_page(struct ptlrpc_bulk_page *page);
 void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
                                       struct obd_import *imp);
+__u64 ptlrpc_next_xid(void);
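A client-side sketch combining the new bulk helpers; OST_BULK_PORTAL is assumed to be the usual portal constant and is not defined in this header:

static int example_attach_bulk_read(struct ptlrpc_request *req,
                                    struct page *page, int len)
{
        struct ptlrpc_bulk_desc *desc;

        /* a bulk read arrives as a PUT into our sink buffer */
        desc = ptlrpc_prep_bulk_imp(req, BULK_PUT_SINK, OST_BULK_PORTAL);
        if (desc == NULL)
                return -ENOMEM;

        return ptlrpc_prep_bulk_page(desc, page, 0, len);
}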
 
-/* rpc/service.c */
+/* ptlrpc/ptlrpc_module.c */
+void ptlrpc_put_ldlm_hooks(void);
+int ptlrpc_ldlm_hooks_referenced(void);
+
+/* ptlrpc/service.c */
 struct ptlrpc_service *
 ptlrpc_init_svc(__u32 nevents, __u32 nbufs, __u32 bufsize, __u32 max_req_size,
-                int req_portal, int rep_portal, svc_handler_t, char *name);
+                int req_portal, int rep_portal, svc_handler_t, char *name,
+                struct obd_device *dev);
 void ptlrpc_stop_all_threads(struct ptlrpc_service *svc);
 int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
                         char *name);
@@ -422,31 +516,21 @@ struct ptlrpc_svc_data {
         struct obd_device *dev;
 };
 
-/* rpc/pack_generic.c */
+/* ptlrpc/pack_generic.c */
 int lustre_pack_msg(int count, int *lens, char **bufs, int *len,
                     struct lustre_msg **msg);
 int lustre_msg_size(int count, int *lengths);
 int lustre_unpack_msg(struct lustre_msg *m, int len);
-void *lustre_msg_buf(struct lustre_msg *m, int n);
-
-/* rpc/rpc.c */
-__u32 ptlrpc_next_xid(void);
-
-static inline void ptlrpc_bulk_decref(struct ptlrpc_bulk_desc *desc)
-{
-        CDEBUG(D_PAGE, "%p -> %d\n", desc, atomic_read(&desc->bd_refcount) - 1);
-
-        if (atomic_dec_and_test(&desc->bd_refcount)) {
-                CDEBUG(D_PAGE, "Released last ref on %p, freeing\n", desc);
-                ptlrpc_free_bulk(desc);
-        }
-}
-
-static inline void ptlrpc_bulk_addref(struct ptlrpc_bulk_desc *desc)
-{
-        atomic_inc(&desc->bd_refcount);
-        CDEBUG(D_PAGE, "Set refcount of %p to %d\n", desc,
-               atomic_read(&desc->bd_refcount));
-}
+void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen);
+char *lustre_msg_string (struct lustre_msg *m, int n, int max_len);
+void *lustre_swab_reqbuf (struct ptlrpc_request *req, int n, int minlen,
+                          void *swabber);
+void *lustre_swab_repbuf (struct ptlrpc_request *req, int n, int minlen,
+                          void *swabber);
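A handler-side sketch of the new unpack-with-swab interface; struct example_body and example_swab_body are illustrative stand-ins for a real wire structure and its swabber:

struct example_body {
        __u64 eb_size;                  /* illustrative wire field */
};

static void example_swab_body(struct example_body *b)
{
        /* byte-swap each field here when the peer's endianness differs */
}

static struct example_body *example_unpack(struct ptlrpc_request *req)
{
        struct example_body *body;

        body = lustre_swab_reqbuf(req, 0, sizeof(*body), example_swab_body);
        if (body == NULL)
                CERROR("can't unpack example_body\n");
        return body;
}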
+
+/* ldlm/ldlm_lib.c */
+int client_import_connect(struct lustre_handle *conn, struct obd_device *obd,
+                          struct obd_uuid *cluuid);
+int client_import_disconnect(struct lustre_handle *conn, int failover);
 
 #endif
index f3163fe..fe53974 100644 (file)
@@ -17,7 +17,11 @@ struct lov_oinfo { /* per-child structure */
 };
 
 struct lov_stripe_md {
+        /* Public members. */
         __u64 lsm_object_id;        /* lov object id */
+        __u64 lsm_maxbytes;
+
+        /* LOV-private members start here -- only for use in lov/. */
         __u32 lsm_magic;
         __u32 lsm_stripe_size;      /* size of the stripe */
         unsigned lsm_stripe_offset; /* offset of first stripe in lmd_objects */
@@ -28,6 +32,7 @@ struct lov_stripe_md {
 #define IOC_OSC_TYPE         'h'
 #define IOC_OSC_MIN_NR       20
 #define IOC_OSC_REGISTER_LOV _IOWR(IOC_OSC_TYPE, 20, struct obd_device *)
+#define IOC_OSC_SET_ACTIVE   _IOWR(IOC_OSC_TYPE, 21, struct obd_device *)
 #define IOC_OSC_MAX_NR       50
 
 #define IOC_MDC_TYPE         'i'
@@ -66,48 +71,8 @@ struct brw_page {
 
 /* Individual type definitions */
 
-struct ext2_obd {
-        struct super_block *e2_sb;
-        struct vfsmount *e2_vfsmnt;
-};
-
-struct obd_ucred {
-        __u32 ouc_fsuid;
-        __u32 ouc_fsgid;
-        __u32 ouc_cap;
-        __u32 ouc_suppgid1;
-        __u32 ouc_suppgid2;
-};
-
-#define OBD_RUN_CTXT_MAGIC      0xC0FFEEAA
-#define OBD_CTXT_DEBUG          /* development-only debugging */
-struct obd_run_ctxt {
-        struct vfsmount *pwdmnt;
-        struct dentry   *pwd;
-        mm_segment_t     fs;
-        __u32            fsuid;
-        __u32            fsgid;
-        __u32            cap;
-#ifdef OBD_CTXT_DEBUG
-        __u32            magic;
-#endif
-};
-
-
-#ifdef OBD_CTXT_DEBUG
-#define OBD_SET_CTXT_MAGIC(ctxt) (ctxt)->magic = OBD_RUN_CTXT_MAGIC
-#else
-#define OBD_SET_CTXT_MAGIC(ctxt) do {} while(0)
-#endif
-
 struct ost_server_data;
 
-#define FILTER_TRANSNO_SEM
-
-#ifndef OST_RECOVERY
-#undef FILTER_TRANSNO_SEM
-#endif
-
 struct filter_obd {
         char *fo_fstype;
         struct super_block *fo_sb;
@@ -117,11 +82,7 @@ struct filter_obd {
         struct dentry *fo_dentry_O_mode[16];
         struct dentry **fo_dentry_O_sub;
         spinlock_t fo_objidlock;        /* protects fo_lastobjid increment */
-#ifdef FILTER_TRANSNO_SEM
-        struct semaphore fo_transno_sem;
-#else
         spinlock_t fo_translock;        /* protects fsd_last_rcvd increment */
-#endif
         struct file *fo_rcvd_filp;
         struct filter_server_data *fo_fsd;
         unsigned long *fo_last_rcvd_slots;
@@ -137,10 +98,9 @@ struct filter_obd {
 struct mds_server_data;
 
 struct client_obd {
-        struct obd_import    cl_import;
+        struct obd_import   *cl_import;
         struct semaphore     cl_sem;
         int                  cl_conn_count;
-        struct obd_uuid           cl_target_uuid; /* XXX -> lustre_name */
         /* max_mds_easize is purely a performance thing so we don't have to
          * call obd_size_wiremd() all the time. */
         int                  cl_max_mds_easize;
@@ -155,6 +115,7 @@ struct mds_obd {
 
         struct super_block              *mds_sb;
         struct vfsmount                 *mds_vfsmnt;
+        struct dentry                   *mds_fid_de;
         struct obd_run_ctxt              mds_ctxt;
         struct file_operations          *mds_fop;
         struct inode_operations         *mds_iop;
@@ -170,6 +131,7 @@ struct mds_obd {
 
         int                              mds_has_lov_desc;
         struct lov_desc                  mds_lov_desc;
+        unsigned long                   *mds_client_bitmap;
 };
 
 struct ldlm_obd {
@@ -202,8 +164,10 @@ struct ptlbd_obd {
         struct ptlrpc_service *ptlbd_service;
         struct file *filp;
         /* client's */
-        struct ptlrpc_client bd_client;
-        struct obd_import bd_import;
+        struct ptlrpc_client    bd_client;
+        struct obd_import       *bd_import;
+        struct obd_uuid         bd_server_uuid;
+        struct lustre_handle    bd_connect_handle;
         int refcount; /* XXX sigh */
 };
 
@@ -219,18 +183,6 @@ struct recovd_obd {
         __u32                 recovd_state;
 };
 
-struct trace_obd {
-        struct obdtrace_opstats *stats;
-};
-
-#if 0
-struct snap_obd {
-        unsigned int snap_index;  /* which snapshot index are we accessing */
-        int snap_tableno;
-};
-
-#endif
-
 struct ost_obd {
         struct ptlrpc_service *ost_service;
 };
@@ -245,8 +197,8 @@ struct echo_client_obd {
 };
 
 struct cache_obd {
-        struct lustre_handle cobd_target;       /* local connection to target obd */
-        struct lustre_handle cobd_cache;        /* local connection to cache obd */
+        struct lustre_handle cobd_target;   /* local connection to target obd */
+        struct lustre_handle cobd_cache;    /* local connection to cache obd */
 };
 
 struct lov_tgt_desc {
@@ -267,11 +219,9 @@ struct lov_obd {
 struct niobuf_local {
         __u64 offset;
         __u32 len;
-        __u32 xid;
         __u32 flags;
-        void *addr;
+        __u32 rc;
         struct page *page;
-        void *target_private;
         struct dentry *dentry;
 };
 
@@ -280,6 +230,11 @@ struct niobuf_local {
 
 struct obd_trans_info {
         __u64     oti_transno;
+        /* Only used on the server side for tracking acks. */
+        struct oti_req_ack_lock {
+                struct lustre_handle lock;
+                __u32                mode;
+        } oti_ack_locks[4];
 };
 
 /* corresponds to one of the obd's */
@@ -291,7 +246,11 @@ struct obd_device {
         struct obd_uuid obd_uuid;
 
         int obd_minor;
-        int obd_flags;
+        int obd_attached:1, obd_set_up:1, obd_recovering:1,
+            obd_abort_recovery:1, obd_replayable:1, obd_no_transno:1,
+            obd_no_recov:1, obd_stopping:1;
+        atomic_t obd_refcount;
+        wait_queue_head_t obd_refcount_waitq;
         struct proc_dir_entry *obd_proc_entry;
         struct list_head       obd_exports;
         struct list_head       obd_imports;
@@ -309,12 +268,12 @@ struct obd_device {
         pid_t                            obd_processing_task;
         __u64                            obd_next_recovery_transno;
         wait_queue_head_t                obd_next_transno_waitq;
+        wait_queue_head_t                obd_commit_waitq;
         struct timer_list                obd_recovery_timer;
         struct list_head                 obd_recovery_queue;
         struct list_head                 obd_delayed_reply_queue;
 
         union {
-                struct ext2_obd ext2;
                 struct filter_obd filter;
                 struct mds_obd mds;
                 struct client_obd cli;
@@ -323,16 +282,12 @@ struct obd_device {
                 struct ldlm_obd ldlm;
                 struct echo_obd echo;
                 struct recovd_obd recovd;
-                struct trace_obd trace;
                 struct lov_obd lov;
                 struct cache_obd cobd;
                 struct ptlbd_obd ptlbd;
-#if 0
-                struct snap_obd snap;
-#endif
         } u;
        /* Fields used by LProcFS */
-        unsigned int cntr_mem_size;
+        unsigned int cntr_base;
         void *counters;
 };
 
@@ -340,27 +295,25 @@ struct obd_ops {
         struct module *o_owner;
         int (*o_iocontrol)(unsigned int cmd, struct lustre_handle *, int len,
                            void *karg, void *uarg);
-        int (*o_get_info)(struct lustre_handle *, obd_count keylen, void *key,
-                          obd_count *vallen, void **val);
-        int (*o_set_info)(struct lustre_handle *, obd_count keylen, void *key,
-                          obd_count vallen, void *val);
+        int (*o_get_info)(struct lustre_handle *, __u32 keylen, void *key,
+                          __u32 *vallen, void *val);
+        int (*o_set_info)(struct lustre_handle *, __u32 keylen, void *key,
+                          __u32 vallen, void *val);
         int (*o_attach)(struct obd_device *dev, obd_count len, void *data);
         int (*o_detach)(struct obd_device *dev);
         int (*o_setup) (struct obd_device *dev, obd_count len, void *data);
-        int (*o_cleanup)(struct obd_device *dev);
+        int (*o_cleanup)(struct obd_device *dev, int force, int failover);
         int (*o_connect)(struct lustre_handle *conn, struct obd_device *src,
-                         struct obd_uuid *cluuid, struct recovd_obd *recovd,
-                         ptlrpc_recovery_cb_t recover);
-        int (*o_disconnect)(struct lustre_handle *conn);
-
+                         struct obd_uuid *cluuid);
+        int (*o_disconnect)(struct lustre_handle *conn, int failover);
 
         int (*o_statfs)(struct lustre_handle *conn, struct obd_statfs *osfs);
-        int (*o_syncfs)(struct lustre_handle *conn);
-        int (*o_packmd)(struct lustre_handle *, struct lov_mds_md **wire_tgt,
+        int (*o_syncfs)(struct obd_export *);
+        int (*o_packmd)(struct lustre_handle *, struct lov_mds_md **disk_tgt,
                         struct lov_stripe_md *mem_src);
         int (*o_unpackmd)(struct lustre_handle *,
                           struct lov_stripe_md **mem_tgt,
-                          struct lov_mds_md *wire_src);
+                          struct lov_mds_md *disk_src, int disk_len);
         int (*o_preallocate)(struct lustre_handle *, obd_count *req,
                              obd_id *ids);
         int (*o_create)(struct lustre_handle *conn,  struct obdo *oa,
@@ -371,14 +324,21 @@ struct obd_ops {
                          struct lov_stripe_md *ea, struct obd_trans_info *oti);
         int (*o_getattr)(struct lustre_handle *conn, struct obdo *oa,
                          struct lov_stripe_md *ea);
+        int (*o_getattr_async)(struct lustre_handle *conn, struct obdo *oa,
+                               struct lov_stripe_md *ea, 
+                               struct ptlrpc_request_set *set);
         int (*o_open)(struct lustre_handle *conn, struct obdo *oa,
-                      struct lov_stripe_md *ea, struct obd_trans_info *oti);
+                      struct lov_stripe_md *ea, struct obd_trans_info *oti,
+                      struct obd_client_handle *och);
         int (*o_close)(struct lustre_handle *conn, struct obdo *oa,
                        struct lov_stripe_md *ea, struct obd_trans_info *oti);
         int (*o_brw)(int rw, struct lustre_handle *conn,
                      struct lov_stripe_md *ea, obd_count oa_bufs,
-                     struct brw_page *pgarr, struct obd_brw_set *,
-                     struct obd_trans_info *oti);
+                     struct brw_page *pgarr, struct obd_trans_info *oti);
+        int (*o_brw_async)(int rw, struct lustre_handle *conn,
+                           struct lov_stripe_md *ea, obd_count oa_bufs,
+                           struct brw_page *pgarr, struct ptlrpc_request_set *,
+                           struct obd_trans_info *oti);
         int (*o_punch)(struct lustre_handle *conn, struct obdo *tgt,
                        struct lov_stripe_md *ea, obd_size count,
                        obd_off offset, struct obd_trans_info *oti);
@@ -392,12 +352,12 @@ struct obd_ops {
         int (*o_iterate)(struct lustre_handle *conn,
                          int (*)(obd_id, obd_gr, void *),
                          obd_id *startid, obd_gr group, void *data);
-        int (*o_preprw)(int cmd, struct lustre_handle *conn,
+        int (*o_preprw)(int cmd, struct obd_export *,
                         int objcount, struct obd_ioobj *obj,
                         int niocount, struct niobuf_remote *remote,
                         struct niobuf_local *local, void **desc_private, 
                         struct obd_trans_info *oti);
-        int (*o_commitrw)(int cmd, struct lustre_handle *conn,
+        int (*o_commitrw)(int cmd, struct obd_export *,
                           int objcount, struct obd_ioobj *obj,
                           int niocount, struct niobuf_local *local,
                           void *desc_private, struct obd_trans_info *oti);
@@ -406,12 +366,37 @@ struct obd_ops {
                          __u32 type, void *cookie, int cookielen, __u32 mode,
                          int *flags, void *cb, void *data, int datalen,
                          struct lustre_handle *lockh);
+        int (*o_match)(struct lustre_handle *conn, struct lov_stripe_md *md,
+                         __u32 type, void *cookie, int cookielen, __u32 mode,
+                         int *flags, struct lustre_handle *lockh);
         int (*o_cancel)(struct lustre_handle *, struct lov_stripe_md *md,
                         __u32 mode, struct lustre_handle *);
         int (*o_cancel_unused)(struct lustre_handle *, struct lov_stripe_md *,
-                               int local_only);
+                               int local_only, void *opaque);
         int (*o_san_preprw)(int cmd, struct lustre_handle *conn,
                             int objcount, struct obd_ioobj *obj,
                             int niocount, struct niobuf_remote *remote);
+        void (*o_destroy_export)(struct obd_export *export);
 };
+
+static inline void obd_transno_commit_cb(struct obd_device *obd, __u64 transno,
+                                         int error)
+{
+        if (error) {
+                CDEBUG(D_ERROR, "%s: transno "LPD64" commit error: %d\n",
+                       obd->obd_name, transno, error);
+                return;
+        }
+        CDEBUG(D_HA, "%s: transno "LPD64" committed\n",
+               obd->obd_name, transno);
+        if (transno > obd->obd_last_committed) {
+                obd->obd_last_committed = transno;
+                wake_up(&obd->obd_commit_waitq);
+        }
+}
+
+/* When adding a function pointer to struct obd_ops, please update 
+ * function lprocfs_alloc_obd_counters() in obdclass/lprocfs_status.c
+ * accordingly. */
+
 #endif /* __OBD_H */
index b571b06..64b0a68 100644 (file)
 #define __LINUX_CLASS_OBD_H
 
 #ifndef __KERNEL__
-# include <stdint.h>
-# define __KERNEL__
-# include <linux/list.h>
-# undef __KERNEL__
+#include <sys/types.h>
+#include <portals/list.h>
 #else
 #include <asm/segment.h>
 #include <asm/uaccess.h>
 #define MAX_OBD_DEVICES 128
 extern struct obd_device obd_dev[MAX_OBD_DEVICES];
 
-#define OBD_ATTACHED       0x01
-#define OBD_SET_UP         0x02
-#define OBD_RECOVERING     0x04
-#define OBD_ABORT_RECOVERY 0x08
-#define OBD_REPLAYABLE     0x10
-#define OBD_NO_TRANSNO     0x20 /* XXX needs better name */
-
 /* OBD Operations Declarations */
 extern struct obd_device *class_conn2obd(struct lustre_handle *);
-extern struct obd_export *class_conn2export(struct lustre_handle *);
+
+/* genops.c */
+struct obd_export *class_conn2export(struct lustre_handle *);
+int class_register_type(struct obd_ops *ops, struct lprocfs_vars *, char *nm);
+int class_unregister_type(char *nm);
+int class_name2dev(char *name);
+int class_uuid2dev(struct obd_uuid *uuid);
+struct obd_device *class_uuid2obd(struct obd_uuid *uuid);
+
+struct obd_export *class_export_get(struct obd_export *);
+void class_export_put(struct obd_export *);
+struct obd_export *class_new_export(struct obd_device *obddev);
+void class_unlink_export(struct obd_export *exp);
+
+struct obd_import *class_import_get(struct obd_import *);
+void class_import_put(struct obd_import *);
+struct obd_import *class_new_import(void);
+void class_destroy_import(struct obd_import *exp);
+
+struct obd_type *class_get_type(char *name);
+void class_put_type(struct obd_type *type);
+int class_connect(struct lustre_handle *conn, struct obd_device *obd,
+                  struct obd_uuid *cluuid);
+int class_disconnect(struct lustre_handle *conn, int failover);
+void class_disconnect_exports(struct obd_device *obddev, int failover);
+/* generic operations shared by various OBD types */
+int class_multi_setup(struct obd_device *obddev, uint32_t len, void *data);
+int class_multi_cleanup(struct obd_device *obddev);
 
 static inline int obd_check_conn(struct lustre_handle *conn)
 {
@@ -76,12 +94,12 @@ static inline int obd_check_conn(struct lustre_handle *conn)
                 RETURN(-ENODEV);
         }
 
-        if (!obd->obd_flags & OBD_ATTACHED ) {
+        if (!obd->obd_attached) {
                 CERROR("obd %d not attached\n", obd->obd_minor);
                 RETURN(-ENODEV);
         }
 
-        if (!obd->obd_flags & OBD_SET_UP) {
+        if (!obd->obd_set_up) {
                 CERROR("obd %d not setup\n", obd->obd_minor);
                 RETURN(-ENODEV);
         }
@@ -103,41 +121,108 @@ static inline int obd_check_conn(struct lustre_handle *conn)
 #define OBT(dev)        (dev)->obd_type
 #define OBP(dev, op)    (dev)->obd_type->typ_ops->o_ ## op
 
-#define OBD_CHECK_SETUP(conn, exp)                              \
+/* Ensure obd_setup: used for disconnect which might be called while
+   an obd is stopping. */
+#define OBD_CHECK_SETUP(conn, exp)                                      \
+do {                                                                    \
+        if (!(conn)) {                                                  \
+                CERROR("NULL connection\n");                            \
+                RETURN(-EINVAL);                                        \
+        }                                                               \
+                                                                        \
+        exp = class_conn2export(conn);                                  \
+        if (!(exp)) {                                                   \
+                CERROR("No export for conn "LPX64"\n", (conn)->cookie); \
+                RETURN(-EINVAL);                                        \
+        }                                                               \
+                                                                        \
+        if (!(exp)->exp_obd->obd_set_up) {                              \
+                CERROR("Device %d not setup\n",                         \
+                       (exp)->exp_obd->obd_minor);                      \
+                class_export_put(exp);                                  \
+                RETURN(-EINVAL);                                        \
+        }                                                               \
+} while (0)
+
+/* Ensure obd_setup and !obd_stopping. */
+#define OBD_CHECK_ACTIVE(conn, exp)                                     \
+do {                                                                    \
+        if (!(conn)) {                                                  \
+                CERROR("NULL connection\n");                            \
+                RETURN(-EINVAL);                                        \
+        }                                                               \
+                                                                        \
+        exp = class_conn2export(conn);                                  \
+        if (!(exp)) {                                                   \
+                CERROR("No export for conn "LPX64"\n", (conn)->cookie); \
+                RETURN(-EINVAL);                                        \
+        }                                                               \
+                                                                        \
+        if (!(exp)->exp_obd->obd_set_up || (exp)->exp_obd->obd_stopping) { \
+                CERROR("Device %d not setup\n",                         \
+                       (exp)->exp_obd->obd_minor);                      \
+                class_export_put(exp);                                  \
+                RETURN(-EINVAL);                                        \
+        }                                                               \
+} while (0)
+
+/* Ensure obd_setup: used for cleanup which must be called
+   while obd is stopping */
+#define OBD_CHECK_DEV_STOPPING(obd)                             \
 do {                                                            \
-        if (!(conn)) {                                          \
-                CERROR("NULL connection\n");                    \
-                RETURN(-EINVAL);                                \
+        if (!(obd)) {                                           \
+                CERROR("NULL device\n");                        \
+                RETURN(-ENODEV);                                \
         }                                                       \
                                                                 \
-        exp = class_conn2export(conn);                          \
-        if (!(exp)) {                                           \
-                CERROR("No export for conn "LPX64":"LPX64"\n",  \
-                       conn->addr, conn->cookie);               \
-                RETURN(-EINVAL);                                \
+        if (!(obd)->obd_set_up) {                               \
+                CERROR("Device %d not setup\n",                 \
+                       (obd)->obd_minor);                       \
+                RETURN(-ENODEV);                                \
         }                                                       \
                                                                 \
-        if (!((exp)->exp_obd->obd_flags & OBD_SET_UP)) {        \
-                CERROR("Device %d not setup\n",                 \
-                       (exp)->exp_obd->obd_minor);              \
-                RETURN(-EINVAL);                                \
+        if (!(obd)->obd_stopping) {                             \
+                CERROR("Device %d not stopping\n",              \
+                       (obd)->obd_minor);                       \
+                RETURN(-ENODEV);                                \
         }                                                       \
 } while (0)
 
-#define OBD_CHECK_DEVSETUP(obd)                                 \
+/* ensure obd_setup and !obd_stopping */
+#define OBD_CHECK_DEV_ACTIVE(obd)                               \
 do {                                                            \
         if (!(obd)) {                                           \
                 CERROR("NULL device\n");                        \
-                RETURN(-EINVAL);                                \
+                RETURN(-ENODEV);                                \
         }                                                       \
                                                                 \
-        if (!((obd)->obd_flags & OBD_SET_UP)) {                 \
+        if (!(obd)->obd_set_up || (obd)->obd_stopping) {        \
                 CERROR("Device %d not setup\n",                 \
                        (obd)->obd_minor);                       \
-                RETURN(-EINVAL);                                \
+                RETURN(-ENODEV);                                \
         }                                                       \
 } while (0)
 
+
+#ifdef LPROCFS
+#define OBD_COUNTER_OFFSET(op)                                  \
+        ((offsetof(struct obd_ops, o_ ## op) -                  \
+          offsetof(struct obd_ops, o_iocontrol))                \
+         / sizeof(((struct obd_ops *)(0))->o_iocontrol))
+
+#define OBD_COUNTER_INCREMENT(obd, op)                           \
+        if ((obd)->counters != NULL) {                           \
+            struct lprocfs_counters* cntrs = obd->counters;      \
+            unsigned int coffset;                                \
+            coffset = (obd)->cntr_base + OBD_COUNTER_OFFSET(op); \
+            LASSERT(coffset < cntrs->num);                       \
+            LPROCFS_COUNTER_INCBY1(&cntrs->cntr[coffset]);       \
+        }
+#else
+#define OBD_COUNTER_OFFSET(op) 
+#define OBD_COUNTER_INCREMENT(obd, op)           
+#endif
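Worked values for the offset macro, assuming o_iocontrol is the first profiled member of struct obd_ops as declared in obd.h:

/* OBD_COUNTER_OFFSET(iocontrol) == 0
 * OBD_COUNTER_OFFSET(get_info)  == 1
 * OBD_COUNTER_OFFSET(set_info)  == 2
 * ... each subsequent function pointer in struct obd_ops takes the next
 * slot, which is why lprocfs_alloc_obd_counters() must stay in sync. */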
+
 #define OBD_CHECK_OP(obd, op)                                   \
 do {                                                            \
         if (!OBP((obd), op)) {                                  \
@@ -145,19 +230,21 @@ do {                                                            \
                        obd->obd_minor);                         \
                 RETURN(-EOPNOTSUPP);                            \
         }                                                       \
+        OBD_COUNTER_INCREMENT(obd, op);                         \
 } while (0)
 
-static inline int obd_get_info(struct lustre_handle *conn, obd_count keylen,
-                               void *key, obd_count *vallen, void **val)
+static inline int obd_get_info(struct lustre_handle *conn, __u32 keylen,
+                               void *key, __u32 *vallen, void *val)
 {
         struct obd_export *exp;
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, get_info);
 
         rc = OBP(exp->exp_obd, get_info)(conn, keylen, key, vallen, val);
+        class_export_put(exp);
         RETURN(rc);
 }
 
@@ -168,10 +255,11 @@ static inline int obd_set_info(struct lustre_handle *conn, obd_count keylen,
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, set_info);
 
         rc = OBP(exp->exp_obd, set_info)(conn, keylen, key, vallen, val);
+        class_export_put(exp);
         RETURN(rc);
 }
 
@@ -186,85 +274,93 @@ static inline int obd_setup(struct obd_device *obd, int datalen, void *data)
         RETURN(rc);
 }
 
-static inline int obd_cleanup(struct obd_device *obd)
+static inline int obd_cleanup(struct obd_device *obd, int force, int failover)
 {
         int rc;
         ENTRY;
 
-        OBD_CHECK_DEVSETUP(obd);
+        OBD_CHECK_DEV_STOPPING(obd);
         OBD_CHECK_OP(obd, cleanup);
 
-        rc = OBP(obd, cleanup)(obd);
+        rc = OBP(obd, cleanup)(obd, force, failover);
         RETURN(rc);
 }
 
-/* Pack an in-memory MD struct for sending to the MDS and/or disk.
+/* Pack an in-memory MD struct for storage on disk.
  * Returns +ve size of packed MD (0 for free), or -ve error.
  *
- * If @wire_tgt == NULL, MD size is returned (max size if @mem_src == NULL).
- * If @*wire_tgt != NULL and @mem_src == NULL, @*wire_tgt will be freed.
- * If @*wire_tgt == NULL, it will be allocated
+ * If @disk_tgt == NULL, MD size is returned (max size if @mem_src == NULL).
+ * If @*disk_tgt != NULL and @mem_src == NULL, @*disk_tgt will be freed.
+ * If @*disk_tgt == NULL, it will be allocated
  */
 static inline int obd_packmd(struct lustre_handle *conn,
-                             struct lov_mds_md **wire_tgt,
+                             struct lov_mds_md **disk_tgt,
                              struct lov_stripe_md *mem_src)
 {
         struct obd_export *exp;
+        int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, packmd);
 
-        RETURN(OBP(exp->exp_obd, packmd)(conn, wire_tgt, mem_src));
+        rc = OBP(exp->exp_obd, packmd)(conn, disk_tgt, mem_src);
+        class_export_put(exp);
+        RETURN(rc);
 }
 
-static inline int obd_size_wiremd(struct lustre_handle *conn,
+static inline int obd_size_diskmd(struct lustre_handle *conn,
                                   struct lov_stripe_md *mem_src)
 {
         return obd_packmd(conn, NULL, mem_src);
 }
 
 /* helper functions */
-static inline int obd_alloc_wiremd(struct lustre_handle *conn,
-                                   struct lov_mds_md **wire_tgt)
+static inline int obd_alloc_diskmd(struct lustre_handle *conn,
+                                   struct lov_mds_md **disk_tgt)
 {
-        LASSERT(wire_tgt);
-        LASSERT(*wire_tgt == NULL);
-        return obd_packmd(conn, wire_tgt, NULL);
+        LASSERT(disk_tgt);
+        LASSERT(*disk_tgt == NULL);
+        return obd_packmd(conn, disk_tgt, NULL);
 }
 
-static inline int obd_free_wiremd(struct lustre_handle *conn,
-                                  struct lov_mds_md **wire_tgt)
+static inline int obd_free_diskmd(struct lustre_handle *conn,
+                                  struct lov_mds_md **disk_tgt)
 {
-        LASSERT(wire_tgt);
-        LASSERT(*wire_tgt);
-        return obd_packmd(conn, wire_tgt, NULL);
+        LASSERT(disk_tgt);
+        LASSERT(*disk_tgt);
+        return obd_packmd(conn, disk_tgt, NULL);
 }
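A sketch of the pack-then-free cycle implied by the NULL-argument conventions documented above:

static int example_pack_lsm(struct lustre_handle *conn,
                            struct lov_stripe_md *lsm)
{
        struct lov_mds_md *lmm = NULL;
        int lmm_size;

        lmm_size = obd_packmd(conn, &lmm, lsm);  /* allocates and packs *lmm */
        if (lmm_size < 0)
                return lmm_size;

        /* ... write lmm_size bytes of *lmm to disk here ... */

        obd_free_diskmd(conn, &lmm);             /* mem_src == NULL: frees *lmm */
        return 0;
}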
 
-/* Unpack an MD struct from the MDS and/or disk to in-memory format.
+/* Unpack an MD struct from disk to in-memory format.
  * Returns +ve size of unpacked MD (0 for free), or -ve error.
  *
- * If @mem_tgt == NULL, MD size is returned (max size if @wire_src == NULL).
- * If @*mem_tgt != NULL and @wire_src == NULL, @*mem_tgt will be freed.
+ * If @mem_tgt == NULL, MD size is returned (max size if @disk_src == NULL).
+ * If @*mem_tgt != NULL and @disk_src == NULL, @*mem_tgt will be freed.
  * If @*mem_tgt == NULL, it will be allocated
  */
 static inline int obd_unpackmd(struct lustre_handle *conn,
                                struct lov_stripe_md **mem_tgt,
-                               struct lov_mds_md *wire_src)
+                               struct lov_mds_md *disk_src,
+                               int disk_len)
 {
         struct obd_export *exp;
+        int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, unpackmd);
 
-        RETURN(OBP(exp->exp_obd, unpackmd)(conn, mem_tgt, wire_src));
+        rc = OBP(exp->exp_obd, unpackmd)(conn, mem_tgt, disk_src, disk_len);
+        class_export_put(exp);
+        RETURN(rc);
 }
 
 static inline int obd_size_memmd(struct lustre_handle *conn,
-                                 struct lov_mds_md *wire_src)
+                                 struct lov_mds_md *disk_src,
+                                 int disk_len)
 {
-        return obd_unpackmd(conn, NULL, wire_src);
+        return obd_unpackmd(conn, NULL, disk_src, disk_len);
 }
 
 /* helper functions */
@@ -273,7 +369,7 @@ static inline int obd_alloc_memmd(struct lustre_handle *conn,
 {
         LASSERT(mem_tgt);
         LASSERT(*mem_tgt == NULL);
-        return obd_unpackmd(conn, mem_tgt, NULL);
+        return obd_unpackmd(conn, mem_tgt, NULL, 0);
 }
 
 static inline int obd_free_memmd(struct lustre_handle *conn,
@@ -281,7 +377,7 @@ static inline int obd_free_memmd(struct lustre_handle *conn,
 {
         LASSERT(mem_tgt);
         LASSERT(*mem_tgt);
-        return obd_unpackmd(conn, mem_tgt, NULL);
+        return obd_unpackmd(conn, mem_tgt, NULL, 0);
 }
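The mirror-image sketch for the unpack direction, using the same conventions:

static int example_unpack_lsm(struct lustre_handle *conn,
                              struct lov_mds_md *lmm, int lmm_size)
{
        struct lov_stripe_md *lsm = NULL;
        int rc;

        rc = obd_unpackmd(conn, &lsm, lmm, lmm_size);  /* allocates *lsm */
        if (rc < 0)
                return rc;

        /* ... use lsm here ... */

        obd_free_memmd(conn, &lsm);                    /* frees *lsm */
        return 0;
}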
 
 static inline int obd_create(struct lustre_handle *conn, struct obdo *obdo,
@@ -292,10 +388,11 @@ static inline int obd_create(struct lustre_handle *conn, struct obdo *obdo,
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, create);
 
         rc = OBP(exp->exp_obd, create)(conn, obdo, ea, oti);
+        class_export_put(exp);
         RETURN(rc);
 }
 
@@ -307,10 +404,11 @@ static inline int obd_destroy(struct lustre_handle *conn, struct obdo *obdo,
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, destroy);
 
         rc = OBP(exp->exp_obd, destroy)(conn, obdo, ea, oti);
+        class_export_put(exp);
         RETURN(rc);
 }
 
@@ -321,10 +419,27 @@ static inline int obd_getattr(struct lustre_handle *conn, struct obdo *obdo,
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, getattr);
 
         rc = OBP(exp->exp_obd, getattr)(conn, obdo, ea);
+        class_export_put(exp);
+        RETURN(rc);
+}
+
+static inline int obd_getattr_async(struct lustre_handle *conn, struct obdo *obdo,
+                                    struct lov_stripe_md *ea, 
+                                    struct ptlrpc_request_set *set)
+{
+        struct obd_export *exp;
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_OP(exp->exp_obd, getattr_async);
+
+        rc = OBP(exp->exp_obd, getattr_async)(conn, obdo, ea, set);
+        class_export_put(exp);
         RETURN(rc);
 }
 
@@ -336,24 +451,27 @@ static inline int obd_close(struct lustre_handle *conn, struct obdo *obdo,
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, close);
 
         rc = OBP(exp->exp_obd, close)(conn, obdo, ea, oti);
+        class_export_put(exp);
         RETURN(rc);
 }
 
 static inline int obd_open(struct lustre_handle *conn, struct obdo *obdo,
-                           struct lov_stripe_md *ea, struct obd_trans_info *oti)
+                           struct lov_stripe_md *ea, struct obd_trans_info *oti,
+                           struct obd_client_handle *och)
 {
         struct obd_export *exp;
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, open);
 
-        rc = OBP(exp->exp_obd, open)(conn, obdo, ea, oti);
+        rc = OBP(exp->exp_obd, open)(conn, obdo, ea, oti, och);
+        class_export_put(exp);
         RETURN(rc);
 }
 
@@ -365,29 +483,28 @@ static inline int obd_setattr(struct lustre_handle *conn, struct obdo *obdo,
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, setattr);
 
         rc = OBP(exp->exp_obd, setattr)(conn, obdo, ea, oti);
+        class_export_put(exp);
         RETURN(rc);
 }
 
 static inline int obd_connect(struct lustre_handle *conn,
-                              struct obd_device *obd, struct obd_uuid *cluuid,
-                              struct recovd_obd *recovd,
-                              ptlrpc_recovery_cb_t recover)
+                              struct obd_device *obd, struct obd_uuid *cluuid)
 {
         int rc;
         ENTRY;
 
-        OBD_CHECK_DEVSETUP(obd);
+        OBD_CHECK_DEV_ACTIVE(obd);
         OBD_CHECK_OP(obd, connect);
 
-        rc = OBP(obd, connect)(conn, obd, cluuid, recovd, recover);
+        rc = OBP(obd, connect)(conn, obd, cluuid);
         RETURN(rc);
 }
 
-static inline int obd_disconnect(struct lustre_handle *conn)
+static inline int obd_disconnect(struct lustre_handle *conn, int failover)
 {
         struct obd_export *exp;
         int rc;
@@ -396,33 +513,41 @@ static inline int obd_disconnect(struct lustre_handle *conn)
         OBD_CHECK_SETUP(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, disconnect);
 
-        rc = OBP(exp->exp_obd, disconnect)(conn);
+        rc = OBP(exp->exp_obd, disconnect)(conn, failover);
+        class_export_put(exp);
         RETURN(rc);
 }
 
+static inline void obd_destroy_export(struct obd_export *exp)
+{
+        ENTRY;
+        if (OBP(exp->exp_obd, destroy_export))
+                OBP(exp->exp_obd, destroy_export)(exp);
+        EXIT;
+}
+
 static inline int obd_statfs(struct lustre_handle *conn,struct obd_statfs *osfs)
 {
         struct obd_export *exp;
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, statfs);
 
         rc = OBP(exp->exp_obd, statfs)(conn, osfs);
+        class_export_put(exp);
         RETURN(rc);
 }
 
-static inline int obd_syncfs(struct lustre_handle *conn)
+static inline int obd_syncfs(struct obd_export *exp)
 {
-        struct obd_export *exp;
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, syncfs);
 
-        rc = OBP(exp->exp_obd, syncfs)(conn);
+        rc = OBP(exp->exp_obd, syncfs)(exp);
         RETURN(rc);
 }
 
@@ -434,65 +559,86 @@ static inline int obd_punch(struct lustre_handle *conn, struct obdo *oa,
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, punch);
 
         rc = OBP(exp->exp_obd, punch)(conn, oa, ea, start, end, oti);
+        class_export_put(exp);
         RETURN(rc);
 }
 
 static inline int obd_brw(int cmd, struct lustre_handle *conn,
                           struct lov_stripe_md *ea, obd_count oa_bufs,
-                          struct brw_page *pg, struct obd_brw_set *set,
-                          struct obd_trans_info *oti)
+                          struct brw_page *pg, struct obd_trans_info *oti)
 {
         struct obd_export *exp;
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, brw);
 
+        if (!(cmd & (OBD_BRW_RWMASK | OBD_BRW_CHECK))) {
+                CERROR("obd_brw: cmd must be OBD_BRW_READ, OBD_BRW_WRITE, "
+                       "or OBD_BRW_CHECK\n");
+                LBUG();
+        }
+
+        rc = OBP(exp->exp_obd, brw)(cmd, conn, ea, oa_bufs, pg, oti);
+        class_export_put(exp);
+        RETURN(rc);
+}
+
+static inline int obd_brw_async(int cmd, struct lustre_handle *conn,
+                                struct lov_stripe_md *ea, obd_count oa_bufs,
+                                struct brw_page *pg,
+                                struct ptlrpc_request_set *set,
+                                struct obd_trans_info *oti)
+{
+        struct obd_export *exp;
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_ACTIVE(conn, exp);
+        OBD_CHECK_OP(exp->exp_obd, brw_async);
+
         if (!(cmd & OBD_BRW_RWMASK)) {
                 CERROR("obd_brw: cmd must be OBD_BRW_READ or OBD_BRW_WRITE\n");
                 LBUG();
         }
 
-        rc = OBP(exp->exp_obd, brw)(cmd, conn, ea, oa_bufs, pg, set, oti);
+        rc = OBP(exp->exp_obd, brw_async)(cmd, conn, ea, oa_bufs, pg, set, oti);
+        class_export_put(exp);
         RETURN(rc);
 }
 
-static inline int obd_preprw(int cmd, struct lustre_handle *conn,
+static inline int obd_preprw(int cmd, struct obd_export *exp,
                              int objcount, struct obd_ioobj *obj,
                              int niocount, struct niobuf_remote *remote,
                              struct niobuf_local *local, void **desc_private,
                              struct obd_trans_info *oti)
 {
-        struct obd_export *exp;
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, preprw);
 
-        rc = OBP(exp->exp_obd, preprw)(cmd, conn, objcount, obj, niocount,
+        rc = OBP(exp->exp_obd, preprw)(cmd, exp, objcount, obj, niocount,
                                        remote, local, desc_private, oti);
         RETURN(rc);
 }
 
-static inline int obd_commitrw(int cmd, struct lustre_handle *conn,
+static inline int obd_commitrw(int cmd, struct obd_export *exp,
                                int objcount, struct obd_ioobj *obj,
                                int niocount, struct niobuf_local *local,
                                void *desc_private, struct obd_trans_info *oti)
 {
-        struct obd_export *exp;
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, commitrw);
 
-        rc = OBP(exp->exp_obd, commitrw)(cmd, conn, objcount, obj, niocount,
+        rc = OBP(exp->exp_obd, commitrw)(cmd, exp, objcount, obj, niocount,
                                          local, desc_private, oti);
         RETURN(rc);
 }
@@ -504,10 +650,11 @@ static inline int obd_iocontrol(unsigned int cmd, struct lustre_handle *conn,
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, iocontrol);
 
         rc = OBP(exp->exp_obd, iocontrol)(cmd, conn, len, karg, uarg);
+        class_export_put(exp);
         RETURN(rc);
 }
 
@@ -522,15 +669,36 @@ static inline int obd_enqueue(struct lustre_handle *conn,
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, enqueue);
 
         rc = OBP(exp->exp_obd, enqueue)(conn, ea, parent_lock, type,
                                         cookie, cookielen, mode, flags, cb,
                                         data, datalen, lockh);
+        class_export_put(exp);
+        RETURN(rc);
+}
+
+static inline int obd_match(struct lustre_handle *conn,
+                              struct lov_stripe_md *ea,
+                              __u32 type, void *cookie, int cookielen,
+                              __u32 mode, int *flags, 
+                              struct lustre_handle *lockh)
+{
+        struct obd_export *exp;
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_ACTIVE(conn, exp);
+        OBD_CHECK_OP(exp->exp_obd, match);
+
+        rc = OBP(exp->exp_obd, match)(conn, ea, type, cookie, cookielen, mode,
+                                      flags, lockh);
+        class_export_put(exp);
         RETURN(rc);
 }
 
+
 static inline int obd_cancel(struct lustre_handle *conn,
                              struct lov_stripe_md *ea, __u32 mode,
                              struct lustre_handle *lockh)
@@ -539,24 +707,27 @@ static inline int obd_cancel(struct lustre_handle *conn,
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, cancel);
 
         rc = OBP(exp->exp_obd, cancel)(conn, ea, mode, lockh);
+        class_export_put(exp);
         RETURN(rc);
 }
 
 static inline int obd_cancel_unused(struct lustre_handle *conn,
-                                    struct lov_stripe_md *ea, int local)
+                                    struct lov_stripe_md *ea, int flags,
+                                    void *opaque)
 {
         struct obd_export *exp;
         int rc;
         ENTRY;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, cancel_unused);
 
-        rc = OBP(exp->exp_obd, cancel_unused)(conn, ea, local);
+        rc = OBP(exp->exp_obd, cancel_unused)(conn, ea, flags, opaque);
+        class_export_put(exp);
         RETURN(rc);
 }
 
@@ -567,11 +738,12 @@ static inline int obd_san_preprw(int cmd, struct lustre_handle *conn,
         struct obd_export *exp;
         int rc;
 
-        OBD_CHECK_SETUP(conn, exp);
+        OBD_CHECK_ACTIVE(conn, exp);
         OBD_CHECK_OP(exp->exp_obd, preprw);
 
         rc = OBP(exp->exp_obd, san_preprw)(cmd, conn, objcount, obj,
                                            niocount, remote);
+        class_export_put(exp);
         RETURN(rc);
 }
 
@@ -607,39 +779,28 @@ static inline void obdo_free(struct obdo *oa)
         kmem_cache_free(obdo_cachep, oa);
 }
 
+#if !defined(__KERNEL__) || (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#define to_kdev_t(dev) dev
+#define kdev_t_to_nr(dev) dev
+#endif
+
 #ifdef __KERNEL__
 static inline void obdo_from_iattr(struct obdo *oa, struct iattr *attr)
 {
         unsigned int ia_valid = attr->ia_valid;
 
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        if (ia_valid & ATTR_ATIME) {
-                oa->o_atime = attr->ia_atime;
-                oa->o_valid |= OBD_MD_FLATIME;
-        }
-        if (ia_valid & ATTR_MTIME) {
-                oa->o_mtime = attr->ia_mtime;
-                oa->o_valid |= OBD_MD_FLMTIME;
-        }
-        if (ia_valid & ATTR_CTIME) {
-                oa->o_ctime = attr->ia_ctime;
-                oa->o_valid |= OBD_MD_FLCTIME;
-        }
-#else
         if (ia_valid & ATTR_ATIME) {
-                oa->o_atime = attr->ia_atime.tv_sec;
+                oa->o_atime = LTIME_S(attr->ia_atime);
                 oa->o_valid |= OBD_MD_FLATIME;
         }
         if (ia_valid & ATTR_MTIME) {
-                oa->o_mtime = attr->ia_mtime.tv_sec;
+                oa->o_mtime = LTIME_S(attr->ia_mtime);
                 oa->o_valid |= OBD_MD_FLMTIME;
         }
         if (ia_valid & ATTR_CTIME) {
-                oa->o_ctime = attr->ia_ctime.tv_sec;
+                oa->o_ctime = LTIME_S(attr->ia_ctime);
                 oa->o_valid |= OBD_MD_FLCTIME;
         }
-#endif
-
         if (ia_valid & ATTR_SIZE) {
                 oa->o_size = attr->ia_size;
                 oa->o_valid |= OBD_MD_FLSIZE;
@@ -665,33 +826,18 @@ static inline void iattr_from_obdo(struct iattr *attr, struct obdo *oa,
                                    obd_flag valid)
 {
         memset(attr, 0, sizeof(*attr));
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        if (valid & OBD_MD_FLATIME) {
-                attr->ia_atime = oa->o_atime;
-                attr->ia_valid |= ATTR_ATIME;
-        }
-        if (valid & OBD_MD_FLMTIME) {
-                attr->ia_mtime = oa->o_mtime;
-                attr->ia_valid |= ATTR_MTIME;
-        }
-        if (valid & OBD_MD_FLCTIME) {
-                attr->ia_ctime = oa->o_ctime;
-                attr->ia_valid |= ATTR_CTIME;
-        }
-#else
         if (valid & OBD_MD_FLATIME) {
-                attr->ia_atime.tv_sec = oa->o_atime;
+                LTIME_S(attr->ia_atime) = oa->o_atime;
                 attr->ia_valid |= ATTR_ATIME;
         }
         if (valid & OBD_MD_FLMTIME) {
-                attr->ia_mtime.tv_sec = oa->o_mtime;
+                LTIME_S(attr->ia_mtime) = oa->o_mtime;
                 attr->ia_valid |= ATTR_MTIME;
         }
         if (valid & OBD_MD_FLCTIME) {
-                attr->ia_ctime.tv_sec = oa->o_ctime;
+                LTIME_S(attr->ia_ctime) = oa->o_ctime;
                 attr->ia_valid |= ATTR_CTIME;
         }
-#endif
         if (valid & OBD_MD_FLSIZE) {
                 attr->ia_size = oa->o_size;
                 attr->ia_valid |= ATTR_SIZE;
@@ -721,29 +867,16 @@ static inline void iattr_from_obdo(struct iattr *attr, struct obdo *oa,
 /* WARNING: the file systems must take care not to tinker with
    attributes they don't manage (such as blocks). */
 
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-#define to_kdev_t(dev) dev
-#define kdev_t_to_nr(dev) dev
-#endif
 
 static inline void obdo_from_inode(struct obdo *dst, struct inode *src,
                                    obd_flag valid)
 {
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        if (valid & OBD_MD_FLATIME)
-                dst->o_atime = src->i_atime;
-        if (valid & OBD_MD_FLMTIME)
-                dst->o_mtime = src->i_mtime;
-        if (valid & OBD_MD_FLCTIME)
-                dst->o_ctime = src->i_ctime;
-#else
         if (valid & OBD_MD_FLATIME)
-                dst->o_atime = src->i_atime.tv_sec;
+                dst->o_atime = LTIME_S(src->i_atime);
         if (valid & OBD_MD_FLMTIME)
-                dst->o_mtime = src->i_mtime.tv_sec;
+                dst->o_mtime = LTIME_S(src->i_mtime);
         if (valid & OBD_MD_FLCTIME)
-                dst->o_ctime = src->i_ctime.tv_sec;
-#endif
+                dst->o_ctime = LTIME_S(src->i_ctime);
         if (valid & OBD_MD_FLSIZE)
                 dst->o_size = src->i_size;
         if (valid & OBD_MD_FLBLOCKS)   /* allocation of space */
@@ -775,21 +908,12 @@ static inline void obdo_refresh_inode(struct inode *dst, struct obdo *src,
 {
         valid &= src->o_valid;
 
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        if (valid & OBD_MD_FLATIME && src->o_atime > dst->i_atime)
-                dst->i_atime = src->o_atime;
-        if (valid & OBD_MD_FLMTIME && src->o_mtime > dst->i_mtime)
-                dst->i_mtime = src->o_mtime;
-        if (valid & OBD_MD_FLCTIME && src->o_ctime > dst->i_ctime)
-                dst->i_ctime = src->o_ctime;
-#else
-        if (valid & OBD_MD_FLATIME && src->o_atime > dst->i_atime.tv_sec)
-                dst->i_atime.tv_sec = src->o_atime;
-        if (valid & OBD_MD_FLMTIME && src->o_mtime > dst->i_mtime.tv_sec)
-                dst->i_mtime.tv_sec = src->o_mtime;
-        if (valid & OBD_MD_FLCTIME && src->o_ctime > dst->i_ctime.tv_sec)
-                dst->i_ctime.tv_sec = src->o_ctime;
-#endif
+        if (valid & OBD_MD_FLATIME && src->o_atime > LTIME_S(dst->i_atime))
+                LTIME_S(dst->i_atime) = src->o_atime;
+        if (valid & OBD_MD_FLMTIME && src->o_mtime > LTIME_S(dst->i_mtime))
+                LTIME_S(dst->i_mtime) = src->o_mtime;
+        if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime))
+                LTIME_S(dst->i_ctime) = src->o_ctime;
         if (valid & OBD_MD_FLSIZE && src->o_size > dst->i_size)
                 dst->i_size = src->o_size;
         /* allocation of space */
@@ -802,21 +926,12 @@ static inline void obdo_to_inode(struct inode *dst, struct obdo *src,
 {
         valid &= src->o_valid;
 
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
         if (valid & OBD_MD_FLATIME)
-                dst->i_atime = src->o_atime;
+                LTIME_S(dst->i_atime) = src->o_atime;
         if (valid & OBD_MD_FLMTIME)
-                dst->i_mtime = src->o_mtime;
-        if (valid & OBD_MD_FLCTIME && src->o_ctime > dst->i_ctime)
-                dst->i_ctime = src->o_ctime;
-#else
-        if (valid & OBD_MD_FLATIME)
-                dst->i_atime.tv_sec = src->o_atime;
-        if (valid & OBD_MD_FLMTIME)
-                dst->i_mtime.tv_sec = src->o_mtime;
-        if (valid & OBD_MD_FLCTIME && src->o_ctime > dst->i_ctime.tv_sec)
-                dst->i_ctime.tv_sec = src->o_ctime;
-#endif
+                LTIME_S(dst->i_mtime) = src->o_mtime;
+        if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime))
+                LTIME_S(dst->i_ctime) = src->o_ctime;
         if (valid & OBD_MD_FLSIZE)
                 dst->i_size = src->o_size;
         if (valid & OBD_MD_FLBLOCKS) /* allocation of space */
@@ -931,49 +1046,17 @@ static inline int obdo_cmp_md(struct obdo *dst, struct obdo *src,
         return res;
 }
 
-
 /* I'm as embarrassed about this as you are.
  *
  * <shaver> // XXX do not look into _superhack with remaining eye
  * <shaver> // XXX if this were any uglier, I'd get my own show on MTV */
 extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
-extern void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp,
-                                               int dying_import);
-
-int class_register_type(struct obd_ops *ops, struct lprocfs_vars* vars,
-                        char *nm);
-int class_unregister_type(char *nm);
-int class_name2dev(char *name);
-int class_uuid2dev(struct obd_uuid *uuid);
-struct obd_device *class_uuid2obd(struct obd_uuid *uuid);
-struct obd_export *class_new_export(struct obd_device *obddev);
-struct obd_type *class_get_type(char *name);
-void class_put_type(struct obd_type *type);
-void class_destroy_export(struct obd_export *exp);
-int class_connect(struct lustre_handle *conn, struct obd_device *obd,
-                  struct obd_uuid *cluuid);
-int class_disconnect(struct lustre_handle *conn);
-void class_disconnect_all(struct obd_device *obddev);
-
-/* generic operations shared by various OBD types */
-int class_multi_setup(struct obd_device *obddev, uint32_t len, void *data);
-int class_multi_cleanup(struct obd_device *obddev);
-
-extern void (*class_signal_connection_failure)(struct ptlrpc_connection *);
-
-static inline struct ptlrpc_connection *class_rd2conn(struct recovd_data *rd)
-{
-        /* reuse list_entry's member-pointer offset stuff */
-        return list_entry(rd, struct ptlrpc_connection, c_recovd_data);
-}
+extern void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp);
 
 struct obd_statfs;
 struct statfs;
 void statfs_pack(struct obd_statfs *osfs, struct statfs *sfs);
 void statfs_unpack(struct statfs *sfs, struct obd_statfs *osfs);
-void obd_statfs_pack(struct obd_statfs *tgt, struct obd_statfs *src);
-void obd_statfs_unpack(struct obd_statfs *tgt, struct obd_statfs *src);
-
 
 struct obd_class_user_state {
         struct obd_device     *ocus_current_obd;
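The wrappers changed above all follow one pattern: OBD_CHECK_ACTIVE() resolves
the lustre_handle into a pinned obd_export, OBD_CHECK_OP() verifies that the
target device actually implements the method, the call is dispatched through
OBP(), and class_export_put() drops the export reference before returning.
A minimal sketch of that shape, with an illustrative method name; the bodies
of OBD_CHECK_ACTIVE/OBD_CHECK_OP are assumed here, since they are not part of
these hunks:

    /* sketch only: mirrors the obd_setattr()/obd_statfs() wrappers above */
    static inline int obd_getattr_sketch(struct lustre_handle *conn,
                                         struct obdo *obdo,
                                         struct lov_stripe_md *ea)
    {
            struct obd_export *exp;
            int rc;
            ENTRY;

            OBD_CHECK_ACTIVE(conn, exp);         /* handle -> pinned export */
            OBD_CHECK_OP(exp->exp_obd, getattr); /* method implemented? */

            rc = OBP(exp->exp_obd, getattr)(conn, obdo, ea);
            class_export_put(exp);               /* drop the pin taken above */
            RETURN(rc);
    }

Note that obd_preprw()/obd_commitrw() now take the obd_export directly and skip
both the lookup and the put, so holding the export across the I/O is the
caller's responsibility there.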
index 273779a..c344d8a 100644
@@ -1,41 +1,42 @@
-#ifndef _OBD_ECHO_H
-#define _OBD_ECHO_H
-/*
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
  * Copyright (C) 2001  Cluster File Systems, Inc.
  *
  * This code is issued under the GNU General Public License.
  * See the file COPYING in this distribution
  */
 
+#ifndef _OBD_ECHO_H
+#define _OBD_ECHO_H
+
 #define OBD_ECHO_DEVICENAME "obdecho"
 #define OBD_ECHO_CLIENT_DEVICENAME "echo_client"
 
-struct ec_object
-{
-       struct list_head       eco_obj_chain;
-       struct obd_device     *eco_device;
-       int                    eco_refcount;
-       int                    eco_deleted;
-       obd_id                 eco_id;
-       struct lov_stripe_md  *eco_lsm;
+struct ec_object {
+        struct list_head       eco_obj_chain;
+        struct obd_device     *eco_device;
+        int                    eco_refcount;
+        int                    eco_deleted;
+        obd_id                 eco_id;
+        struct lov_stripe_md  *eco_lsm;
 };
 
-struct ec_open_object
-{
-       struct list_head       ecoo_exp_chain;
-       struct ec_object      *ecoo_object;
-       struct obdo            ecoo_oa;
-        __u64                  ecoo_cookie;
+struct ec_open_object {
+        struct list_head         ecoo_exp_chain;
+        struct ec_object        *ecoo_object;
+        __u64                    ecoo_cookie;
+        struct obdo              ecoo_oa;
+        struct obd_client_handle ecoo_och;
 };
 
-struct ec_lock
-{
-       struct list_head       ecl_exp_chain;
-       struct lustre_handle   ecl_handle;
-       struct ldlm_extent     ecl_extent;
-       __u32                  ecl_mode;
-       struct ec_object      *ecl_object;
-       __u64                  ecl_cookie;
+struct ec_lock {
+        struct list_head       ecl_exp_chain;
+        struct ec_object      *ecl_object;
+        __u64                  ecl_cookie;
+        struct lustre_handle   ecl_lock_handle;
+        struct ldlm_extent     ecl_extent;
+        __u32                  ecl_mode;
 };
 
 #endif
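Compared with the old layout, struct ec_open_object above now carries a
struct obd_client_handle (ecoo_och), matching the extra och argument added to
the open method at the top of this diff. A hedged sketch of how the echo
client could pass it through; the obd_open() prototype is assumed to mirror
the OBP(..., open)(conn, obdo, ea, oti, och) dispatch shown earlier, and the
helper name is illustrative:

    /* sketch only: open an echo object, keeping the client handle in ecoo */
    static int echo_open_sketch(struct lustre_handle *conn,
                                struct ec_open_object *ecoo,
                                struct lov_stripe_md *lsm,
                                struct obd_trans_info *oti)
    {
            return obd_open(conn, &ecoo->ecoo_oa, lsm, oti, &ecoo->ecoo_och);
    }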
index 26850d8..74bb784 100644
 #ifndef _OBD_FILTER_H
 #define _OBD_FILTER_H
 
+#ifdef __KERNEL__
+#include <linux/spinlock.h>
+#endif
+#include <linux/lustre_handles.h>
+
 #ifndef OBD_FILTER_DEVICENAME
 #define OBD_FILTER_DEVICENAME "obdfilter"
 #endif
@@ -79,9 +84,10 @@ struct filter_export_data {
 
 /* file data for open files on OST */
 struct filter_file_data {
-        struct list_head  ffd_export_list;  /* export open list - fed_lock */
-        struct file      *ffd_file;         /* file handle */
-        __u64             ffd_servercookie; /* cookie for lustre handle */
+        struct portals_handle ffd_handle;
+        atomic_t              ffd_refcount;
+        struct list_head      ffd_export_list; /* export open list - fed_lock */
+        struct file          *ffd_file;         /* file handle */
 };
 
 struct filter_dentry_data {
index ff3e689..b12a062 100644
@@ -7,8 +7,16 @@
 
 #define OBD_LOV_DEVICENAME "lov"
 
-void lov_unpackdesc(struct lov_desc *ld);
-void lov_packdesc(struct lov_desc *ld);
+struct lov_brw_async_args {
+        obd_count        aa_oa_bufs;
+        struct brw_page *aa_ioarr;
+};
+
+struct lov_getattr_async_args {
+        struct lov_stripe_md  *aa_lsm;
+        struct obdo           *aa_oa;
+        struct obdo           *aa_stripe_oas;
+};
 
 static inline int lov_stripe_md_size(int stripes)
 {
@@ -20,6 +28,15 @@ static inline int lov_mds_md_size(int stripes)
         return sizeof(struct lov_mds_md) + stripes*sizeof(struct lov_object_id);
 }
 
+extern int lov_packmd(struct lustre_handle *conn, struct lov_mds_md **lmm,
+                       struct lov_stripe_md *lsm);
+extern int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsm,
+                         struct lov_mds_md *lmm, int lmmsize);
+extern int lov_setstripe(struct lustre_handle *conn,
+                         struct lov_stripe_md **lsmp, struct lov_mds_md *lmmu);
+extern int lov_getstripe(struct lustre_handle *conn, 
+                         struct lov_stripe_md *lsm, struct lov_mds_md *lmmu);
+
 #define IOC_LOV_TYPE                   'g'
 #define IOC_LOV_MIN_NR                 50
 #define IOC_LOV_SET_OSC_ACTIVE         _IOWR('g', 50, long)
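lov_mds_md_size() above sizes the wire-format stripe metadata as a fixed
lov_mds_md header plus one lov_object_id per stripe, with lov_stripe_md_size()
playing the same role for the in-memory form. A hedged sketch of the usual
sizing/allocation step, using OBD_ALLOC from obd_support.h; only the size
arithmetic comes from this hunk, the helper itself is illustrative:

    /* sketch only: allocate a wire-format LOV EA big enough for `stripes' */
    static inline struct lov_mds_md *lov_alloc_lmm_sketch(int stripes)
    {
            struct lov_mds_md *lmm;
            int size = lov_mds_md_size(stripes); /* header + per-stripe ids */

            OBD_ALLOC(lmm, size);                /* NULL on failure */
            return lmm;
    }

The same size is what lov_packmd()/lov_unpackmd() declared above would be
expected to produce and consume for a given stripe count.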
index 9ef7052..22fe694 100644
 #define LUSTRE_SANOSC_NAME "sanosc"
 #define LUSTRE_SANOST_NAME "sanost"
 
-/* ost/ost_pack.c */
-void ost_pack_niobuf(struct niobuf_remote *nb, __u64 offset, __u32 len,
-                     __u32 flags, __u32 xid);
-void ost_unpack_niobuf(struct niobuf_remote *dst, struct niobuf_remote *src);
-void ost_pack_ioo(struct obd_ioobj *ioo, struct lov_stripe_md *lsm, int bufcnt);
-void ost_unpack_ioo(struct obd_ioobj *dst, struct obd_ioobj *src);
+struct osc_brw_async_args {
+        int              aa_requested_nob;
+        int              aa_nio_count;
+        obd_count        aa_page_count;
+        struct brw_page *aa_pga;
+};
+
+struct osc_getattr_async_args {
+        struct obdo     *aa_oa;
+};
 
 #endif
index 3af66b5..1e6de5a 100644
@@ -22,9 +22,12 @@ extern void ptlbd_blk_exit(void);
 extern void ptlbd_cl_exit(void);
 extern void ptlbd_sv_exit(void);
 
+extern int ptlbd_do_connect(struct ptlbd_obd *);
+extern int ptlbd_do_disconnect(struct ptlbd_obd *);
 extern void ptlbd_blk_register(struct ptlbd_obd *ptlbd);
-extern int ptlbd_send_req(struct ptlbd_obd *, ptlbd_cmd_t cmd, 
-               struct request *);
-extern int ptlbd_parse_req(struct ptlrpc_request *req);
+extern int ptlbd_send_rw_req(struct ptlbd_obd *, ptlbd_cmd_t cmd, 
+               struct buffer_head *);
+extern int ptlbd_send_flush_req(struct ptlbd_obd *, ptlbd_cmd_t cmd);
+extern int ptlbd_handle(struct ptlrpc_request *req);
 
 #endif
index 85e577a..69a47dc 100644
@@ -38,7 +38,7 @@ extern atomic_t obd_memory;
 extern int obd_memmax;
 extern unsigned long obd_fail_loc;
 extern unsigned long obd_timeout;
-extern char obd_recovery_upcall[128];
+extern char obd_lustre_upcall[128];
 extern unsigned long obd_sync_filter;
 
 #define OBD_FAIL_MDS                     0x100
@@ -93,6 +93,9 @@ extern unsigned long obd_sync_filter;
 #define OBD_FAIL_OST_BRW_WRITE_BULK      0x20e
 #define OBD_FAIL_OST_BRW_READ_BULK       0x20f
 #define OBD_FAIL_OST_SYNCFS_NET          0x210
+#define OBD_FAIL_OST_ALL_REPLY_NET       0x211
+#define OBD_FAIL_OST_ALL_REQUESTS_NET    0x212
+#define OBD_FAIL_OST_LDLM_REPLY_NET      0x213
 
 #define OBD_FAIL_LDLM                    0x300
 #define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
@@ -153,9 +156,11 @@ do {                                                                         \
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
 #define ll_bdevname(a) __bdevname((a))
 #define ll_lock_kernel lock_kernel()
+#define LTIME_S(time) (time.tv_sec)
 #else
 #define ll_lock_kernel
 #define ll_bdevname(a) bdevname((a))
+#define LTIME_S(time) (time)
 #endif
 
 
@@ -185,7 +190,8 @@ static inline void OBD_FAIL_WRITE(int id, kdev_t dev)
                 obd_fail_loc |= OBD_FAILED | OBD_FAIL_ONCE;
         }
 }
-
+#else /* !__KERNEL__ */
+#define LTIME_S(time) (time)
 #endif  /* __KERNEL__ */
 
 #define OBD_ALLOC(ptr, size)                                            \
@@ -208,6 +214,30 @@ do {                                                                    \
         }                                                               \
 } while (0)
 
+#ifdef __arch_um__
+# define OBD_VMALLOC(ptr, size) OBD_ALLOC(ptr, size)
+#else
+# define OBD_VMALLOC(ptr, size)                                         \
+do {                                                                    \
+        void *lptr;                                                     \
+        int s = (size);                                                 \
+        (ptr) = lptr = vmalloc(s);                                      \
+        if (lptr == NULL) {                                             \
+                CERROR("vmalloc of '" #ptr "' (%d bytes) failed "       \
+                       "at %s:%d\n", s, __FILE__, __LINE__);            \
+        } else {                                                        \
+                int obd_curmem;                                         \
+                memset(lptr, 0, s);                                     \
+                atomic_add(s, &obd_memory);                             \
+                obd_curmem = atomic_read(&obd_memory);                  \
+                if (obd_curmem > obd_memmax)                            \
+                        obd_memmax = obd_curmem;                        \
+                CDEBUG(D_MALLOC, "vmalloced '" #ptr "': %d at %p "      \
+                       "(tot %d)\n", s, lptr, obd_curmem);              \
+        }                                                               \
+} while (0)
+#endif
+
 #ifdef CONFIG_DEBUG_SLAB
 #define POISON(lptr, c, s) do {} while (0)
 #else
@@ -227,11 +257,55 @@ do {                                                                    \
         (ptr) = (void *)0xdeadbeef;                                     \
 } while (0)
 
-#ifdef CONFIG_HIGHMEM
-extern void obd_kmap_get(int count, int server);
-extern void obd_kmap_put(int count);
+#ifdef __arch_um__
+# define OBD_VFREE(ptr, size) OBD_FREE(ptr, size)
 #else
-#define obd_kmap_get(count, server) do {} while (0)
-#define obd_kmap_put(count) do {} while (0)
+# define OBD_VFREE(ptr, size)                                           \
+do {                                                                    \
+        void *lptr = (ptr);                                             \
+        int s = (size);                                                 \
+        LASSERT(lptr);                                                  \
+        POISON(lptr, 0x5a, s);                                          \
+        vfree(lptr);                                                    \
+        atomic_sub(s, &obd_memory);                                     \
+        CDEBUG(D_MALLOC, "vfreed '" #ptr "': %d at %p (tot %d).\n",     \
+               s, lptr, atomic_read(&obd_memory));                      \
+        (ptr) = (void *)0xdeadbeef;                                     \
+} while (0)
 #endif
+
+#define OBD_SLAB_ALLOC(ptr, slab, type, size)                             \
+do {                                                                      \
+        long s = (size);                                                  \
+        void *lptr;                                                       \
+        LASSERT (!in_interrupt());                                        \
+        (ptr) = lptr = kmem_cache_alloc((slab), type);                    \
+        if (lptr == NULL) {                                               \
+                CERROR("slab-alloc of '" #ptr "' (%ld bytes) failed "     \
+                       "at %s:%d\n", s, __FILE__, __LINE__);              \
+        } else {                                                          \
+                int obd_curmem;                                           \
+                memset(lptr, 0, s);                                       \
+                atomic_add(s, &obd_memory);                               \
+                obd_curmem = atomic_read(&obd_memory);                    \
+                if (obd_curmem > obd_memmax)                              \
+                        obd_memmax = obd_curmem;                          \
+                CDEBUG(D_MALLOC, "slab-alloced '" #ptr "': %ld at %p "    \
+                       "(tot %d)\n", s, lptr, obd_curmem);                \
+        }                                                                 \
+} while (0)
+
+#define OBD_SLAB_FREE(ptr, slab, size)                                    \
+do {                                                                      \
+        long s = (size);                                                  \
+        void *lptr = (ptr);                                               \
+        LASSERT(lptr);                                                    \
+        POISON(lptr, 0x5a, s);                                            \
+        CDEBUG(D_MALLOC, "slab-freed '" #ptr "': %ld at %p (tot %d).\n",  \
+               s, lptr, atomic_read(&obd_memory));                        \
+        kmem_cache_free((slab), lptr);                                    \
+        atomic_sub(s, &obd_memory);                                       \
+        (ptr) = (void *)0xdeadbeef;                                       \
+} while (0)
+
 #endif
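The LTIME_S() macro defined above (with its non-kernel fallback) is what lets
the obd.h hunks earlier in this diff drop their LINUX_VERSION_CODE
conditionals: on 2.5+ kernels inode and iattr timestamps are struct timespec,
on 2.4 they are plain time_t, and LTIME_S() names the seconds value in either
case. A minimal sketch with an illustrative helper name, mirroring the
obdo_from_inode() conversion shown earlier:

    /* sketch only: copy the inode mtime regardless of kernel timestamp type */
    static inline void obdo_set_mtime_sketch(struct obdo *dst, struct inode *src)
    {
            dst->o_mtime = LTIME_S(src->i_mtime); /* tv_sec on 2.5+, time_t on 2.4 */
            dst->o_valid |= OBD_MD_FLMTIME;
    }

The new OBD_VMALLOC/OBD_VFREE and OBD_SLAB_ALLOC/OBD_SLAB_FREE macros follow
the same contract as OBD_ALLOC/OBD_FREE: each allocation must be paired with a
free of the same size so the obd_memory accounting stays balanced, and freed
memory is poisoned with 0x5a unless CONFIG_DEBUG_SLAB is configured.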
index 7d4c4b6..1860f13 100644
@@ -57,7 +57,7 @@ within the root of that tree.  The scripts manage a "stack" of patches.
 Each patch is a changeset against the base tree plus the preceding patches.
 
 All patches are listed, in order, in the file ./series.  You manage the
-series file.
+series file.  Lines in the series file which start with `#' are ignored.
 
 Any currently-applied patches are described in the file
 ./applied-patches.  The patch scripts manage this file.
@@ -351,6 +351,15 @@ inpatch
 
        cat pc/$(toppatch).pc
 
+join-patch patchname
+
+  "joins" the named patch to the current topmost patch.
+
+  Use this when you want to merge two patches into one.  All the
+  files which `patchname' affects are added to pc/$(toppatch).pc (if
+  they are not already there) and patch `patchname' is applied.  The
+  top patch remains unchanged.  You'll need to run refpatch afterwards.
+
 mpatch
 
   A low-level thing to generate patches
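Tying the series-file and join-patch notes above together: ./series is simply
an ordered list of patch names, one per line, and a leading `#' comments a
line out. A small illustrative example (the patch names here are made up):

    # ./series -- applied top to bottom
    first-fix.patch
    second-fix.patch
    # experimental-fix.patch   (temporarily disabled)

Running join-patch on one of these folds the files it touches into
pc/$(toppatch).pc and applies the patch; refpatch must then be run to
regenerate the combined top patch.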
diff --git a/lustre/kernel_patches/kernel_configs/config-linux-2.4.18-i386 b/lustre/kernel_patches/kernel_configs/config-linux-2.4.18-i386
new file mode 100644
index 0000000..94ee0ab
--- /dev/null
+++ b/lustre/kernel_patches/kernel_configs/config-linux-2.4.18-i386
@@ -0,0 +1,1834 @@
+#
+# Automatically generated by make menuconfig: don't edit
+#
+CONFIG_X86=y
+CONFIG_ISA=y
+# CONFIG_SBUS is not set
+CONFIG_UID16=y
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODVERSIONS=y
+CONFIG_KMOD=y
+
+#
+# Processor type and features
+#
+CONFIG_LOLAT=y
+# CONFIG_LOLAT_SYSCTL is not set
+CONFIG_M386=y
+# CONFIG_M486 is not set
+# CONFIG_M586 is not set
+# CONFIG_M586TSC is not set
+# CONFIG_M586MMX is not set
+# CONFIG_M686 is not set
+# CONFIG_MPENTIUMIII is not set
+# CONFIG_MPENTIUM4 is not set
+# CONFIG_MK6 is not set
+# CONFIG_MK7 is not set
+# CONFIG_MELAN is not set
+# CONFIG_MCRUSOE is not set
+# CONFIG_MWINCHIPC6 is not set
+# CONFIG_MWINCHIP2 is not set
+# CONFIG_MWINCHIP3D is not set
+# CONFIG_MCYRIXIII is not set
+# CONFIG_X86_CMPXCHG is not set
+# CONFIG_X86_XADD is not set
+CONFIG_X86_L1_CACHE_SHIFT=4
+CONFIG_RWSEM_GENERIC_SPINLOCK=y
+# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set
+CONFIG_X86_PPRO_FENCE=y
+CONFIG_X86_MCE=y
+# CONFIG_CPU_FREQ is not set
+CONFIG_TOSHIBA=m
+CONFIG_I8K=m
+# CONFIG_MICROCODE is not set
+CONFIG_X86_MSR=m
+CONFIG_X86_CPUID=m
+# CONFIG_E820_PROC is not set
+CONFIG_NOHIGHMEM=y
+# CONFIG_HIGHMEM4G is not set
+# CONFIG_HIGHMEM64G is not set
+CONFIG_HIGHIO=y
+CONFIG_MATH_EMULATION=y
+CONFIG_MTRR=y
+# CONFIG_SMP is not set
+CONFIG_X86_UP_APIC=y
+CONFIG_X86_UP_IOAPIC=y
+CONFIG_X86_LOCAL_APIC=y
+CONFIG_X86_IO_APIC=y
+
+#
+# General setup
+#
+CONFIG_HZ=100
+CONFIG_NET=y
+CONFIG_PCI=y
+# CONFIG_PCI_GOBIOS is not set
+# CONFIG_PCI_GODIRECT is not set
+CONFIG_PCI_GOANY=y
+CONFIG_PCI_BIOS=y
+CONFIG_PCI_DIRECT=y
+CONFIG_PCI_NAMES=y
+
+#
+# Performance-monitoring counters support
+#
+# CONFIG_PERFCTR is not set
+CONFIG_EISA=y
+# CONFIG_MCA is not set
+CONFIG_HOTPLUG=y
+
+#
+# PCMCIA/CardBus support
+#
+CONFIG_PCMCIA=m
+CONFIG_CARDBUS=y
+CONFIG_TCIC=y
+CONFIG_I82092=y
+CONFIG_I82365=y
+
+#
+# PCI Hotplug Support
+#
+# CONFIG_HOTPLUG_PCI is not set
+# CONFIG_HOTPLUG_PCI_COMPAQ is not set
+# CONFIG_HOTPLUG_PCI_COMPAQ_NVRAM is not set
+# CONFIG_HOTPLUG_PCI_IBM is not set
+# CONFIG_HOTPLUG_PCI_ACPI is not set
+CONFIG_SYSVIPC=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_SYSCTL=y
+CONFIG_KCORE_ELF=y
+# CONFIG_KCORE_AOUT is not set
+CONFIG_BINFMT_AOUT=m
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=m
+# CONFIG_IKCONFIG is not set
+CONFIG_PM=y
+
+#
+# Additional device driver support
+#
+CONFIG_CIPE=m
+CONFIG_CRYPTO_AEP=m
+CONFIG_MEGARAC=m
+CONFIG_FC_QLA2200=m
+CONFIG_FC_QLA2300=m
+CONFIG_SCSI_ISCSI=m
+# CONFIG_IBMASM is not set
+# CONFIG_IBMSER is not set
+# CONFIG_ACPI is not set
+CONFIG_APM=y
+# CONFIG_APM_IGNORE_USER_SUSPEND is not set
+# CONFIG_APM_DO_ENABLE is not set
+CONFIG_APM_CPU_IDLE=y
+# CONFIG_APM_DISPLAY_BLANK is not set
+CONFIG_APM_RTC_IS_GMT=y
+# CONFIG_APM_ALLOW_INTS is not set
+# CONFIG_APM_REAL_MODE_POWER_OFF is not set
+
+#
+# Binary emulation of other systems
+#
+CONFIG_ABI=m
+CONFIG_ABI_SVR4=m
+CONFIG_ABI_UW7=m
+# CONFIG_ABI_SOLARIS is not set
+CONFIG_ABI_IBCS=m
+CONFIG_ABI_ISC=m
+CONFIG_ABI_SCO=m
+# CONFIG_ABI_WYSE is not set
+CONFIG_BINFMT_COFF=m
+CONFIG_BINFMT_XOUT=m
+# CONFIG_BINFMT_XOUT_X286 is not set
+CONFIG_ABI_SPX=y
+CONFIG_ABI_XTI=y
+CONFIG_ABI_TLI_OPTMGMT=y
+# CONFIG_ABI_XTI_OPTMGMT is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Parallel port support
+#
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_PARPORT_PC_CML1=m
+CONFIG_PARPORT_SERIAL=m
+# CONFIG_PARPORT_PC_FIFO is not set
+# CONFIG_PARPORT_PC_SUPERIO is not set
+CONFIG_PARPORT_PC_PCMCIA=m
+# CONFIG_PARPORT_AMIGA is not set
+# CONFIG_PARPORT_MFC3 is not set
+# CONFIG_PARPORT_ATARI is not set
+# CONFIG_PARPORT_GSC is not set
+# CONFIG_PARPORT_SUNBPP is not set
+# CONFIG_PARPORT_OTHER is not set
+CONFIG_PARPORT_1284=y
+
+#
+# Plug and Play configuration
+#
+CONFIG_PNP=y
+CONFIG_ISAPNP=y
+# CONFIG_PNPBIOS is not set
+
+#
+# Block devices
+#
+CONFIG_BLK_DEV_FD=y
+CONFIG_BLK_DEV_XD=m
+CONFIG_PARIDE=m
+CONFIG_PARIDE_PARPORT=m
+CONFIG_PARIDE_PD=m
+CONFIG_PARIDE_PCD=m
+CONFIG_PARIDE_PF=m
+CONFIG_PARIDE_PT=m
+CONFIG_PARIDE_PG=m
+CONFIG_PARIDE_ATEN=m
+CONFIG_PARIDE_BPCK=m
+CONFIG_PARIDE_BPCK6=m
+CONFIG_PARIDE_COMM=m
+CONFIG_PARIDE_DSTR=m
+CONFIG_PARIDE_FIT2=m
+CONFIG_PARIDE_FIT3=m
+CONFIG_PARIDE_EPAT=m
+CONFIG_PARIDE_EPATC8=y
+CONFIG_PARIDE_EPIA=m
+CONFIG_PARIDE_FRIQ=m
+CONFIG_PARIDE_FRPW=m
+CONFIG_PARIDE_KBIC=m
+CONFIG_PARIDE_KTTI=m
+CONFIG_PARIDE_ON20=m
+CONFIG_PARIDE_ON26=m
+CONFIG_BLK_CPQ_DA=m
+CONFIG_BLK_CPQ_CISS_DA=m
+CONFIG_CISS_SCSI_TAPE=y
+CONFIG_BLK_DEV_DAC960=m
+CONFIG_BLK_DEV_UMEM=m
+CONFIG_BLK_DEV_LOOP=m
+CONFIG_BLK_DEV_NBD=m
+# CONFIG_BLK_DEV_ENBD is not set
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=4096
+CONFIG_BLK_DEV_INITRD=y
+
+#
+# Multi-device support (RAID and LVM)
+#
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_LINEAR=m
+CONFIG_MD_RAID0=m
+CONFIG_MD_RAID1=m
+CONFIG_MD_RAID5=m
+CONFIG_MD_MULTIPATH=m
+CONFIG_BLK_DEV_LVM=m
+
+#
+# Cryptography support (CryptoAPI)
+#
+CONFIG_CRYPTO=m
+CONFIG_CIPHERS=m
+CONFIG_CIPHER_AES=m
+CONFIG_CIPHER_IDENTITY=m
+CONFIG_CRYPTODEV=m
+CONFIG_CRYPTOLOOP=m
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_NETLINK_DEV=y
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_DEBUG is not set
+CONFIG_FILTER=y
+CONFIG_UNIX=y
+CONFIG_INET=y
+CONFIG_TUX=m
+CONFIG_TUX_EXTCGI=y
+# CONFIG_TUX_EXTENDED_LOG is not set
+# CONFIG_TUX_DEBUG is not set
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_FWMARK=y
+CONFIG_IP_ROUTE_NAT=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_TOS=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_ROUTE_LARGE_TABLES=y
+# CONFIG_IP_PNP is not set
+CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE=m
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+# CONFIG_ARPD is not set
+# CONFIG_INET_ECN is not set
+CONFIG_SYN_COOKIES=y
+
+#
+#   IP: Netfilter Configuration
+#
+CONFIG_IP_NF_CONNTRACK=m
+CONFIG_IP_NF_FTP=m
+CONFIG_IP_NF_IRC=m
+CONFIG_IP_NF_QUEUE=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_LIMIT=m
+CONFIG_IP_NF_MATCH_MAC=m
+CONFIG_IP_NF_MATCH_MARK=m
+CONFIG_IP_NF_MATCH_MULTIPORT=m
+CONFIG_IP_NF_MATCH_TOS=m
+CONFIG_IP_NF_MATCH_AH_ESP=m
+CONFIG_IP_NF_MATCH_LENGTH=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_MATCH_TCPMSS=m
+CONFIG_IP_NF_MATCH_STATE=m
+CONFIG_IP_NF_MATCH_UNCLEAN=m
+CONFIG_IP_NF_MATCH_OWNER=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_TARGET_MIRROR=m
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_NF_NAT_LOCAL=y
+CONFIG_IP_NF_NAT_SNMP_BASIC=m
+CONFIG_IP_NF_NAT_IRC=m
+CONFIG_IP_NF_NAT_FTP=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_IP_NF_TARGET_TOS=m
+CONFIG_IP_NF_TARGET_MARK=m
+CONFIG_IP_NF_TARGET_LOG=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_IP_NF_TARGET_TCPMSS=m
+CONFIG_IP_NF_ARPTABLES=m
+CONFIG_IP_NF_ARPFILTER=m
+CONFIG_IP_NF_COMPAT_IPCHAINS=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_COMPAT_IPFWADM=m
+CONFIG_IP_NF_NAT_NEEDED=y
+
+#
+#   IP: Virtual Server Configuration
+#
+CONFIG_IP_VS=m
+# CONFIG_IP_VS_DEBUG is not set
+CONFIG_IP_VS_TAB_BITS=16
+CONFIG_IP_VS_RR=m
+CONFIG_IP_VS_WRR=m
+CONFIG_IP_VS_LC=m
+CONFIG_IP_VS_WLC=m
+CONFIG_IP_VS_LBLC=m
+CONFIG_IP_VS_LBLCR=m
+CONFIG_IP_VS_DH=m
+CONFIG_IP_VS_SH=m
+CONFIG_IP_VS_FTP=m
+CONFIG_IPV6=m
+
+#
+#   IPv6: Netfilter Configuration
+#
+# CONFIG_IP6_NF_QUEUE is not set
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_MATCH_LIMIT=m
+CONFIG_IP6_NF_MATCH_MAC=m
+CONFIG_IP6_NF_MATCH_MULTIPORT=m
+CONFIG_IP6_NF_MATCH_OWNER=m
+CONFIG_IP6_NF_MATCH_MARK=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_LOG=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_TARGET_MARK=m
+# CONFIG_KHTTPD is not set
+CONFIG_ATM=y
+CONFIG_ATM_CLIP=y
+# CONFIG_ATM_CLIP_NO_ICMP is not set
+CONFIG_ATM_LANE=m
+CONFIG_ATM_MPOA=m
+CONFIG_ATM_BR2684=m
+CONFIG_ATM_BR2684_IPFILTER=y
+CONFIG_VLAN_8021Q=m
+CONFIG_IPX=m
+# CONFIG_IPX_INTERN is not set
+CONFIG_ATALK=m
+
+#
+# Appletalk devices
+#
+CONFIG_DEV_APPLETALK=y
+CONFIG_LTPC=m
+CONFIG_COPS=m
+CONFIG_COPS_DAYNA=y
+CONFIG_COPS_TANGENT=y
+CONFIG_IPDDP=m
+CONFIG_IPDDP_ENCAP=y
+CONFIG_IPDDP_DECAP=y
+CONFIG_DECNET=m
+CONFIG_DECNET_SIOCGIFCONF=y
+CONFIG_DECNET_ROUTER=y
+CONFIG_DECNET_ROUTE_FWMARK=y
+CONFIG_BRIDGE=m
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_LLC is not set
+CONFIG_NET_DIVERT=y
+# CONFIG_ECONET is not set
+CONFIG_WAN_ROUTER=m
+# CONFIG_NET_FASTROUTE is not set
+# CONFIG_NET_HW_FLOWCONTROL is not set
+
+#
+# QoS and/or fair queueing
+#
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_CSZ=m
+# CONFIG_NET_SCH_ATM is not set
+CONFIG_NET_SCH_PRIO=m
+CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_SCH_TEQL=m
+CONFIG_NET_SCH_TBF=m
+CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_DSMARK=m
+CONFIG_NET_SCH_INGRESS=m
+CONFIG_NET_QOS=y
+CONFIG_NET_ESTIMATOR=y
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_TCINDEX=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_ROUTE=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_RSVP6=m
+CONFIG_NET_CLS_POLICE=y
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+
+#
+# Telephony Support
+#
+CONFIG_PHONE=m
+CONFIG_PHONE_IXJ=m
+CONFIG_PHONE_IXJ_PCMCIA=m
+
+#
+# ATA/IDE/MFM/RLL support
+#
+CONFIG_IDE=y
+
+#
+# IDE, ATA and ATAPI Block devices
+#
+CONFIG_BLK_DEV_IDE=y
+# CONFIG_BLK_DEV_HD_IDE is not set
+# CONFIG_BLK_DEV_HD is not set
+CONFIG_BLK_DEV_IDEDISK=y
+CONFIG_IDEDISK_MULTI_MODE=y
+# CONFIG_IDEDISK_STROKE is not set
+# CONFIG_BLK_DEV_IDEDISK_VENDOR is not set
+# CONFIG_BLK_DEV_IDEDISK_FUJITSU is not set
+# CONFIG_BLK_DEV_IDEDISK_IBM is not set
+# CONFIG_BLK_DEV_IDEDISK_MAXTOR is not set
+# CONFIG_BLK_DEV_IDEDISK_QUANTUM is not set
+# CONFIG_BLK_DEV_IDEDISK_SEAGATE is not set
+# CONFIG_BLK_DEV_IDEDISK_WD is not set
+# CONFIG_BLK_DEV_COMMERIAL is not set
+# CONFIG_BLK_DEV_TIVO is not set
+CONFIG_BLK_DEV_IDECS=m
+CONFIG_BLK_DEV_IDECD=m
+CONFIG_BLK_DEV_IDETAPE=m
+CONFIG_BLK_DEV_IDEFLOPPY=y
+CONFIG_BLK_DEV_IDESCSI=m
+# CONFIG_IDE_TASK_IOCTL is not set
+CONFIG_BLK_DEV_CMD640=y
+# CONFIG_BLK_DEV_CMD640_ENHANCED is not set
+CONFIG_BLK_DEV_ISAPNP=y
+CONFIG_BLK_DEV_RZ1000=y
+CONFIG_BLK_DEV_IDEPCI=y
+CONFIG_IDEPCI_SHARE_IRQ=y
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+# CONFIG_BLK_DEV_OFFBOARD is not set
+# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
+CONFIG_IDEDMA_PCI_AUTO=y
+# CONFIG_IDEDMA_ONLYDISK is not set
+CONFIG_BLK_DEV_IDEDMA=y
+# CONFIG_IDEDMA_PCI_WIP is not set
+# CONFIG_BLK_DEV_IDEDMA_TIMEOUT is not set
+# CONFIG_IDEDMA_NEW_DRIVE_LISTINGS is not set
+CONFIG_BLK_DEV_ADMA=y
+CONFIG_BLK_DEV_AEC62XX=y
+CONFIG_AEC62XX_TUNING=y
+CONFIG_BLK_DEV_ALI15X3=y
+# CONFIG_WDC_ALI15X3 is not set
+CONFIG_BLK_DEV_AMD74XX=y
+# CONFIG_AMD74XX_OVERRIDE is not set
+CONFIG_BLK_DEV_CMD64X=y
+CONFIG_BLK_DEV_CMD680=y
+CONFIG_BLK_DEV_CY82C693=y
+CONFIG_BLK_DEV_CS5530=y
+CONFIG_BLK_DEV_HPT34X=y
+# CONFIG_HPT34X_AUTODMA is not set
+CONFIG_BLK_DEV_HPT366=y
+CONFIG_BLK_DEV_PIIX=y
+CONFIG_PIIX_TUNING=y
+# CONFIG_BLK_DEV_NS87415 is not set
+# CONFIG_BLK_DEV_OPTI621 is not set
+CONFIG_BLK_DEV_ADMA100=y
+CONFIG_BLK_DEV_PDC202XX=y
+# CONFIG_PDC202XX_BURST is not set
+CONFIG_PDC202XX_FORCE=y
+CONFIG_BLK_DEV_SVWKS=y
+CONFIG_BLK_DEV_SIS5513=y
+CONFIG_BLK_DEV_SLC90E66=y
+# CONFIG_BLK_DEV_TRM290 is not set
+CONFIG_BLK_DEV_VIA82CXXX=y
+CONFIG_BLK_DEV_CENATEK=y
+# CONFIG_IDE_CHIPSETS is not set
+# CONFIG_BLK_DEV_ELEVATOR_NOOP is not set
+CONFIG_IDEDMA_AUTO=y
+# CONFIG_IDEDMA_IVB is not set
+# CONFIG_DMA_NONPCI is not set
+CONFIG_BLK_DEV_IDE_MODES=y
+CONFIG_BLK_DEV_ATARAID=m
+CONFIG_BLK_DEV_ATARAID_PDC=m
+CONFIG_BLK_DEV_ATARAID_HPT=m
+
+#
+# SCSI support
+#
+CONFIG_SCSI=m
+CONFIG_BLK_DEV_SD=m
+CONFIG_SD_EXTRA_DEVS=40
+CONFIG_CHR_DEV_ST=m
+CONFIG_CHR_DEV_OSST=m
+CONFIG_BLK_DEV_SR=m
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_SR_EXTRA_DEVS=4
+CONFIG_CHR_DEV_SG=m
+# CONFIG_SCSI_DEBUG_QUEUES is not set
+# CONFIG_SCSI_MULTI_LUN is not set
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_LOGGING=y
+
+#
+# SCSI low-level drivers
+#
+CONFIG_BLK_DEV_3W_XXXX_RAID=m
+CONFIG_SCSI_7000FASST=m
+CONFIG_SCSI_ACARD=m
+CONFIG_SCSI_AHA152X=m
+CONFIG_SCSI_AHA1542=m
+CONFIG_SCSI_AHA1740=m
+CONFIG_SCSI_AACRAID=m
+CONFIG_SCSI_AIC7XXX=m
+CONFIG_AIC7XXX_CMDS_PER_DEVICE=253
+CONFIG_AIC7XXX_RESET_DELAY_MS=15000
+# CONFIG_AIC7XXX_PROBE_EISA_VL is not set
+# CONFIG_AIC7XXX_BUILD_FIRMWARE is not set
+CONFIG_SCSI_AIC79XX=m
+CONFIG_AIC79XX_CMDS_PER_DEVICE=253
+CONFIG_AIC79XX_RESET_DELAY_MS=15000
+# CONFIG_AIC79XX_BUILD_FIRMWARE is not set
+CONFIG_AIC79XX_ENABLE_RD_STRM=y
+# CONFIG_AIC79XX_DEBUG_ENABLE is not set
+CONFIG_AIC79XX_DEBUG_MASK=0
+CONFIG_SCSI_AIC7XXX_OLD=m
+CONFIG_AIC7XXX_OLD_TCQ_ON_BY_DEFAULT=y
+CONFIG_AIC7XXX_OLD_CMDS_PER_DEVICE=32
+CONFIG_AIC7XXX_OLD_PROC_STATS=y
+CONFIG_SCSI_DPT_I2O=m
+CONFIG_SCSI_ADVANSYS=m
+CONFIG_SCSI_IN2000=m
+CONFIG_SCSI_AM53C974=m
+CONFIG_SCSI_MEGARAID=m
+CONFIG_SCSI_BUSLOGIC=m
+# CONFIG_SCSI_OMIT_FLASHPOINT is not set
+CONFIG_SCSI_CPQFCTS=m
+CONFIG_SCSI_DMX3191D=m
+CONFIG_SCSI_DTC3280=m
+CONFIG_SCSI_EATA=m
+CONFIG_SCSI_EATA_TAGGED_QUEUE=y
+# CONFIG_SCSI_EATA_LINKED_COMMANDS is not set
+CONFIG_SCSI_EATA_MAX_TAGS=16
+CONFIG_SCSI_EATA_DMA=m
+CONFIG_SCSI_EATA_PIO=m
+CONFIG_SCSI_FUTURE_DOMAIN=m
+CONFIG_SCSI_GDTH=m
+CONFIG_SCSI_GENERIC_NCR5380=m
+# CONFIG_SCSI_GENERIC_NCR53C400 is not set
+CONFIG_SCSI_G_NCR5380_PORT=y
+# CONFIG_SCSI_G_NCR5380_MEM is not set
+CONFIG_SCSI_IPS=m
+CONFIG_SCSI_INITIO=m
+CONFIG_SCSI_INIA100=m
+CONFIG_SCSI_PPA=m
+CONFIG_SCSI_IMM=m
+# CONFIG_SCSI_IZIP_EPP16 is not set
+# CONFIG_SCSI_IZIP_SLOW_CTR is not set
+CONFIG_SCSI_NCR53C406A=m
+CONFIG_SCSI_NCR53C7xx=m
+# CONFIG_SCSI_NCR53C7xx_sync is not set
+CONFIG_SCSI_NCR53C7xx_FAST=y
+CONFIG_SCSI_NCR53C7xx_DISCONNECT=y
+CONFIG_SCSI_SYM53C8XX_2=m
+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1
+CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
+CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
+# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set
+CONFIG_SCSI_NCR53C8XX=m
+CONFIG_SCSI_SYM53C8XX=m
+CONFIG_SCSI_NCR53C8XX_DEFAULT_TAGS=8
+CONFIG_SCSI_NCR53C8XX_MAX_TAGS=32
+CONFIG_SCSI_NCR53C8XX_SYNC=40
+# CONFIG_SCSI_NCR53C8XX_PROFILE is not set
+# CONFIG_SCSI_NCR53C8XX_IOMAPPED is not set
+# CONFIG_SCSI_NCR53C8XX_PQS_PDS is not set
+# CONFIG_SCSI_NCR53C8XX_SYMBIOS_COMPAT is not set
+CONFIG_SCSI_PAS16=m
+CONFIG_SCSI_PCI2000=m
+CONFIG_SCSI_PCI2220I=m
+CONFIG_SCSI_PSI240I=m
+CONFIG_SCSI_QLOGIC_FAS=m
+CONFIG_SCSI_QLOGIC_ISP=m
+CONFIG_SCSI_QLOGIC_FC=m
+# CONFIG_SCSI_QLOGIC_FC_FIRMWARE is not set
+CONFIG_SCSI_QLOGIC_1280=m
+CONFIG_SCSI_NEWISP=m
+CONFIG_SCSI_SEAGATE=m
+CONFIG_SCSI_SIM710=m
+CONFIG_SCSI_SYM53C416=m
+CONFIG_SCSI_DC390T=m
+# CONFIG_SCSI_DC390T_NOGENSUPP is not set
+CONFIG_SCSI_T128=m
+CONFIG_SCSI_U14_34F=m
+# CONFIG_SCSI_U14_34F_LINKED_COMMANDS is not set
+CONFIG_SCSI_U14_34F_MAX_TAGS=8
+CONFIG_SCSI_ULTRASTOR=m
+CONFIG_SCSI_DEBUG=m
+
+#
+# PCMCIA SCSI adapter support
+#
+CONFIG_SCSI_PCMCIA=y
+CONFIG_PCMCIA_AHA152X=m
+CONFIG_PCMCIA_FDOMAIN=m
+CONFIG_PCMCIA_NINJA_SCSI=m
+CONFIG_PCMCIA_QLOGIC=m
+
+#
+# Fusion MPT device support
+#
+CONFIG_FUSION=m
+# CONFIG_FUSION_BOOT is not set
+# CONFIG_FUSION_ISENSE is not set
+CONFIG_FUSION_CTL=m
+CONFIG_FUSION_LAN=m
+CONFIG_NET_FC=y
+
+#
+# IEEE 1394 (FireWire) support (EXPERIMENTAL)
+#
+CONFIG_IEEE1394=m
+# CONFIG_IEEE1394_PCILYNX is not set
+CONFIG_IEEE1394_OHCI1394=m
+CONFIG_IEEE1394_VIDEO1394=m
+CONFIG_IEEE1394_SBP2=m
+CONFIG_IEEE1394_ETH1394=m
+CONFIG_IEEE1394_DV1394=m
+CONFIG_IEEE1394_RAWIO=m
+CONFIG_IEEE1394_CMP=m
+# CONFIG_IEEE1394_VERBOSEDEBUG is not set
+
+#
+# I2O device support
+#
+CONFIG_I2O=m
+CONFIG_I2O_PCI=m
+CONFIG_I2O_BLOCK=m
+CONFIG_I2O_LAN=m
+CONFIG_I2O_SCSI=m
+CONFIG_I2O_PROC=m
+
+#
+# Network device support
+#
+CONFIG_NETDEVICES=y
+
+#
+# ARCnet devices
+#
+# CONFIG_ARCNET is not set
+CONFIG_DUMMY=m
+CONFIG_BONDING=m
+CONFIG_EQUALIZER=m
+CONFIG_TUN=m
+CONFIG_ETHERTAP=m
+CONFIG_NET_SB1000=m
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+# CONFIG_SUNLANCE is not set
+CONFIG_HAPPYMEAL=m
+# CONFIG_SUNBMAC is not set
+# CONFIG_SUNQE is not set
+CONFIG_SUNGEM=m
+CONFIG_NET_VENDOR_3COM=y
+CONFIG_EL1=m
+CONFIG_EL2=m
+CONFIG_ELPLUS=m
+CONFIG_EL16=m
+CONFIG_EL3=m
+CONFIG_3C515=m
+# CONFIG_ELMC is not set
+# CONFIG_ELMC_II is not set
+CONFIG_VORTEX=m
+CONFIG_LANCE=m
+CONFIG_NET_VENDOR_SMC=y
+CONFIG_WD80x3=m
+# CONFIG_ULTRAMCA is not set
+CONFIG_ULTRA=m
+CONFIG_ULTRA32=m
+CONFIG_SMC9194=m
+CONFIG_NET_VENDOR_RACAL=y
+CONFIG_NI5010=m
+CONFIG_NI52=m
+CONFIG_NI65=m
+CONFIG_AT1700=m
+CONFIG_DEPCA=m
+CONFIG_HP100=m
+CONFIG_NET_ISA=y
+CONFIG_E2100=m
+CONFIG_EWRK3=m
+CONFIG_EEXPRESS=m
+CONFIG_EEXPRESS_PRO=m
+CONFIG_HPLAN_PLUS=m
+CONFIG_HPLAN=m
+CONFIG_LP486E=m
+CONFIG_ETH16I=m
+CONFIG_NE2000=m
+CONFIG_NET_PCI=y
+CONFIG_PCNET32=m
+CONFIG_ADAPTEC_STARFIRE=m
+CONFIG_AC3200=m
+CONFIG_APRICOT=m
+CONFIG_CS89x0=m
+CONFIG_TULIP=m
+CONFIG_TC35815=m
+# CONFIG_TULIP_MWI is not set
+CONFIG_TULIP_MMIO=y
+CONFIG_DE4X5=m
+CONFIG_DGRS=m
+CONFIG_DM9102=m
+CONFIG_EEPRO100=m
+CONFIG_NET_E100=m
+CONFIG_LNE390=m
+CONFIG_FEALNX=m
+CONFIG_NATSEMI=m
+# CONFIG_NATSEMI_CABLE_MAGIC is not set
+CONFIG_NE2K_PCI=m
+CONFIG_NE3210=m
+CONFIG_ES3210=m
+CONFIG_8139CP=m
+CONFIG_8139TOO=m
+# CONFIG_8139TOO_PIO is not set
+# CONFIG_8139TOO_TUNE_TWISTER is not set
+CONFIG_8139TOO_8129=y
+# CONFIG_8139_NEW_RX_RESET is not set
+CONFIG_SIS900=m
+CONFIG_SIS900_OLD=m
+CONFIG_EPIC100=m
+CONFIG_SUNDANCE=m
+CONFIG_TLAN=m
+CONFIG_VIA_RHINE=m
+# CONFIG_VIA_RHINE_MMIO is not set
+CONFIG_WINBOND_840=m
+CONFIG_NET_POCKET=y
+CONFIG_ATP=m
+CONFIG_DE600=m
+CONFIG_DE620=m
+
+#
+# Ethernet (1000 Mbit)
+#
+CONFIG_ACENIC=m
+# CONFIG_ACENIC_OMIT_TIGON_I is not set
+CONFIG_DL2K=m
+# CONFIG_MYRI_SBUS is not set
+CONFIG_NS83820=m
+CONFIG_HAMACHI=m
+CONFIG_YELLOWFIN=m
+CONFIG_SK98LIN=m
+CONFIG_NET_BROADCOM=m
+CONFIG_TIGON3=m
+CONFIG_NET_E1000=m
+CONFIG_FDDI=y
+CONFIG_DEFXX=m
+CONFIG_SKFP=m
+CONFIG_NETCONSOLE=m
+# CONFIG_HIPPI is not set
+CONFIG_PLIP=m
+CONFIG_PPP=m
+CONFIG_PPP_MULTILINK=y
+CONFIG_PPP_FILTER=y
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+CONFIG_PPP_DEFLATE=m
+# CONFIG_PPP_BSDCOMP is not set
+# CONFIG_PPPOE is not set
+CONFIG_PPPOATM=m
+CONFIG_SLIP=m
+CONFIG_SLIP_COMPRESSED=y
+CONFIG_SLIP_SMART=y
+CONFIG_SLIP_MODE_SLIP6=y
+
+#
+# Wireless LAN (non-hamradio)
+#
+CONFIG_NET_RADIO=y
+CONFIG_STRIP=m
+CONFIG_WAVELAN=m
+CONFIG_ARLAN=m
+CONFIG_AIRONET4500=m
+CONFIG_AIRONET4500_NONCS=m
+CONFIG_AIRONET4500_PNP=y
+CONFIG_AIRONET4500_PCI=y
+CONFIG_AIRONET4500_ISA=y
+CONFIG_AIRONET4500_I365=y
+CONFIG_AIRONET4500_PROC=m
+CONFIG_AIRO=m
+CONFIG_HERMES=m
+CONFIG_PLX_HERMES=m
+CONFIG_PCI_HERMES=m
+CONFIG_PCMCIA_HERMES=m
+CONFIG_AIRO_CS=m
+CONFIG_NET_WIRELESS=y
+CONFIG_PCMCIA_HERMES_OLD=m
+
+#
+# Token Ring devices
+#
+CONFIG_TR=y
+CONFIG_IBMTR=m
+CONFIG_IBMOL=m
+CONFIG_IBMLS=m
+CONFIG_3C359=m
+CONFIG_TMS380TR=m
+CONFIG_TMSPCI=m
+CONFIG_TMSISA=m
+CONFIG_ABYSS=m
+# CONFIG_MADGEMC is not set
+CONFIG_SMCTR=m
+CONFIG_NET_FC=y
+CONFIG_IPHASE5526=m
+CONFIG_RCPCI=m
+CONFIG_SHAPER=m
+
+#
+# Wan interfaces
+#
+CONFIG_WAN=y
+CONFIG_HOSTESS_SV11=m
+CONFIG_COSA=m
+# CONFIG_COMX is not set
+# CONFIG_DSCC4 is not set
+CONFIG_FARSYNC=m
+# CONFIG_LANMEDIA is not set
+CONFIG_ATI_XX20=m
+CONFIG_SEALEVEL_4021=m
+# CONFIG_SYNCLINK_SYNCPPP is not set
+# CONFIG_HDLC is not set
+CONFIG_DLCI=m
+CONFIG_DLCI_COUNT=24
+CONFIG_DLCI_MAX=8
+CONFIG_SDLA=m
+CONFIG_WAN_ROUTER_DRIVERS=y
+CONFIG_VENDOR_SANGOMA=m
+CONFIG_WANPIPE_CHDLC=y
+CONFIG_WANPIPE_FR=y
+CONFIG_WANPIPE_X25=y
+CONFIG_WANPIPE_PPP=y
+CONFIG_WANPIPE_MULTPPP=y
+CONFIG_CYCLADES_SYNC=m
+CONFIG_CYCLOMX_X25=y
+# CONFIG_LAPBETHER is not set
+# CONFIG_X25_ASY is not set
+CONFIG_SBNI=m
+CONFIG_SBNI_MULTILINE=y
+
+#
+# PCMCIA network device support
+#
+CONFIG_NET_PCMCIA=y
+CONFIG_PCMCIA_3C589=m
+CONFIG_PCMCIA_3C574=m
+CONFIG_PCMCIA_FMVJ18X=m
+CONFIG_PCMCIA_PCNET=m
+CONFIG_PCMCIA_AXNET=m
+CONFIG_PCMCIA_NMCLAN=m
+CONFIG_PCMCIA_SMC91C92=m
+CONFIG_PCMCIA_XIRC2PS=m
+# CONFIG_ARCNET_COM20020_CS is not set
+CONFIG_PCMCIA_IBMTR=m
+CONFIG_PCMCIA_XIRCOM=m
+CONFIG_PCMCIA_XIRTULIP=m
+CONFIG_NET_PCMCIA_RADIO=y
+CONFIG_PCMCIA_RAYCS=m
+CONFIG_PCMCIA_NETWAVE=m
+CONFIG_PCMCIA_WAVELAN=m
+CONFIG_PCMCIA_WVLAN=m
+CONFIG_AIRONET4500_CS=m
+
+#
+# Quadrics Supercomputers
+#
+
+#
+# QsNet
+#
+CONFIG_QUADRICS=y
+CONFIG_QSNETMOD=m
+CONFIG_ELAN3MOD=m
+CONFIG_EPMOD=m
+CONFIG_EIPMOD=m
+CONFIG_RMSMOD=m
+CONFIG_JTAG=m
+
+#
+# QsNet II
+#
+
+#
+# ATM drivers
+#
+CONFIG_ATM_TCP=m
+CONFIG_ATM_LANAI=m
+CONFIG_ATM_ENI=m
+# CONFIG_ATM_ENI_DEBUG is not set
+# CONFIG_ATM_ENI_TUNE_BURST is not set
+CONFIG_ATM_FIRESTREAM=m
+CONFIG_ATM_ZATM=m
+# CONFIG_ATM_ZATM_DEBUG is not set
+CONFIG_ATM_ZATM_EXACT_TS=y
+CONFIG_ATM_NICSTAR=m
+CONFIG_ATM_NICSTAR_USE_SUNI=y
+CONFIG_ATM_NICSTAR_USE_IDT77105=y
+CONFIG_ATM_IDT77252=m
+# CONFIG_ATM_IDT77252_DEBUG is not set
+# CONFIG_ATM_IDT77252_RCV_ALL is not set
+CONFIG_ATM_IDT77252_USE_SUNI=y
+CONFIG_ATM_AMBASSADOR=m
+# CONFIG_ATM_AMBASSADOR_DEBUG is not set
+CONFIG_ATM_HORIZON=m
+# CONFIG_ATM_HORIZON_DEBUG is not set
+CONFIG_ATM_IA=m
+# CONFIG_ATM_IA_DEBUG is not set
+CONFIG_ATM_FORE200E_MAYBE=m
+CONFIG_ATM_FORE200E_PCA=y
+CONFIG_ATM_FORE200E_PCA_DEFAULT_FW=y
+CONFIG_ATM_FORE200E_TX_RETRY=16
+CONFIG_ATM_FORE200E_DEBUG=0
+CONFIG_ATM_FORE200E=m
+
+#
+# Amateur Radio support
+#
+CONFIG_HAMRADIO=y
+CONFIG_AX25=m
+# CONFIG_AX25_DAMA_SLAVE is not set
+CONFIG_NETROM=m
+CONFIG_ROSE=m
+
+#
+# AX.25 network device drivers
+#
+# CONFIG_MKISS is not set
+# CONFIG_6PACK is not set
+# CONFIG_BPQETHER is not set
+# CONFIG_DMASCC is not set
+# CONFIG_SCC is not set
+# CONFIG_BAYCOM_SER_FDX is not set
+# CONFIG_BAYCOM_SER_HDX is not set
+# CONFIG_BAYCOM_PAR is not set
+# CONFIG_BAYCOM_EPP is not set
+CONFIG_SOUNDMODEM=m
+CONFIG_SOUNDMODEM_SBC=y
+CONFIG_SOUNDMODEM_WSS=y
+CONFIG_SOUNDMODEM_AFSK1200=y
+CONFIG_SOUNDMODEM_AFSK2400_7=y
+CONFIG_SOUNDMODEM_AFSK2400_8=y
+CONFIG_SOUNDMODEM_AFSK2666=y
+CONFIG_SOUNDMODEM_HAPN4800=y
+CONFIG_SOUNDMODEM_PSK4800=y
+CONFIG_SOUNDMODEM_FSK9600=y
+# CONFIG_YAM is not set
+
+#
+# IrDA (infrared) support
+#
+CONFIG_IRDA=m
+CONFIG_IRLAN=m
+CONFIG_IRNET=m
+CONFIG_IRCOMM=m
+CONFIG_IRDA_ULTRA=y
+CONFIG_IRDA_CACHE_LAST_LSAP=y
+CONFIG_IRDA_FAST_RR=y
+# CONFIG_IRDA_DEBUG is not set
+
+#
+# Infrared-port device drivers
+#
+CONFIG_IRTTY_SIR=m
+CONFIG_IRPORT_SIR=m
+CONFIG_DONGLE=y
+CONFIG_ESI_DONGLE=m
+CONFIG_ACTISYS_DONGLE=m
+CONFIG_TEKRAM_DONGLE=m
+CONFIG_GIRBIL_DONGLE=m
+CONFIG_LITELINK_DONGLE=m
+CONFIG_OLD_BELKIN_DONGLE=m
+CONFIG_USB_IRDA=m
+CONFIG_NSC_FIR=m
+CONFIG_WINBOND_FIR=m
+CONFIG_TOSHIBA_FIR=m
+CONFIG_SMC_IRCC_FIR=m
+CONFIG_ALI_FIR=m
+CONFIG_VLSI_FIR=m
+
+#
+# ISDN subsystem
+#
+CONFIG_ISDN=m
+CONFIG_ISDN_BOOL=y
+CONFIG_ISDN_PPP=y
+CONFIG_ISDN_PPP_VJ=y
+CONFIG_ISDN_MPP=y
+CONFIG_ISDN_PPP_BSDCOMP=m
+CONFIG_ISDN_AUDIO=y
+CONFIG_ISDN_TTY_FAX=y
+
+#
+# ISDN feature submodules
+#
+CONFIG_ISDN_DRV_LOOP=m
+# CONFIG_ISDN_DIVERSION is not set
+
+#
+# Passive ISDN cards
+#
+CONFIG_ISDN_DRV_HISAX=m
+CONFIG_ISDN_HISAX=y
+CONFIG_HISAX_EURO=y
+CONFIG_DE_AOC=y
+# CONFIG_HISAX_NO_SENDCOMPLETE is not set
+# CONFIG_HISAX_NO_LLC is not set
+# CONFIG_HISAX_NO_KEYPAD is not set
+CONFIG_HISAX_1TR6=y
+CONFIG_HISAX_NI1=y
+CONFIG_HISAX_MAX_CARDS=8
+CONFIG_HISAX_16_0=y
+CONFIG_HISAX_16_3=y
+CONFIG_HISAX_TELESPCI=y
+CONFIG_HISAX_S0BOX=y
+CONFIG_HISAX_AVM_A1=y
+CONFIG_HISAX_FRITZPCI=y
+CONFIG_HISAX_AVM_A1_PCMCIA=y
+CONFIG_HISAX_ELSA=y
+CONFIG_HISAX_IX1MICROR2=y
+CONFIG_HISAX_DIEHLDIVA=y
+CONFIG_HISAX_ASUSCOM=y
+CONFIG_HISAX_TELEINT=y
+CONFIG_HISAX_HFCS=y
+CONFIG_HISAX_SEDLBAUER=y
+CONFIG_HISAX_SPORTSTER=y
+CONFIG_HISAX_MIC=y
+CONFIG_HISAX_NETJET=y
+CONFIG_HISAX_NETJET_U=y
+CONFIG_HISAX_NICCY=y
+CONFIG_HISAX_ISURF=y
+CONFIG_HISAX_HSTSAPHIR=y
+CONFIG_HISAX_BKM_A4T=y
+CONFIG_HISAX_SCT_QUADRO=y
+CONFIG_HISAX_GAZEL=y
+CONFIG_HISAX_HFC_PCI=y
+CONFIG_HISAX_W6692=y
+CONFIG_HISAX_HFC_SX=y
+CONFIG_HISAX_DEBUG=y
+CONFIG_HISAX_SEDLBAUER_CS=m
+CONFIG_HISAX_ELSA_CS=m
+CONFIG_HISAX_AVM_A1_CS=m
+CONFIG_HISAX_ST5481=m
+CONFIG_HISAX_FRITZ_PCIPNP=m
+
+#
+# Active ISDN cards
+#
+CONFIG_ISDN_DRV_ICN=m
+CONFIG_ISDN_DRV_PCBIT=m
+# CONFIG_ISDN_DRV_SC is not set
+# CONFIG_ISDN_DRV_ACT2000 is not set
+CONFIG_ISDN_DRV_EICON=y
+CONFIG_ISDN_DRV_EICON_DIVAS=m
+# CONFIG_ISDN_DRV_EICON_OLD is not set
+CONFIG_ISDN_DRV_TPAM=m
+CONFIG_ISDN_CAPI=m
+CONFIG_ISDN_DRV_AVMB1_VERBOSE_REASON=y
+CONFIG_ISDN_CAPI_MIDDLEWARE=y
+CONFIG_ISDN_CAPI_CAPI20=m
+CONFIG_ISDN_CAPI_CAPIFS_BOOL=y
+CONFIG_ISDN_CAPI_CAPIFS=m
+CONFIG_ISDN_CAPI_CAPIDRV=m
+CONFIG_ISDN_DRV_AVMB1_B1ISA=m
+CONFIG_ISDN_DRV_AVMB1_B1PCI=m
+CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y
+CONFIG_ISDN_DRV_AVMB1_T1ISA=m
+CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m
+CONFIG_ISDN_DRV_AVMB1_AVM_CS=m
+CONFIG_ISDN_DRV_AVMB1_T1PCI=m
+CONFIG_ISDN_DRV_AVMB1_C4=m
+CONFIG_HYSDN=m
+CONFIG_HYSDN_CAPI=y
+CONFIG_KALLSYMS=y
+
+#
+# Old CD-ROM drivers (not SCSI, not IDE)
+#
+# CONFIG_CD_NO_IDESCSI is not set
+
+#
+# Input core support
+#
+CONFIG_INPUT=m
+CONFIG_INPUT_KEYBDEV=m
+CONFIG_INPUT_MOUSEDEV=m
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+CONFIG_INPUT_JOYDEV=m
+CONFIG_INPUT_EVDEV=m
+
+#
+# Character devices
+#
+CONFIG_ECC=m
+# CONFIG_CHAOSTEST is not set
+# CONFIG_P4THERM is not set
+CONFIG_VT=y
+CONFIG_VT_CONSOLE=y
+CONFIG_SERIAL=y
+CONFIG_SERIAL_CONSOLE=y
+CONFIG_SERIAL_EXTENDED=y
+CONFIG_SERIAL_MANY_PORTS=y
+CONFIG_SERIAL_SHARE_IRQ=y
+# CONFIG_SERIAL_DETECT_IRQ is not set
+CONFIG_SERIAL_MULTIPORT=y
+# CONFIG_HUB6 is not set
+CONFIG_SERIAL_NONSTANDARD=y
+CONFIG_COMPUTONE=m
+CONFIG_ROCKETPORT=m
+CONFIG_CYCLADES=m
+# CONFIG_CYZ_INTR is not set
+CONFIG_DIGIEPCA=m
+CONFIG_ESPSERIAL=m
+CONFIG_MOXA_INTELLIO=m
+CONFIG_MOXA_SMARTIO=m
+CONFIG_ISI=m
+CONFIG_SYNCLINK=m
+CONFIG_N_HDLC=m
+CONFIG_RISCOM8=m
+CONFIG_SPECIALIX=m
+CONFIG_SPECIALIX_RTSCTS=y
+CONFIG_SX=m
+# CONFIG_RIO is not set
+CONFIG_STALDRV=y
+CONFIG_STALLION=m
+CONFIG_ISTALLION=m
+CONFIG_UNIX98_PTYS=y
+CONFIG_UNIX98_PTY_COUNT=512
+CONFIG_PRINTER=m
+CONFIG_LP_CONSOLE=y
+CONFIG_PPDEV=m
+
+#
+# I2C support
+#
+CONFIG_I2C=m
+CONFIG_I2C_ALGOBIT=m
+CONFIG_I2C_PHILIPSPAR=m
+CONFIG_I2C_ELV=m
+CONFIG_I2C_VELLEMAN=m
+CONFIG_I2C_ALGOPCF=m
+CONFIG_I2C_ELEKTOR=m
+CONFIG_I2C_MAINBOARD=y
+CONFIG_I2C_ALI1535=m
+CONFIG_I2C_ALI15X3=m
+CONFIG_I2C_HYDRA=m
+CONFIG_I2C_AMD756=m
+# CONFIG_I2C_TSUNAMI is not set
+CONFIG_I2C_I801=m
+CONFIG_I2C_I810=m
+CONFIG_I2C_PIIX4=m
+CONFIG_I2C_SIS5595=m
+CONFIG_I2C_VIA=m
+CONFIG_I2C_VIAPRO=m
+CONFIG_I2C_VOODOO3=m
+CONFIG_I2C_ISA=m
+CONFIG_I2C_CHARDEV=m
+CONFIG_I2C_PROC=m
+
+#
+# Hardware sensors support
+#
+CONFIG_SENSORS=y
+CONFIG_SENSORS_ADM1021=m
+CONFIG_SENSORS_ADM1024=m
+CONFIG_SENSORS_ADM1025=m
+CONFIG_SENSORS_ADM9240=m
+CONFIG_SENSORS_DS1621=m
+CONFIG_SENSORS_FSCPOS=m
+CONFIG_SENSORS_FSCSCY=m
+CONFIG_SENSORS_GL518SM=m
+CONFIG_SENSORS_GL520SM=m
+CONFIG_SENSORS_MAXILIFE=m
+CONFIG_SENSORS_IT87=m
+CONFIG_SENSORS_MTP008=m
+CONFIG_SENSORS_LM75=m
+CONFIG_SENSORS_LM78=m
+CONFIG_SENSORS_LM80=m
+CONFIG_SENSORS_LM87=m
+CONFIG_SENSORS_SIS5595=m
+CONFIG_SENSORS_THMC50=m
+CONFIG_SENSORS_VIA686A=m
+CONFIG_SENSORS_W83781D=m
+CONFIG_SENSORS_OTHER=y
+CONFIG_SENSORS_BT869=m
+CONFIG_SENSORS_DDCMON=m
+CONFIG_SENSORS_EEPROM=m
+CONFIG_SENSORS_MATORB=m
+
+#
+# Mice
+#
+CONFIG_BUSMOUSE=m
+CONFIG_ATIXL_BUSMOUSE=m
+CONFIG_LOGIBUSMOUSE=m
+CONFIG_MS_BUSMOUSE=m
+CONFIG_MOUSE=y
+CONFIG_PSMOUSE=y
+CONFIG_82C710_MOUSE=m
+CONFIG_PC110_PAD=m
+CONFIG_MK712_MOUSE=m
+
+#
+# Joysticks
+#
+CONFIG_INPUT_GAMEPORT=m
+CONFIG_INPUT_NS558=m
+CONFIG_INPUT_LIGHTNING=m
+CONFIG_INPUT_PCIGAME=m
+CONFIG_INPUT_CS461X=m
+CONFIG_INPUT_EMU10K1=m
+CONFIG_INPUT_SERIO=m
+CONFIG_INPUT_SERPORT=m
+CONFIG_INPUT_ANALOG=m
+CONFIG_INPUT_A3D=m
+CONFIG_INPUT_ADI=m
+CONFIG_INPUT_COBRA=m
+CONFIG_INPUT_GF2K=m
+CONFIG_INPUT_GRIP=m
+CONFIG_INPUT_INTERACT=m
+CONFIG_INPUT_TMDC=m
+CONFIG_INPUT_SIDEWINDER=m
+CONFIG_INPUT_IFORCE_USB=m
+CONFIG_INPUT_IFORCE_232=m
+CONFIG_INPUT_WARRIOR=m
+CONFIG_INPUT_MAGELLAN=m
+CONFIG_INPUT_SPACEORB=m
+CONFIG_INPUT_SPACEBALL=m
+CONFIG_INPUT_STINGER=m
+CONFIG_INPUT_DB9=m
+CONFIG_INPUT_GAMECON=m
+CONFIG_INPUT_TURBOGRAFX=m
+# CONFIG_QIC02_TAPE is not set
+
+#
+# Watchdog Cards
+#
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+CONFIG_ACQUIRE_WDT=m
+CONFIG_ADVANTECH_WDT=m
+CONFIG_ALIM7101_WDT=m
+CONFIG_SC520_WDT=m
+CONFIG_PCWATCHDOG=m
+CONFIG_EUROTECH_WDT=m
+CONFIG_IB700_WDT=m
+CONFIG_WAFER_WDT=m
+CONFIG_I810_TCO=m
+# CONFIG_MIXCOMWD is not set
+# CONFIG_60XX_WDT is not set
+CONFIG_SC1200_WDT=m
+CONFIG_SOFT_WATCHDOG=m
+CONFIG_W83877F_WDT=m
+CONFIG_WDT=m
+CONFIG_WDTPCI=m
+# CONFIG_WDT_501 is not set
+CONFIG_MACHZ_WDT=m
+CONFIG_AMD7XX_TCO=m
+CONFIG_AMD_RNG=m
+CONFIG_INTEL_RNG=m
+CONFIG_AMD_PM768=m
+CONFIG_NVRAM=m
+CONFIG_RTC=y
+CONFIG_DTLK=m
+CONFIG_R3964=m
+# CONFIG_APPLICOM is not set
+CONFIG_SONYPI=m
+
+#
+# Ftape, the floppy tape device driver
+#
+CONFIG_FTAPE=m
+CONFIG_ZFTAPE=m
+CONFIG_ZFT_DFLT_BLK_SZ=10240
+CONFIG_ZFT_COMPRESSOR=m
+CONFIG_FT_NR_BUFFERS=3
+# CONFIG_FT_PROC_FS is not set
+CONFIG_FT_NORMAL_DEBUG=y
+# CONFIG_FT_FULL_DEBUG is not set
+# CONFIG_FT_NO_TRACE is not set
+# CONFIG_FT_NO_TRACE_AT_ALL is not set
+CONFIG_FT_STD_FDC=y
+# CONFIG_FT_MACH2 is not set
+# CONFIG_FT_PROBE_FC10 is not set
+# CONFIG_FT_ALT_FDC is not set
+CONFIG_FT_FDC_THR=8
+CONFIG_FT_FDC_MAX_RATE=2000
+CONFIG_FT_ALPHA_CLOCK=0
+CONFIG_AGP=m
+CONFIG_AGP_INTEL=y
+CONFIG_AGP_I810=y
+CONFIG_AGP_VIA=y
+CONFIG_AGP_AMD=y
+CONFIG_AGP_SIS=y
+CONFIG_AGP_ALI=y
+CONFIG_AGP_SWORKS=y
+# CONFIG_DRM is not set
+
+#
+# PCMCIA character devices
+#
+CONFIG_PCMCIA_SERIAL_CS=m
+CONFIG_SYNCLINK_CS=m
+CONFIG_MWAVE=m
+CONFIG_BATTERY_GERICOM=m
+
+#
+# Multimedia devices
+#
+CONFIG_VIDEO_DEV=m
+
+#
+# Video For Linux
+#
+CONFIG_VIDEO_PROC_FS=y
+CONFIG_I2C_PARPORT=m
+CONFIG_VIDEO_BT848=m
+# CONFIG_VIDEO_LS220 is not set
+# CONFIG_VIDEO_MARGI is not set
+CONFIG_VIDEO_PMS=m
+CONFIG_VIDEO_BWQCAM=m
+CONFIG_VIDEO_CQCAM=m
+CONFIG_VIDEO_W9966=m
+CONFIG_VIDEO_CPIA=m
+CONFIG_VIDEO_CPIA_PP=m
+CONFIG_VIDEO_CPIA_USB=m
+CONFIG_VIDEO_SAA5249=m
+CONFIG_TUNER_3036=m
+CONFIG_VIDEO_STRADIS=m
+CONFIG_VIDEO_ZORAN=m
+CONFIG_VIDEO_ZORAN_BUZ=m
+CONFIG_VIDEO_ZORAN_DC10=m
+CONFIG_VIDEO_ZORAN_LML33=m
+CONFIG_VIDEO_ZR36120=m
+CONFIG_VIDEO_MEYE=m
+
+#
+# Radio Adapters
+#
+CONFIG_RADIO_CADET=m
+CONFIG_RADIO_RTRACK=m
+CONFIG_RADIO_RTRACK2=m
+CONFIG_RADIO_AZTECH=m
+CONFIG_RADIO_GEMTEK=m
+CONFIG_RADIO_GEMTEK_PCI=m
+CONFIG_RADIO_MAXIRADIO=m
+CONFIG_RADIO_MAESTRO=m
+CONFIG_RADIO_MIROPCM20=m
+CONFIG_RADIO_MIROPCM20_RDS=m
+CONFIG_RADIO_SF16FMI=m
+CONFIG_RADIO_TERRATEC=m
+CONFIG_RADIO_TRUST=m
+CONFIG_RADIO_TYPHOON=m
+CONFIG_RADIO_TYPHOON_PROC_FS=y
+CONFIG_RADIO_ZOLTRIX=m
+
+#
+# Crypto Hardware support
+#
+CONFIG_CRYPTO=m
+CONFIG_CRYPTO_BROADCOM=m
+
+#
+# File systems
+#
+CONFIG_QUOTA=y
+CONFIG_AUTOFS_FS=m
+CONFIG_AUTOFS4_FS=m
+CONFIG_REISERFS_FS=m
+# CONFIG_REISERFS_CHECK is not set
+CONFIG_REISERFS_PROC_INFO=y
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFS_FS is not set
+# CONFIG_ADFS_FS_RW is not set
+# CONFIG_AFFS_FS is not set
+CONFIG_HFS_FS=m
+CONFIG_BEFS_FS=m
+# CONFIG_BEFS_DEBUG is not set
+CONFIG_BFS_FS=m
+CONFIG_EXT3_FS=m
+CONFIG_JBD=m
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FAT_FS=m
+CONFIG_MSDOS_FS=m
+CONFIG_UMSDOS_FS=m
+CONFIG_VFAT_FS=m
+# CONFIG_EFS_FS is not set
+# CONFIG_JFFS_FS is not set
+# CONFIG_JFFS2_FS is not set
+CONFIG_CRAMFS=m
+CONFIG_TMPFS=y
+CONFIG_RAMFS=y
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_JFS_FS=m
+CONFIG_JFS_DEBUG=y
+# CONFIG_JFS_STATISTICS is not set
+CONFIG_MINIX_FS=m
+CONFIG_VXFS_FS=m
+# CONFIG_NTFS_FS is not set
+# CONFIG_NTFS_RW is not set
+# CONFIG_HPFS_FS is not set
+CONFIG_PROC_FS=y
+# CONFIG_DEVFS_FS is not set
+# CONFIG_DEVFS_MOUNT is not set
+# CONFIG_DEVFS_DEBUG is not set
+CONFIG_DEVPTS_FS=y
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_QNX4FS_RW is not set
+CONFIG_ROMFS_FS=m
+CONFIG_EXT2_FS=y
+CONFIG_SYSV_FS=m
+CONFIG_UDF_FS=m
+CONFIG_UDF_RW=y
+CONFIG_UFS_FS=m
+# CONFIG_UFS_FS_WRITE is not set
+
+#
+# Network File Systems
+#
+CONFIG_CODA_FS=m
+CONFIG_INTERMEZZO_FS=m
+CONFIG_NFS_FS=m
+CONFIG_NFS_V3=y
+# CONFIG_ROOT_NFS is not set
+CONFIG_NFSD=m
+CONFIG_NFSD_V3=y
+# CONFIG_NFSD_TCP is not set
+CONFIG_SUNRPC=m
+CONFIG_LOCKD=m
+CONFIG_LOCKD_V4=y
+CONFIG_SMB_FS=m
+# CONFIG_SMB_NLS_DEFAULT is not set
+CONFIG_NCP_FS=m
+CONFIG_NCPFS_PACKET_SIGNING=y
+CONFIG_NCPFS_IOCTL_LOCKING=y
+CONFIG_NCPFS_STRONG=y
+CONFIG_NCPFS_NFS_NS=y
+CONFIG_NCPFS_OS2_NS=y
+CONFIG_NCPFS_SMALLDOS=y
+CONFIG_NCPFS_NLS=y
+CONFIG_NCPFS_EXTRAS=y
+# CONFIG_PFS_FS is not set
+CONFIG_ZISOFS_FS=y
+
+#
+# Partition Types
+#
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_ACORN_PARTITION is not set
+CONFIG_OSF_PARTITION=y
+# CONFIG_AMIGA_PARTITION is not set
+# CONFIG_ATARI_PARTITION is not set
+CONFIG_MAC_PARTITION=y
+CONFIG_MSDOS_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+# CONFIG_LDM_PARTITION is not set
+CONFIG_SGI_PARTITION=y
+# CONFIG_ULTRIX_PARTITION is not set
+CONFIG_SUN_PARTITION=y
+CONFIG_SMB_NLS=y
+CONFIG_NLS=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_737=m
+CONFIG_NLS_CODEPAGE_775=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_CODEPAGE_852=m
+CONFIG_NLS_CODEPAGE_855=m
+CONFIG_NLS_CODEPAGE_857=m
+CONFIG_NLS_CODEPAGE_860=m
+CONFIG_NLS_CODEPAGE_861=m
+CONFIG_NLS_CODEPAGE_862=m
+CONFIG_NLS_CODEPAGE_863=m
+CONFIG_NLS_CODEPAGE_864=m
+CONFIG_NLS_CODEPAGE_865=m
+CONFIG_NLS_CODEPAGE_866=m
+CONFIG_NLS_CODEPAGE_869=m
+CONFIG_NLS_CODEPAGE_936=m
+CONFIG_NLS_CODEPAGE_950=m
+CONFIG_NLS_CODEPAGE_932=m
+CONFIG_NLS_CODEPAGE_949=m
+CONFIG_NLS_CODEPAGE_874=m
+CONFIG_NLS_ISO8859_8=m
+CONFIG_NLS_CODEPAGE_1250=m
+CONFIG_NLS_CODEPAGE_1251=m
+CONFIG_NLS_ISO8859_1=m
+CONFIG_NLS_ISO8859_2=m
+CONFIG_NLS_ISO8859_3=m
+CONFIG_NLS_ISO8859_4=m
+CONFIG_NLS_ISO8859_5=m
+CONFIG_NLS_ISO8859_6=m
+CONFIG_NLS_ISO8859_7=m
+CONFIG_NLS_ISO8859_9=m
+CONFIG_NLS_ISO8859_13=m
+CONFIG_NLS_ISO8859_14=m
+CONFIG_NLS_ISO8859_15=m
+CONFIG_NLS_KOI8_R=m
+CONFIG_NLS_KOI8_U=m
+CONFIG_NLS_UTF8=m
+
+#
+# Console drivers
+#
+CONFIG_VGA_CONSOLE=y
+CONFIG_VIDEO_SELECT=y
+# CONFIG_VIDEO_IGNORE_BAD_MODE is not set
+CONFIG_MDA_CONSOLE=m
+
+#
+# Frame-buffer support
+#
+CONFIG_FB=y
+CONFIG_DUMMY_CONSOLE=y
+CONFIG_FB_RIVA=m
+CONFIG_FB_CLGEN=m
+CONFIG_FB_PM2=m
+# CONFIG_FB_PM2_FIFO_DISCONNECT is not set
+CONFIG_FB_PM2_PCI=y
+CONFIG_FB_PM3=m
+# CONFIG_FB_CYBER2000 is not set
+CONFIG_FB_VESA=y
+# CONFIG_FB_VGA16 is not set
+CONFIG_FB_HGA=m
+CONFIG_VIDEO_SELECT=y
+CONFIG_FB_MATROX=m
+CONFIG_FB_MATROX_MILLENIUM=y
+CONFIG_FB_MATROX_MYSTIQUE=y
+CONFIG_FB_MATROX_G100=y
+CONFIG_FB_MATROX_I2C=m
+CONFIG_FB_MATROX_MAVEN=m
+# CONFIG_FB_MATROX_G450 is not set
+# CONFIG_FB_MATROX_PROC is not set
+CONFIG_FB_MATROX_MULTIHEAD=y
+CONFIG_FB_ATY=m
+CONFIG_FB_ATY_GX=y
+CONFIG_FB_ATY_CT=y
+CONFIG_FB_RADEON=m
+CONFIG_FB_ATY128=m
+CONFIG_FB_SIS=m
+CONFIG_FB_SIS_300=y
+CONFIG_FB_SIS_315=y
+CONFIG_FB_NEOMAGIC=m
+CONFIG_FB_3DFX=m
+CONFIG_FB_VOODOO1=m
+# CONFIG_FB_TRIDENT is not set
+# CONFIG_FB_VIRTUAL is not set
+# CONFIG_FBCON_ADVANCED is not set
+CONFIG_FBCON_MFB=m
+CONFIG_FBCON_CFB8=y
+CONFIG_FBCON_CFB16=y
+CONFIG_FBCON_CFB24=y
+CONFIG_FBCON_CFB32=y
+CONFIG_FBCON_HGA=m
+# CONFIG_FBCON_FONTWIDTH8_ONLY is not set
+# CONFIG_FBCON_FONTS is not set
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+CONFIG_SPEAKUP=y
+CONFIG_SPEAKUP_ACNTSA=y
+CONFIG_SPEAKUP_ACNTPC=y
+CONFIG_SPEAKUP_APOLO=y
+CONFIG_SPEAKUP_AUDPTR=y
+CONFIG_SPEAKUP_BNS=y
+CONFIG_SPEAKUP_DECTLK=y
+CONFIG_SPEAKUP_DECEXT=y
+CONFIG_SPEAKUP_DTLK=y
+CONFIG_SPEAKUP_LTLK=y
+CONFIG_SPEAKUP_SPKOUT=y
+CONFIG_SPEAKUP_TXPRT=y
+CONFIG_SPEAKUP_DEFAULT="none"
+# CONFIG_SPEAKUP_KEYMAP is not set
+
+#
+# Sound
+#
+CONFIG_SOUND=m
+CONFIG_SOUND_BT878=m
+CONFIG_SOUND_CMPCI=m
+CONFIG_SOUND_CMPCI_FM=y
+CONFIG_SOUND_CMPCI_FMIO=388
+CONFIG_SOUND_CMPCI_FMIO=388
+CONFIG_SOUND_CMPCI_MIDI=y
+CONFIG_SOUND_CMPCI_MPUIO=330
+CONFIG_SOUND_CMPCI_JOYSTICK=y
+CONFIG_SOUND_CMPCI_CM8738=y
+# CONFIG_SOUND_CMPCI_SPDIFINVERSE is not set
+CONFIG_SOUND_CMPCI_SPDIFLOOP=y
+CONFIG_SOUND_CMPCI_SPEAKERS=2
+CONFIG_SOUND_EMU10K1=m
+CONFIG_MIDI_EMU10K1=y
+CONFIG_SOUND_AUDIGY=m
+CONFIG_SOUND_FUSION=m
+CONFIG_SOUND_CS4281=m
+CONFIG_SOUND_ES1370=m
+CONFIG_SOUND_ES1371=m
+CONFIG_SOUND_ESSSOLO1=m
+CONFIG_SOUND_MAESTRO=m
+CONFIG_SOUND_MAESTRO3=m
+CONFIG_SOUND_ICH=m
+CONFIG_SOUND_RME96XX=m
+CONFIG_SOUND_SONICVIBES=m
+CONFIG_SOUND_TRIDENT=m
+CONFIG_SOUND_MSNDCLAS=m
+# CONFIG_MSNDCLAS_HAVE_BOOT is not set
+CONFIG_MSNDCLAS_INIT_FILE="/etc/sound/msndinit.bin"
+CONFIG_MSNDCLAS_PERM_FILE="/etc/sound/msndperm.bin"
+CONFIG_SOUND_MSNDPIN=m
+# CONFIG_MSNDPIN_HAVE_BOOT is not set
+CONFIG_MSNDPIN_INIT_FILE="/etc/sound/pndspini.bin"
+CONFIG_MSNDPIN_PERM_FILE="/etc/sound/pndsperm.bin"
+CONFIG_SOUND_VIA82CXXX=m
+CONFIG_MIDI_VIA82CXXX=y
+CONFIG_SOUND_OSS=m
+# CONFIG_SOUND_TRACEINIT is not set
+CONFIG_SOUND_DMAP=y
+CONFIG_SOUND_AD1816=m
+CONFIG_SOUND_SGALAXY=m
+CONFIG_SOUND_ADLIB=m
+CONFIG_SOUND_ACI_MIXER=m
+CONFIG_SOUND_CS4232=m
+CONFIG_SOUND_SSCAPE=m
+CONFIG_SOUND_GUS=m
+CONFIG_SOUND_GUS16=y
+CONFIG_SOUND_GUSMAX=y
+CONFIG_SOUND_VMIDI=m
+CONFIG_SOUND_TRIX=m
+CONFIG_SOUND_MSS=m
+CONFIG_SOUND_MPU401=m
+CONFIG_SOUND_NM256=m
+CONFIG_SOUND_MAD16=m
+CONFIG_MAD16_OLDCARD=y
+CONFIG_SOUND_PAS=m
+# CONFIG_PAS_JOYSTICK is not set
+CONFIG_SOUND_PSS=m
+# CONFIG_PSS_MIXER is not set
+# CONFIG_PSS_HAVE_BOOT is not set
+CONFIG_SOUND_SB=m
+CONFIG_SOUND_AWE32_SYNTH=m
+CONFIG_SOUND_WAVEFRONT=m
+CONFIG_SOUND_MAUI=m
+CONFIG_SOUND_YM3812=m
+CONFIG_SOUND_OPL3SA1=m
+CONFIG_SOUND_OPL3SA2=m
+CONFIG_SOUND_YMFPCI=m
+CONFIG_SOUND_YMFPCI_LEGACY=y
+CONFIG_SOUND_UART6850=m
+CONFIG_SOUND_AEDSP16=m
+CONFIG_SC6600=y
+CONFIG_SC6600_JOY=y
+CONFIG_SC6600_CDROM=4
+CONFIG_SC6600_CDROMBASE=0
+CONFIG_AEDSP16_SBPRO=y
+CONFIG_AEDSP16_MPU401=y
+CONFIG_SOUND_TVMIXER=m
+
+#
+# USB support
+#
+CONFIG_USB=m
+# CONFIG_USB_DEBUG is not set
+CONFIG_USB_DEVICEFS=y
+# CONFIG_USB_BANDWIDTH is not set
+CONFIG_USB_LONG_TIMEOUT=y
+CONFIG_USB_EHCI_HCD=m
+CONFIG_USB_UHCI=m
+CONFIG_USB_UHCI_ALT=m
+CONFIG_USB_OHCI=m
+CONFIG_USB_AUDIO=m
+# CONFIG_USB_EMI26 is not set
+CONFIG_USB_BLUETOOTH=m
+CONFIG_USB_STORAGE=m
+# CONFIG_USB_STORAGE_DEBUG is not set
+CONFIG_USB_STORAGE_DATAFAB=y
+CONFIG_USB_STORAGE_FREECOM=y
+CONFIG_USB_STORAGE_ISD200=y
+CONFIG_USB_STORAGE_DPCM=y
+CONFIG_USB_STORAGE_HP8200e=y
+CONFIG_USB_STORAGE_SDDR09=y
+CONFIG_USB_STORAGE_JUMPSHOT=y
+CONFIG_USB_ACM=m
+CONFIG_USB_PRINTER=m
+CONFIG_USB_HID=m
+CONFIG_USB_HIDINPUT=y
+CONFIG_USB_HIDDEV=y
+# CONFIG_USB_KBD is not set
+# CONFIG_USB_MOUSE is not set
+CONFIG_USB_WACOM=m
+# CONFIG_USB_DC2XX is not set
+CONFIG_USB_MDC800=m
+CONFIG_USB_SCANNER=m
+CONFIG_USB_MICROTEK=m
+CONFIG_USB_HPUSBSCSI=m
+CONFIG_USB_IBMCAM=m
+CONFIG_USB_OV511=m
+CONFIG_USB_PWC=m
+CONFIG_USB_SE401=m
+CONFIG_USB_STV680=m
+CONFIG_USB_VICAM=m
+CONFIG_USB_DSBR=m
+CONFIG_USB_DABUSB=m
+CONFIG_USB_PEGASUS=m
+CONFIG_USB_RTL8150=m
+CONFIG_USB_KAWETH=m
+CONFIG_USB_CATC=m
+CONFIG_USB_CDCETHER=m
+CONFIG_USB_USBNET=m
+CONFIG_USB_USS720=m
+
+#
+# USB Serial Converter support
+#
+CONFIG_USB_SERIAL=m
+CONFIG_USB_SERIAL_GENERIC=y
+CONFIG_USB_SERIAL_BELKIN=m
+CONFIG_USB_SERIAL_WHITEHEAT=m
+CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m
+CONFIG_USB_SERIAL_EMPEG=m
+CONFIG_USB_SERIAL_FTDI_SIO=m
+CONFIG_USB_SERIAL_VISOR=m
+CONFIG_USB_SERIAL_IPAQ=m
+CONFIG_USB_SERIAL_IR=m
+CONFIG_USB_SERIAL_EDGEPORT=m
+CONFIG_USB_SERIAL_KEYSPAN_PDA=m
+CONFIG_USB_SERIAL_KEYSPAN=m
+# CONFIG_USB_SERIAL_KEYSPAN_USA28 is not set
+# CONFIG_USB_SERIAL_KEYSPAN_USA28X is not set
+CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y
+# CONFIG_USB_SERIAL_KEYSPAN_USA19 is not set
+# CONFIG_USB_SERIAL_KEYSPAN_USA18X is not set
+# CONFIG_USB_SERIAL_KEYSPAN_USA19W is not set
+CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49W=y
+CONFIG_USB_SERIAL_MCT_U232=m
+CONFIG_USB_SERIAL_KLSI=m
+CONFIG_USB_SERIAL_PL2303=m
+CONFIG_USB_SERIAL_CYBERJACK=m
+CONFIG_USB_SERIAL_XIRCOM=m
+CONFIG_USB_SERIAL_OMNINET=m
+CONFIG_USB_RIO500=m
+CONFIG_USB_AUERSWALD=m
+CONFIG_USB_BRLVGER=m
+CONFIG_USB_USBLCD=m
+
+#
+# Bluetooth support
+#
+CONFIG_BLUEZ=m
+CONFIG_BLUEZ_L2CAP=m
+CONFIG_BLUEZ_SCO=m
+
+#
+# Bluetooth device drivers
+#
+CONFIG_BLUEZ_HCIUSB=m
+CONFIG_BLUEZ_USB_FW_LOAD=y
+CONFIG_BLUEZ_USB_ZERO_PACKET=y
+CONFIG_BLUEZ_HCIUART=m
+CONFIG_BLUEZ_HCIUART_H4=y
+CONFIG_BLUEZ_HCIDTL1=m
+CONFIG_BLUEZ_HCIVHCI=m
+
+#
+# Kernel hacking
+#
+CONFIG_DEBUG_KERNEL=y
+# CONFIG_FRAME_POINTER is not set
+# CONFIG_STACK_TRACE_SCAN is not set
+CONFIG_STACK_TRACE_PARAM_COUNT=4
+# CONFIG_DEBUG_HIGHMEM is not set
+# CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_IOVIRT is not set
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_MCL_COREDUMP is not set
+# CONFIG_OPROFILE is not set
+
+#
+# Library routines
+#
+CONFIG_ZLIB_INFLATE=y
+CONFIG_ZLIB_DEFLATE=m
diff --git a/lustre/kernel_patches/kernel_configs/config-linux-2.4.18-p4smp-61chaos b/lustre/kernel_patches/kernel_configs/config-linux-2.4.18-p4smp-61chaos
new file mode 100644 (file)
index 0000000..0de1146
--- /dev/null
@@ -0,0 +1,1035 @@
+#
+# Automatically generated by make menuconfig: don't edit
+#
+CONFIG_X86=y
+CONFIG_ISA=y
+# CONFIG_SBUS is not set
+CONFIG_UID16=y
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODVERSIONS=y
+CONFIG_KMOD=y
+
+#
+# Processor type and features
+#
+CONFIG_LOLAT=y
+# CONFIG_LOLAT_SYSCTL is not set
+# CONFIG_M386 is not set
+# CONFIG_M486 is not set
+# CONFIG_M586 is not set
+# CONFIG_M586TSC is not set
+# CONFIG_M586MMX is not set
+# CONFIG_M686 is not set
+# CONFIG_MPENTIUMIII is not set
+CONFIG_MPENTIUM4=y
+# CONFIG_MK6 is not set
+# CONFIG_MK7 is not set
+# CONFIG_MELAN is not set
+# CONFIG_MCRUSOE is not set
+# CONFIG_MWINCHIPC6 is not set
+# CONFIG_MWINCHIP2 is not set
+# CONFIG_MWINCHIP3D is not set
+# CONFIG_MCYRIXIII is not set
+CONFIG_X86_WP_WORKS_OK=y
+CONFIG_X86_INVLPG=y
+CONFIG_X86_CMPXCHG=y
+CONFIG_X86_XADD=y
+CONFIG_X86_BSWAP=y
+CONFIG_X86_POPAD_OK=y
+# CONFIG_RWSEM_GENERIC_SPINLOCK is not set
+CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_TSC=y
+CONFIG_X86_GOOD_APIC=y
+CONFIG_X86_PGE=y
+CONFIG_X86_USE_PPRO_CHECKSUM=y
+CONFIG_X86_MCE=y
+# CONFIG_CPU_FREQ is not set
+# CONFIG_TOSHIBA is not set
+# CONFIG_I8K is not set
+CONFIG_MICROCODE=m
+CONFIG_X86_MSR=m
+CONFIG_X86_CPUID=m
+# CONFIG_E820_PROC is not set
+# CONFIG_NOHIGHMEM is not set
+CONFIG_HIGHMEM4G=y
+# CONFIG_HIGHMEM64G is not set
+CONFIG_HIGHMEM=y
+CONFIG_HIGHIO=y
+# CONFIG_MATH_EMULATION is not set
+CONFIG_MTRR=y
+CONFIG_SMP=y
+# CONFIG_MULTIQUAD is not set
+CONFIG_HAVE_DEC_LOCK=y
+
+#
+# General setup
+#
+CONFIG_HZ=100
+CONFIG_NET=y
+CONFIG_X86_IO_APIC=y
+CONFIG_X86_LOCAL_APIC=y
+CONFIG_PCI=y
+# CONFIG_PCI_GOBIOS is not set
+# CONFIG_PCI_GODIRECT is not set
+CONFIG_PCI_GOANY=y
+CONFIG_PCI_BIOS=y
+CONFIG_PCI_DIRECT=y
+CONFIG_PCI_NAMES=y
+
+#
+# Performance-monitoring counters support
+#
+CONFIG_PERFCTR=m
+CONFIG_KPERFCTR=y
+# CONFIG_PERFCTR_DEBUG is not set
+# CONFIG_PERFCTR_INIT_TESTS is not set
+CONFIG_PERFCTR_VIRTUAL=y
+CONFIG_PERFCTR_GLOBAL=y
+# CONFIG_EISA is not set
+# CONFIG_MCA is not set
+# CONFIG_HOTPLUG is not set
+# CONFIG_PCMCIA is not set
+# CONFIG_HOTPLUG_PCI is not set
+CONFIG_SYSVIPC=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_SYSCTL=y
+CONFIG_KCORE_ELF=y
+# CONFIG_KCORE_AOUT is not set
+CONFIG_BINFMT_AOUT=m
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=m
+# CONFIG_IKCONFIG is not set
+CONFIG_PM=y
+
+#
+# Additional device driver support
+#
+# CONFIG_CIPE is not set
+# CONFIG_CRYPTO_AEP is not set
+# CONFIG_MEGARAC is not set
+CONFIG_FC_QLA2200=m
+CONFIG_FC_QLA2300=m
+# CONFIG_SCSI_ISCSI is not set
+CONFIG_IBMASM=m
+CONFIG_IBMSER=m
+# CONFIG_ACPI is not set
+CONFIG_APM=y
+CONFIG_APM_IGNORE_USER_SUSPEND=y
+# CONFIG_APM_DO_ENABLE is not set
+# CONFIG_APM_CPU_IDLE is not set
+# CONFIG_APM_DISPLAY_BLANK is not set
+CONFIG_APM_RTC_IS_GMT=y
+# CONFIG_APM_ALLOW_INTS is not set
+# CONFIG_APM_REAL_MODE_POWER_OFF is not set
+
+#
+# Binary emulation of other systems
+#
+# CONFIG_ABI is not set
+# CONFIG_ABI_SVR4 is not set
+# CONFIG_BINFMT_COFF is not set
+# CONFIG_BINFMT_XOUT is not set
+# CONFIG_BINFMT_XOUT_X286 is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+CONFIG_MTD=y
+# CONFIG_MTD_DEBUG is not set
+# CONFIG_MTD_PARTITIONS is not set
+# CONFIG_MTD_CONCAT is not set
+# CONFIG_MTD_REDBOOT_PARTS is not set
+# CONFIG_MTD_CMDLINE_PARTS is not set
+CONFIG_MTD_CHAR=m
+# CONFIG_MTD_BLOCK is not set
+# CONFIG_MTD_BLOCK_RO is not set
+# CONFIG_FTL is not set
+# CONFIG_NFTL is not set
+
+#
+# RAM/ROM/Flash chip drivers
+#
+# CONFIG_MTD_CFI is not set
+CONFIG_MTD_JEDECPROBE=y
+CONFIG_MTD_GEN_PROBE=y
+CONFIG_MTD_CFI_ADV_OPTIONS=y
+CONFIG_MTD_CFI_NOSWAP=y
+# CONFIG_MTD_CFI_BE_BYTE_SWAP is not set
+# CONFIG_MTD_CFI_LE_BYTE_SWAP is not set
+CONFIG_MTD_CFI_GEOMETRY=y
+CONFIG_MTD_CFI_B1=y
+# CONFIG_MTD_CFI_B2 is not set
+# CONFIG_MTD_CFI_B4 is not set
+# CONFIG_MTD_CFI_B8 is not set
+CONFIG_MTD_CFI_I1=y
+# CONFIG_MTD_CFI_I2 is not set
+# CONFIG_MTD_CFI_I4 is not set
+# CONFIG_MTD_CFI_I8 is not set
+CONFIG_MTD_CFI_INTELEXT=y
+CONFIG_MTD_CFI_AMDSTD=y
+# CONFIG_MTD_RAM is not set
+CONFIG_MTD_ROM=y
+# CONFIG_MTD_ABSENT is not set
+# CONFIG_MTD_OBSOLETE_CHIPS is not set
+# CONFIG_MTD_AMDSTD is not set
+# CONFIG_MTD_SHARP is not set
+# CONFIG_MTD_JEDEC is not set
+
+#
+# Mapping drivers for chip access
+#
+# CONFIG_MTD_PHYSMAP is not set
+# CONFIG_MTD_PNC2000 is not set
+# CONFIG_MTD_SC520CDP is not set
+# CONFIG_MTD_NETSC520 is not set
+# CONFIG_MTD_SBC_GXX is not set
+# CONFIG_MTD_ELAN_104NC is not set
+# CONFIG_MTD_DILNETPC is not set
+# CONFIG_MTD_MIXMEM is not set
+# CONFIG_MTD_OCTAGON is not set
+# CONFIG_MTD_VMAX is not set
+# CONFIG_MTD_L440GX is not set
+# CONFIG_MTD_AMD766ROM is not set
+CONFIG_MTD_ICH2ROM=m
+# CONFIG_MTD_PCI is not set
+
+#
+# Self-contained MTD device drivers
+#
+# CONFIG_MTD_PMC551 is not set
+# CONFIG_MTD_SLRAM is not set
+# CONFIG_MTD_MTDRAM is not set
+# CONFIG_MTD_BLKMTD is not set
+# CONFIG_MTD_DOC1000 is not set
+# CONFIG_MTD_DOC2000 is not set
+# CONFIG_MTD_DOC2001 is not set
+# CONFIG_MTD_DOCPROBE is not set
+
+#
+# NAND Flash Device Drivers
+#
+# CONFIG_MTD_NAND is not set
+
+#
+# Parallel port support
+#
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_PARPORT_PC_CML1=m
+CONFIG_PARPORT_SERIAL=m
+# CONFIG_PARPORT_PC_FIFO is not set
+# CONFIG_PARPORT_PC_SUPERIO is not set
+# CONFIG_PARPORT_AMIGA is not set
+# CONFIG_PARPORT_MFC3 is not set
+# CONFIG_PARPORT_ATARI is not set
+# CONFIG_PARPORT_GSC is not set
+# CONFIG_PARPORT_SUNBPP is not set
+# CONFIG_PARPORT_OTHER is not set
+CONFIG_PARPORT_1284=y
+
+#
+# Plug and Play configuration
+#
+CONFIG_PNP=y
+CONFIG_ISAPNP=y
+# CONFIG_PNPBIOS is not set
+
+#
+# Block devices
+#
+CONFIG_BLK_DEV_FD=y
+# CONFIG_BLK_DEV_XD is not set
+# CONFIG_PARIDE is not set
+# CONFIG_BLK_CPQ_DA is not set
+# CONFIG_BLK_CPQ_CISS_DA is not set
+# CONFIG_CISS_SCSI_TAPE is not set
+# CONFIG_BLK_DEV_DAC960 is not set
+# CONFIG_BLK_DEV_UMEM is not set
+CONFIG_BLK_DEV_LOOP=m
+CONFIG_BLK_DEV_NBD=m
+CONFIG_BLK_DEV_ENBD=m
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=4096
+CONFIG_BLK_DEV_INITRD=y
+
+#
+# Multi-device support (RAID and LVM)
+#
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+# CONFIG_MD_LINEAR is not set
+# CONFIG_MD_RAID0 is not set
+# CONFIG_MD_RAID1 is not set
+# CONFIG_MD_RAID5 is not set
+# CONFIG_MD_MULTIPATH is not set
+CONFIG_BLK_DEV_LVM=m
+
+#
+# Cryptography support (CryptoAPI)
+#
+# CONFIG_CRYPTO is not set
+# CONFIG_CIPHERS is not set
+# CONFIG_CRYPTODEV is not set
+# CONFIG_CRYPTOLOOP is not set
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_NETLINK_DEV=y
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_DEBUG is not set
+CONFIG_FILTER=y
+CONFIG_UNIX=y
+CONFIG_INET=y
+# CONFIG_TUX is not set
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_FWMARK=y
+CONFIG_IP_ROUTE_NAT=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_TOS=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_ROUTE_LARGE_TABLES=y
+# CONFIG_IP_PNP is not set
+# CONFIG_NET_IPIP is not set
+# CONFIG_NET_IPGRE is not set
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+# CONFIG_ARPD is not set
+# CONFIG_INET_ECN is not set
+CONFIG_SYN_COOKIES=y
+
+#
+#   IP: Netfilter Configuration
+#
+CONFIG_IP_NF_CONNTRACK=m
+# CONFIG_IP_NF_FTP is not set
+# CONFIG_IP_NF_IRC is not set
+# CONFIG_IP_NF_QUEUE is not set
+# CONFIG_IP_NF_IPTABLES is not set
+# CONFIG_IP_NF_ARPTABLES is not set
+# CONFIG_IP_NF_COMPAT_IPCHAINS is not set
+# CONFIG_IP_NF_COMPAT_IPFWADM is not set
+
+#
+#   IP: Virtual Server Configuration
+#
+# CONFIG_IP_VS is not set
+# CONFIG_IPV6 is not set
+CONFIG_KHTTPD=m
+# CONFIG_ATM is not set
+# CONFIG_VLAN_8021Q is not set
+# CONFIG_IPX is not set
+# CONFIG_ATALK is not set
+
+#
+# Appletalk devices
+#
+# CONFIG_DEV_APPLETALK is not set
+# CONFIG_DECNET is not set
+# CONFIG_BRIDGE is not set
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_LLC is not set
+# CONFIG_NET_DIVERT is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+# CONFIG_NET_FASTROUTE is not set
+# CONFIG_NET_HW_FLOWCONTROL is not set
+
+#
+# QoS and/or fair queueing
+#
+# CONFIG_NET_SCHED is not set
+
+#
+# Network testing
+#
+CONFIG_NET_PKTGEN=m
+
+#
+# Telephony Support
+#
+# CONFIG_PHONE is not set
+# CONFIG_PHONE_IXJ is not set
+# CONFIG_PHONE_IXJ_PCMCIA is not set
+
+#
+# ATA/IDE/MFM/RLL support
+#
+CONFIG_IDE=y
+
+#
+# IDE, ATA and ATAPI Block devices
+#
+CONFIG_BLK_DEV_IDE=y
+# CONFIG_BLK_DEV_HD_IDE is not set
+# CONFIG_BLK_DEV_HD is not set
+CONFIG_BLK_DEV_IDEDISK=y
+CONFIG_IDEDISK_MULTI_MODE=y
+# CONFIG_IDEDISK_STROKE is not set
+# CONFIG_BLK_DEV_IDEDISK_VENDOR is not set
+# CONFIG_BLK_DEV_IDEDISK_FUJITSU is not set
+# CONFIG_BLK_DEV_IDEDISK_IBM is not set
+# CONFIG_BLK_DEV_IDEDISK_MAXTOR is not set
+# CONFIG_BLK_DEV_IDEDISK_QUANTUM is not set
+# CONFIG_BLK_DEV_IDEDISK_SEAGATE is not set
+# CONFIG_BLK_DEV_IDEDISK_WD is not set
+# CONFIG_BLK_DEV_COMMERIAL is not set
+# CONFIG_BLK_DEV_TIVO is not set
+# CONFIG_BLK_DEV_IDECS is not set
+CONFIG_BLK_DEV_IDECD=m
+# CONFIG_BLK_DEV_IDETAPE is not set
+CONFIG_BLK_DEV_IDEFLOPPY=y
+# CONFIG_BLK_DEV_IDESCSI is not set
+# CONFIG_IDE_TASK_IOCTL is not set
+# CONFIG_BLK_DEV_CMD640 is not set
+# CONFIG_BLK_DEV_CMD640_ENHANCED is not set
+CONFIG_BLK_DEV_ISAPNP=y
+# CONFIG_BLK_DEV_RZ1000 is not set
+CONFIG_BLK_DEV_IDEPCI=y
+CONFIG_IDEPCI_SHARE_IRQ=y
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+# CONFIG_BLK_DEV_OFFBOARD is not set
+# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
+CONFIG_IDEDMA_PCI_AUTO=y
+# CONFIG_IDEDMA_ONLYDISK is not set
+CONFIG_BLK_DEV_IDEDMA=y
+# CONFIG_IDEDMA_PCI_WIP is not set
+# CONFIG_BLK_DEV_IDEDMA_TIMEOUT is not set
+# CONFIG_IDEDMA_NEW_DRIVE_LISTINGS is not set
+CONFIG_BLK_DEV_ADMA=y
+CONFIG_BLK_DEV_AEC62XX=y
+CONFIG_AEC62XX_TUNING=y
+CONFIG_BLK_DEV_ALI15X3=y
+# CONFIG_WDC_ALI15X3 is not set
+CONFIG_BLK_DEV_AMD74XX=y
+# CONFIG_AMD74XX_OVERRIDE is not set
+CONFIG_BLK_DEV_CMD64X=y
+CONFIG_BLK_DEV_CMD680=y
+CONFIG_BLK_DEV_CY82C693=y
+CONFIG_BLK_DEV_CS5530=y
+CONFIG_BLK_DEV_HPT34X=y
+# CONFIG_HPT34X_AUTODMA is not set
+CONFIG_BLK_DEV_HPT366=y
+CONFIG_BLK_DEV_PIIX=y
+CONFIG_PIIX_TUNING=y
+# CONFIG_BLK_DEV_NS87415 is not set
+# CONFIG_BLK_DEV_OPTI621 is not set
+# CONFIG_BLK_DEV_ADMA100 is not set
+CONFIG_BLK_DEV_PDC202XX=y
+# CONFIG_PDC202XX_BURST is not set
+# CONFIG_PDC202XX_FORCE is not set
+CONFIG_BLK_DEV_SVWKS=y
+CONFIG_BLK_DEV_SIS5513=y
+CONFIG_BLK_DEV_SLC90E66=y
+# CONFIG_BLK_DEV_TRM290 is not set
+CONFIG_BLK_DEV_VIA82CXXX=y
+CONFIG_BLK_DEV_CENATEK=y
+# CONFIG_IDE_CHIPSETS is not set
+# CONFIG_BLK_DEV_ELEVATOR_NOOP is not set
+CONFIG_IDEDMA_AUTO=y
+# CONFIG_IDEDMA_IVB is not set
+# CONFIG_DMA_NONPCI is not set
+CONFIG_BLK_DEV_IDE_MODES=y
+# CONFIG_BLK_DEV_ATARAID is not set
+# CONFIG_BLK_DEV_ATARAID_PDC is not set
+# CONFIG_BLK_DEV_ATARAID_HPT is not set
+
+#
+# SCSI support
+#
+CONFIG_SCSI=y
+CONFIG_BLK_DEV_SD=y
+CONFIG_SD_EXTRA_DEVS=40
+# CONFIG_CHR_DEV_ST is not set
+# CONFIG_CHR_DEV_OSST is not set
+CONFIG_BLK_DEV_SR=m
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_SR_EXTRA_DEVS=4
+CONFIG_CHR_DEV_SG=m
+# CONFIG_SCSI_DEBUG_QUEUES is not set
+CONFIG_SCSI_MULTI_LUN=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_LOGGING=y
+
+#
+# SCSI low-level drivers
+#
+# CONFIG_BLK_DEV_3W_XXXX_RAID is not set
+# CONFIG_SCSI_7000FASST is not set
+# CONFIG_SCSI_ACARD is not set
+# CONFIG_SCSI_AHA152X is not set
+# CONFIG_SCSI_AHA1542 is not set
+# CONFIG_SCSI_AHA1740 is not set
+# CONFIG_SCSI_AACRAID is not set
+CONFIG_SCSI_AIC7XXX=y
+CONFIG_AIC7XXX_CMDS_PER_DEVICE=253
+CONFIG_AIC7XXX_RESET_DELAY_MS=15000
+# CONFIG_AIC7XXX_PROBE_EISA_VL is not set
+# CONFIG_AIC7XXX_BUILD_FIRMWARE is not set
+# CONFIG_SCSI_AIC79XX is not set
+# CONFIG_SCSI_DPT_I2O is not set
+# CONFIG_SCSI_ADVANSYS is not set
+# CONFIG_SCSI_IN2000 is not set
+# CONFIG_SCSI_AM53C974 is not set
+# CONFIG_SCSI_MEGARAID is not set
+# CONFIG_SCSI_BUSLOGIC is not set
+# CONFIG_SCSI_CPQFCTS is not set
+# CONFIG_SCSI_DMX3191D is not set
+# CONFIG_SCSI_DTC3280 is not set
+# CONFIG_SCSI_EATA is not set
+# CONFIG_SCSI_EATA_DMA is not set
+# CONFIG_SCSI_EATA_PIO is not set
+# CONFIG_SCSI_FUTURE_DOMAIN is not set
+# CONFIG_SCSI_GDTH is not set
+# CONFIG_SCSI_GENERIC_NCR5380 is not set
+# CONFIG_SCSI_IPS is not set
+# CONFIG_SCSI_INITIO is not set
+# CONFIG_SCSI_INIA100 is not set
+# CONFIG_SCSI_PPA is not set
+# CONFIG_SCSI_IMM is not set
+# CONFIG_SCSI_NCR53C406A is not set
+# CONFIG_SCSI_NCR53C7xx is not set
+# CONFIG_SCSI_SYM53C8XX_2 is not set
+# CONFIG_SCSI_NCR53C8XX is not set
+# CONFIG_SCSI_SYM53C8XX is not set
+# CONFIG_SCSI_PAS16 is not set
+# CONFIG_SCSI_PCI2000 is not set
+# CONFIG_SCSI_PCI2220I is not set
+# CONFIG_SCSI_PSI240I is not set
+# CONFIG_SCSI_QLOGIC_FAS is not set
+# CONFIG_SCSI_QLOGIC_ISP is not set
+# CONFIG_SCSI_QLOGIC_FC is not set
+# CONFIG_SCSI_QLOGIC_1280 is not set
+# CONFIG_SCSI_NEWISP is not set
+# CONFIG_SCSI_SEAGATE is not set
+# CONFIG_SCSI_SIM710 is not set
+# CONFIG_SCSI_SYM53C416 is not set
+# CONFIG_SCSI_DC390T is not set
+# CONFIG_SCSI_T128 is not set
+# CONFIG_SCSI_U14_34F is not set
+# CONFIG_SCSI_ULTRASTOR is not set
+CONFIG_SCSI_DEBUG=m
+
+#
+# Fusion MPT device support
+#
+CONFIG_FUSION=y
+CONFIG_FUSION_BOOT=y
+CONFIG_FUSION_ISENSE=m
+CONFIG_FUSION_CTL=m
+# CONFIG_FUSION_LAN is not set
+
+#
+# IEEE 1394 (FireWire) support (EXPERIMENTAL)
+#
+# CONFIG_IEEE1394 is not set
+
+#
+# I2O device support
+#
+# CONFIG_I2O is not set
+# CONFIG_I2O_PCI is not set
+# CONFIG_I2O_BLOCK is not set
+# CONFIG_I2O_LAN is not set
+# CONFIG_I2O_SCSI is not set
+# CONFIG_I2O_PROC is not set
+
+#
+# Network device support
+#
+CONFIG_NETDEVICES=y
+
+#
+# ARCnet devices
+#
+# CONFIG_ARCNET is not set
+CONFIG_DUMMY=m
+# CONFIG_BONDING is not set
+# CONFIG_EQUALIZER is not set
+# CONFIG_TUN is not set
+# CONFIG_ETHERTAP is not set
+# CONFIG_NET_SB1000 is not set
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+# CONFIG_SUNLANCE is not set
+# CONFIG_HAPPYMEAL is not set
+# CONFIG_SUNBMAC is not set
+# CONFIG_SUNQE is not set
+# CONFIG_SUNGEM is not set
+# CONFIG_NET_VENDOR_3COM is not set
+# CONFIG_LANCE is not set
+# CONFIG_NET_VENDOR_SMC is not set
+# CONFIG_NET_VENDOR_RACAL is not set
+# CONFIG_AT1700 is not set
+# CONFIG_DEPCA is not set
+# CONFIG_HP100 is not set
+# CONFIG_NET_ISA is not set
+CONFIG_NET_PCI=y
+# CONFIG_PCNET32 is not set
+# CONFIG_ADAPTEC_STARFIRE is not set
+# CONFIG_AC3200 is not set
+# CONFIG_APRICOT is not set
+# CONFIG_CS89x0 is not set
+CONFIG_TULIP=m
+# CONFIG_TC35815 is not set
+# CONFIG_TULIP_MWI is not set
+CONFIG_TULIP_MMIO=y
+CONFIG_DE4X5=m
+# CONFIG_DGRS is not set
+# CONFIG_DM9102 is not set
+CONFIG_EEPRO100=m
+CONFIG_NET_E100=m
+# CONFIG_LNE390 is not set
+# CONFIG_FEALNX is not set
+# CONFIG_NATSEMI is not set
+# CONFIG_NE2K_PCI is not set
+# CONFIG_NE3210 is not set
+# CONFIG_ES3210 is not set
+# CONFIG_8139CP is not set
+# CONFIG_8139TOO is not set
+# CONFIG_8139TOO_PIO is not set
+# CONFIG_8139TOO_TUNE_TWISTER is not set
+# CONFIG_8139TOO_8129 is not set
+# CONFIG_8139_NEW_RX_RESET is not set
+# CONFIG_SIS900 is not set
+# CONFIG_SIS900_OLD is not set
+# CONFIG_EPIC100 is not set
+# CONFIG_SUNDANCE is not set
+# CONFIG_TLAN is not set
+# CONFIG_VIA_RHINE is not set
+# CONFIG_VIA_RHINE_MMIO is not set
+# CONFIG_WINBOND_840 is not set
+# CONFIG_NET_POCKET is not set
+
+#
+# Ethernet (1000 Mbit)
+#
+CONFIG_ACENIC=m
+# CONFIG_ACENIC_OMIT_TIGON_I is not set
+# CONFIG_DL2K is not set
+# CONFIG_MYRI_SBUS is not set
+CONFIG_NS83820=m
+# CONFIG_HAMACHI is not set
+# CONFIG_YELLOWFIN is not set
+# CONFIG_SK98LIN is not set
+CONFIG_NET_BROADCOM=m
+CONFIG_TIGON3=m
+CONFIG_NET_E1000=m
+# CONFIG_FDDI is not set
+# CONFIG_NETCONSOLE is not set
+# CONFIG_HIPPI is not set
+# CONFIG_PLIP is not set
+# CONFIG_PPP is not set
+# CONFIG_SLIP is not set
+
+#
+# Wireless LAN (non-hamradio)
+#
+# CONFIG_NET_RADIO is not set
+
+#
+# Token Ring devices
+#
+# CONFIG_TR is not set
+# CONFIG_NET_FC is not set
+# CONFIG_RCPCI is not set
+# CONFIG_SHAPER is not set
+
+#
+# Wan interfaces
+#
+# CONFIG_WAN is not set
+
+#
+# Quadrics Supercomputers
+#
+
+#
+# QsNet
+#
+CONFIG_QUADRICS=y
+CONFIG_QSNETMOD=m
+CONFIG_ELAN3MOD=m
+CONFIG_EPMOD=m
+CONFIG_EIPMOD=m
+CONFIG_RMSMOD=m
+CONFIG_JTAG=m
+
+#
+# QsNet II
+#
+
+#
+# Amateur Radio support
+#
+# CONFIG_HAMRADIO is not set
+
+#
+# IrDA (infrared) support
+#
+# CONFIG_IRDA is not set
+
+#
+# ISDN subsystem
+#
+# CONFIG_ISDN is not set
+CONFIG_KALLSYMS=y
+
+#
+# Old CD-ROM drivers (not SCSI, not IDE)
+#
+# CONFIG_CD_NO_IDESCSI is not set
+
+#
+# Input core support
+#
+# CONFIG_INPUT is not set
+# CONFIG_INPUT_KEYBDEV is not set
+# CONFIG_INPUT_MOUSEDEV is not set
+# CONFIG_INPUT_JOYDEV is not set
+# CONFIG_INPUT_EVDEV is not set
+
+#
+# Character devices
+#
+CONFIG_ECC=m
+CONFIG_CHAOSTEST=m
+CONFIG_P4THERM=m
+CONFIG_VT=y
+CONFIG_VT_CONSOLE=y
+CONFIG_SERIAL=y
+CONFIG_SERIAL_CONSOLE=y
+CONFIG_SERIAL_EXTENDED=y
+# CONFIG_SERIAL_MANY_PORTS is not set
+CONFIG_SERIAL_SHARE_IRQ=y
+# CONFIG_SERIAL_DETECT_IRQ is not set
+# CONFIG_SERIAL_MULTIPORT is not set
+# CONFIG_HUB6 is not set
+# CONFIG_SERIAL_NONSTANDARD is not set
+CONFIG_UNIX98_PTYS=y
+CONFIG_UNIX98_PTY_COUNT=2048
+# CONFIG_PRINTER is not set
+# CONFIG_PPDEV is not set
+
+#
+# I2C support
+#
+CONFIG_I2C=y
+# CONFIG_I2C_ALGOBIT is not set
+# CONFIG_I2C_ALGOPCF is not set
+CONFIG_I2C_MAINBOARD=y
+# CONFIG_I2C_ALI1535 is not set
+# CONFIG_I2C_ALI15X3 is not set
+# CONFIG_I2C_HYDRA is not set
+# CONFIG_I2C_AMD756 is not set
+# CONFIG_I2C_TSUNAMI is not set
+CONFIG_I2C_I801=m
+# CONFIG_I2C_I810 is not set
+# CONFIG_I2C_PIIX4 is not set
+# CONFIG_I2C_SIS5595 is not set
+# CONFIG_I2C_VIA is not set
+# CONFIG_I2C_VIAPRO is not set
+# CONFIG_I2C_VOODOO3 is not set
+CONFIG_I2C_ISA=y
+CONFIG_I2C_CHARDEV=y
+CONFIG_I2C_PROC=y
+
+#
+# Hardware sensors support
+#
+CONFIG_SENSORS=y
+CONFIG_SENSORS_ADM1021=m
+# CONFIG_SENSORS_ADM1024 is not set
+# CONFIG_SENSORS_ADM1025 is not set
+# CONFIG_SENSORS_ADM9240 is not set
+# CONFIG_SENSORS_DS1621 is not set
+# CONFIG_SENSORS_FSCPOS is not set
+# CONFIG_SENSORS_FSCSCY is not set
+# CONFIG_SENSORS_GL518SM is not set
+# CONFIG_SENSORS_GL520SM is not set
+# CONFIG_SENSORS_MAXILIFE is not set
+# CONFIG_SENSORS_IT87 is not set
+# CONFIG_SENSORS_MTP008 is not set
+# CONFIG_SENSORS_LM75 is not set
+# CONFIG_SENSORS_LM78 is not set
+# CONFIG_SENSORS_LM80 is not set
+CONFIG_SENSORS_LM87=m
+# CONFIG_SENSORS_SIS5595 is not set
+# CONFIG_SENSORS_THMC50 is not set
+# CONFIG_SENSORS_VIA686A is not set
+CONFIG_SENSORS_W83781D=y
+# CONFIG_SENSORS_OTHER is not set
+
+#
+# Mice
+#
+# CONFIG_BUSMOUSE is not set
+CONFIG_MOUSE=y
+CONFIG_PSMOUSE=y
+# CONFIG_82C710_MOUSE is not set
+# CONFIG_PC110_PAD is not set
+# CONFIG_MK712_MOUSE is not set
+
+#
+# Joysticks
+#
+# CONFIG_INPUT_GAMEPORT is not set
+# CONFIG_QIC02_TAPE is not set
+
+#
+# Watchdog Cards
+#
+# CONFIG_WATCHDOG is not set
+# CONFIG_AMD_RNG is not set
+# CONFIG_INTEL_RNG is not set
+# CONFIG_AMD_PM768 is not set
+# CONFIG_NVRAM is not set
+CONFIG_RTC=y
+# CONFIG_DTLK is not set
+# CONFIG_R3964 is not set
+# CONFIG_APPLICOM is not set
+# CONFIG_SONYPI is not set
+
+#
+# Ftape, the floppy tape device driver
+#
+# CONFIG_FTAPE is not set
+CONFIG_AGP=m
+CONFIG_AGP_INTEL=y
+# CONFIG_AGP_I810 is not set
+# CONFIG_AGP_VIA is not set
+# CONFIG_AGP_AMD is not set
+# CONFIG_AGP_SIS is not set
+# CONFIG_AGP_ALI is not set
+# CONFIG_AGP_SWORKS is not set
+# CONFIG_DRM is not set
+# CONFIG_MWAVE is not set
+# CONFIG_BATTERY_GERICOM is not set
+
+#
+# Multimedia devices
+#
+# CONFIG_VIDEO_DEV is not set
+
+#
+# Crypto Hardware support
+#
+# CONFIG_CRYPTO is not set
+
+#
+# File systems
+#
+# CONFIG_QUOTA is not set
+# CONFIG_AUTOFS_FS is not set
+# CONFIG_AUTOFS4_FS is not set
+# CONFIG_REISERFS_FS is not set
+# CONFIG_REISERFS_CHECK is not set
+# CONFIG_REISERFS_PROC_INFO is not set
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFS_FS is not set
+# CONFIG_ADFS_FS_RW is not set
+# CONFIG_AFFS_FS is not set
+# CONFIG_HFS_FS is not set
+# CONFIG_BEFS_FS is not set
+# CONFIG_BEFS_DEBUG is not set
+# CONFIG_BFS_FS is not set
+CONFIG_EXT3_FS=y
+CONFIG_EXTN_FS=m
+CONFIG_JBD=y
+CONFIG_JBD_DEBUG=y
+CONFIG_FAT_FS=m
+CONFIG_MSDOS_FS=m
+CONFIG_UMSDOS_FS=m
+CONFIG_VFAT_FS=m
+# CONFIG_EFS_FS is not set
+# CONFIG_JFFS_FS is not set
+# CONFIG_JFFS2_FS is not set
+CONFIG_CRAMFS=y
+CONFIG_TMPFS=y
+CONFIG_RAMFS=y
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+# CONFIG_JFS_FS is not set
+# CONFIG_JFS_DEBUG is not set
+# CONFIG_JFS_STATISTICS is not set
+# CONFIG_MINIX_FS is not set
+# CONFIG_VXFS_FS is not set
+# CONFIG_NTFS_FS is not set
+# CONFIG_NTFS_RW is not set
+# CONFIG_HPFS_FS is not set
+CONFIG_PROC_FS=y
+# CONFIG_DEVFS_FS is not set
+# CONFIG_DEVFS_MOUNT is not set
+# CONFIG_DEVFS_DEBUG is not set
+CONFIG_DEVPTS_FS=y
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_QNX4FS_RW is not set
+CONFIG_ROMFS_FS=m
+CONFIG_EXT2_FS=y
+# CONFIG_SYSV_FS is not set
+# CONFIG_UDF_FS is not set
+# CONFIG_UDF_RW is not set
+# CONFIG_UFS_FS is not set
+# CONFIG_UFS_FS_WRITE is not set
+
+#
+# Network File Systems
+#
+# CONFIG_CODA_FS is not set
+# CONFIG_INTERMEZZO_FS is not set
+CONFIG_NFS_FS=m
+CONFIG_NFS_V3=y
+# CONFIG_ROOT_NFS is not set
+CONFIG_NFSD=m
+CONFIG_NFSD_V3=y
+CONFIG_NFSD_TCP=y
+CONFIG_SUNRPC=m
+CONFIG_LOCKD=m
+CONFIG_LOCKD_V4=y
+# CONFIG_SMB_FS is not set
+# CONFIG_NCP_FS is not set
+# CONFIG_NCPFS_PACKET_SIGNING is not set
+# CONFIG_NCPFS_IOCTL_LOCKING is not set
+# CONFIG_NCPFS_STRONG is not set
+# CONFIG_NCPFS_NFS_NS is not set
+# CONFIG_NCPFS_OS2_NS is not set
+# CONFIG_NCPFS_SMALLDOS is not set
+# CONFIG_NCPFS_NLS is not set
+# CONFIG_NCPFS_EXTRAS is not set
+# CONFIG_PFS_FS is not set
+CONFIG_ZISOFS_FS=y
+
+#
+# Partition Types
+#
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_ACORN_PARTITION is not set
+CONFIG_OSF_PARTITION=y
+# CONFIG_AMIGA_PARTITION is not set
+# CONFIG_ATARI_PARTITION is not set
+CONFIG_MAC_PARTITION=y
+CONFIG_MSDOS_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+# CONFIG_LDM_PARTITION is not set
+CONFIG_SGI_PARTITION=y
+# CONFIG_ULTRIX_PARTITION is not set
+CONFIG_SUN_PARTITION=y
+# CONFIG_SMB_NLS is not set
+CONFIG_NLS=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=m
+# CONFIG_NLS_CODEPAGE_737 is not set
+# CONFIG_NLS_CODEPAGE_775 is not set
+CONFIG_NLS_CODEPAGE_850=m
+# CONFIG_NLS_CODEPAGE_852 is not set
+# CONFIG_NLS_CODEPAGE_855 is not set
+# CONFIG_NLS_CODEPAGE_857 is not set
+# CONFIG_NLS_CODEPAGE_860 is not set
+# CONFIG_NLS_CODEPAGE_861 is not set
+# CONFIG_NLS_CODEPAGE_862 is not set
+# CONFIG_NLS_CODEPAGE_863 is not set
+# CONFIG_NLS_CODEPAGE_864 is not set
+# CONFIG_NLS_CODEPAGE_865 is not set
+# CONFIG_NLS_CODEPAGE_866 is not set
+# CONFIG_NLS_CODEPAGE_869 is not set
+# CONFIG_NLS_CODEPAGE_936 is not set
+# CONFIG_NLS_CODEPAGE_950 is not set
+# CONFIG_NLS_CODEPAGE_932 is not set
+# CONFIG_NLS_CODEPAGE_949 is not set
+# CONFIG_NLS_CODEPAGE_874 is not set
+# CONFIG_NLS_ISO8859_8 is not set
+# CONFIG_NLS_CODEPAGE_1250 is not set
+# CONFIG_NLS_CODEPAGE_1251 is not set
+CONFIG_NLS_ISO8859_1=m
+# CONFIG_NLS_ISO8859_2 is not set
+# CONFIG_NLS_ISO8859_3 is not set
+# CONFIG_NLS_ISO8859_4 is not set
+# CONFIG_NLS_ISO8859_5 is not set
+# CONFIG_NLS_ISO8859_6 is not set
+# CONFIG_NLS_ISO8859_7 is not set
+# CONFIG_NLS_ISO8859_9 is not set
+# CONFIG_NLS_ISO8859_13 is not set
+# CONFIG_NLS_ISO8859_14 is not set
+# CONFIG_NLS_ISO8859_15 is not set
+# CONFIG_NLS_KOI8_R is not set
+# CONFIG_NLS_KOI8_U is not set
+# CONFIG_NLS_UTF8 is not set
+
+#
+# Console drivers
+#
+CONFIG_VGA_CONSOLE=y
+CONFIG_VIDEO_SELECT=y
+# CONFIG_VIDEO_IGNORE_BAD_MODE is not set
+# CONFIG_MDA_CONSOLE is not set
+
+#
+# Frame-buffer support
+#
+# CONFIG_FB is not set
+# CONFIG_SPEAKUP is not set
+
+#
+# Sound
+#
+# CONFIG_SOUND is not set
+
+#
+# USB support
+#
+# CONFIG_USB is not set
+
+#
+# Bluetooth support
+#
+# CONFIG_BLUEZ is not set
+
+#
+# Kernel hacking
+#
+CONFIG_DEBUG_KERNEL=y
+CONFIG_FRAME_POINTER=y
+CONFIG_STACK_TRACE_SCAN=y
+CONFIG_STACK_TRACE_FPTR=y
+CONFIG_STACK_TRACE_PARAM_COUNT=4
+# CONFIG_DEBUG_HIGHMEM is not set
+# CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_IOVIRT is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_SPINLOCK=y
+CONFIG_MCL_COREDUMP=y
+CONFIG_BOOTIMG=y
+# CONFIG_OPROFILE is not set
+
+#
+# Library routines
+#
+CONFIG_ZLIB_INFLATE=y
+CONFIG_ZLIB_DEFLATE=y
diff --git a/lustre/kernel_patches/kernel_configs/config-linux-2.4.18-uml b/lustre/kernel_patches/kernel_configs/config-linux-2.4.18-uml
new file mode 100644 (file)
index 0000000..bb79c22
--- /dev/null
@@ -0,0 +1,458 @@
+#
+# Automatically generated make config: don't edit
+#
+CONFIG_USERMODE=y
+# CONFIG_ISA is not set
+# CONFIG_SBUS is not set
+# CONFIG_PCI is not set
+CONFIG_UID16=y
+# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set
+CONFIG_RWSEM_GENERIC_SPINLOCK=y
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+
+#
+# General Setup
+#
+CONFIG_NET=y
+CONFIG_SYSVIPC=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_SYSCTL=y
+CONFIG_BINFMT_AOUT=y
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=y
+CONFIG_HOSTFS=y
+# CONFIG_HPPFS is not set
+CONFIG_MCONSOLE=y
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_HOST_2G_2G is not set
+# CONFIG_UML_SMP is not set
+# CONFIG_SMP is not set
+CONFIG_NEST_LEVEL=0
+CONFIG_KERNEL_HALF_GIGS=1
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_KMOD=y
+
+#
+# Character Devices
+#
+CONFIG_STDIO_CONSOLE=y
+CONFIG_SSL=y
+CONFIG_FD_CHAN=y
+# CONFIG_NULL_CHAN is not set
+CONFIG_PORT_CHAN=y
+CONFIG_PTY_CHAN=y
+CONFIG_TTY_CHAN=y
+CONFIG_XTERM_CHAN=y
+CONFIG_CON_ZERO_CHAN="fd:0,fd:1"
+CONFIG_CON_CHAN="xterm"
+CONFIG_SSL_CHAN="pty"
+CONFIG_UNIX98_PTYS=y
+CONFIG_UNIX98_PTY_COUNT=2048
+# CONFIG_WATCHDOG is not set
+# CONFIG_UML_SOUND is not set
+# CONFIG_SOUND is not set
+# CONFIG_HOSTAUDIO is not set
+# CONFIG_TTY_LOG is not set
+
+#
+# Block Devices
+#
+CONFIG_BLK_DEV_UBD=y
+# CONFIG_BLK_DEV_UBD_SYNC is not set
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_NBD=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=4096
+CONFIG_BLK_DEV_INITRD=y
+# CONFIG_MMAPPER is not set
+CONFIG_NETDEVICES=y
+
+#
+# Network Devices
+#
+CONFIG_UML_NET=y
+# CONFIG_UML_NET_ETHERTAP is not set
+CONFIG_UML_NET_TUNTAP=y
+CONFIG_UML_NET_SLIP=y
+CONFIG_UML_NET_DAEMON=y
+CONFIG_UML_NET_MCAST=y
+CONFIG_DUMMY=m
+CONFIG_BONDING=m
+CONFIG_EQUALIZER=m
+CONFIG_TUN=y
+CONFIG_PPP=y
+CONFIG_PPP_MULTILINK=y
+CONFIG_PPP_FILTER=y
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+CONFIG_PPP_DEFLATE=m
+# CONFIG_PPP_BSDCOMP is not set
+# CONFIG_PPPOE is not set
+CONFIG_SLIP=y
+CONFIG_SLIP_COMPRESSED=y
+CONFIG_SLIP_SMART=y
+CONFIG_SLIP_MODE_SLIP6=y
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_NETLINK_DEV=y
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_DEBUG is not set
+CONFIG_FILTER=y
+CONFIG_UNIX=y
+CONFIG_INET=y
+# CONFIG_TUX is not set
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_FWMARK=y
+CONFIG_IP_ROUTE_NAT=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_TOS=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_ROUTE_LARGE_TABLES=y
+# CONFIG_IP_PNP is not set
+CONFIG_NET_IPIP=y
+CONFIG_NET_IPGRE=y
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+# CONFIG_ARPD is not set
+# CONFIG_INET_ECN is not set
+CONFIG_SYN_COOKIES=y
+
+#
+#   IP: Netfilter Configuration
+#
+CONFIG_IP_NF_CONNTRACK=y
+CONFIG_IP_NF_FTP=m
+CONFIG_IP_NF_IRC=m
+CONFIG_IP_NF_QUEUE=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_LIMIT=m
+CONFIG_IP_NF_MATCH_MAC=m
+CONFIG_IP_NF_MATCH_MARK=m
+CONFIG_IP_NF_MATCH_MULTIPORT=m
+CONFIG_IP_NF_MATCH_TOS=m
+CONFIG_IP_NF_MATCH_AH_ESP=m
+CONFIG_IP_NF_MATCH_LENGTH=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_MATCH_TCPMSS=m
+CONFIG_IP_NF_MATCH_STATE=m
+CONFIG_IP_NF_MATCH_UNCLEAN=m
+CONFIG_IP_NF_MATCH_OWNER=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_TARGET_MIRROR=m
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_NF_NAT_LOCAL=y
+CONFIG_IP_NF_NAT_SNMP_BASIC=m
+CONFIG_IP_NF_NAT_IRC=m
+CONFIG_IP_NF_NAT_FTP=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_IP_NF_TARGET_TOS=m
+CONFIG_IP_NF_TARGET_MARK=m
+CONFIG_IP_NF_TARGET_LOG=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_IP_NF_TARGET_TCPMSS=m
+CONFIG_IP_NF_ARPTABLES=m
+CONFIG_IP_NF_ARPFILTER=m
+
+#
+#   IP: Virtual Server Configuration
+#
+CONFIG_IP_VS=y
+# CONFIG_IP_VS_DEBUG is not set
+CONFIG_IP_VS_TAB_BITS=16
+
+#
+# IPVS scheduler
+#
+CONFIG_IP_VS_RR=m
+CONFIG_IP_VS_WRR=m
+CONFIG_IP_VS_LC=m
+CONFIG_IP_VS_WLC=m
+CONFIG_IP_VS_LBLC=m
+CONFIG_IP_VS_LBLCR=m
+CONFIG_IP_VS_DH=m
+CONFIG_IP_VS_SH=m
+
+#
+# IPVS application helper
+#
+CONFIG_IP_VS_FTP=m
+CONFIG_IPV6=y
+
+#
+#   IPv6: Netfilter Configuration
+#
+# CONFIG_IP6_NF_QUEUE is not set
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_MATCH_LIMIT=m
+CONFIG_IP6_NF_MATCH_MAC=m
+CONFIG_IP6_NF_MATCH_MULTIPORT=m
+CONFIG_IP6_NF_MATCH_OWNER=m
+CONFIG_IP6_NF_MATCH_MARK=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_LOG=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_TARGET_MARK=m
+# CONFIG_KHTTPD is not set
+CONFIG_ATM=y
+CONFIG_ATM_CLIP=y
+# CONFIG_ATM_CLIP_NO_ICMP is not set
+CONFIG_ATM_LANE=y
+CONFIG_ATM_MPOA=y
+CONFIG_ATM_BR2684=m
+CONFIG_ATM_BR2684_IPFILTER=y
+CONFIG_VLAN_8021Q=m
+
+#
+#  
+#
+CONFIG_IPX=m
+# CONFIG_IPX_INTERN is not set
+CONFIG_ATALK=m
+
+#
+# Appletalk devices
+#
+# CONFIG_DEV_APPLETALK is not set
+CONFIG_DECNET=m
+CONFIG_DECNET_SIOCGIFCONF=y
+CONFIG_DECNET_ROUTER=y
+CONFIG_DECNET_ROUTE_FWMARK=y
+CONFIG_BRIDGE=m
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_LLC is not set
+CONFIG_NET_DIVERT=y
+# CONFIG_ECONET is not set
+CONFIG_WAN_ROUTER=y
+# CONFIG_NET_FASTROUTE is not set
+# CONFIG_NET_HW_FLOWCONTROL is not set
+
+#
+# QoS and/or fair queueing
+#
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_CSZ=m
+# CONFIG_NET_SCH_ATM is not set
+CONFIG_NET_SCH_PRIO=m
+CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_SCH_TEQL=m
+CONFIG_NET_SCH_TBF=m
+CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_DSMARK=m
+CONFIG_NET_SCH_INGRESS=m
+CONFIG_NET_QOS=y
+CONFIG_NET_ESTIMATOR=y
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_TCINDEX=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_ROUTE=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_RSVP6=m
+CONFIG_NET_CLS_POLICE=y
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+
+#
+# File systems
+#
+CONFIG_QUOTA=y
+CONFIG_AUTOFS_FS=m
+CONFIG_AUTOFS4_FS=m
+CONFIG_REISERFS_FS=y
+# CONFIG_REISERFS_CHECK is not set
+CONFIG_REISERFS_PROC_INFO=y
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFS_FS is not set
+# CONFIG_AFFS_FS is not set
+CONFIG_HFS_FS=m
+CONFIG_BEFS_FS=m
+# CONFIG_BEFS_DEBUG is not set
+CONFIG_BFS_FS=m
+CONFIG_EXT3_FS=y
+CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FAT_FS=y
+CONFIG_MSDOS_FS=m
+CONFIG_UMSDOS_FS=m
+CONFIG_VFAT_FS=y
+# CONFIG_EFS_FS is not set
+CONFIG_CRAMFS=m
+CONFIG_TMPFS=y
+CONFIG_RAMFS=y
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_JFS_FS=m
+CONFIG_JFS_DEBUG=y
+# CONFIG_JFS_STATISTICS is not set
+CONFIG_MINIX_FS=m
+CONFIG_VXFS_FS=m
+# CONFIG_NTFS_FS is not set
+# CONFIG_HPFS_FS is not set
+CONFIG_PROC_FS=y
+CONFIG_DEVFS_FS=y
+CONFIG_DEVFS_MOUNT=y
+# CONFIG_DEVFS_DEBUG is not set
+CONFIG_DEVPTS_FS=y
+# CONFIG_QNX4FS_FS is not set
+CONFIG_ROMFS_FS=m
+CONFIG_EXT2_FS=y
+CONFIG_SYSV_FS=m
+CONFIG_UDF_FS=m
+CONFIG_UDF_RW=y
+CONFIG_UFS_FS=m
+# CONFIG_UFS_FS_WRITE is not set
+
+#
+# Network File Systems
+#
+CONFIG_CODA_FS=m
+CONFIG_INTERMEZZO_FS=m
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+CONFIG_NFSD=m
+CONFIG_NFSD_V3=y
+# CONFIG_NFSD_TCP is not set
+CONFIG_SUNRPC=y
+CONFIG_LOCKD=y
+CONFIG_LOCKD_V4=y
+CONFIG_SMB_FS=m
+# CONFIG_SMB_NLS_DEFAULT is not set
+CONFIG_NCP_FS=m
+CONFIG_NCPFS_PACKET_SIGNING=y
+CONFIG_NCPFS_IOCTL_LOCKING=y
+CONFIG_NCPFS_STRONG=y
+CONFIG_NCPFS_NFS_NS=y
+CONFIG_NCPFS_OS2_NS=y
+CONFIG_NCPFS_SMALLDOS=y
+CONFIG_NCPFS_NLS=y
+CONFIG_NCPFS_EXTRAS=y
+CONFIG_ZISOFS_FS=y
+
+#
+# Partition Types
+#
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_ACORN_PARTITION is not set
+CONFIG_OSF_PARTITION=y
+# CONFIG_AMIGA_PARTITION is not set
+# CONFIG_ATARI_PARTITION is not set
+CONFIG_MAC_PARTITION=y
+CONFIG_MSDOS_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+# CONFIG_LDM_PARTITION is not set
+CONFIG_SGI_PARTITION=y
+# CONFIG_ULTRIX_PARTITION is not set
+CONFIG_SUN_PARTITION=y
+CONFIG_SMB_NLS=y
+CONFIG_NLS=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_737=m
+CONFIG_NLS_CODEPAGE_775=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_CODEPAGE_852=m
+CONFIG_NLS_CODEPAGE_855=m
+CONFIG_NLS_CODEPAGE_857=m
+CONFIG_NLS_CODEPAGE_860=m
+CONFIG_NLS_CODEPAGE_861=m
+CONFIG_NLS_CODEPAGE_862=m
+CONFIG_NLS_CODEPAGE_863=m
+CONFIG_NLS_CODEPAGE_864=m
+CONFIG_NLS_CODEPAGE_865=m
+CONFIG_NLS_CODEPAGE_866=m
+CONFIG_NLS_CODEPAGE_869=m
+CONFIG_NLS_CODEPAGE_936=m
+CONFIG_NLS_CODEPAGE_950=m
+CONFIG_NLS_CODEPAGE_932=m
+CONFIG_NLS_CODEPAGE_949=m
+CONFIG_NLS_CODEPAGE_874=m
+CONFIG_NLS_ISO8859_8=m
+CONFIG_NLS_CODEPAGE_1250=m
+CONFIG_NLS_CODEPAGE_1251=m
+CONFIG_NLS_ISO8859_1=m
+CONFIG_NLS_ISO8859_2=m
+CONFIG_NLS_ISO8859_3=m
+CONFIG_NLS_ISO8859_4=m
+CONFIG_NLS_ISO8859_5=m
+CONFIG_NLS_ISO8859_6=m
+CONFIG_NLS_ISO8859_7=m
+CONFIG_NLS_ISO8859_9=m
+CONFIG_NLS_ISO8859_13=m
+CONFIG_NLS_ISO8859_14=m
+CONFIG_NLS_ISO8859_15=m
+CONFIG_NLS_KOI8_R=m
+CONFIG_NLS_KOI8_U=m
+CONFIG_NLS_UTF8=m
+
+#
+# SCSI support
+#
+# CONFIG_SCSI is not set
+
+#
+# Multi-device support (RAID and LVM)
+#
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_LINEAR=m
+CONFIG_MD_RAID0=m
+CONFIG_MD_RAID1=m
+CONFIG_MD_RAID5=m
+CONFIG_MD_MULTIPATH=m
+CONFIG_BLK_DEV_LVM=m
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Kernel hacking
+#
+# CONFIG_DEBUG_SLAB is not set
+CONFIG_DEBUGSYM=y
+CONFIG_PT_PROXY=y
+# CONFIG_GPROF is not set
+# CONFIG_GCOV is not set
+
+#
+# Library routines
+#
+CONFIG_ZLIB_INFLATE=y
+CONFIG_ZLIB_DEFLATE=m
diff --git a/lustre/kernel_patches/kernel_configs/config-linux-2.4.20-i386-rh b/lustre/kernel_patches/kernel_configs/config-linux-2.4.20-i386-rh
new file mode 100644 (file)
index 0000000..dec210a
--- /dev/null
@@ -0,0 +1,1849 @@
+#
+# Automatically generated by make menuconfig: don't edit
+#
+CONFIG_X86=y
+# CONFIG_SBUS is not set
+CONFIG_UID16=y
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODVERSIONS=y
+CONFIG_KMOD=y
+
+#
+# Processor type and features
+#
+CONFIG_LOLAT=y
+# CONFIG_M386 is not set
+# CONFIG_M486 is not set
+# CONFIG_M586 is not set
+# CONFIG_M586TSC is not set
+# CONFIG_M586MMX is not set
+CONFIG_M686=y
+# CONFIG_MPENTIUMIII is not set
+# CONFIG_MPENTIUM4 is not set
+# CONFIG_MK6 is not set
+# CONFIG_MK7 is not set
+# CONFIG_MELAN is not set
+# CONFIG_MCRUSOE is not set
+# CONFIG_MWINCHIPC6 is not set
+# CONFIG_MWINCHIP2 is not set
+# CONFIG_MWINCHIP3D is not set
+# CONFIG_MCYRIXIII is not set
+CONFIG_X86_WP_WORKS_OK=y
+CONFIG_X86_INVLPG=y
+CONFIG_X86_CMPXCHG=y
+CONFIG_X86_XADD=y
+CONFIG_X86_BSWAP=y
+CONFIG_X86_POPAD_OK=y
+# CONFIG_RWSEM_GENERIC_SPINLOCK is not set
+CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_HAS_TSC=y
+CONFIG_X86_GOOD_APIC=y
+CONFIG_X86_PGE=y
+CONFIG_X86_USE_PPRO_CHECKSUM=y
+CONFIG_X86_PPRO_FENCE=y
+CONFIG_X86_F00F_WORKS_OK=y
+CONFIG_X86_MCE=y
+
+#
+# CPU Frequency scaling
+#
+CONFIG_CPU_FREQ=y
+# CONFIG_CPU_FREQ_24_API is not set
+CONFIG_X86_POWERNOW_K6=m
+# CONFIG_X86_LONGHAUL is not set
+CONFIG_X86_SPEEDSTEP=m
+# CONFIG_X86_P4_CLOCKMOD is not set
+# CONFIG_X86_LONGRUN is not set
+CONFIG_TOSHIBA=m
+CONFIG_I8K=m
+CONFIG_MICROCODE=m
+CONFIG_X86_MSR=m
+CONFIG_X86_CPUID=m
+# CONFIG_E820_PROC is not set
+CONFIG_EDD=m
+# CONFIG_NOHIGHMEM is not set
+CONFIG_HIGHMEM4G=y
+# CONFIG_HIGHMEM64G is not set
+CONFIG_HIGHMEM=y
+CONFIG_HIGHPTE=y
+CONFIG_HIGHIO=y
+# CONFIG_MATH_EMULATION is not set
+CONFIG_MTRR=y
+# CONFIG_SMP is not set
+CONFIG_X86_UP_APIC=y
+CONFIG_X86_UP_IOAPIC=y
+CONFIG_X86_LOCAL_APIC=y
+CONFIG_X86_IO_APIC=y
+# CONFIG_X86_TSC_DISABLE is not set
+CONFIG_X86_TSC=y
+
+#
+# General setup
+#
+CONFIG_NET=y
+CONFIG_PCI=y
+# CONFIG_PCI_GOBIOS is not set
+# CONFIG_PCI_GODIRECT is not set
+CONFIG_PCI_GOANY=y
+CONFIG_PCI_BIOS=y
+CONFIG_PCI_DIRECT=y
+CONFIG_ISA=y
+CONFIG_PCI_NAMES=y
+CONFIG_EISA=y
+# CONFIG_MCA is not set
+CONFIG_HOTPLUG=y
+
+#
+# PCMCIA/CardBus support
+#
+CONFIG_PCMCIA=m
+CONFIG_CARDBUS=y
+CONFIG_TCIC=y
+CONFIG_I82092=y
+CONFIG_I82365=y
+
+#
+# PCI Hotplug Support
+#
+# CONFIG_HOTPLUG_PCI is not set
+# CONFIG_HOTPLUG_PCI_ACPI is not set
+# CONFIG_HOTPLUG_PCI_COMPAQ is not set
+# CONFIG_HOTPLUG_PCI_COMPAQ_NVRAM is not set
+# CONFIG_HOTPLUG_PCI_IBM is not set
+# CONFIG_HOTPLUG_PCI_H2999 is not set
+CONFIG_SYSVIPC=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_SYSCTL=y
+CONFIG_KCORE_ELF=y
+# CONFIG_KCORE_AOUT is not set
+CONFIG_BINFMT_AOUT=m
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=m
+CONFIG_PM=y
+# CONFIG_ACPI is not set
+CONFIG_APM=y
+# CONFIG_APM_IGNORE_USER_SUSPEND is not set
+# CONFIG_APM_DO_ENABLE is not set
+CONFIG_APM_CPU_IDLE=y
+# CONFIG_APM_DISPLAY_BLANK is not set
+CONFIG_APM_RTC_IS_GMT=y
+# CONFIG_APM_ALLOW_INTS is not set
+# CONFIG_APM_REAL_MODE_POWER_OFF is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Parallel port support
+#
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_PARPORT_PC_CML1=m
+CONFIG_PARPORT_SERIAL=m
+# CONFIG_PARPORT_PC_FIFO is not set
+# CONFIG_PARPORT_PC_SUPERIO is not set
+CONFIG_PARPORT_PC_PCMCIA=m
+# CONFIG_PARPORT_AMIGA is not set
+# CONFIG_PARPORT_MFC3 is not set
+# CONFIG_PARPORT_ATARI is not set
+# CONFIG_PARPORT_GSC is not set
+# CONFIG_PARPORT_SUNBPP is not set
+# CONFIG_PARPORT_OTHER is not set
+CONFIG_PARPORT_1284=y
+
+#
+# Plug and Play configuration
+#
+CONFIG_PNP=y
+CONFIG_ISAPNP=y
+
+#
+# Block devices
+#
+CONFIG_BLK_DEV_FD=y
+CONFIG_BLK_DEV_XD=m
+CONFIG_PARIDE=m
+CONFIG_PARIDE_PARPORT=m
+CONFIG_PARIDE_PD=m
+CONFIG_PARIDE_PCD=m
+CONFIG_PARIDE_PF=m
+CONFIG_PARIDE_PT=m
+CONFIG_PARIDE_PG=m
+CONFIG_PARIDE_ATEN=m
+CONFIG_PARIDE_BPCK=m
+CONFIG_PARIDE_BPCK6=m
+CONFIG_PARIDE_COMM=m
+CONFIG_PARIDE_DSTR=m
+CONFIG_PARIDE_FIT2=m
+CONFIG_PARIDE_FIT3=m
+CONFIG_PARIDE_EPAT=m
+CONFIG_PARIDE_EPATC8=y
+CONFIG_PARIDE_EPIA=m
+CONFIG_PARIDE_FRIQ=m
+CONFIG_PARIDE_FRPW=m
+CONFIG_PARIDE_KBIC=m
+CONFIG_PARIDE_KTTI=m
+CONFIG_PARIDE_ON20=m
+CONFIG_PARIDE_ON26=m
+CONFIG_BLK_CPQ_DA=m
+CONFIG_BLK_CPQ_CISS_DA=m
+CONFIG_CISS_SCSI_TAPE=y
+CONFIG_BLK_DEV_DAC960=m
+CONFIG_BLK_DEV_UMEM=m
+CONFIG_BLK_DEV_LOOP=m
+CONFIG_BLK_DEV_NBD=m
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=4096
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_BLK_STATS=y
+
+#
+# Multi-device support (RAID and LVM)
+#
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_LINEAR=m
+CONFIG_MD_RAID0=m
+CONFIG_MD_RAID1=m
+CONFIG_MD_RAID5=m
+CONFIG_MD_MULTIPATH=m
+CONFIG_BLK_DEV_LVM=m
+
+#
+# Cryptography support (CryptoAPI)
+#
+CONFIG_CRYPTO=m
+CONFIG_CIPHERS=m
+CONFIG_CIPHER_AES=m
+CONFIG_CIPHER_IDENTITY=m
+CONFIG_CRYPTODEV=m
+CONFIG_CRYPTOLOOP=m
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_NETLINK_DEV=y
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_DEBUG is not set
+CONFIG_FILTER=y
+CONFIG_UNIX=y
+CONFIG_INET=y
+CONFIG_TUX=m
+CONFIG_TUX_EXTCGI=y
+# CONFIG_TUX_EXTENDED_LOG is not set
+# CONFIG_TUX_DEBUG is not set
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_FWMARK=y
+CONFIG_IP_ROUTE_NAT=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_TOS=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_ROUTE_LARGE_TABLES=y
+# CONFIG_IP_PNP is not set
+CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE=m
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+# CONFIG_ARPD is not set
+# CONFIG_INET_ECN is not set
+CONFIG_SYN_COOKIES=y
+
+#
+#   IP: Netfilter Configuration
+#
+CONFIG_IP_NF_CONNTRACK=m
+CONFIG_IP_NF_FTP=m
+CONFIG_IP_NF_IRC=m
+CONFIG_IP_NF_QUEUE=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_LIMIT=m
+CONFIG_IP_NF_MATCH_MAC=m
+CONFIG_IP_NF_MATCH_PKTTYPE=m
+CONFIG_IP_NF_MATCH_MARK=m
+CONFIG_IP_NF_MATCH_MULTIPORT=m
+CONFIG_IP_NF_MATCH_TOS=m
+CONFIG_IP_NF_MATCH_ECN=m
+CONFIG_IP_NF_MATCH_DSCP=m
+CONFIG_IP_NF_MATCH_AH_ESP=m
+CONFIG_IP_NF_MATCH_LENGTH=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_MATCH_TCPMSS=m
+CONFIG_IP_NF_MATCH_HELPER=m
+CONFIG_IP_NF_MATCH_STATE=m
+CONFIG_IP_NF_MATCH_CONNTRACK=m
+CONFIG_IP_NF_MATCH_UNCLEAN=m
+CONFIG_IP_NF_MATCH_OWNER=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_TARGET_MIRROR=m
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_NF_NAT_LOCAL=y
+CONFIG_IP_NF_NAT_SNMP_BASIC=m
+CONFIG_IP_NF_NAT_IRC=m
+CONFIG_IP_NF_NAT_FTP=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_IP_NF_TARGET_TOS=m
+CONFIG_IP_NF_TARGET_ECN=m
+CONFIG_IP_NF_TARGET_DSCP=m
+CONFIG_IP_NF_TARGET_MARK=m
+CONFIG_IP_NF_TARGET_LOG=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_IP_NF_TARGET_TCPMSS=m
+CONFIG_IP_NF_ARPTABLES=m
+CONFIG_IP_NF_ARPFILTER=m
+CONFIG_IP_NF_COMPAT_IPCHAINS=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_COMPAT_IPFWADM=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IPV6=m
+
+#
+#   IPv6: Netfilter Configuration
+#
+# CONFIG_IP6_NF_QUEUE is not set
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_MATCH_LIMIT=m
+CONFIG_IP6_NF_MATCH_MAC=m
+CONFIG_IP6_NF_MATCH_MULTIPORT=m
+CONFIG_IP6_NF_MATCH_OWNER=m
+CONFIG_IP6_NF_MATCH_MARK=m
+CONFIG_IP6_NF_MATCH_LENGTH=m
+CONFIG_IP6_NF_MATCH_EUI64=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_LOG=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_TARGET_MARK=m
+# CONFIG_KHTTPD is not set
+CONFIG_ATM=y
+CONFIG_ATM_CLIP=y
+# CONFIG_ATM_CLIP_NO_ICMP is not set
+CONFIG_ATM_LANE=m
+CONFIG_ATM_MPOA=m
+CONFIG_ATM_BR2684=m
+CONFIG_ATM_BR2684_IPFILTER=y
+CONFIG_VLAN_8021Q=m
+CONFIG_IPX=m
+# CONFIG_IPX_INTERN is not set
+CONFIG_ATALK=m
+
+#
+# Appletalk devices
+#
+CONFIG_DEV_APPLETALK=y
+CONFIG_LTPC=m
+CONFIG_COPS=m
+CONFIG_COPS_DAYNA=y
+CONFIG_COPS_TANGENT=y
+CONFIG_IPDDP=m
+CONFIG_IPDDP_ENCAP=y
+CONFIG_IPDDP_DECAP=y
+CONFIG_DECNET=m
+CONFIG_DECNET_SIOCGIFCONF=y
+CONFIG_DECNET_ROUTER=y
+CONFIG_DECNET_ROUTE_FWMARK=y
+CONFIG_BRIDGE=m
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_LLC is not set
+CONFIG_NET_DIVERT=y
+# CONFIG_ECONET is not set
+CONFIG_WAN_ROUTER=m
+# CONFIG_NET_FASTROUTE is not set
+# CONFIG_NET_HW_FLOWCONTROL is not set
+
+#
+# QoS and/or fair queueing
+#
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_CSZ=m
+# CONFIG_NET_SCH_ATM is not set
+CONFIG_NET_SCH_PRIO=m
+CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_SCH_TEQL=m
+CONFIG_NET_SCH_TBF=m
+CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_DSMARK=m
+CONFIG_NET_SCH_INGRESS=m
+CONFIG_NET_QOS=y
+CONFIG_NET_ESTIMATOR=y
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_TCINDEX=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_ROUTE=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_RSVP6=m
+CONFIG_NET_CLS_POLICE=y
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+
+#
+# Telephony Support
+#
+CONFIG_PHONE=m
+CONFIG_PHONE_IXJ=m
+CONFIG_PHONE_IXJ_PCMCIA=m
+
+#
+# ATA/IDE/MFM/RLL support
+#
+CONFIG_IDE=y
+
+#
+# IDE, ATA and ATAPI Block devices
+#
+CONFIG_BLK_DEV_IDE=y
+# CONFIG_BLK_DEV_HD_IDE is not set
+# CONFIG_BLK_DEV_HD is not set
+CONFIG_BLK_DEV_IDEDISK=y
+CONFIG_IDEDISK_MULTI_MODE=y
+# CONFIG_IDEDISK_STROKE is not set
+CONFIG_BLK_DEV_IDECS=m
+CONFIG_BLK_DEV_IDECD=m
+CONFIG_BLK_DEV_IDETAPE=m
+CONFIG_BLK_DEV_IDEFLOPPY=y
+CONFIG_BLK_DEV_IDESCSI=m
+# CONFIG_IDE_TASK_IOCTL is not set
+CONFIG_BLK_DEV_CMD640=y
+# CONFIG_BLK_DEV_CMD640_ENHANCED is not set
+CONFIG_BLK_DEV_ISAPNP=y
+CONFIG_BLK_DEV_IDEPCI=y
+CONFIG_BLK_DEV_GENERIC=y
+CONFIG_IDEPCI_SHARE_IRQ=y
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+# CONFIG_BLK_DEV_OFFBOARD is not set
+# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
+CONFIG_IDEDMA_PCI_AUTO=y
+# CONFIG_IDEDMA_ONLYDISK is not set
+CONFIG_BLK_DEV_IDEDMA=y
+# CONFIG_IDEDMA_PCI_WIP is not set
+CONFIG_BLK_DEV_ADMA=y
+CONFIG_BLK_DEV_AEC62XX=y
+CONFIG_BLK_DEV_ALI15X3=y
+# CONFIG_WDC_ALI15X3 is not set
+CONFIG_BLK_DEV_AMD74XX=y
+# CONFIG_AMD74XX_OVERRIDE is not set
+CONFIG_BLK_DEV_CMD64X=y
+CONFIG_BLK_DEV_TRIFLEX=y
+CONFIG_BLK_DEV_CY82C693=y
+CONFIG_BLK_DEV_CS5530=y
+CONFIG_BLK_DEV_HPT34X=y
+# CONFIG_HPT34X_AUTODMA is not set
+CONFIG_BLK_DEV_HPT366=y
+CONFIG_BLK_DEV_PIIX=y
+CONFIG_BLK_DEV_NFORCE=y
+# CONFIG_BLK_DEV_NS87415 is not set
+# CONFIG_BLK_DEV_OPTI621 is not set
+CONFIG_BLK_DEV_PDC202XX_OLD=y
+# CONFIG_PDC202XX_BURST is not set
+CONFIG_BLK_DEV_PDC202XX_NEW=y
+CONFIG_PDC202XX_FORCE=y
+CONFIG_BLK_DEV_RZ1000=y
+# CONFIG_BLK_DEV_SC1200 is not set
+CONFIG_BLK_DEV_SVWKS=y
+CONFIG_BLK_DEV_SIIMAGE=y
+CONFIG_BLK_DEV_SIS5513=y
+CONFIG_BLK_DEV_SLC90E66=y
+# CONFIG_BLK_DEV_TRM290 is not set
+CONFIG_BLK_DEV_VIA82CXXX=y
+# CONFIG_IDE_CHIPSETS is not set
+CONFIG_IDEDMA_AUTO=y
+# CONFIG_IDEDMA_IVB is not set
+# CONFIG_DMA_NONPCI is not set
+CONFIG_BLK_DEV_PDC202XX=y
+CONFIG_BLK_DEV_IDE_MODES=y
+CONFIG_BLK_DEV_ATARAID=m
+CONFIG_BLK_DEV_ATARAID_PDC=m
+CONFIG_BLK_DEV_ATARAID_HPT=m
+CONFIG_BLK_DEV_ATARAID_SII=m
+
+#
+# SCSI support
+#
+CONFIG_SCSI=m
+CONFIG_BLK_DEV_SD=m
+CONFIG_SD_EXTRA_DEVS=40
+CONFIG_CHR_DEV_ST=m
+CONFIG_CHR_DEV_OSST=m
+CONFIG_BLK_DEV_SR=m
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_SR_EXTRA_DEVS=4
+CONFIG_CHR_DEV_SG=m
+# CONFIG_SCSI_DEBUG_QUEUES is not set
+# CONFIG_SCSI_MULTI_LUN is not set
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_LOGGING=y
+
+#
+# SCSI low-level drivers
+#
+CONFIG_BLK_DEV_3W_XXXX_RAID=m
+CONFIG_SCSI_7000FASST=m
+CONFIG_SCSI_ACARD=m
+CONFIG_SCSI_AHA152X=m
+CONFIG_SCSI_AHA1542=m
+CONFIG_SCSI_AHA1740=m
+CONFIG_SCSI_AACRAID=m
+CONFIG_SCSI_AIC7XXX=m
+CONFIG_AIC7XXX_CMDS_PER_DEVICE=253
+CONFIG_AIC7XXX_RESET_DELAY_MS=15000
+# CONFIG_AIC7XXX_PROBE_EISA_VL is not set
+# CONFIG_AIC7XXX_BUILD_FIRMWARE is not set
+CONFIG_SCSI_AIC79XX=m
+CONFIG_AIC79XX_CMDS_PER_DEVICE=253
+CONFIG_AIC79XX_RESET_DELAY_MS=15000
+# CONFIG_AIC79XX_BUILD_FIRMWARE is not set
+CONFIG_AIC79XX_ENABLE_RD_STRM=y
+# CONFIG_AIC79XX_DEBUG_ENABLE is not set
+CONFIG_AIC79XX_DEBUG_MASK=0
+CONFIG_SCSI_AIC7XXX_OLD=m
+CONFIG_AIC7XXX_OLD_TCQ_ON_BY_DEFAULT=y
+CONFIG_AIC7XXX_OLD_CMDS_PER_DEVICE=32
+CONFIG_AIC7XXX_OLD_PROC_STATS=y
+CONFIG_SCSI_DPT_I2O=m
+CONFIG_SCSI_ADVANSYS=m
+CONFIG_SCSI_IN2000=m
+CONFIG_SCSI_AM53C974=m
+CONFIG_SCSI_MEGARAID=m
+CONFIG_SCSI_BUSLOGIC=m
+# CONFIG_SCSI_OMIT_FLASHPOINT is not set
+CONFIG_SCSI_CPQFCTS=m
+CONFIG_SCSI_DMX3191D=m
+CONFIG_SCSI_DTC3280=m
+CONFIG_SCSI_EATA=m
+CONFIG_SCSI_EATA_TAGGED_QUEUE=y
+# CONFIG_SCSI_EATA_LINKED_COMMANDS is not set
+CONFIG_SCSI_EATA_MAX_TAGS=16
+CONFIG_SCSI_EATA_DMA=m
+CONFIG_SCSI_EATA_PIO=m
+CONFIG_SCSI_FUTURE_DOMAIN=m
+CONFIG_SCSI_GDTH=m
+CONFIG_SCSI_GENERIC_NCR5380=m
+# CONFIG_SCSI_GENERIC_NCR53C400 is not set
+CONFIG_SCSI_G_NCR5380_PORT=y
+# CONFIG_SCSI_G_NCR5380_MEM is not set
+CONFIG_SCSI_IPS=m
+CONFIG_SCSI_INITIO=m
+CONFIG_SCSI_INIA100=m
+CONFIG_SCSI_PPA=m
+CONFIG_SCSI_IMM=m
+# CONFIG_SCSI_IZIP_EPP16 is not set
+# CONFIG_SCSI_IZIP_SLOW_CTR is not set
+CONFIG_SCSI_NCR53C406A=m
+CONFIG_SCSI_NCR53C7xx=m
+# CONFIG_SCSI_NCR53C7xx_sync is not set
+CONFIG_SCSI_NCR53C7xx_FAST=y
+CONFIG_SCSI_NCR53C7xx_DISCONNECT=y
+CONFIG_SCSI_SYM53C8XX_2=m
+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1
+CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
+CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
+# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set
+CONFIG_SCSI_NCR53C8XX=m
+CONFIG_SCSI_SYM53C8XX=m
+CONFIG_SCSI_NCR53C8XX_DEFAULT_TAGS=8
+CONFIG_SCSI_NCR53C8XX_MAX_TAGS=32
+CONFIG_SCSI_NCR53C8XX_SYNC=40
+# CONFIG_SCSI_NCR53C8XX_PROFILE is not set
+# CONFIG_SCSI_NCR53C8XX_IOMAPPED is not set
+# CONFIG_SCSI_NCR53C8XX_PQS_PDS is not set
+# CONFIG_SCSI_NCR53C8XX_SYMBIOS_COMPAT is not set
+CONFIG_SCSI_PAS16=m
+CONFIG_SCSI_PCI2000=m
+CONFIG_SCSI_PCI2220I=m
+CONFIG_SCSI_PSI240I=m
+CONFIG_SCSI_QLOGIC_FAS=m
+CONFIG_SCSI_QLOGIC_ISP=m
+CONFIG_SCSI_QLOGIC_FC=m
+# CONFIG_SCSI_QLOGIC_FC_FIRMWARE is not set
+CONFIG_SCSI_QLOGIC_1280=m
+CONFIG_SCSI_NEWISP=m
+CONFIG_SCSI_SEAGATE=m
+CONFIG_SCSI_SIM710=m
+CONFIG_SCSI_SYM53C416=m
+CONFIG_SCSI_DC390T=m
+# CONFIG_SCSI_DC390T_NOGENSUPP is not set
+CONFIG_SCSI_T128=m
+CONFIG_SCSI_U14_34F=m
+# CONFIG_SCSI_U14_34F_LINKED_COMMANDS is not set
+CONFIG_SCSI_U14_34F_MAX_TAGS=8
+CONFIG_SCSI_ULTRASTOR=m
+CONFIG_SCSI_NSP32=m
+CONFIG_SCSI_DEBUG=m
+
+#
+# PCMCIA SCSI adapter support
+#
+CONFIG_SCSI_PCMCIA=y
+CONFIG_PCMCIA_AHA152X=m
+CONFIG_PCMCIA_FDOMAIN=m
+CONFIG_PCMCIA_NINJA_SCSI=m
+CONFIG_PCMCIA_QLOGIC=m
+
+#
+# Fusion MPT device support
+#
+CONFIG_FUSION=m
+# CONFIG_FUSION_BOOT is not set
+CONFIG_FUSION_MAX_SGE=40
+# CONFIG_FUSION_ISENSE is not set
+CONFIG_FUSION_CTL=m
+CONFIG_FUSION_LAN=m
+CONFIG_NET_FC=y
+
+#
+# IEEE 1394 (FireWire) support (EXPERIMENTAL)
+#
+CONFIG_IEEE1394=m
+# CONFIG_IEEE1394_PCILYNX is not set
+CONFIG_IEEE1394_OHCI1394=m
+CONFIG_IEEE1394_VIDEO1394=m
+CONFIG_IEEE1394_SBP2=m
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
+CONFIG_IEEE1394_ETH1394=m
+CONFIG_IEEE1394_DV1394=m
+CONFIG_IEEE1394_RAWIO=m
+CONFIG_IEEE1394_CMP=m
+CONFIG_IEEE1394_AMDTP=m
+# CONFIG_IEEE1394_VERBOSEDEBUG is not set
+
+#
+# I2O device support
+#
+CONFIG_I2O=m
+CONFIG_I2O_PCI=m
+CONFIG_I2O_BLOCK=m
+CONFIG_I2O_LAN=m
+CONFIG_I2O_SCSI=m
+CONFIG_I2O_PROC=m
+
+#
+# Network device support
+#
+CONFIG_NETDEVICES=y
+
+#
+# ARCnet devices
+#
+# CONFIG_ARCNET is not set
+CONFIG_DUMMY=m
+CONFIG_BONDING=m
+CONFIG_EQUALIZER=m
+CONFIG_TUN=m
+CONFIG_ETHERTAP=m
+CONFIG_NET_SB1000=m
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+# CONFIG_SUNLANCE is not set
+CONFIG_HAPPYMEAL=m
+# CONFIG_SUNBMAC is not set
+# CONFIG_SUNQE is not set
+CONFIG_SUNGEM=m
+CONFIG_NET_VENDOR_3COM=y
+CONFIG_EL1=m
+CONFIG_EL2=m
+CONFIG_ELPLUS=m
+CONFIG_EL16=m
+CONFIG_EL3=m
+CONFIG_3C515=m
+# CONFIG_ELMC is not set
+# CONFIG_ELMC_II is not set
+CONFIG_VORTEX=m
+CONFIG_LANCE=m
+CONFIG_NET_VENDOR_SMC=y
+CONFIG_WD80x3=m
+# CONFIG_ULTRAMCA is not set
+CONFIG_ULTRA=m
+CONFIG_ULTRA32=m
+CONFIG_SMC9194=m
+CONFIG_NET_VENDOR_RACAL=y
+CONFIG_NI5010=m
+CONFIG_NI52=m
+CONFIG_NI65=m
+CONFIG_AT1700=m
+CONFIG_DEPCA=m
+CONFIG_HP100=m
+CONFIG_NET_ISA=y
+CONFIG_E2100=m
+CONFIG_EWRK3=m
+CONFIG_EEXPRESS=m
+CONFIG_EEXPRESS_PRO=m
+CONFIG_HPLAN_PLUS=m
+CONFIG_HPLAN=m
+CONFIG_LP486E=m
+CONFIG_ETH16I=m
+CONFIG_NE2000=m
+CONFIG_NET_PCI=y
+CONFIG_PCNET32=m
+CONFIG_AMD8111_ETH=m
+CONFIG_ADAPTEC_STARFIRE=m
+CONFIG_AC3200=m
+CONFIG_APRICOT=m
+CONFIG_CS89x0=m
+CONFIG_TULIP=m
+# CONFIG_TULIP_MWI is not set
+CONFIG_TULIP_MMIO=y
+CONFIG_DE4X5=m
+CONFIG_DGRS=m
+CONFIG_DM9102=m
+CONFIG_EEPRO100=m
+CONFIG_E100=m
+CONFIG_LNE390=m
+CONFIG_FEALNX=m
+CONFIG_NATSEMI=m
+CONFIG_NE2K_PCI=m
+CONFIG_NE3210=m
+CONFIG_ES3210=m
+CONFIG_8139CP=m
+CONFIG_8139TOO=m
+# CONFIG_8139TOO_PIO is not set
+# CONFIG_8139TOO_TUNE_TWISTER is not set
+CONFIG_8139TOO_8129=y
+# CONFIG_8139_OLD_RX_RESET is not set
+CONFIG_SIS900=m
+CONFIG_EPIC100=m
+CONFIG_SUNDANCE=m
+CONFIG_SUNDANCE_MMIO=y
+CONFIG_TLAN=m
+CONFIG_TC35815=m
+CONFIG_VIA_RHINE=m
+# CONFIG_VIA_RHINE_MMIO is not set
+CONFIG_WINBOND_840=m
+CONFIG_NET_POCKET=y
+CONFIG_ATP=m
+CONFIG_DE600=m
+CONFIG_DE620=m
+
+#
+# Ethernet (1000 Mbit)
+#
+CONFIG_ACENIC=m
+# CONFIG_ACENIC_OMIT_TIGON_I is not set
+CONFIG_DL2K=m
+CONFIG_E1000=m
+# CONFIG_MYRI_SBUS is not set
+CONFIG_NS83820=m
+CONFIG_HAMACHI=m
+CONFIG_YELLOWFIN=m
+CONFIG_R8169=m
+CONFIG_SK98LIN=m
+CONFIG_TIGON3=m
+CONFIG_FDDI=y
+CONFIG_DEFXX=m
+CONFIG_SKFP=m
+CONFIG_NETCONSOLE=m
+# CONFIG_HIPPI is not set
+CONFIG_PLIP=m
+CONFIG_PPP=m
+CONFIG_PPP_MULTILINK=y
+CONFIG_PPP_FILTER=y
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+CONFIG_PPP_DEFLATE=m
+# CONFIG_PPP_BSDCOMP is not set
+# CONFIG_PPPOE is not set
+CONFIG_PPPOATM=m
+CONFIG_SLIP=m
+CONFIG_SLIP_COMPRESSED=y
+CONFIG_SLIP_SMART=y
+CONFIG_SLIP_MODE_SLIP6=y
+
+#
+# Wireless LAN (non-hamradio)
+#
+CONFIG_NET_RADIO=y
+CONFIG_STRIP=m
+CONFIG_WAVELAN=m
+CONFIG_ARLAN=m
+CONFIG_AIRONET4500=m
+CONFIG_AIRONET4500_NONCS=m
+CONFIG_AIRONET4500_PNP=y
+CONFIG_AIRONET4500_PCI=y
+CONFIG_AIRONET4500_ISA=y
+CONFIG_AIRONET4500_I365=y
+CONFIG_AIRONET4500_PROC=m
+CONFIG_AIRO=m
+CONFIG_HERMES=m
+CONFIG_PLX_HERMES=m
+CONFIG_PCI_HERMES=m
+CONFIG_PCMCIA_HERMES=m
+CONFIG_AIRO_CS=m
+CONFIG_NET_WIRELESS=y
+CONFIG_PCMCIA_HERMES_OLD=m
+
+#
+# Token Ring devices
+#
+CONFIG_TR=y
+CONFIG_IBMTR=m
+CONFIG_IBMOL=m
+CONFIG_IBMLS=m
+CONFIG_3C359=m
+CONFIG_TMS380TR=m
+CONFIG_TMSPCI=m
+CONFIG_TMSISA=m
+CONFIG_ABYSS=m
+# CONFIG_MADGEMC is not set
+CONFIG_SMCTR=m
+CONFIG_NET_FC=y
+CONFIG_IPHASE5526=m
+CONFIG_RCPCI=m
+CONFIG_SHAPER=m
+
+#
+# Wan interfaces
+#
+CONFIG_WAN=y
+CONFIG_HOSTESS_SV11=m
+CONFIG_COSA=m
+# CONFIG_COMX is not set
+# CONFIG_DSCC4 is not set
+# CONFIG_LANMEDIA is not set
+CONFIG_ATI_XX20=m
+CONFIG_SEALEVEL_4021=m
+# CONFIG_SYNCLINK_SYNCPPP is not set
+# CONFIG_HDLC is not set
+CONFIG_DLCI=m
+CONFIG_DLCI_COUNT=24
+CONFIG_DLCI_MAX=8
+CONFIG_SDLA=m
+CONFIG_WAN_ROUTER_DRIVERS=y
+CONFIG_VENDOR_SANGOMA=m
+CONFIG_WANPIPE_CHDLC=y
+CONFIG_WANPIPE_FR=y
+CONFIG_WANPIPE_X25=y
+CONFIG_WANPIPE_PPP=y
+CONFIG_WANPIPE_MULTPPP=y
+CONFIG_CYCLADES_SYNC=m
+CONFIG_CYCLOMX_X25=y
+# CONFIG_LAPBETHER is not set
+# CONFIG_X25_ASY is not set
+CONFIG_SBNI=m
+CONFIG_SBNI_MULTILINE=y
+
+#
+# PCMCIA network device support
+#
+CONFIG_NET_PCMCIA=y
+CONFIG_PCMCIA_3C589=m
+CONFIG_PCMCIA_3C574=m
+CONFIG_PCMCIA_FMVJ18X=m
+CONFIG_PCMCIA_PCNET=m
+CONFIG_PCMCIA_AXNET=m
+CONFIG_PCMCIA_NMCLAN=m
+CONFIG_PCMCIA_SMC91C92=m
+CONFIG_PCMCIA_XIRC2PS=m
+# CONFIG_ARCNET_COM20020_CS is not set
+CONFIG_PCMCIA_IBMTR=m
+CONFIG_PCMCIA_XIRCOM=m
+CONFIG_PCMCIA_XIRTULIP=m
+CONFIG_NET_PCMCIA_RADIO=y
+CONFIG_PCMCIA_RAYCS=m
+CONFIG_PCMCIA_NETWAVE=m
+CONFIG_PCMCIA_WAVELAN=m
+CONFIG_PCMCIA_WVLAN=m
+CONFIG_AIRONET4500_CS=m
+
+#
+# ATM drivers
+#
+CONFIG_ATM_TCP=m
+CONFIG_ATM_LANAI=m
+CONFIG_ATM_ENI=m
+# CONFIG_ATM_ENI_DEBUG is not set
+# CONFIG_ATM_ENI_TUNE_BURST is not set
+CONFIG_ATM_FIRESTREAM=m
+CONFIG_ATM_ZATM=m
+# CONFIG_ATM_ZATM_DEBUG is not set
+CONFIG_ATM_ZATM_EXACT_TS=y
+CONFIG_ATM_NICSTAR=m
+CONFIG_ATM_NICSTAR_USE_SUNI=y
+CONFIG_ATM_NICSTAR_USE_IDT77105=y
+CONFIG_ATM_IDT77252=m
+# CONFIG_ATM_IDT77252_DEBUG is not set
+# CONFIG_ATM_IDT77252_RCV_ALL is not set
+CONFIG_ATM_IDT77252_USE_SUNI=y
+CONFIG_ATM_AMBASSADOR=m
+# CONFIG_ATM_AMBASSADOR_DEBUG is not set
+CONFIG_ATM_HORIZON=m
+# CONFIG_ATM_HORIZON_DEBUG is not set
+CONFIG_ATM_IA=m
+# CONFIG_ATM_IA_DEBUG is not set
+CONFIG_ATM_FORE200E_MAYBE=m
+CONFIG_ATM_FORE200E_PCA=y
+CONFIG_ATM_FORE200E_PCA_DEFAULT_FW=y
+CONFIG_ATM_FORE200E_TX_RETRY=16
+CONFIG_ATM_FORE200E_DEBUG=0
+CONFIG_ATM_FORE200E=m
+
+#
+# Amateur Radio support
+#
+CONFIG_HAMRADIO=y
+CONFIG_AX25=m
+# CONFIG_AX25_DAMA_SLAVE is not set
+CONFIG_NETROM=m
+CONFIG_ROSE=m
+
+#
+# AX.25 network device drivers
+#
+# CONFIG_MKISS is not set
+# CONFIG_6PACK is not set
+# CONFIG_BPQETHER is not set
+# CONFIG_DMASCC is not set
+# CONFIG_SCC is not set
+# CONFIG_BAYCOM_SER_FDX is not set
+# CONFIG_BAYCOM_SER_HDX is not set
+# CONFIG_BAYCOM_PAR is not set
+# CONFIG_BAYCOM_EPP is not set
+CONFIG_SOUNDMODEM=m
+CONFIG_SOUNDMODEM_SBC=y
+CONFIG_SOUNDMODEM_WSS=y
+CONFIG_SOUNDMODEM_AFSK1200=y
+CONFIG_SOUNDMODEM_AFSK2400_7=y
+CONFIG_SOUNDMODEM_AFSK2400_8=y
+CONFIG_SOUNDMODEM_AFSK2666=y
+CONFIG_SOUNDMODEM_HAPN4800=y
+CONFIG_SOUNDMODEM_PSK4800=y
+CONFIG_SOUNDMODEM_FSK9600=y
+# CONFIG_YAM is not set
+
+#
+# IrDA (infrared) support
+#
+CONFIG_IRDA=m
+CONFIG_IRLAN=m
+CONFIG_IRNET=m
+CONFIG_IRCOMM=m
+CONFIG_IRDA_ULTRA=y
+CONFIG_IRDA_CACHE_LAST_LSAP=y
+CONFIG_IRDA_FAST_RR=y
+# CONFIG_IRDA_DEBUG is not set
+
+#
+# Infrared-port device drivers
+#
+CONFIG_IRTTY_SIR=m
+CONFIG_IRPORT_SIR=m
+CONFIG_DONGLE=y
+CONFIG_ESI_DONGLE=m
+CONFIG_ACTISYS_DONGLE=m
+CONFIG_TEKRAM_DONGLE=m
+CONFIG_GIRBIL_DONGLE=m
+CONFIG_LITELINK_DONGLE=m
+CONFIG_MCP2120_DONGLE=m
+CONFIG_OLD_BELKIN_DONGLE=m
+CONFIG_ACT200L_DONGLE=m
+CONFIG_MA600_DONGLE=m
+CONFIG_USB_IRDA=m
+CONFIG_NSC_FIR=m
+CONFIG_WINBOND_FIR=m
+CONFIG_TOSHIBA_OLD=m
+CONFIG_TOSHIBA_FIR=m
+CONFIG_SMC_IRCC_FIR=m
+CONFIG_ALI_FIR=m
+CONFIG_VLSI_FIR=m
+
+#
+# ISDN subsystem
+#
+CONFIG_ISDN=m
+CONFIG_ISDN_BOOL=y
+CONFIG_ISDN_PPP=y
+CONFIG_ISDN_PPP_VJ=y
+CONFIG_ISDN_MPP=y
+CONFIG_ISDN_PPP_BSDCOMP=m
+CONFIG_ISDN_AUDIO=y
+CONFIG_ISDN_TTY_FAX=y
+
+#
+# ISDN feature submodules
+#
+CONFIG_ISDN_DRV_LOOP=m
+# CONFIG_ISDN_DIVERSION is not set
+
+#
+# Passive ISDN cards
+#
+CONFIG_ISDN_DRV_HISAX=m
+CONFIG_ISDN_HISAX=y
+CONFIG_HISAX_EURO=y
+CONFIG_DE_AOC=y
+# CONFIG_HISAX_NO_SENDCOMPLETE is not set
+# CONFIG_HISAX_NO_LLC is not set
+# CONFIG_HISAX_NO_KEYPAD is not set
+CONFIG_HISAX_1TR6=y
+CONFIG_HISAX_NI1=y
+CONFIG_HISAX_MAX_CARDS=8
+CONFIG_HISAX_16_0=y
+CONFIG_HISAX_16_3=y
+CONFIG_HISAX_AVM_A1=y
+CONFIG_HISAX_IX1MICROR2=y
+CONFIG_HISAX_ASUSCOM=y
+CONFIG_HISAX_TELEINT=y
+CONFIG_HISAX_HFCS=y
+CONFIG_HISAX_SPORTSTER=y
+CONFIG_HISAX_MIC=y
+CONFIG_HISAX_ISURF=y
+CONFIG_HISAX_HSTSAPHIR=y
+CONFIG_HISAX_TELESPCI=y
+CONFIG_HISAX_S0BOX=y
+CONFIG_HISAX_FRITZPCI=y
+CONFIG_HISAX_AVM_A1_PCMCIA=y
+CONFIG_HISAX_ELSA=y
+CONFIG_HISAX_DIEHLDIVA=y
+CONFIG_HISAX_SEDLBAUER=y
+CONFIG_HISAX_NETJET=y
+CONFIG_HISAX_NETJET_U=y
+CONFIG_HISAX_NICCY=y
+CONFIG_HISAX_BKM_A4T=y
+CONFIG_HISAX_SCT_QUADRO=y
+CONFIG_HISAX_GAZEL=y
+CONFIG_HISAX_HFC_PCI=y
+CONFIG_HISAX_W6692=y
+CONFIG_HISAX_HFC_SX=y
+CONFIG_HISAX_ENTERNOW_PCI=y
+CONFIG_HISAX_DEBUG=y
+CONFIG_HISAX_SEDLBAUER_CS=m
+CONFIG_HISAX_ELSA_CS=m
+CONFIG_HISAX_AVM_A1_CS=m
+CONFIG_HISAX_ST5481=m
+CONFIG_HISAX_FRITZ_PCIPNP=m
+
+#
+# Active ISDN cards
+#
+CONFIG_ISDN_DRV_ICN=m
+CONFIG_ISDN_DRV_PCBIT=m
+# CONFIG_ISDN_DRV_SC is not set
+# CONFIG_ISDN_DRV_ACT2000 is not set
+CONFIG_ISDN_DRV_EICON=y
+CONFIG_ISDN_DRV_EICON_DIVAS=m
+# CONFIG_ISDN_DRV_EICON_OLD is not set
+CONFIG_ISDN_DRV_TPAM=m
+CONFIG_ISDN_CAPI=m
+CONFIG_ISDN_DRV_AVMB1_VERBOSE_REASON=y
+CONFIG_ISDN_CAPI_MIDDLEWARE=y
+CONFIG_ISDN_CAPI_CAPI20=m
+CONFIG_ISDN_CAPI_CAPIFS_BOOL=y
+CONFIG_ISDN_CAPI_CAPIFS=m
+CONFIG_ISDN_CAPI_CAPIDRV=m
+CONFIG_ISDN_DRV_AVMB1_B1ISA=m
+CONFIG_ISDN_DRV_AVMB1_B1PCI=m
+CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y
+CONFIG_ISDN_DRV_AVMB1_T1ISA=m
+CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m
+CONFIG_ISDN_DRV_AVMB1_AVM_CS=m
+CONFIG_ISDN_DRV_AVMB1_T1PCI=m
+CONFIG_ISDN_DRV_AVMB1_C4=m
+CONFIG_HYSDN=m
+CONFIG_HYSDN_CAPI=y
+CONFIG_KALLSYMS=y
+
+#
+# Old CD-ROM drivers (not SCSI, not IDE)
+#
+# CONFIG_CD_NO_IDESCSI is not set
+
+#
+# Input core support
+#
+CONFIG_INPUT=m
+CONFIG_INPUT_KEYBDEV=m
+CONFIG_INPUT_MOUSEDEV=m
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+CONFIG_INPUT_JOYDEV=m
+CONFIG_INPUT_EVDEV=m
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_ECC=m
+CONFIG_VT_CONSOLE=y
+CONFIG_SERIAL=y
+CONFIG_SERIAL_CONSOLE=y
+CONFIG_SERIAL_EXTENDED=y
+CONFIG_SERIAL_MANY_PORTS=y
+CONFIG_SERIAL_SHARE_IRQ=y
+# CONFIG_SERIAL_DETECT_IRQ is not set
+CONFIG_SERIAL_MULTIPORT=y
+# CONFIG_HUB6 is not set
+CONFIG_SERIAL_NONSTANDARD=y
+CONFIG_COMPUTONE=m
+CONFIG_ROCKETPORT=m
+CONFIG_CYCLADES=m
+# CONFIG_CYZ_INTR is not set
+CONFIG_DIGIEPCA=m
+CONFIG_ESPSERIAL=m
+CONFIG_MOXA_INTELLIO=m
+CONFIG_MOXA_SMARTIO=m
+CONFIG_ISI=m
+CONFIG_SYNCLINK=m
+# CONFIG_SYNCLINKMP is not set
+CONFIG_N_HDLC=m
+CONFIG_RISCOM8=m
+CONFIG_SPECIALIX=m
+CONFIG_SPECIALIX_RTSCTS=y
+CONFIG_SX=m
+# CONFIG_RIO is not set
+CONFIG_STALDRV=y
+CONFIG_STALLION=m
+CONFIG_ISTALLION=m
+CONFIG_UNIX98_PTYS=y
+CONFIG_UNIX98_PTY_COUNT=2048
+CONFIG_PRINTER=m
+CONFIG_LP_CONSOLE=y
+CONFIG_PPDEV=m
+CONFIG_TIPAR=m
+
+#
+# I2C support
+#
+CONFIG_I2C=m
+CONFIG_I2C_ALGOBIT=m
+CONFIG_I2C_PHILIPSPAR=m
+CONFIG_I2C_ELV=m
+CONFIG_I2C_VELLEMAN=m
+# CONFIG_SCx200_I2C is not set
+# CONFIG_SCx200_ACB is not set
+CONFIG_I2C_ALGOPCF=m
+CONFIG_I2C_ELEKTOR=m
+CONFIG_I2C_MAINBOARD=y
+CONFIG_I2C_ALI1535=m
+CONFIG_I2C_ALI15X3=m
+CONFIG_I2C_HYDRA=m
+CONFIG_I2C_AMD756=m
+# CONFIG_I2C_TSUNAMI is not set
+CONFIG_I2C_I801=m
+CONFIG_I2C_I810=m
+CONFIG_I2C_PIIX4=m
+CONFIG_I2C_SIS5595=m
+CONFIG_I2C_VIA=m
+CONFIG_I2C_VIAPRO=m
+CONFIG_I2C_VOODOO3=m
+CONFIG_I2C_ISA=m
+CONFIG_I2C_CHARDEV=m
+CONFIG_I2C_PROC=m
+
+#
+# Hardware sensors support
+#
+CONFIG_SENSORS=y
+CONFIG_SENSORS_ADM1021=m
+CONFIG_SENSORS_ADM1024=m
+CONFIG_SENSORS_ADM1025=m
+CONFIG_SENSORS_ADM9240=m
+CONFIG_SENSORS_DS1621=m
+CONFIG_SENSORS_FSCPOS=m
+CONFIG_SENSORS_FSCSCY=m
+CONFIG_SENSORS_GL518SM=m
+CONFIG_SENSORS_GL520SM=m
+CONFIG_SENSORS_MAXILIFE=m
+CONFIG_SENSORS_IT87=m
+CONFIG_SENSORS_MTP008=m
+CONFIG_SENSORS_LM75=m
+CONFIG_SENSORS_LM78=m
+CONFIG_SENSORS_LM80=m
+CONFIG_SENSORS_LM87=m
+CONFIG_SENSORS_LM92=m
+CONFIG_SENSORS_SIS5595=m
+CONFIG_SENSORS_SMSC47M1=m
+CONFIG_SENSORS_THMC50=m
+CONFIG_SENSORS_VIA686A=m
+CONFIG_SENSORS_VT1211=m
+CONFIG_SENSORS_VT8231=m
+CONFIG_SENSORS_W83781D=m
+CONFIG_SENSORS_OTHER=y
+CONFIG_SENSORS_BT869=m
+CONFIG_SENSORS_DDCMON=m
+CONFIG_SENSORS_EEPROM=m
+CONFIG_SENSORS_MATORB=m
+CONFIG_SENSORS_PCF8574=m
+CONFIG_SENSORS_PCF8591=m
+
+#
+# Mice
+#
+CONFIG_BUSMOUSE=m
+CONFIG_ATIXL_BUSMOUSE=m
+CONFIG_LOGIBUSMOUSE=m
+CONFIG_MS_BUSMOUSE=m
+CONFIG_MOUSE=y
+CONFIG_PSMOUSE=y
+CONFIG_82C710_MOUSE=m
+CONFIG_PC110_PAD=m
+CONFIG_MK712_MOUSE=m
+
+#
+# Joysticks
+#
+CONFIG_INPUT_GAMEPORT=m
+CONFIG_INPUT_NS558=m
+CONFIG_INPUT_LIGHTNING=m
+CONFIG_INPUT_PCIGAME=m
+CONFIG_INPUT_CS461X=m
+CONFIG_INPUT_EMU10K1=m
+CONFIG_INPUT_SERIO=m
+CONFIG_INPUT_SERPORT=m
+CONFIG_INPUT_ANALOG=m
+CONFIG_INPUT_A3D=m
+CONFIG_INPUT_ADI=m
+CONFIG_INPUT_COBRA=m
+CONFIG_INPUT_GF2K=m
+CONFIG_INPUT_GRIP=m
+CONFIG_INPUT_INTERACT=m
+CONFIG_INPUT_TMDC=m
+CONFIG_INPUT_SIDEWINDER=m
+CONFIG_INPUT_IFORCE_USB=m
+CONFIG_INPUT_IFORCE_232=m
+CONFIG_INPUT_WARRIOR=m
+CONFIG_INPUT_MAGELLAN=m
+CONFIG_INPUT_SPACEORB=m
+CONFIG_INPUT_SPACEBALL=m
+CONFIG_INPUT_STINGER=m
+CONFIG_INPUT_DB9=m
+CONFIG_INPUT_GAMECON=m
+CONFIG_INPUT_TURBOGRAFX=m
+# CONFIG_QIC02_TAPE is not set
+CONFIG_IPMI_HANDLER=m
+# CONFIG_IPMI_PANIC_EVENT is not set
+CONFIG_IPMI_DEVICE_INTERFACE=m
+CONFIG_IPMI_KCS=m
+CONFIG_IPMI_WATCHDOG=m
+
+#
+# Watchdog Cards
+#
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+CONFIG_ACQUIRE_WDT=m
+CONFIG_ADVANTECH_WDT=m
+CONFIG_ALIM7101_WDT=m
+CONFIG_SC520_WDT=m
+CONFIG_PCWATCHDOG=m
+CONFIG_EUROTECH_WDT=m
+CONFIG_IB700_WDT=m
+CONFIG_WAFER_WDT=m
+CONFIG_I810_TCO=m
+# CONFIG_MIXCOMWD is not set
+# CONFIG_60XX_WDT is not set
+CONFIG_SC1200_WDT=m
+# CONFIG_SCx200_WDT is not set
+CONFIG_SOFT_WATCHDOG=m
+CONFIG_W83877F_WDT=m
+CONFIG_WDT=m
+CONFIG_WDTPCI=m
+# CONFIG_WDT_501 is not set
+CONFIG_MACHZ_WDT=m
+CONFIG_AMD7XX_TCO=m
+# CONFIG_SCx200_GPIO is not set
+CONFIG_AMD_RNG=m
+CONFIG_INTEL_RNG=m
+CONFIG_AMD_PM768=m
+CONFIG_NVRAM=m
+CONFIG_RTC=y
+CONFIG_DTLK=m
+CONFIG_R3964=m
+# CONFIG_APPLICOM is not set
+CONFIG_SONYPI=m
+
+#
+# Ftape, the floppy tape device driver
+#
+CONFIG_FTAPE=m
+CONFIG_ZFTAPE=m
+CONFIG_ZFT_DFLT_BLK_SZ=10240
+CONFIG_ZFT_COMPRESSOR=m
+CONFIG_FT_NR_BUFFERS=3
+# CONFIG_FT_PROC_FS is not set
+CONFIG_FT_NORMAL_DEBUG=y
+# CONFIG_FT_FULL_DEBUG is not set
+# CONFIG_FT_NO_TRACE is not set
+# CONFIG_FT_NO_TRACE_AT_ALL is not set
+CONFIG_FT_STD_FDC=y
+# CONFIG_FT_MACH2 is not set
+# CONFIG_FT_PROBE_FC10 is not set
+# CONFIG_FT_ALT_FDC is not set
+CONFIG_FT_FDC_THR=8
+CONFIG_FT_FDC_MAX_RATE=2000
+CONFIG_FT_ALPHA_CLOCK=0
+CONFIG_AGP=m
+CONFIG_AGP_INTEL=y
+CONFIG_AGP_I810=y
+CONFIG_AGP_VIA=y
+CONFIG_AGP_AMD=y
+CONFIG_AGP_AMD_8151=y
+CONFIG_AGP_SIS=y
+CONFIG_AGP_ALI=y
+CONFIG_AGP_SWORKS=y
+CONFIG_DRM=y
+# CONFIG_DRM_OLD is not set
+CONFIG_DRM_NEW=y
+CONFIG_DRM_TDFX=m
+CONFIG_DRM_R128=m
+CONFIG_DRM_RADEON=m
+CONFIG_DRM_I810=m
+# CONFIG_DRM_I810_XFREE_41 is not set
+CONFIG_DRM_I830=m
+CONFIG_DRM_MGA=m
+CONFIG_DRM_SIS=m
+
+#
+# PCMCIA character devices
+#
+CONFIG_PCMCIA_SERIAL_CS=m
+CONFIG_SYNCLINK_CS=m
+CONFIG_MWAVE=m
+CONFIG_BATTERY_GERICOM=m
+
+#
+# Multimedia devices
+#
+CONFIG_VIDEO_DEV=m
+
+#
+# Video For Linux
+#
+CONFIG_VIDEO_PROC_FS=y
+CONFIG_I2C_PARPORT=m
+CONFIG_VIDEO_BT848=m
+CONFIG_VIDEO_PMS=m
+CONFIG_VIDEO_BWQCAM=m
+CONFIG_VIDEO_CQCAM=m
+CONFIG_VIDEO_W9966=m
+CONFIG_VIDEO_CPIA=m
+CONFIG_VIDEO_CPIA_PP=m
+CONFIG_VIDEO_CPIA_USB=m
+CONFIG_VIDEO_SAA5249=m
+CONFIG_TUNER_3036=m
+CONFIG_VIDEO_STRADIS=m
+CONFIG_VIDEO_ZORAN=m
+CONFIG_VIDEO_ZORAN_BUZ=m
+CONFIG_VIDEO_ZORAN_DC10=m
+CONFIG_VIDEO_ZORAN_LML33=m
+CONFIG_VIDEO_ZR36120=m
+CONFIG_VIDEO_MEYE=m
+
+#
+# Radio Adapters
+#
+CONFIG_RADIO_CADET=m
+CONFIG_RADIO_RTRACK=m
+CONFIG_RADIO_RTRACK2=m
+CONFIG_RADIO_AZTECH=m
+CONFIG_RADIO_GEMTEK=m
+CONFIG_RADIO_GEMTEK_PCI=m
+CONFIG_RADIO_MAXIRADIO=m
+CONFIG_RADIO_MAESTRO=m
+CONFIG_RADIO_MIROPCM20=m
+CONFIG_RADIO_MIROPCM20_RDS=m
+CONFIG_RADIO_SF16FMI=m
+CONFIG_RADIO_SF16FMR2=m
+CONFIG_RADIO_TERRATEC=m
+CONFIG_RADIO_TRUST=m
+CONFIG_RADIO_TYPHOON=m
+CONFIG_RADIO_TYPHOON_PROC_FS=y
+CONFIG_RADIO_ZOLTRIX=m
+
+#
+# Crypto Hardware support
+#
+CONFIG_CRYPTO=m
+CONFIG_CRYPTO_BROADCOM=m
+
+#
+# File systems
+#
+CONFIG_QUOTA=y
+# CONFIG_QFMT_V1 is not set
+CONFIG_QFMT_V2=y
+# CONFIG_QIFACE_COMPAT is not set
+CONFIG_AUTOFS_FS=m
+CONFIG_AUTOFS4_FS=m
+CONFIG_REISERFS_FS=m
+# CONFIG_REISERFS_CHECK is not set
+CONFIG_REISERFS_PROC_INFO=y
+# CONFIG_ADFS_FS is not set
+CONFIG_AFS_FS=m
+# CONFIG_ADFS_FS_RW is not set
+# CONFIG_AFFS_FS is not set
+CONFIG_HFS_FS=m
+CONFIG_BEFS_FS=m
+# CONFIG_BEFS_DEBUG is not set
+CONFIG_BFS_FS=m
+CONFIG_EXT3_FS=m
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_XATTR_SHARING=y
+CONFIG_EXT3_FS_XATTR_USER=y
+CONFIG_JBD=m
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FAT_FS=m
+CONFIG_MSDOS_FS=m
+CONFIG_UMSDOS_FS=m
+CONFIG_VFAT_FS=m
+# CONFIG_EFS_FS is not set
+# CONFIG_JFFS_FS is not set
+# CONFIG_JFFS2_FS is not set
+CONFIG_CRAMFS=m
+CONFIG_TMPFS=y
+CONFIG_RAMFS=y
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_JFS_FS=m
+CONFIG_JFS_DEBUG=y
+# CONFIG_JFS_STATISTICS is not set
+CONFIG_MINIX_FS=m
+CONFIG_VXFS_FS=m
+# CONFIG_NTFS_FS is not set
+# CONFIG_NTFS_RW is not set
+# CONFIG_HPFS_FS is not set
+CONFIG_PROC_FS=y
+# CONFIG_DEVFS_FS is not set
+# CONFIG_DEVFS_MOUNT is not set
+# CONFIG_DEVFS_DEBUG is not set
+CONFIG_DEVPTS_FS=y
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_QNX4FS_RW is not set
+CONFIG_ROMFS_FS=m
+CONFIG_EXT2_FS=y
+# CONFIG_EXT2_FS_XATTR is not set
+# CONFIG_EXT2_FS_XATTR_SHARING is not set
+# CONFIG_EXT2_FS_XATTR_USER is not set
+CONFIG_SYSV_FS=m
+CONFIG_UDF_FS=m
+CONFIG_UDF_RW=y
+CONFIG_UFS_FS=m
+# CONFIG_UFS_FS_WRITE is not set
+
+#
+# Network File Systems
+#
+CONFIG_CODA_FS=m
+CONFIG_INTERMEZZO_FS=m
+CONFIG_NFS_FS=m
+CONFIG_NFS_V3=y
+# CONFIG_ROOT_NFS is not set
+CONFIG_NFSD=m
+CONFIG_NFSD_V3=y
+# CONFIG_NFSD_TCP is not set
+CONFIG_SUNRPC=m
+CONFIG_LOCKD=m
+CONFIG_LOCKD_V4=y
+CONFIG_SMB_FS=m
+# CONFIG_SMB_NLS_DEFAULT is not set
+CONFIG_NCP_FS=m
+CONFIG_NCPFS_PACKET_SIGNING=y
+CONFIG_NCPFS_IOCTL_LOCKING=y
+CONFIG_NCPFS_STRONG=y
+CONFIG_NCPFS_NFS_NS=y
+CONFIG_NCPFS_OS2_NS=y
+CONFIG_NCPFS_SMALLDOS=y
+CONFIG_NCPFS_NLS=y
+CONFIG_NCPFS_EXTRAS=y
+CONFIG_ZISOFS_FS=y
+CONFIG_FS_MBCACHE=y
+
+#
+# Partition Types
+#
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_ACORN_PARTITION is not set
+CONFIG_OSF_PARTITION=y
+# CONFIG_AMIGA_PARTITION is not set
+# CONFIG_ATARI_PARTITION is not set
+CONFIG_MAC_PARTITION=y
+CONFIG_MSDOS_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+# CONFIG_LDM_PARTITION is not set
+CONFIG_SGI_PARTITION=y
+# CONFIG_ULTRIX_PARTITION is not set
+CONFIG_SUN_PARTITION=y
+# CONFIG_EFI_PARTITION is not set
+CONFIG_SMB_NLS=y
+CONFIG_NLS=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_737=m
+CONFIG_NLS_CODEPAGE_775=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_CODEPAGE_852=m
+CONFIG_NLS_CODEPAGE_855=m
+CONFIG_NLS_CODEPAGE_857=m
+CONFIG_NLS_CODEPAGE_860=m
+CONFIG_NLS_CODEPAGE_861=m
+CONFIG_NLS_CODEPAGE_862=m
+CONFIG_NLS_CODEPAGE_863=m
+CONFIG_NLS_CODEPAGE_864=m
+CONFIG_NLS_CODEPAGE_865=m
+CONFIG_NLS_CODEPAGE_866=m
+CONFIG_NLS_CODEPAGE_869=m
+CONFIG_NLS_CODEPAGE_936=m
+CONFIG_NLS_CODEPAGE_950=m
+CONFIG_NLS_CODEPAGE_932=m
+CONFIG_NLS_CODEPAGE_949=m
+CONFIG_NLS_CODEPAGE_874=m
+CONFIG_NLS_ISO8859_8=m
+CONFIG_NLS_CODEPAGE_1250=m
+CONFIG_NLS_CODEPAGE_1251=m
+CONFIG_NLS_ISO8859_1=m
+CONFIG_NLS_ISO8859_2=m
+CONFIG_NLS_ISO8859_3=m
+CONFIG_NLS_ISO8859_4=m
+CONFIG_NLS_ISO8859_5=m
+CONFIG_NLS_ISO8859_6=m
+CONFIG_NLS_ISO8859_7=m
+CONFIG_NLS_ISO8859_9=m
+CONFIG_NLS_ISO8859_13=m
+CONFIG_NLS_ISO8859_14=m
+CONFIG_NLS_ISO8859_15=m
+CONFIG_NLS_KOI8_R=m
+CONFIG_NLS_KOI8_U=m
+CONFIG_NLS_UTF8=m
+
+#
+# Console drivers
+#
+CONFIG_VGA_CONSOLE=y
+CONFIG_VIDEO_SELECT=y
+# CONFIG_VIDEO_IGNORE_BAD_MODE is not set
+CONFIG_MDA_CONSOLE=m
+
+#
+# Frame-buffer support
+#
+CONFIG_FB=y
+CONFIG_DUMMY_CONSOLE=y
+CONFIG_FB_RIVA=m
+CONFIG_FB_CLGEN=m
+CONFIG_FB_PM2=m
+# CONFIG_FB_PM2_FIFO_DISCONNECT is not set
+CONFIG_FB_PM2_PCI=y
+CONFIG_FB_PM3=m
+# CONFIG_FB_CYBER2000 is not set
+CONFIG_FB_VESA=y
+CONFIG_FB_VGA16=m
+CONFIG_FB_HGA=m
+CONFIG_VIDEO_SELECT=y
+CONFIG_FB_MATROX=m
+CONFIG_FB_MATROX_MILLENIUM=y
+CONFIG_FB_MATROX_MYSTIQUE=y
+# CONFIG_FB_MATROX_G450 is not set
+CONFIG_FB_MATROX_G100A=y
+CONFIG_FB_MATROX_G100=y
+CONFIG_FB_MATROX_I2C=m
+CONFIG_FB_MATROX_MAVEN=m
+# CONFIG_FB_MATROX_PROC is not set
+CONFIG_FB_MATROX_MULTIHEAD=y
+CONFIG_FB_ATY=m
+CONFIG_FB_ATY_GX=y
+CONFIG_FB_ATY_CT=y
+CONFIG_FB_ATY_CT_VAIO_LCD=y
+CONFIG_FB_RADEON=m
+CONFIG_FB_ATY128=m
+CONFIG_FB_SIS=m
+CONFIG_FB_SIS_300=y
+CONFIG_FB_SIS_315=y
+CONFIG_FB_NEOMAGIC=m
+CONFIG_FB_3DFX=m
+CONFIG_FB_VOODOO1=m
+# CONFIG_FB_TRIDENT is not set
+# CONFIG_FB_VIRTUAL is not set
+# CONFIG_FBCON_ADVANCED is not set
+CONFIG_FBCON_MFB=m
+CONFIG_FBCON_CFB8=y
+CONFIG_FBCON_CFB16=y
+CONFIG_FBCON_CFB24=y
+CONFIG_FBCON_CFB32=y
+CONFIG_FBCON_VGA_PLANES=m
+CONFIG_FBCON_HGA=m
+# CONFIG_FBCON_FONTWIDTH8_ONLY is not set
+# CONFIG_FBCON_FONTS is not set
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+
+#
+# Sound
+#
+CONFIG_SOUND=m
+CONFIG_SOUND_ALI5455=m
+CONFIG_SOUND_BT878=m
+CONFIG_SOUND_CMPCI=m
+CONFIG_SOUND_CMPCI_FM=y
+CONFIG_SOUND_CMPCI_FMIO=388
+CONFIG_SOUND_CMPCI_FMIO=388
+CONFIG_SOUND_CMPCI_MIDI=y
+CONFIG_SOUND_CMPCI_MPUIO=330
+CONFIG_SOUND_CMPCI_JOYSTICK=y
+CONFIG_SOUND_CMPCI_CM8738=y
+# CONFIG_SOUND_CMPCI_SPDIFINVERSE is not set
+CONFIG_SOUND_CMPCI_SPDIFLOOP=y
+CONFIG_SOUND_CMPCI_SPEAKERS=2
+CONFIG_SOUND_EMU10K1=m
+CONFIG_MIDI_EMU10K1=y
+CONFIG_SOUND_AUDIGY=m
+CONFIG_SOUND_FUSION=m
+CONFIG_SOUND_CS4281=m
+CONFIG_SOUND_ES1370=m
+CONFIG_SOUND_ES1371=m
+CONFIG_SOUND_ESSSOLO1=m
+CONFIG_SOUND_MAESTRO=m
+CONFIG_SOUND_MAESTRO3=m
+CONFIG_SOUND_FORTE=m
+CONFIG_SOUND_ICH=m
+CONFIG_SOUND_RME96XX=m
+CONFIG_SOUND_SONICVIBES=m
+CONFIG_SOUND_TRIDENT=m
+CONFIG_SOUND_MSNDCLAS=m
+# CONFIG_MSNDCLAS_HAVE_BOOT is not set
+CONFIG_MSNDCLAS_INIT_FILE="/etc/sound/msndinit.bin"
+CONFIG_MSNDCLAS_PERM_FILE="/etc/sound/msndperm.bin"
+CONFIG_SOUND_MSNDPIN=m
+# CONFIG_MSNDPIN_HAVE_BOOT is not set
+CONFIG_MSNDPIN_INIT_FILE="/etc/sound/pndspini.bin"
+CONFIG_MSNDPIN_PERM_FILE="/etc/sound/pndsperm.bin"
+CONFIG_SOUND_VIA82CXXX=m
+CONFIG_MIDI_VIA82CXXX=y
+CONFIG_SOUND_OSS=m
+# CONFIG_SOUND_TRACEINIT is not set
+CONFIG_SOUND_DMAP=y
+CONFIG_SOUND_AD1816=m
+CONFIG_SOUND_AD1889=m
+CONFIG_SOUND_SGALAXY=m
+CONFIG_SOUND_ADLIB=m
+CONFIG_SOUND_ACI_MIXER=m
+CONFIG_SOUND_CS4232=m
+CONFIG_SOUND_SSCAPE=m
+CONFIG_SOUND_GUS=m
+CONFIG_SOUND_GUS16=y
+CONFIG_SOUND_GUSMAX=y
+CONFIG_SOUND_VMIDI=m
+CONFIG_SOUND_TRIX=m
+CONFIG_SOUND_MSS=m
+CONFIG_SOUND_MPU401=m
+CONFIG_SOUND_NM256=m
+CONFIG_SOUND_MAD16=m
+CONFIG_MAD16_OLDCARD=y
+CONFIG_SOUND_PAS=m
+# CONFIG_PAS_JOYSTICK is not set
+CONFIG_SOUND_PSS=m
+# CONFIG_PSS_MIXER is not set
+# CONFIG_PSS_HAVE_BOOT is not set
+CONFIG_SOUND_SB=m
+CONFIG_SOUND_AWE32_SYNTH=m
+CONFIG_SOUND_WAVEFRONT=m
+CONFIG_SOUND_MAUI=m
+CONFIG_SOUND_YM3812=m
+CONFIG_SOUND_OPL3SA1=m
+CONFIG_SOUND_OPL3SA2=m
+CONFIG_SOUND_YMFPCI=m
+CONFIG_SOUND_YMFPCI_LEGACY=y
+CONFIG_SOUND_UART6850=m
+CONFIG_SOUND_AEDSP16=m
+CONFIG_SC6600=y
+CONFIG_SC6600_JOY=y
+CONFIG_SC6600_CDROM=4
+CONFIG_SC6600_CDROMBASE=0
+CONFIG_AEDSP16_SBPRO=y
+CONFIG_AEDSP16_MPU401=y
+CONFIG_SOUND_TVMIXER=m
+
+#
+# USB support
+#
+CONFIG_USB=m
+# CONFIG_USB_DEBUG is not set
+CONFIG_USB_DEVICEFS=y
+# CONFIG_USB_BANDWIDTH is not set
+CONFIG_USB_EHCI_HCD=m
+CONFIG_USB_UHCI=m
+CONFIG_USB_UHCI_ALT=m
+CONFIG_USB_OHCI=m
+CONFIG_USB_AUDIO=m
+# CONFIG_USB_EMI26 is not set
+CONFIG_USB_MIDI=m
+CONFIG_USB_STORAGE=m
+# CONFIG_USB_STORAGE_DEBUG is not set
+CONFIG_USB_STORAGE_DATAFAB=y
+CONFIG_USB_STORAGE_FREECOM=y
+CONFIG_USB_STORAGE_ISD200=y
+CONFIG_USB_STORAGE_DPCM=y
+CONFIG_USB_STORAGE_HP8200e=y
+CONFIG_USB_STORAGE_SDDR09=y
+CONFIG_USB_STORAGE_SDDR55=y
+CONFIG_USB_STORAGE_JUMPSHOT=y
+CONFIG_USB_ACM=m
+CONFIG_USB_PRINTER=m
+CONFIG_USB_HID=m
+CONFIG_USB_HIDINPUT=y
+CONFIG_USB_HIDDEV=y
+# CONFIG_USB_KBD is not set
+# CONFIG_USB_MOUSE is not set
+CONFIG_USB_AIPTEK=m
+CONFIG_USB_WACOM=m
+CONFIG_USB_POWERMATE=m
+# CONFIG_USB_DC2XX is not set
+CONFIG_USB_MDC800=m
+CONFIG_USB_SCANNER=m
+CONFIG_USB_MICROTEK=m
+CONFIG_USB_HPUSBSCSI=m
+CONFIG_USB_IBMCAM=m
+CONFIG_USB_OV511=m
+CONFIG_USB_PWC=m
+CONFIG_USB_SE401=m
+CONFIG_USB_STV680=m
+CONFIG_USB_VICAM=m
+CONFIG_USB_DSBR=m
+CONFIG_USB_DABUSB=m
+CONFIG_USB_PEGASUS=m
+CONFIG_USB_RTL8150=m
+CONFIG_USB_KAWETH=m
+CONFIG_USB_CATC=m
+CONFIG_USB_CDCETHER=m
+CONFIG_USB_USBNET=m
+CONFIG_USB_USS720=m
+
+#
+# USB Serial Converter support
+#
+CONFIG_USB_SERIAL=m
+# CONFIG_USB_SERIAL_DEBUG is not set
+CONFIG_USB_SERIAL_GENERIC=y
+CONFIG_USB_SERIAL_BELKIN=m
+CONFIG_USB_SERIAL_WHITEHEAT=m
+CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m
+CONFIG_USB_SERIAL_EMPEG=m
+CONFIG_USB_SERIAL_FTDI_SIO=m
+CONFIG_USB_SERIAL_VISOR=m
+CONFIG_USB_SERIAL_IPAQ=m
+CONFIG_USB_SERIAL_IR=m
+CONFIG_USB_SERIAL_EDGEPORT=m
+CONFIG_USB_SERIAL_EDGEPORT_TI=m
+CONFIG_USB_SERIAL_KEYSPAN_PDA=m
+CONFIG_USB_SERIAL_KEYSPAN=m
+# CONFIG_USB_SERIAL_KEYSPAN_USA28 is not set
+# CONFIG_USB_SERIAL_KEYSPAN_USA28X is not set
+CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y
+# CONFIG_USB_SERIAL_KEYSPAN_USA19 is not set
+# CONFIG_USB_SERIAL_KEYSPAN_USA18X is not set
+# CONFIG_USB_SERIAL_KEYSPAN_USA19W is not set
+CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y
+# CONFIG_USB_SERIAL_KEYSPAN_USA49W is not set
+CONFIG_USB_SERIAL_MCT_U232=m
+CONFIG_USB_SERIAL_KLSI=m
+CONFIG_USB_SERIAL_KOBIL_SCT=m
+CONFIG_USB_SERIAL_PL2303=m
+CONFIG_USB_SERIAL_CYBERJACK=m
+CONFIG_USB_SERIAL_XIRCOM=m
+CONFIG_USB_SERIAL_OMNINET=m
+CONFIG_USB_RIO500=m
+CONFIG_USB_AUERSWALD=m
+CONFIG_USB_TIGL=m
+CONFIG_USB_BRLVGER=m
+CONFIG_USB_LCD=m
+
+#
+# Additional device driver support
+#
+CONFIG_NET_BROADCOM=m
+CONFIG_CIPE=m
+CONFIG_CRYPTO_AEP=m
+CONFIG_MEGARAC=m
+CONFIG_FC_QLA2200=m
+CONFIG_FC_QLA2300=m
+CONFIG_SCSI_ISCSI=m
+
+#
+# Bluetooth support
+#
+CONFIG_BLUEZ=m
+CONFIG_BLUEZ_L2CAP=m
+CONFIG_BLUEZ_SCO=m
+CONFIG_BLUEZ_RFCOMM=m
+CONFIG_BLUEZ_RFCOMM_TTY=y
+CONFIG_BLUEZ_BNEP=m
+CONFIG_BLUEZ_BNEP_MC_FILTER=y
+CONFIG_BLUEZ_BNEP_PROTO_FILTER=y
+
+#
+# Bluetooth device drivers
+#
+CONFIG_BLUEZ_HCIUSB=m
+CONFIG_BLUEZ_USB_ZERO_PACKET=y
+CONFIG_BLUEZ_HCIUART=m
+CONFIG_BLUEZ_HCIUART_H4=y
+CONFIG_BLUEZ_HCIUART_BCSP=y
+CONFIG_BLUEZ_HCIUART_BCSP_TXCRC=y
+CONFIG_BLUEZ_HCIDTL1=m
+CONFIG_BLUEZ_HCIBT3C=m
+CONFIG_BLUEZ_HCIBLUECARD=m
+CONFIG_BLUEZ_HCIBTUART=m
+CONFIG_BLUEZ_HCIVHCI=m
+
+#
+# Profiling support
+#
+# CONFIG_PROFILING is not set
+
+#
+# Kernel hacking
+#
+CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_STACKOVERFLOW=y
+# CONFIG_DEBUG_HIGHMEM is not set
+# CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_IOVIRT is not set
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_FRAME_POINTER is not set
+CONFIG_MCL_COREDUMP=y
+CONFIG_BOOTIMG=y
+
+#
+# Library routines
+#
+CONFIG_ZLIB_INFLATE=y
+CONFIG_ZLIB_DEFLATE=y
diff --git a/lustre/kernel_patches/kernel_configs/config-linux-2.4.20-uml b/lustre/kernel_patches/kernel_configs/config-linux-2.4.20-uml
new file mode 100644 (file)
index 0000000..2d4a2d5
--- /dev/null
@@ -0,0 +1,297 @@
+#
+# Automatically generated make config: don't edit
+#
+CONFIG_USERMODE=y
+# CONFIG_ISA is not set
+# CONFIG_SBUS is not set
+# CONFIG_PCI is not set
+CONFIG_UID16=y
+CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+
+#
+# General Setup
+#
+# CONFIG_MODE_SKAS is not set
+CONFIG_MODE_TT=y
+CONFIG_MODE_TT=y
+CONFIG_NET=y
+CONFIG_SYSVIPC=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_SYSCTL=y
+# CONFIG_BINFMT_AOUT is not set
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=y
+CONFIG_HOSTFS=y
+# CONFIG_HPPFS is not set
+CONFIG_MCONSOLE=y
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_HOST_2G_2G is not set
+# CONFIG_UML_SMP is not set
+# CONFIG_SMP is not set
+CONFIG_NEST_LEVEL=0
+CONFIG_KERNEL_HALF_GIGS=1
+# CONFIG_HIGHMEM is not set
+# CONFIG_PROC_MM is not set
+CONFIG_KERNEL_STACK_ORDER=2
+CONFIG_MODE_TT=y
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+# CONFIG_KMOD is not set
+
+#
+# Character Devices
+#
+CONFIG_STDIO_CONSOLE=y
+CONFIG_SSL=y
+CONFIG_FD_CHAN=y
+# CONFIG_NULL_CHAN is not set
+CONFIG_PORT_CHAN=y
+CONFIG_PTY_CHAN=y
+CONFIG_TTY_CHAN=y
+CONFIG_XTERM_CHAN=y
+CONFIG_CON_ZERO_CHAN="fd:0,fd:1"
+CONFIG_CON_CHAN="xterm"
+CONFIG_SSL_CHAN="pty"
+CONFIG_UNIX98_PTYS=y
+CONFIG_UNIX98_PTY_COUNT=256
+# CONFIG_WATCHDOG is not set
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+# CONFIG_SOFT_WATCHDOG is not set
+# CONFIG_UML_WATCHDOG is not set
+# CONFIG_UML_SOUND is not set
+# CONFIG_SOUND is not set
+# CONFIG_HOSTAUDIO is not set
+# CONFIG_TTY_LOG is not set
+
+#
+# Block Devices
+#
+CONFIG_BLK_DEV_UBD=y
+# CONFIG_BLK_DEV_UBD_SYNC is not set
+CONFIG_BLK_DEV_LOOP=y
+# CONFIG_BLK_DEV_NBD is not set
+# CONFIG_BLK_DEV_RAM is not set
+# CONFIG_BLK_DEV_INITRD is not set
+# CONFIG_MMAPPER is not set
+CONFIG_NETDEVICES=y
+
+#
+# Network Devices
+#
+CONFIG_UML_NET=y
+CONFIG_UML_NET_ETHERTAP=y
+CONFIG_UML_NET_TUNTAP=y
+CONFIG_UML_NET_SLIP=y
+# CONFIG_UML_NET_SLIRP is not set
+CONFIG_UML_NET_DAEMON=y
+CONFIG_UML_NET_MCAST=y
+# CONFIG_UML_NET_PCAP is not set
+CONFIG_DUMMY=y
+# CONFIG_BONDING is not set
+# CONFIG_EQUALIZER is not set
+CONFIG_TUN=y
+CONFIG_PPP=y
+# CONFIG_PPP_MULTILINK is not set
+# CONFIG_PPP_FILTER is not set
+# CONFIG_PPP_ASYNC is not set
+# CONFIG_PPP_SYNC_TTY is not set
+# CONFIG_PPP_DEFLATE is not set
+# CONFIG_PPP_BSDCOMP is not set
+# CONFIG_PPPOE is not set
+CONFIG_SLIP=y
+# CONFIG_SLIP_COMPRESSED is not set
+# CONFIG_SLIP_SMART is not set
+# CONFIG_SLIP_MODE_SLIP6 is not set
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+# CONFIG_NETLINK_DEV is not set
+# CONFIG_NETFILTER is not set
+# CONFIG_FILTER is not set
+CONFIG_UNIX=y
+CONFIG_INET=y
+# CONFIG_IP_MULTICAST is not set
+# CONFIG_IP_ADVANCED_ROUTER is not set
+# CONFIG_IP_PNP is not set
+# CONFIG_NET_IPIP is not set
+# CONFIG_NET_IPGRE is not set
+# CONFIG_ARPD is not set
+# CONFIG_INET_ECN is not set
+# CONFIG_SYN_COOKIES is not set
+# CONFIG_IPV6 is not set
+# CONFIG_KHTTPD is not set
+# CONFIG_ATM is not set
+# CONFIG_VLAN_8021Q is not set
+
+#
+#  
+#
+# CONFIG_IPX is not set
+# CONFIG_ATALK is not set
+
+#
+# Appletalk devices
+#
+# CONFIG_DEV_APPLETALK is not set
+# CONFIG_DECNET is not set
+# CONFIG_BRIDGE is not set
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_LLC is not set
+# CONFIG_NET_DIVERT is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+# CONFIG_NET_FASTROUTE is not set
+# CONFIG_NET_HW_FLOWCONTROL is not set
+
+#
+# QoS and/or fair queueing
+#
+# CONFIG_NET_SCHED is not set
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+
+#
+# File systems
+#
+CONFIG_QUOTA=y
+# CONFIG_AUTOFS_FS is not set
+CONFIG_AUTOFS4_FS=m
+CONFIG_REISERFS_FS=y
+# CONFIG_REISERFS_CHECK is not set
+# CONFIG_REISERFS_PROC_INFO is not set
+# CONFIG_ADFS_FS is not set
+# CONFIG_ADFS_FS_RW is not set
+# CONFIG_AFFS_FS is not set
+# CONFIG_HFS_FS is not set
+# CONFIG_BEFS_FS is not set
+# CONFIG_BEFS_DEBUG is not set
+# CONFIG_BFS_FS is not set
+CONFIG_EXT3_FS=y
+CONFIG_EXT3_FS_XATTR=y
+# CONFIG_EXT3_FS_XATTR_SHARING is not set
+# CONFIG_EXT3_FS_XATTR_USER is not set
+CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
+# CONFIG_FAT_FS is not set
+# CONFIG_MSDOS_FS is not set
+# CONFIG_UMSDOS_FS is not set
+# CONFIG_VFAT_FS is not set
+# CONFIG_EFS_FS is not set
+# CONFIG_JFFS_FS is not set
+# CONFIG_JFFS2_FS is not set
+# CONFIG_CRAMFS is not set
+CONFIG_TMPFS=y
+CONFIG_RAMFS=y
+# CONFIG_ISO9660_FS is not set
+# CONFIG_JOLIET is not set
+# CONFIG_ZISOFS is not set
+# CONFIG_JFS_FS is not set
+# CONFIG_JFS_DEBUG is not set
+# CONFIG_JFS_STATISTICS is not set
+# CONFIG_MINIX_FS is not set
+# CONFIG_VXFS_FS is not set
+# CONFIG_NTFS_FS is not set
+# CONFIG_NTFS_RW is not set
+# CONFIG_HPFS_FS is not set
+CONFIG_PROC_FS=y
+CONFIG_DEVFS_FS=y
+CONFIG_DEVFS_MOUNT=y
+# CONFIG_DEVFS_DEBUG is not set
+CONFIG_DEVPTS_FS=y
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_QNX4FS_RW is not set
+# CONFIG_ROMFS_FS is not set
+CONFIG_EXT2_FS=y
+# CONFIG_EXT2_FS_XATTR is not set
+# CONFIG_EXT2_FS_XATTR_SHARING is not set
+# CONFIG_EXT2_FS_XATTR_USER is not set
+# CONFIG_SYSV_FS is not set
+# CONFIG_UDF_FS is not set
+# CONFIG_UDF_RW is not set
+# CONFIG_UFS_FS is not set
+# CONFIG_UFS_FS_WRITE is not set
+
+#
+# Network File Systems
+#
+# CONFIG_CODA_FS is not set
+# CONFIG_INTERMEZZO_FS is not set
+# CONFIG_NFS_FS is not set
+# CONFIG_NFS_V3 is not set
+# CONFIG_ROOT_NFS is not set
+# CONFIG_NFSD is not set
+# CONFIG_NFSD_V3 is not set
+# CONFIG_NFSD_TCP is not set
+# CONFIG_SUNRPC is not set
+# CONFIG_LOCKD is not set
+# CONFIG_SMB_FS is not set
+# CONFIG_NCP_FS is not set
+# CONFIG_NCPFS_PACKET_SIGNING is not set
+# CONFIG_NCPFS_IOCTL_LOCKING is not set
+# CONFIG_NCPFS_STRONG is not set
+# CONFIG_NCPFS_NFS_NS is not set
+# CONFIG_NCPFS_OS2_NS is not set
+# CONFIG_NCPFS_SMALLDOS is not set
+# CONFIG_NCPFS_NLS is not set
+# CONFIG_NCPFS_EXTRAS is not set
+# CONFIG_ZISOFS_FS is not set
+CONFIG_FS_MBCACHE=y
+
+#
+# Partition Types
+#
+# CONFIG_PARTITION_ADVANCED is not set
+CONFIG_MSDOS_PARTITION=y
+# CONFIG_SMB_NLS is not set
+# CONFIG_NLS is not set
+
+#
+# SCSI support
+#
+# CONFIG_SCSI is not set
+
+#
+# Multi-device support (RAID and LVM)
+#
+# CONFIG_MD is not set
+# CONFIG_BLK_DEV_MD is not set
+# CONFIG_MD_LINEAR is not set
+# CONFIG_MD_RAID0 is not set
+# CONFIG_MD_RAID1 is not set
+# CONFIG_MD_RAID5 is not set
+# CONFIG_MD_MULTIPATH is not set
+# CONFIG_BLK_DEV_LVM is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Library routines
+#
+# CONFIG_ZLIB_INFLATE is not set
+# CONFIG_ZLIB_DEFLATE is not set
+
+#
+# Kernel hacking
+#
+CONFIG_DEBUG_SLAB=y
+CONFIG_DEBUGSYM=y
+CONFIG_PT_PROXY=y
+# CONFIG_GPROF is not set
+# CONFIG_GCOV is not set
diff --git a/lustre/kernel_patches/kernel_configs/jdike-2.5.69-uml.config b/lustre/kernel_patches/kernel_configs/jdike-2.5.69-uml.config
new file mode 100644 (file)
index 0000000..4aa8a2c
--- /dev/null
@@ -0,0 +1,321 @@
+#
+# Automatically generated make config: don't edit
+#
+CONFIG_USERMODE=y
+CONFIG_MMU=y
+CONFIG_UID16=y
+CONFIG_RWSEM_GENERIC_SPINLOCK=y
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+
+#
+# General setup
+#
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_SYSCTL=y
+CONFIG_LOG_BUF_SHIFT=14
+
+#
+# Loadable module support
+#
+# CONFIG_MODULES is not set
+
+#
+# UML-specific options
+#
+CONFIG_MODE_TT=y
+# CONFIG_MODE_SKAS is not set
+CONFIG_NET=y
+CONFIG_HOSTFS=y
+# CONFIG_HPPFS is not set
+CONFIG_MCONSOLE=y
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_HOST_2G_2G is not set
+# CONFIG_UML_SMP is not set
+# CONFIG_SMP is not set
+CONFIG_NEST_LEVEL=0
+CONFIG_KERNEL_HALF_GIGS=1
+# CONFIG_HIGHMEM is not set
+# CONFIG_PROC_MM is not set
+CONFIG_KERNEL_STACK_ORDER=3
+
+#
+# Executable file formats
+#
+# CONFIG_BINFMT_AOUT is not set
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=y
+
+#
+# Character Devices
+#
+CONFIG_STDIO_CONSOLE=y
+CONFIG_SSL=y
+CONFIG_FD_CHAN=y
+# CONFIG_NULL_CHAN is not set
+CONFIG_PORT_CHAN=y
+CONFIG_PTY_CHAN=y
+CONFIG_TTY_CHAN=y
+CONFIG_XTERM_CHAN=y
+CONFIG_CON_ZERO_CHAN="fd:0,fd:1"
+CONFIG_CON_CHAN="xterm"
+CONFIG_SSL_CHAN="pty"
+CONFIG_UNIX98_PTYS=y
+CONFIG_UNIX98_PTY_COUNT=256
+# CONFIG_WATCHDOG is not set
+# CONFIG_UML_SOUND is not set
+# CONFIG_SOUND is not set
+# CONFIG_HOSTAUDIO is not set
+
+#
+# Block Devices
+#
+CONFIG_BLK_DEV_UBD=y
+# CONFIG_BLK_DEV_UBD_SYNC is not set
+CONFIG_BLK_DEV_LOOP=y
+# CONFIG_BLK_DEV_NBD is not set
+# CONFIG_BLK_DEV_RAM is not set
+# CONFIG_MMAPPER is not set
+CONFIG_NETDEVICES=y
+
+#
+# Networking support
+#
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+# CONFIG_NETLINK_DEV is not set
+# CONFIG_NETFILTER is not set
+CONFIG_UNIX=y
+# CONFIG_NET_KEY is not set
+CONFIG_INET=y
+# CONFIG_IP_MULTICAST is not set
+# CONFIG_IP_ADVANCED_ROUTER is not set
+# CONFIG_IP_PNP is not set
+# CONFIG_NET_IPIP is not set
+# CONFIG_NET_IPGRE is not set
+# CONFIG_ARPD is not set
+# CONFIG_INET_ECN is not set
+# CONFIG_SYN_COOKIES is not set
+# CONFIG_INET_AH is not set
+# CONFIG_INET_ESP is not set
+# CONFIG_INET_IPCOMP is not set
+# CONFIG_IPV6 is not set
+# CONFIG_XFRM_USER is not set
+
+#
+# SCTP Configuration (EXPERIMENTAL)
+#
+CONFIG_IPV6_SCTP__=y
+# CONFIG_IP_SCTP is not set
+# CONFIG_ATM is not set
+# CONFIG_VLAN_8021Q is not set
+# CONFIG_LLC is not set
+# CONFIG_DECNET is not set
+# CONFIG_BRIDGE is not set
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_NET_DIVERT is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+# CONFIG_NET_FASTROUTE is not set
+# CONFIG_NET_HW_FLOWCONTROL is not set
+
+#
+# QoS and/or fair queueing
+#
+# CONFIG_NET_SCHED is not set
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+CONFIG_DUMMY=y
+# CONFIG_BONDING is not set
+# CONFIG_EQUALIZER is not set
+CONFIG_TUN=y
+# CONFIG_ETHERTAP is not set
+
+#
+# Ethernet (10 or 100Mbit)
+#
+# CONFIG_NET_ETHERNET is not set
+
+#
+# Ethernet (1000 Mbit)
+#
+
+#
+# Ethernet (10000 Mbit)
+#
+CONFIG_PPP=y
+# CONFIG_PPP_MULTILINK is not set
+# CONFIG_PPP_FILTER is not set
+# CONFIG_PPP_ASYNC is not set
+# CONFIG_PPP_SYNC_TTY is not set
+# CONFIG_PPP_DEFLATE is not set
+# CONFIG_PPP_BSDCOMP is not set
+# CONFIG_PPPOE is not set
+CONFIG_SLIP=y
+# CONFIG_SLIP_COMPRESSED is not set
+# CONFIG_SLIP_SMART is not set
+# CONFIG_SLIP_MODE_SLIP6 is not set
+
+#
+# Wireless LAN (non-hamradio)
+#
+# CONFIG_NET_RADIO is not set
+
+#
+# Token Ring devices (depends on LLC=y)
+#
+# CONFIG_SHAPER is not set
+
+#
+# Wan interfaces
+#
+# CONFIG_WAN is not set
+
+#
+# UML Network Devices
+#
+CONFIG_UML_NET=y
+CONFIG_UML_NET_ETHERTAP=y
+CONFIG_UML_NET_TUNTAP=y
+CONFIG_UML_NET_SLIP=y
+CONFIG_UML_NET_DAEMON=y
+CONFIG_UML_NET_MCAST=y
+# CONFIG_UML_NET_PCAP is not set
+# CONFIG_UML_NET_SLIRP is not set
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+# CONFIG_EXT2_FS_XATTR is not set
+CONFIG_EXT3_FS=y
+CONFIG_EXT3_FS_XATTR=y
+# CONFIG_EXT3_FS_POSIX_ACL is not set
+CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+CONFIG_REISERFS_FS=y
+# CONFIG_REISERFS_CHECK is not set
+# CONFIG_REISERFS_PROC_INFO is not set
+# CONFIG_JFS_FS is not set
+# CONFIG_XFS_FS is not set
+# CONFIG_MINIX_FS is not set
+# CONFIG_ROMFS_FS is not set
+CONFIG_QUOTA=y
+# CONFIG_QFMT_V1 is not set
+# CONFIG_QFMT_V2 is not set
+CONFIG_QUOTACTL=y
+# CONFIG_AUTOFS_FS is not set
+CONFIG_AUTOFS4_FS=y
+
+#
+# CD-ROM/DVD Filesystems
+#
+# CONFIG_ISO9660_FS is not set
+# CONFIG_UDF_FS is not set
+
+#
+# DOS/FAT/NT Filesystems
+#
+# CONFIG_FAT_FS is not set
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_DEVFS_FS=y
+CONFIG_DEVFS_MOUNT=y
+# CONFIG_DEVFS_DEBUG is not set
+CONFIG_DEVPTS_FS=y
+CONFIG_TMPFS=y
+CONFIG_RAMFS=y
+
+#
+# Miscellaneous filesystems
+#
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+# CONFIG_HFS_FS is not set
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+# CONFIG_CRAMFS is not set
+# CONFIG_VXFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_SYSV_FS is not set
+# CONFIG_UFS_FS is not set
+
+#
+# Network File Systems
+#
+# CONFIG_NFS_FS is not set
+# CONFIG_NFSD is not set
+# CONFIG_EXPORTFS is not set
+# CONFIG_SMB_FS is not set
+# CONFIG_CIFS is not set
+# CONFIG_NCP_FS is not set
+# CONFIG_CODA_FS is not set
+# CONFIG_INTERMEZZO_FS is not set
+# CONFIG_AFS_FS is not set
+
+#
+# Partition Types
+#
+# CONFIG_PARTITION_ADVANCED is not set
+CONFIG_MSDOS_PARTITION=y
+
+#
+# Security options
+#
+# CONFIG_SECURITY is not set
+
+#
+# Cryptographic options
+#
+# CONFIG_CRYPTO is not set
+
+#
+# Library routines
+#
+CONFIG_CRC32=y
+
+#
+# SCSI support
+#
+# CONFIG_SCSI is not set
+
+#
+# Multi-device support (RAID and LVM)
+#
+# CONFIG_MD is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Kernel hacking
+#
+CONFIG_DEBUG_SLAB=y
+CONFIG_DEBUG_SPINLOCK=y
+CONFIG_DEBUGSYM=y
+CONFIG_FRAME_POINTER=y
+CONFIG_PT_PROXY=y
+# CONFIG_GPROF is not set
+# CONFIG_GCOV is not set
diff --git a/lustre/kernel_patches/patches/dev_read_only_2.4.20-rh.patch b/lustre/kernel_patches/patches/dev_read_only_2.4.20-rh.patch
new file mode 100644 (file)
index 0000000..55057d9
--- /dev/null
@@ -0,0 +1,77 @@
+
+
+
+ drivers/block/blkpg.c  |   35 +++++++++++++++++++++++++++++++++++
+ drivers/block/loop.c   |    3 +++
+ drivers/ide/ide-disk.c |    5 ++++-
+ 3 files changed, 42 insertions(+), 1 deletion(-)
+
+--- rh-2.4.20/drivers/block/blkpg.c~dev_read_only_2.4.20       2003-04-11 14:05:03.000000000 +0800
++++ rh-2.4.20-root/drivers/block/blkpg.c       2003-04-12 13:11:31.000000000 +0800
+@@ -297,3 +297,38 @@ int blk_ioctl(kdev_t dev, unsigned int c
+ }
+ EXPORT_SYMBOL(blk_ioctl);
++
++#define NUM_DEV_NO_WRITE 16
++static int dev_no_write[NUM_DEV_NO_WRITE];
++
++/*
++ * Debug code for turning block devices "read-only" (will discard writes
++ * silently).  This is for filesystem crash/recovery testing.
++ */
++void dev_set_rdonly(kdev_t dev, int no_write)
++{
++      if (dev) {
++              printk(KERN_WARNING "Turning device %s read-only\n",
++                     bdevname(dev));
++              dev_no_write[no_write] = 0xdead0000 + dev;
++      }
++}
++
++int dev_check_rdonly(kdev_t dev) {
++      int i;
++
++      for (i = 0; i < NUM_DEV_NO_WRITE; i++) {
++              if ((dev_no_write[i] & 0xffff0000) == 0xdead0000 &&
++                  dev == (dev_no_write[i] & 0xffff))
++                      return 1;
++      }
++      return 0;
++}
++
++void dev_clear_rdonly(int no_write) {
++      dev_no_write[no_write] = 0;
++}
++
++EXPORT_SYMBOL(dev_set_rdonly);
++EXPORT_SYMBOL(dev_check_rdonly);
++EXPORT_SYMBOL(dev_clear_rdonly);
+--- rh-2.4.20/drivers/block/loop.c~dev_read_only_2.4.20        2003-04-11 14:05:08.000000000 +0800
++++ rh-2.4.20-root/drivers/block/loop.c        2003-04-12 13:11:31.000000000 +0800
+@@ -491,6 +491,9 @@ static int loop_make_request(request_que
+       spin_unlock_irq(&lo->lo_lock);
+       if (rw == WRITE) {
++              if (dev_check_rdonly(rbh->b_rdev))
++                      goto err;
++
+               if (lo->lo_flags & LO_FLAGS_READ_ONLY)
+                       goto err;
+       } else if (rw == READA) {
+--- rh-2.4.20/drivers/ide/ide-disk.c~dev_read_only_2.4.20      2003-04-11 14:04:53.000000000 +0800
++++ rh-2.4.20-root/drivers/ide/ide-disk.c      2003-04-12 13:14:48.000000000 +0800
+@@ -381,7 +381,10 @@ static ide_startstop_t do_rw_disk (ide_d
+       if (IS_PDC4030_DRIVE)
+               return promise_rw_disk(drive, rq, block);
+ #endif /* CONFIG_BLK_DEV_PDC4030 */
+-
++      if (rq->cmd == WRITE && dev_check_rdonly(rq->rq_dev)) {
++              ide_end_request(1, HWGROUP(drive));
++              return ide_stopped;
++      }
+       if (IDE_CONTROL_REG)
+               hwif->OUTB(drive->ctl, IDE_CONTROL_REG);
+
+_
@@ -1,14 +1,16 @@
- drivers/block/blkpg.c  |   3++++++++++++++++++++++++++++++++++++++
- drivers/block/loop.c   |    5 +++++
- drivers/ide/ide-disk.c |    6 ++++++
- 3 files changed, 49 insertions(+)
+ drivers/block/blkpg.c  |   39 +++++++++++++++++++++++++++++++++++++++
+ drivers/block/loop.c   |    3 +++
+ drivers/ide/ide-disk.c |    4 ++++
+ 3 files changed, 46 insertions(+)
 
---- linux-2.4.19-hp2_pnnl2/drivers/block/blkpg.c~dev_read_only_hp      Sun Jan 19 18:51:12 2003
-+++ linux-2.4.19-hp2_pnnl2-root/drivers/block/blkpg.c  Sun Jan 19 18:52:28 2003
-@@ -310,6 +310,42 @@ int blk_ioctl(kdev_t dev, unsigned int c
+--- linux-2.4.20/drivers/block/blkpg.c~dev_read_only_hp        2003-04-09 15:14:54.000000000 -0600
++++ linux-2.4.20-braam/drivers/block/blkpg.c   2003-04-09 15:37:02.000000000 -0600
+@@ -296,3 +296,42 @@ int blk_ioctl(kdev_t dev, unsigned int c
+ }
  
  EXPORT_SYMBOL(blk_ioctl);
++
++  
 +
 +#define NUM_DEV_NO_WRITE 16
 +static int dev_no_write[NUM_DEV_NO_WRITE];
 +EXPORT_SYMBOL(dev_check_rdonly);
 +EXPORT_SYMBOL(dev_clear_rdonly);
 +
- /**
-  * get_last_sector()
-  *  
---- linux-2.4.19-hp2_pnnl2/drivers/block/loop.c~dev_read_only_hp       Sun Jan 19 18:51:12 2003
-+++ linux-2.4.19-hp2_pnnl2-root/drivers/block/loop.c   Sun Jan 19 18:51:12 2003
++
+--- linux-2.4.20/drivers/block/loop.c~dev_read_only_hp 2003-04-09 15:14:54.000000000 -0600
++++ linux-2.4.20-braam/drivers/block/loop.c    2003-04-09 15:37:02.000000000 -0600
 @@ -474,6 +474,9 @@ static int loop_make_request(request_que
        spin_unlock_irq(&lo->lo_lock);
  
@@ -60,9 +60,9 @@
                if (lo->lo_flags & LO_FLAGS_READ_ONLY)
                        goto err;
        } else if (rw == READA) {
---- linux-2.4.19-hp2_pnnl2/drivers/ide/ide-disk.c~dev_read_only_hp     Sun Jan 19 18:51:12 2003
-+++ linux-2.4.19-hp2_pnnl2-root/drivers/ide/ide-disk.c Sun Jan 19 18:51:12 2003
-@@ -551,6 +551,10 @@ static ide_startstop_t lba_48_rw_disk (i
+--- linux-2.4.20/drivers/ide/ide-disk.c~dev_read_only_hp       2003-04-09 15:14:54.000000000 -0600
++++ linux-2.4.20-braam/drivers/ide/ide-disk.c  2003-04-09 15:37:02.000000000 -0600
+@@ -558,6 +558,10 @@ static ide_startstop_t lba_48_rw_disk (i
   */
  static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block)
  {
diff --git a/lustre/kernel_patches/patches/dev_read_only_hp_2.4.20.patch b/lustre/kernel_patches/patches/dev_read_only_hp_2.4.20.patch
new file mode 100644 (file)
index 0000000..60081db
--- /dev/null
@@ -0,0 +1,77 @@
+ drivers/block/blkpg.c  |   36 ++++++++++++++++++++++++++++++++++++
+ drivers/block/loop.c   |    3 +++
+ drivers/ide/ide-disk.c |    4 ++++
+ 3 files changed, 43 insertions(+)
+
+--- linux/drivers/block/blkpg.c~dev_read_only_hp_2.4.20        Mon May 19 07:07:52 2003
++++ linux-mmonroe/drivers/block/blkpg.c        Mon May 19 07:37:22 2003
+@@ -310,6 +310,42 @@ int blk_ioctl(kdev_t dev, unsigned int c
+ EXPORT_SYMBOL(blk_ioctl);
++
++#define NUM_DEV_NO_WRITE 16
++static int dev_no_write[NUM_DEV_NO_WRITE];
++/*
++ * Debug code for turning block devices "read-only" (will discard writes
++ * silently).  This is for filesystem crash/recovery testing.
++ */
++void dev_set_rdonly(kdev_t dev, int no_write)
++{
++      if (dev) {
++              printk(KERN_WARNING "Turning device %s read-only\n",
++                     bdevname(dev));
++              dev_no_write[no_write] = 0xdead0000 + dev;
++      }
++}
++
++int dev_check_rdonly(kdev_t dev) {
++      int i;
++
++      for (i = 0; i < NUM_DEV_NO_WRITE; i++) {
++      if ((dev_no_write[i] & 0xffff0000) == 0xdead0000 &&
++              dev == (dev_no_write[i] & 0xffff))
++              return 1;
++      }
++      return 0;
++}
++
++void dev_clear_rdonly(int no_write) {
++      dev_no_write[no_write] = 0;
++}
++
++EXPORT_SYMBOL(dev_set_rdonly);
++EXPORT_SYMBOL(dev_check_rdonly);
++EXPORT_SYMBOL(dev_clear_rdonly);
++
++
+ /**
+  * get_last_sector()
+  *  
+--- linux/drivers/block/loop.c~dev_read_only_hp_2.4.20 Thu Nov 28 15:53:12 2002
++++ linux-mmonroe/drivers/block/loop.c Mon May 19 07:28:29 2003
+@@ -474,6 +474,9 @@ static int loop_make_request(request_que
+       spin_unlock_irq(&lo->lo_lock);
+       if (rw == WRITE) {
++              if (dev_check_rdonly(rbh->b_rdev))
++                      goto err;
++
+               if (lo->lo_flags & LO_FLAGS_READ_ONLY)
+                       goto err;
+       } else if (rw == READA) {
+--- linux/drivers/ide/ide-disk.c~dev_read_only_hp_2.4.20       Thu Nov 28 15:53:13 2002
++++ linux-mmonroe/drivers/ide/ide-disk.c       Mon May 19 07:28:29 2003
+@@ -558,6 +558,10 @@ static ide_startstop_t lba_48_rw_disk (i
+  */
+ static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block)
+ {
++      if (rq->cmd == WRITE && dev_check_rdonly(rq->rq_dev)) {
++              ide_end_request(1, HWGROUP(drive));
++              return ide_stopped;
++      }
+       if (IDE_CONTROL_REG)
+               OUT_BYTE(drive->ctl,IDE_CONTROL_REG);
+
+_
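
The dev_read_only patches above all add the same three debug hooks: dev_set_rdonly() marks a block device so that WRITE requests are silently discarded in loop_make_request() and do_rw_disk(), dev_check_rdonly() reports whether a device is currently marked, and dev_clear_rdonly() re-enables writes for a slot. As the in-patch comment notes, this is intended for filesystem crash/recovery testing. A minimal sketch of how a test module might drive these hooks follows; the slot number and the wrapper function are illustrative assumptions, not part of this commit.

/* Hypothetical test driver for the dev_set_rdonly()/dev_check_rdonly()
 * hooks added by the patches above.  The slot index and the wrapper are
 * assumptions for illustration; only the three extern hooks come from
 * this commit. */
#include <linux/kdev_t.h>

extern void dev_set_rdonly(kdev_t dev, int no_write);
extern int  dev_check_rdonly(kdev_t dev);
extern void dev_clear_rdonly(int no_write);

#define CRASH_TEST_SLOT 0	/* assumed free slot in dev_no_write[] */

static void crash_test_window(kdev_t dev)
{
	/* From here on, WRITE requests against 'dev' are silently dropped,
	 * simulating a disk that stopped committing data at this point. */
	dev_set_rdonly(dev, CRASH_TEST_SLOT);

	/* ... run filesystem operations whose writes must not reach disk,
	 * then exercise recovery against the unmodified on-disk state ... */

	if (dev_check_rdonly(dev))
		dev_clear_rdonly(CRASH_TEST_SLOT);	/* writes reach the disk again */
}

Note that the hooks discard writes rather than failing them, so the filesystem under test keeps running normally until the simulated "crash" state is examined.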
diff --git a/lustre/kernel_patches/patches/dsp.patch b/lustre/kernel_patches/patches/dsp.patch
new file mode 100644 (file)
index 0000000..f2e5b93
--- /dev/null
@@ -0,0 +1,130 @@
+ arch/i386/kernel/crash.c |   24 +++++++++++++++++-------
+ arch/i386/kernel/nmi.c   |    2 +-
+ include/asm-i386/apic.h  |    1 +
+ include/linux/crash.h    |    2 +-
+ kernel/bootimg.c         |   13 ++++++++++++-
+ kernel/bootimg_pic.c     |    6 ++++--
+ 6 files changed, 36 insertions(+), 12 deletions(-)
+
+--- linux-rh-2.4.20-8/kernel/bootimg.c~dsp     2003-05-07 19:30:47.000000000 +0800
++++ linux-rh-2.4.20-8-root/kernel/bootimg.c    2003-05-07 19:31:12.000000000 +0800
+@@ -238,9 +238,20 @@ int boot_image()
+       int error = -ENOMEM;
+       if (bootimg_checksum(__va(bootimg_dsc.page_dir),bootimg_dsc.pages) 
+-              != bootimg_dsc.csum)
++              != bootimg_dsc.csum) {
+               printk("Checksum of kernel image failed.  Rebooting via BIOS\n");
++              /* Before calling machine_restart(), make sure it will not
++               * simply call this function recursively.
++               */
++              bootimg_dsc.page_dir = NULL;
++              machine_restart(NULL);
++
++              /* We should never get here, but just in case... */
++              for (; ; )
++                      __asm__ __volatile__ ("hlt");
++      }
++
+       code_page = get_identity_mapped_page();
+       if (!code_page) goto out3;
+       code = (relocate_and_jump_t) virt_to_phys((void *) code_page);
+--- linux-rh-2.4.20-8/kernel/bootimg_pic.c~dsp 2003-05-07 19:30:47.000000000 +0800
++++ linux-rh-2.4.20-8-root/kernel/bootimg_pic.c        2003-05-07 19:31:12.000000000 +0800
+@@ -69,7 +69,8 @@ void __bootimg relocate_and_jump(void)
+                       for (j = i+1; j < dsc.pages; j++) {
+                               table = dsc.page_dir+FROM_TABLE(j);
+                               if (((unsigned long) *table) == to) {
+-                                      copy_and_swap(*table,dsc.scratch);
++                                      copy_and_swap((unsigned long) (*table),
++                                                    dsc.scratch);
+                                       break;
+                               }
+                               if ((*table)[PAGE_NR(j)] == to) {
+@@ -79,7 +80,8 @@ void __bootimg relocate_and_jump(void)
+                               }
+                               table = dsc.page_dir+TO_TABLE(j);
+                               if (((unsigned long) *table) == to) {
+-                                      copy_and_swap(*table,dsc.scratch);
++                                      copy_and_swap((unsigned long) (*table),
++                                                    dsc.scratch);
+                                       break;
+                               }
+                       }
+--- linux-rh-2.4.20-8/include/asm-i386/apic.h~dsp      2003-05-07 17:00:16.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/asm-i386/apic.h     2003-05-07 19:31:12.000000000 +0800
+@@ -86,6 +86,7 @@ extern struct pm_dev *apic_pm_register(p
+ extern void apic_pm_unregister(struct pm_dev*);
+ extern int check_nmi_watchdog (void);
++extern void disable_apic_nmi_watchdog(void);
+ extern unsigned int nmi_watchdog;
+ #define NMI_NONE      0
+--- linux-rh-2.4.20-8/include/linux/crash.h~dsp        2003-05-07 19:30:47.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/linux/crash.h       2003-05-07 19:31:12.000000000 +0800
+@@ -71,7 +71,7 @@ extern void stop_this_cpu(void *);
+ #define CRASH_ZALLOC_PAGES 16*5*2     /* 2 to handle crash in crash */
+ #define CRASH_LOW_WATER_PAGES 100
+-#define CRASH_CPU_TIMEOUT 5000        /* 5 sec wait for other cpus to stop */
++#define CRASH_CPU_TIMEOUT 15000       /* 15 sec wait for other cpus to stop */
+ #define CRASH_MARK_RESERVED(addr) (set_bit(PG_reserved,&mem_map[MAP_NR(addr)].flags))
+ #define CRASH_CLEAR_RESERVED(addr) (clear_bit(PG_reserved,&mem_map[MAP_NR(addr)].flags))
+--- linux-rh-2.4.20-8/arch/i386/kernel/crash.c~dsp     2003-05-07 19:30:47.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/i386/kernel/crash.c    2003-05-07 19:31:39.000000000 +0800
+@@ -9,6 +9,8 @@
+ #include <linux/crash.h>
+ #include <linux/reboot.h>
+ #include <linux/bootimg.h>
++#include <asm/fixmap.h>
++#include <asm/apic.h>
+ inline void crash_save_regs(void) {
+       static unsigned long regs[8];
+@@ -30,15 +32,23 @@ inline void crash_save_regs(void) {
+  */
+ void crash_save_current_state(struct task_struct *tp)
+ {
++      if (tp != NULL) {
++              /*
++               *  Here we save ebp instead of esp just in case the compiler
++               *  decides to put an extra push in before we execute this
++               *  instruction (thus invalidating our frame pointer).
++               */
++              asm volatile("movl %%ebp,%0":"=m" (*(u_long *)&tp->thread.esp));
++              tp->thread.eip = (u_long)crash_save_current_state;
++              panic_ksp[smp_processor_id()] = tp->thread.esp;
++              mb();
++      }
++
+       /*
+-       *  Here we save ebp instead of esp just in case the compiler
+-       *  decides to put an extra push in before we execute this
+-       *  instruction (thus invalidating our frame pointer).
++       * Just to be safe, disable the NMI watchdog on the calling CPU so it
++       * doesn't get in the way while we are trying to save a dump.
+        */
+-      asm volatile("movl %%ebp,%0":"=m" (*(u_long *)&tp->thread.esp));
+-      tp->thread.eip = (u_long)crash_save_current_state;
+-      panic_ksp[smp_processor_id()] = tp->thread.esp;
+-      mb();
++      disable_apic_nmi_watchdog();
+       save_core();
+--- linux-rh-2.4.20-8/arch/i386/kernel/nmi.c~dsp       2003-05-07 19:30:47.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/i386/kernel/nmi.c      2003-05-07 19:31:12.000000000 +0800
+@@ -138,7 +138,7 @@ __setup("nmi_watchdog=", setup_nmi_watch
+ struct pm_dev *nmi_pmdev;
+-static void disable_apic_nmi_watchdog(void)
++void disable_apic_nmi_watchdog(void)
+ {
+       switch (boot_cpu_data.x86_vendor) {
+       case X86_VENDOR_AMD:
+
+_
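For reference, the control flow added by the bootimg checksum hunk above can be sketched in ordinary user-space C: verify a checksum over the saved image and, on mismatch, clear the descriptor that would otherwise send the restart path straight back into this code, then reboot. Everything below (the descriptor type, checksum_pages(), reboot_via_bios()) is a hypothetical stand-in for the kernel-side bootimg_dsc/bootimg_checksum()/machine_restart() machinery, not the real API; it only illustrates the guard-before-restart pattern.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for the kernel-side descriptor and helpers. */
struct bootimg_desc {
	unsigned long *page_dir;   /* table of saved pages; NULL disables reuse */
	unsigned long  pages;
	unsigned long  csum;
};

static unsigned long checksum_pages(const unsigned long *dir, unsigned long pages)
{
	unsigned long sum = 0, i;

	for (i = 0; i < pages; i++)
		sum += dir[i];
	return sum;
}

static void reboot_via_bios(void)
{
	/* Stand-in for machine_restart(NULL); in the kernel this never returns. */
	fprintf(stderr, "rebooting via BIOS\n");
	exit(1);
}

static void verify_or_reboot(struct bootimg_desc *dsc)
{
	if (checksum_pages(dsc->page_dir, dsc->pages) != dsc->csum) {
		fprintf(stderr, "Checksum of kernel image failed.\n");
		/* Clear the descriptor first so the restart path cannot
		 * recurse back into the boot-image code, as in the patch. */
		dsc->page_dir = NULL;
		reboot_via_bios();
	}
}

int main(void)
{
	unsigned long pages[4] = { 1, 2, 3, 4 };
	struct bootimg_desc dsc = { pages, 4, 10 };	/* 1+2+3+4 == 10 */

	verify_or_reboot(&dsc);
	printf("checksum ok, continuing boot-image handoff\n");
	return 0;
}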
diff --git a/lustre/kernel_patches/patches/export-truncate-2.5.63.patch b/lustre/kernel_patches/patches/export-truncate-2.5.63.patch
new file mode 100644 (file)
index 0000000..3d82572
--- /dev/null
@@ -0,0 +1,37 @@
+ include/linux/mm.h |    2 ++
+ mm/truncate.c      |    4 ++--
+ 2 files changed, 4 insertions(+), 2 deletions(-)
+
+--- linux-2.5.63/include/linux/mm.h~export-truncate-2.5.63     Mon May  5 18:08:15 2003
++++ linux-2.5.63-root/include/linux/mm.h       Mon May  5 18:08:58 2003
+@@ -540,6 +540,8 @@ can_vma_merge(struct vm_area_struct *vma
+       else
+               return 0;
+ }
++/* truncate.c */
++extern void truncate_complete_page(struct page *);
+ /* filemap.c */
+ extern unsigned long page_unuse(struct page *);
+--- linux-2.5.63/mm/truncate.c~export-truncate-2.5.63  Mon May  5 18:09:50 2003
++++ linux-2.5.63-root/mm/truncate.c    Mon May  5 18:11:29 2003
+@@ -41,7 +41,7 @@ static inline void truncate_partial_page
+  * its lock, b) when a concurrent invalidate_inode_pages got there first and
+  * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
+  */
+-static void
++void
+ truncate_complete_page(struct address_space *mapping, struct page *page)
+ {
+       if (page->mapping != mapping)
+@@ -56,7 +56,7 @@ truncate_complete_page(struct address_sp
+       remove_from_page_cache(page);
+       page_cache_release(page);       /* pagecache ref */
+ }
+-
++EXPORT_SYMBOL_GPL(truncate_complete_page);
+ /*
+  * This is for invalidate_inode_pages().  That function can be called at
+  * any time, and is not supposed to throw away dirty pages.  But pages can
+
+_
diff --git a/lustre/kernel_patches/patches/export-truncate.patch b/lustre/kernel_patches/patches/export-truncate.patch
new file mode 100644 (file)
index 0000000..2cd96b9
--- /dev/null
@@ -0,0 +1,35 @@
+ include/linux/mm.h |    1 +
+ mm/filemap.c       |    3 ++-
+ 2 files changed, 3 insertions(+), 1 deletion(-)
+
+--- linux-2.4.18-18/include/linux/mm.h~export-truncate 2003-04-05 02:54:55.000000000 -0700
++++ linux-2.4.18-18-braam/include/linux/mm.h   2003-04-09 17:37:46.000000000 -0600
+@@ -650,6 +650,7 @@ struct zone_t;
+ /* filemap.c */
+ extern void remove_inode_page(struct page *);
+ extern unsigned long page_unuse(struct page *);
++extern void truncate_complete_page(struct page *);
+ extern void truncate_inode_pages(struct address_space *, loff_t);
+ /* generic vm_area_ops exported for stackable file systems */
+--- linux-2.4.18-18/mm/filemap.c~export-truncate       2003-04-05 02:54:55.000000000 -0700
++++ linux-2.4.18-18-braam/mm/filemap.c 2003-04-09 17:37:46.000000000 -0600
+@@ -245,7 +245,7 @@ static inline void truncate_partial_page
+               do_flushpage(page, partial);
+ }
+-static void truncate_complete_page(struct page *page)
++void truncate_complete_page(struct page *page)
+ {
+       /*
+        * Leave it on the LRU if it gets converted into anonymous buffers
+@@ -266,6 +266,7 @@ static void truncate_complete_page(struc
+       remove_inode_page(page);
+       page_cache_release(page);
+ }
++EXPORT_SYMBOL_GPL(truncate_complete_page);
+ static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
+ static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
+
+_
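Both export-truncate patches above exist so that an out-of-tree module (Lustre) can call truncate_complete_page() directly instead of going through truncate_inode_pages(). A minimal 2.4-style module sketch follows, assuming the 2.4 declaration added above (single struct page * argument; the 2.5.63 variant also takes the address_space). The module name and printk text are illustrative only; a real caller would pass a locked page it owns, which is elided here because it depends on the caller's locking context.

/* Hedged sketch only: how a module could consume the new export on a
 * 2.4 kernel with export-truncate.patch applied. */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mm.h>		/* truncate_complete_page() after the patch */
#include <linux/kernel.h>

static int __init export_demo_init(void)
{
	/* Nothing to truncate here; just prove the symbol resolves at module
	 * load time, which is exactly what EXPORT_SYMBOL_GPL() enables.  A
	 * real user calls truncate_complete_page(page) on a locked page that
	 * is still attached to its mapping. */
	printk(KERN_INFO "export-demo: truncate_complete_page at %p\n",
	       truncate_complete_page);
	return 0;
}

static void __exit export_demo_exit(void)
{
}

module_init(export_demo_init);
module_exit(export_demo_exit);
MODULE_LICENSE("GPL");	/* required, since the export is _GPL */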
index 716c156..33e0b6c 100644 (file)
@@ -7,20 +7,20 @@
  kernel/ksyms.c     |    5 +++++
  4 files changed, 9 insertions(+), 1 deletion(-)
 
---- linux-2.4.19-hp2_pnnl2/fs/ext3/Makefile~exports    Sun Jan 19 18:52:38 2003
-+++ linux-2.4.19-hp2_pnnl2-root/fs/ext3/Makefile       Sun Jan 19 18:52:38 2003
+--- linux-2.4.18-18/fs/ext3/Makefile~exports   Sat Apr  5 02:51:27 2003
++++ linux-2.4.18-18-braam/fs/ext3/Makefile     Sat Apr  5 02:54:45 2003
 @@ -9,6 +9,8 @@
  
  O_TARGET := ext3.o
  
-+export-objs :=        super.o
++export-objs :=        super.o inode.o
 +
  obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
                ioctl.o namei.o super.o symlink.o
  obj-m    := $(O_TARGET)
---- linux-2.4.19-hp2_pnnl2/fs/ext3/super.c~exports     Sun Jan 19 18:52:38 2003
-+++ linux-2.4.19-hp2_pnnl2-root/fs/ext3/super.c        Sun Jan 19 18:52:38 2003
-@@ -1744,7 +1744,7 @@ static void __exit exit_ext3_fs(void)
+--- linux-2.4.18-18/fs/ext3/super.c~exports    Sat Apr  5 02:51:27 2003
++++ linux-2.4.18-18-braam/fs/ext3/super.c      Sat Apr  5 02:54:28 2003
+@@ -1746,7 +1746,7 @@ static void __exit exit_ext3_fs(void)
        unregister_filesystem(&ext3_fs_type);
  }
  
@@ -29,9 +29,9 @@
  
  MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
  MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
---- linux-2.4.19-hp2_pnnl2/include/linux/fs.h~exports  Sun Jan 19 18:52:38 2003
-+++ linux-2.4.19-hp2_pnnl2-root/include/linux/fs.h     Sun Jan 19 18:52:38 2003
-@@ -1020,6 +1020,7 @@ extern int unregister_filesystem(struct 
+--- linux-2.4.18-18/include/linux/fs.h~exports Sat Apr  5 02:51:27 2003
++++ linux-2.4.18-18-braam/include/linux/fs.h   Sat Apr  5 02:54:29 2003
+@@ -1046,6 +1046,7 @@ extern int unregister_filesystem(struct 
  extern struct vfsmount *kern_mount(struct file_system_type *);
  extern int may_umount(struct vfsmount *);
  extern long do_mount(char *, char *, char *, unsigned long, void *);
  extern void umount_tree(struct vfsmount *);
  
  #define kern_umount mntput
---- linux-2.4.19-hp2_pnnl2/kernel/ksyms.c~exports      Sun Jan 19 18:52:38 2003
-+++ linux-2.4.19-hp2_pnnl2-root/kernel/ksyms.c Sun Jan 19 18:52:38 2003
-@@ -308,6 +308,11 @@ EXPORT_SYMBOL(dcache_dir_fsync);
- EXPORT_SYMBOL(dcache_readdir);
- EXPORT_SYMBOL(dcache_dir_ops);
+--- linux-2.4.18-18/kernel/ksyms.c~exports     Sat Apr  5 02:51:27 2003
++++ linux-2.4.18-18-braam/kernel/ksyms.c       Sat Apr  5 02:54:29 2003
+@@ -306,6 +306,11 @@ EXPORT_SYMBOL_GPL(buffermem_pages);
+ EXPORT_SYMBOL_GPL(nr_free_pages);
+ EXPORT_SYMBOL_GPL(page_cache_size);
  
 +/* lustre */
 +EXPORT_SYMBOL(panic_notifier_list);
@@ -1,26 +1,23 @@
-
-
-
  fs/ext3/Makefile   |    2 ++
  fs/ext3/super.c    |    2 +-
  include/linux/fs.h |    1 +
  kernel/ksyms.c     |    4 ++++
- 4 files changed, 9 insertions(+), 1 deletion(-)
+ 4 files changed, 8 insertions(+), 1 deletion(-)
 
---- linux-2.4.19-hp2_pnnl2/fs/ext3/Makefile~exports    Sun Jan 19 18:52:38 2003
-+++ linux-2.4.19-hp2_pnnl2-root/fs/ext3/Makefile       Sun Jan 19 18:52:38 2003
+--- linux/fs/ext3/Makefile~exports_2.4.20      Wed Apr  9 10:07:14 2003
++++ linux-mmonroe/fs/ext3/Makefile     Wed Apr  9 10:19:53 2003
 @@ -9,6 +9,8 @@
  
  O_TARGET := ext3.o
  
-+export-objs :=        super.o
++export-objs :=        super.o inode.o
 +
  obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
                ioctl.o namei.o super.o symlink.o
  obj-m    := $(O_TARGET)
---- linux-2.4.19-hp2_pnnl2/fs/ext3/super.c~exports     Sun Jan 19 18:52:38 2003
-+++ linux-2.4.19-hp2_pnnl2-root/fs/ext3/super.c        Sun Jan 19 18:52:38 2003
-@@ -1744,7 +1744,7 @@ static void __exit exit_ext3_fs(void)
+--- linux/fs/ext3/super.c~exports_2.4.20       Wed Apr  9 10:07:14 2003
++++ linux-mmonroe/fs/ext3/super.c      Wed Apr  9 10:19:53 2003
+@@ -1769,7 +1769,7 @@ static void __exit exit_ext3_fs(void)
        unregister_filesystem(&ext3_fs_type);
  }
  
  
  MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
  MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
---- linux-2.4.19-hp2_pnnl2/include/linux/fs.h~exports  Sun Jan 19 18:52:38 2003
-+++ linux-2.4.19-hp2_pnnl2-root/include/linux/fs.h     Sun Jan 19 18:52:38 2003
+--- linux/include/linux/fs.h~exports_2.4.20    Wed Apr  9 10:07:14 2003
++++ linux-mmonroe/include/linux/fs.h   Wed Apr  9 10:19:53 2003
 @@ -1020,6 +1020,7 @@ extern int unregister_filesystem(struct 
  extern struct vfsmount *kern_mount(struct file_system_type *);
  extern int may_umount(struct vfsmount *);
  extern long do_mount(char *, char *, char *, unsigned long, void *);
 +struct vfsmount *do_kern_mount(const char *type, int flags, char *name, void *data);
- extern void umount_tree(struct vfsmount *);
  
  #define kern_umount mntput
---- linux-2.4.19-hp2_pnnl2/kernel/ksyms.c~exports      Sun Jan 19 18:52:38 2003
-+++ linux-2.4.19-hp2_pnnl2-root/kernel/ksyms.c Sun Jan 19 18:52:38 2003
+--- linux/kernel/ksyms.c~exports_2.4.20        Wed Apr  9 10:07:14 2003
++++ linux-mmonroe/kernel/ksyms.c       Wed Apr  9 10:19:53 2003
 @@ -308,6 +308,10 @@ EXPORT_SYMBOL(dcache_dir_fsync);
  EXPORT_SYMBOL(dcache_readdir);
  EXPORT_SYMBOL(dcache_dir_ops);
diff --git a/lustre/kernel_patches/patches/exports_2.4.20.patch b/lustre/kernel_patches/patches/exports_2.4.20.patch
new file mode 100644 (file)
index 0000000..bed8693
--- /dev/null
@@ -0,0 +1,57 @@
+
+
+
+ fs/ext3/Makefile   |    4 +++-
+ fs/ext3/super.c    |    2 +-
+ include/linux/fs.h |    1 +
+ kernel/ksyms.c     |    5 +++++
+ 4 files changed, 10 insertions(+), 2 deletions(-)
+
+--- linux-2.4.20/fs/ext3/Makefile~exports_hp   Sat Apr  5 03:55:19 2003
++++ linux-2.4.20-braam/fs/ext3/Makefile        Sat Apr  5 03:56:03 2003
+@@ -9,6 +9,8 @@
+ O_TARGET := ext3.o
++export-objs :=        super.o inode.o
++
+ obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+               ioctl.o namei.o super.o symlink.o
+ obj-m    := $(O_TARGET)
+--- linux-2.4.20/fs/ext3/super.c~exports_hp    Sat Apr  5 03:55:19 2003
++++ linux-2.4.20-braam/fs/ext3/super.c Sat Apr  5 03:55:19 2003
+@@ -1769,7 +1769,7 @@ static void __exit exit_ext3_fs(void)
+       unregister_filesystem(&ext3_fs_type);
+ }
+-EXPORT_NO_SYMBOLS;
++EXPORT_SYMBOL(ext3_bread);
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
+--- linux-2.4.20/include/linux/fs.h~exports_hp Sat Apr  5 03:55:19 2003
++++ linux-2.4.20-braam/include/linux/fs.h      Sat Apr  5 03:55:19 2003
+@@ -1005,6 +1005,7 @@ extern int unregister_filesystem(struct 
+ extern struct vfsmount *kern_mount(struct file_system_type *);
+ extern int may_umount(struct vfsmount *);
+ extern long do_mount(char *, char *, char *, unsigned long, void *);
++struct vfsmount *do_kern_mount(const char *type, int flags, char *name, void *data);
+ #define kern_umount mntput
+--- linux-2.4.20/kernel/ksyms.c~exports_hp     Sat Apr  5 03:55:19 2003
++++ linux-2.4.20-braam/kernel/ksyms.c  Sat Apr  5 03:55:19 2003
+@@ -284,6 +284,11 @@ EXPORT_SYMBOL(dcache_dir_fsync);
+ EXPORT_SYMBOL(dcache_readdir);
+ EXPORT_SYMBOL(dcache_dir_ops);
++/* lustre */
++EXPORT_SYMBOL(pagecache_lock_cacheline);
++EXPORT_SYMBOL(panic_notifier_list);
++EXPORT_SYMBOL(do_kern_mount);
++
+ /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) */
+ EXPORT_SYMBOL(default_llseek);
+ EXPORT_SYMBOL(dentry_open);
+
+_
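exports_2.4.20.patch exports panic_notifier_list (along with do_kern_mount, pagecache_lock_cacheline and ext3_bread) so that a module can hook the panic path. As an illustration of why that export is needed, here is a hedged 2.4-style sketch of registering a panic notifier from a module; the callback body and names are illustrative, not Lustre's actual handler.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/notifier.h>	/* notifier_chain_register(), NOTIFY_DONE */

/* Called on panic(); a filesystem could use this hook to flag or flush
 * its state before the machine goes down. */
static int demo_panic_event(struct notifier_block *self,
			    unsigned long event, void *ptr)
{
	printk(KERN_EMERG "demo: panic notifier fired (%s)\n",
	       ptr ? (char *)ptr : "");
	return NOTIFY_DONE;
}

static struct notifier_block demo_panic_block = {
	notifier_call:	demo_panic_event,	/* 2.4-era GNU initializer style */
};

static int __init demo_init(void)
{
	/* Only works from a module because ksyms.c now exports
	 * panic_notifier_list (see the hunk above). */
	notifier_chain_register(&panic_notifier_list, &demo_panic_block);
	return 0;
}

static void __exit demo_exit(void)
{
	notifier_chain_unregister(&panic_notifier_list, &demo_panic_block);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");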
diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-1-chaos.patch b/lustre/kernel_patches/patches/ext-2.4-patch-1-chaos.patch
new file mode 100644 (file)
index 0000000..b59cea2
--- /dev/null
@@ -0,0 +1,2527 @@
+ fs/ext3/Makefile           |    2 
+ fs/ext3/dir.c              |  299 +++++++++
+ fs/ext3/file.c             |    3 
+ fs/ext3/hash.c             |  215 ++++++
+ fs/ext3/namei.c            | 1388 ++++++++++++++++++++++++++++++++++++++++-----
+ fs/ext3/super.c            |    7 
+ include/linux/ext3_fs.h    |   85 ++
+ include/linux/ext3_fs_sb.h |    2 
+ include/linux/ext3_jbd.h   |    2 
+ include/linux/rbtree.h     |    2 
+ lib/rbtree.c               |   42 +
+ 11 files changed, 1887 insertions(+), 160 deletions(-)
+
+--- linux-chaos-2.4.20-6/fs/ext3/Makefile~ext-2.4-patch-1-chaos        2003-04-09 16:10:38.000000000 -0600
++++ linux-chaos-2.4.20-6-braam/fs/ext3/Makefile        2003-04-09 16:18:55.000000000 -0600
+@@ -12,7 +12,7 @@ O_TARGET := ext3.o
+ export-objs :=        super.o
+ obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+-              ioctl.o namei.o super.o symlink.o
++              ioctl.o namei.o super.o symlink.o hash.o
+ obj-m    := $(O_TARGET)
+ include $(TOPDIR)/Rules.make
+--- linux-chaos-2.4.20-6/fs/ext3/dir.c~ext-2.4-patch-1-chaos   2002-05-07 15:53:46.000000000 -0600
++++ linux-chaos-2.4.20-6-braam/fs/ext3/dir.c   2003-04-09 16:18:55.000000000 -0600
+@@ -21,12 +21,16 @@
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
++#include <linux/slab.h>
++#include <linux/rbtree.h>
+ static unsigned char ext3_filetype_table[] = {
+       DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+ };
+ static int ext3_readdir(struct file *, void *, filldir_t);
++static int ext3_dx_readdir(struct file * filp,
++                         void * dirent, filldir_t filldir);
+ struct file_operations ext3_dir_operations = {
+       read:           generic_read_dir,
+@@ -35,6 +39,17 @@ struct file_operations ext3_dir_operatio
+       fsync:          ext3_sync_file,         /* BKL held */
+ };
++
++static unsigned char get_dtype(struct super_block *sb, int filetype)
++{
++      if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
++          (filetype >= EXT3_FT_MAX))
++              return DT_UNKNOWN;
++
++      return (ext3_filetype_table[filetype]);
++}
++                             
++
+ int ext3_check_dir_entry (const char * function, struct inode * dir,
+                         struct ext3_dir_entry_2 * de,
+                         struct buffer_head * bh,
+@@ -79,6 +94,16 @@ static int ext3_readdir(struct file * fi
+       sb = inode->i_sb;
++      if (is_dx(inode)) {
++              err = ext3_dx_readdir(filp, dirent, filldir);
++              if (err != ERR_BAD_DX_DIR)
++                      return err;
++              /*
++               * We don't set the inode dirty flag since it's not
++               * critical that it get flushed back to the disk.
++               */
++              EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL;
++      }
+       stored = 0;
+       bh = NULL;
+       offset = filp->f_pos & (sb->s_blocksize - 1);
+@@ -162,18 +187,12 @@ revalidate:
+                                * during the copy operation.
+                                */
+                               unsigned long version = filp->f_version;
+-                              unsigned char d_type = DT_UNKNOWN;
+-                              if (EXT3_HAS_INCOMPAT_FEATURE(sb,
+-                                              EXT3_FEATURE_INCOMPAT_FILETYPE)
+-                                              && de->file_type < EXT3_FT_MAX)
+-                                      d_type =
+-                                        ext3_filetype_table[de->file_type];
+                               error = filldir(dirent, de->name,
+                                               de->name_len,
+                                               filp->f_pos,
+                                               le32_to_cpu(de->inode),
+-                                              d_type);
++                                              get_dtype(sb, de->file_type));
+                               if (error)
+                                       break;
+                               if (version != filp->f_version)
+@@ -188,3 +207,269 @@ revalidate:
+       UPDATE_ATIME(inode);
+       return 0;
+ }
++
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * These functions convert from the major/minor hash to an f_pos
++ * value.
++ * 
++ * Currently we only use the major hash number.  This is unfortunate, but
++ * on 32-bit machines, the same VFS interface is used for lseek and
++ * llseek, so if we use the 64 bit offset, then the 32-bit versions of
++ * lseek/telldir/seekdir will blow out spectacularly, and from within
++ * the ext2 low-level routine, we don't know if we're being called by
++ * a 64-bit version of the system call or the 32-bit version of the
++ * system call.  Worse yet, NFSv2 only allows for a 32-bit readdir
++ * cookie.  Sigh.
++ */
++#define hash2pos(major, minor)        (major >> 1)
++#define pos2maj_hash(pos)     ((pos << 1) & 0xffffffff)
++#define pos2min_hash(pos)     (0)
++
++/*
++ * This structure holds the nodes of the red-black tree used to store
++ * the directory entry in hash order.
++ */
++struct fname {
++      __u32           hash;
++      __u32           minor_hash;
++      rb_node_t       rb_hash; 
++      struct fname    *next;
++      __u32           inode;
++      __u8            name_len;
++      __u8            file_type;
++      char            name[0];
++};
++
++/*
++ * This function implements a non-recursive way of freeing all of the
++ * nodes in the red-black tree.
++ */
++static void free_rb_tree_fname(rb_root_t *root)
++{
++      rb_node_t       *n = root->rb_node;
++      rb_node_t       *parent;
++      struct fname    *fname;
++
++      while (n) {
++              /* Do the node's children first */
++              if ((n)->rb_left) {
++                      n = n->rb_left;
++                      continue;
++              }
++              if (n->rb_right) {
++                      n = n->rb_right;
++                      continue;
++              }
++              /*
++               * The node has no children; free it, and then zero
++               * out parent's link to it.  Finally go to the
++               * beginning of the loop and try to free the parent
++               * node.
++               */
++              parent = n->rb_parent;
++              fname = rb_entry(n, struct fname, rb_hash);
++              kfree(fname);
++              if (!parent)
++                      root->rb_node = 0;
++              else if (parent->rb_left == n)
++                      parent->rb_left = 0;
++              else if (parent->rb_right == n)
++                      parent->rb_right = 0;
++              n = parent;
++      }
++      root->rb_node = 0;
++}
++
++
++struct dir_private_info *create_dir_info(loff_t pos)
++{
++      struct dir_private_info *p;
++
++      p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
++      if (!p)
++              return NULL;
++      p->root.rb_node = 0;
++      p->curr_node = 0;
++      p->extra_fname = 0;
++      p->last_pos = 0;
++      p->curr_hash = pos2maj_hash(pos);
++      p->curr_minor_hash = pos2min_hash(pos);
++      p->next_hash = 0;
++      return p;
++}
++
++void ext3_htree_free_dir_info(struct dir_private_info *p)
++{
++      free_rb_tree_fname(&p->root);
++      kfree(p);
++}
++              
++/*
++ * Given a directory entry, enter it into the fname rb tree.
++ */
++void ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
++                           __u32 minor_hash,
++                           struct ext3_dir_entry_2 *dirent)
++{
++      rb_node_t **p, *parent = NULL;
++      struct fname * fname, *new_fn;
++      struct dir_private_info *info;
++      int len;
++
++      info = (struct dir_private_info *) dir_file->private_data;
++      p = &info->root.rb_node;
++
++      /* Create and allocate the fname structure */
++      len = sizeof(struct fname) + dirent->name_len + 1;
++      new_fn = kmalloc(len, GFP_KERNEL);
++      memset(new_fn, 0, len);
++      new_fn->hash = hash;
++      new_fn->minor_hash = minor_hash;
++      new_fn->inode = le32_to_cpu(dirent->inode);
++      new_fn->name_len = dirent->name_len;
++      new_fn->file_type = dirent->file_type;
++      memcpy(new_fn->name, dirent->name, dirent->name_len);
++      new_fn->name[dirent->name_len] = 0;
++      
++      while (*p) {
++              parent = *p;
++              fname = rb_entry(parent, struct fname, rb_hash);
++
++              /*
++               * If the hash and minor hash match up, then we put
++               * them on a linked list.  This rarely happens...
++               */
++              if ((new_fn->hash == fname->hash) &&
++                  (new_fn->minor_hash == fname->minor_hash)) {
++                      new_fn->next = fname->next;
++                      fname->next = new_fn;
++                      return;
++              }
++                      
++              if (new_fn->hash < fname->hash)
++                      p = &(*p)->rb_left;
++              else if (new_fn->hash > fname->hash)
++                      p = &(*p)->rb_right;
++              else if (new_fn->minor_hash < fname->minor_hash)
++                      p = &(*p)->rb_left;
++              else /* if (new_fn->minor_hash > fname->minor_hash) */
++                      p = &(*p)->rb_right;
++      }
++
++      rb_link_node(&new_fn->rb_hash, parent, p);
++      rb_insert_color(&new_fn->rb_hash, &info->root);
++}
++
++
++
++/*
++ * This is a helper function for ext3_dx_readdir.  It calls filldir
++ * for all entries on the fname linked list.  (Normally there is only
++ * one entry on the linked list, unless there are 62 bit hash collisions.)
++ */
++static int call_filldir(struct file * filp, void * dirent,
++                      filldir_t filldir, struct fname *fname)
++{
++      struct dir_private_info *info = filp->private_data;
++      loff_t  curr_pos;
++      struct inode *inode = filp->f_dentry->d_inode;
++      struct super_block * sb;
++      int error;
++
++      sb = inode->i_sb;
++      
++      if (!fname) {
++              printk("call_filldir: called with null fname?!?\n");
++              return 0;
++      }
++      curr_pos = hash2pos(fname->hash, fname->minor_hash);
++      while (fname) {
++              error = filldir(dirent, fname->name,
++                              fname->name_len, curr_pos, 
++                              fname->inode,
++                              get_dtype(sb, fname->file_type));
++              if (error) {
++                      filp->f_pos = curr_pos;
++                      info->extra_fname = fname->next;
++                      return error;
++              }
++              fname = fname->next;
++      }
++      return 0;
++}
++
++static int ext3_dx_readdir(struct file * filp,
++                       void * dirent, filldir_t filldir)
++{
++      struct dir_private_info *info = filp->private_data;
++      struct inode *inode = filp->f_dentry->d_inode;
++      struct fname *fname;
++      int     ret;
++
++      if (!info) {
++              info = create_dir_info(filp->f_pos);
++              if (!info)
++                      return -ENOMEM;
++              filp->private_data = info;
++      }
++
++      /* Someone has messed with f_pos; reset the world */
++      if (info->last_pos != filp->f_pos) {
++              free_rb_tree_fname(&info->root);
++              info->curr_node = 0;
++              info->extra_fname = 0;
++              info->curr_hash = pos2maj_hash(filp->f_pos);
++              info->curr_minor_hash = pos2min_hash(filp->f_pos);
++      }
++
++      /*
++       * If there are any leftover names on the hash collision
++       * chain, return them first.
++       */
++      if (info->extra_fname &&
++          call_filldir(filp, dirent, filldir, info->extra_fname))
++              goto finished;
++
++      if (!info->curr_node)
++              info->curr_node = rb_get_first(&info->root);
++
++      while (1) {
++              /*
++               * Fill the rbtree if we have no more entries,
++               * or the inode has changed since we last read in the
++               * cached entries. 
++               */
++              if ((!info->curr_node) ||
++                  (filp->f_version != inode->i_version)) {
++                      info->curr_node = 0;
++                      free_rb_tree_fname(&info->root);
++                      filp->f_version = inode->i_version;
++                      ret = ext3_htree_fill_tree(filp, info->curr_hash,
++                                                 info->curr_minor_hash,
++                                                 &info->next_hash);
++                      if (ret < 0)
++                              return ret;
++                      if (ret == 0)
++                              break;
++                      info->curr_node = rb_get_first(&info->root);
++              }
++
++              fname = rb_entry(info->curr_node, struct fname, rb_hash);
++              info->curr_hash = fname->hash;
++              info->curr_minor_hash = fname->minor_hash;
++              if (call_filldir(filp, dirent, filldir, fname))
++                      break;
++
++              info->curr_node = rb_get_next(info->curr_node);
++              if (!info->curr_node) {
++                      info->curr_hash = info->next_hash;
++                      info->curr_minor_hash = 0;
++              }
++      }
++finished:
++      info->last_pos = filp->f_pos;
++      UPDATE_ATIME(inode);
++      return 0;
++}
++#endif
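The hash2pos/pos2maj_hash macros in the dir.c hunk above deliberately drop the minor hash so the value fits a 32-bit lseek/telldir cookie; because the hash code later in this patch always clears the low bit of the major hash, that bit costs nothing either. A standalone round-trip sketch (macros adapted from the hunk, with argument parentheses added, into plain user-space C for illustration only):

#include <stdio.h>

typedef unsigned int __u32;

/* Same mapping as ext3_dx_readdir above: only the major hash survives
 * the trip through a 32-bit f_pos / readdir cookie. */
#define hash2pos(major, minor)	((major) >> 1)
#define pos2maj_hash(pos)	(((pos) << 1) & 0xffffffff)
#define pos2min_hash(pos)	(0)

int main(void)
{
	__u32 major = 0xdeadbe42, minor = 0x1234;	/* real majors have bit 0 clear */
	__u32 pos = hash2pos(major, minor);

	/* The minor hash is discarded; the major hash round-trips exactly
	 * because ext3fs_dirhash() stores it as (hash & ~1). */
	printf("major %#x minor %#x -> pos %#x -> major %#x minor %#x\n",
	       major, minor, pos, pos2maj_hash(pos), pos2min_hash(pos));
	return 0;
}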
+--- linux-chaos-2.4.20-6/fs/ext3/file.c~ext-2.4-patch-1-chaos  2003-02-14 15:59:09.000000000 -0700
++++ linux-chaos-2.4.20-6-braam/fs/ext3/file.c  2003-04-09 16:18:55.000000000 -0600
+@@ -35,6 +35,9 @@ static int ext3_release_file (struct ino
+ {
+       if (filp->f_mode & FMODE_WRITE)
+               ext3_discard_prealloc (inode);
++      if (is_dx(inode) && filp->private_data)
++              ext3_htree_free_dir_info(filp->private_data);
++
+       return 0;
+ }
+--- /dev/null  2003-01-30 03:24:37.000000000 -0700
++++ linux-chaos-2.4.20-6-braam/fs/ext3/hash.c  2003-04-09 16:18:55.000000000 -0600
+@@ -0,0 +1,215 @@
++/*
++ *  linux/fs/ext3/hash.c
++ *
++ * Copyright (C) 2002 by Theodore Ts'o
++ *
++ * This file is released under the GPL v2.
++ * 
++ * This file may be redistributed under the terms of the GNU Public
++ * License.
++ */
++
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/sched.h>
++#include <linux/ext3_fs.h>
++
++#define DELTA 0x9E3779B9
++
++static void TEA_transform(__u32 buf[4], __u32 const in[])
++{
++      __u32   sum = 0;
++      __u32   b0 = buf[0], b1 = buf[1];
++      __u32   a = in[0], b = in[1], c = in[2], d = in[3];
++      int     n = 16;
++
++      do {                                                    
++              sum += DELTA;                                   
++              b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); 
++              b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); 
++      } while(--n);
++
++      buf[0] += b0;
++      buf[1] += b1;
++}
++
++/* F, G and H are basic MD4 functions: selection, majority, parity */
++#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
++#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
++#define H(x, y, z) ((x) ^ (y) ^ (z))
++
++/*
++ * The generic round function.  The application is so specific that
++ * we don't bother protecting all the arguments with parens, as is generally
++ * good macro practice, in favor of extra legibility.
++ * Rotation is separate from addition to prevent recomputation
++ */
++#define ROUND(f, a, b, c, d, x, s)    \
++      (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s)))
++#define K1 0
++#define K2 013240474631UL
++#define K3 015666365641UL
++
++/*
++ * Basic cut-down MD4 transform.  Returns only 32 bits of result.
++ */
++static void halfMD4Transform (__u32 buf[4], __u32 const in[])
++{
++      __u32   a = buf[0], b = buf[1], c = buf[2], d = buf[3];
++
++      /* Round 1 */
++      ROUND(F, a, b, c, d, in[0] + K1,  3);
++      ROUND(F, d, a, b, c, in[1] + K1,  7);
++      ROUND(F, c, d, a, b, in[2] + K1, 11);
++      ROUND(F, b, c, d, a, in[3] + K1, 19);
++      ROUND(F, a, b, c, d, in[4] + K1,  3);
++      ROUND(F, d, a, b, c, in[5] + K1,  7);
++      ROUND(F, c, d, a, b, in[6] + K1, 11);
++      ROUND(F, b, c, d, a, in[7] + K1, 19);
++
++      /* Round 2 */
++      ROUND(G, a, b, c, d, in[1] + K2,  3);
++      ROUND(G, d, a, b, c, in[3] + K2,  5);
++      ROUND(G, c, d, a, b, in[5] + K2,  9);
++      ROUND(G, b, c, d, a, in[7] + K2, 13);
++      ROUND(G, a, b, c, d, in[0] + K2,  3);
++      ROUND(G, d, a, b, c, in[2] + K2,  5);
++      ROUND(G, c, d, a, b, in[4] + K2,  9);
++      ROUND(G, b, c, d, a, in[6] + K2, 13);
++
++      /* Round 3 */
++      ROUND(H, a, b, c, d, in[3] + K3,  3);
++      ROUND(H, d, a, b, c, in[7] + K3,  9);
++      ROUND(H, c, d, a, b, in[2] + K3, 11);
++      ROUND(H, b, c, d, a, in[6] + K3, 15);
++      ROUND(H, a, b, c, d, in[1] + K3,  3);
++      ROUND(H, d, a, b, c, in[5] + K3,  9);
++      ROUND(H, c, d, a, b, in[0] + K3, 11);
++      ROUND(H, b, c, d, a, in[4] + K3, 15);
++
++      buf[0] += a;
++      buf[1] += b;
++      buf[2] += c;
++      buf[3] += d;
++}
++
++#undef ROUND
++#undef F
++#undef G
++#undef H
++#undef K1
++#undef K2
++#undef K3
++
++/* The old legacy hash */
++static __u32 dx_hack_hash (const char *name, int len)
++{
++      __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
++      while (len--) {
++              __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
++              
++              if (hash & 0x80000000) hash -= 0x7fffffff;
++              hash1 = hash0;
++              hash0 = hash;
++      }
++      return (hash0 << 1);
++}
++
++static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
++{
++      __u32   pad, val;
++      int     i;
++
++      pad = (__u32)len | ((__u32)len << 8);
++      pad |= pad << 16;
++
++      val = pad;
++      if (len > num*4)
++              len = num * 4;
++      for (i=0; i < len; i++) {
++              if ((i % 4) == 0)
++                      val = pad;
++              val = msg[i] + (val << 8);
++              if ((i % 4) == 3) {
++                      *buf++ = val;
++                      val = pad;
++                      num--;
++              }
++      }
++      if (--num >= 0)
++              *buf++ = val;
++      while (--num >= 0)
++              *buf++ = pad;
++}
++
++/*
++ * Returns the hash of a filename.  If len is 0 and name is NULL, then
++ * this function can be used to test whether or not a hash version is
++ * supported.
++ * 
++ * The seed is a 4 longword (32 bits) "secret" which can be used to
++ * uniquify a hash.  If the seed is all zero's, then some default seed
++ * may be used.
++ * 
++ * A particular hash version specifies whether or not the seed is
++ * represented, and whether or not the returned hash is 32 bits or 64
++ * bits.  32 bit hashes will return 0 for the minor hash.
++ */
++int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
++{
++      __u32   hash;
++      __u32   minor_hash = 0;
++      const char      *p;
++      int             i;
++      __u32           in[8], buf[4];
++
++      /* Initialize the default seed for the hash checksum functions */
++      buf[0] = 0x67452301;
++      buf[1] = 0xefcdab89;
++      buf[2] = 0x98badcfe;
++      buf[3] = 0x10325476;
++
++      /* Check to see if the seed is all zero's */
++      if (hinfo->seed) {
++              for (i=0; i < 4; i++) {
++                      if (hinfo->seed[i])
++                              break;
++              }
++              if (i < 4)
++                      memcpy(buf, hinfo->seed, sizeof(buf));
++      }
++              
++      switch (hinfo->hash_version) {
++      case DX_HASH_LEGACY:
++              hash = dx_hack_hash(name, len);
++              break;
++      case DX_HASH_HALF_MD4:
++              p = name;
++              while (len > 0) {
++                      str2hashbuf(p, len, in, 8);
++                      halfMD4Transform(buf, in);
++                      len -= 32;
++                      p += 32;
++              }
++              minor_hash = buf[2];
++              hash = buf[1];
++              break;
++      case DX_HASH_TEA:
++              p = name;
++              while (len > 0) {
++                      str2hashbuf(p, len, in, 4);
++                      TEA_transform(buf, in);
++                      len -= 16;
++                      p += 16;
++              }
++              hash = buf[0];
++              minor_hash = buf[1];
++              break;
++      default:
++              hinfo->hash = 0;
++              return -1;
++      }
++      hinfo->hash = hash & ~1;
++      hinfo->minor_hash = minor_hash;
++      return 0;
++}
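The legacy dx_hack_hash() added above is small enough to exercise on its own. The following user-space program reproduces it (only __u32 is swapped for uint32_t) and prints the hash of a few directory names; it is purely an illustration of what the DX_HASH_LEGACY case feeds into the index, not part of the patch, and the sample names are arbitrary.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Copy of the legacy hash from fs/ext3/hash.c above, with __u32
 * replaced by uint32_t. */
static uint32_t dx_hack_hash(const char *name, int len)
{
	uint32_t hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;

	while (len--) {
		uint32_t hash = hash1 + (hash0 ^ (*name++ * 7152373));

		if (hash & 0x80000000)
			hash -= 0x7fffffff;
		hash1 = hash0;
		hash0 = hash;
	}
	return (hash0 << 1);
}

int main(void)
{
	const char *names[] = { "lost+found", "foo.txt", "a-much-longer-file-name" };
	int i;

	for (i = 0; i < 3; i++)
		printf("%-24s -> %#010x\n", names[i],
		       dx_hack_hash(names[i], (int)strlen(names[i])));
	return 0;
}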
+--- linux-chaos-2.4.20-6/fs/ext3/namei.c~ext-2.4-patch-1-chaos 2003-03-12 12:51:02.000000000 -0700
++++ linux-chaos-2.4.20-6-braam/fs/ext3/namei.c 2003-04-09 16:26:04.000000000 -0600
+@@ -16,6 +16,12 @@
+  *        David S. Miller (davem@caip.rutgers.edu), 1995
+  *  Directory entry file type support and forward compatibility hooks
+  *    for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
++ *  Hash Tree Directory indexing (c)
++ *    Daniel Phillips, 2001
++ *  Hash Tree Directory indexing porting
++ *    Christopher Li, 2002
++ *  Hash Tree Directory indexing cleanup
++ *    Theodore Ts'o, 2002
+  */
+ #include <linux/fs.h>
+@@ -38,6 +44,630 @@
+ #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
++static struct buffer_head *ext3_append(handle_t *handle,
++                                      struct inode *inode,
++                                      u32 *block, int *err)
++{
++      struct buffer_head *bh;
++
++      *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
++
++      if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
++              inode->i_size += inode->i_sb->s_blocksize;
++              EXT3_I(inode)->i_disksize = inode->i_size;
++              ext3_journal_get_write_access(handle,bh);
++      }
++      return bh;
++}
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++#ifndef swap
++#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
++#endif
++
++typedef struct { u32 v; } le_u32;
++typedef struct { u16 v; } le_u16;
++
++#ifdef DX_DEBUG
++#define dxtrace(command) command
++#else
++#define dxtrace(command) 
++#endif
++
++struct fake_dirent
++{
++      /*le*/u32 inode;
++      /*le*/u16 rec_len;
++      u8 name_len;
++      u8 file_type;
++};
++
++struct dx_countlimit
++{
++      le_u16 limit;
++      le_u16 count;
++};
++
++struct dx_entry
++{
++      le_u32 hash;
++      le_u32 block;
++};
++
++/*
++ * dx_root_info is laid out so that if it should somehow get overlaid by a
++ * dirent the two low bits of the hash version will be zero.  Therefore, the
++ * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
++ */
++
++struct dx_root
++{
++      struct fake_dirent dot;
++      char dot_name[4];
++      struct fake_dirent dotdot;
++      char dotdot_name[4];
++      struct dx_root_info
++      {
++              le_u32 reserved_zero;
++              u8 hash_version;
++              u8 info_length; /* 8 */
++              u8 indirect_levels;
++              u8 unused_flags;
++      }
++      info;
++      struct dx_entry entries[0];
++};
++
++struct dx_node
++{
++      struct fake_dirent fake;
++      struct dx_entry entries[0];
++};
++
++
++struct dx_frame
++{
++      struct buffer_head *bh;
++      struct dx_entry *entries;
++      struct dx_entry *at;
++};
++
++struct dx_map_entry
++{
++      u32 hash;
++      u32 offs;
++};
++
++#ifdef CONFIG_EXT3_INDEX
++static inline unsigned dx_get_block (struct dx_entry *entry);
++static void dx_set_block (struct dx_entry *entry, unsigned value);
++static inline unsigned dx_get_hash (struct dx_entry *entry);
++static void dx_set_hash (struct dx_entry *entry, unsigned value);
++static unsigned dx_get_count (struct dx_entry *entries);
++static unsigned dx_get_limit (struct dx_entry *entries);
++static void dx_set_count (struct dx_entry *entries, unsigned value);
++static void dx_set_limit (struct dx_entry *entries, unsigned value);
++static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
++static unsigned dx_node_limit (struct inode *dir);
++static struct dx_frame *dx_probe(struct dentry *dentry,
++                               struct inode *dir,
++                               struct dx_hash_info *hinfo,
++                               struct dx_frame *frame,
++                               int *err);
++static void dx_release (struct dx_frame *frames);
++static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
++                      struct dx_hash_info *hinfo, struct dx_map_entry map[]);
++static void dx_sort_map(struct dx_map_entry *map, unsigned count);
++static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
++              struct dx_map_entry *offsets, int count);
++static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
++static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
++static int ext3_htree_next_block(struct inode *dir, __u32 hash,
++                               struct dx_frame *frame,
++                               struct dx_frame *frames, int *err,
++                               __u32 *start_hash);
++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
++                     struct ext3_dir_entry_2 **res_dir, int *err);
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++                           struct inode *inode);
++
++/*
++ * Future: use high four bits of block for coalesce-on-delete flags
++ * Mask them off for now.
++ */
++
++static inline unsigned dx_get_block (struct dx_entry *entry)
++{
++      return le32_to_cpu(entry->block.v) & 0x00ffffff;
++}
++
++static inline void dx_set_block (struct dx_entry *entry, unsigned value)
++{
++      entry->block.v = cpu_to_le32(value);
++}
++
++static inline unsigned dx_get_hash (struct dx_entry *entry)
++{
++      return le32_to_cpu(entry->hash.v);
++}
++
++static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
++{
++      entry->hash.v = cpu_to_le32(value);
++}
++
++static inline unsigned dx_get_count (struct dx_entry *entries)
++{
++      return le16_to_cpu(((struct dx_countlimit *) entries)->count.v);
++}
++
++static inline unsigned dx_get_limit (struct dx_entry *entries)
++{
++      return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v);
++}
++
++static inline void dx_set_count (struct dx_entry *entries, unsigned value)
++{
++      ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value);
++}
++
++static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
++{
++      ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value);
++}
++
++static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
++{
++      unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
++              EXT3_DIR_REC_LEN(2) - infosize;
++      return 0? 20: entry_space / sizeof(struct dx_entry);
++}
++
++static inline unsigned dx_node_limit (struct inode *dir)
++{
++      unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
++      return 0? 22: entry_space / sizeof(struct dx_entry);
++}
++
++/*
++ * Debug
++ */
++#ifdef DX_DEBUG
++struct stats
++{ 
++      unsigned names;
++      unsigned space;
++      unsigned bcount;
++};
++
++static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
++                               int size, int show_names)
++{
++      unsigned names = 0, space = 0;
++      char *base = (char *) de;
++      struct dx_hash_info h = *hinfo;
++      
++      printk("names: ");
++      while ((char *) de < base + size)
++      {
++              if (de->inode)
++              {
++                      if (show_names)
++                      {
++                              int len = de->name_len;
++                              char *name = de->name;
++                              while (len--) printk("%c", *name++);
++                              ext3fs_dirhash(de->name, de->name_len, &h);
++                              printk(":%x.%u ", h.hash,
++                                     ((char *) de - base));
++                      }
++                      space += EXT3_DIR_REC_LEN(de->name_len);
++                      names++;
++              }
++              de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++      }
++      printk("(%i)\n", names);
++      return (struct stats) { names, space, 1 };
++}
++
++struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
++                           struct dx_entry *entries, int levels)
++{
++      unsigned blocksize = dir->i_sb->s_blocksize;
++      unsigned count = dx_get_count (entries), names = 0, space = 0, i;
++      unsigned bcount = 0;
++      struct buffer_head *bh;
++      int err;
++      printk("%i indexed blocks...\n", count);
++      for (i = 0; i < count; i++, entries++)
++      {
++              u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
++              u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
++              struct stats stats;
++              printk("%s%3u:%03u hash %8x/%8x ",levels?"":"   ", i, block, hash, range);
++              if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
++              stats = levels?
++                 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
++                 dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
++              names += stats.names;
++              space += stats.space;
++              bcount += stats.bcount;
++              brelse (bh);
++      }
++      if (bcount)
++              printk("%snames %u, fullness %u (%u%%)\n", levels?"":"   ",
++                      names, space/bcount,(space/bcount)*100/blocksize);
++      return (struct stats) { names, space, bcount};
++}
++#endif /* DX_DEBUG */
++
++/*
++ * Probe for a directory leaf block to search.
++ *
++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
++ * error in the directory index, and the caller should fall back to
++ * searching the directory normally.  The callers of dx_probe **MUST**
++ * check for this error code, and make sure it never gets reflected
++ * back to userspace.
++ */
++static struct dx_frame *
++dx_probe(struct dentry *dentry, struct inode *dir,
++       struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
++{
++      unsigned count, indirect;
++      struct dx_entry *at, *entries, *p, *q, *m;
++      struct dx_root *root;
++      struct buffer_head *bh;
++      struct dx_frame *frame = frame_in;
++      u32 hash;
++
++      frame->bh = NULL;
++      if (dentry)
++              dir = dentry->d_parent->d_inode;
++      if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
++              goto fail;
++      root = (struct dx_root *) bh->b_data;
++      if (root->info.hash_version != DX_HASH_TEA &&
++          root->info.hash_version != DX_HASH_HALF_MD4 &&
++          root->info.hash_version != DX_HASH_LEGACY) {
++              ext3_warning(dir->i_sb, __FUNCTION__,
++                           "Unrecognised inode hash code %d",
++                           root->info.hash_version);
++              brelse(bh);
++              *err = ERR_BAD_DX_DIR;
++              goto fail;
++      }
++      hinfo->hash_version = root->info.hash_version;
++      hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed;
++      if (dentry)
++              ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
++      hash = hinfo->hash;
++
++      if (root->info.unused_flags & 1) {
++              ext3_warning(dir->i_sb, __FUNCTION__,
++                           "Unimplemented inode hash flags: %#06x",
++                           root->info.unused_flags);
++              brelse(bh);
++              *err = ERR_BAD_DX_DIR;
++              goto fail;
++      }
++
++      if ((indirect = root->info.indirect_levels) > 1) {
++              ext3_warning(dir->i_sb, __FUNCTION__,
++                           "Unimplemented inode hash depth: %#06x",
++                           root->info.indirect_levels);
++              brelse(bh);
++              *err = ERR_BAD_DX_DIR;
++              goto fail;
++      }
++
++      entries = (struct dx_entry *) (((char *)&root->info) +
++                                     root->info.info_length);
++      assert(dx_get_limit(entries) == dx_root_limit(dir,
++                                                    root->info.info_length));
++      dxtrace (printk("Look up %x", hash));
++      while (1)
++      {
++              count = dx_get_count(entries);
++              assert (count && count <= dx_get_limit(entries));
++              p = entries + 1;
++              q = entries + count - 1;
++              while (p <= q)
++              {
++                      m = p + (q - p)/2;
++                      dxtrace(printk("."));
++                      if (dx_get_hash(m) > hash)
++                              q = m - 1;
++                      else
++                              p = m + 1;
++              }
++
++              if (0) // linear search cross check
++              {
++                      unsigned n = count - 1;
++                      at = entries;
++                      while (n--)
++                      {
++                              dxtrace(printk(","));
++                              if (dx_get_hash(++at) > hash)
++                              {
++                                      at--;
++                                      break;
++                              }
++                      }
++                      assert (at == p - 1);
++              }
++
++              at = p - 1;
++              dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
++              frame->bh = bh;
++              frame->entries = entries;
++              frame->at = at;
++              if (!indirect--) return frame;
++              if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
++                      goto fail2;
++              at = entries = ((struct dx_node *) bh->b_data)->entries;
++              assert (dx_get_limit(entries) == dx_node_limit (dir));
++              frame++;
++      }
++fail2:
++      while (frame >= frame_in) {
++              brelse(frame->bh);
++              frame--;
++      }
++fail:
++      return NULL;
++}
++
++static void dx_release (struct dx_frame *frames)
++{
++      if (frames[0].bh == NULL)
++              return;
++
++      if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
++              brelse(frames[1].bh);
++      brelse(frames[0].bh);
++}
++
++/*
++ * This function increments the frame pointer to search the next leaf
++ * block, and reads in the necessary intervening nodes if the search
++ * should be necessary.  Whether or not the search is necessary is
++ * controlled by the hash parameter.  If the hash value is even, then
++ * the search is only continued if the next block starts with that
++ * hash value.  This is used if we are searching for a specific file.
++ *
++ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
++ *
++ * This function returns 1 if the caller should continue to search,
++ * or 0 if it should not.  If there is an error reading one of the
++ * index blocks, it will return -1.
++ *
++ * If start_hash is non-null, it will be filled in with the starting
++ * hash of the next page.
++ */
++static int ext3_htree_next_block(struct inode *dir, __u32 hash,
++                               struct dx_frame *frame,
++                               struct dx_frame *frames, int *err,
++                               __u32 *start_hash)
++{
++      struct dx_frame *p;
++      struct buffer_head *bh;
++      int num_frames = 0;
++      __u32 bhash;
++
++      *err = ENOENT;
++      p = frame;
++      /*
++       * Find the next leaf page by incrementing the frame pointer.
++       * If we run out of entries in the interior node, loop around and
++       * increment pointer in the parent node.  When we break out of
++       * this loop, num_frames indicates the number of interior
++       * nodes need to be read.
++       */
++      while (1) {
++              if (++(p->at) < p->entries + dx_get_count(p->entries))
++                      break;
++              if (p == frames)
++                      return 0;
++              num_frames++;
++              p--;
++      }
++
++      /*
++       * If the hash is 1, then continue only if the next page has a
++       * continuation hash of any value.  This is used for readdir
++       * handling.  Otherwise, check to see if the hash matches the
++       * desired continuation hash.  If it doesn't, return since
++       * there's no point to read in the successive index pages.
++       */
++      bhash = dx_get_hash(p->at);
++      if (start_hash)
++              *start_hash = bhash;
++      if ((hash & 1) == 0) {
++              if ((bhash & ~1) != hash)
++                      return 0;
++      }
++      /*
++       * If the hash is HASH_NB_ALWAYS, we always go to the next
++       * block so no check is necessary
++       */
++      while (num_frames--) {
++              if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
++                                    0, err)))
++                      return -1; /* Failure */
++              p++;
++              brelse (p->bh);
++              p->bh = bh;
++              p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
++      }
++      return 1;
++}
++
++
++/*
++ * p is at least 6 bytes before the end of page
++ */
++static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p)
++{
++      return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
++}
++
++/*
++ * This function fills a red-black tree with information from a
++ * directory.  We start scanning the directory in hash order, starting
++ * at start_hash and start_minor_hash.
++ *
++ * This function returns the number of entries inserted into the tree,
++ * or a negative error code.
++ */
++int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
++                       __u32 start_minor_hash, __u32 *next_hash)
++{
++      struct dx_hash_info hinfo;
++      struct buffer_head *bh;
++      struct ext3_dir_entry_2 *de, *top;
++      static struct dx_frame frames[2], *frame;
++      struct inode *dir;
++      int block, err;
++      int count = 0;
++      int ret;
++      __u32 hashval;
++      
++      dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
++                     start_minor_hash));
++      dir = dir_file->f_dentry->d_inode;
++      hinfo.hash = start_hash;
++      hinfo.minor_hash = 0;
++      frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
++      if (!frame)
++              return err;
++
++      while (1) {
++              block = dx_get_block(frame->at);
++              dxtrace(printk("Reading block %d\n", block));
++              if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
++                      goto errout;
++      
++              de = (struct ext3_dir_entry_2 *) bh->b_data;
++              top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize -
++                                     EXT3_DIR_REC_LEN(0));
++              for (; de < top; de = ext3_next_entry(de)) {
++                      ext3fs_dirhash(de->name, de->name_len, &hinfo);
++                      if ((hinfo.hash < start_hash) ||
++                          ((hinfo.hash == start_hash) &&
++                           (hinfo.minor_hash < start_minor_hash)))
++                              continue;
++                      ext3_htree_store_dirent(dir_file, hinfo.hash,
++                                              hinfo.minor_hash, de);
++                      count++;
++              }
++              brelse (bh);
++              hashval = ~1;
++              ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, 
++                                          frame, frames, &err, &hashval);
++              if (next_hash)
++                      *next_hash = hashval;
++              if (ret == -1)
++                      goto errout;
++              /*
++               * Stop if:  (a) there are no more entries, or
++               * (b) we have inserted at least one entry and the
++               * next hash value is not a continuation
++               */
++              if ((ret == 0) ||
++                  (count && ((hashval & 1) == 0)))
++                      break;
++      }
++      dx_release(frames);
++      dxtrace(printk("Fill tree: returned %d entries\n", count));
++      return count;
++errout:
++      dx_release(frames);
++      return (err);
++}
++
++
++/*
++ * Directory block splitting, compacting
++ */
++
++static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
++                      struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
++{
++      int count = 0;
++      char *base = (char *) de;
++      struct dx_hash_info h = *hinfo;
++      
++      while ((char *) de < base + size)
++      {
++              if (de->name_len && de->inode) {
++                      ext3fs_dirhash(de->name, de->name_len, &h);
++                      map_tail--;
++                      map_tail->hash = h.hash;
++                      map_tail->offs = (u32) ((char *) de - base);
++                      count++;
++              }
++              /* XXX: do we need to check rec_len == 0 case? -Chris */
++              de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++      }
++      return count;
++}
++
++static void dx_sort_map (struct dx_map_entry *map, unsigned count)
++{
++        struct dx_map_entry *p, *q, *top = map + count - 1;
++        int more;
++        /* Combsort until bubble sort doesn't suck */
++        while (count > 2)
++      {
++                count = count*10/13;
++                if (count - 9 < 2) /* 9, 10 -> 11 */
++                        count = 11;
++                for (p = top, q = p - count; q >= map; p--, q--)
++                        if (p->hash < q->hash)
++                                swap(*p, *q);
++        }
++        /* Garden variety bubble sort */
++        do {
++                more = 0;
++                q = top;
++                while (q-- > map)
++              {
++                        if (q[1].hash >= q[0].hash)
++                              continue;
++                        swap(*(q+1), *q);
++                        more = 1;
++              }
++      } while(more);
++}
++
++static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
++{
++      struct dx_entry *entries = frame->entries;
++      struct dx_entry *old = frame->at, *new = old + 1;
++      int count = dx_get_count(entries);
++
++      assert(count < dx_get_limit(entries));
++      assert(old < entries + count);
++      memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
++      dx_set_hash(new, hash);
++      dx_set_block(new, block);
++      dx_set_count(entries, count + 1);
++}
++#endif
++
++
++static void ext3_update_dx_flag(struct inode *inode)
++{
++      if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
++                                   EXT3_FEATURE_COMPAT_DIR_INDEX))
++              EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
++}
++
+ /*
+  * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
+  *
+@@ -94,6 +724,7 @@ static int inline search_dirblock(struct
+       return 0;
+ }
++
+ /*
+  *    ext3_find_entry()
+  *
+@@ -105,6 +736,8 @@ static int inline search_dirblock(struct
+  * The returned buffer_head has ->b_count elevated.  The caller is expected
+  * to brelse() it when appropriate.
+  */
++
++      
+ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
+                                       struct ext3_dir_entry_2 ** res_dir)
+ {
+@@ -119,12 +752,32 @@ static struct buffer_head * ext3_find_en
+       int num = 0;
+       int nblocks, i, err;
+       struct inode *dir = dentry->d_parent->d_inode;
++      int namelen;
++      const u8 *name;
++      unsigned blocksize;
+       *res_dir = NULL;
+       sb = dir->i_sb;
+-
++      blocksize = sb->s_blocksize;
++      namelen = dentry->d_name.len;
++      name = dentry->d_name.name;
++      if (namelen > EXT3_NAME_LEN)
++              return NULL;
++#ifdef CONFIG_EXT3_INDEX
++      if (is_dx(dir)) {
++              bh = ext3_dx_find_entry(dentry, res_dir, &err);
++              /*
++               * On success, or if the error was file not found,
++               * return.  Otherwise, fall back to doing a search the
++               * old fashioned way.
++               */
++              if (bh || (err != ERR_BAD_DX_DIR))
++                      return bh;
++              dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
++      }
++#endif
+       nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+-      start = dir->u.ext3_i.i_dir_start_lookup;
++      start = EXT3_I(dir)->i_dir_start_lookup;
+       if (start >= nblocks)
+               start = 0;
+       block = start;
+@@ -166,7 +819,7 @@ restart:
+               i = search_dirblock(bh, dir, dentry,
+                           block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
+               if (i == 1) {
+-                      dir->u.ext3_i.i_dir_start_lookup = block;
++                      EXT3_I(dir)->i_dir_start_lookup = block;
+                       ret = bh;
+                       goto cleanup_and_exit;
+               } else {
+@@ -197,6 +850,66 @@ cleanup_and_exit:
+       return ret;
+ }
++#ifdef CONFIG_EXT3_INDEX
++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
++                     struct ext3_dir_entry_2 **res_dir, int *err)
++{
++      struct super_block * sb;
++      struct dx_hash_info     hinfo;
++      u32 hash;
++      struct dx_frame frames[2], *frame;
++      struct ext3_dir_entry_2 *de, *top;
++      struct buffer_head *bh;
++      unsigned long block;
++      int retval;
++      int namelen = dentry->d_name.len;
++      const u8 *name = dentry->d_name.name;
++      struct inode *dir = dentry->d_parent->d_inode;
++      
++      sb = dir->i_sb;
++      if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err)))
++              return NULL;
++      hash = hinfo.hash;
++      do {
++              block = dx_get_block(frame->at);
++              if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
++                      goto errout;
++              de = (struct ext3_dir_entry_2 *) bh->b_data;
++              top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
++                                     EXT3_DIR_REC_LEN(0));
++              for (; de < top; de = ext3_next_entry(de))
++              if (ext3_match (namelen, name, de)) {
++                      if (!ext3_check_dir_entry("ext3_find_entry",
++                                                dir, de, bh,
++                                (block<<EXT3_BLOCK_SIZE_BITS(sb))
++                                        +((char *)de - bh->b_data))) {
++                              brelse (bh);
++                              goto errout;
++                      }
++                      *res_dir = de;
++                      dx_release (frames);
++                      return bh;
++              }
++              brelse (bh);
++              /* Check to see if we should continue to search */
++              retval = ext3_htree_next_block(dir, hash, frame,
++                                             frames, err, 0);
++              if (retval == -1) {
++                      ext3_warning(sb, __FUNCTION__,
++                           "error reading index page in directory #%lu",
++                           dir->i_ino);
++                      goto errout;
++              }
++      } while (retval == 1);
++      
++      *err = -ENOENT;
++errout:
++      dxtrace(printk("%s not found\n", name));
++      dx_release (frames);
++      return NULL;
++}
++#endif
++
+ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
+ {
+       struct inode * inode;
+@@ -213,8 +926,9 @@ static struct dentry *ext3_lookup(struct
+               brelse (bh);
+               inode = iget(dir->i_sb, ino);
+-              if (!inode)
++              if (!inode) {
+                       return ERR_PTR(-EACCES);
++              }
+       }
+       d_add(dentry, inode);
+       return NULL;
+@@ -238,6 +952,300 @@ static inline void ext3_set_de_type(stru
+               de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ }
++#ifdef CONFIG_EXT3_INDEX
++static struct ext3_dir_entry_2 *
++dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
++{
++      unsigned rec_len = 0;
++
++      while (count--) {
++              struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
++              rec_len = EXT3_DIR_REC_LEN(de->name_len);
++              memcpy (to, de, rec_len);
++              ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len;
++              de->inode = 0;
++              map++;
++              to += rec_len;
++      }
++      return (struct ext3_dir_entry_2 *) (to - rec_len);
++}
++
++static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
++{
++      struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
++      unsigned rec_len = 0;
++
++      prev = to = de;
++      while ((char*)de < base + size) {
++              next = (struct ext3_dir_entry_2 *) ((char *) de +
++                                                  le16_to_cpu(de->rec_len));
++              if (de->inode && de->name_len) {
++                      rec_len = EXT3_DIR_REC_LEN(de->name_len);
++                      if (de > to)
++                              memmove(to, de, rec_len);
++                      to->rec_len = rec_len;
++                      prev = to;
++                      to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
++              }
++              de = next;
++      }
++      return prev;
++}
++
++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
++                      struct buffer_head **bh,struct dx_frame *frame,
++                      struct dx_hash_info *hinfo, int *error)
++{
++      unsigned blocksize = dir->i_sb->s_blocksize;
++      unsigned count, continued;
++      struct buffer_head *bh2;
++      u32 newblock;
++      u32 hash2;
++      struct dx_map_entry *map;
++      char *data1 = (*bh)->b_data, *data2;
++      unsigned split;
++      struct ext3_dir_entry_2 *de = NULL, *de2;
++      int     err;
++
++      bh2 = ext3_append (handle, dir, &newblock, error);
++      if (!(bh2)) {
++              brelse(*bh);
++              *bh = NULL;
++              goto errout;
++      }
++
++      BUFFER_TRACE(*bh, "get_write_access");
++      err = ext3_journal_get_write_access(handle, *bh);
++      if (err) {
++      journal_error:
++              brelse(*bh);
++              brelse(bh2);
++              *bh = NULL;
++              ext3_std_error(dir->i_sb, err);
++              goto errout;
++      }
++      BUFFER_TRACE(frame->bh, "get_write_access");
++      err = ext3_journal_get_write_access(handle, frame->bh);
++      if (err)
++              goto journal_error;
++
++      data2 = bh2->b_data;
++
++      /* create map in the end of data2 block */
++      map = (struct dx_map_entry *) (data2 + blocksize);
++      count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
++                           blocksize, hinfo, map);
++      map -= count;
++      split = count/2; // need to adjust to actual middle
++      dx_sort_map (map, count);
++      hash2 = map[split].hash;
++      continued = hash2 == map[split - 1].hash;
++      dxtrace(printk("Split block %i at %x, %i/%i\n",
++              dx_get_block(frame->at), hash2, split, count-split));
++
++      /* Fancy dance to stay within two buffers */
++      de2 = dx_move_dirents(data1, data2, map + split, count - split);
++      de = dx_pack_dirents(data1,blocksize);
++      de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
++      de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
++      dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
++      dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
++
++      /* Which block gets the new entry? */
++      if (hinfo->hash >= hash2)
++      {
++              swap(*bh, bh2);
++              de = de2;
++      }
++      dx_insert_block (frame, hash2 + continued, newblock);
++      err = ext3_journal_dirty_metadata (handle, bh2);
++      if (err)
++              goto journal_error;
++      err = ext3_journal_dirty_metadata (handle, frame->bh);
++      if (err)
++              goto journal_error;
++      brelse (bh2);
++      dxtrace(dx_show_index ("frame", frame->entries));
++errout:
++      return de;
++}
++#endif
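
do_split() above hashes every live entry in the full leaf, sorts the map, and moves the upper half (by hash) into a freshly appended block; the key inserted into the parent index is hash2 plus a continuation bit when the median hash also occurs in the lower block, so later lookups know the collision chain spans both blocks. A tiny standalone sketch of that split-point and continuation-bit computation (the hash values are made up):

    #include <stdio.h>

    int main(void)
    {
            /* already sorted, as dx_sort_map() leaves it; duplicates straddle the middle */
            unsigned hashes[] = { 0x10, 0x20, 0x30, 0x30, 0x40, 0x50 };
            unsigned count = 6, split = count / 2;
            unsigned hash2 = hashes[split];
            unsigned continued = (hash2 == hashes[split - 1]);

            /* the parent index entry for the new block carries hash2 + continuation */
            printf("split at %u: new block starts at hash %#x, index key %#x\n",
                   split, hash2, hash2 + continued);
            return 0;
    }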
++
++
++/*
++ * Add a new entry into a directory (leaf) block.  If de is non-NULL,
++ * it points to a directory entry which is guaranteed to be large
++ * enough for new directory entry.  If de is NULL, then
++ * add_dirent_to_buf will attempt to search the directory block for
++ * space.  It will return -ENOSPC if no space is available, -EIO if the
++ * directory block is corrupted, and -EEXIST if the entry already exists.
++ * 
++ * NOTE!  bh is NOT released in the case where ENOSPC is returned.  In
++ * all other cases bh is released.
++ */
++static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
++                           struct inode *inode, struct ext3_dir_entry_2 *de,
++                           struct buffer_head * bh)
++{
++      struct inode    *dir = dentry->d_parent->d_inode;
++      const char      *name = dentry->d_name.name;
++      int             namelen = dentry->d_name.len;
++      unsigned long   offset = 0;
++      unsigned short  reclen;
++      int             nlen, rlen, err;
++      char            *top;
++      
++      reclen = EXT3_DIR_REC_LEN(namelen);
++      if (!de) {
++              de = (struct ext3_dir_entry_2 *)bh->b_data;
++              top = bh->b_data + dir->i_sb->s_blocksize - reclen;
++              while ((char *) de <= top) {
++                      if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
++                                                bh, offset)) {
++                              brelse (bh);
++                              return -EIO;
++                      }
++                      if (ext3_match (namelen, name, de)) {
++                              brelse (bh);
++                              return -EEXIST;
++                      }
++                      nlen = EXT3_DIR_REC_LEN(de->name_len);
++                      rlen = le16_to_cpu(de->rec_len);
++                      if ((de->inode? rlen - nlen: rlen) >= reclen)
++                              break;
++                      de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
++                      offset += rlen;
++              }
++              if ((char *) de > top)
++                      return -ENOSPC;
++      }
++      BUFFER_TRACE(bh, "get_write_access");
++      err = ext3_journal_get_write_access(handle, bh);
++      if (err) {
++              ext3_std_error(dir->i_sb, err);
++              brelse(bh);
++              return err;
++      }
++      
++      /* By now the buffer is marked for journaling */
++      nlen = EXT3_DIR_REC_LEN(de->name_len);
++      rlen = le16_to_cpu(de->rec_len);
++      if (de->inode) {
++              struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
++              de1->rec_len = cpu_to_le16(rlen - nlen);
++              de->rec_len = cpu_to_le16(nlen);
++              de = de1;
++      }
++      de->file_type = EXT3_FT_UNKNOWN;
++      if (inode) {
++              de->inode = cpu_to_le32(inode->i_ino);
++              ext3_set_de_type(dir->i_sb, de, inode->i_mode);
++      } else
++              de->inode = 0;
++      de->name_len = namelen;
++      memcpy (de->name, name, namelen);
++      /*
++       * XXX shouldn't update any times until successful
++       * completion of syscall, but too many callers depend
++       * on this.
++       *
++       * XXX similarly, too many callers depend on
++       * ext3_new_inode() setting the times, but error
++       * recovery deletes the inode, so the worst that can
++       * happen is that the times are slightly out of date
++       * and/or different from the directory change time.
++       */
++      dir->i_mtime = dir->i_ctime = CURRENT_TIME;
++      ext3_update_dx_flag(dir);
++      dir->i_version = ++event;
++      ext3_mark_inode_dirty(handle, dir);
++      BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++      err = ext3_journal_dirty_metadata(handle, bh);
++      if (err)
++              ext3_std_error(dir->i_sb, err);
++      brelse(bh);
++      return 0;
++}
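
The space test in add_dirent_to_buf() above, (de->inode ? rlen - nlen : rlen) >= reclen, lets a live entry donate only the padding beyond its minimal record length, while a deleted entry (inode == 0) can be reused in full. A short standalone sketch of that arithmetic, assuming the usual 4-byte record padding behind EXT3_DIR_REC_LEN (the sample name lengths are made up):

    #include <stdio.h>

    /* Same rounding rule as EXT3_DIR_REC_LEN: 8-byte header + name, padded to 4. */
    #define DIR_REC_LEN(name_len)   (((name_len) + 8 + 3) & ~3)

    /* Space a new entry may claim from an existing record. */
    static int usable_space(int in_use, int name_len, int rec_len)
    {
            int nlen = DIR_REC_LEN(name_len);
            return in_use ? rec_len - nlen : rec_len;
    }

    int main(void)
    {
            int reclen = DIR_REC_LEN(9);         /* new 9-char name -> 20 bytes */

            /* live 3-char entry owning a 40-byte record: 40 - 12 = 28 >= 20 */
            printf("split live record:    %s\n",
                   usable_space(1, 3, 40) >= reclen ? "fits" : "no space");
            /* deleted entry (inode == 0) with a 16-byte record: 16 < 20 */
            printf("reuse deleted record: %s\n",
                   usable_space(0, 3, 16) >= reclen ? "fits" : "no space");
            return 0;
    }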
++
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * This converts a one block unindexed directory to a 3 block indexed
++ * directory, and adds the dentry to the indexed directory.
++ */
++static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
++                          struct inode *inode, struct buffer_head *bh)
++{
++      struct inode    *dir = dentry->d_parent->d_inode;
++      const char      *name = dentry->d_name.name;
++      int             namelen = dentry->d_name.len;
++      struct buffer_head *bh2;
++      struct dx_root  *root;
++      struct dx_frame frames[2], *frame;
++      struct dx_entry *entries;
++      struct ext3_dir_entry_2 *de, *de2;
++      char            *data1, *top;
++      unsigned        len;
++      int             retval;
++      unsigned        blocksize;
++      struct dx_hash_info hinfo;
++      u32             block;
++              
++      blocksize =  dir->i_sb->s_blocksize;
++      dxtrace(printk("Creating index\n"));
++      retval = ext3_journal_get_write_access(handle, bh);
++      if (retval) {
++              ext3_std_error(dir->i_sb, retval);
++              brelse(bh);
++              return retval;
++      }
++      root = (struct dx_root *) bh->b_data;
++              
++      EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
++      bh2 = ext3_append (handle, dir, &block, &retval);
++      if (!(bh2)) {
++              brelse(bh);
++              return retval;
++      }
++      data1 = bh2->b_data;
++
++      /* The 0th block becomes the root, move the dirents out */
++      de = (struct ext3_dir_entry_2 *) &root->info;
++      len = ((char *) root) + blocksize - (char *) de;
++      memcpy (data1, de, len);
++      de = (struct ext3_dir_entry_2 *) data1;
++      top = data1 + len;
++      while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top)
++              de = de2;
++      de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
++      /* Initialize the root; the dot dirents already exist */
++      de = (struct ext3_dir_entry_2 *) (&root->dotdot);
++      de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2));
++      memset (&root->info, 0, sizeof(root->info));
++      root->info.info_length = sizeof(root->info);
++      root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version;
++      entries = root->entries;
++      dx_set_block (entries, 1);
++      dx_set_count (entries, 1);
++      dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
++
++      /* Initialize as for dx_probe */
++      hinfo.hash_version = root->info.hash_version;
++      hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed;
++      ext3fs_dirhash(name, namelen, &hinfo);
++      frame = frames;
++      frame->entries = entries;
++      frame->at = entries;
++      frame->bh = bh;
++      bh = bh2;
++      de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
++      dx_release (frames);
++      if (!(de))
++              return retval;
++
++      return add_dirent_to_buf(handle, dentry, inode, de, bh);
++}
++#endif
++
+ /*
+  *    ext3_add_entry()
+  *
+@@ -248,127 +1256,198 @@ static inline void ext3_set_de_type(stru
+  * may not sleep between calling this and putting something into
+  * the entry, as someone else might have used it while you slept.
+  */
+-
+-/*
+- * AKPM: the journalling code here looks wrong on the error paths
+- */
+ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
+       struct inode *inode)
+ {
+       struct inode *dir = dentry->d_parent->d_inode;
+-      const char *name = dentry->d_name.name;
+-      int namelen = dentry->d_name.len;
+       unsigned long offset;
+-      unsigned short rec_len;
+       struct buffer_head * bh;
+-      struct ext3_dir_entry_2 * de, * de1;
++      struct ext3_dir_entry_2 *de;
+       struct super_block * sb;
+       int     retval;
++#ifdef CONFIG_EXT3_INDEX
++      int     dx_fallback=0;
++#endif
++      unsigned blocksize;
++      unsigned nlen, rlen;
++      u32 block, blocks;
+       sb = dir->i_sb;
+-
+-      if (!namelen)
++      blocksize = sb->s_blocksize;
++      if (!dentry->d_name.len)
+               return -EINVAL;
+-      bh = ext3_bread (handle, dir, 0, 0, &retval);
++#ifdef CONFIG_EXT3_INDEX
++      if (is_dx(dir)) {
++              retval = ext3_dx_add_entry(handle, dentry, inode);
++              if (!retval || (retval != ERR_BAD_DX_DIR))
++                      return retval;
++              EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
++              dx_fallback++;
++              ext3_mark_inode_dirty(handle, dir);
++      }
++#endif
++      blocks = dir->i_size >> sb->s_blocksize_bits;
++      for (block = 0, offset = 0; block < blocks; block++) {
++              bh = ext3_bread(handle, dir, block, 0, &retval);
++              if(!bh)
++                      return retval;
++              retval = add_dirent_to_buf(handle, dentry, inode, 0, bh);
++              if (retval != -ENOSPC)
++                      return retval;
++
++#ifdef CONFIG_EXT3_INDEX
++              if (blocks == 1 && !dx_fallback &&
++                  EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
++                      return make_indexed_dir(handle, dentry, inode, bh);
++#endif
++              brelse(bh);
++      }
++      bh = ext3_append(handle, dir, &block, &retval);
+       if (!bh)
+               return retval;
+-      rec_len = EXT3_DIR_REC_LEN(namelen);
+-      offset = 0;
+       de = (struct ext3_dir_entry_2 *) bh->b_data;
+-      while (1) {
+-              if ((char *)de >= sb->s_blocksize + bh->b_data) {
+-                      brelse (bh);
+-                      bh = NULL;
+-                      bh = ext3_bread (handle, dir,
+-                              offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval);
+-                      if (!bh)
+-                              return retval;
+-                      if (dir->i_size <= offset) {
+-                              if (dir->i_size == 0) {
+-                                      brelse(bh);
+-                                      return -ENOENT;
+-                              }
++      de->inode = 0;
++      de->rec_len = cpu_to_le16(rlen = blocksize);
++      nlen = 0;
++      return add_dirent_to_buf(handle, dentry, inode, de, bh);
++}
+-                              ext3_debug ("creating next block\n");
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * Returns 0 for success, or a negative error value
++ */
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++                           struct inode *inode)
++{
++      struct dx_frame frames[2], *frame;
++      struct dx_entry *entries, *at;
++      struct dx_hash_info hinfo;
++      struct buffer_head * bh;
++      struct inode *dir = dentry->d_parent->d_inode;
++      struct super_block * sb = dir->i_sb;
++      struct ext3_dir_entry_2 *de;
++      int err;
+-                              BUFFER_TRACE(bh, "get_write_access");
+-                              ext3_journal_get_write_access(handle, bh);
+-                              de = (struct ext3_dir_entry_2 *) bh->b_data;
+-                              de->inode = 0;
+-                              de->rec_len = le16_to_cpu(sb->s_blocksize);
+-                              dir->u.ext3_i.i_disksize =
+-                                      dir->i_size = offset + sb->s_blocksize;
+-                              dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+-                              ext3_mark_inode_dirty(handle, dir);
+-                      } else {
++      frame = dx_probe(dentry, 0, &hinfo, frames, &err);
++      if (!frame)
++              return err;
++      entries = frame->entries;
++      at = frame->at;
+-                              ext3_debug ("skipping to next block\n");
++      if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
++              goto cleanup;
+-                              de = (struct ext3_dir_entry_2 *) bh->b_data;
+-                      }
+-              }
+-              if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh,
+-                                         offset)) {
+-                      brelse (bh);
+-                      return -ENOENT;
+-              }
+-              if (ext3_match (namelen, name, de)) {
+-                              brelse (bh);
+-                              return -EEXIST;
++      BUFFER_TRACE(bh, "get_write_access");
++      err = ext3_journal_get_write_access(handle, bh);
++      if (err)
++              goto journal_error;
++
++      err = add_dirent_to_buf(handle, dentry, inode, 0, bh);
++      if (err != -ENOSPC) {
++              bh = 0;
++              goto cleanup;
++      }
++
++      /* Block full, should compress but for now just split */
++      dxtrace(printk("using %u of %u node entries\n",
++                     dx_get_count(entries), dx_get_limit(entries)));
++      /* Need to split index? */
++      if (dx_get_count(entries) == dx_get_limit(entries)) {
++              u32 newblock;
++              unsigned icount = dx_get_count(entries);
++              int levels = frame - frames;
++              struct dx_entry *entries2;
++              struct dx_node *node2;
++              struct buffer_head *bh2;
++
++              if (levels && (dx_get_count(frames->entries) ==
++                             dx_get_limit(frames->entries))) {
++                      ext3_warning(sb, __FUNCTION__,
++                                   "Directory index full!\n");
++                      err = -ENOSPC;
++                      goto cleanup;
+               }
+-              if ((le32_to_cpu(de->inode) == 0 &&
+-                              le16_to_cpu(de->rec_len) >= rec_len) ||
+-                  (le16_to_cpu(de->rec_len) >=
+-                              EXT3_DIR_REC_LEN(de->name_len) + rec_len)) {
+-                      BUFFER_TRACE(bh, "get_write_access");
+-                      ext3_journal_get_write_access(handle, bh);
+-                      /* By now the buffer is marked for journaling */
+-                      offset += le16_to_cpu(de->rec_len);
+-                      if (le32_to_cpu(de->inode)) {
+-                              de1 = (struct ext3_dir_entry_2 *) ((char *) de +
+-                                      EXT3_DIR_REC_LEN(de->name_len));
+-                              de1->rec_len =
+-                                      cpu_to_le16(le16_to_cpu(de->rec_len) -
+-                                      EXT3_DIR_REC_LEN(de->name_len));
+-                              de->rec_len = cpu_to_le16(
+-                                              EXT3_DIR_REC_LEN(de->name_len));
+-                              de = de1;
++              bh2 = ext3_append (handle, dir, &newblock, &err);
++              if (!(bh2))
++                      goto cleanup;
++              node2 = (struct dx_node *)(bh2->b_data);
++              entries2 = node2->entries;
++              node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
++              node2->fake.inode = 0;
++              BUFFER_TRACE(frame->bh, "get_write_access");
++              err = ext3_journal_get_write_access(handle, frame->bh);
++              if (err)
++                      goto journal_error;
++              if (levels) {
++                      unsigned icount1 = icount/2, icount2 = icount - icount1;
++                      unsigned hash2 = dx_get_hash(entries + icount1);
++                      dxtrace(printk("Split index %i/%i\n", icount1, icount2));
++                              
++                      BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
++                      err = ext3_journal_get_write_access(handle,
++                                                           frames[0].bh);
++                      if (err)
++                              goto journal_error;
++                              
++                      memcpy ((char *) entries2, (char *) (entries + icount1),
++                              icount2 * sizeof(struct dx_entry));
++                      dx_set_count (entries, icount1);
++                      dx_set_count (entries2, icount2);
++                      dx_set_limit (entries2, dx_node_limit(dir));
++
++                      /* Which index block gets the new entry? */
++                      if (at - entries >= icount1) {
++                              frame->at = at = at - entries - icount1 + entries2;
++                              frame->entries = entries = entries2;
++                              swap(frame->bh, bh2);
+                       }
+-                      de->file_type = EXT3_FT_UNKNOWN;
+-                      if (inode) {
+-                              de->inode = cpu_to_le32(inode->i_ino);
+-                              ext3_set_de_type(dir->i_sb, de, inode->i_mode);
+-                      } else
+-                              de->inode = 0;
+-                      de->name_len = namelen;
+-                      memcpy (de->name, name, namelen);
+-                      /*
+-                       * XXX shouldn't update any times until successful
+-                       * completion of syscall, but too many callers depend
+-                       * on this.
+-                       *
+-                       * XXX similarly, too many callers depend on
+-                       * ext3_new_inode() setting the times, but error
+-                       * recovery deletes the inode, so the worst that can
+-                       * happen is that the times are slightly out of date
+-                       * and/or different from the directory change time.
+-                       */
+-                      dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+-                      dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+-                      dir->i_version = ++event;
+-                      ext3_mark_inode_dirty(handle, dir);
+-                      BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+-                      ext3_journal_dirty_metadata(handle, bh);
+-                      brelse(bh);
+-                      return 0;
++                      dx_insert_block (frames + 0, hash2, newblock);
++                      dxtrace(dx_show_index ("node", frames[1].entries));
++                      dxtrace(dx_show_index ("node",
++                             ((struct dx_node *) bh2->b_data)->entries));
++                      err = ext3_journal_dirty_metadata(handle, bh2);
++                      if (err)
++                              goto journal_error;
++                      brelse (bh2);
++              } else {
++                      dxtrace(printk("Creating second level index...\n"));
++                      memcpy((char *) entries2, (char *) entries,
++                             icount * sizeof(struct dx_entry));
++                      dx_set_limit(entries2, dx_node_limit(dir));
++
++                      /* Set up root */
++                      dx_set_count(entries, 1);
++                      dx_set_block(entries + 0, newblock);
++                      ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
++
++                      /* Add new access path frame */
++                      frame = frames + 1;
++                      frame->at = at = at - entries + entries2;
++                      frame->entries = entries = entries2;
++                      frame->bh = bh2;
++                      err = ext3_journal_get_write_access(handle,
++                                                           frame->bh);
++                      if (err)
++                              goto journal_error;
+               }
+-              offset += le16_to_cpu(de->rec_len);
+-              de = (struct ext3_dir_entry_2 *)
+-                      ((char *) de + le16_to_cpu(de->rec_len));
++              ext3_journal_dirty_metadata(handle, frames[0].bh);
+       }
+-      brelse (bh);
+-      return -ENOSPC;
++      de = do_split(handle, dir, &bh, frame, &hinfo, &err);
++      if (!de)
++              goto cleanup;
++      err = add_dirent_to_buf(handle, dentry, inode, de, bh);
++      bh = 0;
++      goto cleanup;
++      
++journal_error:
++      ext3_std_error(dir->i_sb, err);
++cleanup:
++      if (bh)
++              brelse(bh);
++      dx_release(frames);
++      return err;
+ }
++#endif
+ /*
+  * ext3_delete_entry deletes a directory entry by merging it with the
+@@ -455,9 +1534,11 @@ static int ext3_create (struct inode * d
+       struct inode * inode;
+       int err;
+-      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+-      if (IS_ERR(handle))
++      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++      if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
++      }
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+@@ -481,9 +1562,11 @@ static int ext3_mknod (struct inode * di
+       struct inode *inode;
+       int err;
+-      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+-      if (IS_ERR(handle))
++      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++      if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
++      }
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+@@ -509,9 +1592,11 @@ static int ext3_mkdir(struct inode * dir
+       if (dir->i_nlink >= EXT3_LINK_MAX)
+               return -EMLINK;
+-      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+-      if (IS_ERR(handle))
++      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++      if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
++      }
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+@@ -523,7 +1608,7 @@ static int ext3_mkdir(struct inode * dir
+       inode->i_op = &ext3_dir_inode_operations;
+       inode->i_fop = &ext3_dir_operations;
+-      inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize;
++      inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+       inode->i_blocks = 0;    
+       dir_block = ext3_bread (handle, inode, 0, 1, &err);
+       if (!dir_block) {
+@@ -556,21 +1641,19 @@ static int ext3_mkdir(struct inode * dir
+               inode->i_mode |= S_ISGID;
+       ext3_mark_inode_dirty(handle, inode);
+       err = ext3_add_entry (handle, dentry, inode);
+-      if (err)
+-              goto out_no_entry;
++      if (err) {
++              inode->i_nlink = 0;
++              ext3_mark_inode_dirty(handle, inode);
++              iput (inode);
++              goto out_stop;
++      }
+       dir->i_nlink++;
+-      dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++      ext3_update_dx_flag(dir);
+       ext3_mark_inode_dirty(handle, dir);
+       d_instantiate(dentry, inode);
+ out_stop:
+       ext3_journal_stop(handle, dir);
+       return err;
+-
+-out_no_entry:
+-      inode->i_nlink = 0;
+-      ext3_mark_inode_dirty(handle, inode);
+-      iput (inode);
+-      goto out_stop;
+ }
+ /*
+@@ -657,7 +1740,7 @@ int ext3_orphan_add(handle_t *handle, st
+       int err = 0, rc;
+       
+       lock_super(sb);
+-      if (!list_empty(&inode->u.ext3_i.i_orphan))
++      if (!list_empty(&EXT3_I(inode)->i_orphan))
+               goto out_unlock;
+       /* Orphan handling is only valid for files with data blocks
+@@ -698,7 +1781,7 @@ int ext3_orphan_add(handle_t *handle, st
+        * This is safe: on error we're going to ignore the orphan list
+        * anyway on the next recovery. */
+       if (!err)
+-              list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan);
++              list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
+       jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
+       jbd_debug(4, "orphan inode %ld will point to %d\n",
+@@ -716,25 +1799,26 @@ out_unlock:
+ int ext3_orphan_del(handle_t *handle, struct inode *inode)
+ {
+       struct list_head *prev;
++      struct ext3_inode_info *ei = EXT3_I(inode);
+       struct ext3_sb_info *sbi;
+       unsigned long ino_next;
+       struct ext3_iloc iloc;
+       int err = 0;
+       lock_super(inode->i_sb);
+-      if (list_empty(&inode->u.ext3_i.i_orphan)) {
++      if (list_empty(&ei->i_orphan)) {
+               unlock_super(inode->i_sb);
+               return 0;
+       }
+       ino_next = NEXT_ORPHAN(inode);
+-      prev = inode->u.ext3_i.i_orphan.prev;
++      prev = ei->i_orphan.prev;
+       sbi = EXT3_SB(inode->i_sb);
+       jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+-      list_del(&inode->u.ext3_i.i_orphan);
+-      INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
++      list_del(&ei->i_orphan);
++      INIT_LIST_HEAD(&ei->i_orphan);
+       /* If we're on an error path, we may not have a valid
+        * transaction handle with which to update the orphan list on
+@@ -795,8 +1879,9 @@ static int ext3_rmdir (struct inode * di
+       handle_t *handle;
+       handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+-      if (IS_ERR(handle))
++      if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
++      }
+       retval = -ENOENT;
+       bh = ext3_find_entry (dentry, &de);
+@@ -834,7 +1919,7 @@ static int ext3_rmdir (struct inode * di
+       dir->i_nlink--;
+       inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+       ext3_mark_inode_dirty(handle, inode);
+-      dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++      ext3_update_dx_flag(dir);
+       ext3_mark_inode_dirty(handle, dir);
+ end_rmdir:
+@@ -852,8 +1937,9 @@ static int ext3_unlink(struct inode * di
+       handle_t *handle;
+       handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+-      if (IS_ERR(handle))
++      if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
++      }
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+@@ -880,7 +1966,7 @@ static int ext3_unlink(struct inode * di
+       if (retval)
+               goto end_unlink;
+       dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+-      dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++      ext3_update_dx_flag(dir);
+       ext3_mark_inode_dirty(handle, dir);
+       inode->i_nlink--;
+       if (!inode->i_nlink)
+@@ -906,9 +1992,11 @@ static int ext3_symlink (struct inode * 
+       if (l > dir->i_sb->s_blocksize)
+               return -ENAMETOOLONG;
+-      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5);
+-      if (IS_ERR(handle))
++      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5);
++      if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
++      }
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+@@ -918,7 +2006,7 @@ static int ext3_symlink (struct inode * 
+       if (IS_ERR(inode))
+               goto out_stop;
+-      if (l > sizeof (inode->u.ext3_i.i_data)) {
++      if (l > sizeof (EXT3_I(inode)->i_data)) {
+               inode->i_op = &page_symlink_inode_operations;
+               inode->i_mapping->a_ops = &ext3_aops;
+               /*
+@@ -927,24 +2015,23 @@ static int ext3_symlink (struct inode * 
+                * i_size in generic_commit_write().
+                */
+               err = block_symlink(inode, symname, l);
+-              if (err)
+-                      goto out_no_entry;
++              if (err) {
++                      ext3_dec_count(handle, inode);
++                      ext3_mark_inode_dirty(handle, inode);
++                      iput (inode);
++                      goto out_stop;
++              }
+       } else {
+               inode->i_op = &ext3_fast_symlink_inode_operations;
+-              memcpy((char*)&inode->u.ext3_i.i_data,symname,l);
++              memcpy((char*)&EXT3_I(inode)->i_data,symname,l);
+               inode->i_size = l-1;
+       }
+-      inode->u.ext3_i.i_disksize = inode->i_size;
++      EXT3_I(inode)->i_disksize = inode->i_size;
+       err = ext3_add_nondir(handle, dentry, inode);
++      ext3_mark_inode_dirty(handle, inode);
+ out_stop:
+       ext3_journal_stop(handle, dir);
+       return err;
+-
+-out_no_entry:
+-      ext3_dec_count(handle, inode);
+-      ext3_mark_inode_dirty(handle, inode);
+-      iput (inode);
+-      goto out_stop;
+ }
+ static int ext3_link (struct dentry * old_dentry,
+@@ -957,12 +2044,15 @@ static int ext3_link (struct dentry * ol
+       if (S_ISDIR(inode->i_mode))
+               return -EPERM;
+-      if (inode->i_nlink >= EXT3_LINK_MAX)
++      if (inode->i_nlink >= EXT3_LINK_MAX) {
+               return -EMLINK;
++      }
+-      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS);
+-      if (IS_ERR(handle))
++      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS);
++      if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
++      }
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+@@ -995,9 +2085,11 @@ static int ext3_rename (struct inode * o
+       old_bh = new_bh = dir_bh = NULL;
+-      handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2);
+-      if (IS_ERR(handle))
++      handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
++      if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
++      }
+       if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
+               handle->h_sync = 1;
+@@ -1077,7 +2169,7 @@ static int ext3_rename (struct inode * o
+               new_inode->i_ctime = CURRENT_TIME;
+       }
+       old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+-      old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++      ext3_update_dx_flag(old_dir);
+       if (dir_bh) {
+               BUFFER_TRACE(dir_bh, "get_write_access");
+               ext3_journal_get_write_access(handle, dir_bh);
+@@ -1089,7 +2181,7 @@ static int ext3_rename (struct inode * o
+                       new_inode->i_nlink--;
+               } else {
+                       new_dir->i_nlink++;
+-                      new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++                      ext3_update_dx_flag(new_dir);
+                       ext3_mark_inode_dirty(handle, new_dir);
+               }
+       }
+--- linux-chaos-2.4.20-6/fs/ext3/super.c~ext-2.4-patch-1-chaos 2003-04-09 16:10:38.000000000 -0600
++++ linux-chaos-2.4.20-6-braam/fs/ext3/super.c 2003-04-09 16:18:55.000000000 -0600
+@@ -710,6 +710,7 @@ static int ext3_setup_super(struct super
+       es->s_mtime = cpu_to_le32(CURRENT_TIME);
+       ext3_update_dynamic_rev(sb);
+       EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
++
+       ext3_commit_super (sb, es, 1);
+       if (test_opt (sb, DEBUG))
+               printk (KERN_INFO
+@@ -720,6 +721,7 @@ static int ext3_setup_super(struct super
+                       EXT3_BLOCKS_PER_GROUP(sb),
+                       EXT3_INODES_PER_GROUP(sb),
+                       sbi->s_mount_opt);
++
+       printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ",
+                               bdevname(sb->s_dev));
+       if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
+@@ -893,6 +895,7 @@ static loff_t ext3_max_size(int bits)
+       return res;
+ }
++
+ struct super_block * ext3_read_super (struct super_block * sb, void * data,
+                                     int silent)
+ {
+@@ -1069,6 +1072,9 @@ struct super_block * ext3_read_super (st
+       sbi->s_mount_state = le16_to_cpu(es->s_state);
+       sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb));
+       sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb));
++      for (i=0; i < 4; i++)
++              sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
++      sbi->s_def_hash_version = es->s_def_hash_version;
+       if (sbi->s_blocks_per_group > blocksize * 8) {
+               printk (KERN_ERR
+@@ -1770,6 +1776,7 @@ static void __exit exit_ext3_fs(void)
+       unregister_filesystem(&ext3_fs_type);
+ }
++EXPORT_SYMBOL(ext3_force_commit);
+ EXPORT_SYMBOL(ext3_bread);
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+--- linux-chaos-2.4.20-6/include/linux/ext3_fs.h~ext-2.4-patch-1-chaos 2003-03-12 12:51:27.000000000 -0700
++++ linux-chaos-2.4.20-6-braam/include/linux/ext3_fs.h 2003-04-09 16:18:55.000000000 -0600
+@@ -40,6 +40,11 @@
+ #define EXT3FS_VERSION                "2.4-0.9.19"
+ /*
++ * Always enable hashed directories
++ */
++#define CONFIG_EXT3_INDEX
++
++/*
+  * Debug code
+  */
+ #ifdef EXT3FS_DEBUG
+@@ -437,8 +442,11 @@ struct ext3_super_block {
+ /*E0*/        __u32   s_journal_inum;         /* inode number of journal file */
+       __u32   s_journal_dev;          /* device number of journal file */
+       __u32   s_last_orphan;          /* start of list of inodes to delete */
+-
+-/*EC*/        __u32   s_reserved[197];        /* Padding to the end of the block */
++      __u32   s_hash_seed[4];         /* HTREE hash seed */
++      __u8    s_def_hash_version;     /* Default hash version to use */
++      __u8    s_reserved_char_pad;
++      __u16   s_reserved_word_pad;
++      __u32   s_reserved[192];        /* Padding to the end of the block */
+ };
+ #ifdef __KERNEL__
+@@ -575,9 +583,46 @@ struct ext3_dir_entry_2 {
+ #define EXT3_DIR_ROUND                        (EXT3_DIR_PAD - 1)
+ #define EXT3_DIR_REC_LEN(name_len)    (((name_len) + 8 + EXT3_DIR_ROUND) & \
+                                        ~EXT3_DIR_ROUND)
++/*
++ * Hash Tree Directory indexing
++ * (c) Daniel Phillips, 2001
++ */
++
++#ifdef CONFIG_EXT3_INDEX
++  #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
++                                            EXT3_FEATURE_COMPAT_DIR_INDEX) && \
++                    (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
++#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
++#else
++  #define is_dx(dir) 0
++#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
++#endif
++
++/* Legal values for the dx_root hash_version field: */
++
++#define DX_HASH_LEGACY                0
++#define DX_HASH_HALF_MD4      1
++#define DX_HASH_TEA           2
++
++/* hash info structure used by the directory hash */
++struct dx_hash_info
++{
++      u32             hash;
++      u32             minor_hash;
++      int             hash_version;
++      u32             *seed;
++};
+ #ifdef __KERNEL__
+ /*
++ * Control parameters used by ext3_htree_next_block
++ */
++#define HASH_NB_ALWAYS                1
++
++
++/*
+  * Describe an inode's exact location on disk and in memory
+  */
+ struct ext3_iloc
+@@ -587,6 +632,27 @@ struct ext3_iloc
+       unsigned long block_group;
+ };
++
++/*
++ * This structure is stuffed into the struct file's private_data field
++ * for directories.  It is where we put information so that we can do
++ * readdir operations in hash tree order.
++ */
++struct dir_private_info {
++      rb_root_t       root;
++      rb_node_t       *curr_node;
++      struct fname    *extra_fname;
++      loff_t          last_pos;
++      __u32           curr_hash;
++      __u32           curr_minor_hash;
++      __u32           next_hash;
++};
++
++/*
++ * Special error return code only used by dx_probe() and its callers.
++ */
++#define ERR_BAD_DX_DIR        -75000
++
+ /*
+  * Function prototypes
+  */
+@@ -614,11 +680,20 @@ extern struct ext3_group_desc * ext3_get
+ /* dir.c */
+ extern int ext3_check_dir_entry(const char *, struct inode *,
+-                              struct ext3_dir_entry_2 *, struct buffer_head *,
+-                              unsigned long);
++                              struct ext3_dir_entry_2 *,
++                              struct buffer_head *, unsigned long);
++extern void ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
++                                  __u32 minor_hash,
++                                  struct ext3_dir_entry_2 *dirent);
++extern void ext3_htree_free_dir_info(struct dir_private_info *p);
++
+ /* fsync.c */
+ extern int ext3_sync_file (struct file *, struct dentry *, int);
++/* hash.c */
++extern int ext3fs_dirhash(const char *name, int len, struct
++                        dx_hash_info *hinfo);
++
+ /* ialloc.c */
+ extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int);
+ extern void ext3_free_inode (handle_t *, struct inode *);
+@@ -650,6 +725,8 @@ extern int ext3_ioctl (struct inode *, s
+ /* namei.c */
+ extern int ext3_orphan_add(handle_t *, struct inode *);
+ extern int ext3_orphan_del(handle_t *, struct inode *);
++extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
++                              __u32 start_minor_hash, __u32 *next_hash);
+ /* super.c */
+ extern void ext3_error (struct super_block *, const char *, const char *, ...)
+--- linux-chaos-2.4.20-6/include/linux/ext3_fs_sb.h~ext-2.4-patch-1-chaos      2003-03-12 12:51:27.000000000 -0700
++++ linux-chaos-2.4.20-6-braam/include/linux/ext3_fs_sb.h      2003-04-09 16:18:55.000000000 -0600
+@@ -62,6 +62,8 @@ struct ext3_sb_info {
+       int s_inode_size;
+       int s_first_ino;
+       u32 s_next_generation;
++      u32 s_hash_seed[4];
++      int s_def_hash_version;
+       /* Journaling */
+       struct inode * s_journal_inode;
+--- linux-chaos-2.4.20-6/include/linux/ext3_jbd.h~ext-2.4-patch-1-chaos        2003-03-12 12:51:27.000000000 -0700
++++ linux-chaos-2.4.20-6-braam/include/linux/ext3_jbd.h        2003-04-09 16:18:55.000000000 -0600
+@@ -63,6 +63,8 @@ extern int ext3_writepage_trans_blocks(s
+ #define EXT3_RESERVE_TRANS_BLOCKS     12U
++#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8
++
+ int
+ ext3_mark_iloc_dirty(handle_t *handle, 
+                    struct inode *inode,
+--- linux-chaos-2.4.20-6/include/linux/rbtree.h~ext-2.4-patch-1-chaos  2002-05-07 15:53:47.000000000 -0600
++++ linux-chaos-2.4.20-6-braam/include/linux/rbtree.h  2003-04-09 16:18:55.000000000 -0600
+@@ -120,6 +120,8 @@ rb_root_t;
+ extern void rb_insert_color(rb_node_t *, rb_root_t *);
+ extern void rb_erase(rb_node_t *, rb_root_t *);
++extern rb_node_t *rb_get_first(rb_root_t *root);
++extern rb_node_t *rb_get_next(rb_node_t *n);
+ static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link)
+ {
+--- linux-chaos-2.4.20-6/lib/rbtree.c~ext-2.4-patch-1-chaos    2002-09-25 11:14:03.000000000 -0600
++++ linux-chaos-2.4.20-6-braam/lib/rbtree.c    2003-04-09 16:18:55.000000000 -0600
+@@ -17,6 +17,8 @@
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+   linux/lib/rbtree.c
++
++  rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002
+ */
+ #include <linux/rbtree.h>
+@@ -294,3 +296,43 @@ void rb_erase(rb_node_t * node, rb_root_
+               __rb_erase_color(child, parent, root);
+ }
+ EXPORT_SYMBOL(rb_erase);
++
++/*
++ * This function returns the first node (in sort order) of the tree.
++ */
++rb_node_t *rb_get_first(rb_root_t *root)
++{
++      rb_node_t       *n;
++
++      n = root->rb_node;
++      if (!n)
++              return 0;
++      while (n->rb_left)
++              n = n->rb_left;
++      return n;
++}
++EXPORT_SYMBOL(rb_get_first);
++
++/*
++ * Given a node, this function will return the next node in the tree.
++ */
++rb_node_t *rb_get_next(rb_node_t *n)
++{
++      rb_node_t       *parent;
++
++      if (n->rb_right) {
++              n = n->rb_right;
++              while (n->rb_left)
++                      n = n->rb_left;
++              return n;
++      } else {
++              while ((parent = n->rb_parent)) {
++                      if (n == parent->rb_left)
++                              return parent;
++                      n = parent;
++              }
++              return 0;
++      }
++}
++EXPORT_SYMBOL(rb_get_next);
++
+
+_
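
The rb_get_first()/rb_get_next() helpers added to lib/rbtree.c above walk the tree in order using only parent pointers: the first node is the leftmost, and the successor is either the leftmost node of the right subtree or the nearest ancestor reached from its left subtree. A standalone sketch of the same traversal on a plain parent-linked binary tree (the node type and hand-built tree are illustrative, not the kernel's rb_node_t):

    #include <stdio.h>
    #include <stddef.h>

    struct node {
            int key;
            struct node *left, *right, *parent;
    };

    static struct node *tree_first(struct node *root)
    {
            if (!root)
                    return NULL;
            while (root->left)
                    root = root->left;
            return root;
    }

    static struct node *tree_next(struct node *n)
    {
            struct node *parent;

            if (n->right) {                      /* leftmost of right subtree */
                    n = n->right;
                    while (n->left)
                            n = n->left;
                    return n;
            }
            while ((parent = n->parent)) {       /* climb until we were a left child */
                    if (n == parent->left)
                            return parent;
                    n = parent;
            }
            return NULL;
    }

    int main(void)
    {
            /* hand-built tree:   2
             *                   / \
             *                  1   4
             *                     / \
             *                    3   5        */
            struct node n1 = {1}, n2 = {2}, n3 = {3}, n4 = {4}, n5 = {5};
            struct node *n;

            n2.left = &n1;  n1.parent = &n2;
            n2.right = &n4; n4.parent = &n2;
            n4.left = &n3;  n3.parent = &n4;
            n4.right = &n5; n5.parent = &n4;

            for (n = tree_first(&n2); n; n = tree_next(n))
                    printf("%d ", n->key);       /* prints: 1 2 3 4 5 */
            printf("\n");
            return 0;
    }

This is the same iteration pattern ext3_dx_readdir() needs over the fname tree built in dir.c below.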
diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-1.patch b/lustre/kernel_patches/patches/ext-2.4-patch-1.patch
new file mode 100644 (file)
index 0000000..09caec1
--- /dev/null
@@ -0,0 +1,2527 @@
+ fs/ext3/Makefile           |    2 
+ fs/ext3/dir.c              |  299 +++++++++
+ fs/ext3/file.c             |    3 
+ fs/ext3/hash.c             |  215 ++++++
+ fs/ext3/namei.c            | 1387 ++++++++++++++++++++++++++++++++++++++++-----
+ fs/ext3/super.c            |    7 
+ include/linux/ext3_fs.h    |   85 ++
+ include/linux/ext3_fs_sb.h |    2 
+ include/linux/ext3_jbd.h   |    2 
+ include/linux/rbtree.h     |    2 
+ lib/rbtree.c               |   42 +
+ 11 files changed, 1886 insertions(+), 160 deletions(-)
+
+--- linux-2.4.20/fs/ext3/Makefile~ext-2.4-patch-1      Sat Apr  5 03:56:31 2003
++++ linux-2.4.20-braam/fs/ext3/Makefile        Sat Apr  5 03:57:05 2003
+@@ -12,7 +12,7 @@ O_TARGET := ext3.o
+ export-objs :=        super.o inode.o
+ obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+-              ioctl.o namei.o super.o symlink.o
++              ioctl.o namei.o super.o symlink.o hash.o
+ obj-m    := $(O_TARGET)
+ include $(TOPDIR)/Rules.make
+--- linux-2.4.20/fs/ext3/dir.c~ext-2.4-patch-1 Sat Apr  5 03:56:31 2003
++++ linux-2.4.20-braam/fs/ext3/dir.c   Sat Apr  5 03:56:31 2003
+@@ -21,12 +21,16 @@
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
++#include <linux/slab.h>
++#include <linux/rbtree.h>
+ static unsigned char ext3_filetype_table[] = {
+       DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+ };
+ static int ext3_readdir(struct file *, void *, filldir_t);
++static int ext3_dx_readdir(struct file * filp,
++                         void * dirent, filldir_t filldir);
+ struct file_operations ext3_dir_operations = {
+       read:           generic_read_dir,
+@@ -35,6 +39,17 @@ struct file_operations ext3_dir_operatio
+       fsync:          ext3_sync_file,         /* BKL held */
+ };
++
++static unsigned char get_dtype(struct super_block *sb, int filetype)
++{
++      if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
++          (filetype >= EXT3_FT_MAX))
++              return DT_UNKNOWN;
++
++      return (ext3_filetype_table[filetype]);
++}
++                             
++
+ int ext3_check_dir_entry (const char * function, struct inode * dir,
+                         struct ext3_dir_entry_2 * de,
+                         struct buffer_head * bh,
+@@ -79,6 +94,16 @@ static int ext3_readdir(struct file * fi
+       sb = inode->i_sb;
++      if (is_dx(inode)) {
++              err = ext3_dx_readdir(filp, dirent, filldir);
++              if (err != ERR_BAD_DX_DIR)
++                      return err;
++              /*
++               * We don't set the inode dirty flag since it's not
++               * critical that it get flushed back to the disk.
++               */
++              EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL;
++      }
+       stored = 0;
+       bh = NULL;
+       offset = filp->f_pos & (sb->s_blocksize - 1);
+@@ -162,18 +187,12 @@ revalidate:
+                                * during the copy operation.
+                                */
+                               unsigned long version = filp->f_version;
+-                              unsigned char d_type = DT_UNKNOWN;
+-                              if (EXT3_HAS_INCOMPAT_FEATURE(sb,
+-                                              EXT3_FEATURE_INCOMPAT_FILETYPE)
+-                                              && de->file_type < EXT3_FT_MAX)
+-                                      d_type =
+-                                        ext3_filetype_table[de->file_type];
+                               error = filldir(dirent, de->name,
+                                               de->name_len,
+                                               filp->f_pos,
+                                               le32_to_cpu(de->inode),
+-                                              d_type);
++                                              get_dtype(sb, de->file_type));
+                               if (error)
+                                       break;
+                               if (version != filp->f_version)
+@@ -188,3 +207,269 @@ revalidate:
+       UPDATE_ATIME(inode);
+       return 0;
+ }
++
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * These functions convert from the major/minor hash to an f_pos
++ * value.
++ * 
++ * Currently we only use the major hash number.  This is unfortunate, but
++ * on 32-bit machines, the same VFS interface is used for lseek and
++ * llseek, so if we use the 64 bit offset, then the 32-bit versions of
++ * lseek/telldir/seekdir will blow out spectacularly, and from within
++ * the ext2 low-level routine, we don't know if we're being called by
++ * a 64-bit version of the system call or the 32-bit version of the
++ * system call.  Worse yet, NFSv2 only allows for a 32-bit readdir
++ * cookie.  Sigh.
++ */
++#define hash2pos(major, minor)        (major >> 1)
++#define pos2maj_hash(pos)     ((pos << 1) & 0xffffffff)
++#define pos2min_hash(pos)     (0)
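
The macros above deliberately throw away the minor hash and the low bit of the major hash so that the readdir cookie survives 32-bit lseek/telldir and NFSv2 clients; the round trip back through pos2maj_hash/pos2min_hash is therefore lossy. A small standalone check of that round trip (the hash values are arbitrary):

    #include <stdio.h>

    #define hash2pos(major, minor)  ((major) >> 1)
    #define pos2maj_hash(pos)       (((pos) << 1) & 0xffffffff)
    #define pos2min_hash(pos)       (0)

    int main(void)
    {
            unsigned int major = 0xdeadbeefU, minor = 0x1234U;
            unsigned long long pos = hash2pos(major, minor);

            /* the low bit of the major hash and the whole minor hash are lost */
            printf("in : major %#x minor %#x\n", major, minor);
            printf("pos: %#llx\n", pos);
            printf("out: major %#x minor %#x\n",
                   (unsigned int) pos2maj_hash(pos),
                   (unsigned int) pos2min_hash(pos));
            return 0;
    }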
++
++/*
++ * This structure holds the nodes of the red-black tree used to store
++ * the directory entry in hash order.
++ */
++struct fname {
++      __u32           hash;
++      __u32           minor_hash;
++      rb_node_t       rb_hash; 
++      struct fname    *next;
++      __u32           inode;
++      __u8            name_len;
++      __u8            file_type;
++      char            name[0];
++};
++
++/*
++ * This function implements a non-recursive way of freeing all of the
++ * nodes in the red-black tree.
++ */
++static void free_rb_tree_fname(rb_root_t *root)
++{
++      rb_node_t       *n = root->rb_node;
++      rb_node_t       *parent;
++      struct fname    *fname;
++
++      while (n) {
++              /* Do the node's children first */
++              if ((n)->rb_left) {
++                      n = n->rb_left;
++                      continue;
++              }
++              if (n->rb_right) {
++                      n = n->rb_right;
++                      continue;
++              }
++              /*
++               * The node has no children; free it, and then zero
++               * out parent's link to it.  Finally go to the
++               * beginning of the loop and try to free the parent
++               * node.
++               */
++              parent = n->rb_parent;
++              fname = rb_entry(n, struct fname, rb_hash);
++              kfree(fname);
++              if (!parent)
++                      root->rb_node = 0;
++              else if (parent->rb_left == n)
++                      parent->rb_left = 0;
++              else if (parent->rb_right == n)
++                      parent->rb_right = 0;
++              n = parent;
++      }
++      root->rb_node = 0;
++}
++
++
++struct dir_private_info *create_dir_info(loff_t pos)
++{
++      struct dir_private_info *p;
++
++      p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
++      if (!p)
++              return NULL;
++      p->root.rb_node = 0;
++      p->curr_node = 0;
++      p->extra_fname = 0;
++      p->last_pos = 0;
++      p->curr_hash = pos2maj_hash(pos);
++      p->curr_minor_hash = pos2min_hash(pos);
++      p->next_hash = 0;
++      return p;
++}
++
++void ext3_htree_free_dir_info(struct dir_private_info *p)
++{
++      free_rb_tree_fname(&p->root);
++      kfree(p);
++}
++              
++/*
++ * Given a directory entry, enter it into the fname rb tree.
++ */
++void ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
++                           __u32 minor_hash,
++                           struct ext3_dir_entry_2 *dirent)
++{
++      rb_node_t **p, *parent = NULL;
++      struct fname * fname, *new_fn;
++      struct dir_private_info *info;
++      int len;
++
++      info = (struct dir_private_info *) dir_file->private_data;
++      p = &info->root.rb_node;
++
++      /* Create and allocate the fname structure */
++      len = sizeof(struct fname) + dirent->name_len + 1;
++      new_fn = kmalloc(len, GFP_KERNEL);
++      if (!new_fn)
++              return; /* XXX: silently skip this entry if out of memory */
++      memset(new_fn, 0, len);
++      new_fn->hash = hash;
++      new_fn->minor_hash = minor_hash;
++      new_fn->inode = le32_to_cpu(dirent->inode);
++      new_fn->name_len = dirent->name_len;
++      new_fn->file_type = dirent->file_type;
++      memcpy(new_fn->name, dirent->name, dirent->name_len);
++      new_fn->name[dirent->name_len] = 0;
++      
++      while (*p) {
++              parent = *p;
++              fname = rb_entry(parent, struct fname, rb_hash);
++
++              /*
++               * If the hash and minor hash match up, then we put
++               * them on a linked list.  This rarely happens...
++               */
++              if ((new_fn->hash == fname->hash) &&
++                  (new_fn->minor_hash == fname->minor_hash)) {
++                      new_fn->next = fname->next;
++                      fname->next = new_fn;
++                      return;
++              }
++                      
++              if (new_fn->hash < fname->hash)
++                      p = &(*p)->rb_left;
++              else if (new_fn->hash > fname->hash)
++                      p = &(*p)->rb_right;
++              else if (new_fn->minor_hash < fname->minor_hash)
++                      p = &(*p)->rb_left;
++              else /* if (new_fn->minor_hash > fname->minor_hash) */
++                      p = &(*p)->rb_right;
++      }
++
++      rb_link_node(&new_fn->rb_hash, parent, p);
++      rb_insert_color(&new_fn->rb_hash, &info->root);
++}
++
++
++
++/*
++ * This is a helper function for ext3_dx_readdir.  It calls filldir
++ * for all entries on the fname linked list.  (Normally there is only
++ * one entry on the linked list, unless there are 62 bit hash collisions.)
++ */
++static int call_filldir(struct file * filp, void * dirent,
++                      filldir_t filldir, struct fname *fname)
++{
++      struct dir_private_info *info = filp->private_data;
++      loff_t  curr_pos;
++      struct inode *inode = filp->f_dentry->d_inode;
++      struct super_block * sb;
++      int error;
++
++      sb = inode->i_sb;
++      
++      if (!fname) {
++              printk("call_filldir: called with null fname?!?\n");
++              return 0;
++      }
++      curr_pos = hash2pos(fname->hash, fname->minor_hash);
++      while (fname) {
++              error = filldir(dirent, fname->name,
++                              fname->name_len, curr_pos, 
++                              fname->inode,
++                              get_dtype(sb, fname->file_type));
++              if (error) {
++                      filp->f_pos = curr_pos;
++                      info->extra_fname = fname->next;
++                      return error;
++              }
++              fname = fname->next;
++      }
++      return 0;
++}
++
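++/*
++ * Hash-ordered readdir.  An illustration (the block layout is made up):
++ * if the index maps hashes below 0x7ffffffe to leaf A and the rest to
++ * leaf B, the first pass fills the rbtree starting at curr_hash 0, which
++ * pulls in leaf A and records next_hash = 0x7ffffffe.  Entries are handed
++ * to filldir in hash order, with f_pos encoding the hash via hash2pos().
++ * Once the cached entries are exhausted, curr_hash advances to next_hash
++ * and the tree is refilled from leaf B, so successive getdents() calls
++ * walk the whole directory in hash order.
++ */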
++static int ext3_dx_readdir(struct file * filp,
++                       void * dirent, filldir_t filldir)
++{
++      struct dir_private_info *info = filp->private_data;
++      struct inode *inode = filp->f_dentry->d_inode;
++      struct fname *fname;
++      int     ret;
++
++      if (!info) {
++              info = create_dir_info(filp->f_pos);
++              if (!info)
++                      return -ENOMEM;
++              filp->private_data = info;
++      }
++
++      /* Someone has messed with f_pos; reset the world */
++      if (info->last_pos != filp->f_pos) {
++              free_rb_tree_fname(&info->root);
++              info->curr_node = 0;
++              info->extra_fname = 0;
++              info->curr_hash = pos2maj_hash(filp->f_pos);
++              info->curr_minor_hash = pos2min_hash(filp->f_pos);
++      }
++
++      /*
++       * If there are any leftover names on the hash collision
++       * chain, return them first.
++       */
++      if (info->extra_fname &&
++          call_filldir(filp, dirent, filldir, info->extra_fname))
++              goto finished;
++
++      if (!info->curr_node)
++              info->curr_node = rb_get_first(&info->root);
++
++      while (1) {
++              /*
++               * Fill the rbtree if we have no more entries,
++               * or the inode has changed since we last read in the
++               * cached entries. 
++               */
++              if ((!info->curr_node) ||
++                  (filp->f_version != inode->i_version)) {
++                      info->curr_node = 0;
++                      free_rb_tree_fname(&info->root);
++                      filp->f_version = inode->i_version;
++                      ret = ext3_htree_fill_tree(filp, info->curr_hash,
++                                                 info->curr_minor_hash,
++                                                 &info->next_hash);
++                      if (ret < 0)
++                              return ret;
++                      if (ret == 0)
++                              break;
++                      info->curr_node = rb_get_first(&info->root);
++              }
++
++              fname = rb_entry(info->curr_node, struct fname, rb_hash);
++              info->curr_hash = fname->hash;
++              info->curr_minor_hash = fname->minor_hash;
++              if (call_filldir(filp, dirent, filldir, fname))
++                      break;
++
++              info->curr_node = rb_get_next(info->curr_node);
++              if (!info->curr_node) {
++                      info->curr_hash = info->next_hash;
++                      info->curr_minor_hash = 0;
++              }
++      }
++finished:
++      info->last_pos = filp->f_pos;
++      UPDATE_ATIME(inode);
++      return 0;
++}
++#endif
+--- linux-2.4.20/fs/ext3/file.c~ext-2.4-patch-1        Sat Apr  5 03:56:31 2003
++++ linux-2.4.20-braam/fs/ext3/file.c  Sat Apr  5 03:56:31 2003
+@@ -35,6 +35,9 @@ static int ext3_release_file (struct ino
+ {
+       if (filp->f_mode & FMODE_WRITE)
+               ext3_discard_prealloc (inode);
++      if (is_dx(inode) && filp->private_data)
++              ext3_htree_free_dir_info(filp->private_data);
++
+       return 0;
+ }
+--- /dev/null  Fri Aug 30 17:31:37 2002
++++ linux-2.4.20-braam/fs/ext3/hash.c  Sat Apr  5 03:56:31 2003
+@@ -0,0 +1,215 @@
++/*
++ *  linux/fs/ext3/hash.c
++ *
++ * Copyright (C) 2002 by Theodore Ts'o
++ *
++ * This file is released under the GPL v2.
++ * 
++ * This file may be redistributed under the terms of the GNU Public
++ * License.
++ */
++
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/sched.h>
++#include <linux/ext3_fs.h>
++
++#define DELTA 0x9E3779B9
++
++static void TEA_transform(__u32 buf[4], __u32 const in[])
++{
++      __u32   sum = 0;
++      __u32   b0 = buf[0], b1 = buf[1];
++      __u32   a = in[0], b = in[1], c = in[2], d = in[3];
++      int     n = 16;
++
++      do {                                                    
++              sum += DELTA;                                   
++              b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); 
++              b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); 
++      } while(--n);
++
++      buf[0] += b0;
++      buf[1] += b1;
++}
++
++/* F, G and H are basic MD4 functions: selection, majority, parity */
++#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
++#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
++#define H(x, y, z) ((x) ^ (y) ^ (z))
++
++/*
++ * The generic round function.  The application is so specific that
++ * we don't bother protecting all the arguments with parens, as is generally
++ * good macro practice, in favor of extra legibility.
++ * Rotation is separate from addition to prevent recomputation
++ */
++#define ROUND(f, a, b, c, d, x, s)    \
++      (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s)))
++#define K1 0
++#define K2 013240474631UL
++#define K3 015666365641UL
++
++/*
++ * Basic cut-down MD4 transform.  Returns only 32 bits of result.
++ */
++static void halfMD4Transform (__u32 buf[4], __u32 const in[])
++{
++      __u32   a = buf[0], b = buf[1], c = buf[2], d = buf[3];
++
++      /* Round 1 */
++      ROUND(F, a, b, c, d, in[0] + K1,  3);
++      ROUND(F, d, a, b, c, in[1] + K1,  7);
++      ROUND(F, c, d, a, b, in[2] + K1, 11);
++      ROUND(F, b, c, d, a, in[3] + K1, 19);
++      ROUND(F, a, b, c, d, in[4] + K1,  3);
++      ROUND(F, d, a, b, c, in[5] + K1,  7);
++      ROUND(F, c, d, a, b, in[6] + K1, 11);
++      ROUND(F, b, c, d, a, in[7] + K1, 19);
++
++      /* Round 2 */
++      ROUND(G, a, b, c, d, in[1] + K2,  3);
++      ROUND(G, d, a, b, c, in[3] + K2,  5);
++      ROUND(G, c, d, a, b, in[5] + K2,  9);
++      ROUND(G, b, c, d, a, in[7] + K2, 13);
++      ROUND(G, a, b, c, d, in[0] + K2,  3);
++      ROUND(G, d, a, b, c, in[2] + K2,  5);
++      ROUND(G, c, d, a, b, in[4] + K2,  9);
++      ROUND(G, b, c, d, a, in[6] + K2, 13);
++
++      /* Round 3 */
++      ROUND(H, a, b, c, d, in[3] + K3,  3);
++      ROUND(H, d, a, b, c, in[7] + K3,  9);
++      ROUND(H, c, d, a, b, in[2] + K3, 11);
++      ROUND(H, b, c, d, a, in[6] + K3, 15);
++      ROUND(H, a, b, c, d, in[1] + K3,  3);
++      ROUND(H, d, a, b, c, in[5] + K3,  9);
++      ROUND(H, c, d, a, b, in[0] + K3, 11);
++      ROUND(H, b, c, d, a, in[4] + K3, 15);
++
++      buf[0] += a;
++      buf[1] += b;
++      buf[2] += c;
++      buf[3] += d;
++}
++
++#undef ROUND
++#undef F
++#undef G
++#undef H
++#undef K1
++#undef K2
++#undef K3
++
++/* The old legacy hash */
++static __u32 dx_hack_hash (const char *name, int len)
++{
++      __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
++      while (len--) {
++              __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
++              
++              if (hash & 0x80000000) hash -= 0x7fffffff;
++              hash1 = hash0;
++              hash0 = hash;
++      }
++      return (hash0 << 1);
++}
++
++static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
++{
++      __u32   pad, val;
++      int     i;
++
++      pad = (__u32)len | ((__u32)len << 8);
++      pad |= pad << 16;
++
++      val = pad;
++      if (len > num*4)
++              len = num * 4;
++      for (i=0; i < len; i++) {
++              if ((i % 4) == 0)
++                      val = pad;
++              val = msg[i] + (val << 8);
++              if ((i % 4) == 3) {
++                      *buf++ = val;
++                      val = pad;
++                      num--;
++              }
++      }
++      if (--num >= 0)
++              *buf++ = val;
++      while (--num >= 0)
++              *buf++ = pad;
++}
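++
++/*
++ * For example (illustrative values), str2hashbuf("ab", 2, in, 8) packs the
++ * name into in[0] = 0x02026162 (the name bytes in the low half, with the
++ * length-derived padding byte 0x02 above them) and fills in[1]..in[7]
++ * with the pad word 0x02020202.
++ */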
++
++/*
++ * Returns the hash of a filename.  If len is 0 and name is NULL, then
++ * this function can be used to test whether or not a hash version is
++ * supported.
++ * 
++ * The seed is a 4-longword (32 bits each) "secret" which can be used to
++ * uniquify a hash.  If the seed is all zeros, then a default seed is
++ * used.
++ * 
++ * A particular hash version specifies whether or not the seed is
++ * represented, and whether or not the returned hash is 32 bits or 64
++ * bits.  32 bit hashes will return 0 for the minor hash.
++ */
++int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
++{
++      __u32   hash;
++      __u32   minor_hash = 0;
++      const char      *p;
++      int             i;
++      __u32           in[8], buf[4];
++
++      /* Initialize the default seed for the hash checksum functions */
++      buf[0] = 0x67452301;
++      buf[1] = 0xefcdab89;
++      buf[2] = 0x98badcfe;
++      buf[3] = 0x10325476;
++
++      /* Check to see if the seed is all zeros */
++      if (hinfo->seed) {
++              for (i=0; i < 4; i++) {
++                      if (hinfo->seed[i])
++                              break;
++              }
++              if (i < 4)
++                      memcpy(buf, hinfo->seed, sizeof(buf));
++      }
++              
++      switch (hinfo->hash_version) {
++      case DX_HASH_LEGACY:
++              hash = dx_hack_hash(name, len);
++              break;
++      case DX_HASH_HALF_MD4:
++              p = name;
++              while (len > 0) {
++                      str2hashbuf(p, len, in, 8);
++                      halfMD4Transform(buf, in);
++                      len -= 32;
++                      p += 32;
++              }
++              minor_hash = buf[2];
++              hash = buf[1];
++              break;
++      case DX_HASH_TEA:
++              p = name;
++              while (len > 0) {
++                      str2hashbuf(p, len, in, 4);
++                      TEA_transform(buf, in);
++                      len -= 16;
++                      p += 16;
++              }
++              hash = buf[0];
++              minor_hash = buf[1];
++              break;
++      default:
++              hinfo->hash = 0;
++              return -1;
++      }
++      hinfo->hash = hash & ~1;
++      hinfo->minor_hash = minor_hash;
++      return 0;
++}
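++
++/*
++ * A typical caller (a sketch, modelled on what dx_probe() in namei.c does):
++ *
++ *	struct dx_hash_info hinfo;
++ *
++ *	hinfo.hash_version = root->info.hash_version;
++ *	hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed;
++ *	ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, &hinfo);
++ *
++ * after which hinfo.hash selects an htree leaf and hinfo.minor_hash
++ * breaks ties within it.
++ */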
+--- linux-2.4.20/fs/ext3/namei.c~ext-2.4-patch-1       Sat Apr  5 03:56:31 2003
++++ linux-2.4.20-braam/fs/ext3/namei.c Sat Apr  5 03:56:31 2003
+@@ -16,6 +16,12 @@
+  *        David S. Miller (davem@caip.rutgers.edu), 1995
+  *  Directory entry file type support and forward compatibility hooks
+  *    for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
++ *  Hash Tree Directory indexing (c)
++ *    Daniel Phillips, 2001
++ *  Hash Tree Directory indexing porting
++ *    Christopher Li, 2002
++ *  Hash Tree Directory indexing cleanup
++ *    Theodore Ts'o, 2002
+  */
+ #include <linux/fs.h>
+@@ -38,6 +44,630 @@
+ #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
++static struct buffer_head *ext3_append(handle_t *handle,
++                                      struct inode *inode,
++                                      u32 *block, int *err)
++{
++      struct buffer_head *bh;
++
++      *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
++
++      if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
++              inode->i_size += inode->i_sb->s_blocksize;
++              EXT3_I(inode)->i_disksize = inode->i_size;
++              ext3_journal_get_write_access(handle,bh);
++      }
++      return bh;
++}
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++#ifndef swap
++#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
++#endif
++
++typedef struct { u32 v; } le_u32;
++typedef struct { u16 v; } le_u16;
++
++#ifdef DX_DEBUG
++#define dxtrace(command) command
++#else
++#define dxtrace(command) 
++#endif
++
++struct fake_dirent
++{
++      /*le*/u32 inode;
++      /*le*/u16 rec_len;
++      u8 name_len;
++      u8 file_type;
++};
++
++struct dx_countlimit
++{
++      le_u16 limit;
++      le_u16 count;
++};
++
++struct dx_entry
++{
++      le_u32 hash;
++      le_u32 block;
++};
++
++/*
++ * dx_root_info is laid out so that if it should somehow get overlaid by a
++ * dirent the two low bits of the hash version will be zero.  Therefore, the
++ * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
++ */
++
++struct dx_root
++{
++      struct fake_dirent dot;
++      char dot_name[4];
++      struct fake_dirent dotdot;
++      char dotdot_name[4];
++      struct dx_root_info
++      {
++              le_u32 reserved_zero;
++              u8 hash_version;
++              u8 info_length; /* 8 */
++              u8 indirect_levels;
++              u8 unused_flags;
++      }
++      info;
++      struct dx_entry entries[0];
++};
++
++struct dx_node
++{
++      struct fake_dirent fake;
++      struct dx_entry entries[0];
++};
++
++
++struct dx_frame
++{
++      struct buffer_head *bh;
++      struct dx_entry *entries;
++      struct dx_entry *at;
++};
++
++struct dx_map_entry
++{
++      u32 hash;
++      u32 offs;
++};
++
++#ifdef CONFIG_EXT3_INDEX
++static inline unsigned dx_get_block (struct dx_entry *entry);
++static void dx_set_block (struct dx_entry *entry, unsigned value);
++static inline unsigned dx_get_hash (struct dx_entry *entry);
++static void dx_set_hash (struct dx_entry *entry, unsigned value);
++static unsigned dx_get_count (struct dx_entry *entries);
++static unsigned dx_get_limit (struct dx_entry *entries);
++static void dx_set_count (struct dx_entry *entries, unsigned value);
++static void dx_set_limit (struct dx_entry *entries, unsigned value);
++static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
++static unsigned dx_node_limit (struct inode *dir);
++static struct dx_frame *dx_probe(struct dentry *dentry,
++                               struct inode *dir,
++                               struct dx_hash_info *hinfo,
++                               struct dx_frame *frame,
++                               int *err);
++static void dx_release (struct dx_frame *frames);
++static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
++                      struct dx_hash_info *hinfo, struct dx_map_entry map[]);
++static void dx_sort_map(struct dx_map_entry *map, unsigned count);
++static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
++              struct dx_map_entry *offsets, int count);
++static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
++static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
++static int ext3_htree_next_block(struct inode *dir, __u32 hash,
++                               struct dx_frame *frame,
++                               struct dx_frame *frames, int *err,
++                               __u32 *start_hash);
++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
++                     struct ext3_dir_entry_2 **res_dir, int *err);
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++                           struct inode *inode);
++
++/*
++ * Future: use high four bits of block for coalesce-on-delete flags
++ * Mask them off for now.
++ */
++
++static inline unsigned dx_get_block (struct dx_entry *entry)
++{
++      return le32_to_cpu(entry->block.v) & 0x00ffffff;
++}
++
++static inline void dx_set_block (struct dx_entry *entry, unsigned value)
++{
++      entry->block.v = cpu_to_le32(value);
++}
++
++static inline unsigned dx_get_hash (struct dx_entry *entry)
++{
++      return le32_to_cpu(entry->hash.v);
++}
++
++static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
++{
++      entry->hash.v = cpu_to_le32(value);
++}
++
++static inline unsigned dx_get_count (struct dx_entry *entries)
++{
++      return le16_to_cpu(((struct dx_countlimit *) entries)->count.v);
++}
++
++static inline unsigned dx_get_limit (struct dx_entry *entries)
++{
++      return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v);
++}
++
++static inline void dx_set_count (struct dx_entry *entries, unsigned value)
++{
++      ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value);
++}
++
++static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
++{
++      ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value);
++}
++
++static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
++{
++      unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
++              EXT3_DIR_REC_LEN(2) - infosize;
++      return 0? 20: entry_space / sizeof(struct dx_entry);
++}
++
++static inline unsigned dx_node_limit (struct inode *dir)
++{
++      unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
++      return 0? 22: entry_space / sizeof(struct dx_entry);
++}
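++
++/*
++ * With a 4096-byte block (an illustrative case): EXT3_DIR_REC_LEN(1) and
++ * EXT3_DIR_REC_LEN(2) are both 12 bytes and dx_root_info is 8 bytes, so
++ * the root block holds (4096 - 12 - 12 - 8) / 8 = 508 dx_entries, while
++ * an interior node holds (4096 - 8) / 8 = 511.
++ */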
++
++/*
++ * Debug
++ */
++#ifdef DX_DEBUG
++struct stats
++{ 
++      unsigned names;
++      unsigned space;
++      unsigned bcount;
++};
++
++static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
++                               int size, int show_names)
++{
++      unsigned names = 0, space = 0;
++      char *base = (char *) de;
++      struct dx_hash_info h = *hinfo;
++      
++      printk("names: ");
++      while ((char *) de < base + size)
++      {
++              if (de->inode)
++              {
++                      if (show_names)
++                      {
++                              int len = de->name_len;
++                              char *name = de->name;
++                              while (len--) printk("%c", *name++);
++                              ext3fs_dirhash(de->name, de->name_len, &h);
++                              printk(":%x.%u ", h.hash,
++                                     ((char *) de - base));
++                      }
++                      space += EXT3_DIR_REC_LEN(de->name_len);
++                      names++;
++              }
++              de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++      }
++      printk("(%i)\n", names);
++      return (struct stats) { names, space, 1 };
++}
++
++struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
++                           struct dx_entry *entries, int levels)
++{
++      unsigned blocksize = dir->i_sb->s_blocksize;
++      unsigned count = dx_get_count (entries), names = 0, space = 0, i;
++      unsigned bcount = 0;
++      struct buffer_head *bh;
++      int err;
++      printk("%i indexed blocks...\n", count);
++      for (i = 0; i < count; i++, entries++)
++      {
++              u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
++              u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
++              struct stats stats;
++              printk("%s%3u:%03u hash %8x/%8x ",levels?"":"   ", i, block, hash, range);
++              if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
++              stats = levels?
++                 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
++                 dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
++              names += stats.names;
++              space += stats.space;
++              bcount += stats.bcount;
++              brelse (bh);
++      }
++      if (bcount)
++              printk("%snames %u, fullness %u (%u%%)\n", levels?"":"   ",
++                      names, space/bcount,(space/bcount)*100/blocksize);
++      return (struct stats) { names, space, bcount};
++}
++#endif /* DX_DEBUG */
++
++/*
++ * Probe for a directory leaf block to search.
++ *
++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
++ * error in the directory index, and the caller should fall back to
++ * searching the directory normally.  The callers of dx_probe **MUST**
++ * check for this error code, and make sure it never gets reflected
++ * back to userspace.
++ */
++static struct dx_frame *
++dx_probe(struct dentry *dentry, struct inode *dir,
++       struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
++{
++      unsigned count, indirect;
++      struct dx_entry *at, *entries, *p, *q, *m;
++      struct dx_root *root;
++      struct buffer_head *bh;
++      struct dx_frame *frame = frame_in;
++      u32 hash;
++
++      frame->bh = NULL;
++      if (dentry)
++              dir = dentry->d_parent->d_inode;
++      if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
++              goto fail;
++      root = (struct dx_root *) bh->b_data;
++      if (root->info.hash_version != DX_HASH_TEA &&
++          root->info.hash_version != DX_HASH_HALF_MD4 &&
++          root->info.hash_version != DX_HASH_LEGACY) {
++              ext3_warning(dir->i_sb, __FUNCTION__,
++                           "Unrecognised inode hash code %d",
++                           root->info.hash_version);
++              brelse(bh);
++              *err = ERR_BAD_DX_DIR;
++              goto fail;
++      }
++      hinfo->hash_version = root->info.hash_version;
++      hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed;
++      if (dentry)
++              ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
++      hash = hinfo->hash;
++
++      if (root->info.unused_flags & 1) {
++              ext3_warning(dir->i_sb, __FUNCTION__,
++                           "Unimplemented inode hash flags: %#06x",
++                           root->info.unused_flags);
++              brelse(bh);
++              *err = ERR_BAD_DX_DIR;
++              goto fail;
++      }
++
++      if ((indirect = root->info.indirect_levels) > 1) {
++              ext3_warning(dir->i_sb, __FUNCTION__,
++                           "Unimplemented inode hash depth: %#06x",
++                           root->info.indirect_levels);
++              brelse(bh);
++              *err = ERR_BAD_DX_DIR;
++              goto fail;
++      }
++
++      entries = (struct dx_entry *) (((char *)&root->info) +
++                                     root->info.info_length);
++      assert(dx_get_limit(entries) == dx_root_limit(dir,
++                                                    root->info.info_length));
++      dxtrace (printk("Look up %x", hash));
++      while (1)
++      {
++              count = dx_get_count(entries);
++              assert (count && count <= dx_get_limit(entries));
++              p = entries + 1;
++              q = entries + count - 1;
++              while (p <= q)
++              {
++                      m = p + (q - p)/2;
++                      dxtrace(printk("."));
++                      if (dx_get_hash(m) > hash)
++                              q = m - 1;
++                      else
++                              p = m + 1;
++              }
++
++              if (0) // linear search cross check
++              {
++                      unsigned n = count - 1;
++                      at = entries;
++                      while (n--)
++                      {
++                              dxtrace(printk(","));
++                              if (dx_get_hash(++at) > hash)
++                              {
++                                      at--;
++                                      break;
++                              }
++                      }
++                      assert (at == p - 1);
++              }
++
++              at = p - 1;
++              dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
++              frame->bh = bh;
++              frame->entries = entries;
++              frame->at = at;
++              if (!indirect--) return frame;
++              if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
++                      goto fail2;
++              at = entries = ((struct dx_node *) bh->b_data)->entries;
++              assert (dx_get_limit(entries) == dx_node_limit (dir));
++              frame++;
++      }
++fail2:
++      while (frame >= frame_in) {
++              brelse(frame->bh);
++              frame--;
++      }
++fail:
++      return NULL;
++}
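++
++/*
++ * A sketch of how callers drive dx_probe() (ext3_dx_find_entry() below is
++ * the real thing):
++ *
++ *	struct dx_frame frames[2], *frame;
++ *
++ *	frame = dx_probe(dentry, 0, &hinfo, frames, &err);
++ *	if (!frame)
++ *		return NULL;	(err is set; ERR_BAD_DX_DIR => fall back)
++ *	block = dx_get_block(frame->at);
++ *	...
++ *	dx_release(frames);
++ *
++ * frames[] must have room for two levels, since indirect_levels is at
++ * most one.
++ */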
++
++static void dx_release (struct dx_frame *frames)
++{
++      if (frames[0].bh == NULL)
++              return;
++
++      if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
++              brelse(frames[1].bh);
++      brelse(frames[0].bh);
++}
++
++/*
++ * This function increments the frame pointer to search the next leaf
++ * block, and reads in the intervening index nodes when the search
++ * needs to continue.  Whether or not the search continues is
++ * controlled by the hash parameter.  If the hash value is even, then
++ * the search is only continued if the next block starts with that
++ * hash value.  This is used if we are searching for a specific file.
++ *
++ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
++ *
++ * This function returns 1 if the caller should continue to search,
++ * or 0 if it should not.  If there is an error reading one of the
++ * index blocks, it will return -1.
++ *
++ * If start_hash is non-null, it will be filled in with the starting
++ * hash of the next page.
++ */
++static int ext3_htree_next_block(struct inode *dir, __u32 hash,
++                               struct dx_frame *frame,
++                               struct dx_frame *frames, int *err,
++                               __u32 *start_hash)
++{
++      struct dx_frame *p;
++      struct buffer_head *bh;
++      int num_frames = 0;
++      __u32 bhash;
++
++      *err = ENOENT;
++      p = frame;
++      /*
++       * Find the next leaf page by incrementing the frame pointer.
++       * If we run out of entries in the interior node, loop around and
++       * increment the pointer in the parent node.  When we break out
++       * of this loop, num_frames indicates the number of interior
++       * nodes that need to be read.
++       */
++      while (1) {
++              if (++(p->at) < p->entries + dx_get_count(p->entries))
++                      break;
++              if (p == frames)
++                      return 0;
++              num_frames++;
++              p--;
++      }
++
++      /*
++       * If the hash is 1, then continue only if the next page has a
++       * continuation hash of any value.  This is used for readdir
++       * handling.  Otherwise, check to see if the hash matches the
++       * desired continuation hash.  If it doesn't, return, since
++       * there's no point in reading the successive index pages.
++       */
++      bhash = dx_get_hash(p->at);
++      if (start_hash)
++              *start_hash = bhash;
++      if ((hash & 1) == 0) {
++              if ((bhash & ~1) != hash)
++                      return 0;
++      }
++      /*
++       * If the hash is HASH_NB_ALWAYS, we always go to the next
++       * block so no check is necessary
++       */
++      while (num_frames--) {
++              if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
++                                    0, err)))
++                      return -1; /* Failure */
++              p++;
++              brelse (p->bh);
++              p->bh = bh;
++              p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
++      }
++      return 1;
++}
++
++
++/*
++ * p is at least 6 bytes before the end of page
++ */
++static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p)
++{
++      return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
++}
++
++/*
++ * This function fills a red-black tree with information from a
++ * directory.  We start scanning the directory in hash order, starting
++ * at start_hash and start_minor_hash.
++ *
++ * This function returns the number of entries inserted into the tree,
++ * or a negative error code.
++ */
++int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
++                       __u32 start_minor_hash, __u32 *next_hash)
++{
++      struct dx_hash_info hinfo;
++      struct buffer_head *bh;
++      struct ext3_dir_entry_2 *de, *top;
++      struct dx_frame frames[2], *frame;
++      struct inode *dir;
++      int block, err;
++      int count = 0;
++      int ret;
++      __u32 hashval;
++      
++      dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
++                     start_minor_hash));
++      dir = dir_file->f_dentry->d_inode;
++      hinfo.hash = start_hash;
++      hinfo.minor_hash = 0;
++      frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
++      if (!frame)
++              return err;
++
++      while (1) {
++              block = dx_get_block(frame->at);
++              dxtrace(printk("Reading block %d\n", block));
++              if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
++                      goto errout;
++      
++              de = (struct ext3_dir_entry_2 *) bh->b_data;
++              top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize -
++                                     EXT3_DIR_REC_LEN(0));
++              for (; de < top; de = ext3_next_entry(de)) {
++                      ext3fs_dirhash(de->name, de->name_len, &hinfo);
++                      if ((hinfo.hash < start_hash) ||
++                          ((hinfo.hash == start_hash) &&
++                           (hinfo.minor_hash < start_minor_hash)))
++                              continue;
++                      ext3_htree_store_dirent(dir_file, hinfo.hash,
++                                              hinfo.minor_hash, de);
++                      count++;
++              }
++              brelse (bh);
++              hashval = ~1;
++              ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, 
++                                          frame, frames, &err, &hashval);
++              if (next_hash)
++                      *next_hash = hashval;
++              if (ret == -1)
++                      goto errout;
++              /*
++               * Stop if:  (a) there are no more entries, or
++               * (b) we have inserted at least one entry and the
++               * next hash value is not a continuation
++               */
++              if ((ret == 0) ||
++                  (count && ((hashval & 1) == 0)))
++                      break;
++      }
++      dx_release(frames);
++      dxtrace(printk("Fill tree: returned %d entries\n", count));
++      return count;
++errout:
++      dx_release(frames);
++      return (err);
++}
++
++
++/*
++ * Directory block splitting, compacting
++ */
++
++static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
++                      struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
++{
++      int count = 0;
++      char *base = (char *) de;
++      struct dx_hash_info h = *hinfo;
++      
++      while ((char *) de < base + size)
++      {
++              if (de->name_len && de->inode) {
++                      ext3fs_dirhash(de->name, de->name_len, &h);
++                      map_tail--;
++                      map_tail->hash = h.hash;
++                      map_tail->offs = (u32) ((char *) de - base);
++                      count++;
++              }
++              /* XXX: do we need to check rec_len == 0 case? -Chris */
++              de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++      }
++      return count;
++}
++
++static void dx_sort_map (struct dx_map_entry *map, unsigned count)
++{
++        struct dx_map_entry *p, *q, *top = map + count - 1;
++        int more;
++        /* Combsort until bubble sort doesn't suck */
++        while (count > 2)
++      {
++                count = count*10/13;
++                if (count - 9 < 2) /* 9, 10 -> 11 */
++                        count = 11;
++                for (p = top, q = p - count; q >= map; p--, q--)
++                        if (p->hash < q->hash)
++                                swap(*p, *q);
++        }
++        /* Garden variety bubble sort */
++        do {
++                more = 0;
++                q = top;
++                while (q-- > map)
++              {
++                        if (q[1].hash >= q[0].hash)
++                              continue;
++                        swap(*(q+1), *q);
++                        more = 1;
++              }
++      } while(more);
++}
++
++static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
++{
++      struct dx_entry *entries = frame->entries;
++      struct dx_entry *old = frame->at, *new = old + 1;
++      int count = dx_get_count(entries);
++
++      assert(count < dx_get_limit(entries));
++      assert(old < entries + count);
++      memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
++      dx_set_hash(new, hash);
++      dx_set_block(new, block);
++      dx_set_count(entries, count + 1);
++}
++#endif
++
++
++static void ext3_update_dx_flag(struct inode *inode)
++{
++      if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
++                                   EXT3_FEATURE_COMPAT_DIR_INDEX))
++              EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
++}
++
+ /*
+  * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
+  *
+@@ -94,6 +724,7 @@ static int inline search_dirblock(struct
+       return 0;
+ }
++
+ /*
+  *    ext3_find_entry()
+  *
+@@ -105,6 +736,8 @@ static int inline search_dirblock(struct
+  * The returned buffer_head has ->b_count elevated.  The caller is expected
+  * to brelse() it when appropriate.
+  */
++
++      
+ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
+                                       struct ext3_dir_entry_2 ** res_dir)
+ {
+@@ -119,12 +752,32 @@ static struct buffer_head * ext3_find_en
+       int num = 0;
+       int nblocks, i, err;
+       struct inode *dir = dentry->d_parent->d_inode;
++      int namelen;
++      const u8 *name;
++      unsigned blocksize;
+       *res_dir = NULL;
+       sb = dir->i_sb;
+-
++      blocksize = sb->s_blocksize;
++      namelen = dentry->d_name.len;
++      name = dentry->d_name.name;
++      if (namelen > EXT3_NAME_LEN)
++              return NULL;
++#ifdef CONFIG_EXT3_INDEX
++      if (is_dx(dir)) {
++              bh = ext3_dx_find_entry(dentry, res_dir, &err);
++              /*
++               * On success, or if the error was file not found,
++               * return.  Otherwise, fall back to doing a search the
++               * old fashioned way.
++               */
++              if (bh || (err != ERR_BAD_DX_DIR))
++                      return bh;
++              dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
++      }
++#endif
+       nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+-      start = dir->u.ext3_i.i_dir_start_lookup;
++      start = EXT3_I(dir)->i_dir_start_lookup;
+       if (start >= nblocks)
+               start = 0;
+       block = start;
+@@ -165,7 +818,7 @@ restart:
+               i = search_dirblock(bh, dir, dentry,
+                           block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
+               if (i == 1) {
+-                      dir->u.ext3_i.i_dir_start_lookup = block;
++                      EXT3_I(dir)->i_dir_start_lookup = block;
+                       ret = bh;
+                       goto cleanup_and_exit;
+               } else {
+@@ -196,6 +849,66 @@ cleanup_and_exit:
+       return ret;
+ }
++#ifdef CONFIG_EXT3_INDEX
++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
++                     struct ext3_dir_entry_2 **res_dir, int *err)
++{
++      struct super_block * sb;
++      struct dx_hash_info     hinfo;
++      u32 hash;
++      struct dx_frame frames[2], *frame;
++      struct ext3_dir_entry_2 *de, *top;
++      struct buffer_head *bh;
++      unsigned long block;
++      int retval;
++      int namelen = dentry->d_name.len;
++      const u8 *name = dentry->d_name.name;
++      struct inode *dir = dentry->d_parent->d_inode;
++      
++      sb = dir->i_sb;
++      if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err)))
++              return NULL;
++      hash = hinfo.hash;
++      do {
++              block = dx_get_block(frame->at);
++              if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
++                      goto errout;
++              de = (struct ext3_dir_entry_2 *) bh->b_data;
++              top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
++                                     EXT3_DIR_REC_LEN(0));
++              for (; de < top; de = ext3_next_entry(de))
++              if (ext3_match (namelen, name, de)) {
++                      if (!ext3_check_dir_entry("ext3_find_entry",
++                                                dir, de, bh,
++                                (block<<EXT3_BLOCK_SIZE_BITS(sb))
++                                        +((char *)de - bh->b_data))) {
++                              brelse (bh);
++                              goto errout;
++                      }
++                      *res_dir = de;
++                      dx_release (frames);
++                      return bh;
++              }
++              brelse (bh);
++              /* Check to see if we should continue to search */
++              retval = ext3_htree_next_block(dir, hash, frame,
++                                             frames, err, 0);
++              if (retval == -1) {
++                      ext3_warning(sb, __FUNCTION__,
++                           "error reading index page in directory #%lu",
++                           dir->i_ino);
++                      goto errout;
++              }
++      } while (retval == 1);
++      
++      *err = -ENOENT;
++errout:
++      dxtrace(printk("%s not found\n", name));
++      dx_release (frames);
++      return NULL;
++}
++#endif
++
+ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
+ {
+       struct inode * inode;
+@@ -212,8 +925,9 @@ static struct dentry *ext3_lookup(struct
+               brelse (bh);
+               inode = iget(dir->i_sb, ino);
+-              if (!inode)
++              if (!inode) {
+                       return ERR_PTR(-EACCES);
++              }
+       }
+       d_add(dentry, inode);
+       return NULL;
+@@ -237,6 +951,300 @@ static inline void ext3_set_de_type(stru
+               de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ }
++#ifdef CONFIG_EXT3_INDEX
++static struct ext3_dir_entry_2 *
++dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
++{
++      unsigned rec_len = 0;
++
++      while (count--) {
++              struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
++              rec_len = EXT3_DIR_REC_LEN(de->name_len);
++              memcpy (to, de, rec_len);
++              ((struct ext3_dir_entry_2 *) to)->rec_len = cpu_to_le16(rec_len);
++              de->inode = 0;
++              map++;
++              to += rec_len;
++      }
++      return (struct ext3_dir_entry_2 *) (to - rec_len);
++}
++
++static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
++{
++      struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
++      unsigned rec_len = 0;
++
++      prev = to = de;
++      while ((char*)de < base + size) {
++              next = (struct ext3_dir_entry_2 *) ((char *) de +
++                                                  le16_to_cpu(de->rec_len));
++              if (de->inode && de->name_len) {
++                      rec_len = EXT3_DIR_REC_LEN(de->name_len);
++                      if (de > to)
++                              memmove(to, de, rec_len);
++                      to->rec_len = cpu_to_le16(rec_len);
++                      prev = to;
++                      to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
++              }
++              de = next;
++      }
++      return prev;
++}
++
++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
++                      struct buffer_head **bh,struct dx_frame *frame,
++                      struct dx_hash_info *hinfo, int *error)
++{
++      unsigned blocksize = dir->i_sb->s_blocksize;
++      unsigned count, continued;
++      struct buffer_head *bh2;
++      u32 newblock;
++      u32 hash2;
++      struct dx_map_entry *map;
++      char *data1 = (*bh)->b_data, *data2;
++      unsigned split;
++      struct ext3_dir_entry_2 *de = NULL, *de2;
++      int     err;
++
++      bh2 = ext3_append (handle, dir, &newblock, error);
++      if (!(bh2)) {
++              brelse(*bh);
++              *bh = NULL;
++              goto errout;
++      }
++
++      BUFFER_TRACE(*bh, "get_write_access");
++      err = ext3_journal_get_write_access(handle, *bh);
++      if (err) {
++      journal_error:
++              brelse(*bh);
++              brelse(bh2);
++              *bh = NULL;
++              ext3_std_error(dir->i_sb, err);
++              goto errout;
++      }
++      BUFFER_TRACE(frame->bh, "get_write_access");
++      err = ext3_journal_get_write_access(handle, frame->bh);
++      if (err)
++              goto journal_error;
++
++      data2 = bh2->b_data;
++
++      /* create map in the end of data2 block */
++      map = (struct dx_map_entry *) (data2 + blocksize);
++      count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
++                           blocksize, hinfo, map);
++      map -= count;
++      split = count/2; // need to adjust to actual middle
++      dx_sort_map (map, count);
++      hash2 = map[split].hash;
++      continued = hash2 == map[split - 1].hash;
++      dxtrace(printk("Split block %i at %x, %i/%i\n",
++              dx_get_block(frame->at), hash2, split, count-split));
++
++      /* Fancy dance to stay within two buffers */
++      de2 = dx_move_dirents(data1, data2, map + split, count - split);
++      de = dx_pack_dirents(data1,blocksize);
++      de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
++      de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
++      dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
++      dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
++
++      /* Which block gets the new entry? */
++      if (hinfo->hash >= hash2)
++      {
++              swap(*bh, bh2);
++              de = de2;
++      }
++      dx_insert_block (frame, hash2 + continued, newblock);
++      err = ext3_journal_dirty_metadata (handle, bh2);
++      if (err)
++              goto journal_error;
++      err = ext3_journal_dirty_metadata (handle, frame->bh);
++      if (err)
++              goto journal_error;
++      brelse (bh2);
++      dxtrace(dx_show_index ("frame", frame->entries));
++errout:
++      return de;
++}
++#endif
++
++
++/*
++ * Add a new entry into a directory (leaf) block.  If de is non-NULL,
++ * it points to a directory entry which is guaranteed to be large
++ * enough for the new directory entry.  If de is NULL, then
++ * add_dirent_to_buf will attempt to search the directory block for
++ * space.  It will return -ENOSPC if no space is available, -EIO if the
++ * directory block cannot be validated, and -EEXIST if the directory
++ * entry already exists.
++ * 
++ * NOTE!  bh is NOT released in the case where ENOSPC is returned.  In
++ * all other cases bh is released.
++ */
++static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
++                           struct inode *inode, struct ext3_dir_entry_2 *de,
++                           struct buffer_head * bh)
++{
++      struct inode    *dir = dentry->d_parent->d_inode;
++      const char      *name = dentry->d_name.name;
++      int             namelen = dentry->d_name.len;
++      unsigned long   offset = 0;
++      unsigned short  reclen;
++      int             nlen, rlen, err;
++      char            *top;
++      
++      reclen = EXT3_DIR_REC_LEN(namelen);
++      if (!de) {
++              de = (struct ext3_dir_entry_2 *)bh->b_data;
++              top = bh->b_data + dir->i_sb->s_blocksize - reclen;
++              while ((char *) de <= top) {
++                      if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
++                                                bh, offset)) {
++                              brelse (bh);
++                              return -EIO;
++                      }
++                      if (ext3_match (namelen, name, de)) {
++                              brelse (bh);
++                              return -EEXIST;
++                      }
++                      nlen = EXT3_DIR_REC_LEN(de->name_len);
++                      rlen = le16_to_cpu(de->rec_len);
++                      if ((de->inode? rlen - nlen: rlen) >= reclen)
++                              break;
++                      de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
++                      offset += rlen;
++              }
++              if ((char *) de > top)
++                      return -ENOSPC;
++      }
++      BUFFER_TRACE(bh, "get_write_access");
++      err = ext3_journal_get_write_access(handle, bh);
++      if (err) {
++              ext3_std_error(dir->i_sb, err);
++              brelse(bh);
++              return err;
++      }
++      
++      /* By now the buffer is marked for journaling */
++      nlen = EXT3_DIR_REC_LEN(de->name_len);
++      rlen = le16_to_cpu(de->rec_len);
++      if (de->inode) {
++              struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
++              de1->rec_len = cpu_to_le16(rlen - nlen);
++              de->rec_len = cpu_to_le16(nlen);
++              de = de1;
++      }
++      de->file_type = EXT3_FT_UNKNOWN;
++      if (inode) {
++              de->inode = cpu_to_le32(inode->i_ino);
++              ext3_set_de_type(dir->i_sb, de, inode->i_mode);
++      } else
++              de->inode = 0;
++      de->name_len = namelen;
++      memcpy (de->name, name, namelen);
++      /*
++       * XXX shouldn't update any times until successful
++       * completion of syscall, but too many callers depend
++       * on this.
++       *
++       * XXX similarly, too many callers depend on
++       * ext3_new_inode() setting the times, but error
++       * recovery deletes the inode, so the worst that can
++       * happen is that the times are slightly out of date
++       * and/or different from the directory change time.
++       */
++      dir->i_mtime = dir->i_ctime = CURRENT_TIME;
++      ext3_update_dx_flag(dir);
++      dir->i_version = ++event;
++      ext3_mark_inode_dirty(handle, dir);
++      BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++      err = ext3_journal_dirty_metadata(handle, bh);
++      if (err)
++              ext3_std_error(dir->i_sb, err);
++      brelse(bh);
++      return 0;
++}
++
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * This converts a one block unindexed directory to a 3 block indexed
++ * directory, and adds the dentry to the indexed directory.
++ */
++static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
++                          struct inode *inode, struct buffer_head *bh)
++{
++      struct inode    *dir = dentry->d_parent->d_inode;
++      const char      *name = dentry->d_name.name;
++      int             namelen = dentry->d_name.len;
++      struct buffer_head *bh2;
++      struct dx_root  *root;
++      struct dx_frame frames[2], *frame;
++      struct dx_entry *entries;
++      struct ext3_dir_entry_2 *de, *de2;
++      char            *data1, *top;
++      unsigned        len;
++      int             retval;
++      unsigned        blocksize;
++      struct dx_hash_info hinfo;
++      u32             block;
++              
++      blocksize =  dir->i_sb->s_blocksize;
++      dxtrace(printk("Creating index\n"));
++      retval = ext3_journal_get_write_access(handle, bh);
++      if (retval) {
++              ext3_std_error(dir->i_sb, retval);
++              brelse(bh);
++              return retval;
++      }
++      root = (struct dx_root *) bh->b_data;
++              
++      EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
++      bh2 = ext3_append (handle, dir, &block, &retval);
++      if (!(bh2)) {
++              brelse(bh);
++              return retval;
++      }
++      data1 = bh2->b_data;
++
++      /* The 0th block becomes the root, move the dirents out */
++      de = (struct ext3_dir_entry_2 *) &root->info;
++      len = ((char *) root) + blocksize - (char *) de;
++      memcpy (data1, de, len);
++      de = (struct ext3_dir_entry_2 *) data1;
++      top = data1 + len;
++      while ((char *) (de2 = ext3_next_entry(de)) < top)
++              de = de2;
++      de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
++      /* Initialize the root; the dot dirents already exist */
++      de = (struct ext3_dir_entry_2 *) (&root->dotdot);
++      de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2));
++      memset (&root->info, 0, sizeof(root->info));
++      root->info.info_length = sizeof(root->info);
++      root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version;
++      entries = root->entries;
++      dx_set_block (entries, 1);
++      dx_set_count (entries, 1);
++      dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
++
++      /* Initialize as for dx_probe */
++      hinfo.hash_version = root->info.hash_version;
++      hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed;
++      ext3fs_dirhash(name, namelen, &hinfo);
++      frame = frames;
++      frame->entries = entries;
++      frame->at = entries;
++      frame->bh = bh;
++      bh = bh2;
++      de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
++      dx_release (frames);
++      if (!(de))
++              return retval;
++
++      return add_dirent_to_buf(handle, dentry, inode, de, bh);
++}
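++
++/*
++ * Resulting layout (an illustration of the common case): block 0 becomes
++ * the dx_root, keeping "." and ".." and gaining dx_root_info plus the
++ * index entries; the old directory entries are copied into block 1; and
++ * do_split() then redistributes them by hash between block 1 and a newly
++ * appended block 2, returning the half into which the new name is added.
++ */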
++#endif
++
+ /*
+  *    ext3_add_entry()
+  *
+@@ -247,127 +1255,198 @@ static inline void ext3_set_de_type(stru
+  * may not sleep between calling this and putting something into
+  * the entry, as someone else might have used it while you slept.
+  */
+-
+-/*
+- * AKPM: the journalling code here looks wrong on the error paths
+- */
+ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
+       struct inode *inode)
+ {
+       struct inode *dir = dentry->d_parent->d_inode;
+-      const char *name = dentry->d_name.name;
+-      int namelen = dentry->d_name.len;
+       unsigned long offset;
+-      unsigned short rec_len;
+       struct buffer_head * bh;
+-      struct ext3_dir_entry_2 * de, * de1;
++      struct ext3_dir_entry_2 *de;
+       struct super_block * sb;
+       int     retval;
++#ifdef CONFIG_EXT3_INDEX
++      int     dx_fallback=0;
++#endif
++      unsigned blocksize;
++      unsigned nlen, rlen;
++      u32 block, blocks;
+       sb = dir->i_sb;
+-
+-      if (!namelen)
++      blocksize = sb->s_blocksize;
++      if (!dentry->d_name.len)
+               return -EINVAL;
+-      bh = ext3_bread (handle, dir, 0, 0, &retval);
++#ifdef CONFIG_EXT3_INDEX
++      if (is_dx(dir)) {
++              retval = ext3_dx_add_entry(handle, dentry, inode);
++              if (!retval || (retval != ERR_BAD_DX_DIR))
++                      return retval;
++              EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
++              dx_fallback++;
++              ext3_mark_inode_dirty(handle, dir);
++      }
++#endif
++      blocks = dir->i_size >> sb->s_blocksize_bits;
++      for (block = 0, offset = 0; block < blocks; block++) {
++              bh = ext3_bread(handle, dir, block, 0, &retval);
++              if(!bh)
++                      return retval;
++              retval = add_dirent_to_buf(handle, dentry, inode, 0, bh);
++              if (retval != -ENOSPC)
++                      return retval;
++
++#ifdef CONFIG_EXT3_INDEX
++              if (blocks == 1 && !dx_fallback &&
++                  EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
++                      return make_indexed_dir(handle, dentry, inode, bh);
++#endif
++              brelse(bh);
++      }
++      bh = ext3_append(handle, dir, &block, &retval);
+       if (!bh)
+               return retval;
+-      rec_len = EXT3_DIR_REC_LEN(namelen);
+-      offset = 0;
+       de = (struct ext3_dir_entry_2 *) bh->b_data;
+-      while (1) {
+-              if ((char *)de >= sb->s_blocksize + bh->b_data) {
+-                      brelse (bh);
+-                      bh = NULL;
+-                      bh = ext3_bread (handle, dir,
+-                              offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval);
+-                      if (!bh)
+-                              return retval;
+-                      if (dir->i_size <= offset) {
+-                              if (dir->i_size == 0) {
+-                                      brelse(bh);
+-                                      return -ENOENT;
+-                              }
++      de->inode = 0;
++      de->rec_len = cpu_to_le16(rlen = blocksize);
++      nlen = 0;
++      return add_dirent_to_buf(handle, dentry, inode, de, bh);
++}
+-                              ext3_debug ("creating next block\n");
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * Returns 0 for success, or a negative error value
++ */
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++                           struct inode *inode)
++{
++      struct dx_frame frames[2], *frame;
++      struct dx_entry *entries, *at;
++      struct dx_hash_info hinfo;
++      struct buffer_head * bh;
++      struct inode *dir = dentry->d_parent->d_inode;
++      struct super_block * sb = dir->i_sb;
++      struct ext3_dir_entry_2 *de;
++      int err;
+-                              BUFFER_TRACE(bh, "get_write_access");
+-                              ext3_journal_get_write_access(handle, bh);
+-                              de = (struct ext3_dir_entry_2 *) bh->b_data;
+-                              de->inode = 0;
+-                              de->rec_len = le16_to_cpu(sb->s_blocksize);
+-                              dir->u.ext3_i.i_disksize =
+-                                      dir->i_size = offset + sb->s_blocksize;
+-                              dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+-                              ext3_mark_inode_dirty(handle, dir);
+-                      } else {
++      frame = dx_probe(dentry, 0, &hinfo, frames, &err);
++      if (!frame)
++              return err;
++      entries = frame->entries;
++      at = frame->at;
+-                              ext3_debug ("skipping to next block\n");
++      if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
++              goto cleanup;
+-                              de = (struct ext3_dir_entry_2 *) bh->b_data;
+-                      }
+-              }
+-              if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh,
+-                                         offset)) {
+-                      brelse (bh);
+-                      return -ENOENT;
+-              }
+-              if (ext3_match (namelen, name, de)) {
+-                              brelse (bh);
+-                              return -EEXIST;
++      BUFFER_TRACE(bh, "get_write_access");
++      err = ext3_journal_get_write_access(handle, bh);
++      if (err)
++              goto journal_error;
++
++      err = add_dirent_to_buf(handle, dentry, inode, 0, bh);
++      if (err != -ENOSPC) {
++              bh = 0;
++              goto cleanup;
++      }
++
++      /* Block full, should compress but for now just split */
++      dxtrace(printk("using %u of %u node entries\n",
++                     dx_get_count(entries), dx_get_limit(entries)));
++      /* Need to split index? */
++      if (dx_get_count(entries) == dx_get_limit(entries)) {
++              u32 newblock;
++              unsigned icount = dx_get_count(entries);
++              int levels = frame - frames;
++              struct dx_entry *entries2;
++              struct dx_node *node2;
++              struct buffer_head *bh2;
++
++              if (levels && (dx_get_count(frames->entries) ==
++                             dx_get_limit(frames->entries))) {
++                      ext3_warning(sb, __FUNCTION__,
++                                   "Directory index full!\n");
++                      err = -ENOSPC;
++                      goto cleanup;
+               }
+-              if ((le32_to_cpu(de->inode) == 0 &&
+-                              le16_to_cpu(de->rec_len) >= rec_len) ||
+-                  (le16_to_cpu(de->rec_len) >=
+-                              EXT3_DIR_REC_LEN(de->name_len) + rec_len)) {
+-                      BUFFER_TRACE(bh, "get_write_access");
+-                      ext3_journal_get_write_access(handle, bh);
+-                      /* By now the buffer is marked for journaling */
+-                      offset += le16_to_cpu(de->rec_len);
+-                      if (le32_to_cpu(de->inode)) {
+-                              de1 = (struct ext3_dir_entry_2 *) ((char *) de +
+-                                      EXT3_DIR_REC_LEN(de->name_len));
+-                              de1->rec_len =
+-                                      cpu_to_le16(le16_to_cpu(de->rec_len) -
+-                                      EXT3_DIR_REC_LEN(de->name_len));
+-                              de->rec_len = cpu_to_le16(
+-                                              EXT3_DIR_REC_LEN(de->name_len));
+-                              de = de1;
++              bh2 = ext3_append (handle, dir, &newblock, &err);
++              if (!(bh2))
++                      goto cleanup;
++              node2 = (struct dx_node *)(bh2->b_data);
++              entries2 = node2->entries;
++              node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
++              node2->fake.inode = 0;
++              BUFFER_TRACE(frame->bh, "get_write_access");
++              err = ext3_journal_get_write_access(handle, frame->bh);
++              if (err)
++                      goto journal_error;
++              if (levels) {
++                      unsigned icount1 = icount/2, icount2 = icount - icount1;
++                      unsigned hash2 = dx_get_hash(entries + icount1);
++                      dxtrace(printk("Split index %i/%i\n", icount1, icount2));
++                              
++                      BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
++                      err = ext3_journal_get_write_access(handle,
++                                                           frames[0].bh);
++                      if (err)
++                              goto journal_error;
++                              
++                      memcpy ((char *) entries2, (char *) (entries + icount1),
++                              icount2 * sizeof(struct dx_entry));
++                      dx_set_count (entries, icount1);
++                      dx_set_count (entries2, icount2);
++                      dx_set_limit (entries2, dx_node_limit(dir));
++
++                      /* Which index block gets the new entry? */
++                      if (at - entries >= icount1) {
++                              frame->at = at = at - entries - icount1 + entries2;
++                              frame->entries = entries = entries2;
++                              swap(frame->bh, bh2);
+                       }
+-                      de->file_type = EXT3_FT_UNKNOWN;
+-                      if (inode) {
+-                              de->inode = cpu_to_le32(inode->i_ino);
+-                              ext3_set_de_type(dir->i_sb, de, inode->i_mode);
+-                      } else
+-                              de->inode = 0;
+-                      de->name_len = namelen;
+-                      memcpy (de->name, name, namelen);
+-                      /*
+-                       * XXX shouldn't update any times until successful
+-                       * completion of syscall, but too many callers depend
+-                       * on this.
+-                       *
+-                       * XXX similarly, too many callers depend on
+-                       * ext3_new_inode() setting the times, but error
+-                       * recovery deletes the inode, so the worst that can
+-                       * happen is that the times are slightly out of date
+-                       * and/or different from the directory change time.
+-                       */
+-                      dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+-                      dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+-                      dir->i_version = ++event;
+-                      ext3_mark_inode_dirty(handle, dir);
+-                      BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+-                      ext3_journal_dirty_metadata(handle, bh);
+-                      brelse(bh);
+-                      return 0;
++                      dx_insert_block (frames + 0, hash2, newblock);
++                      dxtrace(dx_show_index ("node", frames[1].entries));
++                      dxtrace(dx_show_index ("node",
++                             ((struct dx_node *) bh2->b_data)->entries));
++                      err = ext3_journal_dirty_metadata(handle, bh2);
++                      if (err)
++                              goto journal_error;
++                      brelse (bh2);
++              } else {
++                      dxtrace(printk("Creating second level index...\n"));
++                      memcpy((char *) entries2, (char *) entries,
++                             icount * sizeof(struct dx_entry));
++                      dx_set_limit(entries2, dx_node_limit(dir));
++
++                      /* Set up root */
++                      dx_set_count(entries, 1);
++                      dx_set_block(entries + 0, newblock);
++                      ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
++
++                      /* Add new access path frame */
++                      frame = frames + 1;
++                      frame->at = at = at - entries + entries2;
++                      frame->entries = entries = entries2;
++                      frame->bh = bh2;
++                      err = ext3_journal_get_write_access(handle,
++                                                           frame->bh);
++                      if (err)
++                              goto journal_error;
+               }
+-              offset += le16_to_cpu(de->rec_len);
+-              de = (struct ext3_dir_entry_2 *)
+-                      ((char *) de + le16_to_cpu(de->rec_len));
++              ext3_journal_dirty_metadata(handle, frames[0].bh);
+       }
+-      brelse (bh);
+-      return -ENOSPC;
++      de = do_split(handle, dir, &bh, frame, &hinfo, &err);
++      if (!de)
++              goto cleanup;
++      err = add_dirent_to_buf(handle, dentry, inode, de, bh);
++      bh = 0;
++      goto cleanup;
++      
++journal_error:
++      ext3_std_error(dir->i_sb, err);
++cleanup:
++      if (bh)
++              brelse(bh);
++      dx_release(frames);
++      return err;
+ }
++#endif
+ /*
+  * ext3_delete_entry deletes a directory entry by merging it with the
+@@ -451,9 +1530,11 @@ static int ext3_create (struct inode * d
+       struct inode * inode;
+       int err;
+-      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+-      if (IS_ERR(handle))
++      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++      if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
++      }
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+@@ -478,9 +1559,11 @@ static int ext3_mknod (struct inode * di
+       struct inode *inode;
+       int err;
+-      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+-      if (IS_ERR(handle))
++      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++      if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
++      }
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+@@ -507,9 +1590,11 @@ static int ext3_mkdir(struct inode * dir
+       if (dir->i_nlink >= EXT3_LINK_MAX)
+               return -EMLINK;
+-      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+-      if (IS_ERR(handle))
++      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++      if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
++      }
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+@@ -521,7 +1606,7 @@ static int ext3_mkdir(struct inode * dir
+       inode->i_op = &ext3_dir_inode_operations;
+       inode->i_fop = &ext3_dir_operations;
+-      inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize;
++      inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+       inode->i_blocks = 0;    
+       dir_block = ext3_bread (handle, inode, 0, 1, &err);
+       if (!dir_block) {
+@@ -554,21 +1639,19 @@ static int ext3_mkdir(struct inode * dir
+               inode->i_mode |= S_ISGID;
+       ext3_mark_inode_dirty(handle, inode);
+       err = ext3_add_entry (handle, dentry, inode);
+-      if (err)
+-              goto out_no_entry;
++      if (err) {
++              inode->i_nlink = 0;
++              ext3_mark_inode_dirty(handle, inode);
++              iput (inode);
++              goto out_stop;
++      }
+       dir->i_nlink++;
+-      dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++      ext3_update_dx_flag(dir);
+       ext3_mark_inode_dirty(handle, dir);
+       d_instantiate(dentry, inode);
+ out_stop:
+       ext3_journal_stop(handle, dir);
+       return err;
+-
+-out_no_entry:
+-      inode->i_nlink = 0;
+-      ext3_mark_inode_dirty(handle, inode);
+-      iput (inode);
+-      goto out_stop;
+ }
+ /*
+@@ -655,7 +1738,7 @@ int ext3_orphan_add(handle_t *handle, st
+       int err = 0, rc;
+       
+       lock_super(sb);
+-      if (!list_empty(&inode->u.ext3_i.i_orphan))
++      if (!list_empty(&EXT3_I(inode)->i_orphan))
+               goto out_unlock;
+       /* Orphan handling is only valid for files with data blocks
+@@ -696,7 +1779,7 @@ int ext3_orphan_add(handle_t *handle, st
+        * This is safe: on error we're going to ignore the orphan list
+        * anyway on the next recovery. */
+       if (!err)
+-              list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan);
++              list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
+       jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
+       jbd_debug(4, "orphan inode %ld will point to %d\n",
+@@ -714,25 +1797,26 @@ out_unlock:
+ int ext3_orphan_del(handle_t *handle, struct inode *inode)
+ {
+       struct list_head *prev;
++      struct ext3_inode_info *ei = EXT3_I(inode);
+       struct ext3_sb_info *sbi;
+       ino_t ino_next; 
+       struct ext3_iloc iloc;
+       int err = 0;
+       
+       lock_super(inode->i_sb);
+-      if (list_empty(&inode->u.ext3_i.i_orphan)) {
++      if (list_empty(&ei->i_orphan)) {
+               unlock_super(inode->i_sb);
+               return 0;
+       }
+       ino_next = NEXT_ORPHAN(inode);
+-      prev = inode->u.ext3_i.i_orphan.prev;
++      prev = ei->i_orphan.prev;
+       sbi = EXT3_SB(inode->i_sb);
+       jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino);
+-      list_del(&inode->u.ext3_i.i_orphan);
+-      INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
++      list_del(&ei->i_orphan);
++      INIT_LIST_HEAD(&ei->i_orphan);
+       /* If we're on an error path, we may not have a valid
+        * transaction handle with which to update the orphan list on
+@@ -793,8 +1877,9 @@ static int ext3_rmdir (struct inode * di
+       handle_t *handle;
+       handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+-      if (IS_ERR(handle))
++      if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
++      }
+       retval = -ENOENT;
+       bh = ext3_find_entry (dentry, &de);
+@@ -832,7 +1917,7 @@ static int ext3_rmdir (struct inode * di
+       dir->i_nlink--;
+       inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+       ext3_mark_inode_dirty(handle, inode);
+-      dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++      ext3_update_dx_flag(dir);
+       ext3_mark_inode_dirty(handle, dir);
+ end_rmdir:
+@@ -850,8 +1935,9 @@ static int ext3_unlink(struct inode * di
+       handle_t *handle;
+       handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+-      if (IS_ERR(handle))
++      if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
++      }
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+@@ -878,7 +1964,7 @@ static int ext3_unlink(struct inode * di
+       if (retval)
+               goto end_unlink;
+       dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+-      dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++      ext3_update_dx_flag(dir);
+       ext3_mark_inode_dirty(handle, dir);
+       inode->i_nlink--;
+       if (!inode->i_nlink)
+@@ -904,9 +1990,11 @@ static int ext3_symlink (struct inode * 
+       if (l > dir->i_sb->s_blocksize)
+               return -ENAMETOOLONG;
+-      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5);
+-      if (IS_ERR(handle))
++      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5);
++      if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
++      }
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+@@ -916,7 +2004,7 @@ static int ext3_symlink (struct inode * 
+       if (IS_ERR(inode))
+               goto out_stop;
+-      if (l > sizeof (inode->u.ext3_i.i_data)) {
++      if (l > sizeof (EXT3_I(inode)->i_data)) {
+               inode->i_op = &page_symlink_inode_operations;
+               inode->i_mapping->a_ops = &ext3_aops;
+               /*
+@@ -925,25 +2013,23 @@ static int ext3_symlink (struct inode * 
+                * i_size in generic_commit_write().
+                */
+               err = block_symlink(inode, symname, l);
+-              if (err)
+-                      goto out_no_entry;
++              if (err) {
++                      ext3_dec_count(handle, inode);
++                      ext3_mark_inode_dirty(handle, inode);
++                      iput (inode);
++                      goto out_stop;
++              }
+       } else {
+               inode->i_op = &ext3_fast_symlink_inode_operations;
+-              memcpy((char*)&inode->u.ext3_i.i_data,symname,l);
++              memcpy((char*)&EXT3_I(inode)->i_data,symname,l);
+               inode->i_size = l-1;
+       }
+-      inode->u.ext3_i.i_disksize = inode->i_size;
++      EXT3_I(inode)->i_disksize = inode->i_size;
+       err = ext3_add_nondir(handle, dentry, inode);
+       ext3_mark_inode_dirty(handle, inode);
+ out_stop:
+       ext3_journal_stop(handle, dir);
+       return err;
+-
+-out_no_entry:
+-      ext3_dec_count(handle, inode);
+-      ext3_mark_inode_dirty(handle, inode);
+-      iput (inode);
+-      goto out_stop;
+ }
+ static int ext3_link (struct dentry * old_dentry,
+@@ -956,12 +2042,15 @@ static int ext3_link (struct dentry * ol
+       if (S_ISDIR(inode->i_mode))
+               return -EPERM;
+-      if (inode->i_nlink >= EXT3_LINK_MAX)
++      if (inode->i_nlink >= EXT3_LINK_MAX) {
+               return -EMLINK;
++      }
+-      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS);
+-      if (IS_ERR(handle))
++      handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS);
++      if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
++      }
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+@@ -995,9 +2084,11 @@ static int ext3_rename (struct inode * o
+       old_bh = new_bh = dir_bh = NULL;
+-      handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2);
+-      if (IS_ERR(handle))
++      handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS +
++                                      EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
++      if (IS_ERR(handle)) {
+               return PTR_ERR(handle);
++      }
+       if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
+               handle->h_sync = 1;
+@@ -1077,7 +2168,7 @@ static int ext3_rename (struct inode * o
+               new_inode->i_ctime = CURRENT_TIME;
+       }
+       old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+-      old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++      ext3_update_dx_flag(old_dir);
+       if (dir_bh) {
+               BUFFER_TRACE(dir_bh, "get_write_access");
+               ext3_journal_get_write_access(handle, dir_bh);
+@@ -1089,7 +2180,7 @@ static int ext3_rename (struct inode * o
+                       new_inode->i_nlink--;
+               } else {
+                       new_dir->i_nlink++;
+-                      new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++                      ext3_update_dx_flag(new_dir);
+                       ext3_mark_inode_dirty(handle, new_dir);
+               }
+       }
+--- linux-2.4.20/fs/ext3/super.c~ext-2.4-patch-1       Sat Apr  5 03:56:31 2003
++++ linux-2.4.20-braam/fs/ext3/super.c Sat Apr  5 03:56:31 2003
+@@ -707,6 +707,7 @@ static int ext3_setup_super(struct super
+       es->s_mtime = cpu_to_le32(CURRENT_TIME);
+       ext3_update_dynamic_rev(sb);
+       EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
++
+       ext3_commit_super (sb, es, 1);
+       if (test_opt (sb, DEBUG))
+               printk (KERN_INFO
+@@ -717,6 +718,7 @@ static int ext3_setup_super(struct super
+                       EXT3_BLOCKS_PER_GROUP(sb),
+                       EXT3_INODES_PER_GROUP(sb),
+                       sbi->s_mount_opt);
++
+       printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ",
+                               bdevname(sb->s_dev));
+       if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
+@@ -890,6 +892,7 @@ static loff_t ext3_max_size(int bits)
+       return res;
+ }
++
+ struct super_block * ext3_read_super (struct super_block * sb, void * data,
+                                     int silent)
+ {
+@@ -1066,6 +1069,9 @@ struct super_block * ext3_read_super (st
+       sbi->s_mount_state = le16_to_cpu(es->s_state);
+       sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb));
+       sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb));
++      for (i=0; i < 4; i++)
++              sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
++      sbi->s_def_hash_version = es->s_def_hash_version;
+       if (sbi->s_blocks_per_group > blocksize * 8) {
+               printk (KERN_ERR
+@@ -1769,6 +1775,7 @@ static void __exit exit_ext3_fs(void)
+       unregister_filesystem(&ext3_fs_type);
+ }
++EXPORT_SYMBOL(ext3_force_commit);
+ EXPORT_SYMBOL(ext3_bread);
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+--- linux-2.4.20/include/linux/ext3_fs.h~ext-2.4-patch-1       Sat Apr  5 03:56:31 2003
++++ linux-2.4.20-braam/include/linux/ext3_fs.h Sat Apr  5 03:56:31 2003
+@@ -40,6 +40,11 @@
+ #define EXT3FS_VERSION                "2.4-0.9.19"
+ /*
++ * Always enable hashed directories
++ */
++#define CONFIG_EXT3_INDEX
++
++/*
+  * Debug code
+  */
+ #ifdef EXT3FS_DEBUG
+@@ -437,8 +442,11 @@ struct ext3_super_block {
+ /*E0*/        __u32   s_journal_inum;         /* inode number of journal file */
+       __u32   s_journal_dev;          /* device number of journal file */
+       __u32   s_last_orphan;          /* start of list of inodes to delete */
+-
+-/*EC*/        __u32   s_reserved[197];        /* Padding to the end of the block */
++      __u32   s_hash_seed[4];         /* HTREE hash seed */
++      __u8    s_def_hash_version;     /* Default hash version to use */
++      __u8    s_reserved_char_pad;
++      __u16   s_reserved_word_pad;
++      __u32   s_reserved[192];        /* Padding to the end of the block */
+ };
+ #ifdef __KERNEL__
+@@ -575,9 +583,46 @@ struct ext3_dir_entry_2 {
+ #define EXT3_DIR_ROUND                        (EXT3_DIR_PAD - 1)
+ #define EXT3_DIR_REC_LEN(name_len)    (((name_len) + 8 + EXT3_DIR_ROUND) & \
+                                        ~EXT3_DIR_ROUND)
++/*
++ * Hash Tree Directory indexing
++ * (c) Daniel Phillips, 2001
++ */
++
++#ifdef CONFIG_EXT3_INDEX
++  #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
++                                            EXT3_FEATURE_COMPAT_DIR_INDEX) && \
++                    (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
++#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
++#else
++  #define is_dx(dir) 0
++#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
++#endif
++
++/* Legal values for the dx_root hash_version field: */
++
++#define DX_HASH_LEGACY                0
++#define DX_HASH_HALF_MD4      1
++#define DX_HASH_TEA           2
++
++/* hash info structure used by the directory hash */
++struct dx_hash_info
++{
++      u32             hash;
++      u32             minor_hash;
++      int             hash_version;
++      u32             *seed;
++};
+ #ifdef __KERNEL__
+ /*
++ * Control parameters used by ext3_htree_next_block
++ */
++#define HASH_NB_ALWAYS                1
++
++
++/*
+  * Describe an inode's exact location on disk and in memory
+  */
+ struct ext3_iloc
+@@ -587,6 +632,27 @@ struct ext3_iloc
+       unsigned long block_group;
+ };
++
++/*
++ * This structure is stuffed into the struct file's private_data field
++ * for directories.  It is where we put information so that we can do
++ * readdir operations in hash tree order.
++ */
++struct dir_private_info {
++      rb_root_t       root;
++      rb_node_t       *curr_node;
++      struct fname    *extra_fname;
++      loff_t          last_pos;
++      __u32           curr_hash;
++      __u32           curr_minor_hash;
++      __u32           next_hash;
++};
++
++/*
++ * Special error return code only used by dx_probe() and its callers.
++ */
++#define ERR_BAD_DX_DIR        -75000
++
+ /*
+  * Function prototypes
+  */
+@@ -614,11 +680,20 @@ extern struct ext3_group_desc * ext3_get
+ /* dir.c */
+ extern int ext3_check_dir_entry(const char *, struct inode *,
+-                              struct ext3_dir_entry_2 *, struct buffer_head *,
+-                              unsigned long);
++                              struct ext3_dir_entry_2 *,
++                              struct buffer_head *, unsigned long);
++extern void ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
++                                  __u32 minor_hash,
++                                  struct ext3_dir_entry_2 *dirent);
++extern void ext3_htree_free_dir_info(struct dir_private_info *p);
++
+ /* fsync.c */
+ extern int ext3_sync_file (struct file *, struct dentry *, int);
++/* hash.c */
++extern int ext3fs_dirhash(const char *name, int len, struct
++                        dx_hash_info *hinfo);
++
+ /* ialloc.c */
+ extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int);
+ extern void ext3_free_inode (handle_t *, struct inode *);
+@@ -650,6 +725,8 @@ extern int ext3_ioctl (struct inode *, s
+ /* namei.c */
+ extern int ext3_orphan_add(handle_t *, struct inode *);
+ extern int ext3_orphan_del(handle_t *, struct inode *);
++extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
++                              __u32 start_minor_hash, __u32 *next_hash);
+ /* super.c */
+ extern void ext3_error (struct super_block *, const char *, const char *, ...)
+--- linux-2.4.20/include/linux/ext3_fs_sb.h~ext-2.4-patch-1    Sat Apr  5 03:56:31 2003
++++ linux-2.4.20-braam/include/linux/ext3_fs_sb.h      Sat Apr  5 03:56:31 2003
+@@ -62,6 +62,8 @@ struct ext3_sb_info {
+       int s_inode_size;
+       int s_first_ino;
+       u32 s_next_generation;
++      u32 s_hash_seed[4];
++      int s_def_hash_version;
+       /* Journaling */
+       struct inode * s_journal_inode;
+--- linux-2.4.20/include/linux/ext3_jbd.h~ext-2.4-patch-1      Sat Apr  5 03:56:31 2003
++++ linux-2.4.20-braam/include/linux/ext3_jbd.h        Sat Apr  5 03:56:31 2003
+@@ -63,6 +63,8 @@ extern int ext3_writepage_trans_blocks(s
+ #define EXT3_RESERVE_TRANS_BLOCKS     12
++#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8
++
+ int
+ ext3_mark_iloc_dirty(handle_t *handle, 
+                    struct inode *inode,
+--- linux-2.4.20/include/linux/rbtree.h~ext-2.4-patch-1        Sat Apr  5 03:56:31 2003
++++ linux-2.4.20-braam/include/linux/rbtree.h  Sat Apr  5 03:56:31 2003
+@@ -120,6 +120,8 @@ rb_root_t;
+ extern void rb_insert_color(rb_node_t *, rb_root_t *);
+ extern void rb_erase(rb_node_t *, rb_root_t *);
++extern rb_node_t *rb_get_first(rb_root_t *root);
++extern rb_node_t *rb_get_next(rb_node_t *n);
+ static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link)
+ {
+--- linux-2.4.20/lib/rbtree.c~ext-2.4-patch-1  Sat Apr  5 03:56:31 2003
++++ linux-2.4.20-braam/lib/rbtree.c    Sat Apr  5 03:56:31 2003
+@@ -17,6 +17,8 @@
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+   linux/lib/rbtree.c
++
++  rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002
+ */
+ #include <linux/rbtree.h>
+@@ -294,3 +296,43 @@ void rb_erase(rb_node_t * node, rb_root_
+               __rb_erase_color(child, parent, root);
+ }
+ EXPORT_SYMBOL(rb_erase);
++
++/*
++ * This function returns the first node (in sort order) of the tree.
++ */
++rb_node_t *rb_get_first(rb_root_t *root)
++{
++      rb_node_t       *n;
++
++      n = root->rb_node;
++      if (!n)
++              return 0;
++      while (n->rb_left)
++              n = n->rb_left;
++      return n;
++}
++EXPORT_SYMBOL(rb_get_first);
++
++/*
++ * Given a node, this function will return the next node in the tree.
++ */
++rb_node_t *rb_get_next(rb_node_t *n)
++{
++      rb_node_t       *parent;
++
++      if (n->rb_right) {
++              n = n->rb_right;
++              while (n->rb_left)
++                      n = n->rb_left;
++              return n;
++      } else {
++              while ((parent = n->rb_parent)) {
++                      if (n == parent->rb_left)
++                              return parent;
++                      n = parent;
++              }
++              return 0;
++      }
++}
++EXPORT_SYMBOL(rb_get_next);
++
+
+_
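The rb_get_first()/rb_get_next() helpers added at the end of the patch above walk the red-black tree in sort order using only parent pointers, which is what lets the htree readdir code hand back entries in hash order from the rbtree kept in dir_private_info. A minimal user-space sketch of the same traversal logic follows; struct node, tree_first and tree_next are illustrative names, not the kernel's rb_node_t API.

#include <stddef.h>

struct node {
        struct node *left, *right, *parent;
        int key;
};

/* Leftmost node, i.e. the smallest key: mirrors rb_get_first(). */
static struct node *tree_first(struct node *root)
{
        if (!root)
                return NULL;
        while (root->left)
                root = root->left;
        return root;
}

/* In-order successor without recursion or a stack: mirrors rb_get_next(). */
static struct node *tree_next(struct node *n)
{
        struct node *parent;

        if (n->right) {
                /* smallest key in the right subtree */
                n = n->right;
                while (n->left)
                        n = n->left;
                return n;
        }
        /* climb until we arrive from a left child; that parent is next */
        while ((parent = n->parent) != NULL) {
                if (n == parent->left)
                        return parent;
                n = parent;
        }
        return NULL;
}

A caller would iterate the whole tree with for (n = tree_first(root); n; n = tree_next(n)), which is the shape the readdir code uses over the fname rbtree.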
diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-2.patch b/lustre/kernel_patches/patches/ext-2.4-patch-2.patch
new file mode 100644 (file)
index 0000000..689d33b
--- /dev/null
@@ -0,0 +1,34 @@
+# This is a BitKeeper generated patch for the following project:
+# Project Name: Linux kernel tree
+#
+# namei.c |    9 +++++++++
+# 1 files changed, 9 insertions(+)
+#
+# The following is the BitKeeper ChangeSet Log
+# --------------------------------------------
+# 02/11/07     tytso@snap.thunk.org    1.777
+# Add '.' and '..' entries to be returned by readdir of htree directories
+# 
+# This patch from Chris Li adds '.' and '..' to the rbtree so that they 
+# are properly returned by readdir.
+# --------------------------------------------
+#
+diff -Nru a/fs/ext3/namei.c b/fs/ext3/namei.c
+--- a/fs/ext3/namei.c  Thu Nov  7 10:57:30 2002
++++ b/fs/ext3/namei.c  Thu Nov  7 10:57:30 2002
+@@ -546,6 +546,15 @@
+       if (!frame)
+               return err;
++      /* Add '.' and '..' from the htree header */
++      if (!start_hash && !start_minor_hash) {
++              de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
++              ext3_htree_store_dirent(dir_file, 0, 0, de);
++              de = ext3_next_entry(de);
++              ext3_htree_store_dirent(dir_file, 0, 0, de);
++              count += 2;
++      }
++
+       while (1) {
+               block = dx_get_block(frame->at);
+               dxtrace(printk("Reading block %d\n", block));
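The hunk above relies on ext3 directory entries being chained by rec_len: '.' sits at the start of the htree root block and '..' is reached by stepping once past it, which is what ext3_next_entry() does before each ext3_htree_store_dirent() call. A simplified user-space sketch of that record layout and step; dirent2 and next_entry are illustrative names, and the on-disk struct ext3_dir_entry_2 stores inode and rec_len little-endian.

#include <stdint.h>

struct dirent2 {                        /* simplified ext3_dir_entry_2 */
        uint32_t inode;
        uint16_t rec_len;               /* bytes from this entry to the next */
        uint8_t  name_len;
        uint8_t  file_type;
        char     name[];                /* name_len bytes, not NUL-terminated */
};

/* Advance to the following entry in the same block, as ext3_next_entry() does. */
static struct dirent2 *next_entry(struct dirent2 *de)
{
        return (struct dirent2 *)((char *)de + de->rec_len);
}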
diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-3.patch b/lustre/kernel_patches/patches/ext-2.4-patch-3.patch
new file mode 100644 (file)
index 0000000..2600b02
--- /dev/null
@@ -0,0 +1,96 @@
+# This is a BitKeeper generated patch for the following project:
+# Project Name: Linux kernel tree
+#
+# fs/ext3/dir.c           |    7 +++++--
+# fs/ext3/namei.c         |   11 +++++++----
+# include/linux/ext3_fs.h |    2 +-
+# 3 files changed, 13 insertions(+), 7 deletions(-)
+#
+# The following is the BitKeeper ChangeSet Log
+# --------------------------------------------
+# 02/11/07     tytso@snap.thunk.org    1.778
+# Check for failed kmalloc() in ext3_htree_store_dirent()
+# 
+# This patch checks for a failed kmalloc() in ext3_htree_store_dirent(),
+# and passes the error up to its caller, ext3_htree_fill_tree().
+# --------------------------------------------
+#
+diff -Nru a/fs/ext3/dir.c b/fs/ext3/dir.c
+--- a/fs/ext3/dir.c    Thu Nov  7 10:57:34 2002
++++ b/fs/ext3/dir.c    Thu Nov  7 10:57:34 2002
+@@ -308,7 +308,7 @@
+ /*
+  * Given a directory entry, enter it into the fname rb tree.
+  */
+-void ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
++int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
+                            __u32 minor_hash,
+                            struct ext3_dir_entry_2 *dirent)
+ {
+@@ -323,6 +323,8 @@
+       /* Create and allocate the fname structure */
+       len = sizeof(struct fname) + dirent->name_len + 1;
+       new_fn = kmalloc(len, GFP_KERNEL);
++      if (!new_fn)
++              return -ENOMEM;
+       memset(new_fn, 0, len);
+       new_fn->hash = hash;
+       new_fn->minor_hash = minor_hash;
+@@ -344,7 +346,7 @@
+                   (new_fn->minor_hash == fname->minor_hash)) {
+                       new_fn->next = fname->next;
+                       fname->next = new_fn;
+-                      return;
++                      return 0;
+               }
+                       
+               if (new_fn->hash < fname->hash)
+@@ -359,6 +361,7 @@
+       rb_link_node(&new_fn->rb_hash, parent, p);
+       rb_insert_color(&new_fn->rb_hash, &info->root);
++      return 0;
+ }
+diff -Nru a/fs/ext3/namei.c b/fs/ext3/namei.c
+--- a/fs/ext3/namei.c  Thu Nov  7 10:57:34 2002
++++ b/fs/ext3/namei.c  Thu Nov  7 10:57:34 2002
+@@ -549,9 +549,11 @@
+       /* Add '.' and '..' from the htree header */
+       if (!start_hash && !start_minor_hash) {
+               de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
+-              ext3_htree_store_dirent(dir_file, 0, 0, de);
++              if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
++                      goto errout;
+               de = ext3_next_entry(de);
+-              ext3_htree_store_dirent(dir_file, 0, 0, de);
++              if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
++                      goto errout;
+               count += 2;
+       }
+@@ -570,8 +572,9 @@
+                           ((hinfo.hash == start_hash) &&
+                            (hinfo.minor_hash < start_minor_hash)))
+                               continue;
+-                      ext3_htree_store_dirent(dir_file, hinfo.hash,
+-                                              hinfo.minor_hash, de);
++                      if ((err = ext3_htree_store_dirent(dir_file,
++                                 hinfo.hash, hinfo.minor_hash, de)) != 0)
++                              goto errout;
+                       count++;
+               }
+               brelse (bh);
+diff -Nru a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
+--- a/include/linux/ext3_fs.h  Thu Nov  7 10:57:34 2002
++++ b/include/linux/ext3_fs.h  Thu Nov  7 10:57:34 2002
+@@ -682,7 +682,7 @@
+ extern int ext3_check_dir_entry(const char *, struct inode *,
+                               struct ext3_dir_entry_2 *,
+                               struct buffer_head *, unsigned long);
+-extern void ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
++extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
+                                   __u32 minor_hash,
+                                   struct ext3_dir_entry_2 *dirent);
+ extern void ext3_htree_free_dir_info(struct dir_private_info *p);
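The change above turns a silent kmalloc() failure in ext3_htree_store_dirent() into -ENOMEM, and ext3_htree_fill_tree() now passes that error up instead of continuing. The same allocate-check-propagate shape is sketched below in plain user-space C; store_name and fill are stand-ins for the kernel functions, not their real implementations.

#include <errno.h>
#include <stdlib.h>
#include <string.h>

static int store_name(const char *name, size_t len, char **out)
{
        char *copy = malloc(len + 1);   /* kmalloc(len, GFP_KERNEL) in the patch */

        if (!copy)
                return -ENOMEM;         /* previously this failure was ignored */
        memcpy(copy, name, len);
        copy[len] = '\0';
        *out = copy;
        return 0;
}

static int fill(char **dot, char **dotdot)
{
        int err;

        /* pass any failure straight up, as the patched caller now does */
        if ((err = store_name(".", 1, dot)) != 0)
                return err;
        if ((err = store_name("..", 2, dotdot)) != 0)
                return err;
        return 0;
}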
diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-4.patch b/lustre/kernel_patches/patches/ext-2.4-patch-4.patch
new file mode 100644 (file)
index 0000000..67f5afa
--- /dev/null
@@ -0,0 +1,48 @@
+# This is a BitKeeper generated patch for the following project:
+# Project Name: Linux kernel tree
+#
+# namei.c |   21 ++++++++++++++++++++-
+# 1 files changed, 20 insertions(+), 1 deletion(-)
+#
+# The following is the BitKeeper ChangeSet Log
+# --------------------------------------------
+# 02/11/07     tytso@snap.thunk.org    1.779
+# Fix ext3 htree rename bug.
+# 
+# This fixes an ext3 htree bug pointed out by Christopher Li; if 
+# adding the new name to the directory causes a split, this can cause
+# the directory entry containing the old name to move to another 
+# block, and then the removal of the old name will fail.
+# --------------------------------------------
+#
+diff -Nru a/fs/ext3/namei.c b/fs/ext3/namei.c
+--- a/fs/ext3/namei.c  Thu Nov  7 10:57:49 2002
++++ b/fs/ext3/namei.c  Thu Nov  7 10:57:49 2002
+@@ -2173,7 +2173,26 @@
+       /*
+        * ok, that's it
+        */
+-      ext3_delete_entry(handle, old_dir, old_de, old_bh);
++      retval = ext3_delete_entry(handle, old_dir, old_de, old_bh);
++      if (retval == -ENOENT) {
++              /*
++               * old_de could have moved out from under us.
++               */
++              struct buffer_head *old_bh2;
++              struct ext3_dir_entry_2 *old_de2;
++              
++              old_bh2 = ext3_find_entry(old_dentry, &old_de2);
++              if (old_bh2) {
++                      retval = ext3_delete_entry(handle, old_dir,
++                                                 old_de2, old_bh2);
++                      brelse(old_bh2);
++              }
++      }
++      if (retval) {
++              ext3_warning(old_dir->i_sb, "ext3_rename",
++                              "Deleting old file (%lu), %d, error=%d",
++                              old_dir->i_ino, old_dir->i_nlink, retval);
++      }
+       if (new_inode) {
+               new_inode->i_nlink--;
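The fix above retries the delete after re-looking up the old name, because the htree split triggered by adding the new name can relocate the old directory entry and make the cached buffer/entry stale. Below is a self-contained toy model of that retry-on--ENOENT pattern; the table, find_entry() and delete_entry() are stand-ins, not the ext3 routines.

#include <errno.h>
#include <string.h>

#define NENT 4
static char dir_tab[NENT][16] = { "old", "a", "b", "c" };

static int find_entry(const char *name)
{
        for (int i = 0; i < NENT; i++)
                if (dir_tab[i][0] && strcmp(dir_tab[i], name) == 0)
                        return i;
        return -1;
}

static int delete_entry(int slot, const char *name)
{
        if (slot < 0 || strcmp(dir_tab[slot], name) != 0)
                return -ENOENT;         /* stale slot: the entry "moved" */
        dir_tab[slot][0] = '\0';
        return 0;
}

/* Shape of the patched ext3_rename() tail: retry once with a fresh lookup. */
static int delete_old_name(int cached_slot, const char *name)
{
        int err = delete_entry(cached_slot, name);

        if (err == -ENOENT) {
                int fresh = find_entry(name);

                if (fresh >= 0)
                        err = delete_entry(fresh, name);
        }
        return err;
}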
similarity index 73%
rename from lustre/extN/ext3-2.4-ino_t.diff
rename to lustre/kernel_patches/patches/ext3-2.4-ino_t.patch
index ce1bd88..1786d0f 100644 (file)
@@ -1,6 +1,11 @@
---- linux/fs/ext3/ialloc.c.orig        Sat Oct 19 11:42:23 2002
-+++ linux/fs/ext3/ialloc.c     Sat Jan  4 12:14:18 2003
-@@ -64,8 +64,8 @@ static int read_inode_bitmap (struct sup
+ fs/ext3/ialloc.c        |   20 ++++++++++----------
+ fs/ext3/namei.c         |   16 ++++++++--------
+ include/linux/ext3_fs.h |    2 +-
+ 3 files changed, 19 insertions(+), 19 deletions(-)
+
+--- linux-2.4.20/fs/ext3/ialloc.c~ext3-2.4-ino_t       2003-04-08 23:35:24.000000000 -0600
++++ linux-2.4.20-braam/fs/ext3/ialloc.c        2003-04-08 23:35:24.000000000 -0600
+@@ -65,8 +65,8 @@ static int read_inode_bitmap (struct sup
        if (!bh) {
                ext3_error (sb, "read_inode_bitmap",
                            "Cannot read inode bitmap - "
@@ -11,7 +16,7 @@
                retval = -EIO;
        }
        /*
-@@ -531,19 +532,19 @@ out:
+@@ -533,19 +533,19 @@ out:
  }
  
  /* Verify that we are loading a valid orphan from disk */
@@ -35,7 +40,7 @@
                return NULL;
        }
  
-@@ -552,7 +553,7 @@ struct inode *ext3_orphan_get (struct su
+@@ -554,7 +554,7 @@ struct inode *ext3_orphan_get (struct su
        if ((bitmap_nr = load_inode_bitmap(sb, block_group)) < 0 ||
            !(bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr])) {
                ext3_warning(sb, __FUNCTION__,
@@ -44,7 +49,7 @@
                return NULL;
        }
  
-@@ -563,7 +564,7 @@ struct inode *ext3_orphan_get (struct su
+@@ -565,16 +565,16 @@ struct inode *ext3_orphan_get (struct su
        if (!ext3_test_bit(bit, bh->b_data) || !(inode = iget(sb, ino)) ||
            is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) {
                ext3_warning(sb, __FUNCTION__,
@@ -53,7 +58,6 @@
                printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%ld) = %d\n",
                       bit, bh->b_blocknr, ext3_test_bit(bit, bh->b_data));
                printk(KERN_NOTICE "inode=%p\n", inode);
-@@ -570,9 +571,9 @@ struct inode *ext3_orphan_get (struct su
                if (inode) {
                        printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
                               is_bad_inode(inode));
                }
                /* Avoid freeing blocks if we got a bad deleted inode */
                if (inode && inode->i_nlink == 0)
---- linux/fs/ext3/namei.c.orig Sat Oct 19 11:42:45 2002
-+++ linux/fs/ext3/namei.c      Sat Jan  4 12:13:27 2003
-@@ -716,10 +716,10 @@ int ext3_orphan_del(handle_t *handle, st
- {
+--- linux-2.4.20/fs/ext3/namei.c~ext3-2.4-ino_t        2003-04-08 23:35:24.000000000 -0600
++++ linux-2.4.20-braam/fs/ext3/namei.c 2003-04-08 23:35:24.000000000 -0600
+@@ -1808,10 +1808,10 @@ int ext3_orphan_del(handle_t *handle, st
        struct list_head *prev;
+       struct ext3_inode_info *ei = EXT3_I(inode);
        struct ext3_sb_info *sbi;
 -      ino_t ino_next; 
 +      unsigned long ino_next;
 -      
 +
        lock_super(inode->i_sb);
-       if (list_empty(&inode->u.ext3_i.i_orphan)) {
+       if (list_empty(&ei->i_orphan)) {
                unlock_super(inode->i_sb);
-@@ -730,7 +730,7 @@ int ext3_orphan_del(handle_t *handle, st
-       prev = inode->u.ext3_i.i_orphan.prev;
+@@ -1822,7 +1822,7 @@ int ext3_orphan_del(handle_t *handle, st
+       prev = ei->i_orphan.prev;
        sbi = EXT3_SB(inode->i_sb);
  
 -      jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino);
 +      jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
  
-       list_del(&inode->u.ext3_i.i_orphan);
-       INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
-@@ -741,13 +741,13 @@ int ext3_orphan_del(handle_t *handle, st
+       list_del(&ei->i_orphan);
+       INIT_LIST_HEAD(&ei->i_orphan);
+@@ -1833,13 +1833,13 @@ int ext3_orphan_del(handle_t *handle, st
         * list in memory. */
        if (!handle)
                goto out;
                BUFFER_TRACE(sbi->s_sbh, "get_write_access");
                err = ext3_journal_get_write_access(handle, sbi->s_sbh);
                if (err)
-@@ -758,8 +758,8 @@ int ext3_orphan_del(handle_t *handle, st
+@@ -1850,8 +1850,8 @@ int ext3_orphan_del(handle_t *handle, st
                struct ext3_iloc iloc2;
                struct inode *i_prev =
                        list_entry(prev, struct inode, u.ext3_i.i_orphan);
                          i_prev->i_ino, ino_next);
                err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
                if (err)
-@@ -774,7 +774,7 @@ int ext3_orphan_del(handle_t *handle, st
+@@ -1866,7 +1866,7 @@ int ext3_orphan_del(handle_t *handle, st
        if (err)
                goto out_brelse;
  
        ext3_std_error(inode->i_sb, err);
  out:
        unlock_super(inode->i_sb);
---- linux/include/linux/ext3_fs.h.orig Thu Jan  2 16:10:24 2003
-+++ linux/include/linux/ext3_fs.h      Sat Jan  4 12:25:41 2003
-@@ -622,7 +622,7 @@ extern int ext3_sync_file (struct file *
+--- linux-2.4.20/include/linux/ext3_fs.h~ext3-2.4-ino_t        2003-04-08 23:35:24.000000000 -0600
++++ linux-2.4.20-braam/include/linux/ext3_fs.h 2003-04-08 23:35:24.000000000 -0600
+@@ -673,7 +673,7 @@ extern int ext3fs_dirhash(const char *na
  /* ialloc.c */
  extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int);
  extern void ext3_free_inode (handle_t *, struct inode *);
  extern unsigned long ext3_count_free_inodes (struct super_block *);
  extern void ext3_check_inodes_bitmap (struct super_block *);
  extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
+
+_
        struct ext3_group_desc * gdp;
        struct ext3_group_desc * tmp;
        struct ext3_super_block * es;
-@@ -318,19 +320,21 @@ struct inode * ext3_new_inode (handle_t 
+@@ -318,7 +320,9 @@ struct inode * ext3_new_inode (handle_t 
        inode = new_inode(sb);
        if (!inode)
                return ERR_PTR(-ENOMEM);
 +      init_rwsem(&ei->truncate_sem);
  
        lock_super (sb);
--      es = sb->u.ext3_sb.s_es;
-+      es = sbi->s_es;
- repeat:
-       gdp = NULL;
-       i = 0;
+       es = sb->u.ext3_sb.s_es;
+@@ -328,9 +332,9 @@ struct inode * ext3_new_inode (handle_t 
  
        if (S_ISDIR(mode)) {
                avefreei = le32_to_cpu(es->s_free_inodes_count) /
                BUFFER_TRACE(bh, "get_write_access");
                err = ext3_journal_get_write_access(handle, bh);
                if (err) goto fail;
-@@ -436,8 +440,8 @@ repeat:
-               }
-               goto repeat;
-       }
--      j += i * EXT3_INODES_PER_GROUP(sb) + 1;
--      if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) {
-+      j += i * sbi->s_inodes_per_group + 1;
-+      if (j < sbi->s_first_ino || j > le32_to_cpu(es->s_inodes_count)) {
-               ext3_error (sb, "ext3_new_inode",
-                           "reserved inode or inode > inodes count - "
-                           "block_group = %d,inode=%d", i, j);
 @@ -457,13 +461,13 @@ repeat:
        err = ext3_journal_dirty_metadata(handle, bh2);
        if (err) goto fail;
similarity index 60%
rename from lustre/extN/patch-2.4.18-chaos22
rename to lustre/kernel_patches/patches/ext3-2.4.20-fixes.patch
index c40d4ea..5f566de 100644 (file)
@@ -1,7 +1,12 @@
-diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c
---- lum-2.4.18-um30/fs/ext3/balloc.c   Mon Feb 25 12:38:08 2002
-+++ uml-2.4.18-12.5/fs/ext3/balloc.c   Thu Sep 19 13:40:11 2002
-@@ -276,7 +276,8 @@
+
+
+
+ fs/ext3/balloc.c |   53 +++++++++++++++++++++++++++++++----------------------
+ 1 files changed, 31 insertions(+), 22 deletions(-)
+
+--- linux-2.4.20/fs/ext3/balloc.c~ext3-2.4.20-fixes    2003-04-08 23:35:17.000000000 -0600
++++ linux-2.4.20-braam/fs/ext3/balloc.c        2003-04-08 23:35:17.000000000 -0600
+@@ -276,7 +276,8 @@ void ext3_free_blocks (handle_t *handle,
        }
        lock_super (sb);
        es = sb->u.ext3_sb.s_es;
@@ -11,7 +16,7 @@ diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c
            (block + count) > le32_to_cpu(es->s_blocks_count)) {
                ext3_error (sb, "ext3_free_blocks",
                            "Freeing blocks not in datazone - "
-@@ -309,17 +310,6 @@
+@@ -309,17 +310,6 @@ do_more:
        if (!gdp)
                goto error_return;
  
@@ -29,7 +34,7 @@ diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c
        /*
         * We are about to start releasing blocks in the bitmap,
         * so we need undo access.
-@@ -345,14 +335,24 @@
+@@ -345,14 +335,24 @@ do_more:
        if (err)
                goto error_return;
  
@@ -38,7 +43,7 @@ diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c
 +              if (block == le32_to_cpu(gdp->bg_block_bitmap) ||
 +                  block == le32_to_cpu(gdp->bg_inode_bitmap) ||
 +                  in_range(block, le32_to_cpu(gdp->bg_inode_table),
-+                           sb->u.ext2_sb.s_itb_per_group)) {
++                           EXT3_SB(sb)->s_itb_per_group)) {
 +                      ext3_error(sb, __FUNCTION__,
 +                                 "Freeing block in system zone - block = %lu",
 +                                 block);
@@ -56,7 +61,7 @@ diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c
                        if (debug_bh) {
                                BUFFER_TRACE(debug_bh, "Deleted!");
                                if (!bh2jh(bitmap_bh)->b_committed_data)
-@@ -365,9 +365,8 @@
+@@ -365,9 +365,8 @@ do_more:
  #endif
                BUFFER_TRACE(bitmap_bh, "clear bit");
                if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) {
@@ -68,7 +73,7 @@ diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c
                        BUFFER_TRACE(bitmap_bh, "bit already cleared");
                } else {
                        dquot_freed_blocks++;
-@@ -415,7 +417,6 @@
+@@ -415,7 +414,6 @@ do_more:
        if (!err) err = ret;
  
        if (overflow && !err) {
@@ -76,7 +81,7 @@ diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c
                count = overflow;
                goto do_more;
        }
-@@ -575,6 +577,7 @@
+@@ -576,6 +574,7 @@ int ext3_new_block (handle_t *handle, st
  
        ext3_debug ("goal=%lu.\n", goal);
  
@@ -84,7 +89,7 @@ diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c
        /*
         * First, test whether the goal block is free.
         */
-@@ -684,10 +686,21 @@
+@@ -684,10 +683,20 @@ got_block:
        if (tmp == le32_to_cpu(gdp->bg_block_bitmap) ||
            tmp == le32_to_cpu(gdp->bg_inode_bitmap) ||
            in_range (tmp, le32_to_cpu(gdp->bg_inode_table),
@@ -106,60 +111,8 @@ diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c
 +              ext3_set_bit(j, bh->b_data);
 +              goto repeat;
 +      }
-+
  
        /* The superblock lock should guard against anybody else beating
         * us to this point! */
-diff -ru lum-2.4.18-um30/fs/ext3/namei.c uml-2.4.18-12.5/fs/ext3/namei.c
---- lum-2.4.18-um30/fs/ext3/namei.c    Fri Nov  9 15:25:04 2001
-+++ uml-2.4.18-12.5/fs/ext3/namei.c    Thu Sep 19 13:40:11 2002
-@@ -354,8 +355,8 @@
-                        */
-                       dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-                       dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
--                      ext3_mark_inode_dirty(handle, dir);
-                       dir->i_version = ++event;
-+                      ext3_mark_inode_dirty(handle, dir);
-                       BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
-                       ext3_journal_dirty_metadata(handle, bh);
-                       brelse(bh);
-@@ -464,8 +465,8 @@
-               inode->i_op = &ext3_file_inode_operations;
-               inode->i_fop = &ext3_file_operations;
-               inode->i_mapping->a_ops = &ext3_aops;
--              ext3_mark_inode_dirty(handle, inode);
-               err = ext3_add_nondir(handle, dentry, inode);
-+              ext3_mark_inode_dirty(handle, inode);
-       }
-       ext3_journal_stop(handle, dir);
-       return err;
-@@ -489,8 +490,8 @@
-       err = PTR_ERR(inode);
-       if (!IS_ERR(inode)) {
-               init_special_inode(inode, mode, rdev);
--              ext3_mark_inode_dirty(handle, inode);
-               err = ext3_add_nondir(handle, dentry, inode);
-+              ext3_mark_inode_dirty(handle, inode);
-       }
-       ext3_journal_stop(handle, dir);
-       return err;
-@@ -933,8 +934,8 @@
-               inode->i_size = l-1;
-       }
-       inode->u.ext3_i.i_disksize = inode->i_size;
--      ext3_mark_inode_dirty(handle, inode);
-       err = ext3_add_nondir(handle, dentry, inode);
-+      ext3_mark_inode_dirty(handle, inode);
- out_stop:
-       ext3_journal_stop(handle, dir);
-       return err;
-@@ -970,8 +971,8 @@
-       ext3_inc_count(handle, inode);
-       atomic_inc(&inode->i_count);
--      ext3_mark_inode_dirty(handle, inode);
-       err = ext3_add_nondir(handle, dentry, inode);
-+      ext3_mark_inode_dirty(handle, inode);
-       ext3_journal_stop(handle, dir);
-       return err;
- }
+
+_
similarity index 99%
rename from lustre/extN/ext3-2.5-noread.diff
rename to lustre/kernel_patches/patches/ext3-2.5-noread.patch
index f1c611f..1aa2578 100644 (file)
 +              if (block_end > itable_end)
 +                      block_end = itable_end;
 +
-+              for (; block < block_end; block++) {
++              for (++block; block < block_end; block++) {
 +                      bh[count] = sb_getblk(sb, block);
 +                      if (count && (buffer_uptodate(bh[count]) ||
 +                                    buffer_locked(bh[count]))) {
diff --git a/lustre/kernel_patches/patches/ext3-2.5.63.patch b/lustre/kernel_patches/patches/ext3-2.5.63.patch
new file mode 100644 (file)
index 0000000..fd28cd8
--- /dev/null
@@ -0,0 +1,150 @@
+ fs/ext3/inode.c |   84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
+ fs/ext3/super.c |    5 ++-
+ fs/ext3/xattr.c |    5 +++
+ fs/ext3/xattr.h |    2 -
+ 4 files changed, 92 insertions(+), 4 deletions(-)
+
+--- linux-2.5.63-nointent/fs/ext3/xattr.c~ext3-2.5.63  Fri Mar 21 18:47:19 2003
++++ linux-2.5.63-nointent-root/fs/ext3/xattr.c Fri Mar 21 18:47:19 2003
+@@ -1181,3 +1181,8 @@ exit_ext3_xattr(void)
+       ext3_xattr_unregister(EXT3_XATTR_INDEX_USER,
+                             &ext3_xattr_user_handler);
+ }
++
++EXPORT_SYMBOL(ext3_xattr_get);
++EXPORT_SYMBOL(ext3_xattr_set);
++EXPORT_SYMBOL(ext3_xattr_set_handle);
++
+--- linux-2.5.63-nointent/fs/ext3/inode.c~ext3-2.5.63  Fri Mar 21 18:47:19 2003
++++ linux-2.5.63-nointent-root/fs/ext3/inode.c Fri Mar 21 18:47:19 2003
+@@ -1019,7 +1019,7 @@ struct buffer_head *ext3_bread(handle_t 
+       *err = -EIO;
+       return NULL;
+ }
+-
++EXPORT_SYMBOL(ext3_bread);
+ static int walk_page_buffers( handle_t *handle,
+                               struct buffer_head *head,
+                               unsigned from,
+@@ -2870,3 +2870,85 @@ int ext3_change_inode_journal_flag(struc
+  * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we
+  * need to extend" test in ext3_prepare_write() succeeds.  
+  */
++
++/* for each block: 1 ind + 1 dind + 1 tind
++ * for each block: 3 bitmap blocks
++ * for each block: 3 group descriptor blocks
++ * 1 inode block
++ * 1 superblock
++ * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
++ * ((1+1+1) * 3 * nblocks) + 1 + 1 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
++ *
++ * XXX assuming:
++ * (1) fs logical block size == page size
++ * (2) ext3 in writeback mode
++ */
++static inline int ext3_san_write_trans_blocks(int nblocks)
++{
++      int ret;
++      
++      ret = (1 + 1 + 1) * 3 * nblocks + 1 + 1;
++
++#ifdef CONFIG_QUOTA
++      ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++      return ret;
++}
++
++/* Allocate blocks for an inode without creating any buffer/page
++ * for data I/O; set the inode size if the file is extended.
++ *
++ * @inode:    target inode
++ * @blocks:   array of logical block numbers
++ * @nblocks:  how many blocks need to be allocated
++ * @newsize:  new file size we should set
++ *
++ * return:    0 on success, otherwise failed
++ *            (*blocks) contains the physical block numbers allocated
++ *
++ * XXX this assumes the fs block size == page size
++ */
++int ext3_prep_san_write(struct inode *inode, long *blocks,
++                      int nblocks, loff_t newsize)
++{
++      handle_t *handle;
++      struct buffer_head bh_tmp;
++      int needed_blocks;
++      int i, ret = 0, ret2;
++
++      needed_blocks = ext3_san_write_trans_blocks(nblocks);
++
++      lock_kernel();
++      handle = ext3_journal_start(inode, needed_blocks);
++      if (IS_ERR(handle)) {
++              unlock_kernel();
++              return PTR_ERR(handle);
++      }
++      unlock_kernel();
++
++      /* alloc blocks one by one */
++      for (i = 0; i < nblocks; i++) {
++              ret = ext3_get_block_handle(handle, inode, blocks[i],
++                                              &bh_tmp, 1, 1);
++              if (ret)
++                      break;
++
++              blocks[i] = bh_tmp.b_blocknr;
++      }
++
++      /* set inode size if needed */
++      if (!ret && (newsize > inode->i_size)) {
++              inode->i_size = newsize;
++              ext3_mark_inode_dirty(handle, inode);
++      }
++
++      lock_kernel();
++      ret2 = ext3_journal_stop(handle, inode);
++      unlock_kernel();
++
++      if (!ret)
++              ret = ret2;
++      return ret;
++}
++EXPORT_SYMBOL(ext3_prep_san_write);
+--- linux-2.5.63-nointent/fs/ext3/super.c~ext3-2.5.63  Fri Mar 21 18:47:19 2003
++++ linux-2.5.63-nointent-root/fs/ext3/super.c Fri Mar 21 18:47:19 2003
+@@ -1492,10 +1492,10 @@ static journal_t *ext3_get_dev_journal(s
+               printk(KERN_ERR "EXT3-fs: I/O error on journal device\n");
+               goto out_journal;
+       }
+-      if (ntohl(journal->j_superblock->s_nr_users) != 1) {
++      if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
+               printk(KERN_ERR "EXT3-fs: External journal has more than one "
+                                       "user (unsupported) - %d\n",
+-                      ntohl(journal->j_superblock->s_nr_users));
++                      be32_to_cpu(journal->j_superblock->s_nr_users));
+               goto out_journal;
+       }
+       EXT3_SB(sb)->journal_bdev = bdev;
+@@ -1703,6 +1703,7 @@ int ext3_force_commit(struct super_block
+       unlock_kernel();
+       return ret;
+ }
++EXPORT_SYMBOL(ext3_force_commit);
+ /*
+  * Ext3 always journals updates to the superblock itself, so we don't
+--- linux-2.5.63-nointent/fs/ext3/xattr.h~ext3-2.5.63  Fri Mar 21 18:47:19 2003
++++ linux-2.5.63-nointent-root/fs/ext3/xattr.h Fri Mar 21 18:47:19 2003
+@@ -5,7 +5,7 @@
+   (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ */
+-
++#include <linux/module.h>
+ #include <linux/config.h>
+ #include <linux/xattr.h>
+
+_
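The credit calculation in ext3_san_write_trans_blocks() above charges nine journal blocks per data block (indirect, double-indirect and triple-indirect blocks, each with its bitmap and group-descriptor updates), plus one inode block, one superblock, and 2 * EXT3_SINGLEDATA_TRANS_BLOCKS when quota is enabled. The stand-alone program below reproduces that arithmetic as a worked example; EXT3_SINGLEDATA_TRANS_BLOCKS is assumed to be 8 here, which is its usual value, but check the target kernel's ext3_jbd.h.

#include <stdio.h>

#define SINGLEDATA_TRANS_BLOCKS 8       /* assumed EXT3_SINGLEDATA_TRANS_BLOCKS */

/* Same arithmetic as ext3_san_write_trans_blocks() in the patch above. */
static int san_write_trans_blocks(int nblocks, int quota_enabled)
{
        int ret = (1 + 1 + 1) * 3 * nblocks + 1 + 1;

        if (quota_enabled)              /* CONFIG_QUOTA branch */
                ret += 2 * SINGLEDATA_TRANS_BLOCKS;
        return ret;
}

int main(void)
{
        /* 16 blocks with quota: 9 * 16 + 1 + 1 + 16 = 162 journal credits */
        printf("%d\n", san_write_trans_blocks(16, 1));
        return 0;
}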
diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18.patch
new file mode 100644 (file)
index 0000000..6b9a348
--- /dev/null
@@ -0,0 +1,302 @@
+ 0 files changed
+
+--- linux-2.4.18-chaos52/fs/ext3/super.c~ext3-delete_thread-2.4.18     2003-06-01 03:24:13.000000000 +0800
++++ linux-2.4.18-chaos52-root/fs/ext3/super.c  2003-06-03 17:01:49.000000000 +0800
+@@ -398,6 +398,210 @@ static void dump_orphan_list(struct supe
+       }
+ }
++#ifdef EXT3_DELETE_THREAD
++/*
++ * Delete inodes in a loop until there are no more to be deleted.
++ * Normally, we run in the background doing the deletes and sleeping again,
++ * and clients just add new inodes to be deleted onto the end of the list.
++ * If someone is concerned about free space (e.g. block allocation or similar)
++ * then they can sleep on s_delete_waiter_queue and be woken up when space
++ * has been freed.
++ */
++int ext3_delete_thread(void *data)
++{
++      struct super_block *sb = data;
++      struct ext3_sb_info *sbi = EXT3_SB(sb);
++      struct task_struct *tsk = current;
++
++      /* Almost like daemonize, but not quite */
++      exit_mm(current);
++      tsk->session = 1;
++      tsk->pgrp = 1;
++      tsk->tty = NULL;
++      exit_files(current);
++      reparent_to_init();
++
++      sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
++      sigfillset(&tsk->blocked);
++
++      /*tsk->flags |= PF_KERNTHREAD;*/
++
++      INIT_LIST_HEAD(&sbi->s_delete_list);
++      wake_up(&sbi->s_delete_waiter_queue);
++      ext3_debug("EXT3-fs: delete thread on %s started\n",
++             kdevname(sb->s_dev));
++
++      /* main loop */
++      for (;;) {
++              sleep_on(&sbi->s_delete_thread_queue);
++              ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
++                         tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
++
++              spin_lock(&sbi->s_delete_lock);
++              if (list_empty(&sbi->s_delete_list)) {
++                      memset(&sbi->s_delete_list, 0,
++                             sizeof(sbi->s_delete_list));
++                      spin_unlock(&sbi->s_delete_lock);
++                      ext3_debug("ext3 delete thread on %s exiting\n",
++                             kdevname(sb->s_dev));
++                      wake_up(&sbi->s_delete_waiter_queue);
++                      break;
++              }
++
++              while (!list_empty(&sbi->s_delete_list)) {
++                      struct inode *inode=list_entry(sbi->s_delete_list.next,
++                                                     struct inode, i_dentry);
++                      unsigned long blocks = inode->i_blocks >>
++                                                      (inode->i_blkbits - 9);
++
++                      list_del_init(&inode->i_dentry);
++                      spin_unlock(&sbi->s_delete_lock);
++                      ext3_debug("%s delete ino %lu blk %lu\n",
++                                 tsk->comm, inode->i_ino, blocks);
++
++                      iput(inode);
++
++                      spin_lock(&sbi->s_delete_lock);
++                      sbi->s_delete_blocks -= blocks;
++                      sbi->s_delete_inodes--;
++              }
++              if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0)
++                      ext3_warning(sb, __FUNCTION__,
++                                   "%lu blocks, %lu inodes on list?\n",
++                                   sbi->s_delete_blocks,sbi->s_delete_inodes);
++              sbi->s_delete_blocks = 0;
++              sbi->s_delete_inodes = 0;
++              spin_unlock(&sbi->s_delete_lock);
++              wake_up(&sbi->s_delete_waiter_queue);
++      }
++
++      return 0;
++}
++
++static void ext3_start_delete_thread(struct super_block *sb)
++{
++      struct ext3_sb_info *sbi = EXT3_SB(sb);
++      int rc;
++
++      spin_lock_init(&sbi->s_delete_lock);
++      memset(&sbi->s_delete_list, 0, sizeof(sbi->s_delete_list));
++      init_waitqueue_head(&sbi->s_delete_thread_queue);
++      init_waitqueue_head(&sbi->s_delete_waiter_queue);
++      sbi->s_delete_blocks = 0;
++      sbi->s_delete_inodes = 0;
++
++      rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
++      if (rc < 0)
++              printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
++                     rc);
++      else
++              wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
++}
++
++static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
++{
++      wake_up(&sbi->s_delete_thread_queue);
++      wait_event(sbi->s_delete_waiter_queue, list_empty(&sbi->s_delete_list));
++}
++
++/* Instead of playing games with the inode flags, destruction, etc we just
++ * create a new inode locally and put it on a list for the truncate thread.
++ * We need large parts of the inode struct in order to complete the
++ * truncate and unlink, so we may as well just have a real inode to do it.
++ *
++ * If we have any problem deferring the delete, just delete it right away.
++ * If we defer it, we also mark how many blocks it would free, so that we
++ * can keep the statfs data correct, and we know if we should sleep on the
++ * truncate thread when we run out of space.
++ *
++ * In 2.5 this can be done much more cleanly by just registering a "drop"
++ * method in the super_operations struct.
++ */
++static void ext3_delete_inode_thread(struct inode *old_inode)
++{
++      struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
++      struct inode *new_inode;
++      unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
++
++      if (is_bad_inode(old_inode)) {
++              clear_inode(old_inode);
++              return;
++      }
++      
++      if (!test_opt (old_inode->i_sb, ASYNCDEL)) {
++              ext3_delete_inode(old_inode);
++              return;
++      }
++
++      /* We may want to delete the inode immediately and not defer it */
++      if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
++          !sbi->s_delete_list.next) {
++              ext3_delete_inode(old_inode);
++              return;
++      }
++
++      if (EXT3_I(old_inode)->i_state & EXT3_STATE_DELETE) {
++              ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
++                         old_inode->i_ino, blocks);
++              ext3_delete_inode(old_inode);
++              return;
++      }
++
++      /* We can iget this inode again here, because our caller has unhashed
++       * old_inode, so new_inode will be in a different inode struct.
++       *
++       * We need to ensure that the i_orphan pointers in the other inodes
++       * point at the new inode copy instead of the old one so the orphan
++       * list doesn't get corrupted when the old orphan inode is freed.
++       */
++      down(&sbi->s_orphan_lock);
++
++      EXT3_SB(old_inode->i_sb)->s_mount_state |= EXT3_ORPHAN_FS;
++      new_inode = iget(old_inode->i_sb, old_inode->i_ino);
++      EXT3_SB(old_inode->i_sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
++      if (is_bad_inode(new_inode)) {
++              printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino);
++              iput(new_inode);
++              new_inode = NULL;
++      }
++      if (!new_inode) {
++              up(&sbi->s_orphan_lock);
++              ext3_debug(KERN_DEBUG "delete inode %lu directly (bad read)\n",
++                         old_inode->i_ino);
++              ext3_delete_inode(old_inode);
++              return;
++      }
++      J_ASSERT(new_inode != old_inode);
++
++      J_ASSERT(!list_empty(&EXT3_I(old_inode)->i_orphan));
++      /* Ugh.  We need to insert new_inode into the same spot on the list
++       * as old_inode was, to ensure the in-memory orphan list is still
++       * in the same order as the on-disk orphan list (badness otherwise).
++       */
++      EXT3_I(new_inode)->i_orphan = EXT3_I(old_inode)->i_orphan;
++      EXT3_I(new_inode)->i_orphan.next->prev = &EXT3_I(new_inode)->i_orphan;
++      EXT3_I(new_inode)->i_orphan.prev->next = &EXT3_I(new_inode)->i_orphan;
++      EXT3_I(new_inode)->i_state |= EXT3_STATE_DELETE;
++      up(&sbi->s_orphan_lock);
++
++      clear_inode(old_inode);
++
++      ext3_debug("delete inode %lu (%lu blocks) by thread\n",
++                 new_inode->i_ino, blocks);
++      spin_lock(&sbi->s_delete_lock);
++      J_ASSERT(list_empty(&new_inode->i_dentry));
++      list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
++      sbi->s_delete_blocks += blocks;
++      sbi->s_delete_inodes++;
++      spin_unlock(&sbi->s_delete_lock);
++
++      wake_up(&sbi->s_delete_thread_queue);
++}
++#else
++#define ext3_start_delete_thread(sbi) do {} while(0)
++#define ext3_stop_delete_thread(sbi) do {} while(0)
++#endif /* EXT3_DELETE_THREAD */
++
+ void ext3_put_super (struct super_block * sb)
+ {
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+@@ -405,6 +609,7 @@ void ext3_put_super (struct super_block 
+       kdev_t j_dev = sbi->s_journal->j_dev;
+       int i;
++      ext3_stop_delete_thread(sbi);
+       ext3_xattr_put_super(sb);
+       journal_destroy(sbi->s_journal);
+       if (!(sb->s_flags & MS_RDONLY)) {
+@@ -453,7 +658,11 @@ static struct super_operations ext3_sops
+       write_inode:    ext3_write_inode,       /* BKL not held.  Don't need */
+       dirty_inode:    ext3_dirty_inode,       /* BKL not held.  We take it */
+       put_inode:      ext3_put_inode,         /* BKL not held.  Don't need */
++#ifdef EXT3_DELETE_THREAD
++      delete_inode:   ext3_delete_inode_thread,/* BKL not held. We take it */
++#else
+       delete_inode:   ext3_delete_inode,      /* BKL not held.  We take it */
++#endif
+       put_super:      ext3_put_super,         /* BKL held */
+       write_super:    ext3_write_super,       /* BKL held */
+       sync_fs:        ext3_sync_fs,
+@@ -514,6 +723,12 @@ static int parse_options (char * options
+            this_char = strtok (NULL, ",")) {
+               if ((value = strchr (this_char, '=')) != NULL)
+                       *value++ = 0;
++#ifdef EXT3_DELETE_THREAD
++              if (!strcmp(this_char, "asyncdel"))
++                      set_opt(*mount_options, ASYNCDEL);
++              else
++#endif
++
+               if (!strcmp (this_char, "bsddf"))
+                       clear_opt (*mount_options, MINIX_DF);
+               else if (!strcmp (this_char, "nouid32")) {
+@@ -1209,6 +1424,7 @@ struct super_block * ext3_read_super (st
+       }
+       ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
++      ext3_start_delete_thread(sb);
+       /*
+        * akpm: core read_super() calls in here with the superblock locked.
+        * That deadlocks, because orphan cleanup needs to lock the superblock
+--- linux-2.4.18-chaos52/include/linux/ext3_fs.h~ext3-delete_thread-2.4.18     2003-06-01 03:24:11.000000000 +0800
++++ linux-2.4.18-chaos52-root/include/linux/ext3_fs.h  2003-06-03 17:03:28.000000000 +0800
+@@ -190,6 +190,7 @@ struct ext3_group_desc
+  */
+ #define EXT3_STATE_JDATA              0x00000001 /* journaled data exists */
+ #define EXT3_STATE_NEW                        0x00000002 /* inode is newly created */
++#define EXT3_STATE_DELETE             0x00000010 /* deferred delete inode */
+ /*
+  * ioctl commands
+@@ -317,6 +318,7 @@ struct ext3_inode {
+ #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
+ #define EXT3_MOUNT_INDEX              0x4000  /* Enable directory index */
++#define EXT3_MOUNT_ASYNCDEL           0x20000 /* Delayed deletion */
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+--- linux-2.4.18-chaos52/include/linux/ext3_fs_sb.h~ext3-delete_thread-2.4.18  2003-06-01 03:24:13.000000000 +0800
++++ linux-2.4.18-chaos52-root/include/linux/ext3_fs_sb.h       2003-06-03 16:59:24.000000000 +0800
+@@ -29,6 +29,8 @@
+ #define EXT3_MAX_GROUP_LOADED 32
++#define EXT3_DELETE_THREAD
++
+ /*
+  * third extended-fs super-block data in memory
+  */
+@@ -74,6 +76,14 @@ struct ext3_sb_info {
+       struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
+       wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
+ #endif
++#ifdef EXT3_DELETE_THREAD
++      spinlock_t s_delete_lock;
++      struct list_head s_delete_list;
++      unsigned long s_delete_blocks;
++      unsigned long s_delete_inodes;
++      wait_queue_head_t s_delete_thread_queue;
++      wait_queue_head_t s_delete_waiter_queue;
++#endif
+ };
+ #endif        /* _LINUX_EXT3_FS_SB */
+
+_
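
The comment block in this patch notes that a caller worried about free space can sleep on s_delete_waiter_queue until the delete thread has freed blocks. A minimal sketch of such a waiter follows, using only the fields the patch adds to struct ext3_sb_info; the helper itself and its exact wake-up condition are illustrative assumptions, not part of the patch.

    /*
     * Hypothetical waiter: kick the delete thread and sleep until it has
     * drained its queue, so a later allocation or statfs sees the space
     * the deferred unlinks will release.
     */
    static void demo_wait_for_async_deletes(struct super_block *sb)
    {
            struct ext3_sb_info *sbi = EXT3_SB(sb);

            wake_up(&sbi->s_delete_thread_queue);
            wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_inodes == 0);
    }
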
diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch
new file mode 100644 (file)
index 0000000..be2723c
--- /dev/null
@@ -0,0 +1,300 @@
+diff -puNr origin/fs/ext3/super.c linux/fs/ext3/super.c
+--- origin/fs/ext3/super.c     2003-05-04 17:23:52.000000000 +0400
++++ linux/fs/ext3/super.c      2003-05-04 17:09:20.000000000 +0400
+@@ -398,6 +398,210 @@ static void dump_orphan_list(struct supe
+       }
+ }
++#ifdef EXT3_DELETE_THREAD
++/*
++ * Delete inodes in a loop until there are no more to be deleted.
++ * Normally, we run in the background doing the deletes and sleeping again,
++ * and clients just add new inodes to be deleted onto the end of the list.
++ * If someone is concerned about free space (e.g. block allocation or similar)
++ * then they can sleep on s_delete_waiter_queue and be woken up when space
++ * has been freed.
++ */
++int ext3_delete_thread(void *data)
++{
++      struct super_block *sb = data;
++      struct ext3_sb_info *sbi = EXT3_SB(sb);
++      struct task_struct *tsk = current;
++
++      /* Almost like daemonize, but not quite */
++      exit_mm(current);
++      tsk->session = 1;
++      tsk->pgrp = 1;
++      tsk->tty = NULL;
++      exit_files(current);
++      reparent_to_init();
++
++      sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
++      sigfillset(&tsk->blocked);
++
++      /*tsk->flags |= PF_KERNTHREAD;*/
++
++      INIT_LIST_HEAD(&sbi->s_delete_list);
++      wake_up(&sbi->s_delete_waiter_queue);
++      ext3_debug("EXT3-fs: delete thread on %s started\n",
++             kdevname(sb->s_dev));
++
++      /* main loop */
++      for (;;) {
++              sleep_on(&sbi->s_delete_thread_queue);
++              ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
++                         tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
++
++              spin_lock(&sbi->s_delete_lock);
++              if (list_empty(&sbi->s_delete_list)) {
++                      memset(&sbi->s_delete_list, 0,
++                             sizeof(sbi->s_delete_list));
++                      spin_unlock(&sbi->s_delete_lock);
++                      ext3_debug("ext3 delete thread on %s exiting\n",
++                             kdevname(sb->s_dev));
++                      wake_up(&sbi->s_delete_waiter_queue);
++                      break;
++              }
++
++              while (!list_empty(&sbi->s_delete_list)) {
++                      struct inode *inode=list_entry(sbi->s_delete_list.next,
++                                                     struct inode, i_dentry);
++                      unsigned long blocks = inode->i_blocks >>
++                                                      (inode->i_blkbits - 9);
++
++                      list_del_init(&inode->i_dentry);
++                      spin_unlock(&sbi->s_delete_lock);
++                      ext3_debug("%s delete ino %lu blk %lu\n",
++                                 tsk->comm, inode->i_ino, blocks);
++
++                      iput(inode);
++
++                      spin_lock(&sbi->s_delete_lock);
++                      sbi->s_delete_blocks -= blocks;
++                      sbi->s_delete_inodes--;
++              }
++              if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0)
++                      ext3_warning(sb, __FUNCTION__,
++                                   "%lu blocks, %lu inodes on list?\n",
++                                   sbi->s_delete_blocks,sbi->s_delete_inodes);
++              sbi->s_delete_blocks = 0;
++              sbi->s_delete_inodes = 0;
++              spin_unlock(&sbi->s_delete_lock);
++              wake_up(&sbi->s_delete_waiter_queue);
++      }
++
++      return 0;
++}
++
++static void ext3_start_delete_thread(struct super_block *sb)
++{
++      struct ext3_sb_info *sbi = EXT3_SB(sb);
++      int rc;
++
++      spin_lock_init(&sbi->s_delete_lock);
++      memset(&sbi->s_delete_list, 0, sizeof(sbi->s_delete_list));
++      init_waitqueue_head(&sbi->s_delete_thread_queue);
++      init_waitqueue_head(&sbi->s_delete_waiter_queue);
++      sbi->s_delete_blocks = 0;
++      sbi->s_delete_inodes = 0;
++
++      rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
++      if (rc < 0)
++              printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
++                     rc);
++      else
++              wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
++}
++
++static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
++{
++      wake_up(&sbi->s_delete_thread_queue);
++      wait_event(sbi->s_delete_waiter_queue, list_empty(&sbi->s_delete_list));
++}
++
++/* Instead of playing games with the inode flags, destruction, etc we just
++ * create a new inode locally and put it on a list for the truncate thread.
++ * We need large parts of the inode struct in order to complete the
++ * truncate and unlink, so we may as well just have a real inode to do it.
++ *
++ * If we have any problem deferring the delete, just delete it right away.
++ * If we defer it, we also mark how many blocks it would free, so that we
++ * can keep the statfs data correct, and we know if we should sleep on the
++ * truncate thread when we run out of space.
++ *
++ * In 2.5 this can be done much more cleanly by just registering a "drop"
++ * method in the super_operations struct.
++ */
++static void ext3_delete_inode_thread(struct inode *old_inode)
++{
++      struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
++      struct inode *new_inode;
++      unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
++
++      if (is_bad_inode(old_inode)) {
++              clear_inode(old_inode);
++              return;
++      }
++      
++      if (!test_opt (old_inode->i_sb, ASYNCDEL)) {
++              ext3_delete_inode(old_inode);
++              return;
++      }
++
++      /* We may want to delete the inode immediately and not defer it */
++      if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
++          !sbi->s_delete_list.next) {
++              ext3_delete_inode(old_inode);
++              return;
++      }
++
++      if (EXT3_I(old_inode)->i_state & EXT3_STATE_DELETE) {
++              ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
++                         old_inode->i_ino, blocks);
++              ext3_delete_inode(old_inode);
++              return;
++      }
++
++      /* We can iget this inode again here, because our caller has unhashed
++       * old_inode, so new_inode will be in a different inode struct.
++       *
++       * We need to ensure that the i_orphan pointers in the other inodes
++       * point at the new inode copy instead of the old one so the orphan
++       * list doesn't get corrupted when the old orphan inode is freed.
++       */
++      down(&sbi->s_orphan_lock);
++
++      EXT3_SB(old_inode->i_sb)->s_mount_state |= EXT3_ORPHAN_FS;
++      new_inode = iget(old_inode->i_sb, old_inode->i_ino);
++      EXT3_SB(old_inode->i_sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
++      if (is_bad_inode(new_inode)) {
++              printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino);
++              iput(new_inode);
++              new_inode = NULL;
++      }
++      if (!new_inode) {
++              up(&sbi->s_orphan_lock);
++              ext3_debug(KERN_DEBUG "delete inode %lu directly (bad read)\n",
++                         old_inode->i_ino);
++              ext3_delete_inode(old_inode);
++              return;
++      }
++      J_ASSERT(new_inode != old_inode);
++
++      J_ASSERT(!list_empty(&EXT3_I(old_inode)->i_orphan));
++      /* Ugh.  We need to insert new_inode into the same spot on the list
++       * as old_inode was, to ensure the in-memory orphan list is still
++       * in the same order as the on-disk orphan list (badness otherwise).
++       */
++      EXT3_I(new_inode)->i_orphan = EXT3_I(old_inode)->i_orphan;
++      EXT3_I(new_inode)->i_orphan.next->prev = &EXT3_I(new_inode)->i_orphan;
++      EXT3_I(new_inode)->i_orphan.prev->next = &EXT3_I(new_inode)->i_orphan;
++      EXT3_I(new_inode)->i_state |= EXT3_STATE_DELETE;
++      up(&sbi->s_orphan_lock);
++
++      clear_inode(old_inode);
++
++      ext3_debug("delete inode %lu (%lu blocks) by thread\n",
++                 new_inode->i_ino, blocks);
++      spin_lock(&sbi->s_delete_lock);
++      J_ASSERT(list_empty(&new_inode->i_dentry));
++      list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
++      sbi->s_delete_blocks += blocks;
++      sbi->s_delete_inodes++;
++      spin_unlock(&sbi->s_delete_lock);
++
++      wake_up(&sbi->s_delete_thread_queue);
++}
++#else
++#define ext3_start_delete_thread(sbi) do {} while(0)
++#define ext3_stop_delete_thread(sbi) do {} while(0)
++#endif /* EXT3_DELETE_THREAD */
++
+ void ext3_put_super (struct super_block * sb)
+ {
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+@@ -405,6 +611,7 @@ void ext3_put_super (struct super_block 
+       kdev_t j_dev = sbi->s_journal->j_dev;
+       int i;
++      ext3_stop_delete_thread(sbi);
+       ext3_xattr_put_super(sb);
+       journal_destroy(sbi->s_journal);
+       if (!(sb->s_flags & MS_RDONLY)) {
+@@ -453,7 +660,11 @@ static struct super_operations ext3_sops
+       write_inode:    ext3_write_inode,       /* BKL not held.  Don't need */
+       dirty_inode:    ext3_dirty_inode,       /* BKL not held.  We take it */
+       put_inode:      ext3_put_inode,         /* BKL not held.  Don't need */
++#ifdef EXT3_DELETE_THREAD
++      delete_inode:   ext3_delete_inode_thread,/* BKL not held. We take it */
++#else
+       delete_inode:   ext3_delete_inode,      /* BKL not held.  We take it */
++#endif
+       put_super:      ext3_put_super,         /* BKL held */
+       write_super:    ext3_write_super,       /* BKL held */
+       write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */
+@@ -514,6 +725,11 @@ static int parse_options (char * options
+            this_char = strtok (NULL, ",")) {
+               if ((value = strchr (this_char, '=')) != NULL)
+                       *value++ = 0;
++#ifdef EXT3_DELETE_THREAD
++              if (!strcmp(this_char, "asyncdel"))
++                      set_opt(*mount_options, ASYNCDEL);
++              else
++#endif
+ #ifdef CONFIG_EXT3_FS_XATTR_USER
+               if (!strcmp (this_char, "user_xattr"))
+                       set_opt (*mount_options, XATTR_USER);
+@@ -1220,6 +1436,7 @@ struct super_block * ext3_read_super (st
+       }
+       ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
++      ext3_start_delete_thread(sb);
+       /*
+        * akpm: core read_super() calls in here with the superblock locked.
+        * That deadlocks, because orphan cleanup needs to lock the superblock
+diff -puNr origin/include/linux/ext3_fs.h linux/include/linux/ext3_fs.h
+--- origin/include/linux/ext3_fs.h     2003-05-04 17:22:49.000000000 +0400
++++ linux/include/linux/ext3_fs.h      2003-05-04 15:06:10.000000000 +0400
+@@ -193,6 +193,7 @@ struct ext3_group_desc
+  */
+ #define EXT3_STATE_JDATA              0x00000001 /* journaled data exists */
+ #define EXT3_STATE_NEW                        0x00000002 /* inode is newly created */
++#define EXT3_STATE_DELETE             0x00000010 /* deferred delete inode */
+ /*
+  * ioctl commands
+@@ -321,6 +322,7 @@ struct ext3_inode {
+ #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
+ #define EXT3_MOUNT_XATTR_USER         0x4000  /* Extended user attributes */
++#define EXT3_MOUNT_ASYNCDEL           0x20000 /* Delayed deletion */
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+diff -puNr origin/include/linux/ext3_fs_sb.h linux/include/linux/ext3_fs_sb.h
+--- origin/include/linux/ext3_fs_sb.h  2003-05-04 17:23:52.000000000 +0400
++++ linux/include/linux/ext3_fs_sb.h   2003-05-04 11:37:04.000000000 +0400
+@@ -29,6 +29,8 @@
+ #define EXT3_MAX_GROUP_LOADED 8
++#define EXT3_DELETE_THREAD
++
+ /*
+  * third extended-fs super-block data in memory
+  */
+@@ -76,6 +78,14 @@ struct ext3_sb_info {
+       struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
+       wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
+ #endif
++#ifdef EXT3_DELETE_THREAD
++      spinlock_t s_delete_lock;
++      struct list_head s_delete_list;
++      unsigned long s_delete_blocks;
++      unsigned long s_delete_inodes;
++      wait_queue_head_t s_delete_thread_queue;
++      wait_queue_head_t s_delete_waiter_queue;
++#endif
+ };
+ #endif        /* _LINUX_EXT3_FS_SB */
diff --git a/lustre/kernel_patches/patches/ext3-largefile.patch b/lustre/kernel_patches/patches/ext3-largefile.patch
new file mode 100644 (file)
index 0000000..aa7a2f2
--- /dev/null
@@ -0,0 +1,16 @@
+ fs/ext3/inode.c |    2 +-
+ 1 files changed, 1 insertion(+), 1 deletion(-)
+
+--- linux-2.4.20/fs/ext3/inode.c~ext3-largefile        2003-04-08 23:35:36.000000000 -0600
++++ linux-2.4.20-braam/fs/ext3/inode.c 2003-04-08 23:35:36.000000000 -0600
+@@ -2562,7 +2562,7 @@ void ext3_dirty_inode(struct inode *inod
+       handle_t *handle;
+       lock_kernel();
+-      handle = ext3_journal_start(inode, 1);
++      handle = ext3_journal_start(inode, 2);
+       if (IS_ERR(handle))
+               goto out;
+       if (current_handle &&
+
+_
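
The one-line change above raises the credit reservation in ext3_dirty_inode() from 1 to 2. The patch gives no rationale; a plausible reading, not stated in the patch, is that marking an inode dirty can also dirty the superblock (for instance when the LARGE_FILE feature flag must be set for a file growing past 2 GB), so reserving only the inode-table block is not always enough.

    /* Assumed credit budget after this patch (my reading, not the patch's):
     *   1  inode-table block holding the inode
     *   1  superblock, in case a feature-flag / dynamic-rev update is needed
     */
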
diff --git a/lustre/kernel_patches/patches/ext3-noread-2.4.20.patch b/lustre/kernel_patches/patches/ext3-noread-2.4.20.patch
new file mode 100644 (file)
index 0000000..b14b869
--- /dev/null
@@ -0,0 +1,218 @@
+ fs/ext3/ialloc.c        |   47 ++++++++++++++++++++++-
+ fs/ext3/inode.c         |   96 +++++++++++++++++++++++++++++++++++++-----------
+ include/linux/ext3_fs.h |    2 +
+ 3 files changed, 121 insertions(+), 24 deletions(-)
+
+--- linux-2.4.20/fs/ext3/ialloc.c~ext3-noread-2.4.20   2003-05-16 12:21:39.000000000 +0800
++++ linux-2.4.20-root/fs/ext3/ialloc.c 2003-05-16 12:21:46.000000000 +0800
+@@ -289,6 +289,37 @@ error_return:
+ }
+ /*
++ * @block_group: block group of inode
++ * @offset: relative offset of inode within @block_group
++ *
++ * Check whether any of the inodes in this disk block are in use.
++ *
++ * Caller must be holding superblock lock (group/bitmap read lock in future).
++ */
++int ext3_itable_block_used(struct super_block *sb, unsigned int block_group,
++                         int offset)
++{
++      int bitmap_nr = load_inode_bitmap(sb, block_group);
++      int inodes_per_block;
++      unsigned long inum, iend;
++      struct buffer_head *ibitmap;
++
++      if (bitmap_nr < 0)
++              return 1;
++
++      inodes_per_block = sb->s_blocksize / EXT3_SB(sb)->s_inode_size;
++      inum = offset & ~(inodes_per_block - 1);
++      iend = inum + inodes_per_block;
++      ibitmap = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr];
++      for (; inum < iend; inum++) {
++              if (inum != offset && ext3_test_bit(inum, ibitmap->b_data))
++                      return 1;
++      }
++
++      return 0;
++}
++
++/*
+  * There are two policies for allocating an inode.  If the new inode is
+  * a directory, then a forward search is made for a block group with both
+  * free space and a low directory-to-inode ratio; if that fails, then of
+@@ -310,6 +341,7 @@ struct inode * ext3_new_inode (handle_t 
+       struct ext3_group_desc * gdp;
+       struct ext3_group_desc * tmp;
+       struct ext3_super_block * es;
++      struct ext3_iloc iloc;
+       int err = 0;
+       /* Cannot create files in a deleted directory */
+@@ -510,8 +542,19 @@ repeat:
+       inode->i_generation = sb->u.ext3_sb.s_next_generation++;
+       inode->u.ext3_i.i_state = EXT3_STATE_NEW;
+-      err = ext3_mark_inode_dirty(handle, inode);
+-      if (err) goto fail;
++      err = ext3_get_inode_loc_new(inode, &iloc, 1);
++      if (err) goto fail;
++      BUFFER_TRACE(iloc->bh, "get_write_access");
++      err = ext3_journal_get_write_access(handle, iloc.bh);
++      if (err) {
++              brelse(iloc.bh);
++              iloc.bh = NULL;
++              goto fail;
++      }
++      err = ext3_mark_iloc_dirty(handle, inode, &iloc);
++      if (err) goto fail;
++ 
++
+       
+       unlock_super (sb);
+       if(DQUOT_ALLOC_INODE(inode)) {
+--- linux-2.4.20/fs/ext3/inode.c~ext3-noread-2.4.20    2003-05-16 12:21:41.000000000 +0800
++++ linux-2.4.20-root/fs/ext3/inode.c  2003-05-16 12:22:15.000000000 +0800
+@@ -2013,14 +2013,19 @@ out_stop:
+       ext3_journal_stop(handle, inode);
+ }
+-/* 
+- * ext3_get_inode_loc returns with an extra refcount against the
+- * inode's underlying buffer_head on success. 
+- */
+-
+-int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc)
++#define NUM_INODE_PREREAD     16
++  
++/*
++  * ext3_get_inode_loc returns with an extra refcount against the inode's
++  * underlying buffer_head on success.  If this is for a new inode allocation
++  * (new is non-zero) then we may be able to optimize away the read if there
++  * are no other in-use inodes in this inode table block.  If we need to do
++  * a read, then read in a whole chunk of blocks to avoid blocking again soon
++  * if we are doing lots of creates/updates.
++  */
++int ext3_get_inode_loc_new(struct inode *inode, struct ext3_iloc *iloc, int new)
+ {
+-      struct buffer_head *bh = 0;
++      struct buffer_head *bh[NUM_INODE_PREREAD];
+       unsigned long block;
+       unsigned long block_group;
+       unsigned long group_desc;
+@@ -2045,31 +2050,73 @@ int ext3_get_inode_loc (struct inode *in
+       }
+       group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb);
+       desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1);
+-      bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc];
+-      if (!bh) {
++      if (!(inode->i_sb->u.ext3_sb.s_group_desc[group_desc])) {
+               ext3_error (inode->i_sb, "ext3_get_inode_loc",
+                           "Descriptor not loaded");
+               goto bad_inode;
+       }
+-      gdp = (struct ext3_group_desc *) bh->b_data;
++      gdp = (struct ext3_group_desc *)(inode->i_sb->u.ext3_sb.s_group_desc[group_desc]->b_data);
+       /*
+        * Figure out the offset within the block group inode table
+        */
+-      offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) *
+-              EXT3_INODE_SIZE(inode->i_sb);
++      offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb));
++
+       block = le32_to_cpu(gdp[desc].bg_inode_table) +
+-              (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb));
+-      if (!(bh = sb_bread(inode->i_sb, block))) {
+-              ext3_error (inode->i_sb, "ext3_get_inode_loc",
+-                          "unable to read inode block - "
+-                          "inode=%lu, block=%lu", inode->i_ino, block);
+-              goto bad_inode;
+-      }
+-      offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1);
++              (offset * EXT3_INODE_SIZE(inode->i_sb) >> EXT3_BLOCK_SIZE_BITS(inode->i_sb));
+-      iloc->bh = bh;
+-      iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset);
++      bh[0] = sb_getblk(inode->i_sb, block);
++      if (buffer_uptodate(bh[0]))
++              goto done;
++ 
++      /* If we don't really need to read this block, and it isn't already
++       * in memory, then we just zero it out.  Otherwise, we keep the
++       * current block contents (deleted inode data) for posterity.
++       */
++      if (new && !ext3_itable_block_used(inode->i_sb, block_group, offset)) {
++              lock_buffer(bh[0]);
++              memset(bh[0]->b_data, 0, bh[0]->b_size);
++              mark_buffer_uptodate(bh[0], 1);
++              unlock_buffer(bh[0]);
++      } else {
++              unsigned long block_end, itable_end;
++              int count = 1;
++ 
++              itable_end = le32_to_cpu(gdp[desc].bg_inode_table) +
++                              inode->i_sb->u.ext3_sb.s_itb_per_group;
++              block_end = block + NUM_INODE_PREREAD;
++              if (block_end > itable_end)
++                      block_end = itable_end;
++
++              for (++block; block < block_end; block++) {
++                      bh[count] = sb_getblk(inode->i_sb, block);
++                      if (count && (buffer_uptodate(bh[count]) ||
++                                    buffer_locked(bh[count]))) {
++                              __brelse(bh[count]);
++                      } else
++                              count++;
++              }
++ 
++              ll_rw_block(READ, count, bh);
++ 
++              /* Release all but the block we actually need (bh[0]) */
++              while (--count > 0)
++                      __brelse(bh[count]);
++ 
++              wait_on_buffer(bh[0]);
++              if (!buffer_uptodate(bh[0])) {
++                      ext3_error(inode->i_sb, __FUNCTION__,
++                                 "unable to read inode block - "
++                                 "inode=%lu, block=%lu", inode->i_ino,
++                                 bh[0]->b_blocknr);
++                      goto bad_inode;
++              }
++      }
++  done:
++      offset = (offset * EXT3_INODE_SIZE(inode->i_sb)) & (EXT3_BLOCK_SIZE(inode->i_sb) - 1);
++  
++      iloc->bh = bh[0];
++      iloc->raw_inode = (struct ext3_inode *)(bh[0]->b_data + offset);
+       iloc->block_group = block_group;
+       
+       return 0;
+@@ -2078,6 +2125,11 @@ int ext3_get_inode_loc (struct inode *in
+       return -EIO;
+ }
++int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
++{
++      return ext3_get_inode_loc_new(inode, iloc, 0);
++}
++ 
+ void ext3_read_inode(struct inode * inode)
+ {
+       struct ext3_iloc iloc;
+--- linux-2.4.20/include/linux/ext3_fs.h~ext3-noread-2.4.20    2003-05-16 12:21:39.000000000 +0800
++++ linux-2.4.20-root/include/linux/ext3_fs.h  2003-05-16 12:21:46.000000000 +0800
+@@ -683,6 +683,8 @@ extern int ext3_forget(handle_t *, int, 
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
++extern int ext3_itable_block_used(struct super_block *sb, unsigned int, int);
++extern int ext3_get_inode_loc_new(struct inode *, struct ext3_iloc *, int);
+ extern int  ext3_get_inode_loc (struct inode *, struct ext3_iloc *);
+ extern void ext3_read_inode (struct inode *);
+ extern void ext3_write_inode (struct inode *, int);
+
+_
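
Rough sizing of the read-ahead burst introduced above, assuming the common 4096-byte block size and 128-byte on-disk inodes (neither value is fixed by the patch):

    /*
     *   inodes per itable block = 4096 / 128                  = 32
     *   preread window          = NUM_INODE_PREREAD (16) * 32 = 512 inodes
     *
     * so one ll_rw_block() burst covers the next 512 inode slots, and a
     * newly allocated inode whose itable block holds no other live inodes
     * is simply zeroed in memory instead of being read from disk.
     */
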
diff --git a/lustre/kernel_patches/patches/ext3-orphan_lock.patch b/lustre/kernel_patches/patches/ext3-orphan_lock.patch
new file mode 100644 (file)
index 0000000..d1e5c8d
--- /dev/null
@@ -0,0 +1,79 @@
+--- linux/fs/ext3/namei.c.orig Fri Mar 14 14:11:58 2003
++++ linux/fs/ext3/namei.c      Fri Mar 14 14:39:48 2003
+@@ -1406,8 +1409,8 @@
+       struct super_block *sb = inode->i_sb;
+       struct ext3_iloc iloc;
+       int err = 0, rc;
+-      
+-      lock_super(sb);
++
++      down(&EXT3_SB(sb)->s_orphan_lock);
+       if (!list_empty(&EXT3_I(inode)->i_orphan))
+               goto out_unlock;
+@@ -1455,7 +1458,7 @@
+       jbd_debug(4, "orphan inode %ld will point to %d\n",
+                       inode->i_ino, NEXT_ORPHAN(inode));
+ out_unlock:
+-      unlock_super(sb);
++      up(&EXT3_SB(sb)->s_orphan_lock);
+       ext3_std_error(inode->i_sb, err);
+       return err;
+ }
+@@ -1468,20 +1471,19 @@
+ {
+       struct list_head *prev;
+       struct ext3_inode_info *ei = EXT3_I(inode);
+-      struct ext3_sb_info *sbi;
++      struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb);
+       unsigned long ino_next;
+       struct ext3_iloc iloc;
+       int err = 0;
+-      lock_super(inode->i_sb);
++      down(&sbi->s_orphan_lock);
+       if (list_empty(&ei->i_orphan)) {
+-              unlock_super(inode->i_sb);
++              up(&sbi->s_orphan_lock);
+               return 0;
+       }
+       ino_next = NEXT_ORPHAN(inode);
+       prev = ei->i_orphan.prev;
+-      sbi = EXT3_SB(inode->i_sb);
+       jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+@@ -1525,10 +1527,10 @@
+       if (err)
+               goto out_brelse;
+-out_err: 
++out_err:
+       ext3_std_error(inode->i_sb, err);
+ out:
+-      unlock_super(inode->i_sb);
++      up(&sbi->s_orphan_lock);
+       return err;
+ out_brelse:
+--- linux/fs/ext3/super.c.orig Fri Mar 14 14:11:58 2003
++++ linux/fs/ext3/super.c      Fri Mar 14 14:36:00 2003
+@@ -1134,6 +1314,7 @@
+        */
+       sb->s_op = &ext3_sops;
+       INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
++      sema_init(&sbi->s_orphan_lock, 1);
+       sb->s_root = 0;
+--- linux/include/linux/ext3_fs_sb.h.orig      Tue Feb 11 16:34:33 2003
++++ linux/include/linux/ext3_fs_sb.h   Fri Mar 14 14:30:11 2003
+@@ -67,6 +69,7 @@
+       struct inode * s_journal_inode;
+       struct journal_s * s_journal;
+       struct list_head s_orphan;
++      struct semaphore s_orphan_lock;
+       unsigned long s_commit_interval;
+       struct block_device *journal_bdev;
+ #ifdef CONFIG_JBD_DEBUG
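
The hunks above replace lock_super()/unlock_super() around orphan-list updates with a dedicated s_orphan_lock semaphore, presumably so that frequent orphan add/remove traffic no longer serializes against every other lock_super() user. The resulting pattern is sketched below with a hypothetical helper; only the semaphore, the s_orphan list and the i_orphan entry come from the patch.

    static void demo_touch_orphan_list(struct inode *inode)
    {
            struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb);

            down(&sbi->s_orphan_lock);
            /* ... add or remove EXT3_I(inode)->i_orphan on sbi->s_orphan ... */
            up(&sbi->s_orphan_lock);
    }
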
diff --git a/lustre/kernel_patches/patches/ext3-san-2.4.20.patch b/lustre/kernel_patches/patches/ext3-san-2.4.20.patch
new file mode 100644 (file)
index 0000000..148f4e3
--- /dev/null
@@ -0,0 +1,117 @@
+ fs/ext3/ext3-exports.c |    9 ++++-
+ fs/ext3/inode.c        |   81 +++++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 89 insertions(+), 1 deletion(-)
+
+--- linux/fs/ext3/inode.c~ext3-san-2.4.20-hp   Tue Apr 29 11:01:52 2003
++++ linux-mmonroe/fs/ext3/inode.c      Tue Apr 29 11:01:53 2003
+@@ -2734,3 +2734,84 @@ int ext3_change_inode_journal_flag(struc
+  * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we
+  * need to extend" test in ext3_prepare_write() succeeds.  
+  */
++
++/* for each block: 1 ind + 1 dind + 1 tind
++ * for each block: 3 bitmap blocks
++ * for each block: 3 group descriptor blocks
++ * i inode block
++ * 1 superblock
++ * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files
++ * ((1+1+1) * 3 * nblocks) + 1 + 1 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
++ *
++ * XXX assuming:
++ * (1) fs logic block size == page size
++ * (2) ext3 in writeback mode
++ */
++static inline int ext3_san_write_trans_blocks(int nblocks)
++{
++      int ret;
++      
++      ret = (1 + 1 + 1) * 3 * nblocks + 1 + 1;
++
++#ifdef CONFIG_QUOTA
++      ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++      return ret;
++}
++
++/* Alloc blocks for an inode, while don't create any buffer/page
++ * for data I/O; set the inode size if file is extended.
++ *
++ * @inode:    target inode
++ * @blocks:   array of logic block number
++ * @nblocks:  how many blocks need be alloced
++ * @newsize:  new filesize we should set
++ *
++ * return:    0 success, otherwise failed
++ *            (*blocks) contains physical block number alloced
++ *
++ * XXX this assume the fs block size == page size
++ */
++int ext3_prep_san_write(struct inode *inode, long *blocks,
++                      int nblocks, loff_t newsize)
++{
++      handle_t *handle;
++      struct buffer_head bh_tmp;
++      int needed_blocks;
++      int i, ret = 0, ret2;
++
++      needed_blocks = ext3_san_write_trans_blocks(nblocks);
++
++      lock_kernel();
++      handle = ext3_journal_start(inode, needed_blocks);
++      if (IS_ERR(handle)) {
++              unlock_kernel();
++              return PTR_ERR(handle);
++      }
++      unlock_kernel();
++
++      /* alloc blocks one by one */
++      for (i = 0; i < nblocks; i++) {
++              ret = ext3_get_block_handle(handle, inode, blocks[i],
++                                              &bh_tmp, 1);
++              if (ret)
++                      break;
++
++              blocks[i] = bh_tmp.b_blocknr;
++      }
++
++      /* set inode size if needed */
++      if (!ret && (newsize > inode->i_size)) {
++              inode->i_size = newsize;
++              ext3_mark_inode_dirty(handle, inode);
++      }
++
++      lock_kernel();
++      ret2 = ext3_journal_stop(handle, inode);
++      unlock_kernel();
++
++      if (!ret)
++              ret = ret2;
++      return ret;
++}
+--- linux/fs/ext3/ext3-exports.c~ext3-san-2.4.20-hp    Tue Apr 29 11:01:51 2003
++++ linux-mmonroe/fs/ext3/ext3-exports.c       Tue Apr 29 11:07:19 2003
+@@ -1,9 +1,15 @@
+ #include <linux/config.h>
+ #include <linux/module.h>
+-#include <linux/ext3_fs.h>
++#include <linux/fs.h>
++#include <linux/locks.h>
++#include <linux/slab.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_fs.h>
+ #include <linux/ext3_xattr.h>
++int ext3_prep_san_write(struct inode *inode, long *blocks,
++                      int nblocks, loff_t newsize);
++
+ EXPORT_SYMBOL(ext3_force_commit);
+ EXPORT_SYMBOL(ext3_bread);
+ EXPORT_SYMBOL(ext3_xattr_register);
+@@ -11,3 +17,4 @@ EXPORT_SYMBOL(ext3_xattr_unregister);
+ EXPORT_SYMBOL(ext3_xattr_get);
+ EXPORT_SYMBOL(ext3_xattr_list);
+ EXPORT_SYMBOL(ext3_xattr_set);
++EXPORT_SYMBOL(ext3_prep_san_write);
+
+_
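
A worked instance of the credit estimate in ext3_san_write_trans_blocks() above; the arithmetic is mine, the formula is the patch's. For nblocks = 8:

    /*
     *   (1 + 1 + 1) * 3 * 8 = 72   ind/dind/tind blocks plus their bitmaps
     *                              and group descriptors, 9 per data block
     *   + 1                        inode block
     *   + 1                        superblock
     *   -------------------------------------------------------------
     *   = 74 credits, plus 2 * EXT3_SINGLEDATA_TRANS_BLOCKS more when
     *     CONFIG_QUOTA is enabled.
     */
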
diff --git a/lustre/kernel_patches/patches/ext3-truncate_blocks-chaos.patch.patch b/lustre/kernel_patches/patches/ext3-truncate_blocks-chaos.patch.patch
new file mode 100644 (file)
index 0000000..ce3928d
--- /dev/null
@@ -0,0 +1,92 @@
+--- ./fs/ext3/inode.c.orig     Wed Mar 12 02:44:06 2003
++++ ./fs/ext3/inode.c  Wed Mar 12 11:55:20 2003
+@@ -99,7 +99,35 @@ int ext3_forget(handle_t *handle, int is
+       return err;
+ }
+-/* 
++/*
++ * Work out how many blocks we need to progress with the next chunk of a
++ * truncate transaction.
++ */
++
++static unsigned long blocks_for_truncate(struct inode *inode)
++{
++      unsigned long needed;
++
++      needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
++
++      /* Give ourselves just enough room to cope with inodes in which
++       * i_blocks is corrupt: we've seen disk corruptions in the past
++       * which resulted in random data in an inode which looked enough
++       * like a regular file for ext3 to try to delete it.  Things
++       * will go a bit crazy if that happens, but at least we should
++       * try not to panic the whole kernel. */
++      if (needed < 2)
++              needed = 2;
++
++      /* But we need to bound the transaction so we don't overflow the
++       * journal. */
++      if (needed > EXT3_MAX_TRANS_DATA)
++              needed = EXT3_MAX_TRANS_DATA;
++
++      return EXT3_DATA_TRANS_BLOCKS + needed;
++}
++
++/*
+  * Truncate transactions can be complex and absolutely huge.  So we need to
+  * be able to restart the transaction at a conventient checkpoint to make
+  * sure we don't overflow the journal.
+@@ -110,19 +138,14 @@ int ext3_forget(handle_t *handle, int is
+  * transaction in the top-level truncate loop. --sct 
+  */
+-static handle_t *start_transaction(struct inode *inode) 
++static handle_t *start_transaction(struct inode *inode)
+ {
+-      long needed;
+       handle_t *result;
+-      
+-      needed = inode->i_blocks;
+-      if (needed > EXT3_MAX_TRANS_DATA) 
+-              needed = EXT3_MAX_TRANS_DATA;
+-      
+-      result = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed);
++
++      result = ext3_journal_start(inode, blocks_for_truncate(inode));
+       if (!IS_ERR(result))
+               return result;
+-      
++
+       ext3_std_error(inode->i_sb, PTR_ERR(result));
+       return result;
+ }
+@@ -135,14 +158,9 @@ static handle_t *start_transaction(struc
+  */
+ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+ {
+-      long needed;
+-      
+       if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
+               return 0;
+-      needed = inode->i_blocks;
+-      if (needed > EXT3_MAX_TRANS_DATA) 
+-              needed = EXT3_MAX_TRANS_DATA;
+-      if (!ext3_journal_extend(handle, EXT3_RESERVE_TRANS_BLOCKS + needed))
++      if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
+               return 0;
+       return 1;
+ }
+@@ -154,11 +172,8 @@ static int try_to_extend_transaction(han
+  */
+ static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
+ {
+-      long needed = inode->i_blocks;
+-      if (needed > EXT3_MAX_TRANS_DATA) 
+-              needed = EXT3_MAX_TRANS_DATA;
+       jbd_debug(2, "restarting handle %p\n", handle);
+-      return ext3_journal_restart(handle, EXT3_DATA_TRANS_BLOCKS + needed);
++      return ext3_journal_restart(handle, blocks_for_truncate(inode));
+ }
+ /*
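
blocks_for_truncate() above converts i_blocks (512-byte sectors) into filesystem blocks before sizing the transaction, where the removed code used the raw sector count, and clamps the result between 2 and EXT3_MAX_TRANS_DATA. Two worked cases, assuming 4096-byte blocks so that s_blocksize_bits - 9 = 3; the example numbers are illustrative, the clamping logic is the patch's. The ext3-truncate_blocks.patch below carries the identical change.

    /*
     *   tiny file : i_blocks = 8 sectors   -> 8 >> 3 = 1, raised to the
     *               floor of 2             -> EXT3_DATA_TRANS_BLOCKS + 2
     *   huge file : i_blocks = 2097152     -> 2097152 >> 3 = 262144,
     *               capped at EXT3_MAX_TRANS_DATA
     *                                      -> EXT3_DATA_TRANS_BLOCKS +
     *                                         EXT3_MAX_TRANS_DATA
     */
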
diff --git a/lustre/kernel_patches/patches/ext3-truncate_blocks.patch b/lustre/kernel_patches/patches/ext3-truncate_blocks.patch
new file mode 100644 (file)
index 0000000..ce3928d
--- /dev/null
@@ -0,0 +1,92 @@
+--- ./fs/ext3/inode.c.orig     Wed Mar 12 02:44:06 2003
++++ ./fs/ext3/inode.c  Wed Mar 12 11:55:20 2003
+@@ -99,7 +99,35 @@ int ext3_forget(handle_t *handle, int is
+       return err;
+ }
+-/* 
++/*
++ * Work out how many blocks we need to progress with the next chunk of a
++ * truncate transaction.
++ */
++
++static unsigned long blocks_for_truncate(struct inode *inode)
++{
++      unsigned long needed;
++
++      needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
++
++      /* Give ourselves just enough room to cope with inodes in which
++       * i_blocks is corrupt: we've seen disk corruptions in the past
++       * which resulted in random data in an inode which looked enough
++       * like a regular file for ext3 to try to delete it.  Things
++       * will go a bit crazy if that happens, but at least we should
++       * try not to panic the whole kernel. */
++      if (needed < 2)
++              needed = 2;
++
++      /* But we need to bound the transaction so we don't overflow the
++       * journal. */
++      if (needed > EXT3_MAX_TRANS_DATA)
++              needed = EXT3_MAX_TRANS_DATA;
++
++      return EXT3_DATA_TRANS_BLOCKS + needed;
++}
++
++/*
+  * Truncate transactions can be complex and absolutely huge.  So we need to
+  * be able to restart the transaction at a conventient checkpoint to make
+  * sure we don't overflow the journal.
+@@ -110,19 +138,14 @@ int ext3_forget(handle_t *handle, int is
+  * transaction in the top-level truncate loop. --sct 
+  */
+-static handle_t *start_transaction(struct inode *inode) 
++static handle_t *start_transaction(struct inode *inode)
+ {
+-      long needed;
+       handle_t *result;
+-      
+-      needed = inode->i_blocks;
+-      if (needed > EXT3_MAX_TRANS_DATA) 
+-              needed = EXT3_MAX_TRANS_DATA;
+-      
+-      result = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed);
++
++      result = ext3_journal_start(inode, blocks_for_truncate(inode));
+       if (!IS_ERR(result))
+               return result;
+-      
++
+       ext3_std_error(inode->i_sb, PTR_ERR(result));
+       return result;
+ }
+@@ -135,14 +158,9 @@ static handle_t *start_transaction(struc
+  */
+ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+ {
+-      long needed;
+-      
+       if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
+               return 0;
+-      needed = inode->i_blocks;
+-      if (needed > EXT3_MAX_TRANS_DATA) 
+-              needed = EXT3_MAX_TRANS_DATA;
+-      if (!ext3_journal_extend(handle, EXT3_RESERVE_TRANS_BLOCKS + needed))
++      if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
+               return 0;
+       return 1;
+ }
+@@ -154,11 +172,8 @@ static int try_to_extend_transaction(han
+  */
+ static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
+ {
+-      long needed = inode->i_blocks;
+-      if (needed > EXT3_MAX_TRANS_DATA) 
+-              needed = EXT3_MAX_TRANS_DATA;
+       jbd_debug(2, "restarting handle %p\n", handle);
+-      return ext3_journal_restart(handle, EXT3_DATA_TRANS_BLOCKS + needed);
++      return ext3_journal_restart(handle, blocks_for_truncate(inode));
+ }
+ /*
diff --git a/lustre/kernel_patches/patches/ext3-unmount_sync.patch b/lustre/kernel_patches/patches/ext3-unmount_sync.patch
new file mode 100644 (file)
index 0000000..c57903c
--- /dev/null
@@ -0,0 +1,21 @@
+ fs/ext3/super.c |    7 ++++++-
+ 1 files changed, 6 insertions(+), 1 deletion(-)
+
+--- linux-2.4.20/fs/ext3/super.c~ext3-unmount_sync     2003-04-08 23:35:44.000000000 -0600
++++ linux-2.4.20-braam/fs/ext3/super.c 2003-04-08 23:35:44.000000000 -0600
+@@ -1612,7 +1612,12 @@ void ext3_write_super (struct super_bloc
+       sb->s_dirt = 0;
+       target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
+-      if (do_sync_supers) {
++      /*
++       * Tricky --- if we are unmounting, the write really does need
++       * to be synchronous.  We can detect that by looking for NULL in
++       * sb->s_root.
++       */
++      if (do_sync_supers || !sb->s_root) {
+               unlock_super(sb);
+               log_wait_commit(EXT3_SB(sb)->s_journal, target);
+               lock_super(sb);
+
+_
@@ -1,21 +1,9 @@
-
-
-If ext3_add_nondir() fails it will do an iput() of the inode.  But we
-continue to run ext3_mark_inode_dirty() against the potentially-freed
-inode.  This oopses when slab poisoning is enabled.
-
-Fix it so that we only run ext3_mark_inode_dirty() if the inode was
-successfully instantiated.
-
-This bug was added in 2.4.20-pre9.
-
-
- fs/ext3/namei.c |   11 +++++------
+ ./fs/ext3/namei.c |   11 +++++------
  1 files changed, 5 insertions(+), 6 deletions(-)
 
---- 24/fs/ext3/namei.c~ext3-use-after-free     Sun Dec 15 11:27:50 2002
-+++ 24-akpm/fs/ext3/namei.c    Sun Dec 15 11:27:50 2002
-@@ -429,8 +429,11 @@ static int ext3_add_nondir(handle_t *han
+--- linux-2.4.20/./fs/ext3/namei.c~ext3-use-after-free 2003-04-08 23:35:51.000000000 -0600
++++ linux-2.4.20-braam/./fs/ext3/namei.c       2003-04-08 23:35:51.000000000 -0600
+@@ -1521,8 +1521,11 @@ static int ext3_add_nondir(handle_t *han
  {
        int err = ext3_add_entry(handle, dentry, inode);
        if (!err) {
@@ -29,7 +17,7 @@ This bug was added in 2.4.20-pre9.
        }
        ext3_dec_count(handle, inode);
        iput(inode);
-@@ -465,7 +468,6 @@ static int ext3_create (struct inode * d
+@@ -1559,7 +1562,6 @@ static int ext3_create (struct inode * d
                inode->i_fop = &ext3_file_operations;
                inode->i_mapping->a_ops = &ext3_aops;
                err = ext3_add_nondir(handle, dentry, inode);
@@ -37,7 +25,7 @@ This bug was added in 2.4.20-pre9.
        }
        ext3_journal_stop(handle, dir);
        return err;
-@@ -490,7 +492,6 @@ static int ext3_mknod (struct inode * di
+@@ -1586,7 +1588,6 @@ static int ext3_mknod (struct inode * di
        if (!IS_ERR(inode)) {
                init_special_inode(inode, mode, rdev);
                err = ext3_add_nondir(handle, dentry, inode);
@@ -45,15 +33,15 @@ This bug was added in 2.4.20-pre9.
        }
        ext3_journal_stop(handle, dir);
        return err;
-@@ -934,7 +935,6 @@ static int ext3_symlink (struct inode * 
+@@ -2035,7 +2036,6 @@ static int ext3_symlink (struct inode * 
        }
-       inode->u.ext3_i.i_disksize = inode->i_size;
+       EXT3_I(inode)->i_disksize = inode->i_size;
        err = ext3_add_nondir(handle, dentry, inode);
 -      ext3_mark_inode_dirty(handle, inode);
  out_stop:
        ext3_journal_stop(handle, dir);
        return err;
-@@ -971,7 +971,6 @@ static int ext3_link (struct dentry * ol
+@@ -2069,7 +2069,6 @@ static int ext3_link (struct dentry * ol
        atomic_inc(&inode->i_count);
  
        err = ext3_add_nondir(handle, dentry, inode);
diff --git a/lustre/kernel_patches/patches/ext3-xattr-2.5.patch b/lustre/kernel_patches/patches/ext3-xattr-2.5.patch
deleted file mode 100644 (file)
index 4179839..0000000
+++ /dev/null
@@ -1,2690 +0,0 @@
-# This is a BitKeeper generated patch for the following project:
-# Project Name: Linux kernel tree
-# This patch format is intended for GNU patch command version 2.5 or higher.
-# This patch includes the following deltas:
-#                 ChangeSet    1.809   -> 1.810  
-#          fs/ext3/Makefile    1.4     -> 1.5    
-#      include/linux/ext3_jbd.h        1.5     -> 1.6    
-#          fs/ext3/ialloc.c    1.17    -> 1.18   
-#         fs/ext3/symlink.c    1.3     -> 1.4    
-#               fs/Makefile    1.42    -> 1.43   
-#           fs/ext3/namei.c    1.22    -> 1.23   
-#      include/linux/ext3_fs.h 1.11    -> 1.12   
-#              fs/Config.in    1.39    -> 1.40   
-#           fs/ext3/inode.c    1.42    -> 1.43   
-#            fs/Config.help    1.21    -> 1.22   
-#           fs/ext3/super.c    1.33    -> 1.34   
-#            fs/ext3/file.c    1.9     -> 1.10   
-#                     (new)            -> 1.1     fs/ext3/xattr.h
-#                     (new)            -> 1.1     include/linux/mbcache.h
-#                     (new)            -> 1.1     fs/ext3/xattr.c
-#                     (new)            -> 1.1     fs/mbcache.c   
-#                     (new)            -> 1.1     fs/ext3/xattr_user.c
-#
-# The following is the BitKeeper ChangeSet Log
-# --------------------------------------------
-# 02/10/20     braam@clusterfs.com     1.810
-# xattrs for UML bk repository
-# --------------------------------------------
-#
-diff -Nru a/fs/Config.help b/fs/Config.help
---- a/fs/Config.help   Sun Dec  8 02:49:56 2002
-+++ b/fs/Config.help   Sun Dec  8 02:49:56 2002
-@@ -154,6 +154,13 @@
-   of your root partition (the one containing the directory /) cannot
-   be compiled as a module, and so this may be dangerous.
-+CONFIG_EXT3_FS_XATTR
-+  Extended attributes are name:value pairs associated with inodes by
-+  the kernel or by users (see the attr(5) manual page, or visit
-+  <http://acl.bestbits.at/> for details).
-+
-+  If unsure, say N.
-+
- CONFIG_JBD
-   This is a generic journaling layer for block devices.  It is
-   currently used by the ext3 file system, but it could also be used to
-diff -Nru a/fs/Config.in b/fs/Config.in
---- a/fs/Config.in     Sun Dec  8 02:49:56 2002
-+++ b/fs/Config.in     Sun Dec  8 02:49:56 2002
-@@ -27,6 +27,7 @@
- dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL
- tristate 'Ext3 journalling file system support' CONFIG_EXT3_FS
-+dep_mbool '  Ext3 extended attributes' CONFIG_EXT3_FS_XATTR $CONFIG_EXT3_FS
- # CONFIG_JBD could be its own option (even modular), but until there are
- # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
- # dep_tristate '  Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
-@@ -180,6 +181,17 @@
-    define_tristate CONFIG_ZISOFS_FS $CONFIG_ISO9660_FS
- else
-    define_tristate CONFIG_ZISOFS_FS n
-+fi
-+
-+# Meta block cache for Extended Attributes (ext2/ext3)
-+if [ "$CONFIG_EXT2_FS_XATTR" = "y" -o "$CONFIG_EXT3_FS_XATTR" = "y" ]; then
-+   if [ "$CONFIG_EXT2_FS" = "y" -o "$CONFIG_EXT3_FS" = "y" ]; then
-+      define_tristate CONFIG_FS_MBCACHE y
-+   else
-+      if [ "$CONFIG_EXT2_FS" = "m" -o "$CONFIG_EXT3_FS" = "m" ]; then
-+         define_tristate CONFIG_FS_MBCACHE m
-+      fi
-+   fi
- fi
- mainmenu_option next_comment
-diff -Nru a/fs/Makefile b/fs/Makefile
---- a/fs/Makefile      Sun Dec  8 02:49:56 2002
-+++ b/fs/Makefile      Sun Dec  8 02:49:56 2002
-@@ -6,7 +6,7 @@
- # 
- export-objs :=        open.o dcache.o buffer.o bio.o inode.o dquot.o mpage.o aio.o \
--                fcntl.o read_write.o dcookies.o
-+                fcntl.o read_write.o dcookies.o mbcache.o
- obj-y :=      open.o read_write.o devices.o file_table.o buffer.o \
-               bio.o super.o block_dev.o char_dev.o stat.o exec.o pipe.o \
-@@ -29,6 +29,8 @@
- obj-y                         += binfmt_script.o
- obj-$(CONFIG_BINFMT_ELF)      += binfmt_elf.o
-+
-+obj-$(CONFIG_FS_MBCACHE)      += mbcache.o
- obj-$(CONFIG_QUOTA)           += dquot.o
- obj-$(CONFIG_QFMT_V1)         += quota_v1.o
-diff -Nru a/fs/ext3/Makefile b/fs/ext3/Makefile
---- a/fs/ext3/Makefile Sun Dec  8 02:49:56 2002
-+++ b/fs/ext3/Makefile Sun Dec  8 02:49:56 2002
-@@ -7,4 +7,10 @@
- ext3-objs    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-               ioctl.o namei.o super.o symlink.o hash.o
-+export-objs += xattr.o
-+
-+ifeq ($(CONFIG_EXT3_FS_XATTR),y)
-+ext3-objs += xattr.o xattr_user.o
-+endif
-+
- include $(TOPDIR)/Rules.make
-diff -Nru a/fs/ext3/file.c b/fs/ext3/file.c
---- a/fs/ext3/file.c   Sun Dec  8 02:49:56 2002
-+++ b/fs/ext3/file.c   Sun Dec  8 02:49:56 2002
-@@ -23,7 +23,7 @@
- #include <linux/jbd.h>
- #include <linux/ext3_fs.h>
- #include <linux/ext3_jbd.h>
--#include <linux/smp_lock.h>
-+#include "xattr.h"
- /*
-  * Called when an inode is released. Note that this is different
-@@ -98,5 +98,9 @@
- struct inode_operations ext3_file_inode_operations = {
-       .truncate       = ext3_truncate,
-       .setattr        = ext3_setattr,
-+      .setxattr       = ext3_setxattr,
-+      .getxattr       = ext3_getxattr,
-+      .listxattr      = ext3_listxattr,
-+      .removexattr    = ext3_removexattr,
- };
-diff -Nru a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
---- a/fs/ext3/ialloc.c Sun Dec  8 02:49:56 2002
-+++ b/fs/ext3/ialloc.c Sun Dec  8 02:49:56 2002
-@@ -25,6 +25,8 @@
- #include <asm/bitops.h>
- #include <asm/byteorder.h>
-+#include "xattr.h"
-+
- /*
-  * ialloc.c contains the inodes allocation and deallocation routines
-  */
-@@ -118,6 +120,7 @@
-        * as writing the quota to disk may need the lock as well.
-        */
-       DQUOT_INIT(inode);
-+      ext3_xattr_delete_inode(handle, inode);
-       DQUOT_FREE_INODE(inode);
-       DQUOT_DROP(inode);
-diff -Nru a/fs/ext3/inode.c b/fs/ext3/inode.c
---- a/fs/ext3/inode.c  Sun Dec  8 02:49:56 2002
-+++ b/fs/ext3/inode.c  Sun Dec  8 02:49:56 2002
-@@ -42,6 +42,18 @@
-  */
- #undef SEARCH_FROM_ZERO
-+/*
-+ * Test whether an inode is a fast symlink.
-+ */
-+static inline int ext3_inode_is_fast_symlink(struct inode *inode)
-+{
-+      int ea_blocks = EXT3_I(inode)->i_file_acl ?
-+              (inode->i_sb->s_blocksize >> 9) : 0;
-+
-+      return (S_ISLNK(inode->i_mode) &&
-+              inode->i_blocks - ea_blocks == 0);
-+}
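
A worked example of the test above (a sketch, assuming a 4096-byte block
size): a fast symlink that stores its target in i_data but owns one
external xattr block has i_blocks == 8 (in 512-byte units) while
ea_blocks == 4096 >> 9 == 8, so i_blocks - ea_blocks == 0 and it is still
recognized as a fast symlink. A slow symlink with a real data block would
have i_blocks == 16 in the same situation and fail the test. The old
check, "if (!inode->i_blocks)" further down in this patch, misclassified
fast symlinks that carry extended attributes.
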
-+
- /* The ext3 forget function must perform a revoke if we are freeing data
-  * which has been journaled.  Metadata (eg. indirect blocks) must be
-  * revoked in all cases. 
-@@ -51,7 +63,7 @@
-  * still needs to be revoked.
-  */
--static int ext3_forget(handle_t *handle, int is_metadata,
-+int ext3_forget(handle_t *handle, int is_metadata,
-                      struct inode *inode, struct buffer_head *bh,
-                      int blocknr)
- {
-@@ -167,9 +179,7 @@
- {
-       handle_t *handle;
-       
--      if (is_bad_inode(inode) ||
--          inode->i_ino == EXT3_ACL_IDX_INO ||
--          inode->i_ino == EXT3_ACL_DATA_INO)
-+      if (is_bad_inode(inode))
-               goto no_delete;
-       lock_kernel();
-@@ -1979,6 +1989,8 @@
-       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-           S_ISLNK(inode->i_mode)))
-               return;
-+      if (ext3_inode_is_fast_symlink(inode))
-+              return;
-       if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-               return;
-@@ -2130,8 +2142,6 @@
-       struct ext3_group_desc * gdp;
-               
-       if ((inode->i_ino != EXT3_ROOT_INO &&
--              inode->i_ino != EXT3_ACL_IDX_INO &&
--              inode->i_ino != EXT3_ACL_DATA_INO &&
-               inode->i_ino != EXT3_JOURNAL_INO &&
-               inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
-               inode->i_ino > le32_to_cpu(
-@@ -2263,10 +2273,7 @@
-       brelse (iloc.bh);
--      if (inode->i_ino == EXT3_ACL_IDX_INO ||
--          inode->i_ino == EXT3_ACL_DATA_INO)
--              /* Nothing to do */ ;
--      else if (S_ISREG(inode->i_mode)) {
-+      if (S_ISREG(inode->i_mode)) {
-               inode->i_op = &ext3_file_inode_operations;
-               inode->i_fop = &ext3_file_operations;
-               if (ext3_should_writeback_data(inode))
-@@ -2277,18 +2284,20 @@
-               inode->i_op = &ext3_dir_inode_operations;
-               inode->i_fop = &ext3_dir_operations;
-       } else if (S_ISLNK(inode->i_mode)) {
--              if (!inode->i_blocks)
-+              if (ext3_inode_is_fast_symlink(inode))
-                       inode->i_op = &ext3_fast_symlink_inode_operations;
-               else {
--                      inode->i_op = &page_symlink_inode_operations;
-+                      inode->i_op = &ext3_symlink_inode_operations;
-                       if (ext3_should_writeback_data(inode))
-                               inode->i_mapping->a_ops = &ext3_writeback_aops;
-                       else
-                               inode->i_mapping->a_ops = &ext3_aops;
-               }
--      } else 
-+      } else {
-+              inode->i_op = &ext3_special_inode_operations;
-               init_special_inode(inode, inode->i_mode,
-                                  le32_to_cpu(iloc.raw_inode->i_block[0]));
-+      }
-       if (ei->i_flags & EXT3_SYNC_FL)
-               inode->i_flags |= S_SYNC;
-       if (ei->i_flags & EXT3_APPEND_FL)
-diff -Nru a/fs/ext3/namei.c b/fs/ext3/namei.c
---- a/fs/ext3/namei.c  Sun Dec  8 02:49:56 2002
-+++ b/fs/ext3/namei.c  Sun Dec  8 02:49:56 2002
-@@ -36,6 +36,7 @@
- #include <linux/quotaops.h>
- #include <linux/buffer_head.h>
- #include <linux/smp_lock.h>
-+#include "xattr.h"
- /*
-@@ -1654,7 +1655,7 @@
-       if (IS_DIRSYNC(dir))
-               handle->h_sync = 1;
--      inode = ext3_new_inode (handle, dir, S_IFDIR);
-+      inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
-       err = PTR_ERR(inode);
-       if (IS_ERR(inode))
-               goto out_stop;
-@@ -1662,7 +1663,6 @@
-       inode->i_op = &ext3_dir_inode_operations;
-       inode->i_fop = &ext3_dir_operations;
-       inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
--      inode->i_blocks = 0;    
-       dir_block = ext3_bread (handle, inode, 0, 1, &err);
-       if (!dir_block) {
-               inode->i_nlink--; /* is this nlink == 0? */
-@@ -1689,9 +1689,6 @@
-       BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
-       ext3_journal_dirty_metadata(handle, dir_block);
-       brelse (dir_block);
--      inode->i_mode = S_IFDIR | mode;
--      if (dir->i_mode & S_ISGID)
--              inode->i_mode |= S_ISGID;
-       ext3_mark_inode_dirty(handle, inode);
-       err = ext3_add_entry (handle, dentry, inode);
-       if (err) {
-@@ -2068,7 +2065,7 @@
-               goto out_stop;
-       if (l > sizeof (EXT3_I(inode)->i_data)) {
--              inode->i_op = &page_symlink_inode_operations;
-+              inode->i_op = &ext3_symlink_inode_operations;
-               if (ext3_should_writeback_data(inode))
-                       inode->i_mapping->a_ops = &ext3_writeback_aops;
-               else
-@@ -2284,4 +2281,17 @@
-       .rmdir          = ext3_rmdir,
-       .mknod          = ext3_mknod,
-       .rename         = ext3_rename,
-+      .setxattr       = ext3_setxattr,        
-+      .getxattr       = ext3_getxattr,        
-+      .listxattr      = ext3_listxattr,       
-+      .removexattr    = ext3_removexattr,
- };
-+
-+struct inode_operations ext3_special_inode_operations = {
-+      .setxattr       = ext3_setxattr,
-+      .getxattr       = ext3_getxattr,
-+      .listxattr      = ext3_listxattr,
-+      .removexattr    = ext3_removexattr,
-+};
-+
-+ 
-diff -Nru a/fs/ext3/super.c b/fs/ext3/super.c
---- a/fs/ext3/super.c  Sun Dec  8 02:49:56 2002
-+++ b/fs/ext3/super.c  Sun Dec  8 02:49:56 2002
-@@ -30,6 +30,7 @@
- #include <linux/smp_lock.h>
- #include <linux/buffer_head.h>
- #include <asm/uaccess.h>
-+#include "xattr.h"
- #ifdef CONFIG_JBD_DEBUG
- static int ext3_ro_after; /* Make fs read-only after this many jiffies */
-@@ -405,6 +406,7 @@
-       struct ext3_super_block *es = sbi->s_es;
-       int i;
-+      ext3_xattr_put_super(sb);
-       journal_destroy(sbi->s_journal);
-       if (!(sb->s_flags & MS_RDONLY)) {
-               EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-@@ -554,6 +556,7 @@
-                         int is_remount)
- {
-       unsigned long *mount_options = &sbi->s_mount_opt;
-+      
-       uid_t *resuid = &sbi->s_resuid;
-       gid_t *resgid = &sbi->s_resgid;
-       char * this_char;
-@@ -566,6 +569,13 @@
-                       continue;
-               if ((value = strchr (this_char, '=')) != NULL)
-                       *value++ = 0;
-+#ifdef CONFIG_EXT3_FS_XATTR
-+              if (!strcmp (this_char, "user_xattr"))
-+                      set_opt (*mount_options, XATTR_USER);
-+              else if (!strcmp (this_char, "nouser_xattr"))
-+                      clear_opt (*mount_options, XATTR_USER);
-+              else
-+#endif
-               if (!strcmp (this_char, "bsddf"))
-                       clear_opt (*mount_options, MINIX_DF);
-               else if (!strcmp (this_char, "nouid32")) {
-@@ -982,6 +992,12 @@
-       sbi->s_mount_opt = 0;
-       sbi->s_resuid = EXT3_DEF_RESUID;
-       sbi->s_resgid = EXT3_DEF_RESGID;
-+
-+      /* Default extended attribute flags */
-+#ifdef CONFIG_EXT3_FS_XATTR
-+      set_opt(sbi->s_mount_opt, XATTR_USER);
-+#endif
-+
-       if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0))
-               goto out_fail;
-@@ -1820,7 +1836,10 @@
- static int __init init_ext3_fs(void)
- {
--      int err = init_inodecache();
-+      int err = init_ext3_xattr();
-+      if (err)
-+              return err;
-+      err = init_inodecache();
-       if (err)
-               goto out1;
-         err = register_filesystem(&ext3_fs_type);
-@@ -1830,6 +1849,7 @@
- out:
-       destroy_inodecache();
- out1:
-+      exit_ext3_xattr();
-       return err;
- }
-@@ -1837,6 +1857,7 @@
- {
-       unregister_filesystem(&ext3_fs_type);
-       destroy_inodecache();
-+      exit_ext3_xattr();
- }
- MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-diff -Nru a/fs/ext3/symlink.c b/fs/ext3/symlink.c
---- a/fs/ext3/symlink.c        Sun Dec  8 02:49:56 2002
-+++ b/fs/ext3/symlink.c        Sun Dec  8 02:49:56 2002
-@@ -20,6 +20,7 @@
- #include <linux/fs.h>
- #include <linux/jbd.h>
- #include <linux/ext3_fs.h>
-+#include "xattr.h"
- static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen)
- {
-@@ -33,7 +34,20 @@
-       return vfs_follow_link(nd, (char*)ei->i_data);
- }
-+struct inode_operations ext3_symlink_inode_operations = {
-+      .readlink       = page_readlink,
-+      .follow_link    = page_follow_link,
-+      .setxattr       = ext3_setxattr,
-+      .getxattr       = ext3_getxattr,
-+      .listxattr      = ext3_listxattr,
-+      .removexattr    = ext3_removexattr,
-+};
-+
- struct inode_operations ext3_fast_symlink_inode_operations = {
--      .readlink       = ext3_readlink,                /* BKL not held.  Don't need */
-+      .readlink       = ext3_readlink,        /* BKL not held.  Don't need */
-       .follow_link    = ext3_follow_link,     /* BKL not held.  Don't need */
-+      .setxattr       = ext3_setxattr,
-+      .getxattr       = ext3_getxattr,
-+      .listxattr      = ext3_listxattr,
-+      .removexattr    = ext3_removexattr,
- };
-diff -Nru a/fs/ext3/xattr.c b/fs/ext3/xattr.c
---- /dev/null  Wed Dec 31 16:00:00 1969
-+++ b/fs/ext3/xattr.c  Sun Dec  8 02:49:56 2002
-@@ -0,0 +1,1127 @@
-+/*
-+ * linux/fs/ext3/xattr.c
-+ *
-+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
-+ *
-+ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
-+ * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>.
-+ * Extended attributes for symlinks and special files added per
-+ *  suggestion of Luka Renko <luka.renko@hermes.si>.
-+ */
-+
-+/*
-+ * Extended attributes are stored on disk blocks allocated outside of
-+ * any inode. The i_file_acl field is then made to point to this allocated
-+ * block. If several inodes have identical sets of extended attributes,
-+ * those inodes may share the same extended attribute block. Such situations
-+ * are automatically detected by keeping a cache of recent attribute block
-+ * numbers and hashes over the block's contents in memory.
-+ *
-+ *
-+ * Extended attribute block layout:
-+ *
-+ *   +------------------+
-+ *   | header           |
-+ *   | entry 1          | |
-+ *   | entry 2          | | growing downwards
-+ *   | entry 3          | v
-+ *   | four null bytes  |
-+ *   | . . .            |
-+ *   | value 1          | ^
-+ *   | value 3          | | growing upwards
-+ *   | value 2          | |
-+ *   +------------------+
-+ *
-+ * The block header is followed by multiple entry descriptors. These entry
-+ * descriptors are variable in size, and aligned to EXT3_XATTR_PAD
-+ * byte boundaries. The entry descriptors are sorted by attribute name,
-+ * so that two extended attribute blocks can be compared efficiently.
-+ *
-+ * Attribute values are aligned to the end of the block, stored in
-+ * no specific order. They are also padded to EXT3_XATTR_PAD byte
-+ * boundaries. No additional gaps are left between them.
-+ *
-+ * Locking strategy
-+ * ----------------
-+ * The VFS holds the inode->i_sem semaphore when any of the xattr inode
-+ * operations is called, so we are guaranteed that only one
-+ * process accesses the extended attributes of an inode at any time.
-+ *
-+ * For writing we also grab the ext3_xattr_sem semaphore. This ensures that
-+ * only a single process is modifying an extended attribute block, even
-+ * if the block is shared among inodes.
-+ */
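
To make the layout above concrete, a minimal sketch (not part of the
patch) of walking one attribute block's entry table; it relies on the
FIRST_ENTRY/EXT3_XATTR_NEXT/IS_LAST_ENTRY helpers defined a few lines
below and on the entry fields declared in xattr.h:

    static void ea_dump_block(struct buffer_head *bh)
    {
            struct ext3_xattr_entry *e;

            /* Entry descriptors start right after the block header and
             * grow downwards; a zero 32-bit word terminates the list. */
            for (e = FIRST_ENTRY(bh); !IS_LAST_ENTRY(e); e = EXT3_XATTR_NEXT(e))
                    printk(KERN_DEBUG "xattr: index %d, name %.*s, %u value bytes\n",
                           e->e_name_index, e->e_name_len, e->e_name,
                           le32_to_cpu(e->e_value_size));
    }
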
-+
-+#include <linux/init.h>
-+#include <linux/fs.h>
-+#include <linux/slab.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/ext3_fs.h>
-+#include <linux/mbcache.h>
-+#include <linux/quotaops.h>
-+#include <asm/semaphore.h>
-+#include "xattr.h"
-+
-+#define EXT3_EA_USER "user."
-+
-+#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data))
-+#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr))
-+#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
-+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
-+
-+#ifdef EXT3_XATTR_DEBUG
-+# define ea_idebug(inode, f...) do { \
-+              printk(KERN_DEBUG "inode %s:%ld: ", \
-+                      kdevname(inode->i_dev), inode->i_ino); \
-+              printk(f); \
-+              printk("\n"); \
-+      } while (0)
-+# define ea_bdebug(bh, f...) do { \
-+              printk(KERN_DEBUG "block %s:%ld: ", \
-+                      kdevname(bh->b_dev), bh->b_blocknr); \
-+              printk(f); \
-+              printk("\n"); \
-+      } while (0)
-+#else
-+# define ea_idebug(f...)
-+# define ea_bdebug(f...)
-+#endif
-+
-+static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *,
-+                         struct ext3_xattr_header *);
-+
-+static int ext3_xattr_cache_insert(struct buffer_head *);
-+static struct buffer_head *ext3_xattr_cache_find(struct inode *,
-+                                               struct ext3_xattr_header *);
-+static void ext3_xattr_cache_remove(struct buffer_head *);
-+static void ext3_xattr_rehash(struct ext3_xattr_header *,
-+                            struct ext3_xattr_entry *);
-+
-+static struct mb_cache *ext3_xattr_cache;
-+
-+/*
-+ * If a file system does not share extended attributes among inodes,
-+ * we should not need the ext3_xattr_sem semaphore. However, the
-+ * filesystem may still contain shared blocks, so we always take
-+ * the lock.
-+ */
-+
-+static DECLARE_MUTEX(ext3_xattr_sem);
-+static struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX];
-+static rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED;
-+
-+int
-+ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler)
-+{
-+      int error = -EINVAL;
-+
-+      if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
-+              write_lock(&ext3_handler_lock);
-+              if (!ext3_xattr_handlers[name_index-1]) {
-+                      ext3_xattr_handlers[name_index-1] = handler;
-+                      error = 0;
-+              }
-+              write_unlock(&ext3_handler_lock);
-+      }
-+      return error;
-+}
-+
-+void
-+ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler)
-+{
-+      if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
-+              write_lock(&ext3_handler_lock);
-+              ext3_xattr_handlers[name_index-1] = NULL;
-+              write_unlock(&ext3_handler_lock);
-+      }
-+}
-+
-+static inline const char *
-+strcmp_prefix(const char *a, const char *a_prefix)
-+{
-+      while (*a_prefix && *a == *a_prefix) {
-+              a++;
-+              a_prefix++;
-+      }
-+      return *a_prefix ? NULL : a;
-+}
-+
-+/*
-+ * Decode the extended attribute name, and translate it into
-+ * the name_index and name suffix.
-+ */
-+static inline struct ext3_xattr_handler *
-+ext3_xattr_resolve_name(const char **name)
-+{
-+      struct ext3_xattr_handler *handler = NULL;
-+      int i;
-+
-+      if (!*name)
-+              return NULL;
-+      read_lock(&ext3_handler_lock);
-+      for (i=0; i<EXT3_XATTR_INDEX_MAX; i++) {
-+              if (ext3_xattr_handlers[i]) {
-+                      const char *n = strcmp_prefix(*name,
-+                              ext3_xattr_handlers[i]->prefix);
-+                      if (n) {
-+                              handler = ext3_xattr_handlers[i];
-+                              *name = n;
-+                              break;
-+                      }
-+              }
-+      }
-+      read_unlock(&ext3_handler_lock);
-+      return handler;
-+}
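
For example (a sketch of a hypothetical caller inside this file): given
the full attribute name "user.mime_type", the resolver matches the
registered "user." prefix, returns that handler and advances the name
pointer past the prefix:

    const char *name = "user.mime_type";
    struct ext3_xattr_handler *handler = ext3_xattr_resolve_name(&name);
    /* handler is now the "user." handler (ext3_xattr_user_handler, once
     * registered by init_ext3_xattr()) and name points at "mime_type";
     * an unknown prefix yields NULL instead. */
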
-+
-+static inline struct ext3_xattr_handler *
-+ext3_xattr_handler(int name_index)
-+{
-+      struct ext3_xattr_handler *handler = NULL;
-+      if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
-+              read_lock(&ext3_handler_lock);
-+              handler = ext3_xattr_handlers[name_index-1];
-+              read_unlock(&ext3_handler_lock);
-+      }
-+      return handler;
-+}
-+
-+/*
-+ * Inode operation getxattr()
-+ *
-+ * dentry->d_inode->i_sem down
-+ */
-+ssize_t
-+ext3_getxattr(struct dentry *dentry, const char *name,
-+            void *buffer, size_t size)
-+{
-+      struct ext3_xattr_handler *handler;
-+      struct inode *inode = dentry->d_inode;
-+
-+      handler = ext3_xattr_resolve_name(&name);
-+      if (!handler)
-+              return -EOPNOTSUPP;
-+      return handler->get(inode, name, buffer, size);
-+}
-+
-+/*
-+ * Inode operation listxattr()
-+ *
-+ * dentry->d_inode->i_sem down
-+ */
-+ssize_t
-+ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
-+{
-+      return ext3_xattr_list(dentry->d_inode, buffer, size);
-+}
-+
-+/*
-+ * Inode operation setxattr()
-+ *
-+ * dentry->d_inode->i_sem down
-+ */
-+int
-+ext3_setxattr(struct dentry *dentry, const char *name,
-+            void *value, size_t size, int flags)
-+{
-+      struct ext3_xattr_handler *handler;
-+      struct inode *inode = dentry->d_inode;
-+
-+      if (size == 0)
-+              value = "";  /* empty EA, do not remove */
-+      handler = ext3_xattr_resolve_name(&name);
-+      if (!handler)
-+              return -EOPNOTSUPP;
-+      return handler->set(inode, name, value, size, flags);
-+}
-+
-+/*
-+ * Inode operation removexattr()
-+ *
-+ * dentry->d_inode->i_sem down
-+ */
-+int
-+ext3_removexattr(struct dentry *dentry, const char *name)
-+{
-+      struct ext3_xattr_handler *handler;
-+      struct inode *inode = dentry->d_inode;
-+
-+      handler = ext3_xattr_resolve_name(&name);
-+      if (!handler)
-+              return -EOPNOTSUPP;
-+      return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
-+}
-+
-+/*
-+ * ext3_xattr_get()
-+ *
-+ * Copy an extended attribute into the buffer
-+ * provided, or compute the buffer size required.
-+ * If buffer is NULL, only the required buffer size is computed.
-+ *
-+ * Returns a negative error number on failure, or the number of bytes
-+ * used / required on success.
-+ */
-+int
-+ext3_xattr_get(struct inode *inode, int name_index, const char *name,
-+             void *buffer, size_t buffer_size)
-+{
-+      struct buffer_head *bh = NULL;
-+      struct ext3_xattr_entry *entry;
-+      unsigned int block, size;
-+      char *end;
-+      int name_len, error;
-+
-+      ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
-+                name_index, name, buffer, (long)buffer_size);
-+
-+      if (name == NULL)
-+              return -EINVAL;
-+      if (!EXT3_I(inode)->i_file_acl)
-+              return -ENODATA;
-+      block = EXT3_I(inode)->i_file_acl;
-+      ea_idebug(inode, "reading block %d", block);
-+      bh = sb_bread(inode->i_sb, block);
-+      if (!bh)
-+              return -EIO;
-+      ea_bdebug(bh, "b_count=%d, refcount=%d",
-+              atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
-+      end = bh->b_data + bh->b_size;
-+      if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
-+          HDR(bh)->h_blocks != cpu_to_le32(1)) {
-+bad_block:    ext3_error(inode->i_sb, "ext3_xattr_get",
-+                      "inode %ld: bad block %d", inode->i_ino, block);
-+              error = -EIO;
-+              goto cleanup;
-+      }
-+      /* find named attribute */
-+      name_len = strlen(name);
-+
-+      error = -ERANGE;
-+      if (name_len > 255)
-+              goto cleanup;
-+      entry = FIRST_ENTRY(bh);
-+      while (!IS_LAST_ENTRY(entry)) {
-+              struct ext3_xattr_entry *next =
-+                      EXT3_XATTR_NEXT(entry);
-+              if ((char *)next >= end)
-+                      goto bad_block;
-+              if (name_index == entry->e_name_index &&
-+                  name_len == entry->e_name_len &&
-+                  memcmp(name, entry->e_name, name_len) == 0)
-+                      goto found;
-+              entry = next;
-+      }
-+      /* Check the remaining name entries */
-+      while (!IS_LAST_ENTRY(entry)) {
-+              struct ext3_xattr_entry *next =
-+                      EXT3_XATTR_NEXT(entry);
-+              if ((char *)next >= end)
-+                      goto bad_block;
-+              entry = next;
-+      }
-+      if (ext3_xattr_cache_insert(bh))
-+              ea_idebug(inode, "cache insert failed");
-+      error = -ENODATA;
-+      goto cleanup;
-+found:
-+      /* check the buffer size */
-+      if (entry->e_value_block != 0)
-+              goto bad_block;
-+      size = le32_to_cpu(entry->e_value_size);
-+      if (size > inode->i_sb->s_blocksize ||
-+          le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
-+              goto bad_block;
-+
-+      if (ext3_xattr_cache_insert(bh))
-+              ea_idebug(inode, "cache insert failed");
-+      if (buffer) {
-+              error = -ERANGE;
-+              if (size > buffer_size)
-+                      goto cleanup;
-+              /* return value of attribute */
-+              memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
-+                      size);
-+      }
-+      error = size;
-+
-+cleanup:
-+      brelse(bh);
-+
-+      return error;
-+}
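
A typical calling pattern for ext3_xattr_get() (a sketch of a
hypothetical caller, not taken from the patch): pass a NULL buffer first
to learn the required size, then fetch the value:

    int len = ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, "mime_type",
                             NULL, 0);
    if (len > 0) {
            char *buf = kmalloc(len, GFP_KERNEL);

            if (buf) {
                    len = ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER,
                                         "mime_type", buf, len);
                    /* len is now the number of bytes copied, or a
                     * negative errno such as -ENODATA or -ERANGE. */
                    kfree(buf);
            }
    }
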
-+
-+/*
-+ * ext3_xattr_list()
-+ *
-+ * Copy a list of attribute names into the buffer
-+ * provided, or compute the buffer size required.
-+ * Buffer is NULL to compute the size of the buffer required.
-+ *
-+ * Returns a negative error number on failure, or the number of bytes
-+ * used / required on success.
-+ */
-+int
-+ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
-+{
-+      struct buffer_head *bh = NULL;
-+      struct ext3_xattr_entry *entry;
-+      unsigned int block, size = 0;
-+      char *buf, *end;
-+      int error;
-+
-+      ea_idebug(inode, "buffer=%p, buffer_size=%ld",
-+                buffer, (long)buffer_size);
-+
-+      if (!EXT3_I(inode)->i_file_acl)
-+              return 0;
-+      block = EXT3_I(inode)->i_file_acl;
-+      ea_idebug(inode, "reading block %d", block);
-+      bh = sb_bread(inode->i_sb, block);
-+      if (!bh)
-+              return -EIO;
-+      ea_bdebug(bh, "b_count=%d, refcount=%d",
-+              atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
-+      end = bh->b_data + bh->b_size;
-+      if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
-+          HDR(bh)->h_blocks != cpu_to_le32(1)) {
-+bad_block:    ext3_error(inode->i_sb, "ext3_xattr_list",
-+                      "inode %ld: bad block %d", inode->i_ino, block);
-+              error = -EIO;
-+              goto cleanup;
-+      }
-+      /* compute the size required for the list of attribute names */
-+      for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
-+           entry = EXT3_XATTR_NEXT(entry)) {
-+              struct ext3_xattr_handler *handler;
-+              struct ext3_xattr_entry *next =
-+                      EXT3_XATTR_NEXT(entry);
-+              if ((char *)next >= end)
-+                      goto bad_block;
-+
-+              handler = ext3_xattr_handler(entry->e_name_index);
-+              if (handler) {
-+                      size += handler->list(NULL, inode, entry->e_name,
-+                                            entry->e_name_len) + 1;
-+              }
-+      }
-+
-+      if (ext3_xattr_cache_insert(bh))
-+              ea_idebug(inode, "cache insert failed");
-+      if (!buffer) {
-+              error = size;
-+              goto cleanup;
-+      } else {
-+              error = -ERANGE;
-+              if (size > buffer_size)
-+                      goto cleanup;
-+      }
-+
-+      /* list the attribute names */
-+      buf = buffer;
-+      for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
-+           entry = EXT3_XATTR_NEXT(entry)) {
-+              struct ext3_xattr_handler *handler;
-+
-+              handler = ext3_xattr_handler(entry->e_name_index);
-+              if (handler) {
-+                      buf += handler->list(buf, inode, entry->e_name,
-+                                           entry->e_name_len);
-+                      *buf++ = '\0';
-+              }
-+      }
-+      error = size;
-+
-+cleanup:
-+      brelse(bh);
-+
-+      return error;
-+}
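
The buffer filled in by ext3_xattr_list() is a concatenation of
NUL-terminated full attribute names, for example
"user.mime_type\0user.origin\0" for two user attributes, and the return
value counts every byte including the terminating NULs. A sketch of
walking such a buffer, assuming buf and len came from a successful call:

    char *p;

    for (p = buf; p < buf + len; p += strlen(p) + 1)
            printk(KERN_DEBUG "xattr name: %s\n", p);
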
-+
-+/*
-+ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is
-+ * not set, set it.
-+ */
-+static void ext3_xattr_update_super_block(handle_t *handle,
-+                                        struct super_block *sb)
-+{
-+      if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
-+              return;
-+
-+      lock_super(sb);
-+      ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
-+      EXT3_SB(sb)->s_es->s_feature_compat |=
-+              cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR);
-+      sb->s_dirt = 1;
-+      ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
-+      unlock_super(sb);
-+}
-+
-+/*
-+ * ext3_xattr_set()
-+ *
-+ * Create, replace or remove an extended attribute for this inode. Buffer
-+ * is NULL to remove an existing extended attribute, and non-NULL to
-+ * either replace an existing extended attribute, or create a new extended
-+ * attribute. The XATTR_REPLACE and XATTR_CREATE flags require that the
-+ * attribute already exists, or does not yet exist, prior to the call,
-+ * respectively.
-+ *
-+ * Returns 0, or a negative error number on failure.
-+ */
-+int
-+ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
-+             const char *name, const void *value, size_t value_len, int flags)
-+{
-+      struct super_block *sb = inode->i_sb;
-+      struct buffer_head *bh = NULL;
-+      struct ext3_xattr_header *header = NULL;
-+      struct ext3_xattr_entry *here, *last;
-+      unsigned int name_len;
-+      int min_offs = sb->s_blocksize, not_found = 1, free, error;
-+      char *end;
-+      
-+      /*
-+       * header -- Points either into bh, or to a temporarily
-+       *           allocated buffer.
-+       * here -- The named entry found, or the place for inserting, within
-+       *         the block pointed to by header.
-+       * last -- Points right after the last named entry within the block
-+       *         pointed to by header.
-+       * min_offs -- The offset of the first value (values are aligned
-+       *             towards the end of the block).
-+       * end -- Points right after the block pointed to by header.
-+       */
-+      
-+      ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
-+                name_index, name, value, (long)value_len);
-+
-+      if (IS_RDONLY(inode))
-+              return -EROFS;
-+      if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-+              return -EPERM;
-+      if (value == NULL)
-+              value_len = 0;
-+      if (name == NULL)
-+              return -EINVAL;
-+      name_len = strlen(name);
-+      if (name_len > 255 || value_len > sb->s_blocksize)
-+              return -ERANGE;
-+      down(&ext3_xattr_sem);
-+
-+      if (EXT3_I(inode)->i_file_acl) {
-+              /* The inode already has an extended attribute block. */
-+              int block = EXT3_I(inode)->i_file_acl;
-+
-+              bh = sb_bread(sb, block);
-+              error = -EIO;
-+              if (!bh)
-+                      goto cleanup;
-+              ea_bdebug(bh, "b_count=%d, refcount=%d",
-+                      atomic_read(&(bh->b_count)),
-+                      le32_to_cpu(HDR(bh)->h_refcount));
-+              header = HDR(bh);
-+              end = bh->b_data + bh->b_size;
-+              if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
-+                  header->h_blocks != cpu_to_le32(1)) {
-+bad_block:            ext3_error(sb, "ext3_xattr_set",
-+                              "inode %ld: bad block %d", inode->i_ino, block);
-+                      error = -EIO;
-+                      goto cleanup;
-+              }
-+              /* Find the named attribute. */
-+              here = FIRST_ENTRY(bh);
-+              while (!IS_LAST_ENTRY(here)) {
-+                      struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here);
-+                      if ((char *)next >= end)
-+                              goto bad_block;
-+                      if (!here->e_value_block && here->e_value_size) {
-+                              int offs = le16_to_cpu(here->e_value_offs);
-+                              if (offs < min_offs)
-+                                      min_offs = offs;
-+                      }
-+                      not_found = name_index - here->e_name_index;
-+                      if (!not_found)
-+                              not_found = name_len - here->e_name_len;
-+                      if (!not_found)
-+                              not_found = memcmp(name, here->e_name,name_len);
-+                      if (not_found <= 0)
-+                              break;
-+                      here = next;
-+              }
-+              last = here;
-+              /* We still need to compute min_offs and last. */
-+              while (!IS_LAST_ENTRY(last)) {
-+                      struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last);
-+                      if ((char *)next >= end)
-+                              goto bad_block;
-+                      if (!last->e_value_block && last->e_value_size) {
-+                              int offs = le16_to_cpu(last->e_value_offs);
-+                              if (offs < min_offs)
-+                                      min_offs = offs;
-+                      }
-+                      last = next;
-+              }
-+
-+              /* Check whether we have enough space left. */
-+              free = min_offs - ((char*)last - (char*)header) - sizeof(__u32);
-+      } else {
-+              /* We will use a new extended attribute block. */
-+              free = sb->s_blocksize -
-+                      sizeof(struct ext3_xattr_header) - sizeof(__u32);
-+              here = last = NULL;  /* avoid gcc uninitialized warning. */
-+      }
-+
-+      if (not_found) {
-+              /* Request to remove a nonexistent attribute? */
-+              error = -ENODATA;
-+              if (flags & XATTR_REPLACE)
-+                      goto cleanup;
-+              error = 0;
-+              if (value == NULL)
-+                      goto cleanup;
-+              else
-+                      free -= EXT3_XATTR_LEN(name_len);
-+      } else {
-+              /* Request to create an existing attribute? */
-+              error = -EEXIST;
-+              if (flags & XATTR_CREATE)
-+                      goto cleanup;
-+              if (!here->e_value_block && here->e_value_size) {
-+                      unsigned int size = le32_to_cpu(here->e_value_size);
-+
-+                      if (le16_to_cpu(here->e_value_offs) + size > 
-+                          sb->s_blocksize || size > sb->s_blocksize)
-+                              goto bad_block;
-+                      free += EXT3_XATTR_SIZE(size);
-+              }
-+      }
-+      free -= EXT3_XATTR_SIZE(value_len);
-+      error = -ENOSPC;
-+      if (free < 0)
-+              goto cleanup;
-+
-+      /* Here we know that we can set the new attribute. */
-+
-+      if (header) {
-+              if (header->h_refcount == cpu_to_le32(1)) {
-+                      ea_bdebug(bh, "modifying in-place");
-+                      ext3_xattr_cache_remove(bh);
-+                      error = ext3_journal_get_write_access(handle, bh);
-+                      if (error)
-+                              goto cleanup;
-+              } else {
-+                      int offset;
-+
-+                      ea_bdebug(bh, "cloning");
-+                      header = kmalloc(bh->b_size, GFP_KERNEL);
-+                      error = -ENOMEM;
-+                      if (header == NULL)
-+                              goto cleanup;
-+                      memcpy(header, HDR(bh), bh->b_size);
-+                      header->h_refcount = cpu_to_le32(1);
-+                      offset = (char *)header - bh->b_data;
-+                      here = ENTRY((char *)here + offset);
-+                      last = ENTRY((char *)last + offset);
-+              }
-+      } else {
-+              /* Allocate a buffer where we construct the new block. */
-+              header = kmalloc(sb->s_blocksize, GFP_KERNEL);
-+              error = -ENOMEM;
-+              if (header == NULL)
-+                      goto cleanup;
-+              memset(header, 0, sb->s_blocksize);
-+              end = (char *)header + sb->s_blocksize;
-+              header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
-+              header->h_blocks = header->h_refcount = cpu_to_le32(1);
-+              last = here = ENTRY(header+1);
-+      }
-+
-+      if (not_found) {
-+              /* Insert the new name. */
-+              int size = EXT3_XATTR_LEN(name_len);
-+              int rest = (char *)last - (char *)here;
-+              memmove((char *)here + size, here, rest);
-+              memset(here, 0, size);
-+              here->e_name_index = name_index;
-+              here->e_name_len = name_len;
-+              memcpy(here->e_name, name, name_len);
-+      } else {
-+              /* Remove the old value. */
-+              if (!here->e_value_block && here->e_value_size) {
-+                      char *first_val = (char *)header + min_offs;
-+                      int offs = le16_to_cpu(here->e_value_offs);
-+                      char *val = (char *)header + offs;
-+                      size_t size = EXT3_XATTR_SIZE(
-+                              le32_to_cpu(here->e_value_size));
-+                      memmove(first_val + size, first_val, val - first_val);
-+                      memset(first_val, 0, size);
-+                      here->e_value_offs = 0;
-+                      min_offs += size;
-+
-+                      /* Adjust all value offsets. */
-+                      last = ENTRY(header+1);
-+                      while (!IS_LAST_ENTRY(last)) {
-+                              int o = le16_to_cpu(last->e_value_offs);
-+                              if (!last->e_value_block && o < offs)
-+                                      last->e_value_offs =
-+                                              cpu_to_le16(o + size);
-+                              last = EXT3_XATTR_NEXT(last);
-+                      }
-+              }
-+              if (value == NULL) {
-+                      /* Remove this attribute. */
-+                      if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) {
-+                              /* This block is now empty. */
-+                              error = ext3_xattr_set2(handle, inode, bh,NULL);
-+                              goto cleanup;
-+                      } else {
-+                              /* Remove the old name. */
-+                              int size = EXT3_XATTR_LEN(name_len);
-+                              last = ENTRY((char *)last - size);
-+                              memmove(here, (char*)here + size,
-+                                      (char*)last - (char*)here);
-+                              memset(last, 0, size);
-+                      }
-+              }
-+      }
-+
-+      if (value != NULL) {
-+              /* Insert the new value. */
-+              here->e_value_size = cpu_to_le32(value_len);
-+              if (value_len) {
-+                      size_t size = EXT3_XATTR_SIZE(value_len);
-+                      char *val = (char *)header + min_offs - size;
-+                      here->e_value_offs =
-+                              cpu_to_le16((char *)val - (char *)header);
-+                      memset(val + size - EXT3_XATTR_PAD, 0,
-+                             EXT3_XATTR_PAD); /* Clear the pad bytes. */
-+                      memcpy(val, value, value_len);
-+              }
-+      }
-+      ext3_xattr_rehash(header, here);
-+
-+      error = ext3_xattr_set2(handle, inode, bh, header);
-+
-+cleanup:
-+      brelse(bh);
-+      if (!(bh && header == HDR(bh)))
-+              kfree(header);
-+      up(&ext3_xattr_sem);
-+
-+      return error;
-+}
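
A usage sketch for the flags argument (a hypothetical caller that already
holds a journal handle; the attribute name and value are made up for
illustration):

    static int ea_tag_plain_text(handle_t *handle, struct inode *inode)
    {
            /* flags == 0: create or replace; XATTR_CREATE: fail with
             * -EEXIST if the attribute already exists; XATTR_REPLACE:
             * fail with -ENODATA if it does not. Passing a NULL value
             * removes the attribute. */
            return ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER,
                                  "mime_type", "text/plain", 10, XATTR_CREATE);
    }
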
-+
-+/*
-+ * Second half of ext3_xattr_set(): Update the file system.
-+ */
-+static int
-+ext3_xattr_set2(handle_t *handle, struct inode *inode,
-+              struct buffer_head *old_bh, struct ext3_xattr_header *header)
-+{
-+      struct super_block *sb = inode->i_sb;
-+      struct buffer_head *new_bh = NULL;
-+      int error;
-+
-+      if (header) {
-+              new_bh = ext3_xattr_cache_find(inode, header);
-+              if (new_bh) {
-+                      /*
-+                       * We found an identical block in the cache.
-+                       * The old block will be released after updating
-+                       * the inode.
-+                       */
-+                      ea_bdebug(old_bh, "reusing block %ld",
-+                              new_bh->b_blocknr);
-+                      
-+                      error = -EDQUOT;
-+                      if (DQUOT_ALLOC_BLOCK(inode, 1))
-+                              goto cleanup;
-+                      
-+                      error = ext3_journal_get_write_access(handle, new_bh);
-+                      if (error)
-+                              goto cleanup;
-+                      HDR(new_bh)->h_refcount = cpu_to_le32(
-+                              le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
-+                      ea_bdebug(new_bh, "refcount now=%d",
-+                              le32_to_cpu(HDR(new_bh)->h_refcount));
-+              } else if (old_bh && header == HDR(old_bh)) {
-+                      /* Keep this block. */
-+                      new_bh = old_bh;
-+                      ext3_xattr_cache_insert(new_bh);
-+              } else {
-+                      /* We need to allocate a new block */
-+                      int block;
-+                      int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-+                              EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb);
-+
-+                      block = ext3_new_block(handle, inode, goal, 0,
-+                                             0, &error);
-+                      if (error)
-+                              goto cleanup;
-+                      ea_idebug(inode, "creating block %d", block);
-+
-+                      new_bh = sb_getblk(sb, block);
-+                      if (!new_bh) {
-+getblk_failed:
-+                              ext3_free_blocks(handle, inode, block, 1);
-+                              error = -EIO;
-+                              goto cleanup;
-+                      }
-+                      lock_buffer(new_bh);
-+                      error = ext3_journal_get_create_access(handle, new_bh);
-+                      if (error) {
-+                              unlock_buffer(new_bh);
-+                              goto getblk_failed;
-+                      }
-+                      memcpy(new_bh->b_data, header, new_bh->b_size);
-+                      set_buffer_uptodate(new_bh);
-+                      unlock_buffer(new_bh);
-+                      ext3_xattr_cache_insert(new_bh);
-+                      
-+                      ext3_xattr_update_super_block(handle, sb);
-+              }
-+              error = ext3_journal_dirty_metadata(handle, new_bh);
-+              if (error)
-+                      goto cleanup;
-+      }
-+
-+      /* Update the inode. */
-+      EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
-+      inode->i_ctime = CURRENT_TIME;
-+      ext3_mark_inode_dirty(handle, inode);
-+      if (IS_SYNC(inode))
-+              handle->h_sync = 1;
-+
-+      error = 0;
-+      if (old_bh && old_bh != new_bh) {
-+              /*
-+               * If there was an old block, and we are not still using it,
-+               * we now release the old block.
-+              */
-+              unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
-+
-+              error = ext3_journal_get_write_access(handle, old_bh);
-+              if (error)
-+                      goto cleanup;
-+              if (refcount == 1) {
-+                      /* Free the old block. */
-+                      ea_bdebug(old_bh, "freeing");
-+                      ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1);
-+
-+                      /* ext3_forget() calls bforget() for us, but we
-+                         let our caller release old_bh, so we need to
-+                         duplicate the buffer reference before. */
-+                      get_bh(old_bh);
-+                      ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr);
-+              } else {
-+                      /* Decrement the refcount only. */
-+                      refcount--;
-+                      HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
-+                      DQUOT_FREE_BLOCK(inode, 1);
-+                      ext3_journal_dirty_metadata(handle, old_bh);
-+                      ea_bdebug(old_bh, "refcount now=%d", refcount);
-+              }
-+      }
-+
-+cleanup:
-+      if (old_bh != new_bh)
-+              brelse(new_bh);
-+
-+      return error;
-+}
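
A worked example of the sharing behaviour implemented above: if ten
inodes carry byte-for-byte identical attribute blocks,
ext3_xattr_cache_find() lets them all point at a single on-disk block
whose h_refcount is 10 (sharing stops at EXT3_XATTR_REFCOUNT_MAX).
Changing the attributes of one of those inodes takes the "cloning" branch
in ext3_xattr_set() and decrements the old block's refcount here; the
block itself is only freed once its refcount drops from 1.
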
-+
-+/*
-+ * ext3_xattr_delete_inode()
-+ *
-+ * Free extended attribute resources associated with this inode. This
-+ * is called immediately before an inode is freed.
-+ */
-+void
-+ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
-+{
-+      struct buffer_head *bh;
-+      unsigned int block = EXT3_I(inode)->i_file_acl;
-+
-+      if (!block)
-+              return;
-+      down(&ext3_xattr_sem);
-+
-+      bh = sb_bread(inode->i_sb, block);
-+      if (!bh) {
-+              ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
-+                      "inode %ld: block %d read error", inode->i_ino, block);
-+              goto cleanup;
-+      }
-+      ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
-+      if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
-+          HDR(bh)->h_blocks != cpu_to_le32(1)) {
-+              ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
-+                      "inode %ld: bad block %d", inode->i_ino, block);
-+              goto cleanup;
-+      }
-+      ext3_journal_get_write_access(handle, bh);
-+      ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
-+      if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
-+              ext3_xattr_cache_remove(bh);
-+              ext3_free_blocks(handle, inode, block, 1);
-+              ext3_forget(handle, 1, inode, bh, block);
-+              bh = NULL;
-+      } else {
-+              HDR(bh)->h_refcount = cpu_to_le32(
-+                      le32_to_cpu(HDR(bh)->h_refcount) - 1);
-+              ext3_journal_dirty_metadata(handle, bh);
-+              if (IS_SYNC(inode))
-+                      handle->h_sync = 1;
-+              DQUOT_FREE_BLOCK(inode, 1);
-+      }
-+      EXT3_I(inode)->i_file_acl = 0;
-+
-+cleanup:
-+      brelse(bh);
-+      up(&ext3_xattr_sem);
-+}
-+
-+/*
-+ * ext3_xattr_put_super()
-+ *
-+ * This is called when a file system is unmounted.
-+ */
-+void
-+ext3_xattr_put_super(struct super_block *sb)
-+{
-+      mb_cache_shrink(ext3_xattr_cache, sb->s_bdev);
-+}
-+
-+/*
-+ * ext3_xattr_cache_insert()
-+ *
-+ * Create a new entry in the extended attribute cache, and insert
-+ * it unless such an entry is already in the cache.
-+ *
-+ * Returns 0, or a negative error number on failure.
-+ */
-+static int
-+ext3_xattr_cache_insert(struct buffer_head *bh)
-+{
-+      __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
-+      struct mb_cache_entry *ce;
-+      int error;
-+
-+      ce = mb_cache_entry_alloc(ext3_xattr_cache);
-+      if (!ce)
-+              return -ENOMEM;
-+      error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
-+      if (error) {
-+              mb_cache_entry_free(ce);
-+              if (error == -EBUSY) {
-+                      ea_bdebug(bh, "already in cache (%d cache entries)",
-+                              atomic_read(&ext3_xattr_cache->c_entry_count));
-+                      error = 0;
-+              }
-+      } else {
-+              ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
-+                        atomic_read(&ext3_xattr_cache->c_entry_count));
-+              mb_cache_entry_release(ce);
-+      }
-+      return error;
-+}
-+
-+/*
-+ * ext3_xattr_cmp()
-+ *
-+ * Compare two extended attribute blocks for equality.
-+ *
-+ * Returns 0 if the blocks are equal, 1 if they differ, and
-+ * a negative error number on errors.
-+ */
-+static int
-+ext3_xattr_cmp(struct ext3_xattr_header *header1,
-+             struct ext3_xattr_header *header2)
-+{
-+      struct ext3_xattr_entry *entry1, *entry2;
-+
-+      entry1 = ENTRY(header1+1);
-+      entry2 = ENTRY(header2+1);
-+      while (!IS_LAST_ENTRY(entry1)) {
-+              if (IS_LAST_ENTRY(entry2))
-+                      return 1;
-+              if (entry1->e_hash != entry2->e_hash ||
-+                  entry1->e_name_len != entry2->e_name_len ||
-+                  entry1->e_value_size != entry2->e_value_size ||
-+                  memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
-+                      return 1;
-+              if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
-+                      return -EIO;
-+              if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
-+                         (char *)header2 + le16_to_cpu(entry2->e_value_offs),
-+                         le32_to_cpu(entry1->e_value_size)))
-+                      return 1;
-+
-+              entry1 = EXT3_XATTR_NEXT(entry1);
-+              entry2 = EXT3_XATTR_NEXT(entry2);
-+      }
-+      if (!IS_LAST_ENTRY(entry2))
-+              return 1;
-+      return 0;
-+}
-+
-+/*
-+ * ext3_xattr_cache_find()
-+ *
-+ * Find an identical extended attribute block.
-+ *
-+ * Returns a pointer to the block found, or NULL if such a block was
-+ * not found or an error occurred.
-+ */
-+static struct buffer_head *
-+ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header)
-+{
-+      __u32 hash = le32_to_cpu(header->h_hash);
-+      struct mb_cache_entry *ce;
-+
-+      if (!header->h_hash)
-+              return NULL;  /* never share */
-+      ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
-+      ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_bdev, hash);
-+      while (ce) {
-+              struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
-+
-+              if (!bh) {
-+                      ext3_error(inode->i_sb, "ext3_xattr_cache_find",
-+                              "inode %ld: block %ld read error",
-+                              inode->i_ino, (unsigned long) ce->e_block);
-+              } else if (le32_to_cpu(HDR(bh)->h_refcount) >
-+                         EXT3_XATTR_REFCOUNT_MAX) {
-+                      ea_idebug(inode, "block %ld refcount %d>%d",
-+                                (unsigned long) ce->e_block,
-+                                le32_to_cpu(HDR(bh)->h_refcount),
-+                                EXT3_XATTR_REFCOUNT_MAX);
-+              } else if (!ext3_xattr_cmp(header, HDR(bh))) {
-+                      ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
-+                      mb_cache_entry_release(ce);
-+                      return bh;
-+              }
-+              brelse(bh);
-+              ce = mb_cache_entry_find_next(ce, 0, inode->i_bdev, hash);
-+      }
-+      return NULL;
-+}
-+
-+/*
-+ * ext3_xattr_cache_remove()
-+ *
-+ * Remove the cache entry of a block from the cache. Called when a
-+ * block becomes invalid.
-+ */
-+static void
-+ext3_xattr_cache_remove(struct buffer_head *bh)
-+{
-+      struct mb_cache_entry *ce;
-+
-+      ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev,
-+                              bh->b_blocknr);
-+      if (ce) {
-+              ea_bdebug(bh, "removing (%d cache entries remaining)",
-+                        atomic_read(&ext3_xattr_cache->c_entry_count)-1);
-+              mb_cache_entry_free(ce);
-+      } else 
-+              ea_bdebug(bh, "no cache entry");
-+}
-+
-+#define NAME_HASH_SHIFT 5
-+#define VALUE_HASH_SHIFT 16
-+
-+/*
-+ * ext3_xattr_hash_entry()
-+ *
-+ * Compute the hash of an extended attribute.
-+ */
-+static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header,
-+                                       struct ext3_xattr_entry *entry)
-+{
-+      __u32 hash = 0;
-+      char *name = entry->e_name;
-+      int n;
-+
-+      for (n=0; n < entry->e_name_len; n++) {
-+              hash = (hash << NAME_HASH_SHIFT) ^
-+                     (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
-+                     *name++;
-+      }
-+
-+      if (entry->e_value_block == 0 && entry->e_value_size != 0) {
-+              __u32 *value = (__u32 *)((char *)header +
-+                      le16_to_cpu(entry->e_value_offs));
-+              for (n = (le32_to_cpu(entry->e_value_size) +
-+                   EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) {
-+                      hash = (hash << VALUE_HASH_SHIFT) ^
-+                             (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
-+                             le32_to_cpu(*value++);
-+              }
-+      }
-+      entry->e_hash = cpu_to_le32(hash);
-+}
-+
-+#undef NAME_HASH_SHIFT
-+#undef VALUE_HASH_SHIFT
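
For experimenting with this hash outside the kernel, a standalone
re-implementation of the name portion of ext3_xattr_hash_entry() (a
sketch; it assumes ASCII attribute names, ignores the value portion of
the hash, and uses uint32_t in place of the kernel's __u32):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define NAME_HASH_SHIFT 5

    static uint32_t ea_name_hash(const char *name)
    {
            uint32_t hash = 0;
            size_t n, len = strlen(name);

            for (n = 0; n < len; n++)
                    hash = (hash << NAME_HASH_SHIFT) ^
                           (hash >> (32 - NAME_HASH_SHIFT)) ^
                           (unsigned char)name[n];
            return hash;
    }

    int main(void)
    {
            printf("hash(\"mime_type\") = %#x\n", ea_name_hash("mime_type"));
            return 0;
    }
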
-+
-+#define BLOCK_HASH_SHIFT 16
-+
-+/*
-+ * ext3_xattr_rehash()
-+ *
-+ * Re-compute the extended attribute hash value after an entry has changed.
-+ */
-+static void ext3_xattr_rehash(struct ext3_xattr_header *header,
-+                            struct ext3_xattr_entry *entry)
-+{
-+      struct ext3_xattr_entry *here;
-+      __u32 hash = 0;
-+      
-+      ext3_xattr_hash_entry(header, entry);
-+      here = ENTRY(header+1);
-+      while (!IS_LAST_ENTRY(here)) {
-+              if (!here->e_hash) {
-+                      /* Block is not shared if an entry's hash value == 0 */
-+                      hash = 0;
-+                      break;
-+              }
-+              hash = (hash << BLOCK_HASH_SHIFT) ^
-+                     (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
-+                     le32_to_cpu(here->e_hash);
-+              here = EXT3_XATTR_NEXT(here);
-+      }
-+      header->h_hash = cpu_to_le32(hash);
-+}
-+
-+#undef BLOCK_HASH_SHIFT
-+
-+int __init
-+init_ext3_xattr(void)
-+{
-+      int     err;
-+      
-+      err = ext3_xattr_register(EXT3_XATTR_INDEX_USER, &ext3_xattr_user_handler);
-+      if (err)
-+              return err;
-+      ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
-+              sizeof(struct mb_cache_entry) +
-+              sizeof(struct mb_cache_entry_index), 1, 6);
-+      if (!ext3_xattr_cache) {
-+              ext3_xattr_unregister(EXT3_XATTR_INDEX_USER, &ext3_xattr_user_handler);
-+              return -ENOMEM;
-+      }
-+
-+      return 0;
-+}
-+
-+void
-+exit_ext3_xattr(void)
-+{
-+      if (ext3_xattr_cache)
-+              mb_cache_destroy(ext3_xattr_cache);
-+      ext3_xattr_cache = NULL;
-+      ext3_xattr_unregister(EXT3_XATTR_INDEX_USER, &ext3_xattr_user_handler);
-+}
-+
-diff -Nru a/fs/ext3/xattr.h b/fs/ext3/xattr.h
---- /dev/null  Wed Dec 31 16:00:00 1969
-+++ b/fs/ext3/xattr.h  Sun Dec  8 02:49:56 2002
-@@ -0,0 +1,133 @@
-+/*
-+  File: fs/ext3/xattr.h
-+
-+  On-disk format of extended attributes for the ext3 filesystem.
-+
-+  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
-+*/
-+
-+#include <linux/config.h>
-+#include <linux/xattr.h>
-+
-+/* Magic value in attribute blocks */
-+#define EXT3_XATTR_MAGIC              0xEA020000
-+
-+/* Maximum number of references to one attribute block */
-+#define EXT3_XATTR_REFCOUNT_MAX               1024
-+
-+/* Name indexes */
-+#define EXT3_XATTR_INDEX_MAX                  10
-+#define EXT3_XATTR_INDEX_USER                 1
-+#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS     2
-+#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT    3
-+
-+struct ext3_xattr_header {
-+      __u32   h_magic;        /* magic number for identification */
-+      __u32   h_refcount;     /* reference count */
-+      __u32   h_blocks;       /* number of disk blocks used */
-+      __u32   h_hash;         /* hash value of all attributes */
-+      __u32   h_reserved[4];  /* zero right now */
-+};
-+
-+struct ext3_xattr_entry {
-+      __u8    e_name_len;     /* length of name */
-+      __u8    e_name_index;   /* attribute name index */
-+      __u16   e_value_offs;   /* offset in disk block of value */
-+      __u32   e_value_block;  /* disk block attribute is stored on (n/i) */
-+      __u32   e_value_size;   /* size of attribute value */
-+      __u32   e_hash;         /* hash value of name and value */
-+      char    e_name[0];      /* attribute name */
-+};
-+
-+#define EXT3_XATTR_PAD_BITS           2
-+#define EXT3_XATTR_PAD                (1<<EXT3_XATTR_PAD_BITS)
-+#define EXT3_XATTR_ROUND              (EXT3_XATTR_PAD-1)
-+#define EXT3_XATTR_LEN(name_len) \
-+      (((name_len) + EXT3_XATTR_ROUND + \
-+      sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND)
-+#define EXT3_XATTR_NEXT(entry) \
-+      ( (struct ext3_xattr_entry *)( \
-+        (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) )
-+#define EXT3_XATTR_SIZE(size) \
-+      (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)
-+
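
A worked example of the size macros above (assuming the compiler adds no
padding to struct ext3_xattr_entry, whose fixed part is 1+1+2+4+4+4 = 16
bytes, and EXT3_XATTR_PAD = 4): for the name "mime_type" (9 bytes),
EXT3_XATTR_LEN(9) = (9 + 3 + 16) & ~3 = 28 bytes of entry descriptor; for
a 10-byte value, EXT3_XATTR_SIZE(10) = (10 + 3) & ~3 = 12 bytes reserved
at the value end of the block; EXT3_XATTR_NEXT() simply advances by
EXT3_XATTR_LEN(e_name_len) from the current entry.
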
-+# ifdef CONFIG_EXT3_FS_XATTR
-+
-+struct ext3_xattr_handler {
-+      char *prefix;
-+      size_t (*list)(char *list, struct inode *inode, const char *name,
-+                     int name_len);
-+      int (*get)(struct inode *inode, const char *name, void *buffer,
-+                 size_t size);
-+      int (*set)(struct inode *inode, const char *name, const void *buffer,
-+                 size_t size, int flags);
-+};
-+
-+extern int ext3_xattr_register(int, struct ext3_xattr_handler *);
-+extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *);
-+
-+extern int ext3_setxattr(struct dentry *, const char *, void *, size_t, int);
-+extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t);
-+extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
-+extern int ext3_removexattr(struct dentry *, const char *);
-+
-+extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
-+extern int ext3_xattr_list(struct inode *, char *, size_t);
-+extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, const void *, size_t, int);
-+
-+extern void ext3_xattr_delete_inode(handle_t *, struct inode *);
-+extern void ext3_xattr_put_super(struct super_block *);
-+
-+extern int init_ext3_xattr(void);
-+extern void exit_ext3_xattr(void);
-+
-+# else  /* CONFIG_EXT3_FS_XATTR */
-+#  define ext3_setxattr               NULL
-+#  define ext3_getxattr               NULL
-+#  define ext3_listxattr      NULL
-+#  define ext3_removexattr    NULL
-+
-+static inline int
-+ext3_xattr_get(struct inode *inode, int name_index, const char *name,
-+             void *buffer, size_t size)
-+{
-+      return -EOPNOTSUPP;
-+}
-+
-+static inline int
-+ext3_xattr_list(struct inode *inode, char *buffer, size_t size)
-+{
-+      return -EOPNOTSUPP;
-+}
-+
-+static inline int
-+ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
-+             const char *name, const void *value, size_t size, int flags)
-+{
-+      return -EOPNOTSUPP;
-+}
-+
-+static inline void
-+ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
-+{
-+}
-+
-+static inline void
-+ext3_xattr_put_super(struct super_block *sb)
-+{
-+}
-+
-+static inline int
-+init_ext3_xattr(void)
-+{
-+      return 0;
-+}
-+
-+static inline void
-+exit_ext3_xattr(void)
-+{
-+}
-+
-+# endif  /* CONFIG_EXT3_FS_XATTR */
-+
-+extern struct ext3_xattr_handler ext3_xattr_user_handler;
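A hedged sketch (not part of the patch) of how the entry-table macros above are typically used; it assumes the stock ext2/ext3 convention that the on-disk entry list is terminated by four zero bytes, which is not shown in this hunk:

/* Illustrative only: walk an xattr block's entry table using the
 * EXT3_XATTR_NEXT() macro defined above. */
static void demo_walk_xattr_entries(struct ext3_xattr_entry *entry)
{
	while (*(__u32 *)entry != 0) {	/* assumed last-entry marker */
		/* entry->e_name[0 .. e_name_len-1] holds the attribute name;
		 * e_value_offs/e_value_size locate the value in the block */
		entry = EXT3_XATTR_NEXT(entry);
	}
}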
-diff -Nru a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
---- /dev/null  Wed Dec 31 16:00:00 1969
-+++ b/fs/ext3/xattr_user.c     Sun Dec  8 02:49:56 2002
-@@ -0,0 +1,99 @@
-+/*
-+ * linux/fs/ext3/xattr_user.c
-+ * Handler for extended user attributes.
-+ *
-+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
-+ */
-+
-+#include <linux/module.h>
-+#include <linux/string.h>
-+#include <linux/fs.h>
-+#include <linux/smp_lock.h>
-+#include <linux/ext3_jbd.h>
-+#include <linux/ext3_fs.h>
-+#include "xattr.h"
-+
-+#ifdef CONFIG_EXT3_FS_POSIX_ACL
-+# include <linux/ext3_acl.h>
-+#endif
-+
-+#define XATTR_USER_PREFIX "user."
-+
-+static size_t
-+ext3_xattr_user_list(char *list, struct inode *inode,
-+                   const char *name, int name_len)
-+{
-+      const int prefix_len = sizeof(XATTR_USER_PREFIX)-1;
-+
-+      if (!test_opt(inode->i_sb, XATTR_USER))
-+              return 0;
-+
-+      if (list) {
-+              memcpy(list, XATTR_USER_PREFIX, prefix_len);
-+              memcpy(list+prefix_len, name, name_len);
-+      }
-+      return prefix_len + name_len;
-+}
-+
-+static int
-+ext3_xattr_user_get(struct inode *inode, const char *name,
-+                  void *buffer, size_t size)
-+{
-+      int error;
-+
-+      if (strcmp(name, "") == 0)
-+              return -EINVAL;
-+      if (!test_opt(inode->i_sb, XATTR_USER))
-+              return -EOPNOTSUPP;
-+#ifdef CONFIG_EXT3_FS_POSIX_ACL
-+      error = ext3_permission_locked(inode, MAY_READ);
-+#else
-+      error = permission(inode, MAY_READ);
-+#endif
-+      if (error)
-+              return error;
-+
-+      return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name,
-+                            buffer, size);
-+}
-+
-+static int
-+ext3_xattr_user_set(struct inode *inode, const char *name,
-+                  const void *value, size_t size, int flags)
-+{
-+      handle_t *handle;
-+      int error;
-+
-+      if (strcmp(name, "") == 0)
-+              return -EINVAL;
-+      if (!test_opt(inode->i_sb, XATTR_USER))
-+              return -EOPNOTSUPP;
-+      if ( !S_ISREG(inode->i_mode) &&
-+          (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
-+              return -EPERM;
-+#ifdef CONFIG_EXT3_FS_POSIX_ACL
-+      error = ext3_permission_locked(inode, MAY_WRITE);
-+#else
-+      error = permission(inode, MAY_WRITE);
-+#endif
-+      if (error)
-+              return error;
-+  
-+      lock_kernel();
-+      handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS);
-+      if (IS_ERR(handle))
-+              return PTR_ERR(handle);
-+      error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER, name,
-+                             value, size, flags);
-+      ext3_journal_stop(handle, inode);
-+      unlock_kernel();
-+
-+      return error;
-+}
-+
-+struct ext3_xattr_handler ext3_xattr_user_handler = {
-+      prefix: XATTR_USER_PREFIX,
-+      list:   ext3_xattr_user_list,
-+      get:    ext3_xattr_user_get,
-+      set:    ext3_xattr_user_set,
-+};
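As a hedged illustration (not part of the patch), the handler above is meant to be wired up through the registration API declared in fs/ext3/xattr.h earlier in this series; the demo init/exit functions below are hypothetical stand-ins for init_ext3_xattr()/exit_ext3_xattr():

/* Hypothetical sketch: register/unregister the user.* namespace handler. */
static int __init demo_register_user_xattr(void)
{
	return ext3_xattr_register(EXT3_XATTR_INDEX_USER,
				   &ext3_xattr_user_handler);
}

static void __exit demo_unregister_user_xattr(void)
{
	ext3_xattr_unregister(EXT3_XATTR_INDEX_USER,
			      &ext3_xattr_user_handler);
}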
-diff -Nru a/fs/mbcache.c b/fs/mbcache.c
---- /dev/null  Wed Dec 31 16:00:00 1969
-+++ b/fs/mbcache.c     Sun Dec  8 02:49:56 2002
-@@ -0,0 +1,702 @@
-+/*
-+ * linux/fs/mbcache.c
-+ * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
-+ */
-+
-+/*
-+ * Filesystem Meta Information Block Cache (mbcache)
-+ *
-+ * The mbcache caches blocks of block devices that need to be located
-+ * by their device/block number, as well as by other criteria (such
-+ * as the block's contents).
-+ *
-+ * There can only be one cache entry in a cache per device and block number.
-+ * Additional indexes need not be unique in this sense. The number of
-+ * additional indexes (=other criteria) can be hardwired (at compile time)
-+ * or specified at cache create time.
-+ *
-+ * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
-+ * in the cache. A valid entry is in the main hash tables of the cache,
-+ * and may also be in the lru list. An invalid entry is not in any hashes
-+ * or lists.
-+ *
-+ * A valid cache entry is only in the lru list if no handles refer to it.
-+ * Invalid cache entries will be freed when the last handle to the cache
-+ * entry is released.
-+ */
-+
-+#include <linux/kernel.h>
-+#include <linux/module.h>
-+
-+#include <linux/hash.h>
-+#include <linux/fs.h>
-+#include <linux/mm.h>
-+#include <linux/slab.h>
-+#include <linux/sched.h>
-+#include <linux/init.h>
-+#include <linux/mbcache.h>
-+
-+
-+#ifdef MB_CACHE_DEBUG
-+# define mb_debug(f...) do { \
-+              printk(KERN_DEBUG f); \
-+              printk("\n"); \
-+      } while (0)
-+#define mb_assert(c) do { if (!(c)) \
-+              printk(KERN_ERR "assertion " #c " failed\n"); \
-+      } while(0)
-+#else
-+# define mb_debug(f...) do { } while(0)
-+# define mb_assert(c) do { } while(0)
-+#endif
-+#define mb_error(f...) do { \
-+              printk(KERN_ERR f); \
-+              printk("\n"); \
-+      } while(0)
-+              
-+MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
-+MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
-+MODULE_LICENSE("GPL");
-+
-+EXPORT_SYMBOL(mb_cache_create);
-+EXPORT_SYMBOL(mb_cache_shrink);
-+EXPORT_SYMBOL(mb_cache_destroy);
-+EXPORT_SYMBOL(mb_cache_entry_alloc);
-+EXPORT_SYMBOL(mb_cache_entry_insert);
-+EXPORT_SYMBOL(mb_cache_entry_release);
-+EXPORT_SYMBOL(mb_cache_entry_takeout);
-+EXPORT_SYMBOL(mb_cache_entry_free);
-+EXPORT_SYMBOL(mb_cache_entry_dup);
-+EXPORT_SYMBOL(mb_cache_entry_get);
-+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
-+EXPORT_SYMBOL(mb_cache_entry_find_first);
-+EXPORT_SYMBOL(mb_cache_entry_find_next);
-+#endif
-+
-+
-+/*
-+ * Global data: list of all mbcaches, the lru list, and a spinlock for
-+ * accessing cache data structures on SMP machines. (The lru list is
-+ * global across all mbcaches.)
-+ */
-+
-+static LIST_HEAD(mb_cache_list);
-+static LIST_HEAD(mb_cache_lru_list);
-+static spinlock_t mb_cache_spinlock = SPIN_LOCK_UNLOCKED;
-+static struct shrinker *mb_shrinker;
-+
-+static inline int
-+mb_cache_indexes(struct mb_cache *cache)
-+{
-+#ifdef MB_CACHE_INDEXES_COUNT
-+      return MB_CACHE_INDEXES_COUNT;
-+#else
-+      return cache->c_indexes_count;
-+#endif
-+}
-+
-+/*
-+ * The callback that the mbcache registers in order to get shrunk dynamically.
-+ */
-+
-+static int mb_cache_shrink_fn(int nr_to_scan, unsigned int gfp_mask);
-+
-+static inline void
-+__mb_cache_entry_takeout_lru(struct mb_cache_entry *ce)
-+{
-+      if (!list_empty(&ce->e_lru_list))
-+              list_del_init(&ce->e_lru_list);
-+}
-+
-+
-+static inline void
-+__mb_cache_entry_into_lru(struct mb_cache_entry *ce)
-+{
-+      list_add(&ce->e_lru_list, &mb_cache_lru_list);
-+}
-+
-+
-+static inline int
-+__mb_cache_entry_in_lru(struct mb_cache_entry *ce)
-+{
-+      return (!list_empty(&ce->e_lru_list));
-+}
-+
-+
-+/*
-+ * Insert the cache entry into all hashes.
-+ */
-+static inline void
-+__mb_cache_entry_link(struct mb_cache_entry *ce)
-+{
-+      struct mb_cache *cache = ce->e_cache;
-+      unsigned int bucket;
-+      int n;
-+      
-+      bucket = hash_long((unsigned long)ce->e_bdev +
-+                         (ce->e_block & 0xffffff), cache->c_bucket_bits);
-+      list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
-+      for (n=0; n<mb_cache_indexes(cache); n++) {
-+              bucket = hash_long(ce->e_indexes[n].o_key,
-+                                 cache->c_bucket_bits);
-+              list_add(&ce->e_indexes[n].o_list,
-+                       &cache->c_indexes_hash[n][bucket]);
-+      }
-+}
-+
-+
-+/*
-+ * Remove the cache entry from all hashes.
-+ */
-+static inline void
-+__mb_cache_entry_unlink(struct mb_cache_entry *ce)
-+{
-+      int n;
-+
-+      list_del_init(&ce->e_block_list);
-+      for (n = 0; n < mb_cache_indexes(ce->e_cache); n++)
-+              list_del(&ce->e_indexes[n].o_list);
-+}
-+
-+
-+static inline int
-+__mb_cache_entry_is_linked(struct mb_cache_entry *ce)
-+{
-+      return (!list_empty(&ce->e_block_list));
-+}
-+
-+
-+static inline struct mb_cache_entry *
-+__mb_cache_entry_read(struct mb_cache_entry *ce)
-+{
-+      __mb_cache_entry_takeout_lru(ce);
-+      atomic_inc(&ce->e_used);
-+      return ce;
-+}
-+
-+
-+static inline void
-+__mb_cache_entry_forget(struct mb_cache_entry *ce)
-+{
-+      struct mb_cache *cache = ce->e_cache;
-+
-+      mb_assert(atomic_read(&ce->e_used) == 0);
-+      atomic_dec(&cache->c_entry_count);
-+      if (cache->c_op.free)
-+              cache->c_op.free(ce);
-+      kmem_cache_free(cache->c_entry_cache, ce);
-+}
-+
-+
-+static inline void
-+__mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
-+{
-+      if (atomic_dec_and_test(&ce->e_used)) {
-+              if (!__mb_cache_entry_is_linked(ce))
-+                      goto forget;
-+              __mb_cache_entry_into_lru(ce);
-+      }
-+      spin_unlock(&mb_cache_spinlock);
-+      return;
-+forget:
-+      spin_unlock(&mb_cache_spinlock);
-+      __mb_cache_entry_forget(ce);
-+}
-+
-+
-+/*
-+ * mb_cache_shrink_fn()  memory pressure callback
-+ *
-+ * This function is called by the kernel memory management when memory
-+ * gets low.
-+ *
-+ * @nr_to_scan: Number of objects to scan
-+ * @gfp_mask: (ignored)
-+ *
-+ * Returns the number of objects which are present in the cache.
-+ */
-+static int
-+mb_cache_shrink_fn(int nr_to_scan, unsigned int gfp_mask)
-+{
-+      LIST_HEAD(free_list);
-+      struct list_head *l;
-+      int count = 0;
-+
-+      spin_lock(&mb_cache_spinlock);
-+      list_for_each_prev(l, &mb_cache_list) {
-+              struct mb_cache *cache =
-+                      list_entry(l, struct mb_cache, c_cache_list);
-+              mb_debug("cache %s (%d)", cache->c_name,
-+                        atomic_read(&cache->c_entry_count));
-+              count += atomic_read(&cache->c_entry_count);
-+      }
-+      mb_debug("trying to free %d entries", nr_to_scan);
-+      if (nr_to_scan == 0) {
-+              spin_unlock(&mb_cache_spinlock);
-+              goto out;
-+      }
-+      while (nr_to_scan && !list_empty(&mb_cache_lru_list)) {
-+              struct mb_cache_entry *ce =
-+                      list_entry(mb_cache_lru_list.prev,
-+                                 struct mb_cache_entry, e_lru_list);
-+              list_move(&ce->e_lru_list, &free_list);
-+              if (__mb_cache_entry_is_linked(ce))
-+                      __mb_cache_entry_unlink(ce);
-+              nr_to_scan--;
-+      }
-+      spin_unlock(&mb_cache_spinlock);
-+      l = free_list.prev;
-+      while (l != &free_list) {
-+              struct mb_cache_entry *ce = list_entry(l,
-+                      struct mb_cache_entry, e_lru_list);
-+              l = l->prev;
-+              __mb_cache_entry_forget(ce);
-+              count--;
-+      }
-+out:
-+      mb_debug("%d remaining entries ", count);
-+      return count;
-+}
-+
-+
-+/*
-+ * mb_cache_create()  create a new cache
-+ *
-+ * All entries in one cache are of equal size. Cache entries may be from
-+ * multiple devices. If this is the first mbcache created, registers
-+ * the cache with kernel memory management. Returns NULL if no more
-+ * memory was available.
-+ *
-+ * @name: name of the cache (informal)
-+ * @cache_op: contains the callback called when freeing a cache entry
-+ * @entry_size: The size of a cache entry, including
-+ *              struct mb_cache_entry
-+ * @indexes_count: number of additional indexes in the cache. Must equal
-+ *                 MB_CACHE_INDEXES_COUNT if the number of indexes is
-+ *                 hardwired.
-+ * @bucket_bits: log2(number of hash buckets)
-+ */
-+struct mb_cache *
-+mb_cache_create(const char *name, struct mb_cache_op *cache_op,
-+              size_t entry_size, int indexes_count, int bucket_bits)
-+{
-+      int m=0, n, bucket_count = 1 << bucket_bits;
-+      struct mb_cache *cache = NULL;
-+
-+      if(entry_size < sizeof(struct mb_cache_entry) +
-+         indexes_count * sizeof(struct mb_cache_entry_index))
-+              return NULL;
-+
-+      cache = kmalloc(sizeof(struct mb_cache) +
-+                      indexes_count * sizeof(struct list_head), GFP_KERNEL);
-+      if (!cache)
-+              goto fail;
-+      cache->c_name = name;
-+      if (cache_op)
-+              cache->c_op.free = cache_op->free;
-+      else
-+              cache->c_op.free = NULL;
-+      atomic_set(&cache->c_entry_count, 0);
-+      cache->c_bucket_bits = bucket_bits;
-+#ifdef MB_CACHE_INDEXES_COUNT
-+      mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT);
-+#else
-+      cache->c_indexes_count = indexes_count;
-+#endif
-+      cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
-+                                    GFP_KERNEL);
-+      if (!cache->c_block_hash)
-+              goto fail;
-+      for (n=0; n<bucket_count; n++)
-+              INIT_LIST_HEAD(&cache->c_block_hash[n]);
-+      for (m=0; m<indexes_count; m++) {
-+              cache->c_indexes_hash[m] = kmalloc(bucket_count *
-+                                               sizeof(struct list_head),
-+                                               GFP_KERNEL);
-+              if (!cache->c_indexes_hash[m])
-+                      goto fail;
-+              for (n=0; n<bucket_count; n++)
-+                      INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]);
-+      }
-+      cache->c_entry_cache = kmem_cache_create(name, entry_size, 0,
-+              0 /*SLAB_POISON | SLAB_RED_ZONE*/, NULL, NULL);
-+      if (!cache->c_entry_cache)
-+              goto fail;
-+
-+      spin_lock(&mb_cache_spinlock);
-+      if (list_empty(&mb_cache_list)) {
-+              if (mb_shrinker) {
-+                      printk(KERN_ERR "%s: already have a shrinker!\n",
-+                                      __FUNCTION__);
-+                      remove_shrinker(mb_shrinker);
-+              }
-+              mb_shrinker = set_shrinker(DEFAULT_SEEKS, mb_cache_shrink_fn);
-+      }
-+      list_add(&cache->c_cache_list, &mb_cache_list);
-+      spin_unlock(&mb_cache_spinlock);
-+      return cache;
-+
-+fail:
-+      if (cache) {
-+              while (--m >= 0)
-+                      kfree(cache->c_indexes_hash[m]);
-+              if (cache->c_block_hash)
-+                      kfree(cache->c_block_hash);
-+              kfree(cache);
-+      }
-+      return NULL;
-+}
-+
-+
-+/*
-+ * mb_cache_shrink()
-+ *
-+ * Removes all cache entries of a device from the cache. All cache entries
-+ * currently in use cannot be freed, and thus remain in the cache. All others
-+ * are freed.
-+ *
-+ * @cache: which cache to shrink
-+ * @bdev: which device's cache entries to shrink
-+ */
-+void
-+mb_cache_shrink(struct mb_cache *cache, struct block_device *bdev)
-+{
-+      LIST_HEAD(free_list);
-+      struct list_head *l;
-+
-+      spin_lock(&mb_cache_spinlock);
-+      l = mb_cache_lru_list.prev;
-+      while (l != &mb_cache_lru_list) {
-+              struct mb_cache_entry *ce =
-+                      list_entry(l, struct mb_cache_entry, e_lru_list);
-+              l = l->prev;
-+              if (ce->e_bdev == bdev) {
-+                      list_move(&ce->e_lru_list, &free_list);
-+                      if (__mb_cache_entry_is_linked(ce))
-+                              __mb_cache_entry_unlink(ce);
-+              }
-+      }
-+      spin_unlock(&mb_cache_spinlock);
-+      l = free_list.prev;
-+      while (l != &free_list) {
-+              struct mb_cache_entry *ce =
-+                      list_entry(l, struct mb_cache_entry, e_lru_list);
-+              l = l->prev;
-+              __mb_cache_entry_forget(ce);
-+      }
-+}
-+
-+
-+/*
-+ * mb_cache_destroy()
-+ *
-+ * Shrinks the cache to its minimum possible size (hopefully 0 entries),
-+ * and then destroys it. If this was the last mbcache, un-registers the
-+ * mbcache from kernel memory management.
-+ */
-+void
-+mb_cache_destroy(struct mb_cache *cache)
-+{
-+      LIST_HEAD(free_list);
-+      struct list_head *l;
-+      int n;
-+
-+      spin_lock(&mb_cache_spinlock);
-+      l = mb_cache_lru_list.prev;
-+      while (l != &mb_cache_lru_list) {
-+              struct mb_cache_entry *ce =
-+                      list_entry(l, struct mb_cache_entry, e_lru_list);
-+              l = l->prev;
-+              if (ce->e_cache == cache) {
-+                      list_move(&ce->e_lru_list, &free_list);
-+                      if (__mb_cache_entry_is_linked(ce))
-+                              __mb_cache_entry_unlink(ce);
-+              }
-+      }
-+      list_del(&cache->c_cache_list);
-+      if (list_empty(&mb_cache_list) && mb_shrinker) {
-+              remove_shrinker(mb_shrinker);
-+              mb_shrinker = 0;
-+      }
-+      spin_unlock(&mb_cache_spinlock);
-+
-+      l = free_list.prev;
-+      while (l != &free_list) {
-+              struct mb_cache_entry *ce =
-+                      list_entry(l, struct mb_cache_entry, e_lru_list);
-+              l = l->prev;
-+              __mb_cache_entry_forget(ce);
-+      }
-+
-+      if (atomic_read(&cache->c_entry_count) > 0) {
-+              mb_error("cache %s: %d orphaned entries",
-+                        cache->c_name,
-+                        atomic_read(&cache->c_entry_count));
-+      }
-+
-+      kmem_cache_destroy(cache->c_entry_cache);
-+
-+      for (n=0; n < mb_cache_indexes(cache); n++)
-+              kfree(cache->c_indexes_hash[n]);
-+      kfree(cache->c_block_hash);
-+
-+      kfree(cache);
-+}
-+
-+
-+/*
-+ * mb_cache_entry_alloc()
-+ *
-+ * Allocates a new cache entry. The new entry will not be valid initially,
-+ * and thus cannot be looked up yet. It should be filled with data, and
-+ * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
-+ * if no more memory was available.
-+ */
-+struct mb_cache_entry *
-+mb_cache_entry_alloc(struct mb_cache *cache)
-+{
-+      struct mb_cache_entry *ce;
-+
-+      atomic_inc(&cache->c_entry_count);
-+      ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL);
-+      if (ce) {
-+              INIT_LIST_HEAD(&ce->e_lru_list);
-+              INIT_LIST_HEAD(&ce->e_block_list);
-+              ce->e_cache = cache;
-+              atomic_set(&ce->e_used, 1);
-+      }
-+      return ce;
-+}
-+
-+
-+/*
-+ * mb_cache_entry_insert()
-+ *
-+ * Inserts an entry that was allocated using mb_cache_entry_alloc() into
-+ * the cache. After this, the cache entry can be looked up, but is not yet
-+ * in the lru list as the caller still holds a handle to it. Returns 0 on
-+ * success, or -EBUSY if a cache entry for that device + block exists
-+ * already (this may happen after a failed lookup, but when another process
-+ * has inserted the same cache entry in the meantime).
-+ *
-+ * @bdev: device the cache entry belongs to
-+ * @block: block number
-+ * @keys: array of additional keys. There must be indexes_count entries
-+ *        in the array (as specified when creating the cache).
-+ */
-+int
-+mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
-+                    sector_t block, unsigned int keys[])
-+{
-+      struct mb_cache *cache = ce->e_cache;
-+      unsigned int bucket;
-+      struct list_head *l;
-+      int error = -EBUSY, n;
-+
-+      bucket =  hash_long((unsigned long)bdev + (block & 0xffffffff), 
-+                          cache->c_bucket_bits);
-+      spin_lock(&mb_cache_spinlock);
-+      list_for_each_prev(l, &cache->c_block_hash[bucket]) {
-+              struct mb_cache_entry *ce =
-+                      list_entry(l, struct mb_cache_entry, e_block_list);
-+              if (ce->e_bdev == bdev && ce->e_block == block)
-+                      goto out;
-+      }
-+      mb_assert(!__mb_cache_entry_is_linked(ce));
-+      ce->e_bdev = bdev;
-+      ce->e_block = block;
-+      for (n=0; n<mb_cache_indexes(cache); n++)
-+              ce->e_indexes[n].o_key = keys[n];
-+      __mb_cache_entry_link(ce);
-+      error = 0;              /* entry inserted successfully */
-+out:
-+      spin_unlock(&mb_cache_spinlock);
-+      return error;
-+}
-+
-+
-+/*
-+ * mb_cache_entry_release()
-+ *
-+ * Release a handle to a cache entry. When the last handle to a cache entry
-+ * is released it is either freed (if it is invalid) or otherwise inserted
-+ * in to the lru list.
-+ */
-+void
-+mb_cache_entry_release(struct mb_cache_entry *ce)
-+{
-+      spin_lock(&mb_cache_spinlock);
-+      __mb_cache_entry_release_unlock(ce);
-+}
-+
-+
-+/*
-+ * mb_cache_entry_takeout()
-+ *
-+ * Take a cache entry out of the cache, making it invalid. The entry can later
-+ * be re-inserted using mb_cache_entry_insert(), or released using
-+ * mb_cache_entry_release().
-+ */
-+void
-+mb_cache_entry_takeout(struct mb_cache_entry *ce)
-+{
-+      spin_lock(&mb_cache_spinlock);
-+      mb_assert(!__mb_cache_entry_in_lru(ce));
-+      if (__mb_cache_entry_is_linked(ce))
-+              __mb_cache_entry_unlink(ce);
-+      spin_unlock(&mb_cache_spinlock);
-+}
-+
-+
-+/*
-+ * mb_cache_entry_free()
-+ *
-+ * This is equivalent to the sequence mb_cache_entry_takeout() --
-+ * mb_cache_entry_release().
-+ */
-+void
-+mb_cache_entry_free(struct mb_cache_entry *ce)
-+{
-+      spin_lock(&mb_cache_spinlock);
-+      mb_assert(!__mb_cache_entry_in_lru(ce));
-+      if (__mb_cache_entry_is_linked(ce))
-+              __mb_cache_entry_unlink(ce);
-+      __mb_cache_entry_release_unlock(ce);
-+}
-+
-+
-+/*
-+ * mb_cache_entry_dup()
-+ *
-+ * Duplicate a handle to a cache entry (does not duplicate the cache entry
-+ * itself). After the call, both the old and the new handle must be released.
-+ */
-+struct mb_cache_entry *
-+mb_cache_entry_dup(struct mb_cache_entry *ce)
-+{
-+      atomic_inc(&ce->e_used);
-+      return ce;
-+}
-+
-+
-+/*
-+ * mb_cache_entry_get()
-+ *
-+ * Get a cache entry  by device / block number. (There can only be one entry
-+ * in the cache per device and block.) Returns NULL if no such cache entry
-+ * exists.
-+ */
-+struct mb_cache_entry *
-+mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
-+                 sector_t block)
-+{
-+      unsigned int bucket;
-+      struct list_head *l;
-+      struct mb_cache_entry *ce;
-+
-+      bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
-+                         cache->c_bucket_bits);
-+      spin_lock(&mb_cache_spinlock);
-+      list_for_each(l, &cache->c_block_hash[bucket]) {
-+              ce = list_entry(l, struct mb_cache_entry, e_block_list);
-+              if (ce->e_bdev == bdev && ce->e_block == block) {
-+                      ce = __mb_cache_entry_read(ce);
-+                      goto cleanup;
-+              }
-+      }
-+      ce = NULL;
-+
-+cleanup:
-+      spin_unlock(&mb_cache_spinlock);
-+      return ce;
-+}
-+
-+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
-+
-+static struct mb_cache_entry *
-+__mb_cache_entry_find(struct list_head *l, struct list_head *head,
-+                    int index, struct block_device *bdev, unsigned int key)
-+{
-+      while (l != head) {
-+              struct mb_cache_entry *ce =
-+                      list_entry(l, struct mb_cache_entry,
-+                                 e_indexes[index].o_list);
-+              if (ce->e_bdev == bdev &&
-+                  ce->e_indexes[index].o_key == key) {
-+                      ce = __mb_cache_entry_read(ce);
-+                      if (ce)
-+                              return ce;
-+              }
-+              l = l->next;
-+      }
-+      return NULL;
-+}
-+
-+
-+/*
-+ * mb_cache_entry_find_first()
-+ *
-+ * Find the first cache entry on a given device with a certain key in
-+ * an additional index. Additional matches can be found with
-+ * mb_cache_entry_find_next(). Returns NULL if no match was found.
-+ *
-+ * @cache: the cache to search
-+ * @index: the number of the additional index to search (0<=index<indexes_count)
-+ * @bdev: the device the cache entry should belong to
-+ * @key: the key in the index
-+ */
-+struct mb_cache_entry *
-+mb_cache_entry_find_first(struct mb_cache *cache, int index,
-+                        struct block_device *bdev, unsigned int key)
-+{
-+      unsigned int bucket = hash_long(key, cache->c_bucket_bits);
-+      struct list_head *l;
-+      struct mb_cache_entry *ce;
-+
-+      mb_assert(index < mb_cache_indexes(cache));
-+      spin_lock(&mb_cache_spinlock);
-+      l = cache->c_indexes_hash[index][bucket].next;
-+      ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
-+                                 index, bdev, key);
-+      spin_unlock(&mb_cache_spinlock);
-+      return ce;
-+}
-+
-+
-+/*
-+ * mb_cache_entry_find_next()
-+ *
-+ * Find the next cache entry on a given device with a certain key in an
-+ * additional index. Returns NULL if no match could be found. The previous
-+ * entry is automatically released, so that mb_cache_entry_find_next() can
-+ * be called like this:
-+ *
-+ * entry = mb_cache_entry_find_first();
-+ * while (entry) {
-+ *    ...
-+ *    entry = mb_cache_entry_find_next(entry, ...);
-+ * }
-+ *
-+ * @prev: The previous match
-+ * @index: the number of the additional index to search (0<=index<indexes_count)
-+ * @bdev: the device the cache entry should belong to
-+ * @key: the key in the index
-+ */
-+struct mb_cache_entry *
-+mb_cache_entry_find_next(struct mb_cache_entry *prev, int index,
-+                       struct block_device *bdev, unsigned int key)
-+{
-+      struct mb_cache *cache = prev->e_cache;
-+      unsigned int bucket = hash_long(key, cache->c_bucket_bits);
-+      struct list_head *l;
-+      struct mb_cache_entry *ce;
-+
-+      mb_assert(index < mb_cache_indexes(cache));
-+      spin_lock(&mb_cache_spinlock);
-+      l = prev->e_indexes[index].o_list.next;
-+      ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
-+                                 index, bdev, key);
-+      __mb_cache_entry_release_unlock(prev);
-+      return ce;
-+}
-+
-+#endif  /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */
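The following is a minimal, hedged sketch (not part of the patch) of how a filesystem might drive the mbcache API documented above, mirroring the ext3 xattr block cache: one additional index keyed by a content hash, 2^6 hash buckets, and the find_first/find_next iteration pattern described in the comments. All names here are illustrative.

#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/mbcache.h>

static struct mb_cache *demo_cache;

static int demo_cache_init(void)
{
	/* entry size = base entry + one index slot; 1 extra index; 64 buckets */
	demo_cache = mb_cache_create("demo_cache", NULL,
				     sizeof(struct mb_cache_entry) +
				     sizeof(struct mb_cache_entry_index),
				     1, 6);
	return demo_cache ? 0 : -ENOMEM;
}

static int demo_cache_add(struct block_device *bdev, sector_t block,
			  unsigned int hash)
{
	unsigned int keys[1] = { hash };
	struct mb_cache_entry *ce = mb_cache_entry_alloc(demo_cache);
	int error;

	if (!ce)
		return -ENOMEM;
	error = mb_cache_entry_insert(ce, bdev, block, keys);
	/* releasing the handle either parks a valid entry on the lru list or
	 * frees an entry that could not be inserted (e.g. -EBUSY duplicate) */
	mb_cache_entry_release(ce);
	return error == -EBUSY ? 0 : error;
}

static int demo_cache_count_matches(struct block_device *bdev,
				    unsigned int hash)
{
	struct mb_cache_entry *ce;
	int matches = 0;

	/* each find_next() call releases the previous handle, as documented */
	ce = mb_cache_entry_find_first(demo_cache, 0, bdev, hash);
	while (ce) {
		matches++;	/* inspect ce->e_block here */
		ce = mb_cache_entry_find_next(ce, 0, bdev, hash);
	}
	return matches;
}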
-diff -Nru a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
---- a/include/linux/ext3_fs.h  Sun Dec  8 02:49:56 2002
-+++ b/include/linux/ext3_fs.h  Sun Dec  8 02:49:56 2002
-@@ -64,8 +64,6 @@
-  */
- #define       EXT3_BAD_INO             1      /* Bad blocks inode */
- #define EXT3_ROOT_INO          2      /* Root inode */
--#define EXT3_ACL_IDX_INO       3      /* ACL inode */
--#define EXT3_ACL_DATA_INO      4      /* ACL inode */
- #define EXT3_BOOT_LOADER_INO   5      /* Boot loader inode */
- #define EXT3_UNDEL_DIR_INO     6      /* Undelete directory inode */
- #define EXT3_RESIZE_INO                7      /* Reserved group descriptors inode */
-@@ -95,7 +93,6 @@
- #else
- # define EXT3_BLOCK_SIZE(s)           (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size)
- #endif
--#define EXT3_ACLE_PER_BLOCK(s)                (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry))
- #define       EXT3_ADDR_PER_BLOCK(s)          (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
- #ifdef __KERNEL__
- # define EXT3_BLOCK_SIZE_BITS(s)      ((s)->s_blocksize_bits)
-@@ -130,28 +127,6 @@
- #endif
- /*
-- * ACL structures
-- */
--struct ext3_acl_header        /* Header of Access Control Lists */
--{
--      __u32   aclh_size;
--      __u32   aclh_file_count;
--      __u32   aclh_acle_count;
--      __u32   aclh_first_acle;
--};
--
--struct ext3_acl_entry /* Access Control List Entry */
--{
--      __u32   acle_size;
--      __u16   acle_perms;     /* Access permissions */
--      __u16   acle_type;      /* Type of entry */
--      __u16   acle_tag;       /* User or group identity */
--      __u16   acle_pad1;
--      __u32   acle_next;      /* Pointer on next entry for the */
--                                      /* same inode or on next free entry */
--};
--
--/*
-  * Structure of a blocks group descriptor
-  */
- struct ext3_group_desc
-@@ -347,6 +322,7 @@
-   #define EXT3_MOUNT_WRITEBACK_DATA   0x0C00  /* No data ordering */
- #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
- #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
-+#define EXT3_MOUNT_XATTR_USER         0x4000  /* Extended user attributes */
- /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
- #ifndef _LINUX_EXT2_FS_H
-@@ -529,7 +505,7 @@
- #define EXT3_FEATURE_INCOMPAT_RECOVER         0x0004 /* Needs recovery */
- #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV     0x0008 /* Journal device */
--#define EXT3_FEATURE_COMPAT_SUPP      0
-+#define EXT3_FEATURE_COMPAT_SUPP      EXT2_FEATURE_COMPAT_EXT_ATTR
- #define EXT3_FEATURE_INCOMPAT_SUPP    (EXT3_FEATURE_INCOMPAT_FILETYPE| \
-                                        EXT3_FEATURE_INCOMPAT_RECOVER)
- #define EXT3_FEATURE_RO_COMPAT_SUPP   (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
-@@ -713,6 +689,7 @@
- /* inode.c */
-+extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
- extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
- extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
-@@ -781,8 +758,10 @@
- /* namei.c */
- extern struct inode_operations ext3_dir_inode_operations;
-+extern struct inode_operations ext3_special_inode_operations;
- /* symlink.c */
-+extern struct inode_operations ext3_symlink_inode_operations;
- extern struct inode_operations ext3_fast_symlink_inode_operations;
-diff -Nru a/include/linux/ext3_jbd.h b/include/linux/ext3_jbd.h
---- a/include/linux/ext3_jbd.h Sun Dec  8 02:49:56 2002
-+++ b/include/linux/ext3_jbd.h Sun Dec  8 02:49:56 2002
-@@ -30,13 +30,19 @@
- #define EXT3_SINGLEDATA_TRANS_BLOCKS  8
-+/* Extended attributes may touch two data buffers, two bitmap buffers,
-+ * and two group summaries. */
-+
-+#define EXT3_XATTR_TRANS_BLOCKS               8
-+
- /* Define the minimum size for a transaction which modifies data.  This
-  * needs to take into account the fact that we may end up modifying two
-  * quota files too (one for the group, one for the user quota).  The
-  * superblock only gets updated once, of course, so don't bother
-  * counting that again for the quota updates. */
--#define EXT3_DATA_TRANS_BLOCKS                (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2)
-+#define EXT3_DATA_TRANS_BLOCKS                (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \
-+                                       EXT3_XATTR_TRANS_BLOCKS - 2)
- extern int ext3_writepage_trans_blocks(struct inode *inode);
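For concreteness (not stated in the patch itself): with EXT3_SINGLEDATA_TRANS_BLOCKS = 8 and the new EXT3_XATTR_TRANS_BLOCKS = 8, the revised EXT3_DATA_TRANS_BLOCKS reservation works out to 3*8 + 8 - 2 = 30 journal credits per data-modifying transaction, up from 3*8 - 2 = 22 before this change.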
-diff -Nru a/include/linux/mbcache.h b/include/linux/mbcache.h
---- /dev/null  Wed Dec 31 16:00:00 1969
-+++ b/include/linux/mbcache.h  Sun Dec  8 02:49:56 2002
-@@ -0,0 +1,72 @@
-+/*
-+  File: linux/mbcache.h
-+
-+  (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
-+*/
-+
-+/* Hardwire the number of additional indexes */
-+#define MB_CACHE_INDEXES_COUNT 1
-+
-+struct mb_cache_entry;
-+
-+struct mb_cache_op {
-+      void (*free)(struct mb_cache_entry *);
-+};
-+
-+struct mb_cache {
-+      struct list_head                c_cache_list;
-+      const char                      *c_name;
-+      struct mb_cache_op              c_op;
-+      atomic_t                        c_entry_count;
-+      int                             c_bucket_bits;
-+#ifndef MB_CACHE_INDEXES_COUNT
-+      int                             c_indexes_count;
-+#endif
-+      kmem_cache_t                    *c_entry_cache;
-+      struct list_head                *c_block_hash;
-+      struct list_head                *c_indexes_hash[0];
-+};
-+
-+struct mb_cache_entry_index {
-+      struct list_head                o_list;
-+      unsigned int                    o_key;
-+};
-+
-+struct mb_cache_entry {
-+      struct list_head                e_lru_list;
-+      struct mb_cache                 *e_cache;
-+      atomic_t                        e_used;
-+      struct block_device             *e_bdev;
-+      sector_t                        e_block;
-+      struct list_head                e_block_list;
-+      struct mb_cache_entry_index     e_indexes[0];
-+};
-+
-+/* Functions on caches */
-+
-+struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t,
-+                                int, int);
-+void mb_cache_shrink(struct mb_cache *, struct block_device *);
-+void mb_cache_destroy(struct mb_cache *);
-+
-+/* Functions on cache entries */
-+
-+struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *);
-+int mb_cache_entry_insert(struct mb_cache_entry *, struct block_device *,
-+                        sector_t, unsigned int[]);
-+void mb_cache_entry_rehash(struct mb_cache_entry *, unsigned int[]);
-+void mb_cache_entry_release(struct mb_cache_entry *);
-+void mb_cache_entry_takeout(struct mb_cache_entry *);
-+void mb_cache_entry_free(struct mb_cache_entry *);
-+struct mb_cache_entry *mb_cache_entry_dup(struct mb_cache_entry *);
-+struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *,
-+                                        struct block_device *,
-+                                        sector_t);
-+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
-+struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, int,
-+                                               struct block_device *, 
-+                                               unsigned int);
-+struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, int,
-+                                              struct block_device *, 
-+                                              unsigned int);
-+#endif
diff --git a/lustre/kernel_patches/patches/ext3_orphan_lock-2.4.20-rh.patch b/lustre/kernel_patches/patches/ext3_orphan_lock-2.4.20-rh.patch
new file mode 100644 (file)
index 0000000..d029650
--- /dev/null
@@ -0,0 +1,82 @@
+ fs/ext3/namei.c            |   15 +++++++--------
+ fs/ext3/super.c            |    1 +
+ include/linux/ext3_fs_sb.h |    1 +
+ 3 files changed, 9 insertions(+), 8 deletions(-)
+
+--- linux-rh-2.4.20-8/fs/ext3/namei.c~ext3_orphan_lock-2.4.20-rh       2003-05-05 19:49:15.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext3/namei.c     2003-05-05 20:01:28.000000000 +0800
+@@ -1747,8 +1747,8 @@ int ext3_orphan_add(handle_t *handle, st
+       struct super_block *sb = inode->i_sb;
+       struct ext3_iloc iloc;
+       int err = 0, rc;
+-      
+-      lock_super(sb);
++
++      down(&EXT3_SB(sb)->s_orphan_lock);
+       if (!list_empty(&EXT3_I(inode)->i_orphan))
+               goto out_unlock;
+@@ -1796,7 +1796,7 @@ int ext3_orphan_add(handle_t *handle, st
+       jbd_debug(4, "orphan inode %ld will point to %d\n",
+                       inode->i_ino, NEXT_ORPHAN(inode));
+ out_unlock:
+-      unlock_super(sb);
++      up(&EXT3_SB(sb)->s_orphan_lock);
+       ext3_std_error(inode->i_sb, err);
+       return err;
+ }
+@@ -1809,20 +1809,19 @@ int ext3_orphan_del(handle_t *handle, st
+ {
+       struct list_head *prev;
+       struct ext3_inode_info *ei = EXT3_I(inode);
+-      struct ext3_sb_info *sbi;
++      struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb);
+       unsigned long ino_next;
+       struct ext3_iloc iloc;
+       int err = 0;
+-      lock_super(inode->i_sb);
++      down(&sbi->s_orphan_lock);
+       if (list_empty(&ei->i_orphan)) {
+-              unlock_super(inode->i_sb);
++              up(&sbi->s_orphan_lock);
+               return 0;
+       }
+       ino_next = NEXT_ORPHAN(inode);
+       prev = ei->i_orphan.prev;
+-      sbi = EXT3_SB(inode->i_sb);
+       jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+@@ -1871,7 +1870,7 @@ int ext3_orphan_del(handle_t *handle, st
+ out_err:
+       ext3_std_error(inode->i_sb, err);
+ out:
+-      unlock_super(inode->i_sb);
++      up(&sbi->s_orphan_lock);
+       return err;
+ out_brelse:
+--- linux-rh-2.4.20-8/fs/ext3/super.c~ext3_orphan_lock-2.4.20-rh       2003-05-05 19:49:15.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext3/super.c     2003-05-05 19:54:09.000000000 +0800
+@@ -1151,6 +1151,7 @@ struct super_block * ext3_read_super (st
+        */
+       sb->s_op = &ext3_sops;
+       INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
++      sema_init(&sbi->s_orphan_lock, 1);
+       sb->s_root = 0;
+--- linux-rh-2.4.20-8/include/linux/ext3_fs_sb.h~ext3_orphan_lock-2.4.20-rh    2003-05-05 19:49:07.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/linux/ext3_fs_sb.h  2003-05-05 19:54:09.000000000 +0800
+@@ -69,6 +69,7 @@ struct ext3_sb_info {
+       struct inode * s_journal_inode;
+       struct journal_s * s_journal;
+       struct list_head s_orphan;
++      struct semaphore s_orphan_lock;
+       unsigned long s_commit_interval;
+       struct block_device *journal_bdev;
+ #ifdef CONFIG_JBD_DEBUG
+
+_
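As a hedged sketch (hypothetical helper, not in the patch): after this change, any code path that touches the orphan list serializes on the new per-superblock semaphore instead of lock_super(), e.g.:

/* Sketch of the post-patch locking pattern around orphan-list updates. */
static void demo_orphan_update(struct inode *inode)
{
	struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb);

	down(&sbi->s_orphan_lock);
	/* ... walk or modify EXT3_I(inode)->i_orphan and the on-disk chain ... */
	up(&sbi->s_orphan_lock);
}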
diff --git a/lustre/kernel_patches/patches/extN-2.4.18-ino_sb_fixup.patch b/lustre/kernel_patches/patches/extN-2.4.18-ino_sb_fixup.patch
new file mode 100644 (file)
index 0000000..df46643
--- /dev/null
@@ -0,0 +1,33 @@
+--- ./include/linux/ext3_fs.h.orig     Tue May  7 17:06:03 2002
++++ ./include/linux/ext3_fs.h  Tue May  7 17:07:11 2002
+@@ -17,6 +17,8 @@
+ #define _LINUX_EXT3_FS_H
+ #include <linux/types.h>
++#include <linux/ext3_fs_sb.h>
++#include <linux/ext3_fs_i.h>
+ /*
+  * The second extended filesystem constants/structures
+@@ -86,8 +88,8 @@
+ #define EXT3_MIN_BLOCK_LOG_SIZE                 10
+ #ifdef __KERNEL__
+-#define EXT3_SB(sb)   (&((sb)->u.ext3_sb))
+-#define EXT3_I(inode) (&((inode)->u.ext3_i))
++#define EXT3_SB(sb)   ((struct ext3_sb_info *)&((sb)->u.generic_sbp))
++#define EXT3_I(inode) ((struct ext3_inode_info *)&((inode)->u.generic_ip))
+ #define EXT3_BLOCK_SIZE(s)            ((s)->s_blocksize)
+ #define EXT3_BLOCK_SIZE_BITS(s)               ((s)->s_blocksize_bits)
+@@ -447,7 +447,9 @@
+ #define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime
+ static inline struct inode *orphan_list_entry(struct list_head *l)
+ {
+-      return list_entry(l, struct inode, u.ext3_i.i_orphan);
++      return ((struct inode *)((char *)l -
++              (unsigned long)(offsetof(struct inode, u.generic_ip) +
++                              offsetof(struct ext3_inode_info, i_orphan))));
+ }
+ /*
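A hedged aside (illustrative, not part of the patch): the open-coded pointer arithmetic in orphan_list_entry() above is a two-level container_of written out by hand, stepping first from the i_orphan list_head back to the ext3_inode_info, then from u.generic_ip back to the enclosing inode:

/* Equivalent formulation, assuming ext3_inode_info is stored inline at
 * inode->u.generic_ip as the EXT3_I() macro in this patch implies.
 * offsetof() comes from <linux/stddef.h>. */
static inline struct inode *demo_orphan_list_entry(struct list_head *l)
{
	struct ext3_inode_info *ei = (struct ext3_inode_info *)
		((char *)l - offsetof(struct ext3_inode_info, i_orphan));

	return (struct inode *)
		((char *)ei - offsetof(struct inode, u.generic_ip));
}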
diff --git a/lustre/kernel_patches/patches/extN-delete_thread.patch b/lustre/kernel_patches/patches/extN-delete_thread.patch
new file mode 100644 (file)
index 0000000..4248b5c
--- /dev/null
@@ -0,0 +1,278 @@
+ 0 files changed
+
+--- linux-2.4.18-p4smp-61chaos/include/linux/ext3_fs.h~extN-delete_thread      2003-05-29 10:19:15.000000000 +0800
++++ linux-2.4.18-p4smp-61chaos-root/include/linux/ext3_fs.h    2003-05-29 10:50:04.000000000 +0800
+@@ -190,6 +190,7 @@ struct ext3_group_desc
+  */
+ #define EXT3_STATE_JDATA              0x00000001 /* journaled data exists */
+ #define EXT3_STATE_NEW                        0x00000002 /* inode is newly created */
++#define EXT3_STATE_DELETE             0x00000010 /* deferred delete inode */
+ /*
+  * ioctl commands
+--- linux-2.4.18-p4smp-61chaos/include/linux/ext3_fs_sb.h~extN-delete_thread   2003-05-29 10:19:15.000000000 +0800
++++ linux-2.4.18-p4smp-61chaos-root/include/linux/ext3_fs_sb.h 2003-05-29 10:50:04.000000000 +0800
+@@ -29,6 +29,8 @@
+ #define EXT3_MAX_GROUP_LOADED 32
++#define EXT3_DELETE_THREAD
++
+ /*
+  * third extended-fs super-block data in memory
+  */
+@@ -74,6 +76,14 @@ struct ext3_sb_info {
+       struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
+       wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
+ #endif
++#ifdef EXT3_DELETE_THREAD
++      spinlock_t s_delete_lock;
++      struct list_head s_delete_list;
++      unsigned long s_delete_blocks;
++      unsigned long s_delete_inodes;
++      wait_queue_head_t s_delete_thread_queue;
++      wait_queue_head_t s_delete_waiter_queue;
++#endif
+ };
+ #endif        /* _LINUX_EXT3_FS_SB */
+--- linux-2.4.18-p4smp-61chaos/fs/ext3/super.c~extN-delete_thread      2003-05-29 10:19:15.000000000 +0800
++++ linux-2.4.18-p4smp-61chaos-root/fs/ext3/super.c    2003-05-29 10:50:04.000000000 +0800
+@@ -398,6 +398,207 @@ static void dump_orphan_list(struct supe
+       }
+ }
++#ifdef EXT3_DELETE_THREAD
++/*
++ * Delete inodes in a loop until there are no more to be deleted.
++ * Normally, we run in the background doing the deletes and sleeping again,
++ * and clients just add new inodes to be deleted onto the end of the list.
++ * If someone is concerned about free space (e.g. block allocation or similar)
++ * then they can sleep on s_delete_waiter_queue and be woken up when space
++ * has been freed.
++ */
++int ext3_delete_thread(void *data)
++{
++      struct super_block *sb = data;
++      struct ext3_sb_info *sbi = EXT3_SB(sb);
++      struct task_struct *tsk = current;
++
++      /* Almost like daemonize, but not quite */
++      exit_mm(current);
++      tsk->session = 1;
++      tsk->pgrp = 1;
++      tsk->tty = NULL;
++      exit_files(current);
++      reparent_to_init();
++
++      sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
++      sigfillset(&tsk->blocked);
++
++      tsk->flags |= PF_KERNTHREAD;
++
++      INIT_LIST_HEAD(&sbi->s_delete_list);
++      wake_up(&sbi->s_delete_waiter_queue);
++      printk(KERN_INFO "EXT3-fs: delete thread on %s started\n",
++             kdevname(sb->s_dev));
++
++      /* main loop */
++      for (;;) {
++              sleep_on(&sbi->s_delete_thread_queue);
++              printk(KERN_DEBUG "%s woken up: %lu inodes, %lu blocks\n",
++                     tsk->comm, sbi->s_delete_inodes, sbi->s_delete_blocks);
++
++              spin_lock(&sbi->s_delete_lock);
++              if (list_empty(&sbi->s_delete_list)) {
++                      memset(&sbi->s_delete_list, 0,
++                             sizeof(sbi->s_delete_list));
++                      spin_unlock(&sbi->s_delete_lock);
++                      printk(KERN_DEBUG "ext3 delete thread on %s exiting\n",
++                             kdevname(sb->s_dev));
++                      wake_up(&sbi->s_delete_waiter_queue);
++                      break;
++              }
++
++              while (!list_empty(&sbi->s_delete_list)) {
++                      struct inode *inode=list_entry(sbi->s_delete_list.next,
++                                                     struct inode, i_dentry);
++                      unsigned long blocks = inode->i_blocks >>
++                                                      (inode->i_blkbits - 9);
++
++                      list_del_init(&inode->i_dentry);
++                      spin_unlock(&sbi->s_delete_lock);
++                      printk(KERN_DEBUG "%s delete ino %lu blk %lu\n",
++                                 tsk->comm, inode->i_ino, blocks);
++
++                      iput(inode);
++
++                      spin_lock(&sbi->s_delete_lock);
++                      sbi->s_delete_blocks -= blocks;
++                      sbi->s_delete_inodes--;
++              }
++              if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0)
++                      printk(KERN_WARNING
++                             "%lu blocks and %lu left on list?\n",
++                             sbi->s_delete_blocks, sbi->s_delete_inodes);
++              sbi->s_delete_blocks = 0;
++              sbi->s_delete_inodes = 0;
++              spin_unlock(&sbi->s_delete_lock);
++              wake_up(&sbi->s_delete_waiter_queue);
++      }
++
++      return 0;
++}
++
++static void ext3_start_delete_thread(struct super_block *sb)
++{
++      struct ext3_sb_info *sbi = EXT3_SB(sb);
++      int rc;
++
++      spin_lock_init(&sbi->s_delete_lock);
++      memset(&sbi->s_delete_list, 0, sizeof(sbi->s_delete_list));
++      init_waitqueue_head(&sbi->s_delete_thread_queue);
++      init_waitqueue_head(&sbi->s_delete_waiter_queue);
++      sbi->s_delete_blocks = 0;
++      sbi->s_delete_inodes = 0;
++      rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
++      if (rc < 0)
++              printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
++                     rc);
++      else
++              wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
++}
++
++static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
++{
++      wake_up(&sbi->s_delete_thread_queue);
++      wait_event(sbi->s_delete_waiter_queue, list_empty(&sbi->s_delete_list));
++}
++
++/* Instead of playing games with the inode flags, destruction, etc we just
++ * duplicate the inode data locally and put it on a list for the truncate
++ * thread.  We need large parts of the inode struct in order to complete
++ * the truncate and unlink, so we may as well just copy the whole thing.
++ *
++ * If we have any problem deferring the delete, just delete it right away.
++ * If we defer it, we also mark how many blocks it would free, so that we
++ * can keep the statfs data correct, and we know if we should sleep on the
++ * truncate thread when we run out of space.
++ *
++ * One shouldn't consider this duplicate an "inode", as it isn't really
++ * visible to the VFS, but rather a data struct that holds truncate data.
++ *
++ * In 2.5 this can be done much more cleanly by just registering a "drop"
++ * method in the super_operations struct.
++ */
++static void ext3_delete_inode_thread(struct inode *old_inode)
++{
++      struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
++      struct inode *new_inode;
++      unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
++
++      if (is_bad_inode(old_inode)) {
++              clear_inode(old_inode);
++              return;
++      }
++
++      /* We may want to delete the inode immediately and not defer it */
++      if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
++          !sbi->s_delete_list.next) {
++              ext3_delete_inode(old_inode);
++              return;
++      }
++
++      if (EXT3_I(old_inode)->i_state & EXT3_STATE_DELETE) {
++              ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
++                         old_inode->i_ino, blocks);
++              ext3_delete_inode(old_inode);
++              return;
++      }
++
++      /* We can iget this inode again here, because our caller has unhashed
++       * old_inode, so new_inode will be in a different inode struct.
++       *
++       * We need to ensure that the i_orphan pointers in the other inodes
++       * point at the new inode copy instead of the old one so the orphan
++       * list doesn't get corrupted when the old orphan inode is freed.
++       */
++      down(&sbi->s_orphan_lock);
++
++      EXT3_SB(old_inode->i_sb)->s_mount_state |= EXT3_ORPHAN_FS;
++      new_inode = iget(old_inode->i_sb, old_inode->i_ino);
++      EXT3_SB(old_inode->i_sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
++      if (is_bad_inode(new_inode)) {
++              printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino);
++              iput(new_inode);
++              new_inode = NULL;
++      }
++      if (!new_inode) {
++              up(&sbi->s_orphan_lock);
++              ext3_debug(KERN_DEBUG "delete inode %lu directly (bad read)\n",
++                         old_inode->i_ino);
++              ext3_delete_inode(old_inode);
++              return;
++      }
++      J_ASSERT(new_inode != old_inode);
++
++      J_ASSERT(!list_empty(&EXT3_I(old_inode)->i_orphan));
++      /* Ugh.  We need to insert new_inode into the same spot on the list
++       * as old_inode was, to ensure the in-memory orphan list is still
++       * the same as the on-disk orphan list.
++       */
++      EXT3_I(new_inode)->i_orphan = EXT3_I(old_inode)->i_orphan;
++      EXT3_I(new_inode)->i_orphan.next->prev = &EXT3_I(new_inode)->i_orphan;
++      EXT3_I(new_inode)->i_orphan.prev->next = &EXT3_I(new_inode)->i_orphan;
++      EXT3_I(new_inode)->i_state |= EXT3_STATE_DELETE;
++      up(&sbi->s_orphan_lock);
++
++      clear_inode(old_inode);
++
++      printk(KERN_DEBUG "delete inode %lu (%lu blocks) by thread\n",
++             new_inode->i_ino, blocks);
++      spin_lock(&sbi->s_delete_lock);
++      J_ASSERT(list_empty(&new_inode->i_dentry));
++      list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
++      sbi->s_delete_blocks += blocks;
++      sbi->s_delete_inodes++;
++      spin_unlock(&sbi->s_delete_lock);
++
++      wake_up(&sbi->s_delete_thread_queue);
++}
++#else
++#define ext3_start_delete_thread(sbi) do {} while(0)
++#define ext3_stop_delete_thread(sbi) do {} while(0)
++#endif /* EXT3_DELETE_THREAD */
++
+ void ext3_put_super (struct super_block * sb)
+ {
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+@@ -405,6 +606,7 @@ void ext3_put_super (struct super_block 
+       kdev_t j_dev = sbi->s_journal->j_dev;
+       int i;
++      ext3_stop_delete_thread(sbi);
+       ext3_xattr_put_super(sb);
+       journal_destroy(sbi->s_journal);
+       if (!(sb->s_flags & MS_RDONLY)) {
+@@ -453,7 +655,11 @@ static struct super_operations ext3_sops
+       write_inode:    ext3_write_inode,       /* BKL not held.  Don't need */
+       dirty_inode:    ext3_dirty_inode,       /* BKL not held.  We take it */
+       put_inode:      ext3_put_inode,         /* BKL not held.  Don't need */
++#ifdef EXT3_DELETE_THREAD
++      delete_inode:   ext3_delete_inode_thread,/* BKL not held. We take it */
++#else
+       delete_inode:   ext3_delete_inode,      /* BKL not held.  We take it */
++#endif
+       put_super:      ext3_put_super,         /* BKL held */
+       write_super:    ext3_write_super,       /* BKL held */
+       sync_fs:        ext3_sync_fs,
+@@ -1209,6 +1415,7 @@ struct super_block * ext3_read_super (st
+       }
+       ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
++      ext3_start_delete_thread(sb);
+       /*
+        * akpm: core read_super() calls in here with the superblock locked.
+        * That deadlocks, because orphan cleanup needs to lock the superblock
+
+_
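A hedged sketch (hypothetical caller, not part of the patch): a code path that runs short of space could nudge the delete thread and then sleep on the waiter queue until the deferred-delete list drains, mirroring ext3_stop_delete_thread() above:

static void demo_wait_for_deferred_deletes(struct super_block *sb)
{
	struct ext3_sb_info *sbi = EXT3_SB(sb);

	wake_up(&sbi->s_delete_thread_queue);	/* kick the delete thread */
	wait_event(sbi->s_delete_waiter_queue,
		   list_empty(&sbi->s_delete_list));
}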
similarity index 78%
rename from lustre/extN/extN-iget-debug.diff
rename to lustre/kernel_patches/patches/extN-iget-debug.patch
index 9714e35..dbe90c8 100644 (file)
@@ -4,7 +4,7 @@
        return ret;
  }
  
-+static int extN_find_inode(struct inode *inode, unsigned long ino,
++static int ext3_find_inode(struct inode *inode, unsigned long ino,
 +                         void *opaque)
 +{
 +      const char *name = NULL;
 +      return 1;
 +}
 +
- static struct dentry *extN_lookup(struct inode * dir, struct dentry *dentry)
+ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
  {
        struct inode * inode;
-@@ -724,7 +742,7 @@
+@@ -724,8 +742,8 @@
        if (bh) {
                unsigned long ino = le32_to_cpu(de->inode);
                brelse (bh);
 -              inode = iget(dir->i_sb, ino);
-+              inode = iget4(dir->i_sb, ino, extN_find_inode, dentry);
++              inode = iget4(dir->i_sb, ino, ext3_find_inode, dentry);
  
                if (!inode)
                        return ERR_PTR(-EACCES);
 +++ linux/fs/ext3/inode.c      Sat Feb  1 00:34:45 2003
 @@ -166,6 +166,9 @@
   */
- void extN_put_inode (struct inode * inode)
+ void ext3_put_inode (struct inode * inode)
  {
 +      printk(KERN_INFO "putting inode %s:%lu (%p) count %d\n",
 +             kdevname(inode->i_dev), inode->i_ino, inode,
 +             atomic_read(&inode->i_count));
-       extN_discard_prealloc (inode);
+       ext3_discard_prealloc (inode);
  }
  
similarity index 58%
rename from lustre/extN/extN-misc-fixup.diff
rename to lustre/kernel_patches/patches/extN-misc-fixup.patch
index db0bc0f..06ea72a 100644 (file)
@@ -1,23 +1,23 @@
---- linux-2.4.17/fs/extN/super.c.orig  Fri Dec 21 10:41:55 2001
-+++ linux-2.4.17/fs/extN/super.c       Fri Mar 22 11:00:41 2002
+--- linux-2.4.17/fs/ext3/super.c.orig  Fri Dec 21 10:41:55 2001
++++ linux-2.4.17/fs/ext3/super.c       Fri Mar 22 11:00:41 2002
 @@ -1344,10 +1342,10 @@
-               printk(KERN_ERR "EXTN-fs: I/O error on journal device\n");
+               printk(KERN_ERR "EXT3-fs: I/O error on journal device\n");
                goto out_journal;
        }
 -      if (ntohl(journal->j_superblock->s_nr_users) != 1) {
 +      if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
-               printk(KERN_ERR "EXTN-fs: External journal has more than one "
+               printk(KERN_ERR "EXT3-fs: External journal has more than one "
                                        "user (unsupported) - %d\n",
 -                      ntohl(journal->j_superblock->s_nr_users));
 +                      be32_to_cpu(journal->j_superblock->s_nr_users));
                goto out_journal;
        }
-       EXTN_SB(sb)->journal_bdev = bdev;
+       EXT3_SB(sb)->journal_bdev = bdev;
 @@ -1560,6 +1560,7 @@
        unlock_kernel();
        return ret;
  }
-+EXPORT_SYMBOL(extN_force_commit); /* here to avoid potential patch collisions */
++EXPORT_SYMBOL(ext3_force_commit); /* here to avoid potential patch collisions */
  
  /*
   * Ext3 always journals updates to the superblock itself, so we don't
similarity index 54%
rename from lustre/extN/extN-noread.diff
rename to lustre/kernel_patches/patches/extN-noread.patch
index 56220e2..63f4463 100644 (file)
@@ -1,7 +1,11 @@
-diff -ru lustre-head/fs/extN/ialloc.c lustre/fs/extN/ialloc.c
---- lustre-head/fs/extN/ialloc.c       Mon Dec 23 10:02:58 2002
-+++ lustre/fs/extN/ialloc.c    Mon Dec 23 09:46:20 2002
-@@ -289,6 +289,37 @@
+ fs/ext3/ialloc.c        |   47 +++++++++++++++++++++-
+ fs/ext3/inode.c         |   99 ++++++++++++++++++++++++++++++++++++------------
+ include/linux/ext3_fs.h |    2 
+ 3 files changed, 122 insertions(+), 26 deletions(-)
+
+--- linux-2.4.18-chaos52/fs/ext3/ialloc.c~extN-noread  2003-05-16 12:26:29.000000000 +0800
++++ linux-2.4.18-chaos52-root/fs/ext3/ialloc.c 2003-05-16 12:26:31.000000000 +0800
+@@ -289,6 +289,37 @@ error_return:
  }
  
  /*
@@ -12,7 +16,7 @@ diff -ru lustre-head/fs/extN/ialloc.c lustre/fs/extN/ialloc.c
 + *
 + * Caller must be holding superblock lock (group/bitmap read lock in future).
 + */
-+int extN_itable_block_used(struct super_block *sb, unsigned int block_group,
++int ext3_itable_block_used(struct super_block *sb, unsigned int block_group,
 +                         int offset)
 +{
 +      int bitmap_nr = load_inode_bitmap(sb, block_group);
@@ -23,12 +27,12 @@ diff -ru lustre-head/fs/extN/ialloc.c lustre/fs/extN/ialloc.c
 +      if (bitmap_nr < 0)
 +              return 1;
 +
-+      inodes_per_block = sb->s_blocksize / EXTN_SB(sb)->s_inode_size;
++      inodes_per_block = sb->s_blocksize / EXT3_SB(sb)->s_inode_size;
 +      inum = offset & ~(inodes_per_block - 1);
 +      iend = inum + inodes_per_block;
-+      ibitmap = EXTN_SB(sb)->s_inode_bitmap[bitmap_nr];
++      ibitmap = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr];
 +      for (; inum < iend; inum++) {
-+              if (inum != offset && extN_test_bit(inum, ibitmap->b_data))
++              if (inum != offset && ext3_test_bit(inum, ibitmap->b_data))
 +                      return 1;
 +      }
 +
@@ -39,70 +43,69 @@ diff -ru lustre-head/fs/extN/ialloc.c lustre/fs/extN/ialloc.c
   * There are two policies for allocating an inode.  If the new inode is
   * a directory, then a forward search is made for a block group with both
   * free space and a low directory-to-inode ratio; if that fails, then of
-@@ -312,6 +343,7 @@
-       struct extN_group_desc * gdp;
-       struct extN_group_desc * tmp;
-       struct extN_super_block * es;
-+      struct extN_iloc iloc;
+@@ -312,6 +343,7 @@ struct inode * ext3_new_inode (handle_t 
+       struct ext3_group_desc * gdp;
+       struct ext3_group_desc * tmp;
+       struct ext3_super_block * es;
++      struct ext3_iloc iloc;
        int err = 0;
  
        /* Cannot create files in a deleted directory */
-@@ -505,7 +538,7 @@
+@@ -505,7 +537,7 @@ repeat:
        ei->i_prealloc_count = 0;
  #endif
        ei->i_block_group = i;
 -      
 +
-       if (ei->i_flags & EXTN_SYNC_FL)
+       if (ei->i_flags & EXT3_SYNC_FL)
                inode->i_flags |= S_SYNC;
        if (IS_SYNC(inode))
-@@ -514,9 +547,18 @@
+@@ -514,9 +546,18 @@ repeat:
        inode->i_generation = sbi->s_next_generation++;
  
-       ei->i_state = EXTN_STATE_NEW;
--      err = extN_mark_inode_dirty(handle, inode);
-+      err = extN_get_inode_loc_new(inode, &iloc, 1);
+       ei->i_state = EXT3_STATE_NEW;
+-      err = ext3_mark_inode_dirty(handle, inode);
++      err = ext3_get_inode_loc_new(inode, &iloc, 1);
        if (err) goto fail;
 -      
 +      BUFFER_TRACE(iloc->bh, "get_write_access");
-+      err = extN_journal_get_write_access(handle, iloc.bh);
++      err = ext3_journal_get_write_access(handle, iloc.bh);
 +      if (err) {
 +              brelse(iloc.bh);
 +              iloc.bh = NULL;
 +              goto fail;
 +      }
-+      err = extN_mark_iloc_dirty(handle, inode, &iloc);
++      err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 +      if (err) goto fail;
 +
        unlock_super (sb);
        if(DQUOT_ALLOC_INODE(inode)) {
                DQUOT_DROP(inode);
-diff -ru lustre-head/fs/extN/inode.c lustre/fs/extN/inode.c
---- lustre-head/fs/extN/inode.c        Mon Dec 23 10:02:58 2002
-+++ lustre/fs/extN/inode.c     Mon Dec 23 09:50:25 2002
-@@ -2011,23 +1994,28 @@
-       extN_journal_stop(handle, inode);
+--- linux-2.4.18-chaos52/fs/ext3/inode.c~extN-noread   2003-05-16 12:26:29.000000000 +0800
++++ linux-2.4.18-chaos52-root/fs/ext3/inode.c  2003-05-16 12:27:06.000000000 +0800
+@@ -2011,23 +2011,28 @@ out_stop:
+       ext3_journal_stop(handle, inode);
  }
  
 -/* 
-- * extN_get_inode_loc returns with an extra refcount against the
+- * ext3_get_inode_loc returns with an extra refcount against the
 - * inode's underlying buffer_head on success. 
 - */
 +#define NUM_INODE_PREREAD     16
  
--int extN_get_inode_loc (struct inode *inode, struct extN_iloc *iloc)
+-int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc)
 +/*
-+ * extN_get_inode_loc returns with an extra refcount against the inode's
++ * ext3_get_inode_loc returns with an extra refcount against the inode's
 + * underlying buffer_head on success.  If this is for a new inode allocation
 + * (new is non-zero) then we may be able to optimize away the read if there
 + * are no other in-use inodes in this inode table block.  If we need to do
 + * a read, then read in a whole chunk of blocks to avoid blocking again soon
 + * if we are doing lots of creates/updates.
 + */
-+int extN_get_inode_loc_new(struct inode *inode, struct extN_iloc *iloc, int new)
++int ext3_get_inode_loc_new(struct inode *inode, struct ext3_iloc *iloc, int new)
  {
        struct super_block *sb = inode->i_sb;
-       struct extN_sb_info *sbi = EXTN_SB(sb);
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
 -      struct buffer_head *bh = 0;
 +      struct buffer_head *bh[NUM_INODE_PREREAD];
        unsigned long block;
@@ -110,25 +113,25 @@ diff -ru lustre-head/fs/extN/inode.c lustre/fs/extN/inode.c
        unsigned long group_desc;
        unsigned long desc;
        unsigned long offset;
-       struct extN_group_desc * gdp;
+       struct ext3_group_desc * gdp;
 -              
 +
-       if ((inode->i_ino != EXTN_ROOT_INO &&
-               inode->i_ino != EXTN_JOURNAL_INO &&
-               inode->i_ino < EXTN_FIRST_INO(sb)) ||
-@@ -2042,38 +2034,86 @@
+       if ((inode->i_ino != EXT3_ROOT_INO &&
+               inode->i_ino != EXT3_JOURNAL_INO &&
+               inode->i_ino < EXT3_FIRST_INO(sb)) ||
+@@ -2042,38 +2047,86 @@ int ext3_get_inode_loc (struct inode *in
        }
        group_desc = block_group >> sbi->s_desc_per_block_bits;
        desc = block_group & (sbi->s_desc_per_block - 1);
 -      bh = sbi->s_group_desc[group_desc];
 -      if (!bh) {
 +      if (!sbi->s_group_desc[group_desc]) {
-               extN_error(sb, __FUNCTION__, "Descriptor not loaded");
+               ext3_error(sb, __FUNCTION__, "Descriptor not loaded");
                goto bad_inode;
        }
  
--      gdp = (struct extN_group_desc *) bh->b_data;
-+      gdp = (struct extN_group_desc *)(sbi->s_group_desc[group_desc]->b_data);
+-      gdp = (struct ext3_group_desc *) bh->b_data;
++      gdp = (struct ext3_group_desc *)(sbi->s_group_desc[group_desc]->b_data);
 +
        /*
         * Figure out the offset within the block group inode table
@@ -138,13 +141,13 @@ diff -ru lustre-head/fs/extN/inode.c lustre/fs/extN/inode.c
 +      offset = ((inode->i_ino - 1) % sbi->s_inodes_per_group);
 +
        block = le32_to_cpu(gdp[desc].bg_inode_table) +
--              (offset >> EXTN_BLOCK_SIZE_BITS(sb));
+-              (offset >> EXT3_BLOCK_SIZE_BITS(sb));
 -      if (!(bh = sb_bread(sb, block))) {
--              extN_error (sb, __FUNCTION__,
+-              ext3_error (sb, __FUNCTION__,
 -                          "unable to read inode block - "
 -                          "inode=%lu, block=%lu", inode->i_ino, block);
 -              goto bad_inode;
-+              (offset * sbi->s_inode_size >> EXTN_BLOCK_SIZE_BITS(sb));
++              (offset * sbi->s_inode_size >> EXT3_BLOCK_SIZE_BITS(sb));
 +
 +      bh[0] = sb_getblk(sb, block);
 +      if (buffer_uptodate(bh[0]))
@@ -154,7 +157,7 @@ diff -ru lustre-head/fs/extN/inode.c lustre/fs/extN/inode.c
 +       * in memory, then we just zero it out.  Otherwise, we keep the
 +       * current block contents (deleted inode data) for posterity.
 +       */
-+      if (new && !extN_itable_block_used(sb, block_group, offset)) {
++      if (new && !ext3_itable_block_used(sb, block_group, offset)) {
 +              lock_buffer(bh[0]);
 +              memset(bh[0]->b_data, 0, bh[0]->b_size);
 +              mark_buffer_uptodate(bh[0], 1);
@@ -169,7 +172,7 @@ diff -ru lustre-head/fs/extN/inode.c lustre/fs/extN/inode.c
 +              if (block_end > itable_end)
 +                      block_end = itable_end;
 +
-+              for (; block < block_end; block++) {
++              for (++block; block < block_end; block++) {
 +                      bh[count] = sb_getblk(sb, block);
 +                      if (count && (buffer_uptodate(bh[count]) ||
 +                                    buffer_locked(bh[count]))) {
@@ -186,21 +189,21 @@ diff -ru lustre-head/fs/extN/inode.c lustre/fs/extN/inode.c
 +
 +              wait_on_buffer(bh[0]);
 +              if (!buffer_uptodate(bh[0])) {
-+                      extN_error(sb, __FUNCTION__,
++                      ext3_error(sb, __FUNCTION__,
 +                                 "unable to read inode block - "
 +                                 "inode=%lu, block=%lu", inode->i_ino,
 +                                 bh[0]->b_blocknr);
 +                      goto bad_inode;
 +              }
        }
--      offset &= (EXTN_BLOCK_SIZE(sb) - 1);
+-      offset &= (EXT3_BLOCK_SIZE(sb) - 1);
 + done:
-+      offset = (offset * sbi->s_inode_size) & (EXTN_BLOCK_SIZE(sb) - 1);
++      offset = (offset * sbi->s_inode_size) & (EXT3_BLOCK_SIZE(sb) - 1);
  
 -      iloc->bh = bh;
--      iloc->raw_inode = (struct extN_inode *) (bh->b_data + offset);
+-      iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset);
 +      iloc->bh = bh[0];
-+      iloc->raw_inode = (struct extN_inode *)(bh[0]->b_data + offset);
++      iloc->raw_inode = (struct ext3_inode *)(bh[0]->b_data + offset);
        iloc->block_group = block_group;
 -      
 +
@@ -211,23 +214,24 @@ diff -ru lustre-head/fs/extN/inode.c lustre/fs/extN/inode.c
        return -EIO;
  }
  
-+int extN_get_inode_loc(struct inode *inode, struct extN_iloc *iloc)
++int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
 +{
-+      return extN_get_inode_loc_new(inode, iloc, 0);
++      return ext3_get_inode_loc_new(inode, iloc, 0);
 +}
 +
- void extN_read_inode(struct inode * inode)
+ void ext3_read_inode(struct inode * inode)
  {
-       struct extN_iloc iloc;
-diff -ru include/linux/extN_fs.h.orig include/linux/extN_fs.h
---- lustre/include/linux/extN_fs.h.orig        Sat Mar  8 01:23:09 2003
-+++ lustre/include/linux/extN_fs.h     Sat Mar  8 01:24:31 2003
-@@ -642,6 +646,8 @@
- extern struct buffer_head * extN_getblk (handle_t *, struct inode *, long, int, int *);
- extern struct buffer_head * extN_bread (handle_t *, struct inode *, int, int, int *);
+       struct ext3_iloc iloc;
+--- linux-2.4.18-chaos52/include/linux/ext3_fs.h~extN-noread   2003-05-16 12:26:29.000000000 +0800
++++ linux-2.4.18-chaos52-root/include/linux/ext3_fs.h  2003-05-16 12:26:31.000000000 +0800
+@@ -640,6 +640,8 @@ extern int ext3_forget(handle_t *, int, 
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
  
-+extern int extN_itable_block_used(struct super_block *sb, unsigned int, int);
-+extern int extN_get_inode_loc_new(struct inode *, struct extN_iloc *, int);
- extern int  extN_get_inode_loc (struct inode *, struct extN_iloc *);
- extern void extN_read_inode (struct inode *);
- extern void extN_write_inode (struct inode *, int);
++extern int ext3_itable_block_used(struct super_block *sb, unsigned int, int);
++extern int ext3_get_inode_loc_new(struct inode *, struct ext3_iloc *, int);
+ extern int  ext3_get_inode_loc (struct inode *, struct ext3_iloc *);
+ extern void ext3_read_inode (struct inode *);
+ extern void ext3_write_inode (struct inode *, int);
+
+_
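The extN-noread hunks above hinge on mapping an inode number to its inode-table block and byte offset, and only reading that block when ext3_itable_block_used() finds another live inode sharing it. A minimal userspace sketch of just that arithmetic, with invented geometry (128-byte inodes, 4096-byte blocks, hypothetical group layout), not taken from the patch:

    #include <stdio.h>

    /* Illustrative only: mirrors the offset/block math used by
     * ext3_get_inode_loc_new() above.  itable_block stands in for
     * gdp[desc].bg_inode_table, inode_size for sbi->s_inode_size,
     * blocksize for EXT3_BLOCK_SIZE(sb); all numbers are invented. */
    int main(void)
    {
            unsigned long ino = 3145, inodes_per_group = 16384, itable_block = 262;
            unsigned int inode_size = 128, blocksize = 4096;

            unsigned long index = (ino - 1) % inodes_per_group;      /* slot within group */
            unsigned long block = itable_block + index * inode_size / blocksize;
            unsigned long byte  = (index * inode_size) % blocksize;  /* offset within block */

            printf("inode %lu -> itable block %lu, byte offset %lu\n", ino, block, byte);
            return 0;
    }

With these values inode 3145 lands in block 360 at byte 1024; that block is the one the patch either zeroes in place (if no other in-use inode shares it) or prereads together with up to NUM_INODE_PREREAD neighbours.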
diff --git a/lustre/kernel_patches/patches/extN-san.patch b/lustre/kernel_patches/patches/extN-san.patch
new file mode 100644 (file)
index 0000000..d58fe8c
--- /dev/null
@@ -0,0 +1,106 @@
+ fs/ext3/inode.c |   81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ fs/ext3/super.c |    4 ++
+ 2 files changed, 85 insertions(+)
+
+--- linux-2.4.18-18.8.0-l18/fs/ext3/inode.c~extN-san   Sun May 18 12:58:13 2003
++++ linux-2.4.18-18.8.0-l18-phil/fs/ext3/inode.c       Sun May 18 13:24:49 2003
+@@ -2781,3 +2781,84 @@ int ext3_change_inode_journal_flag(struc
+  * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we
+  * need to extend" test in ext3_prepare_write() succeeds.  
+  */
++
++/* for each block: 1 ind + 1 dind + 1 tind
++ * for each block: 3 bitmap blocks
++ * for each block: 3 group descriptor blocks
++ * 1 inode block
++ * 1 superblock
++ * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
++ * ((1+1+1) * 3 * nblocks) + 1 + 1 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
++ *
++ * XXX assuming:
++ * (1) fs logical block size == page size
++ * (2) ext3 in writeback mode
++ */
++static inline int ext3_san_write_trans_blocks(int nblocks)
++{
++      int ret;
++      
++      ret = (1 + 1 + 1) * 3 * nblocks + 1 + 1;
++
++#ifdef CONFIG_QUOTA
++      ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++      return ret;
++}
++
++/* Alloc blocks for an inode, without creating any buffer/page
++ * for data I/O; set the inode size if the file is extended.
++ *
++ * @inode:    target inode
++ * @blocks:   array of logical block numbers
++ * @nblocks:  how many blocks need to be allocated
++ * @newsize:  new file size we should set
++ *
++ * return:    0 on success, otherwise an error
++ *            (*blocks) contains the physical block numbers allocated
++ *
++ * XXX this assumes the fs block size == page size
++ */
++int ext3_prep_san_write(struct inode *inode, long *blocks,
++                      int nblocks, loff_t newsize)
++{
++      handle_t *handle;
++      struct buffer_head bh_tmp;
++      int needed_blocks;
++      int i, ret = 0, ret2;
++
++      needed_blocks = ext3_san_write_trans_blocks(nblocks);
++
++      lock_kernel();
++      handle = ext3_journal_start(inode, needed_blocks);
++      if (IS_ERR(handle)) {
++              unlock_kernel();
++              return PTR_ERR(handle);
++      }
++      unlock_kernel();
++
++      /* alloc blocks one by one */
++      for (i = 0; i < nblocks; i++) {
++              ret = ext3_get_block_handle(handle, inode, blocks[i],
++                                              &bh_tmp, 1);
++              if (ret)
++                      break;
++
++              blocks[i] = bh_tmp.b_blocknr;
++      }
++
++      /* set inode size if needed */
++      if (!ret && (newsize > inode->i_size)) {
++              inode->i_size = newsize;
++              ext3_mark_inode_dirty(handle, inode);
++      }
++
++      lock_kernel();
++      ret2 = ext3_journal_stop(handle, inode);
++      unlock_kernel();
++
++      if (!ret)
++              ret = ret2;
++      return ret;
++}
+--- linux-2.4.18-18.8.0-l18/fs/ext3/super.c~extN-san   Sun May 18 13:24:35 2003
++++ linux-2.4.18-18.8.0-l18-phil/fs/ext3/super.c       Sun May 18 13:24:55 2003
+@@ -1774,6 +1774,10 @@ static int __init init_ext3_fs(void)
+ EXPORT_SYMBOL(ext3_bread);
++int ext3_prep_san_write(struct inode *inode, long *blocks,
++                        int nblocks, loff_t newsize);
++EXPORT_SYMBOL(ext3_prep_san_write);
++
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
+ MODULE_LICENSE("GPL");
+
+_
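The credit estimate in ext3_san_write_trans_blocks() above is easy to sanity-check by hand. A small standalone sketch of the same formula (the value 8 below is only an illustrative stand-in for EXT3_SINGLEDATA_TRANS_BLOCKS, which would normally come from the kernel headers):

    #include <stdio.h>

    /* Same arithmetic as ext3_san_write_trans_blocks(): (1+1+1)*3 metadata
     * blocks per data block (ind/dind/tind, bitmaps, group descriptors),
     * plus one inode block and one superblock; quota support adds
     * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS (taken as 8 here purely for illustration). */
    static int san_write_trans_blocks(int nblocks, int quota)
    {
            int ret = (1 + 1 + 1) * 3 * nblocks + 1 + 1;

            if (quota)
                    ret += 2 * 8;
            return ret;
    }

    int main(void)
    {
            printf("1 block,  no quota: %d credits\n", san_write_trans_blocks(1, 0));   /* 11 */
            printf("16 blocks, quota:   %d credits\n", san_write_trans_blocks(16, 1));  /* 162 */
            return 0;
    }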
diff --git a/lustre/kernel_patches/patches/extN-wantedi.patch b/lustre/kernel_patches/patches/extN-wantedi.patch
new file mode 100644 (file)
index 0000000..fc74c6b
--- /dev/null
@@ -0,0 +1,171 @@
+ fs/ext3/ialloc.c        |   38 ++++++++++++++++++++++++++++++++++++--
+ fs/ext3/ioctl.c         |   25 +++++++++++++++++++++++++
+ fs/ext3/namei.c         |   12 ++++++++----
+ include/linux/ext3_fs.h |    5 ++++-
+ 4 files changed, 73 insertions(+), 7 deletions(-)
+
+--- linux-2.4.20/fs/ext3/namei.c~extN-wantedi  2003-04-08 23:35:55.000000000 -0600
++++ linux-2.4.20-braam/fs/ext3/namei.c 2003-04-08 23:35:55.000000000 -0600
+@@ -1555,7 +1555,8 @@ static int ext3_create (struct inode * d
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+-      inode = ext3_new_inode (handle, dir, mode);
++      inode = ext3_new_inode (handle, dir, mode,
++                              (unsigned long)dentry->d_fsdata);
+       err = PTR_ERR(inode);
+       if (!IS_ERR(inode)) {
+               inode->i_op = &ext3_file_inode_operations;
+@@ -1583,7 +1584,8 @@ static int ext3_mknod (struct inode * di
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+-      inode = ext3_new_inode (handle, dir, mode);
++      inode = ext3_new_inode (handle, dir, mode,
++                              (unsigned long)dentry->d_fsdata);
+       err = PTR_ERR(inode);
+       if (!IS_ERR(inode)) {
+               init_special_inode(inode, mode, rdev);
+@@ -1613,7 +1615,8 @@ static int ext3_mkdir(struct inode * dir
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+-      inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
++      inode = ext3_new_inode (handle, dir, S_IFDIR | mode,
++                              (unsigned long)dentry->d_fsdata);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_stop;
+@@ -2009,7 +2012,8 @@ static int ext3_symlink (struct inode * 
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+-      inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
++      inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO,
++                              (unsigned long)dentry->d_fsdata);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_stop;
+--- linux-2.4.20/fs/ext3/ialloc.c~extN-wantedi 2003-04-08 23:35:55.000000000 -0600
++++ linux-2.4.20-braam/fs/ext3/ialloc.c        2003-04-08 23:35:55.000000000 -0600
+@@ -299,7 +299,8 @@ error_return:
+  * group to find a free inode.
+  */
+ struct inode * ext3_new_inode (handle_t *handle,
+-                              const struct inode * dir, int mode)
++                              const struct inode * dir, int mode,
++                              unsigned long goal)
+ {
+       struct super_block * sb;
+       struct buffer_head * bh;
+@@ -323,7 +324,39 @@ struct inode * ext3_new_inode (handle_t 
+       init_rwsem(&inode->u.ext3_i.truncate_sem);
+       lock_super (sb);
+-      es = sb->u.ext3_sb.s_es;
++      es = EXT3_SB(sb)->s_es;
++
++      if (goal) {
++              i = (goal - 1) / EXT3_INODES_PER_GROUP(sb);
++              j = (goal - 1) % EXT3_INODES_PER_GROUP(sb);
++              gdp = ext3_get_group_desc(sb, i, &bh2);
++
++              bitmap_nr = load_inode_bitmap (sb, i);
++              if (bitmap_nr < 0)
++                      goto fail;
++
++              bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr];
++
++              BUFFER_TRACE(bh, "get_write_access");
++              err = ext3_journal_get_write_access(handle, bh);
++              if (err) goto fail;
++
++              if (ext3_set_bit(j, bh->b_data)) {
++                      printk(KERN_ERR "goal inode %lu unavailable\n", goal);
++                      /* Oh well, we tried. */
++                      goto repeat;
++              }
++
++              BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++              err = ext3_journal_dirty_metadata(handle, bh);
++              if (err) goto fail;
++
++              /* We've shortcircuited the allocation system successfully,
++               * now finish filling in the inode.
++               */
++              goto have_bit_and_group;
++      }
++
+ repeat:
+       gdp = NULL;
+       i = 0;
+@@ -438,6 +471,7 @@ repeat:
+               }
+               goto repeat;
+       }
++ have_bit_and_group:
+       j += i * EXT3_INODES_PER_GROUP(sb) + 1;
+       if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) {
+               ext3_error (sb, "ext3_new_inode",
+--- linux-2.4.20/fs/ext3/ioctl.c~extN-wantedi  2003-04-08 23:35:55.000000000 -0600
++++ linux-2.4.20-braam/fs/ext3/ioctl.c 2003-04-08 23:35:55.000000000 -0600
+@@ -23,6 +23,31 @@ int ext3_ioctl (struct inode * inode, st
+       ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+       switch (cmd) {
++      case EXT3_IOC_CREATE_INUM: {
++              char name[32];
++              struct dentry *dchild, *dparent;
++              int rc = 0;
++
++              dparent = list_entry(inode->i_dentry.next, struct dentry,
++                                   d_alias);
++              snprintf(name, sizeof name, "%lu", arg);
++              dchild = lookup_one_len(name, dparent, strlen(name));
++              if (dchild->d_inode) {
++                      printk(KERN_ERR "%*s/%lu already exists (ino %lu)\n",
++                             dparent->d_name.len, dparent->d_name.name, arg,
++                             dchild->d_inode->i_ino);
++                      rc = -EEXIST;
++              } else {
++                      dchild->d_fsdata = (void *)arg;
++                      rc = vfs_create(inode, dchild, 0644);
++                      if (rc)
++                              printk(KERN_ERR "vfs_create: %d\n", rc);
++                      else if (dchild->d_inode->i_ino != arg)
++                              rc = -EEXIST;
++              }
++              dput(dchild);
++              return rc;
++      }
+       case EXT3_IOC_GETFLAGS:
+               flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE;
+               return put_user(flags, (int *) arg);
+--- linux-2.4.20/include/linux/ext3_fs.h~extN-wantedi  2003-04-08 23:35:55.000000000 -0600
++++ linux-2.4.20-braam/include/linux/ext3_fs.h 2003-04-08 23:35:55.000000000 -0600
+@@ -201,6 +201,7 @@ struct ext3_group_desc
+ #define       EXT3_IOC_SETFLAGS               _IOW('f', 2, long)
+ #define       EXT3_IOC_GETVERSION             _IOR('f', 3, long)
+ #define       EXT3_IOC_SETVERSION             _IOW('f', 4, long)
++/* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
+ #define       EXT3_IOC_GETVERSION_OLD         _IOR('v', 1, long)
+ #define       EXT3_IOC_SETVERSION_OLD         _IOW('v', 2, long)
+ #ifdef CONFIG_JBD_DEBUG
+@@ -671,7 +672,8 @@ extern int ext3fs_dirhash(const char *na
+                         dx_hash_info *hinfo);
+ /* ialloc.c */
+-extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int);
++extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int,
++                                    unsigned long);
+ extern void ext3_free_inode (handle_t *, struct inode *);
+ extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
+ extern unsigned long ext3_count_free_inodes (struct super_block *);
+@@ -757,4 +759,5 @@ extern struct inode_operations ext3_fast
+ #endif        /* __KERNEL__ */
++#define EXT3_IOC_CREATE_INUM                  _IOW('f', 5, long)
+ #endif        /* _LINUX_EXT3_FS_H */
+
+_
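For context, the EXT3_IOC_CREATE_INUM ioctl added above is driven from user space by opening the target directory and passing the wanted inode number as the ioctl argument; the kernel then creates a file named after that decimal number. A hypothetical sketch (the mount point and inode number are invented; only the ioctl definition is copied from the hunk above):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>

    /* Copied from the patch above; normally this would come from ext3_fs.h. */
    #define EXT3_IOC_CREATE_INUM  _IOW('f', 5, long)

    int main(void)
    {
            long wanted_ino = 3145;                        /* invented for the example */
            int fd = open("/mnt/ost/objects", O_RDONLY);   /* directory path is invented */

            if (fd < 0 || ioctl(fd, EXT3_IOC_CREATE_INUM, wanted_ino) < 0) {
                    perror("EXT3_IOC_CREATE_INUM");
                    return 1;
            }
            close(fd);
            printf("created file \"%ld\" with inode %ld\n", wanted_ino, wanted_ino);
            return 0;
    }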
similarity index 99%
rename from lustre/extN/htree-ext3-2.4.18.diff
rename to lustre/kernel_patches/patches/htree-ext3-2.4.18.patch
index 4251251..a54e9ca 100644 (file)
@@ -13,7 +13,7 @@
                else if (!strcmp (this_char, "debug"))
                        set_opt (*mount_options, DEBUG);
                else if (!strcmp (this_char, "errors")) {
-@@ -702,6 +708,12 @@ static int ext3_setup_super(struct super
+@@ -702,6 +708,12 @@
        es->s_mtime = cpu_to_le32(CURRENT_TIME);
        ext3_update_dynamic_rev(sb);
        EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
diff --git a/lustre/kernel_patches/patches/invalidate_show-2.4.20-rh.patch b/lustre/kernel_patches/patches/invalidate_show-2.4.20-rh.patch
new file mode 100644 (file)
index 0000000..6e7d920
--- /dev/null
@@ -0,0 +1,114 @@
+ fs/inode.c         |   23 +++++++++++++++--------
+ fs/smbfs/inode.c   |    2 +-
+ fs/super.c         |    4 ++--
+ include/linux/fs.h |    2 +-
+ 4 files changed, 19 insertions(+), 12 deletions(-)
+
+--- kernel-2.4.20/fs/inode.c~invalidate_show-2.4.20-rh 2003-05-24 01:56:40.000000000 -0400
++++ kernel-2.4.20-root/fs/inode.c      2003-06-02 00:35:37.000000000 -0400
+@@ -628,7 +628,8 @@ static void dispose_list(struct list_hea
+ /*
+  * Invalidate all inodes for a device.
+  */
+-static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose)
++static int invalidate_list(struct list_head *head, struct super_block * sb,
++                         struct list_head * dispose, int show)
+ {
+       struct list_head *next;
+       int busy = 0, count = 0;
+@@ -653,6 +654,11 @@ static int invalidate_list(struct list_h
+                       count++;
+                       continue;
+               }
++              if (show)
++                      printk(KERN_ERR
++                             "inode busy: dev %s:%lu (%p) mode %o count %u\n",
++                             kdevname(sb->s_dev), inode->i_ino, inode,
++                             inode->i_mode, atomic_read(&inode->i_count));
+               busy = 1;
+       }
+       /* only unused inodes may be cached with i_count zero */
+@@ -671,23 +677,24 @@ static int invalidate_list(struct list_h
+ /**
+  *    invalidate_inodes       - discard the inodes on a device
+  *    @sb: superblock
++ *      @show: whether we should display any busy inodes found
+  *
+  *    Discard all of the inodes for a given superblock. If the discard
+  *    fails because there are busy inodes then a non zero value is returned.
+  *    If the discard is successful all the inodes have been discarded.
+  */
+  
+-int invalidate_inodes(struct super_block * sb)
++int invalidate_inodes(struct super_block * sb, int show)
+ {
+       int busy;
+       LIST_HEAD(throw_away);
+       spin_lock(&inode_lock);
+-      busy = invalidate_list(&inode_in_use, sb, &throw_away);
+-      busy |= invalidate_list(&inode_unused, sb, &throw_away);
+-      busy |= invalidate_list(&inode_unused_pagecache, sb, &throw_away);
+-      busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
+-      busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away);
++      busy = invalidate_list(&inode_in_use, sb, &throw_away, show);
++      busy |= invalidate_list(&inode_unused, sb, &throw_away, show);
++      busy |= invalidate_list(&inode_unused_pagecache, sb, &throw_away, show);
++      busy |= invalidate_list(&sb->s_dirty, sb, &throw_away, show);
++      busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away, show);
+       spin_unlock(&inode_lock);
+       dispose_list(&throw_away);
+@@ -713,7 +720,7 @@ int invalidate_device(kdev_t dev, int do
+                * hold).
+                */
+               shrink_dcache_sb(sb);
+-              res = invalidate_inodes(sb);
++              res = invalidate_inodes(sb, 0);
+               drop_super(sb);
+       }
+       invalidate_buffers(dev);
+--- kernel-2.4.20/fs/super.c~invalidate_show-2.4.20-rh 2003-05-24 01:56:24.000000000 -0400
++++ kernel-2.4.20-root/fs/super.c      2003-06-02 00:35:00.000000000 -0400
+@@ -943,7 +943,7 @@ void kill_super(struct super_block *sb)
+       lock_super(sb);
+       lock_kernel();
+       sb->s_flags &= ~MS_ACTIVE;
+-      invalidate_inodes(sb);  /* bad name - it should be evict_inodes() */
++      invalidate_inodes(sb, 0);  /* bad name - it should be evict_inodes() */
+       if (sop) {
+               if (sop->write_super && sb->s_dirt)
+                       sop->write_super(sb);
+@@ -952,7 +952,7 @@ void kill_super(struct super_block *sb)
+       }
+       /* Forget any remaining inodes */
+-      if (invalidate_inodes(sb)) {
++      if (invalidate_inodes(sb, 1)) {
+               printk(KERN_ERR "VFS: Busy inodes after unmount. "
+                       "Self-destruct in 5 seconds.  Have a nice day...\n");
+       }
+--- kernel-2.4.20/include/linux/fs.h~invalidate_show-2.4.20-rh 2003-06-02 00:31:47.000000000 -0400
++++ kernel-2.4.20-root/include/linux/fs.h      2003-06-02 00:35:00.000000000 -0400
+@@ -1284,7 +1284,7 @@ static inline void mark_buffer_dirty_ino
+ extern void set_buffer_flushtime(struct buffer_head *);
+ extern void balance_dirty(void);
+ extern int check_disk_change(kdev_t);
+-extern int invalidate_inodes(struct super_block *);
++extern int invalidate_inodes(struct super_block *, int);
+ extern int invalidate_device(kdev_t, int);
+ extern void invalidate_inode_pages(struct inode *);
+ extern void invalidate_inode_pages2(struct address_space *);
+--- kernel-2.4.20/fs/smbfs/inode.c~invalidate_show-2.4.20-rh   2002-11-28 18:53:15.000000000 -0500
++++ kernel-2.4.20-root/fs/smbfs/inode.c        2003-06-02 00:35:00.000000000 -0400
+@@ -167,7 +167,7 @@ smb_invalidate_inodes(struct smb_sb_info
+ {
+       VERBOSE("\n");
+       shrink_dcache_sb(SB_of(server));
+-      invalidate_inodes(SB_of(server));
++      invalidate_inodes(SB_of(server), 0);
+ }
+ /*
+
+_
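The only interface change in this patch is the extra show argument: callers pass 0 to keep the old silent behaviour and 1 to have any busy inodes logged, as kill_super() now does on its final pass. A hypothetical in-kernel caller against the patched 2.4 tree might look like this (names are invented):

    #include <linux/fs.h>
    #include <linux/kdev_t.h>
    #include <linux/kernel.h>

    /* Hypothetical out-of-tree code (e.g. a filesystem's unmount path) using the
     * new signature: show == 1 makes invalidate_list() printk a line for every
     * inode it finds busy instead of failing silently. */
    static void myfs_flush_inodes(struct super_block *sb)
    {
            if (invalidate_inodes(sb, 1))
                    printk(KERN_WARNING "myfs: busy inodes left on %s\n",
                           kdevname(sb->s_dev));
    }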
index c3ae2f5..9273c5c 100644 (file)
@@ -1,6 +1,15 @@
---- lum/fs/inode.c     Sat Oct 19 11:42:42 2002
-+++ linux-2.4.18-uml35-ext3online/fs/inode.c   Mon Oct 14 00:41:20 2002
-@@ -606,7 +553,8 @@ static void dispose_list(struct list_hea
+
+
+
+ fs/inode.c         |   21 ++++++++++++++-------
+ fs/smbfs/inode.c   |    2 +-
+ fs/super.c         |    4 ++--
+ include/linux/fs.h |    2 +-
+ 4 files changed, 18 insertions(+), 11 deletions(-)
+
+--- linux-rh-2.4.20-8/fs/inode.c~invalidate_show       2003-04-11 14:04:56.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/inode.c  2003-04-16 20:59:35.000000000 +0800
+@@ -604,7 +604,8 @@ static void dispose_list(struct list_hea
  /*
   * Invalidate all inodes for a device.
   */
@@ -10,7 +19,7 @@
  {
        struct list_head *next;
        int busy = 0, count = 0;
-@@ -631,6 +579,11 @@ static int invalidate_list(struct list_h
+@@ -629,6 +630,11 @@ static int invalidate_list(struct list_h
                        count++;
                        continue;
                }
@@ -22,7 +31,7 @@
                busy = 1;
        }
        /* only unused inodes may be cached with i_count zero */
-@@ -649,22 +601,23 @@ static int invalidate_list(struct list_h
+@@ -647,22 +653,23 @@ static int invalidate_list(struct list_h
  /**
   *    invalidate_inodes       - discard the inodes on a device
   *    @sb: superblock
@@ -51,7 +60,7 @@
        spin_unlock(&inode_lock);
  
        dispose_list(&throw_away);
-@@ -690,7 +643,7 @@ int invalidate_device(kdev_t dev, int do
+@@ -688,7 +695,7 @@ int invalidate_device(kdev_t dev, int do
                 * hold).
                 */
                shrink_dcache_sb(sb);
@@ -60,9 +69,9 @@
                drop_super(sb);
        }
        invalidate_buffers(dev);
---- lum/fs/super.c.orig        Sat Oct 19 11:42:42 2002
-+++ lum/fs/super.c     Wed Oct 30 17:16:55 2002
-@@ -936,7 +936,7 @@
+--- linux-rh-2.4.20-8/fs/super.c~invalidate_show       2003-04-11 14:04:57.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/super.c  2003-04-16 20:59:35.000000000 +0800
+@@ -943,7 +943,7 @@ void kill_super(struct super_block *sb)
        lock_super(sb);
        lock_kernel();
        sb->s_flags &= ~MS_ACTIVE;
@@ -71,7 +80,7 @@
        if (sop) {
                if (sop->write_super && sb->s_dirt)
                        sop->write_super(sb);
-@@ -945,7 +945,7 @@
+@@ -952,7 +952,7 @@ void kill_super(struct super_block *sb)
        }
  
        /* Forget any remaining inodes */
@@ -80,9 +89,9 @@
                printk(KERN_ERR "VFS: Busy inodes after unmount. "
                        "Self-destruct in 5 seconds.  Have a nice day...\n");
        }
---- lum/include/linux/fs.h     Wed Oct 30 17:10:42 2002
-+++ lum/include/linux/fs.h.orig        Tue Oct 22 23:15:00 2002
-@@ -1261,7 +1261,7 @@
+--- linux-rh-2.4.20-8/include/linux/fs.h~invalidate_show       2003-04-16 20:55:35.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/linux/fs.h  2003-04-16 20:59:35.000000000 +0800
+@@ -1283,7 +1283,7 @@ static inline void mark_buffer_dirty_ino
  extern void set_buffer_flushtime(struct buffer_head *);
  extern void balance_dirty(void);
  extern int check_disk_change(kdev_t);
  extern int invalidate_device(kdev_t, int);
  extern void invalidate_inode_pages(struct inode *);
  extern void invalidate_inode_pages2(struct address_space *);
---- lum/fs/smbfs/inode.c.orig  Mon Feb 25 12:38:09 2002
-+++ lum/fs/smbfs/inode.c       Thu Feb  6 21:34:26 2003
-@@ -166,7 +166,7 @@
+--- linux-rh-2.4.20-8/fs/smbfs/inode.c~invalidate_show 2003-04-16 20:59:48.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/smbfs/inode.c    2003-04-16 21:00:43.000000000 +0800
+@@ -167,7 +167,7 @@ smb_invalidate_inodes(struct smb_sb_info
  {
        VERBOSE("\n");
        shrink_dcache_sb(SB_of(server));
  }
  
  /*
+
+_
diff --git a/lustre/kernel_patches/patches/iod-rmap-exports-2.4.20.patch b/lustre/kernel_patches/patches/iod-rmap-exports-2.4.20.patch
new file mode 100644 (file)
index 0000000..3fdf3fd
--- /dev/null
@@ -0,0 +1,86 @@
+ fs/Makefile     |    4 +++-
+ fs/inode.c      |    4 +++-
+ mm/Makefile     |    2 +-
+ mm/page_alloc.c |    1 +
+ mm/vmscan.c     |    3 +++
+ 5 files changed, 11 insertions(+), 3 deletions(-)
+
+--- linux-rh-2.4.20-6/fs/inode.c~iod-rmap-exports      Tue Apr  1 01:01:56 2003
++++ linux-rh-2.4.20-6-braam/fs/inode.c Tue Apr  1 01:01:56 2003
+@@ -5,6 +5,7 @@
+  */
+ #include <linux/config.h>
++#include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/string.h>
+ #include <linux/mm.h>
+@@ -66,7 +67,8 @@ static LIST_HEAD(anon_hash_chain); /* fo
+  * NOTE! You also have to own the lock if you change
+  * the i_state of an inode while it is in use..
+  */
+-static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
++spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
++EXPORT_SYMBOL(inode_lock);
+ /*
+  * Statistics gathering..
+--- linux-rh-2.4.20-6/fs/Makefile~iod-rmap-exports     Tue Apr  1 01:01:56 2003
++++ linux-rh-2.4.20-6-braam/fs/Makefile        Tue Apr  1 01:02:34 2003
+@@ -1,3 +1,5 @@
++
++
+ #
+ # Makefile for the Linux filesystems.
+ #
+@@ -7,7 +9,7 @@
+ O_TARGET := fs.o
+-export-objs :=        filesystems.o open.o dcache.o buffer.o dquot.o dcookies.o
++export-objs :=        filesystems.o open.o dcache.o buffer.o dquot.o dcookies.o inode.o
+ mod-subdirs :=        nls
+ obj-y :=      open.o read_write.o devices.o file_table.o buffer.o \
+--- linux-rh-2.4.20-6/mm/vmscan.c~iod-rmap-exports     Tue Apr  1 01:01:56 2003
++++ linux-rh-2.4.20-6-braam/mm/vmscan.c        Tue Apr  1 01:01:56 2003
+@@ -15,6 +15,8 @@
+  *  O(1) rmap vm, Arjan van de ven <arjanv@redhat.com>
+  */
++#include <linux/config.h>
++#include <linux/module.h>
+ #include <linux/slab.h>
+ #include <linux/kernel_stat.h>
+ #include <linux/swap.h>
+@@ -1061,6 +1063,7 @@ void wakeup_kswapd(unsigned int gfp_mask
+       set_current_state(TASK_RUNNING);
+       remove_wait_queue(&kswapd_done, &wait);
+ }
++EXPORT_SYMBOL(wakeup_kswapd);
+ static void wakeup_memwaiters(void)
+ {
+--- linux-rh-2.4.20-6/mm/Makefile~iod-rmap-exports     Tue Apr  1 01:01:56 2003
++++ linux-rh-2.4.20-6-braam/mm/Makefile        Tue Apr  1 01:01:56 2003
+@@ -9,7 +9,7 @@
+ O_TARGET := mm.o
+-export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o
++export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o vmscan.o
+ obj-y  := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
+           vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
+--- linux-rh-2.4.20-6/mm/page_alloc.c~iod-rmap-exports Tue Apr  1 01:01:56 2003
++++ linux-rh-2.4.20-6-braam/mm/page_alloc.c    Tue Apr  1 01:01:56 2003
+@@ -27,6 +27,7 @@
+ int nr_swap_pages;
+ pg_data_t *pgdat_list;
++EXPORT_SYMBOL(pgdat_list);
+ /*
+  *
+
+_
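These exports exist so that module code (such as an I/O daemon) can reach VFS/VM internals that the stock kernel keeps static. A rough sketch of a module fragment that would now link against the patched 2.4 tree; everything here is illustrative and not part of the patch:

    #include <linux/module.h>
    #include <linux/init.h>
    #include <linux/kernel.h>
    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/spinlock.h>

    extern spinlock_t inode_lock;      /* made visible to modules by this patch */
    extern pg_data_t *pgdat_list;      /* likewise exported from mm/page_alloc.c */

    static int __init iod_demo_init(void)
    {
            spin_lock(&inode_lock);
            /* ...walk the per-superblock inode lists here, as an I/O daemon would... */
            spin_unlock(&inode_lock);

            printk(KERN_INFO "first pg_data_t at %p\n", pgdat_list);
            return 0;
    }

    static void __exit iod_demo_exit(void)
    {
    }

    module_init(iod_demo_init);
    module_exit(iod_demo_exit);
    MODULE_LICENSE("GPL");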
index 8df0d82..5ba68dd 100644 (file)
@@ -1,5 +1,12 @@
---- linux/fs/inode.c.b_io      2003-02-18 16:39:16.000000000 -0800
-+++ linux/fs/inode.c   2003-02-18 16:39:45.000000000 -0800
+ fs/Makefile     |    4 +++-
+ fs/inode.c      |    4 +++-
+ mm/Makefile     |    2 +-
+ mm/page_alloc.c |    1 +
+ mm/vmscan.c     |    3 +++
+ 5 files changed, 11 insertions(+), 3 deletions(-)
+
+--- linux-2.4.18-18/fs/inode.c~iod-rmap-exports        Thu Apr  3 00:40:01 2003
++++ linux-2.4.18-18-braam/fs/inode.c   Thu Apr  3 00:40:01 2003
 @@ -5,6 +5,7 @@
   */
  
@@ -8,7 +15,7 @@
  #include <linux/fs.h>
  #include <linux/string.h>
  #include <linux/mm.h>
-@@ -66,7 +67,8 @@
+@@ -66,7 +67,8 @@ static LIST_HEAD(anon_hash_chain); /* fo
   * NOTE! You also have to own the lock if you change
   * the i_state of an inode while it is in use..
   */
  
  /*
   * Statistics gathering..
---- linux/fs/Makefile.b_io     2003-02-18 16:39:16.000000000 -0800
-+++ linux/fs/Makefile  2003-02-18 16:39:37.000000000 -0800
-@@ -7,7 +7,7 @@
+--- linux-2.4.18-18/fs/Makefile~iod-rmap-exports       Thu Apr  3 00:40:01 2003
++++ linux-2.4.18-18-braam/fs/Makefile  Thu Apr  3 00:40:29 2003
+@@ -1,3 +1,5 @@
++
++
+ #
+ # Makefile for the Linux filesystems.
+ #
+@@ -7,7 +9,7 @@
  
  O_TARGET := fs.o
  
@@ -29,8 +42,8 @@
  mod-subdirs :=        nls
  
  obj-y :=      open.o read_write.o devices.o file_table.o buffer.o \
---- linux/mm/vmscan.c.b_io     2003-02-18 16:39:16.000000000 -0800
-+++ linux/mm/vmscan.c  2003-02-18 16:40:01.000000000 -0800
+--- linux-2.4.18-18/mm/vmscan.c~iod-rmap-exports       Thu Apr  3 00:40:01 2003
++++ linux-2.4.18-18-braam/mm/vmscan.c  Thu Apr  3 00:40:01 2003
 @@ -14,6 +14,8 @@
   *  Multiqueue VM started 5.8.00, Rik van Riel.
   */
@@ -40,7 +53,7 @@
  #include <linux/slab.h>
  #include <linux/kernel_stat.h>
  #include <linux/swap.h>
-@@ -837,6 +839,7 @@
+@@ -837,6 +839,7 @@ void wakeup_kswapd(unsigned int gfp_mask
        set_current_state(TASK_RUNNING);
        remove_wait_queue(&kswapd_done, &wait);
  }
@@ -48,8 +61,8 @@
  
  static void wakeup_memwaiters(void)
  {
---- linux/mm/Makefile.b_io     2003-02-18 16:39:16.000000000 -0800
-+++ linux/mm/Makefile  2003-02-18 16:39:37.000000000 -0800
+--- linux-2.4.18-18/mm/Makefile~iod-rmap-exports       Thu Apr  3 00:40:01 2003
++++ linux-2.4.18-18-braam/mm/Makefile  Thu Apr  3 00:40:01 2003
 @@ -9,7 +9,7 @@
  
  O_TARGET := mm.o
@@ -59,9 +72,9 @@
  
  obj-y  := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
            vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
---- linux-chaos/mm/page_alloc.c.b_io_export    Wed Jan 29 17:00:32 2003
-+++ linux-chaos/mm/page_alloc.c        Wed Jan 29 17:01:31 2003
-@@ -31,6 +31,7 @@
+--- linux-2.4.18-18/mm/page_alloc.c~iod-rmap-exports   Thu Apr  3 00:40:01 2003
++++ linux-2.4.18-18-braam/mm/page_alloc.c      Thu Apr  3 00:40:01 2003
+@@ -31,6 +31,7 @@ int nr_active_pages;
  int nr_inactive_dirty_pages;
  int nr_inactive_clean_pages;
  pg_data_t *pgdat_list;
@@ -69,3 +82,5 @@
  
  /*
   * The zone_table array is used to look up the address of the
+
+_
diff --git a/lustre/kernel_patches/patches/iod-stock-24-exports.patch b/lustre/kernel_patches/patches/iod-stock-24-exports.patch
new file mode 100644 (file)
index 0000000..2070377
--- /dev/null
@@ -0,0 +1,48 @@
+ fs/Makefile     |    2 +-
+ fs/inode.c      |    4 +++-
+ mm/page_alloc.c |    1 +
+ 3 files changed, 5 insertions(+), 2 deletions(-)
+
+--- linux-2.4.20/fs/inode.c~iod-stock-24-exports       Wed Apr  2 23:21:20 2003
++++ linux-2.4.20-braam/fs/inode.c      Wed Apr  2 23:21:20 2003
+@@ -5,6 +5,7 @@
+  */
+ #include <linux/config.h>
++#include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/string.h>
+ #include <linux/mm.h>
+@@ -66,7 +67,8 @@ static LIST_HEAD(anon_hash_chain); /* fo
+  * NOTE! You also have to own the lock if you change
+  * the i_state of an inode while it is in use..
+  */
+-static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
++spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
++EXPORT_SYMBOL(inode_lock);
+ /*
+  * Statistics gathering..
+--- linux-2.4.20/fs/Makefile~iod-stock-24-exports      Wed Apr  2 23:21:20 2003
++++ linux-2.4.20-braam/fs/Makefile     Wed Apr  2 23:21:53 2003
+@@ -7,7 +7,7 @@
+ O_TARGET := fs.o
+-export-objs :=        filesystems.o open.o dcache.o buffer.o
++export-objs :=        filesystems.o open.o dcache.o buffer.o inode.o
+ mod-subdirs :=        nls
+ obj-y :=      open.o read_write.o devices.o file_table.o buffer.o \
+--- linux-2.4.20/mm/page_alloc.c~iod-stock-24-exports  Wed Apr  2 23:21:20 2003
++++ linux-2.4.20-braam/mm/page_alloc.c Wed Apr  2 23:21:20 2003
+@@ -28,6 +28,7 @@ int nr_inactive_pages;
+ LIST_HEAD(inactive_list);
+ LIST_HEAD(active_list);
+ pg_data_t *pgdat_list;
++EXPORT_SYMBOL(pgdat_list);
+ /*
+  *
+
+_
index 669b44d..3035f55 100644 (file)
@@ -1,5 +1,10 @@
---- linux-2.4.19-hp2_pnnl4_Lv13/fs/inode.c.iod-export  2003-02-27 14:28:04.000000000 -0800
-+++ linux-2.4.19-hp2_pnnl4_Lv13/fs/inode.c     2003-03-03 13:54:59.000000000 -0800
+ fs/Makefile     |    2 +-
+ fs/inode.c      |    4 +++-
+ mm/page_alloc.c |    1 +
+ 3 files changed, 5 insertions(+), 2 deletions(-)
+
+--- linux/fs/inode.c~iod-stock-24-exports_hp   Wed Apr  9 10:44:54 2003
++++ linux-mmonroe/fs/inode.c   Wed Apr  9 10:49:50 2003
 @@ -5,6 +5,7 @@
   */
  
@@ -8,7 +13,7 @@
  #include <linux/fs.h>
  #include <linux/string.h>
  #include <linux/mm.h>
-@@ -66,7 +67,8 @@
+@@ -66,7 +67,8 @@ static LIST_HEAD(anon_hash_chain); /* fo
   * NOTE! You also have to own the lock if you change
   * the i_state of an inode while it is in use..
   */
@@ -18,8 +23,8 @@
  
  /*
   * Statistics gathering..
---- linux-2.4.19-hp2_pnnl4_Lv13/fs/Makefile.iod-export 2003-02-27 14:28:01.000000000 -0800
-+++ linux-2.4.19-hp2_pnnl4_Lv13/fs/Makefile    2003-03-03 13:56:11.000000000 -0800
+--- linux/fs/Makefile~iod-stock-24-exports_hp  Wed Apr  9 10:26:08 2003
++++ linux-mmonroe/fs/Makefile  Wed Apr  9 10:49:50 2003
 @@ -7,7 +7,7 @@
  
  O_TARGET := fs.o
  mod-subdirs :=        nls xfs
  
  obj-y :=      open.o read_write.o devices.o file_table.o buffer.o \
---- linux-2.4.19-hp2_pnnl4_Lv13/mm/page_alloc.c.iod-export     2003-02-27 14:28:01.000000000 -0800
-+++ linux-2.4.19-hp2_pnnl4_Lv13/mm/page_alloc.c        2003-03-03 13:54:59.000000000 -0800
-@@ -28,6 +28,7 @@
+--- linux/mm/page_alloc.c~iod-stock-24-exports_hp      Wed Apr  9 10:26:14 2003
++++ linux-mmonroe/mm/page_alloc.c      Wed Apr  9 10:49:50 2003
+@@ -28,6 +28,7 @@ int nr_inactive_pages;
  LIST_HEAD(inactive_list);
  LIST_HEAD(active_list);
  pg_data_t *pgdat_list;
 +EXPORT_SYMBOL(pgdat_list);
  
- /* Used to look up the address of the struct zone encoded in page->zone */
- zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
+ /*
+  *
+
+_
diff --git a/lustre/kernel_patches/patches/iopen-2.4.18.patch b/lustre/kernel_patches/patches/iopen-2.4.18.patch
new file mode 100644 (file)
index 0000000..d8dbdfb
--- /dev/null
@@ -0,0 +1,414 @@
+ 0 files changed
+
+--- linux-2.4.18-chaos52/Documentation/filesystems/ext2.txt~iopen-2.4.18       2003-04-13 15:21:33.000000000 +0800
++++ linux-2.4.18-chaos52-root/Documentation/filesystems/ext2.txt       2003-06-03 17:10:55.000000000 +0800
+@@ -35,6 +35,22 @@ resgid=n                    The group ID which may use th
+ sb=n                          Use alternate superblock at this location.
++iopen                         Makes an invisible pseudo-directory called 
++                              __iopen__ available in the root directory
++                              of the filesystem.  Allows open-by-inode-
++                              number.  i.e., inode 3145 can be accessed
++                              via /mntpt/__iopen__/3145
++
++iopen_nopriv                  This option makes the iopen directory be
++                              world-readable.  This may be safer since it
++                              allows daemons to run as an unprivileged user,
++                              however it significantly changes the security
++                              model of a Unix filesystem, since previously
++                              all files under a mode 700 directory were not
++                              generally available even if the
++                              permissions on the file itself are
++                              world-readable.
++
+ grpquota,noquota,quota,usrquota       Quota options are silently ignored by ext2.
+--- linux-2.4.18-chaos52/fs/ext3/Makefile~iopen-2.4.18 2003-06-01 03:24:07.000000000 +0800
++++ linux-2.4.18-chaos52-root/fs/ext3/Makefile 2003-06-03 17:10:55.000000000 +0800
+@@ -11,7 +11,7 @@ O_TARGET := ext3.o
+ export-objs :=        super.o inode.o xattr.o
+-obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
++obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+               ioctl.o namei.o super.o symlink.o xattr.o
+ obj-m    := $(O_TARGET)
+--- linux-2.4.18-chaos52/fs/ext3/inode.c~iopen-2.4.18  2003-06-03 17:10:21.000000000 +0800
++++ linux-2.4.18-chaos52-root/fs/ext3/inode.c  2003-06-03 17:10:55.000000000 +0800
+@@ -31,6 +31,7 @@
+ #include <linux/highuid.h>
+ #include <linux/quotaops.h>
+ #include <linux/module.h>
++#include "iopen.h"
+ /*
+  * SEARCH_FROM_ZERO forces each block allocation to search from the start
+@@ -2135,6 +2136,9 @@ void ext3_read_inode(struct inode * inod
+       struct buffer_head *bh;
+       int block;
+       
++      if (ext3_iopen_get_inode(inode))
++              return;
++      
+       if(ext3_get_inode_loc(inode, &iloc))
+               goto bad_inode;
+       bh = iloc.bh;
+--- /dev/null  2002-08-31 07:31:37.000000000 +0800
++++ linux-2.4.18-chaos52-root/fs/ext3/iopen.c  2003-06-03 17:10:55.000000000 +0800
+@@ -0,0 +1,259 @@
++/*
++ * linux/fs/ext3/iopen.c
++ *
++ * Special support for open by inode number
++ *
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ * 
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ *
++ *
++ * Invariants:
++ *   - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias
++ *     for an inode at one time.
++ *   - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry
++ *     aliases on an inode at the same time.
++ *
++ * If we have any connected dentry aliases for an inode, use one of those
++ * in iopen_lookup().  Otherwise, we instantiate a single NFSD_DISCONNECTED
++ * dentry for this inode, which thereafter will be found by the dcache
++ * when looking up this inode number in __iopen__, so we don't return here
++ * until it is gone.
++ *
++ * If we get an inode via a regular name lookup, then we "rename" the
++ * NFSD_DISCONNECTED dentry to the proper name and parent.  This ensures
++ * existing users of the disconnected dentry will continue to use the same
++ * dentry as the connected users, and there will never be both kinds of
++ * dentry aliases at one time.
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/locks.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/smp_lock.h>
++#include "iopen.h"
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++#define IOPEN_NAME_LEN        32
++
++/*
++ * This implements looking up an inode by number.
++ */
++static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry)
++{
++      struct inode *inode;
++      unsigned long ino;
++      struct list_head *lp;
++      struct dentry *alternate;
++      char buf[IOPEN_NAME_LEN];
++      
++      if (dentry->d_name.len >= IOPEN_NAME_LEN)
++              return ERR_PTR(-ENAMETOOLONG);
++
++      memcpy(buf, dentry->d_name.name, dentry->d_name.len);
++      buf[dentry->d_name.len] = 0;
++
++      if (strcmp(buf, ".") == 0)
++              ino = dir->i_ino;
++      else if (strcmp(buf, "..") == 0)
++              ino = EXT3_ROOT_INO;
++      else
++              ino = simple_strtoul(buf, 0, 0);
++
++      if ((ino != EXT3_ROOT_INO &&
++           //ino != EXT3_ACL_IDX_INO &&
++           //ino != EXT3_ACL_DATA_INO &&
++           ino < EXT3_FIRST_INO(dir->i_sb)) ||
++          ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count))
++              return ERR_PTR(-ENOENT);
++
++      inode = iget(dir->i_sb, ino);
++      if (!inode)
++              return ERR_PTR(-EACCES);
++      if (is_bad_inode(inode)) {
++              iput(inode);
++              return ERR_PTR(-ENOENT);
++      }
++
++      /* preferably return a connected dentry */
++      spin_lock(&dcache_lock);
++      list_for_each(lp, &inode->i_dentry) {
++              alternate = list_entry(lp, struct dentry, d_alias);
++              assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED));
++      }
++
++      if (!list_empty(&inode->i_dentry)) {
++              alternate = list_entry(inode->i_dentry.next, 
++                                     struct dentry, d_alias);
++              dget_locked(alternate);
++              alternate->d_vfs_flags |= DCACHE_REFERENCED;
++              iput(inode);
++              spin_unlock(&dcache_lock);
++              return alternate;
++      }
++      dentry->d_flags |= DCACHE_NFSD_DISCONNECTED;
++      spin_unlock(&dcache_lock);
++
++      d_add(dentry, inode);
++      return NULL;
++}
++
++#define do_switch(x,y) do { \
++      __typeof__ (x) __tmp = x; \
++      x = y; y = __tmp; } while (0)
++
++static inline void switch_names(struct dentry *dentry, struct dentry *target)
++{
++      const unsigned char *old_name, *new_name;
++
++      memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); 
++      old_name = target->d_name.name;
++      new_name = dentry->d_name.name;
++      if (old_name == target->d_iname)
++              old_name = dentry->d_iname;
++      if (new_name == dentry->d_iname)
++              new_name = target->d_iname;
++      target->d_name.name = new_name;
++      dentry->d_name.name = old_name;
++}
++
++/* This function is spliced into ext3_lookup and does the move of a
++ * disconnected dentry (if it exists) to a connected dentry.
++ */
++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode)
++{
++      struct dentry *tmp, *goal = NULL;
++      struct list_head *lp;
++
++      /* preferably return a connected dentry */
++      spin_lock(&dcache_lock);
++      /* verify this dentry is really new */
++      assert(!de->d_inode);
++      assert(list_empty(&de->d_subdirs));
++      assert(list_empty(&de->d_alias));
++
++
++      list_for_each(lp, &inode->i_dentry) {
++              tmp = list_entry(lp, struct dentry, d_alias);
++              if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) {
++                      assert(tmp->d_alias.next == &inode->i_dentry);
++                      assert(tmp->d_alias.prev == &inode->i_dentry);
++                      goal = tmp;
++                      dget_locked(goal);
++                      break;
++              }
++      }
++
++      if (!goal) { 
++              spin_unlock(&dcache_lock);
++              return NULL; 
++      }
++
++      /* Move the goal to the de hash queue - like d_move() */
++      goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED;
++      list_del(&goal->d_hash);
++      list_add(&goal->d_hash, &de->d_hash);
++
++      list_del(&goal->d_child);
++      list_del(&de->d_child);
++
++      /* Switch the parents and the names.. */
++      switch_names(goal, de);
++      do_switch(goal->d_parent, de->d_parent);
++      do_switch(goal->d_name.len, de->d_name.len);
++      do_switch(goal->d_name.hash, de->d_name.hash);
++
++      /* And add them back to the (new) parent lists */
++      list_add(&goal->d_child, &goal->d_parent->d_subdirs);
++      list_add(&de->d_child, &de->d_parent->d_subdirs);
++      spin_unlock(&dcache_lock);
++
++      return goal;
++}
++
++/*
++ * These are the special structures for the iopen pseudo directory.
++ */
++
++static struct inode_operations iopen_inode_operations = {
++      lookup:         iopen_lookup,           /* BKL held */
++};
++
++static struct file_operations iopen_file_operations = {
++      read:           generic_read_dir,
++};
++
++static int match_dentry(struct dentry *dentry, const char *name)
++{
++      int     len;
++
++      len = strlen(name);
++      if (dentry->d_name.len != len)
++              return 0;
++      if (strncmp(dentry->d_name.name, name, len))
++              return 0;
++      return 1;
++}
++
++/*
++ * This function is spliced into ext3_lookup and returns 1 if the file
++ * name is __iopen__ and dentry has been filled in appropriately.
++ */
++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry)
++{
++      struct inode *inode;
++
++      if (dir->i_ino != EXT3_ROOT_INO ||
++          !test_opt(dir->i_sb, IOPEN) ||
++          !match_dentry(dentry, "__iopen__"))
++              return 0;
++
++      inode = iget(dir->i_sb, EXT3_BAD_INO);
++
++      if (!inode) 
++              return 0;
++      d_add(dentry, inode);
++      return 1;
++}
++
++/*
++ * This function is spliced into read_inode; it returns 1 if inode
++ * number is the one for /__iopen__, in which case the inode is filled
++ * in appropriately.  Otherwise, this function returns 0.
++ */
++int ext3_iopen_get_inode(struct inode *inode)
++{
++      if (inode->i_ino != EXT3_BAD_INO)
++              return 0;
++
++      inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
++      if (test_opt(inode->i_sb, IOPEN_NOPRIV))
++              inode->i_mode |= 0777;
++      inode->i_uid = 0;
++      inode->i_gid = 0;
++      inode->i_nlink = 1;
++      inode->i_size = 4096;
++      inode->i_atime = CURRENT_TIME;
++      inode->i_ctime = CURRENT_TIME;
++      inode->i_mtime = CURRENT_TIME;
++      inode->u.ext3_i.i_dtime = 0;
++      inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
++                                       * (for stat), not the fs block
++                                       * size */  
++      inode->i_blocks = 0;
++      inode->i_version = 1;
++      inode->i_generation = 0;
++
++      inode->i_op = &iopen_inode_operations;
++      inode->i_fop = &iopen_file_operations;
++      inode->i_mapping->a_ops = 0;
++
++      return 1;
++}
+--- /dev/null  2002-08-31 07:31:37.000000000 +0800
++++ linux-2.4.18-chaos52-root/fs/ext3/iopen.h  2003-06-03 17:10:55.000000000 +0800
+@@ -0,0 +1,13 @@
++/*
++ * iopen.h
++ *
++ * Special support for opening files by inode number.
++ * 
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ * 
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ */
++
++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry);
++extern int ext3_iopen_get_inode(struct inode *inode);
+--- linux-2.4.18-chaos52/fs/ext3/namei.c~iopen-2.4.18  2003-06-03 17:10:20.000000000 +0800
++++ linux-2.4.18-chaos52-root/fs/ext3/namei.c  2003-06-03 17:10:55.000000000 +0800
+@@ -34,6 +34,7 @@
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+ #include <linux/slab.h>
++#include "iopen.h"
+ /*
+  * define how far ahead to read directories while searching them.
+@@ -703,16 +704,21 @@ cleanup_and_exit:
+               brelse (bh_use[ra_ptr]);
+       return ret;
+ }
++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode);
+ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
+ {
+       struct inode * inode;
+       struct ext3_dir_entry_2 * de;
+       struct buffer_head * bh;
++      struct dentry *alternate = NULL;
+       if (dentry->d_name.len > EXT3_NAME_LEN)
+               return ERR_PTR(-ENAMETOOLONG);
++      if (ext3_check_for_iopen(dir, dentry))
++              return NULL;
++
+       bh = ext3_find_entry(dentry, &de);
+       inode = NULL;
+       if (bh) {
+@@ -723,6 +729,12 @@ static struct dentry *ext3_lookup(struct
+               if (!inode)
+                       return ERR_PTR(-EACCES);
+       }
++
++      if (inode && (alternate = iopen_connect_dentry(dentry, inode))) {
++              iput(inode);
++              return alternate;
++      }
++
+       d_add(dentry, inode);
+       return NULL;
+ }
+--- linux-2.4.18-chaos52/fs/ext3/super.c~iopen-2.4.18  2003-06-03 17:10:21.000000000 +0800
++++ linux-2.4.18-chaos52-root/fs/ext3/super.c  2003-06-03 17:10:55.000000000 +0800
+@@ -820,6 +820,17 @@ static int parse_options (char * options
+                        || !strcmp (this_char, "quota")
+                        || !strcmp (this_char, "usrquota"))
+                       /* Don't do anything ;-) */ ;
++              else if (!strcmp (this_char, "iopen")) {
++                      set_opt (sbi->s_mount_opt, IOPEN);
++                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++              } else if (!strcmp (this_char, "noiopen")) {
++                      clear_opt (sbi->s_mount_opt, IOPEN);
++                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++              }
++              else if (!strcmp (this_char, "iopen_nopriv")) {
++                      set_opt (sbi->s_mount_opt, IOPEN);
++                      set_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++              }
+               else if (!strcmp (this_char, "journal")) {
+                       /* @@@ FIXME */
+                       /* Eventually we will want to be able to create
+--- linux-2.4.18-chaos52/include/linux/ext3_fs.h~iopen-2.4.18  2003-06-03 17:10:22.000000000 +0800
++++ linux-2.4.18-chaos52-root/include/linux/ext3_fs.h  2003-06-03 17:12:08.000000000 +0800
+@@ -321,6 +321,8 @@ struct ext3_inode {
+ #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
+ #define EXT3_MOUNT_INDEX              0x4000  /* Enable directory index */
++#define EXT3_MOUNT_IOPEN              0x8000  /* Allow access via iopen */
++#define EXT3_MOUNT_IOPEN_NOPRIV               0x10000 /* Make iopen world-readable */
+ #define EXT3_MOUNT_ASYNCDEL          0x20000  /* Delayed deletion */
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+
+_
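The ext2.txt hunk above already documents the user-visible behaviour; as a concrete illustration, open-by-inode-number from user space is just an ordinary open() on a synthesized path under __iopen__. A hypothetical sketch, with an invented mount point and inode number:

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>

    /* Hypothetical use of the interface added above: with the filesystem mounted
     * with "-o iopen", any inode can be opened through the invisible __iopen__
     * directory by its decimal inode number. */
    int main(void)
    {
            char path[64], buf[4096];
            unsigned long ino = 3145;
            int fd;
            ssize_t n;

            snprintf(path, sizeof(path), "/mnt/__iopen__/%lu", ino);
            fd = open(path, O_RDONLY);
            if (fd < 0) {
                    perror(path);
                    return 1;
            }
            n = read(fd, buf, sizeof(buf));
            printf("read %zd bytes from inode %lu\n", n, ino);
            close(fd);
            return 0;
    }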
diff --git a/lustre/kernel_patches/patches/iopen-2.4.20.patch b/lustre/kernel_patches/patches/iopen-2.4.20.patch
new file mode 100644 (file)
index 0000000..3038cc8
--- /dev/null
@@ -0,0 +1,423 @@
+ Documentation/filesystems/ext2.txt |   16 ++
+ fs/ext3/Makefile                   |    2 
+ fs/ext3/inode.c                    |    4 
+ fs/ext3/iopen.c                    |  240 +++++++++++++++++++++++++++++++++++++
+ fs/ext3/iopen.h                    |   15 ++
+ fs/ext3/namei.c                    |   13 +-
+ fs/ext3/super.c                    |   11 +
+ include/linux/ext3_fs.h            |    2 
+ 8 files changed, 301 insertions(+), 2 deletions(-)
+
+--- linux-2.4.20/Documentation/filesystems/ext2.txt~iopen      2001-07-11 16:44:45.000000000 -0600
++++ linux-2.4.20-braam/Documentation/filesystems/ext2.txt      2003-05-17 14:06:00.000000000 -0600
+@@ -35,6 +35,22 @@ resgid=n                    The group ID which may use th
+ sb=n                          Use alternate superblock at this location.
++iopen                         Makes an invisible pseudo-directory called 
++                              __iopen__ available in the root directory
++                              of the filesystem.  Allows open-by-inode-
++                              number.  i.e., inode 3145 can be accessed
++                              via /mntpt/__iopen__/3145
++
++iopen_nopriv                  This option makes the iopen directory be
++                              world-readable.  This may be safer since it
++                              allows daemons to run as an unprivileged user,
++                              however it significantly changes the security
++                              model of a Unix filesystem, since previously
++                              all files under a mode 700 directory were not
++                              generally available even if the
++                              permissions on the file itself are
++                              world-readable.
++
+ grpquota,noquota,quota,usrquota       Quota options are silently ignored by ext2.
+--- linux-2.4.20/fs/ext3/Makefile~iopen        2003-05-17 14:05:57.000000000 -0600
++++ linux-2.4.20-braam/fs/ext3/Makefile        2003-05-17 14:06:00.000000000 -0600
+@@ -11,7 +11,7 @@ O_TARGET := ext3.o
+ export-objs := ext3-exports.o
+-obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
++obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \
+               ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o
+ obj-m    := $(O_TARGET)
+--- linux-2.4.20/fs/ext3/inode.c~iopen 2003-05-17 14:06:00.000000000 -0600
++++ linux-2.4.20-braam/fs/ext3/inode.c 2003-05-17 14:06:00.000000000 -0600
+@@ -31,6 +31,7 @@
+ #include <linux/highuid.h>
+ #include <linux/quotaops.h>
+ #include <linux/module.h>
++#include "iopen.h"
+ /*
+  * SEARCH_FROM_ZERO forces each block allocation to search from the start
+@@ -2137,6 +2138,9 @@ void ext3_read_inode(struct inode * inod
+       struct buffer_head *bh;
+       int block;
+       
++      if (ext3_iopen_get_inode(inode))
++              return;
++      
+       if(ext3_get_inode_loc(inode, &iloc))
+               goto bad_inode;
+       bh = iloc.bh;
+--- /dev/null  2003-01-30 03:24:37.000000000 -0700
++++ linux-2.4.20-braam/fs/ext3/iopen.c 2003-05-17 22:18:55.000000000 -0600
+@@ -0,0 +1,259 @@
++/*
++ * linux/fs/ext3/iopen.c
++ *
++ * Special support for open by inode number
++ *
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ * 
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ * 
++ *
++ * Invariants:
++ *   - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias
++ *     for an inode at one time.
++ *   - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry
++ *     aliases on an inode at the same time.
++ *
++ * If we have any connected dentry aliases for an inode, use one of those
++ * in iopen_lookup().  Otherwise, we instantiate a single NFSD_DISCONNECTED
++ * dentry for this inode, which thereafter will be found by the dcache
++ * when looking up this inode number in __iopen__, so we don't return here
++ * until it is gone.
++ *
++ * If we get an inode via a regular name lookup, then we "rename" the
++ * NFSD_DISCONNECTED dentry to the proper name and parent.  This ensures
++ * existing users of the disconnected dentry will continue to use the same
++ * dentry as the connected users, and there will never be both kinds of
++ * dentry aliases at one time.
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/locks.h>
++#include <linux/ext3_jbd.h>
++#include <linux/jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/smp_lock.h>
++#include "iopen.h"
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++#define IOPEN_NAME_LEN        32
++
++/*
++ * This implements looking up an inode by number.
++ */
++static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry)
++{
++      struct inode *inode;
++      unsigned long ino;
++      struct list_head *lp;
++      struct dentry *alternate;
++      char buf[IOPEN_NAME_LEN];
++      
++      if (dentry->d_name.len >= IOPEN_NAME_LEN)
++              return ERR_PTR(-ENAMETOOLONG);
++
++      memcpy(buf, dentry->d_name.name, dentry->d_name.len);
++      buf[dentry->d_name.len] = 0;
++
++      if (strcmp(buf, ".") == 0)
++              ino = dir->i_ino;
++      else if (strcmp(buf, "..") == 0)
++              ino = EXT3_ROOT_INO;
++      else
++              ino = simple_strtoul(buf, 0, 0);
++
++      if ((ino != EXT3_ROOT_INO &&
++           //ino != EXT3_ACL_IDX_INO &&
++           //ino != EXT3_ACL_DATA_INO &&
++           ino < EXT3_FIRST_INO(dir->i_sb)) ||
++          ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count))
++              return ERR_PTR(-ENOENT);
++
++      inode = iget(dir->i_sb, ino);
++      if (!inode)
++              return ERR_PTR(-EACCES);
++      if (is_bad_inode(inode)) {
++              iput(inode);
++              return ERR_PTR(-ENOENT);
++      }
++
++      /* preferably return a connected dentry */
++      spin_lock(&dcache_lock);
++      list_for_each(lp, &inode->i_dentry) {
++              alternate = list_entry(lp, struct dentry, d_alias);
++              assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED));
++      }
++
++      if (!list_empty(&inode->i_dentry)) {
++              alternate = list_entry(inode->i_dentry.next, 
++                                     struct dentry, d_alias);
++              dget_locked(alternate);
++              alternate->d_vfs_flags |= DCACHE_REFERENCED;
++              iput(inode);
++              spin_unlock(&dcache_lock);
++              return alternate;
++      }
++      dentry->d_flags |= DCACHE_NFSD_DISCONNECTED;
++      spin_unlock(&dcache_lock);
++
++      d_add(dentry, inode);
++      return NULL;
++}
++
++#define do_switch(x,y) do { \
++      __typeof__ (x) __tmp = x; \
++      x = y; y = __tmp; } while (0)
++
++static inline void switch_names(struct dentry *dentry, struct dentry *target)
++{
++      const unsigned char *old_name, *new_name;
++
++      memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); 
++      old_name = target->d_name.name;
++      new_name = dentry->d_name.name;
++      if (old_name == target->d_iname)
++              old_name = dentry->d_iname;
++      if (new_name == dentry->d_iname)
++              new_name = target->d_iname;
++      target->d_name.name = new_name;
++      dentry->d_name.name = old_name;
++}
++
++/* This function is spliced into ext3_lookup and does the move of a
++ * disconnected dentry (if it exists) to a connected dentry.
++ */
++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode)
++{
++      struct dentry *tmp, *goal = NULL;
++      struct list_head *lp;
++
++      /* preferably return a connected dentry */
++      spin_lock(&dcache_lock);
++      /* verify this dentry is really new */
++      assert(!de->d_inode);
++      assert(list_empty(&de->d_subdirs));
++      assert(list_empty(&de->d_alias));
++
++
++      list_for_each(lp, &inode->i_dentry) {
++              tmp = list_entry(lp, struct dentry, d_alias);
++              if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) {
++                      assert(tmp->d_alias.next == &inode->i_dentry);
++                      assert(tmp->d_alias.prev == &inode->i_dentry);
++                      goal = tmp;
++                      dget_locked(goal);
++                      break;
++              }
++      }
++
++      if (!goal) { 
++              spin_unlock(&dcache_lock);
++              return NULL; 
++      }
++
++      /* Move the goal to the de hash queue - like d_move() */
++      goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED;
++      list_del(&goal->d_hash);
++      list_add(&goal->d_hash, &de->d_hash);
++
++      list_del(&goal->d_child);
++      list_del(&de->d_child);
++
++      /* Switch the parents and the names.. */
++      switch_names(goal, de);
++      do_switch(goal->d_parent, de->d_parent);
++      do_switch(goal->d_name.len, de->d_name.len);
++      do_switch(goal->d_name.hash, de->d_name.hash);
++
++      /* And add them back to the (new) parent lists */
++      list_add(&goal->d_child, &goal->d_parent->d_subdirs);
++      list_add(&de->d_child, &de->d_parent->d_subdirs);
++      spin_unlock(&dcache_lock);
++
++      return goal;
++}
++
++/*
++ * These are the special structures for the iopen pseudo directory.
++ */
++
++static struct inode_operations iopen_inode_operations = {
++      lookup:         iopen_lookup,           /* BKL held */
++};
++
++static struct file_operations iopen_file_operations = {
++      read:           generic_read_dir,
++};
++
++static int match_dentry(struct dentry *dentry, const char *name)
++{
++      int     len;
++
++      len = strlen(name);
++      if (dentry->d_name.len != len)
++              return 0;
++      if (strncmp(dentry->d_name.name, name, len))
++              return 0;
++      return 1;
++}
++
++/*
++ * This function is spliced into ext3_lookup and returns 1 if the file
++ * name is __iopen__ and the dentry has been filled in appropriately.
++ */
++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry)
++{
++      struct inode *inode;
++
++      if (dir->i_ino != EXT3_ROOT_INO ||
++          !test_opt(dir->i_sb, IOPEN) ||
++          !match_dentry(dentry, "__iopen__"))
++              return 0;
++
++      inode = iget(dir->i_sb, EXT3_BAD_INO);
++
++      if (!inode) 
++              return 0;
++      d_add(dentry, inode);
++      return 1;
++}
++
++/*
++ * This function is spliced into read_inode; it returns 1 if the inode
++ * number is the one for /__iopen__, in which case the inode is filled
++ * in appropriately.  Otherwise, this function returns 0.
++ */
++int ext3_iopen_get_inode(struct inode *inode)
++{
++      if (inode->i_ino != EXT3_BAD_INO)
++              return 0;
++
++      inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
++      if (test_opt(inode->i_sb, IOPEN_NOPRIV))
++              inode->i_mode |= 0777;
++      inode->i_uid = 0;
++      inode->i_gid = 0;
++      inode->i_nlink = 1;
++      inode->i_size = 4096;
++      inode->i_atime = CURRENT_TIME;
++      inode->i_ctime = CURRENT_TIME;
++      inode->i_mtime = CURRENT_TIME;
++      inode->u.ext3_i.i_dtime = 0;
++      inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
++                                       * (for stat), not the fs block
++                                       * size */  
++      inode->i_blocks = 0;
++      inode->i_version = 1;
++      inode->i_generation = 0;
++
++      inode->i_op = &iopen_inode_operations;
++      inode->i_fop = &iopen_file_operations;
++      inode->i_mapping->a_ops = 0;
++
++      return 1;
++}
+--- /dev/null  2003-01-30 03:24:37.000000000 -0700
++++ linux-2.4.20-braam/fs/ext3/iopen.h 2003-05-17 14:06:00.000000000 -0600
+@@ -0,0 +1,13 @@
++/*
++ * iopen.h
++ *
++ * Special support for opening files by inode number.
++ * 
++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu).
++ * 
++ * This file may be redistributed under the terms of the GNU General
++ * Public License.
++ */
++
++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry);
++extern int ext3_iopen_get_inode(struct inode *inode);
+--- linux-2.4.20/fs/ext3/namei.c~iopen 2003-05-17 14:05:59.000000000 -0600
++++ linux-2.4.20-braam/fs/ext3/namei.c 2003-05-17 22:23:08.000000000 -0600
+@@ -35,7 +35,7 @@
+ #include <linux/string.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+-
++#include "iopen.h"
+ /*
+  * define how far ahead to read directories while searching them.
+@@ -921,16 +921,21 @@ errout:
+       return NULL;
+ }
+ #endif
++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode);
+ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
+ {
+       struct inode * inode;
+       struct ext3_dir_entry_2 * de;
+       struct buffer_head * bh;
++      struct dentry *alternate = NULL;
+       if (dentry->d_name.len > EXT3_NAME_LEN)
+               return ERR_PTR(-ENAMETOOLONG);
++      if (ext3_check_for_iopen(dir, dentry))
++              return NULL;
++
+       bh = ext3_find_entry(dentry, &de);
+       inode = NULL;
+       if (bh) {
+@@ -942,6 +947,12 @@ static struct dentry *ext3_lookup(struct
+                       return ERR_PTR(-EACCES);
+               }
+       }
++
++      if (inode && (alternate = iopen_connect_dentry(dentry, inode))) {
++              iput(inode);
++              return alternate;
++      }
++
+       d_add(dentry, inode);
+       return NULL;
+ }
+--- linux-2.4.20/fs/ext3/super.c~iopen 2003-05-17 14:05:59.000000000 -0600
++++ linux-2.4.20-braam/fs/ext3/super.c 2003-05-17 14:06:00.000000000 -0600
+@@ -820,6 +820,17 @@ static int parse_options (char * options
+                        || !strcmp (this_char, "quota")
+                        || !strcmp (this_char, "usrquota"))
+                       /* Don't do anything ;-) */ ;
++              else if (!strcmp (this_char, "iopen")) {
++                      set_opt (sbi->s_mount_opt, IOPEN);
++                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++              } else if (!strcmp (this_char, "noiopen")) {
++                      clear_opt (sbi->s_mount_opt, IOPEN);
++                      clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++              }
++              else if (!strcmp (this_char, "iopen_nopriv")) {
++                      set_opt (sbi->s_mount_opt, IOPEN);
++                      set_opt (sbi->s_mount_opt, IOPEN_NOPRIV);
++              }
+               else if (!strcmp (this_char, "journal")) {
+                       /* @@@ FIXME */
+                       /* Eventually we will want to be able to create
+--- linux-2.4.20/include/linux/ext3_fs.h~iopen 2003-05-17 14:05:59.000000000 -0600
++++ linux-2.4.20-braam/include/linux/ext3_fs.h 2003-05-17 14:06:29.000000000 -0600
+@@ -322,6 +322,8 @@ struct ext3_inode {
+ #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
+ #define EXT3_MOUNT_XATTR_USER         0x4000  /* Extended user attributes */
++#define EXT3_MOUNT_IOPEN              0x8000  /* Allow access via iopen */
++#define EXT3_MOUNT_IOPEN_NOPRIV               0x10000 /* Make iopen world-readable */
+ #define EXT3_MOUNT_ASYNCDEL           0x20000 /* Delayed deletion */
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+
+_
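
With the iopen-2.4.20 patch above applied and the filesystem mounted with "iopen", any inode can be opened through the __iopen__ pseudo-directory described in the Documentation hunk. A minimal sketch follows; the mount point and inode number are illustrative assumptions.

    /*
     * Minimal sketch: open inode 3145 on an ext3 filesystem assumed to be
     * mounted at /mnt/ext3 with the "iopen" option.
     */
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>

    static int open_by_ino(const char *mntpt, unsigned long ino)
    {
            char path[256];

            /* The patch exposes inode N as <mntpt>/__iopen__/N */
            snprintf(path, sizeof(path), "%s/__iopen__/%lu", mntpt, ino);
            return open(path, O_RDONLY);
    }

    int main(void)
    {
            int fd = open_by_ino("/mnt/ext3", 3145);

            if (fd < 0) {
                    perror("open by inode");
                    return 1;
            }
            close(fd);
            return 0;
    }
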
diff --git a/lustre/kernel_patches/patches/kmem_cache_validate_2.4.20-rh.patch b/lustre/kernel_patches/patches/kmem_cache_validate_2.4.20-rh.patch
new file mode 100644 (file)
index 0000000..8113828
--- /dev/null
@@ -0,0 +1,124 @@
+
+
+
+ arch/i386/mm/init.c  |    6 +++++
+ arch/ia64/mm/init.c  |    6 +++++
+ include/linux/slab.h |    1 
+ kernel/ksyms.c       |    1 
+ mm/slab.c            |   53 +++++++++++++++++++++++++++++++++++++++++++++++++++
+ 5 files changed, 67 insertions(+)
+
+--- rh-2.4.20/arch/i386/mm/init.c~kmem_cache_validate_2.4.20-rh        2003-04-11 14:05:09.000000000 +0800
++++ rh-2.4.20-root/arch/i386/mm/init.c 2003-04-13 10:51:58.000000000 +0800
+@@ -43,6 +43,12 @@ unsigned long highstart_pfn, highend_pfn
+ static unsigned long totalram_pages;
+ static unsigned long totalhigh_pages;
++struct page *check_get_page(unsigned long kaddr)
++{
++#warning FIXME: Lustre team, is this solid?
++      return virt_to_page(kaddr);
++}
++
+ int do_check_pgt_cache(int low, int high)
+ {
+       return 0;       /* FIXME! */
+--- rh-2.4.20/arch/ia64/mm/init.c~kmem_cache_validate_2.4.20-rh        2003-04-11 14:04:43.000000000 +0800
++++ rh-2.4.20-root/arch/ia64/mm/init.c 2003-04-13 10:51:58.000000000 +0800
+@@ -45,6 +45,12 @@ unsigned long vmalloc_end = VMALLOC_END_
+ static struct page *vmem_map;
+ static unsigned long num_dma_physpages;
++struct page *check_get_page(unsigned long kaddr)
++{
++#warning FIXME: Lustre team, is this solid?
++      return virt_to_page(kaddr);
++}
++
+ int
+ do_check_pgt_cache (int low, int high)
+ {
+--- rh-2.4.20/include/linux/slab.h~kmem_cache_validate_2.4.20-rh       2003-04-12 15:46:39.000000000 +0800
++++ rh-2.4.20-root/include/linux/slab.h        2003-04-13 10:53:00.000000000 +0800
+@@ -57,6 +57,7 @@ extern int kmem_cache_destroy(kmem_cache
+ extern int kmem_cache_shrink(kmem_cache_t *);
+ extern void *kmem_cache_alloc(kmem_cache_t *, int);
+ extern void kmem_cache_free(kmem_cache_t *, void *);
++extern int kmem_cache_validate(kmem_cache_t *cachep, void *objp);
+ extern unsigned int kmem_cache_size(kmem_cache_t *);
+ extern void *kmalloc(size_t, int);
+--- rh-2.4.20/kernel/ksyms.c~kmem_cache_validate_2.4.20-rh     2003-04-12 16:15:26.000000000 +0800
++++ rh-2.4.20-root/kernel/ksyms.c      2003-04-13 10:54:10.000000000 +0800
+@@ -123,6 +123,7 @@ EXPORT_SYMBOL(kmem_cache_destroy);
+ EXPORT_SYMBOL(kmem_cache_shrink);
+ EXPORT_SYMBOL(kmem_cache_alloc);
+ EXPORT_SYMBOL(kmem_cache_free);
++EXPORT_SYMBOL(kmem_cache_validate);
+ EXPORT_SYMBOL(kmem_cache_size);
+ EXPORT_SYMBOL(kmalloc);
+ EXPORT_SYMBOL(kfree);
+--- rh-2.4.20/mm/slab.c~kmem_cache_validate_2.4.20-rh  2003-04-11 14:04:56.000000000 +0800
++++ rh-2.4.20-root/mm/slab.c   2003-04-13 10:51:58.000000000 +0800
+@@ -1208,6 +1208,59 @@ failed:
+  * Called with the cache-lock held.
+  */
++extern struct page *check_get_page(unsigned long kaddr);
++struct page *page_mem_map(struct page *page);
++static int kmem_check_cache_obj (kmem_cache_t * cachep,
++                               slab_t *slabp, void * objp)
++{
++      int i;
++      unsigned int objnr;
++
++#if DEBUG
++      if (cachep->flags & SLAB_RED_ZONE) {
++              objp -= BYTES_PER_WORD;
++              if ( *(unsigned long *)objp != RED_MAGIC2)
++                      /* Either write before start, or a double free. */
++                      return 0;
++              if (*(unsigned long *)(objp+cachep->objsize -
++                              BYTES_PER_WORD) != RED_MAGIC2)
++                      /* Either write past end, or a double free. */
++                      return 0;
++      }
++#endif
++
++      objnr = (objp-slabp->s_mem)/cachep->objsize;
++      if (objnr >= cachep->num)
++              return 0;
++      if (objp != slabp->s_mem + objnr*cachep->objsize)
++              return 0;
++
++      /* Check slab's freelist to see if this obj is there. */
++      for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
++              if (i == objnr)
++                      return 0;
++      }
++      return 1;
++}
++
++
++int kmem_cache_validate(kmem_cache_t *cachep, void *objp)
++{
++      struct page *page = check_get_page((unsigned long)objp);
++
++      if (!VALID_PAGE(page))
++              return 0;
++
++      if (!PageSlab(page))
++              return 0;
++
++      /* XXX check for freed slab objects ? */
++      if (!kmem_check_cache_obj(cachep, GET_PAGE_SLAB(page), objp))
++              return 0;
++
++      return (cachep == GET_PAGE_CACHE(page));
++}
++
+ #if DEBUG
+ static int kmem_extra_free_checks (kmem_cache_t * cachep,
+                       slab_t *slabp, void * objp)
+
+_
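
The patch above exports kmem_cache_validate(), which returns non-zero only when the given pointer refers to a currently allocated object of the given slab cache (valid page, PageSlab set, correct object offset, not on the slab freelist, and owned by that cache). A minimal in-kernel sketch of the intended use follows, assuming a cache created elsewhere with kmem_cache_create(); the cache and object names are placeholders.

    /*
     * In-kernel sketch (2.4-era module code, assuming the patch above is
     * applied): reject pointers that are not live objects of a slab cache.
     */
    #include <linux/kernel.h>
    #include <linux/errno.h>
    #include <linux/slab.h>

    static kmem_cache_t *my_cachep;   /* created with kmem_cache_create() */

    static int use_object_safely(void *objp)
    {
            /* Returns 0 for a pointer outside a slab page, a freed object,
             * or an object belonging to a different cache. */
            if (!kmem_cache_validate(my_cachep, objp)) {
                    printk(KERN_WARNING "invalid slab object %p\n", objp);
                    return -EINVAL;
            }
            /* ...objp can now be treated as a valid my_cachep object... */
            return 0;
    }
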
diff --git a/lustre/kernel_patches/patches/kmem_cache_validate_2.4.20.patch b/lustre/kernel_patches/patches/kmem_cache_validate_2.4.20.patch
new file mode 100644 (file)
index 0000000..e802312
--- /dev/null
@@ -0,0 +1,116 @@
+ 0 files changed
+
+--- linux-2.4.20-8/arch/ia64/mm/init.c~kmem_cache_validate_2.4.20      2002-11-29 07:53:09.000000000 +0800
++++ linux-2.4.20-8-root/arch/ia64/mm/init.c    2003-06-01 01:44:13.000000000 +0800
+@@ -45,6 +45,12 @@ static struct page *vmem_map;
+ static unsigned long num_dma_physpages;
+ #endif
++struct page *check_get_page(unsigned long kaddr)
++{
++#warning FIXME: Lustre team, is this solid?
++      return virt_to_page(kaddr);
++}
++
+ int
+ do_check_pgt_cache (int low, int high)
+ {
+--- linux-2.4.20-8/include/linux/slab.h~kmem_cache_validate_2.4.20     2002-11-29 07:53:15.000000000 +0800
++++ linux-2.4.20-8-root/include/linux/slab.h   2003-06-01 01:44:13.000000000 +0800
+@@ -56,6 +56,7 @@ extern kmem_cache_t *kmem_cache_create(c
+ extern int kmem_cache_destroy(kmem_cache_t *);
+ extern int kmem_cache_shrink(kmem_cache_t *);
+ extern void *kmem_cache_alloc(kmem_cache_t *, int);
++extern int kmem_cache_validate(kmem_cache_t *cachep, void *objp);
+ extern void kmem_cache_free(kmem_cache_t *, void *);
+ extern unsigned int kmem_cache_size(kmem_cache_t *);
+--- linux-2.4.20-8/kernel/ksyms.c~kmem_cache_validate_2.4.20   2003-06-01 01:44:11.000000000 +0800
++++ linux-2.4.20-8-root/kernel/ksyms.c 2003-06-01 01:44:13.000000000 +0800
+@@ -103,6 +103,7 @@ EXPORT_SYMBOL(kmem_find_general_cachep);
+ EXPORT_SYMBOL(kmem_cache_create);
+ EXPORT_SYMBOL(kmem_cache_destroy);
+ EXPORT_SYMBOL(kmem_cache_shrink);
++EXPORT_SYMBOL(kmem_cache_validate);
+ EXPORT_SYMBOL(kmem_cache_alloc);
+ EXPORT_SYMBOL(kmem_cache_free);
+ EXPORT_SYMBOL(kmem_cache_size);
+--- linux-2.4.20-8/mm/slab.c~kmem_cache_validate_2.4.20        2003-06-01 01:44:08.000000000 +0800
++++ linux-2.4.20-8-root/mm/slab.c      2003-06-01 01:44:13.000000000 +0800
+@@ -1205,6 +1205,59 @@ failed:
+  * Called with the cache-lock held.
+  */
++extern struct page *check_get_page(unsigned long kaddr);
++struct page *page_mem_map(struct page *page);
++static int kmem_check_cache_obj (kmem_cache_t * cachep,
++                               slab_t *slabp, void * objp)
++{
++      int i;
++      unsigned int objnr;
++
++#if DEBUG
++      if (cachep->flags & SLAB_RED_ZONE) {
++              objp -= BYTES_PER_WORD;
++              if ( *(unsigned long *)objp != RED_MAGIC2)
++                      /* Either write before start, or a double free. */
++                      return 0;
++              if (*(unsigned long *)(objp+cachep->objsize -
++                              BYTES_PER_WORD) != RED_MAGIC2)
++                      /* Either write past end, or a double free. */
++                      return 0;
++      }
++#endif
++
++      objnr = (objp-slabp->s_mem)/cachep->objsize;
++      if (objnr >= cachep->num)
++              return 0;
++      if (objp != slabp->s_mem + objnr*cachep->objsize)
++              return 0;
++
++      /* Check slab's freelist to see if this obj is there. */
++      for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
++              if (i == objnr)
++                      return 0;
++      }
++      return 1;
++}
++
++
++int kmem_cache_validate(kmem_cache_t *cachep, void *objp)
++{
++      struct page *page = check_get_page((unsigned long)objp);
++
++      if (!VALID_PAGE(page))
++              return 0;
++
++      if (!PageSlab(page))
++              return 0;
++
++      /* XXX check for freed slab objects ? */
++      if (!kmem_check_cache_obj(cachep, GET_PAGE_SLAB(page), objp))
++              return 0;
++
++      return (cachep == GET_PAGE_CACHE(page));
++}
++
+ #if DEBUG
+ static int kmem_extra_free_checks (kmem_cache_t * cachep,
+                       slab_t *slabp, void * objp)
+--- linux-2.4.20-8/arch/i386/mm/init.c~kmem_cache_validate_2.4.20      2002-11-29 07:53:09.000000000 +0800
++++ linux-2.4.20-8-root/arch/i386/mm/init.c    2003-06-01 01:46:43.000000000 +0800
+@@ -43,6 +43,12 @@ unsigned long highstart_pfn, highend_pfn
+ static unsigned long totalram_pages;
+ static unsigned long totalhigh_pages;
++struct page *check_get_page(unsigned long kaddr)
++{
++#warning FIXME: Lustre team, is this solid?
++              return virt_to_page(kaddr);
++}
++
+ int do_check_pgt_cache(int low, int high)
+ {
+       int freed = 0;
+
+_
index 03385a7..04b49ea 100644 (file)
@@ -1,12 +1,13 @@
+ arch/i386/mm/init.c  |    6 +++++
  arch/ia64/mm/init.c  |    6 +++++
  include/linux/slab.h |    1 
  kernel/ksyms.c       |    1 
  mm/slab.c            |   53 +++++++++++++++++++++++++++++++++++++++++++++++++++
-4 files changed, 61 insertions(+)
+5 files changed, 67 insertions(+)
 
---- linux-2.4.19-hp2_pnnl2/arch/ia64/mm/init.c~kmem_cache_validate_hp  Sun Jan 19 18:59:23 2003
-+++ linux-2.4.19-hp2_pnnl2-root/arch/ia64/mm/init.c    Sun Jan 19 18:59:24 2003
-@@ -44,6 +44,12 @@ unsigned long vmalloc_end = VMALLOC_END_
+--- linux/arch/ia64/mm/init.c~kmem_cache_validate_hp   2003-04-11 14:24:25.000000000 +0800
++++ linux-root/arch/ia64/mm/init.c     2003-05-16 20:03:56.000000000 +0800
+@@ -45,6 +45,12 @@ unsigned long vmalloc_end = VMALLOC_END_
  static struct page *vmem_map;
  static unsigned long num_dma_physpages;
  
@@ -19,8 +20,8 @@
  int
  do_check_pgt_cache (int low, int high)
  {
---- linux-2.4.19-hp2_pnnl2/include/linux/slab.h~kmem_cache_validate_hp Sun Jan 19 18:59:23 2003
-+++ linux-2.4.19-hp2_pnnl2-root/include/linux/slab.h   Sun Jan 19 19:01:07 2003
+--- linux/include/linux/slab.h~kmem_cache_validate_hp  2002-11-29 07:53:15.000000000 +0800
++++ linux-root/include/linux/slab.h    2003-05-16 20:03:56.000000000 +0800
 @@ -56,6 +56,7 @@ extern kmem_cache_t *kmem_cache_create(c
  extern int kmem_cache_destroy(kmem_cache_t *);
  extern int kmem_cache_shrink(kmem_cache_t *);
@@ -29,9 +30,9 @@
  extern void kmem_cache_free(kmem_cache_t *, void *);
  extern unsigned int kmem_cache_size(kmem_cache_t *);
  
---- linux-2.4.19-hp2_pnnl2/kernel/ksyms.c~kmem_cache_validate_hp       Sun Jan 19 18:59:23 2003
-+++ linux-2.4.19-hp2_pnnl2-root/kernel/ksyms.c Sun Jan 19 19:00:32 2003
-@@ -118,6 +118,7 @@ EXPORT_SYMBOL(kmem_find_general_cachep);
+--- linux/kernel/ksyms.c~kmem_cache_validate_hp        2003-05-16 20:03:55.000000000 +0800
++++ linux-root/kernel/ksyms.c  2003-05-16 20:03:56.000000000 +0800
+@@ -119,6 +119,7 @@ EXPORT_SYMBOL(kmem_find_general_cachep);
  EXPORT_SYMBOL(kmem_cache_create);
  EXPORT_SYMBOL(kmem_cache_destroy);
  EXPORT_SYMBOL(kmem_cache_shrink);
@@ -39,9 +40,9 @@
  EXPORT_SYMBOL(kmem_cache_alloc);
  EXPORT_SYMBOL(kmem_cache_free);
  EXPORT_SYMBOL(kmem_cache_size);
---- linux-2.4.19-hp2_pnnl2/mm/slab.c~kmem_cache_validate_hp    Sun Jan 19 18:59:23 2003
-+++ linux-2.4.19-hp2_pnnl2-root/mm/slab.c      Sun Jan 19 18:59:24 2003
-@@ -1207,6 +1207,59 @@ failed:
+--- linux/mm/slab.c~kmem_cache_validate_hp     2002-11-29 07:53:15.000000000 +0800
++++ linux-root/mm/slab.c       2003-05-16 20:03:56.000000000 +0800
+@@ -1205,6 +1205,59 @@ failed:
   * Called with the cache-lock held.
   */
  
  #if DEBUG
  static int kmem_extra_free_checks (kmem_cache_t * cachep,
                        slab_t *slabp, void * objp)
+--- linux/arch/i386/mm/init.c~kmem_cache_validate_hp   2003-05-16 20:03:22.000000000 +0800
++++ linux-root/arch/i386/mm/init.c     2003-05-16 20:06:16.000000000 +0800
+@@ -42,6 +42,12 @@ mmu_gather_t mmu_gathers[NR_CPUS];
+ unsigned long highstart_pfn, highend_pfn;
+ static unsigned long totalram_pages;
+ static unsigned long totalhigh_pages;
++
++struct page *check_get_page(unsigned long kaddr)
++{
++#warning FIXME: Lustre team, is this solid?
++       return virt_to_page(kaddr);
++}
+ int do_check_pgt_cache(int low, int high)
+ {
 
 _
@@ -1,30 +1,7 @@
-Linux Extended Attributes -- Kernel Patch
-24 April 2002, 11:31:18
+ 0 files changed
 
-
-This patch is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This patch is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this patch; if not, write to the Free Software Foundation,
-Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-
-After extracting the linux-2.4.18.tar.gz package, apply this patch as follows:
-
-       cd linux
-       patch -p1 < ../linux-2.4.18ea-0.8.26.patch
-
-diff -Nur linux-2.4.18/fs/ext3/ialloc.c linux-2.4.18ea/fs/ext3/ialloc.c
---- linux-2.4.18/fs/ext3/ialloc.c      Sun Feb 24 04:42:59 2002
-+++ linux-2.4.18ea/fs/ext3/ialloc.c    Sun Feb 24 04:34:43 2002
+--- linux-2.4.18-18/fs/ext3/ialloc.c~linux-2.4.18ea-0.8.26     2003-04-20 16:14:31.000000000 +0800
++++ linux-2.4.18-18-root/fs/ext3/ialloc.c      2003-04-20 16:14:31.000000000 +0800
 @@ -17,6 +17,7 @@
  #include <linux/jbd.h>
  #include <linux/ext3_fs.h>
@@ -33,7 +10,7 @@ diff -Nur linux-2.4.18/fs/ext3/ialloc.c linux-2.4.18ea/fs/ext3/ialloc.c
  #include <linux/stat.h>
  #include <linux/string.h>
  #include <linux/locks.h>
-@@ -216,6 +217,7 @@
+@@ -216,6 +217,7 @@ void ext3_free_inode (handle_t *handle, 
         * as writing the quota to disk may need the lock as well.
         */
        DQUOT_INIT(inode);
@@ -41,9 +18,8 @@ diff -Nur linux-2.4.18/fs/ext3/ialloc.c linux-2.4.18ea/fs/ext3/ialloc.c
        DQUOT_FREE_INODE(inode);
        DQUOT_DROP(inode);
  
-diff -Nur linux-2.4.18/fs/ext3/inode.c linux-2.4.18ea/fs/ext3/inode.c
---- linux-2.4.18/fs/ext3/inode.c       Sun Feb 24 04:42:59 2002
-+++ linux-2.4.18ea/fs/ext3/inode.c     Thu Mar 14 21:51:59 2002
+--- linux-2.4.18-18/fs/ext3/inode.c~linux-2.4.18ea-0.8.26      2003-04-20 16:14:31.000000000 +0800
++++ linux-2.4.18-18-root/fs/ext3/inode.c       2003-04-20 16:14:31.000000000 +0800
 @@ -39,6 +39,18 @@
   */
  #undef SEARCH_FROM_ZERO
@@ -72,7 +48,7 @@ diff -Nur linux-2.4.18/fs/ext3/inode.c linux-2.4.18ea/fs/ext3/inode.c
                       struct inode *inode, struct buffer_head *bh,
                       int blocknr)
  {
-@@ -164,9 +176,7 @@
+@@ -164,9 +176,7 @@ void ext3_delete_inode (struct inode * i
  {
        handle_t *handle;
        
@@ -83,7 +59,7 @@ diff -Nur linux-2.4.18/fs/ext3/inode.c linux-2.4.18ea/fs/ext3/inode.c
                goto no_delete;
  
        lock_kernel();
-@@ -1845,6 +1855,8 @@
+@@ -1861,6 +1871,8 @@ void ext3_truncate(struct inode * inode)
        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
            S_ISLNK(inode->i_mode)))
                return;
@@ -92,7 +68,7 @@ diff -Nur linux-2.4.18/fs/ext3/inode.c linux-2.4.18ea/fs/ext3/inode.c
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return;
  
-@@ -1992,8 +2004,6 @@
+@@ -2008,8 +2020,6 @@ int ext3_get_inode_loc (struct inode *in
        struct ext3_group_desc * gdp;
                
        if ((inode->i_ino != EXT3_ROOT_INO &&
@@ -101,7 +77,7 @@ diff -Nur linux-2.4.18/fs/ext3/inode.c linux-2.4.18ea/fs/ext3/inode.c
                inode->i_ino != EXT3_JOURNAL_INO &&
                inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
                inode->i_ino > le32_to_cpu(
-@@ -2120,10 +2130,7 @@
+@@ -2136,10 +2146,7 @@ void ext3_read_inode(struct inode * inod
  
        brelse (iloc.bh);
  
@@ -113,7 +89,7 @@ diff -Nur linux-2.4.18/fs/ext3/inode.c linux-2.4.18ea/fs/ext3/inode.c
                inode->i_op = &ext3_file_inode_operations;
                inode->i_fop = &ext3_file_operations;
                inode->i_mapping->a_ops = &ext3_aops;
-@@ -2131,7 +2138,7 @@
+@@ -2147,7 +2154,7 @@ void ext3_read_inode(struct inode * inod
                inode->i_op = &ext3_dir_inode_operations;
                inode->i_fop = &ext3_dir_operations;
        } else if (S_ISLNK(inode->i_mode)) {
@@ -122,10 +98,9 @@ diff -Nur linux-2.4.18/fs/ext3/inode.c linux-2.4.18ea/fs/ext3/inode.c
                        inode->i_op = &ext3_fast_symlink_inode_operations;
                else {
                        inode->i_op = &page_symlink_inode_operations;
-diff -Nur linux-2.4.18/fs/ext3/namei.c linux-2.4.18ea/fs/ext3/namei.c
---- linux-2.4.18/fs/ext3/namei.c       Fri Nov  9 23:25:04 2001
-+++ linux-2.4.18ea/fs/ext3/namei.c     Mon Mar 11 03:27:00 2002
-@@ -23,6 +23,7 @@
+--- linux-2.4.18-18/fs/ext3/namei.c~linux-2.4.18ea-0.8.26      2003-04-20 16:14:31.000000000 +0800
++++ linux-2.4.18-18-root/fs/ext3/namei.c       2003-04-20 16:14:31.000000000 +0800
+@@ -27,6 +27,7 @@
  #include <linux/sched.h>
  #include <linux/ext3_fs.h>
  #include <linux/ext3_jbd.h>
@@ -133,15 +108,15 @@ diff -Nur linux-2.4.18/fs/ext3/namei.c linux-2.4.18ea/fs/ext3/namei.c
  #include <linux/fcntl.h>
  #include <linux/stat.h>
  #include <linux/string.h>
-@@ -435,6 +435,7 @@ static int ext3_add_nondir(handle_t *han
-                       return 0;
-               }
+@@ -1183,6 +1184,7 @@ static int ext3_add_nondir(handle_t *han
+               d_instantiate(dentry, inode);
+               return 0;
        }
 +      ext3_xattr_drop_inode(handle, inode);
        ext3_dec_count(handle, inode);
        iput(inode);
        return err;
-@@ -514,7 +519,7 @@
+@@ -1268,15 +1270,14 @@ static int ext3_mkdir(struct inode * dir
        if (IS_SYNC(dir))
                handle->h_sync = 1;
  
@@ -150,7 +125,7 @@ diff -Nur linux-2.4.18/fs/ext3/namei.c linux-2.4.18ea/fs/ext3/namei.c
        err = PTR_ERR(inode);
        if (IS_ERR(inode))
                goto out_stop;
-@@ -522,7 +527,6 @@
        inode->i_op = &ext3_dir_inode_operations;
        inode->i_fop = &ext3_dir_operations;
 -      inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize;
@@ -159,7 +134,7 @@ diff -Nur linux-2.4.18/fs/ext3/namei.c linux-2.4.18ea/fs/ext3/namei.c
        dir_block = ext3_bread (handle, inode, 0, 1, &err);
        if (!dir_block) {
                inode->i_nlink--; /* is this nlink == 0? */
-@@ -549,9 +553,6 @@
+@@ -1303,9 +1304,6 @@ static int ext3_mkdir(struct inode * dir
        BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
        ext3_journal_dirty_metadata(handle, dir_block);
        brelse (dir_block);
@@ -169,16 +144,17 @@ diff -Nur linux-2.4.18/fs/ext3/namei.c linux-2.4.18ea/fs/ext3/namei.c
        ext3_mark_inode_dirty(handle, inode);
        err = ext3_add_entry (handle, dentry, inode);
        if (err)
-@@ -917,5 +919,5 @@
+@@ -1671,7 +1669,7 @@ static int ext3_symlink (struct inode * 
+       if (IS_ERR(inode))
                goto out_stop;
  
 -      if (l > sizeof (inode->u.ext3_i.i_data)) {
 +      if (l > sizeof(EXT3_I(inode)->i_data)) {
                inode->i_op = &page_symlink_inode_operations;
                inode->i_mapping->a_ops = &ext3_aops;
-diff -Nur linux-2.4.18/fs/ext3/super.c linux-2.4.18ea/fs/ext3/super.c
---- linux-2.4.18/fs/ext3/super.c       Sun Feb 24 04:42:59 2002
-+++ linux-2.4.18ea/fs/ext3/super.c     Thu Apr  4 21:41:05 2002
+               /*
+--- linux-2.4.18-18/fs/ext3/super.c~linux-2.4.18ea-0.8.26      2003-04-20 16:14:31.000000000 +0800
++++ linux-2.4.18-18-root/fs/ext3/super.c       2003-04-20 16:14:31.000000000 +0800
 @@ -24,6 +24,7 @@
  #include <linux/jbd.h>
  #include <linux/ext3_fs.h>
@@ -187,7 +163,7 @@ diff -Nur linux-2.4.18/fs/ext3/super.c linux-2.4.18ea/fs/ext3/super.c
  #include <linux/slab.h>
  #include <linux/init.h>
  #include <linux/locks.h>
-@@ -404,6 +405,7 @@
+@@ -404,6 +405,7 @@ void ext3_put_super (struct super_block 
        kdev_t j_dev = sbi->s_journal->j_dev;
        int i;
  
@@ -195,7 +171,7 @@ diff -Nur linux-2.4.18/fs/ext3/super.c linux-2.4.18ea/fs/ext3/super.c
        journal_destroy(sbi->s_journal);
        if (!(sb->s_flags & MS_RDONLY)) {
                EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-@@ -1734,14 +1772,25 @@
+@@ -1748,14 +1750,25 @@ int ext3_statfs (struct super_block * sb
  
  static DECLARE_FSTYPE_DEV(ext3_fs_type, "ext3", ext3_read_super);
  
@@ -224,10 +200,9 @@ diff -Nur linux-2.4.18/fs/ext3/super.c linux-2.4.18ea/fs/ext3/super.c
 +      return error;
  }
  
- EXPORT_NO_SYMBOLS;
-diff -Nur linux-2.4.18/fs/ext3/xattr.c linux-2.4.18ea/fs/ext3/xattr.c
---- linux-2.4.18/fs/ext3/xattr.c       Thu Jan  1 01:00:00 1970
-+++ linux-2.4.18ea/fs/ext3/xattr.c     Wed Apr  3 13:19:05 2002
+ EXPORT_SYMBOL(ext3_bread);
+--- /dev/null  2002-08-31 07:31:37.000000000 +0800
++++ linux-2.4.18-18-root/fs/ext3/xattr.c       2003-04-20 16:14:31.000000000 +0800
 @@ -0,0 +1,1247 @@
 +/*
 + * linux/fs/ext3/xattr.c
@@ -302,11 +277,11 @@ diff -Nur linux-2.4.18/fs/ext3/xattr.c linux-2.4.18ea/fs/ext3/xattr.c
 +#include <linux/module.h>
 +
 +/* These symbols may be needed by a module. */
-+EXPORT_SYMBOL(extN_xattr_register);
-+EXPORT_SYMBOL(extN_xattr_unregister);
-+EXPORT_SYMBOL(extN_xattr_get);
-+EXPORT_SYMBOL(extN_xattr_list);
-+EXPORT_SYMBOL(extN_xattr_set);
++EXPORT_SYMBOL(ext3_xattr_register);
++EXPORT_SYMBOL(ext3_xattr_unregister);
++EXPORT_SYMBOL(ext3_xattr_get);
++EXPORT_SYMBOL(ext3_xattr_list);
++EXPORT_SYMBOL(ext3_xattr_set);
 +
 +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
 +# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1)
@@ -1476,9 +1451,8 @@ diff -Nur linux-2.4.18/fs/ext3/xattr.c linux-2.4.18ea/fs/ext3/xattr.c
 +}
 +
 +#endif  /* CONFIG_EXT3_FS_XATTR_SHARING */
-diff -Nur linux-2.4.18/include/linux/ext3_fs.h linux-2.4.18ea/include/linux/ext3_fs.h
---- linux-2.4.18/include/linux/ext3_fs.h       Sun Feb 24 04:42:59 2002
-+++ linux-2.4.18ea/include/linux/ext3_fs.h     Mon Mar 11 03:27:00 2002
+--- linux-2.4.18-18/include/linux/ext3_fs.h~linux-2.4.18ea-0.8.26      2003-04-20 16:14:31.000000000 +0800
++++ linux-2.4.18-18-root/include/linux/ext3_fs.h       2003-04-20 16:14:31.000000000 +0800
 @@ -58,8 +58,6 @@
   */
  #define       EXT3_BAD_INO             1      /* Bad blocks inode */
@@ -1525,7 +1499,7 @@ diff -Nur linux-2.4.18/include/linux/ext3_fs.h linux-2.4.18ea/include/linux/ext3
   * Structure of a blocks group descriptor
   */
  struct ext3_group_desc
-@@ -512,7 +487,7 @@
+@@ -513,7 +488,7 @@ struct ext3_super_block {
  #define EXT3_FEATURE_INCOMPAT_RECOVER         0x0004 /* Needs recovery */
  #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV     0x0008 /* Journal device */
  
@@ -1534,8 +1508,9 @@ diff -Nur linux-2.4.18/include/linux/ext3_fs.h linux-2.4.18ea/include/linux/ext3
  #define EXT3_FEATURE_INCOMPAT_SUPP    (EXT3_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT3_FEATURE_INCOMPAT_RECOVER)
  #define EXT3_FEATURE_RO_COMPAT_SUPP   (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
-@@ -603,4 +578,22 @@
- */
+@@ -606,6 +581,24 @@ struct ext3_iloc
+       unsigned long block_group;
+ };
  
 +/* Defined for extended attributes */
 +#define CONFIG_EXT3_FS_XATTR y
@@ -1556,8 +1531,9 @@ diff -Nur linux-2.4.18/include/linux/ext3_fs.h linux-2.4.18ea/include/linux/ext3
 +#endif
 +
  /*
-  * Ok, these declarations are also in <linux/kernel.h> but none of the
-@@ -628,6 +603,7 @@
+  * Function prototypes
+  */
+@@ -647,6 +640,7 @@ extern void ext3_check_inodes_bitmap (st
  extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
  
  /* inode.c */
@@ -1565,9 +1541,8 @@ diff -Nur linux-2.4.18/include/linux/ext3_fs.h linux-2.4.18ea/include/linux/ext3
  extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
  extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
  
-diff -Nur linux-2.4.18/include/linux/ext3_jbd.h linux-2.4.18ea/include/linux/ext3_jbd.h
---- linux-2.4.18/include/linux/ext3_jbd.h      Fri Dec 21 18:42:03 2001
-+++ linux-2.4.18ea/include/linux/ext3_jbd.h    Mon Mar 25 00:11:36 2002
+--- linux-2.4.18-18/include/linux/ext3_jbd.h~linux-2.4.18ea-0.8.26     2003-04-20 16:14:31.000000000 +0800
++++ linux-2.4.18-18-root/include/linux/ext3_jbd.h      2003-04-20 16:14:31.000000000 +0800
 @@ -30,13 +30,19 @@
  
  #define EXT3_SINGLEDATA_TRANS_BLOCKS  8
@@ -1589,9 +1564,8 @@ diff -Nur linux-2.4.18/include/linux/ext3_jbd.h linux-2.4.18ea/include/linux/ext
  
  extern int ext3_writepage_trans_blocks(struct inode *inode);
  
-diff -Nur linux-2.4.18/include/linux/ext3_xattr.h linux-2.4.18ea/include/linux/ext3_xattr.h
---- linux-2.4.18/include/linux/ext3_xattr.h    Thu Jan  1 01:00:00 1970
-+++ linux-2.4.18ea/include/linux/ext3_xattr.h  Fri Apr  5 10:08:01 2002
+--- /dev/null  2002-08-31 07:31:37.000000000 +0800
++++ linux-2.4.18-18-root/include/linux/ext3_xattr.h    2003-04-20 16:14:31.000000000 +0800
 @@ -0,0 +1,155 @@
 +/*
 +  File: linux/ext3_xattr.h
@@ -1748,9 +1722,8 @@ diff -Nur linux-2.4.18/include/linux/ext3_xattr.h linux-2.4.18ea/include/linux/e
 +
 +#endif  /* __KERNEL__ */
 +
-diff -Nur linux-2.4.18/include/linux/xattr.h linux-2.4.18ea/include/linux/xattr.h
---- linux-2.4.18/include/linux/xattr.h Thu Jan  1 01:00:00 1970
-+++ linux-2.4.18ea/include/linux/xattr.h       Sun Mar 24 23:42:21 2002
+--- /dev/null  2002-08-31 07:31:37.000000000 +0800
++++ linux-2.4.18-18-root/include/linux/xattr.h 2003-04-20 16:14:31.000000000 +0800
 @@ -0,0 +1,15 @@
 +/*
 +  File: linux/xattr.h
@@ -1767,3 +1740,20 @@ diff -Nur linux-2.4.18/include/linux/xattr.h linux-2.4.18ea/include/linux/xattr.
 +#define XATTR_REPLACE 2       /* set value, fail if attr does not exist */
 +
 +#endif        /* _LINUX_XATTR_H */
+--- linux-2.4.18-18/fs/ext3/Makefile~linux-2.4.18ea-0.8.26     2003-04-20 16:14:54.000000000 +0800
++++ linux-2.4.18-18-root/fs/ext3/Makefile      2003-04-20 16:15:15.000000000 +0800
+@@ -9,10 +9,10 @@
+ O_TARGET := ext3.o
+-export-objs :=        super.o inode.o
++export-objs :=        super.o inode.o xattr.o
+ obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+-              ioctl.o namei.o super.o symlink.o
++              ioctl.o namei.o super.o symlink.o xattr.o
+ obj-m    := $(O_TARGET)
+ include $(TOPDIR)/Rules.make
+
+_
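
The extended-attribute patches above (ext3 xattr backend) and below (system-call wiring) expose the usual setxattr/getxattr interface to user space. A hedged user-space sketch follows; the file path and attribute name are illustrative assumptions, and older C libraries declare these calls in <attr/xattr.h> rather than <sys/xattr.h>.

    /*
     * Sketch: store and read back a "user." extended attribute, assuming
     * the kernel and filesystem are built with the xattr options below.
     */
    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/xattr.h>

    int main(void)
    {
            const char *path = "/mnt/ext3/somefile";   /* illustrative path */
            char buf[64];
            ssize_t len;

            if (setxattr(path, "user.comment", "hello", 5, 0) != 0) {
                    perror("setxattr");
                    return 1;
            }
            len = getxattr(path, "user.comment", buf, sizeof(buf) - 1);
            if (len < 0) {
                    perror("getxattr");
                    return 1;
            }
            buf[len] = '\0';
            printf("user.comment = %s\n", buf);
            return 0;
    }
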
diff --git a/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-chaos.patch b/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-chaos.patch
new file mode 100644 (file)
index 0000000..5c6c6a9
--- /dev/null
@@ -0,0 +1,5538 @@
+ Documentation/Configure.help  |   66 ++
+ arch/alpha/defconfig          |    7 
+ arch/alpha/kernel/entry.S     |   12 
+ arch/arm/defconfig            |    7 
+ arch/arm/kernel/calls.S       |   24 
+ arch/i386/defconfig           |    7 
+ arch/ia64/defconfig           |    7 
+ arch/m68k/defconfig           |    7 
+ arch/mips/defconfig           |    7 
+ arch/mips64/defconfig         |    7 
+ arch/ppc/defconfig            |   14 
+ arch/ppc64/kernel/misc.S      |    2 
+ arch/s390/defconfig           |    7 
+ arch/s390/kernel/entry.S      |   24 
+ arch/s390x/defconfig          |    7 
+ arch/s390x/kernel/entry.S     |   24 
+ arch/s390x/kernel/wrapper32.S |   92 +++
+ arch/sparc/defconfig          |    7 
+ arch/sparc/kernel/systbls.S   |   10 
+ arch/sparc64/defconfig        |    7 
+ arch/sparc64/kernel/systbls.S |   20 
+ fs/Config.in                  |   14 
+ fs/Makefile                   |    3 
+ fs/ext2/Makefile              |    4 
+ fs/ext2/file.c                |    5 
+ fs/ext2/ialloc.c              |    2 
+ fs/ext2/inode.c               |   34 -
+ fs/ext2/namei.c               |   14 
+ fs/ext2/super.c               |   29 
+ fs/ext2/symlink.c             |   14 
+ fs/ext2/xattr.c               | 1212 +++++++++++++++++++++++++++++++++++++++++
+ fs/ext2/xattr_user.c          |  103 +++
+ fs/ext3/Makefile              |   10 
+ fs/ext3/file.c                |    5 
+ fs/ext3/ialloc.c              |    2 
+ fs/ext3/inode.c               |   35 -
+ fs/ext3/namei.c               |   21 
+ fs/ext3/super.c               |   36 +
+ fs/ext3/symlink.c             |   14 
+ fs/ext3/xattr.c               | 1225 ++++++++++++++++++++++++++++++++++++++++++
+ fs/ext3/xattr_user.c          |  111 +++
+ fs/jfs/jfs_xattr.h            |    6 
+ fs/jfs/xattr.c                |    6 
+ fs/mbcache.c                  |  648 ++++++++++++++++++++++
+ include/asm-arm/unistd.h      |    2 
+ include/asm-ppc64/unistd.h    |    2 
+ include/asm-s390/unistd.h     |   15 
+ include/asm-s390x/unistd.h    |   15 
+ include/asm-sparc/unistd.h    |   24 
+ include/asm-sparc64/unistd.h  |   24 
+ include/linux/cache_def.h     |   15 
+ include/linux/errno.h         |    4 
+ include/linux/ext2_fs.h       |   31 -
+ include/linux/ext2_xattr.h    |  157 +++++
+ include/linux/ext3_fs.h       |   31 -
+ include/linux/ext3_jbd.h      |    8 
+ include/linux/ext3_xattr.h    |  157 +++++
+ include/linux/fs.h            |    2 
+ include/linux/mbcache.h       |   69 ++
+ kernel/ksyms.c                |    4 
+ mm/vmscan.c                   |   36 +
+ fs/ext3/ext3-exports.c        |   14 +  
+ 62 files changed, 4331 insertions(+), 197 deletions(-)
+
+--- linux-rh-2.4.20-8/Documentation/Configure.help~linux-2.4.20-xattr-0.8.54-chaos     2003-05-07 17:33:50.000000000 +0800
++++ linux-rh-2.4.20-8-root/Documentation/Configure.help        2003-05-07 17:34:25.000000000 +0800
+@@ -15226,6 +15226,39 @@ CONFIG_EXT2_FS
+   be compiled as a module, and so this could be dangerous.  Most
+   everyone wants to say Y here.
++Ext2 extended attributes
++CONFIG_EXT2_FS_XATTR
++  Extended attributes are name:value pairs associated with inodes by
++  the kernel or by users (see the attr(5) manual page, or visit
++  <http://acl.bestbits.at/> for details).
++
++  If unsure, say N.
++
++Ext2 extended attribute block sharing
++CONFIG_EXT2_FS_XATTR_SHARING
++  This option enables code for sharing identical extended attribute
++  blocks among multiple inodes.
++
++  Usually, say Y.
++
++Ext2 extended user attributes
++CONFIG_EXT2_FS_XATTR_USER
++  This option enables extended user attributes on ext2. Processes can
++  associate extended user attributes with inodes to store additional
++  information such as the character encoding of files, etc. (see the
++  attr(5) manual page, or visit <http://acl.bestbits.at/> for details).
++
++  If unsure, say N.
++
++Ext2 trusted extended attributes
++CONFIG_EXT2_FS_XATTR_TRUSTED
++  This option enables extended attributes on ext2 that are accessible
++  (and visible) only to users capable of CAP_SYS_ADMIN. Usually this
++  is only the super user. Trusted extended attributes are meant for
++  implementing system/security services.
++
++  If unsure, say N.
++
+ Ext3 journalling file system support (EXPERIMENTAL)
+ CONFIG_EXT3_FS
+   This is the journalling version of the Second extended file system
+@@ -15258,6 +15291,39 @@ CONFIG_EXT3_FS
+   of your root partition (the one containing the directory /) cannot
+   be compiled as a module, and so this may be dangerous.
++Ext3 extended attributes
++CONFIG_EXT3_FS_XATTR
++  Extended attributes are name:value pairs associated with inodes by
++  the kernel or by users (see the attr(5) manual page, or visit
++  <http://acl.bestbits.at/> for details).
++
++  If unsure, say N.
++
++Ext3 extended attribute block sharing
++CONFIG_EXT3_FS_XATTR_SHARING
++  This option enables code for sharing identical extended attribute
++  blocks among multiple inodes.
++
++  Usually, say Y.
++
++Ext3 extended user attributes
++CONFIG_EXT3_FS_XATTR_USER
++  This option enables extended user attributes on ext3. Processes can
++  associate extended user attributes with inodes to store additional
++  information such as the character encoding of files, etc. (see the
++  attr(5) manual page, or visit <http://acl.bestbits.at/> for details).
++
++  If unsure, say N.
++
++Ext3 trusted extended attributes
++CONFIG_EXT3_FS_XATTR_TRUSTED
++  This option enables extended attributes on ext3 that are accessible
++  (and visible) only to users capable of CAP_SYS_ADMIN. Usually this
++  is only the super user. Trusted extended attributes are meant for
++  implementing system/security services.
++
++  If unsure, say N.
++
+ Journal Block Device support (JBD for ext3) (EXPERIMENTAL)
+ CONFIG_JBD
+   This is a generic journalling layer for block devices.  It is
+--- linux-rh-2.4.20-8/arch/alpha/defconfig~linux-2.4.20-xattr-0.8.54-chaos     2001-11-20 07:19:42.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/alpha/defconfig        2003-05-07 17:34:25.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_ALPHA=y
+ # CONFIG_UID16 is not set
+ # CONFIG_RWSEM_GENERIC_SPINLOCK is not set
+--- linux-rh-2.4.20-8/arch/alpha/kernel/entry.S~linux-2.4.20-xattr-0.8.54-chaos        2003-04-11 14:04:53.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/alpha/kernel/entry.S   2003-05-07 17:34:25.000000000 +0800
+@@ -1162,6 +1162,18 @@ sys_call_table:
+       .quad sys_readahead
+       .quad sys_ni_syscall                    /* 380, sys_security */
+       .quad sys_tkill
++      .quad sys_setxattr
++      .quad sys_lsetxattr
++      .quad sys_fsetxattr
++      .quad sys_getxattr                      /* 385 */
++      .quad sys_lgetxattr
++      .quad sys_fgetxattr
++      .quad sys_listxattr
++      .quad sys_llistxattr
++      .quad sys_flistxattr                    /* 390 */
++      .quad sys_removexattr
++      .quad sys_lremovexattr
++      .quad sys_fremovexattr
+ /* Remember to update everything, kids.  */
+ .ifne (. - sys_call_table) - (NR_SYSCALLS * 8)
+--- linux-rh-2.4.20-8/arch/arm/defconfig~linux-2.4.20-xattr-0.8.54-chaos       2001-05-20 08:43:05.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/arm/defconfig  2003-05-07 17:34:25.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_ARM=y
+ # CONFIG_EISA is not set
+ # CONFIG_SBUS is not set
+--- linux-rh-2.4.20-8/arch/arm/kernel/calls.S~linux-2.4.20-xattr-0.8.54-chaos  2002-08-03 08:39:42.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/arm/kernel/calls.S     2003-05-07 17:34:25.000000000 +0800
+@@ -240,18 +240,18 @@ __syscall_start:
+               .long   SYMBOL_NAME(sys_ni_syscall) /* Security */
+               .long   SYMBOL_NAME(sys_gettid)
+ /* 225 */     .long   SYMBOL_NAME(sys_readahead)
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_setxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_lsetxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_fsetxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_getxattr */
+-/* 230 */     .long   SYMBOL_NAME(sys_ni_syscall) /* sys_lgetxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_fgetxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_listxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_llistxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_flistxattr */
+-/* 235 */     .long   SYMBOL_NAME(sys_ni_syscall) /* sys_removexattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_lremovexattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_fremovexattr */
++              .long   SYMBOL_NAME(sys_setxattr)
++              .long   SYMBOL_NAME(sys_lsetxattr)
++              .long   SYMBOL_NAME(sys_fsetxattr)
++              .long   SYMBOL_NAME(sys_getxattr)
++/* 230 */     .long   SYMBOL_NAME(sys_lgetxattr)
++              .long   SYMBOL_NAME(sys_fgetxattr)
++              .long   SYMBOL_NAME(sys_listxattr)
++              .long   SYMBOL_NAME(sys_llistxattr)
++              .long   SYMBOL_NAME(sys_flistxattr)
++/* 235 */     .long   SYMBOL_NAME(sys_removexattr)
++              .long   SYMBOL_NAME(sys_lremovexattr)
++              .long   SYMBOL_NAME(sys_fremovexattr)
+               .long   SYMBOL_NAME(sys_tkill)
+               /*
+                * Please check 2.5 _before_ adding calls here,
+--- linux-rh-2.4.20-8/arch/i386/defconfig~linux-2.4.20-xattr-0.8.54-chaos      2003-04-11 14:04:53.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/i386/defconfig 2003-05-07 17:34:25.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_X86=y
+ CONFIG_ISA=y
+ # CONFIG_SBUS is not set
+--- linux-rh-2.4.20-8/arch/ia64/defconfig~linux-2.4.20-xattr-0.8.54-chaos      2003-04-11 14:04:43.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/ia64/defconfig 2003-05-07 17:34:25.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ #
+ # Code maturity level options
+--- linux-rh-2.4.20-8/arch/m68k/defconfig~linux-2.4.20-xattr-0.8.54-chaos      2000-06-20 03:56:08.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/m68k/defconfig 2003-05-07 17:34:25.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_UID16=y
+ #
+--- linux-rh-2.4.20-8/arch/mips/defconfig~linux-2.4.20-xattr-0.8.54-chaos      2002-11-29 07:53:10.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/mips/defconfig 2003-05-07 17:34:25.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_MIPS=y
+ CONFIG_MIPS32=y
+ # CONFIG_MIPS64 is not set
+--- linux-rh-2.4.20-8/arch/mips64/defconfig~linux-2.4.20-xattr-0.8.54-chaos    2002-11-29 07:53:10.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/mips64/defconfig       2003-05-07 17:34:25.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_MIPS=y
+ # CONFIG_MIPS32 is not set
+ CONFIG_MIPS64=y
+--- linux-rh-2.4.20-8/arch/ppc/defconfig~linux-2.4.20-xattr-0.8.54-chaos       2003-04-11 14:04:43.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/ppc/defconfig  2003-05-07 17:34:25.000000000 +0800
+@@ -1,6 +1,20 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ # CONFIG_UID16 is not set
+ # CONFIG_RWSEM_GENERIC_SPINLOCK is not set
+ CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+--- linux-rh-2.4.20-8/arch/ppc64/kernel/misc.S~linux-2.4.20-xattr-0.8.54-chaos 2002-11-29 07:53:11.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/ppc64/kernel/misc.S    2003-05-07 17:34:25.000000000 +0800
+@@ -731,6 +731,7 @@ _GLOBAL(sys_call_table32)
+       .llong .sys_gettid              /* 207 */
+ #if 0 /* Reserved syscalls */
+       .llong .sys_tkill               /* 208 */
++#endif
+       .llong .sys_setxattr
+       .llong .sys_lsetxattr   /* 210 */
+       .llong .sys_fsetxattr
+@@ -743,6 +744,7 @@ _GLOBAL(sys_call_table32)
+       .llong .sys_removexattr
+       .llong .sys_lremovexattr
+       .llong .sys_fremovexattr        /* 220 */
++#if 0 /* Reserved syscalls */
+       .llong .sys_futex
+ #endif
+       .llong .sys_perfmonctl   /* Put this here for now ... */
+--- linux-rh-2.4.20-8/arch/s390/defconfig~linux-2.4.20-xattr-0.8.54-chaos      2002-11-29 07:53:11.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/s390/defconfig 2003-05-07 17:34:25.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ # CONFIG_ISA is not set
+ # CONFIG_EISA is not set
+ # CONFIG_MCA is not set
+--- linux-rh-2.4.20-8/arch/s390/kernel/entry.S~linux-2.4.20-xattr-0.8.54-chaos 2002-11-29 07:53:11.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/s390/kernel/entry.S    2003-05-07 17:34:25.000000000 +0800
+@@ -558,18 +558,18 @@ sys_call_table:
+         .long  sys_fcntl64 
+       .long  sys_ni_syscall
+       .long  sys_ni_syscall
+-      .long  sys_ni_syscall            /* 224 - reserved for setxattr  */
+-      .long  sys_ni_syscall            /* 225 - reserved for lsetxattr */
+-      .long  sys_ni_syscall            /* 226 - reserved for fsetxattr */
+-      .long  sys_ni_syscall            /* 227 - reserved for getxattr  */
+-      .long  sys_ni_syscall            /* 228 - reserved for lgetxattr */
+-      .long  sys_ni_syscall            /* 229 - reserved for fgetxattr */
+-      .long  sys_ni_syscall            /* 230 - reserved for listxattr */
+-      .long  sys_ni_syscall            /* 231 - reserved for llistxattr */
+-      .long  sys_ni_syscall            /* 232 - reserved for flistxattr */
+-      .long  sys_ni_syscall            /* 233 - reserved for removexattr */
+-      .long  sys_ni_syscall            /* 234 - reserved for lremovexattr */
+-      .long  sys_ni_syscall            /* 235 - reserved for fremovexattr */
++      .long  sys_setxattr
++      .long  sys_lsetxattr            /* 225 */
++      .long  sys_fsetxattr
++      .long  sys_getxattr
++      .long  sys_lgetxattr
++      .long  sys_fgetxattr
++      .long  sys_listxattr            /* 230 */
++      .long  sys_llistxattr
++      .long  sys_flistxattr
++      .long  sys_removexattr
++      .long  sys_lremovexattr
++      .long  sys_fremovexattr         /* 235 */
+       .long  sys_gettid
+       .long  sys_tkill
+       .rept  255-237
+--- linux-rh-2.4.20-8/arch/s390x/defconfig~linux-2.4.20-xattr-0.8.54-chaos     2002-11-29 07:53:11.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/s390x/defconfig        2003-05-07 17:34:25.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ # CONFIG_ISA is not set
+ # CONFIG_EISA is not set
+ # CONFIG_MCA is not set
+--- linux-rh-2.4.20-8/arch/s390x/kernel/entry.S~linux-2.4.20-xattr-0.8.54-chaos        2002-11-29 07:53:11.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/s390x/kernel/entry.S   2003-05-07 17:34:25.000000000 +0800
+@@ -591,18 +591,18 @@ sys_call_table:
+       .long  SYSCALL(sys_ni_syscall,sys32_fcntl64_wrapper)
+       .long  SYSCALL(sys_ni_syscall,sys_ni_syscall)
+       .long  SYSCALL(sys_ni_syscall,sys_ni_syscall)
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 224 - reserved for setxattr  */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 225 - reserved for lsetxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 226 - reserved for fsetxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 227 - reserved for getxattr  */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 228 - reserved for lgetxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 229 - reserved for fgetxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 230 - reserved for listxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 231 - reserved for llistxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 232 - reserved for flistxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 233 - reserved for removexattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 234 - reserved for lremovexattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 235 - reserved for fremovexattr */
++      .long  SYSCALL(sys_setxattr,sys32_setxattr_wrapper)
++      .long  SYSCALL(sys_lsetxattr,sys32_lsetxattr_wrapper)   /* 225 */
++      .long  SYSCALL(sys_fsetxattr,sys32_fsetxattr_wrapper)
++      .long  SYSCALL(sys_getxattr,sys32_getxattr_wrapper)
++      .long  SYSCALL(sys_lgetxattr,sys32_lgetxattr_wrapper)
++      .long  SYSCALL(sys_fgetxattr,sys32_fgetxattr_wrapper)
++      .long  SYSCALL(sys_listxattr,sys32_listxattr_wrapper)   /* 230 */
++      .long  SYSCALL(sys_llistxattr,sys32_llistxattr_wrapper)
++      .long  SYSCALL(sys_flistxattr,sys32_flistxattr_wrapper)
++      .long  SYSCALL(sys_removexattr,sys32_removexattr_wrapper)
++      .long  SYSCALL(sys_lremovexattr,sys32_lremovexattr_wrapper)
++      .long  SYSCALL(sys_fremovexattr,sys32_fremovexattr_wrapper)/* 235 */
+       .long  SYSCALL(sys_gettid,sys_gettid)
+       .long  SYSCALL(sys_tkill,sys_tkill)
+       .rept  255-237
+--- linux-rh-2.4.20-8/arch/s390x/kernel/wrapper32.S~linux-2.4.20-xattr-0.8.54-chaos    2002-02-26 03:37:56.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/s390x/kernel/wrapper32.S       2003-05-07 17:34:25.000000000 +0800
+@@ -1091,3 +1091,95 @@ sys32_fstat64_wrapper:
+       llgtr   %r3,%r3                 # struct stat64 *
+       llgfr   %r4,%r4                 # long
+       jg      sys32_fstat64           # branch to system call
++
++      .globl  sys32_setxattr_wrapper
++sys32_setxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      lgfr    %r6,%r6                 # int
++      jg      sys_setxattr
++
++      .globl  sys32_lsetxattr_wrapper
++sys32_lsetxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      lgfr    %r6,%r6                 # int
++      jg      sys_lsetxattr
++
++      .globl  sys32_fsetxattr_wrapper
++sys32_fsetxattr_wrapper:
++      lgfr    %r2,%r2                 # int
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      lgfr    %r6,%r6                 # int
++      jg      sys_fsetxattr
++
++      .globl  sys32_getxattr_wrapper
++sys32_getxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      jg      sys_getxattr
++
++      .globl  sys32_lgetxattr_wrapper
++sys32_lgetxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      jg      sys_lgetxattr
++
++      .globl  sys32_fgetxattr_wrapper
++sys32_fgetxattr_wrapper:
++      lgfr    %r2,%r2                 # int
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      jg      sys_fgetxattr
++
++      .globl  sys32_listxattr_wrapper
++sys32_listxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgfr   %r4,%r4                 # size_t
++      jg      sys_listxattr
++
++      .globl  sys32_llistxattr_wrapper
++sys32_llistxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgfr   %r4,%r4                 # size_t
++      jg      sys_llistxattr
++
++      .globl  sys32_flistxattr_wrapper
++sys32_flistxattr_wrapper:
++      lgfr    %r2,%r2                 # int
++      llgtr   %r3,%r3                 # char *
++      llgfr   %r4,%r4                 # size_t
++      jg      sys_flistxattr
++
++      .globl  sys32_removexattr_wrapper
++sys32_removexattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      jg      sys_removexattr
++
++      .globl  sys32_lremovexattr_wrapper
++sys32_lremovexattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      jg      sys_lremovexattr
++
++      .globl  sys32_fremovexattr_wrapper
++sys32_fremovexattr_wrapper:
++      lgfr    %r2,%r2                 # int
++      llgtr   %r3,%r3                 # char *
++      jg      sys_fremovexattr
++
++
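
Each wrapper above widens its 31-bit arguments before branching to the 64-bit system call: llgtr for user pointers, llgfr for size_t values and lgfr for int values. A small user-space C sketch of why the two integer extensions differ (illustrative only, not part of the patch):

    #include <stdio.h>

    int main(void)
    {
            /* A value as it might arrive in the low 32 bits of a register
             * from a 31-bit caller. */
            unsigned int raw = 0xfffffffeu;

            long as_int          = (long)(int)raw;      /* lgfr:  sign-extend -> -2 */
            unsigned long as_len = (unsigned long)raw;  /* llgfr: zero-extend -> 4294967294 */

            printf("sign-extended int arg:  %ld\n", as_int);
            printf("zero-extended size arg: %lu\n", as_len);
            return 0;
    }

Pointer arguments take the llgtr form, since they are 31-bit user addresses as noted in the wrapper comments.
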
+--- linux-rh-2.4.20-8/arch/sparc/defconfig~linux-2.4.20-xattr-0.8.54-chaos     2002-08-03 08:39:43.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/sparc/defconfig        2003-05-07 17:34:25.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_UID16=y
+ CONFIG_HIGHMEM=y
+--- linux-rh-2.4.20-8/arch/sparc/kernel/systbls.S~linux-2.4.20-xattr-0.8.54-chaos      2002-08-03 08:39:43.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/sparc/kernel/systbls.S 2003-05-07 17:34:25.000000000 +0800
+@@ -51,11 +51,11 @@ sys_call_table:
+ /*150*/       .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64
+ /*155*/       .long sys_fcntl64, sys_nis_syscall, sys_statfs, sys_fstatfs, sys_oldumount
+ /*160*/       .long sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_nis_syscall
+-/*165*/       .long sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_nis_syscall
+-/*170*/       .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_getdents
+-/*175*/       .long sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+-/*180*/       .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_sigpending, sys_query_module
+-/*185*/       .long sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sys_newuname
++/*165*/       .long sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_setxattr
++/*170*/       .long sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys_getdents
++/*175*/       .long sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr
++/*180*/       .long sys_flistxattr, sys_removexattr, sys_lremovexattr, sys_sigpending, sys_query_module
++/*185*/       .long sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sys_newuname
+ /*190*/       .long sys_init_module, sys_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+ /*195*/       .long sys_nis_syscall, sys_nis_syscall, sys_getppid, sparc_sigaction, sys_sgetmask
+ /*200*/       .long sys_ssetmask, sys_sigsuspend, sys_newlstat, sys_uselib, old_readdir
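
The sparc table above fills previously unused sys_nis_syscall slots: setxattr lands in slot 169, getxattr in 172, removexattr in 181, and so on. A minimal sketch of reaching one of these entries by number through syscall(2); the slot number applies only to the 32-bit sparc table shown here, and the file path is hypothetical:

    #include <stdio.h>
    #include <unistd.h>

    /* Slot 169 = sys_setxattr in the 32-bit sparc sys_call_table above.
     * Illustrative only; on other architectures in this patch the same
     * number maps to a different system call. */
    #define NR_setxattr_sparc32 169

    int main(void)
    {
            long rc = syscall(NR_setxattr_sparc32, "testfile",
                              "user.demo", "1", (size_t)1, 0);
            if (rc != 0)
                    perror("setxattr via syscall()");
            return 0;
    }
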
+--- linux-rh-2.4.20-8/arch/sparc64/defconfig~linux-2.4.20-xattr-0.8.54-chaos   2003-04-11 14:04:43.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/sparc64/defconfig      2003-05-07 17:34:25.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ #
+ # Code maturity level options
+--- linux-rh-2.4.20-8/arch/sparc64/kernel/systbls.S~linux-2.4.20-xattr-0.8.54-chaos    2002-08-03 08:39:43.000000000 +0800
++++ linux-rh-2.4.20-8-root/arch/sparc64/kernel/systbls.S       2003-05-07 17:34:25.000000000 +0800
+@@ -52,11 +52,11 @@ sys_call_table32:
+ /*150*/       .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64
+       .word sys32_fcntl64, sys_nis_syscall, sys32_statfs, sys32_fstatfs, sys_oldumount
+ /*160*/       .word sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_nis_syscall
+-      .word sys32_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_nis_syscall
+-/*170*/       .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys32_getdents
+-      .word sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+-/*180*/       .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys32_sigpending, sys32_query_module
+-      .word sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sparc64_newuname
++      .word sys32_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_setxattr
++/*170*/       .word sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys32_getdents
++      .word sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr
++/*180*/       .word sys_flistxattr, sys_removexattr, sys_lremovexattr, sys32_sigpending, sys32_query_module
++      .word sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sparc64_newuname
+ /*190*/       .word sys32_init_module, sparc64_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+       .word sys_nis_syscall, sys_nis_syscall, sys_getppid, sys32_sigaction, sys_sgetmask
+ /*200*/       .word sys_ssetmask, sys_sigsuspend, sys32_newlstat, sys_uselib, old32_readdir
+@@ -111,11 +111,11 @@ sys_call_table:
+ /*150*/       .word sys_getsockname, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64
+       .word sys_nis_syscall, sys_nis_syscall, sys_statfs, sys_fstatfs, sys_oldumount
+ /*160*/       .word sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_utrap_install
+-      .word sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_nis_syscall
+-/*170*/       .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_getdents
+-      .word sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+-/*180*/       .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_query_module
+-      .word sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sparc64_newuname
++      .word sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_setxattr
++/*170*/       .word sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys_getdents
++      .word sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr
++/*180*/       .word sys_flistxattr, sys_removexattr, sys_lremovexattr, sys_nis_syscall, sys_query_module
++      .word sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sparc64_newuname
+ /*190*/       .word sys_init_module, sparc64_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+       .word sys_nis_syscall, sys_nis_syscall, sys_getppid, sys_nis_syscall, sys_sgetmask
+ /*200*/       .word sys_ssetmask, sys_nis_syscall, sys_newlstat, sys_uselib, sys_nis_syscall
+--- linux-rh-2.4.20-8/fs/Config.in~linux-2.4.20-xattr-0.8.54-chaos     2003-04-11 14:05:03.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/Config.in        2003-05-07 17:34:25.000000000 +0800
+@@ -34,6 +34,11 @@ dep_mbool '  Debug Befs' CONFIG_BEFS_DEB
+ dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL
+ tristate 'Ext3 journalling file system support' CONFIG_EXT3_FS
++dep_mbool '  Ext3 extended attributes' CONFIG_EXT3_FS_XATTR $CONFIG_EXT3_FS
++dep_bool '    Ext3 extended attribute block sharing' \
++    CONFIG_EXT3_FS_XATTR_SHARING $CONFIG_EXT3_FS_XATTR
++dep_bool '    Ext3 extended user attributes' \
++    CONFIG_EXT3_FS_XATTR_USER $CONFIG_EXT3_FS_XATTR
+ # CONFIG_JBD could be its own option (even modular), but until there are
+ # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
+ # dep_tristate '  Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
+@@ -93,6 +98,11 @@ dep_mbool '  QNX4FS write support (DANGE
+ tristate 'ROM file system support' CONFIG_ROMFS_FS
+ tristate 'Second extended fs support' CONFIG_EXT2_FS
++dep_mbool '  Ext2 extended attributes' CONFIG_EXT2_FS_XATTR $CONFIG_EXT2_FS
++dep_bool '    Ext2 extended attribute block sharing' \
++    CONFIG_EXT2_FS_XATTR_SHARING $CONFIG_EXT2_FS_XATTR
++dep_bool '    Ext2 extended user attributes' \
++    CONFIG_EXT2_FS_XATTR_USER $CONFIG_EXT2_FS_XATTR
+ tristate 'System V/Xenix/V7/Coherent file system support' CONFIG_SYSV_FS
+@@ -164,6 +174,10 @@ else
+    define_tristate CONFIG_ZISOFS_FS n
+ fi
++# Meta block cache for Extended Attributes (ext2/ext3)
++#tristate 'Meta block cache' CONFIG_FS_MBCACHE
++define_tristate CONFIG_FS_MBCACHE y
++
+ mainmenu_option next_comment
+ comment 'Partition Types'
+ source fs/partitions/Config.in
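
With the Config.in entries above, the xattr options hang off their filesystem options via dep_mbool/dep_bool, and CONFIG_FS_MBCACHE is forced to y. Enabling everything yields a .config fragment along these lines (illustrative only; surrounding options depend on the rest of the configuration):

    CONFIG_EXT3_FS=y
    CONFIG_EXT3_FS_XATTR=y
    CONFIG_EXT3_FS_XATTR_SHARING=y
    CONFIG_EXT3_FS_XATTR_USER=y
    CONFIG_EXT2_FS=y
    CONFIG_EXT2_FS_XATTR=y
    CONFIG_EXT2_FS_XATTR_SHARING=y
    CONFIG_EXT2_FS_XATTR_USER=y
    CONFIG_FS_MBCACHE=y
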
+--- linux-rh-2.4.20-8/fs/Makefile~linux-2.4.20-xattr-0.8.54-chaos      2003-05-07 17:33:58.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/Makefile 2003-05-07 17:34:25.000000000 +0800
+@@ -84,6 +84,9 @@ obj-y                                += binfmt_script.o
+ obj-$(CONFIG_BINFMT_ELF)      += binfmt_elf.o
++export-objs += mbcache.o
++obj-$(CONFIG_FS_MBCACHE)      += mbcache.o
++
+ # persistent filesystems
+ obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o))
+--- linux-rh-2.4.20-8/fs/ext2/Makefile~linux-2.4.20-xattr-0.8.54-chaos 2001-10-11 23:05:18.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext2/Makefile    2003-05-07 17:34:25.000000000 +0800
+@@ -13,4 +13,8 @@ obj-y    := balloc.o bitmap.o dir.o file
+               ioctl.o namei.o super.o symlink.o
+ obj-m    := $(O_TARGET)
++export-objs += xattr.o
++obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o
++obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o
++
+ include $(TOPDIR)/Rules.make
+--- linux-rh-2.4.20-8/fs/ext2/file.c~linux-2.4.20-xattr-0.8.54-chaos   2001-10-11 23:05:18.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext2/file.c      2003-05-07 17:34:25.000000000 +0800
+@@ -20,6 +20,7 @@
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/sched.h>
+ /*
+@@ -51,4 +52,8 @@ struct file_operations ext2_file_operati
+ struct inode_operations ext2_file_inode_operations = {
+       truncate:       ext2_truncate,
++      setxattr:       ext2_setxattr,
++      getxattr:       ext2_getxattr,
++      listxattr:      ext2_listxattr,
++      removexattr:    ext2_removexattr,
+ };
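
ext2_file_inode_operations now routes the setxattr/getxattr/listxattr/removexattr inode operations to the new ext2 xattr code, and user space reaches them through the syscall table entries wired up in the arch hunks earlier in this patch. A minimal user-space sketch (not part of the patch; the file name is hypothetical, and the declarations are assumed to come from a libc header such as <sys/xattr.h>, while 2003-era systems shipped them in <attr/xattr.h> from the attr package):

    #include <stdio.h>
    #include <sys/xattr.h>   /* assumption: modern glibc location */

    int main(void)
    {
            const char *path = "testfile";    /* hypothetical file on an ext2 fs */
            char buf[64];
            ssize_t len;

            /* Create or replace a user-namespace attribute (needs user_xattr). */
            if (setxattr(path, "user.comment", "hello", 5, 0) != 0) {
                    perror("setxattr");
                    return 1;
            }

            /* Read it back; getxattr() returns the value length. */
            len = getxattr(path, "user.comment", buf, sizeof(buf));
            if (len < 0) {
                    perror("getxattr");
                    return 1;
            }
            printf("user.comment = %.*s\n", (int)len, buf);

            /* Remove it again. */
            if (removexattr(path, "user.comment") != 0) {
                    perror("removexattr");
                    return 1;
            }
            return 0;
    }
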
+--- linux-rh-2.4.20-8/fs/ext2/ialloc.c~linux-2.4.20-xattr-0.8.54-chaos 2002-11-29 07:53:15.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext2/ialloc.c    2003-05-07 17:34:25.000000000 +0800
+@@ -15,6 +15,7 @@
+ #include <linux/config.h>
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+@@ -167,6 +168,7 @@ void ext2_free_inode (struct inode * ino
+        */
+       if (!is_bad_inode(inode)) {
+               /* Quota is already initialized in iput() */
++              ext2_xattr_delete_inode(inode);
+               DQUOT_FREE_INODE(inode);
+               DQUOT_DROP(inode);
+       }
+--- linux-rh-2.4.20-8/fs/ext2/inode.c~linux-2.4.20-xattr-0.8.54-chaos  2002-11-29 07:53:15.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext2/inode.c     2003-05-07 17:34:25.000000000 +0800
+@@ -39,6 +39,18 @@ MODULE_LICENSE("GPL");
+ static int ext2_update_inode(struct inode * inode, int do_sync);
+ /*
++ * Test whether an inode is a fast symlink.
++ */
++static inline int ext2_inode_is_fast_symlink(struct inode *inode)
++{
++      int ea_blocks = inode->u.ext2_i.i_file_acl ?
++              (inode->i_sb->s_blocksize >> 9) : 0;
++
++      return (S_ISLNK(inode->i_mode) &&
++              inode->i_blocks - ea_blocks == 0);
++}
++
++/*
+  * Called at each iput()
+  */
+ void ext2_put_inode (struct inode * inode)
+@@ -53,9 +65,7 @@ void ext2_delete_inode (struct inode * i
+ {
+       lock_kernel();
+-      if (is_bad_inode(inode) ||
+-          inode->i_ino == EXT2_ACL_IDX_INO ||
+-          inode->i_ino == EXT2_ACL_DATA_INO)
++      if (is_bad_inode(inode))
+               goto no_delete;
+       inode->u.ext2_i.i_dtime = CURRENT_TIME;
+       mark_inode_dirty(inode);
+@@ -801,6 +811,8 @@ void ext2_truncate (struct inode * inode
+       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+           S_ISLNK(inode->i_mode)))
+               return;
++      if (ext2_inode_is_fast_symlink(inode))
++              return;
+       if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+               return;
+@@ -888,8 +900,7 @@ void ext2_read_inode (struct inode * ino
+       unsigned long offset;
+       struct ext2_group_desc * gdp;
+-      if ((inode->i_ino != EXT2_ROOT_INO && inode->i_ino != EXT2_ACL_IDX_INO &&
+-           inode->i_ino != EXT2_ACL_DATA_INO &&
++      if ((inode->i_ino != EXT2_ROOT_INO &&
+            inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) ||
+           inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) {
+               ext2_error (inode->i_sb, "ext2_read_inode",
+@@ -974,10 +985,7 @@ void ext2_read_inode (struct inode * ino
+       for (block = 0; block < EXT2_N_BLOCKS; block++)
+               inode->u.ext2_i.i_data[block] = raw_inode->i_block[block];
+-      if (inode->i_ino == EXT2_ACL_IDX_INO ||
+-          inode->i_ino == EXT2_ACL_DATA_INO)
+-              /* Nothing to do */ ;
+-      else if (S_ISREG(inode->i_mode)) {
++      if (S_ISREG(inode->i_mode)) {
+               inode->i_op = &ext2_file_inode_operations;
+               inode->i_fop = &ext2_file_operations;
+               inode->i_mapping->a_ops = &ext2_aops;
+@@ -986,15 +994,17 @@ void ext2_read_inode (struct inode * ino
+               inode->i_fop = &ext2_dir_operations;
+               inode->i_mapping->a_ops = &ext2_aops;
+       } else if (S_ISLNK(inode->i_mode)) {
+-              if (!inode->i_blocks)
++              if (ext2_inode_is_fast_symlink(inode))
+                       inode->i_op = &ext2_fast_symlink_inode_operations;
+               else {
+-                      inode->i_op = &page_symlink_inode_operations;
++                      inode->i_op = &ext2_symlink_inode_operations;
+                       inode->i_mapping->a_ops = &ext2_aops;
+               }
+-      } else 
++      } else {
++              inode->i_op = &ext2_special_inode_operations;
+               init_special_inode(inode, inode->i_mode,
+                                  le32_to_cpu(raw_inode->i_block[0]));
++      }
+       brelse (bh);
+       inode->i_attr_flags = 0;
+       if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL) {
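
The new ext2_inode_is_fast_symlink() helper above corrects the fast-symlink test: an attribute block is charged to i_blocks (s_blocksize >> 9 sectors), so a fast symlink that carries xattrs no longer has i_blocks == 0. A stand-alone sketch of that arithmetic (all values hypothetical):

    #include <stdio.h>

    int main(void)
    {
            unsigned long s_blocksize = 1024;  /* hypothetical fs block size */
            unsigned long i_blocks    = 2;     /* fast symlink plus one EA block */
            int has_ea_block          = 1;     /* i_file_acl != 0 */

            /* Mirror of the helper's logic: subtract the EA block's sectors
             * before deciding whether the link body lives in the inode. */
            unsigned long ea_blocks = has_ea_block ? (s_blocksize >> 9) : 0;
            printf("fast symlink: %s\n",
                   (i_blocks - ea_blocks == 0) ? "yes" : "no");
            return 0;
    }
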
+--- linux-rh-2.4.20-8/fs/ext2/namei.c~linux-2.4.20-xattr-0.8.54-chaos  2001-10-04 13:57:36.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext2/namei.c     2003-05-07 17:34:25.000000000 +0800
+@@ -31,6 +31,7 @@
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/pagemap.h>
+ /*
+@@ -136,7 +137,7 @@ static int ext2_symlink (struct inode * 
+       if (l > sizeof (inode->u.ext2_i.i_data)) {
+               /* slow symlink */
+-              inode->i_op = &page_symlink_inode_operations;
++              inode->i_op = &ext2_symlink_inode_operations;
+               inode->i_mapping->a_ops = &ext2_aops;
+               err = block_symlink(inode, symname, l);
+               if (err)
+@@ -345,4 +346,15 @@ struct inode_operations ext2_dir_inode_o
+       rmdir:          ext2_rmdir,
+       mknod:          ext2_mknod,
+       rename:         ext2_rename,
++      setxattr:       ext2_setxattr,
++      getxattr:       ext2_getxattr,
++      listxattr:      ext2_listxattr,
++      removexattr:    ext2_removexattr,
++};
++
++struct inode_operations ext2_special_inode_operations = {
++      setxattr:       ext2_setxattr,
++      getxattr:       ext2_getxattr,
++      listxattr:      ext2_listxattr,
++      removexattr:    ext2_removexattr,
+ };
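
Directories now carry the xattr operations as well, and the new ext2_special_inode_operations gives device nodes, FIFOs and sockets the same four entry points. A short sketch that lists whatever attributes a path carries, regardless of file type (hypothetical path; same header assumption as above):

    #include <stdio.h>
    #include <string.h>
    #include <sys/xattr.h>   /* assumption: modern glibc location */

    int main(void)
    {
            const char *path = "/mnt/test/somenode";   /* hypothetical */
            char list[1024];
            ssize_t len = listxattr(path, list, sizeof(list));
            ssize_t off;

            if (len < 0) {
                    perror("listxattr");
                    return 1;
            }
            /* listxattr() fills the buffer with NUL-terminated names
             * back to back; walk them one by one. */
            for (off = 0; off < len; off += (ssize_t)strlen(list + off) + 1)
                    printf("%s\n", list + off);
            return 0;
    }
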
+--- linux-rh-2.4.20-8/fs/ext2/super.c~linux-2.4.20-xattr-0.8.54-chaos  2002-11-29 07:53:15.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext2/super.c     2003-05-07 17:34:25.000000000 +0800
+@@ -21,6 +21,7 @@
+ #include <linux/string.h>
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/slab.h>
+ #include <linux/init.h>
+ #include <linux/locks.h>
+@@ -125,6 +126,7 @@ void ext2_put_super (struct super_block 
+       int db_count;
+       int i;
++      ext2_xattr_put_super(sb);
+       if (!(sb->s_flags & MS_RDONLY)) {
+               struct ext2_super_block *es = EXT2_SB(sb)->s_es;
+@@ -175,6 +177,13 @@ static int parse_options (char * options
+            this_char = strtok (NULL, ",")) {
+               if ((value = strchr (this_char, '=')) != NULL)
+                       *value++ = 0;
++#ifdef CONFIG_EXT2_FS_XATTR_USER
++              if (!strcmp (this_char, "user_xattr"))
++                      set_opt (*mount_options, XATTR_USER);
++              else if (!strcmp (this_char, "nouser_xattr"))
++                      clear_opt (*mount_options, XATTR_USER);
++              else
++#endif
+               if (!strcmp (this_char, "bsddf"))
+                       clear_opt (*mount_options, MINIX_DF);
+               else if (!strcmp (this_char, "nouid32")) {
+@@ -424,6 +433,9 @@ struct super_block * ext2_read_super (st
+           blocksize = BLOCK_SIZE;
+       sb->u.ext2_sb.s_mount_opt = 0;
++#ifdef CONFIG_EXT2_FS_XATTR_USER
++      /* set_opt (sb->u.ext2_sb.s_mount_opt, XATTR_USER); */
++#endif
+       if (!parse_options ((char *) data, &sb_block, &resuid, &resgid,
+           &sb->u.ext2_sb.s_mount_opt)) {
+               return NULL;
+@@ -813,12 +825,27 @@ static DECLARE_FSTYPE_DEV(ext2_fs_type, 
+ static int __init init_ext2_fs(void)
+ {
+-        return register_filesystem(&ext2_fs_type);
++      int error = init_ext2_xattr();
++      if (error)
++              return error;
++      error = init_ext2_xattr_user();
++      if (error)
++              goto fail;
++      error = register_filesystem(&ext2_fs_type);
++      if (!error)
++              return 0;
++
++      exit_ext2_xattr_user();
++fail:
++      exit_ext2_xattr();
++      return error;
+ }
+ static void __exit exit_ext2_fs(void)
+ {
+       unregister_filesystem(&ext2_fs_type);
++      exit_ext2_xattr_user();
++      exit_ext2_xattr();
+ }
+ EXPORT_NO_SYMBOLS;
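
parse_options() above recognizes user_xattr/nouser_xattr, and since the default set_opt() call is commented out, the user namespace stays off until requested at mount time. A minimal sketch of requesting it programmatically (device and mount point are hypothetical; the equivalent command line is mount -o user_xattr):

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* "user_xattr" is passed through as the ext2 option string and
             * parsed by parse_options() above. */
            if (mount("/dev/hda1", "/mnt/test", "ext2", 0, "user_xattr") != 0) {
                    perror("mount");
                    return 1;
            }
            return 0;
    }
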
+--- linux-rh-2.4.20-8/fs/ext2/symlink.c~linux-2.4.20-xattr-0.8.54-chaos        2000-09-28 04:41:33.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext2/symlink.c   2003-05-07 17:34:25.000000000 +0800
+@@ -19,6 +19,7 @@
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ static int ext2_readlink(struct dentry *dentry, char *buffer, int buflen)
+ {
+@@ -32,7 +33,20 @@ static int ext2_follow_link(struct dentr
+       return vfs_follow_link(nd, s);
+ }
++struct inode_operations ext2_symlink_inode_operations = {
++      readlink:       page_readlink,
++      follow_link:    page_follow_link,
++      setxattr:       ext2_setxattr,
++      getxattr:       ext2_getxattr,
++      listxattr:      ext2_listxattr,
++      removexattr:    ext2_removexattr,
++};
++
+ struct inode_operations ext2_fast_symlink_inode_operations = {
+       readlink:       ext2_readlink,
+       follow_link:    ext2_follow_link,
++      setxattr:       ext2_setxattr,
++      getxattr:       ext2_getxattr,
++      listxattr:      ext2_listxattr,
++      removexattr:    ext2_removexattr,
+ };
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext2/xattr.c     2003-05-07 17:34:25.000000000 +0800
+@@ -0,0 +1,1212 @@
++/*
++ * linux/fs/ext2/xattr.c
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ *
++ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
++ * Extended attributes for symlinks and special files added per
++ *  suggestion of Luka Renko <luka.renko@hermes.si>.
++ */
++
++/*
++ * Extended attributes are stored on disk blocks allocated outside of
++ * any inode. The i_file_acl field is then made to point to this allocated
++ * block. If two inodes have identical sets of extended attributes, they
++ * may share the same extended attribute block. Such situations
++ * are automatically detected by keeping a cache of recent attribute block
++ * numbers and hashes over the block's contents in memory.
++ *
++ *
++ * Extended attribute block layout:
++ *
++ *   +------------------+
++ *   | header           |
++ *   | entry 1          | |
++ *   | entry 2          | | growing downwards
++ *   | entry 3          | v
++ *   | four null bytes  |
++ *   | . . .            |
++ *   | value 1          | ^
++ *   | value 3          | | growing upwards
++ *   | value 2          | |
++ *   +------------------+
++ *
++ * The block header is followed by multiple entry descriptors. These entry
++ * descriptors are variable in size, and aligned to EXT2_XATTR_PAD
++ * byte boundaries. The entry descriptors are sorted by attribute name,
++ * so that two extended attribute blocks can be compared efficiently.
++ *
++ * Attribute values are aligned to the end of the block, stored in
++ * no specific order. They are also padded to EXT2_XATTR_PAD byte
++ * boundaries. No additional gaps are left between them.
++ *
++ * Locking strategy
++ * ----------------
++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of
++ * the xattr inode operations are called, so we are guaranteed that only one
++ * process accesses extended attributes of an inode at any time.
++ *
++ * For writing we also grab the ext2_xattr_sem semaphore. This ensures that
++ * only a single process is modifying an extended attribute block, even
++ * if the block is shared among inodes.
++ *
++ * Note for porting to 2.5
++ * -----------------------
++ * The BKL will no longer be held in the xattr inode operations.
++ */
++
++#include <linux/module.h>
++#include <linux/locks.h>
++#include <linux/slab.h>
++#include <linux/fs.h>
++#include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
++#include <linux/mbcache.h>
++#include <linux/quotaops.h>
++#include <asm/semaphore.h>
++#include <linux/compatmac.h>
++
++/* These symbols may be needed by a module. */
++EXPORT_SYMBOL(ext2_xattr_register);
++EXPORT_SYMBOL(ext2_xattr_unregister);
++EXPORT_SYMBOL(ext2_xattr_get);
++EXPORT_SYMBOL(ext2_xattr_list);
++EXPORT_SYMBOL(ext2_xattr_set);
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1)
++#endif
++
++#define HDR(bh) ((struct ext2_xattr_header *)((bh)->b_data))
++#define ENTRY(ptr) ((struct ext2_xattr_entry *)(ptr))
++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
++
++#ifdef EXT2_XATTR_DEBUG
++# define ea_idebug(inode, f...) do { \
++              printk(KERN_DEBUG "inode %s:%ld: ", \
++                      kdevname(inode->i_dev), inode->i_ino); \
++              printk(f); \
++              printk("\n"); \
++      } while (0)
++# define ea_bdebug(bh, f...) do { \
++              printk(KERN_DEBUG "block %s:%ld: ", \
++                      kdevname(bh->b_dev), bh->b_blocknr); \
++              printk(f); \
++              printk("\n"); \
++      } while (0)
++#else
++# define ea_idebug(f...)
++# define ea_bdebug(f...)
++#endif
++
++static int ext2_xattr_set2(struct inode *, struct buffer_head *,
++                         struct ext2_xattr_header *);
++
++#ifdef CONFIG_EXT2_FS_XATTR_SHARING
++
++static int ext2_xattr_cache_insert(struct buffer_head *);
++static struct buffer_head *ext2_xattr_cache_find(struct inode *,
++                                               struct ext2_xattr_header *);
++static void ext2_xattr_cache_remove(struct buffer_head *);
++static void ext2_xattr_rehash(struct ext2_xattr_header *,
++                            struct ext2_xattr_entry *);
++
++static struct mb_cache *ext2_xattr_cache;
++
++#else
++# define ext2_xattr_cache_insert(bh) 0
++# define ext2_xattr_cache_find(inode, header) NULL
++# define ext2_xattr_cache_remove(bh) while(0) {}
++# define ext2_xattr_rehash(header, entry) while(0) {}
++#endif
++
++/*
++ * If a file system does not share extended attributes among inodes,
++ * we should not need the ext2_xattr_sem semaphore. However, the
++ * filesystem may still contain shared blocks, so we always take
++ * the lock.
++ */
++
++DECLARE_MUTEX(ext2_xattr_sem);
++
++static inline int
++ext2_xattr_new_block(struct inode *inode, int * errp, int force)
++{
++      struct super_block *sb = inode->i_sb;
++      int goal = le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block) +
++              EXT2_I(inode)->i_block_group * EXT2_BLOCKS_PER_GROUP(sb);
++
++      /* How can we enforce the allocation? */
++      int block = ext2_new_block(inode, goal, 0, 0, errp);
++#ifdef OLD_QUOTAS
++      if (!*errp)
++              inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#endif
++      return block;
++}
++
++static inline int
++ext2_xattr_quota_alloc(struct inode *inode, int force)
++{
++      /* How can we enforce the allocation? */
++#ifdef OLD_QUOTAS
++      int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
++      if (!error)
++              inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#else
++      int error = DQUOT_ALLOC_BLOCK(inode, 1);
++#endif
++      return error;
++}
++
++#ifdef OLD_QUOTAS
++
++static inline void
++ext2_xattr_quota_free(struct inode *inode)
++{
++      DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
++      inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++static inline void
++ext2_xattr_free_block(struct inode * inode, unsigned long block)
++{
++      ext2_free_blocks(inode, block, 1);
++      inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++#else
++# define ext2_xattr_quota_free(inode) \
++      DQUOT_FREE_BLOCK(inode, 1)
++# define ext2_xattr_free_block(inode, block) \
++      ext2_free_blocks(inode, block, 1)
++#endif
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
++
++static inline struct buffer_head *
++sb_bread(struct super_block *sb, int block)
++{
++      return bread(sb->s_dev, block, sb->s_blocksize);
++}
++
++static inline struct buffer_head *
++sb_getblk(struct super_block *sb, int block)
++{
++      return getblk(sb->s_dev, block, sb->s_blocksize);
++}
++
++#endif
++
++struct ext2_xattr_handler *ext2_xattr_handlers[EXT2_XATTR_INDEX_MAX];
++rwlock_t ext2_handler_lock = RW_LOCK_UNLOCKED;
++
++int
++ext2_xattr_register(int name_index, struct ext2_xattr_handler *handler)
++{
++      int error = -EINVAL;
++
++      if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
++              write_lock(&ext2_handler_lock);
++              if (!ext2_xattr_handlers[name_index-1]) {
++                      ext2_xattr_handlers[name_index-1] = handler;
++                      error = 0;
++              }
++              write_unlock(&ext2_handler_lock);
++      }
++      return error;
++}
++
++void
++ext2_xattr_unregister(int name_index, struct ext2_xattr_handler *handler)
++{
++      if (name_index > 0 || name_index <= EXT2_XATTR_INDEX_MAX) {
++              write_lock(&ext2_handler_lock);
++              ext2_xattr_handlers[name_index-1] = NULL;
++              write_unlock(&ext2_handler_lock);
++      }
++}
++
++static inline const char *
++strcmp_prefix(const char *a, const char *a_prefix)
++{
++      while (*a_prefix && *a == *a_prefix) {
++              a++;
++              a_prefix++;
++      }
++      return *a_prefix ? NULL : a;
++}
++
++/*
++ * Decode the extended attribute name, and translate it into
++ * the name_index and name suffix.
++ */
++static struct ext2_xattr_handler *
++ext2_xattr_resolve_name(const char **name)
++{
++      struct ext2_xattr_handler *handler = NULL;
++      int i;
++
++      if (!*name)
++              return NULL;
++      read_lock(&ext2_handler_lock);
++      for (i=0; i<EXT2_XATTR_INDEX_MAX; i++) {
++              if (ext2_xattr_handlers[i]) {
++                      const char *n = strcmp_prefix(*name,
++                              ext2_xattr_handlers[i]->prefix);
++                      if (n) {
++                              handler = ext2_xattr_handlers[i];
++                              *name = n;
++                              break;
++                      }
++              }
++      }
++      read_unlock(&ext2_handler_lock);
++      return handler;
++}
++
++static inline struct ext2_xattr_handler *
++ext2_xattr_handler(int name_index)
++{
++      struct ext2_xattr_handler *handler = NULL;
++      if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
++              read_lock(&ext2_handler_lock);
++              handler = ext2_xattr_handlers[name_index-1];
++              read_unlock(&ext2_handler_lock);
++      }
++      return handler;
++}
++
++/*
++ * Inode operation getxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext2_getxattr(struct dentry *dentry, const char *name,
++            void *buffer, size_t size)
++{
++      struct ext2_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      handler = ext2_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->get(inode, name, buffer, size);
++}
++
++/*
++ * Inode operation listxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++      return ext2_xattr_list(dentry->d_inode, buffer, size);
++}
++
++/*
++ * Inode operation setxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext2_setxattr(struct dentry *dentry, const char *name,
++            const void *value, size_t size, int flags)
++{
++      struct ext2_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      if (size == 0)
++              value = "";  /* empty EA, do not remove */
++      handler = ext2_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->set(inode, name, value, size, flags);
++}
++
++/*
++ * Inode operation removexattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext2_removexattr(struct dentry *dentry, const char *name)
++{
++      struct ext2_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      handler = ext2_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
++}
++
++/*
++ * ext2_xattr_get()
++ *
++ * Copy an extended attribute into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext2_xattr_get(struct inode *inode, int name_index, const char *name,
++             void *buffer, size_t buffer_size)
++{
++      struct buffer_head *bh = NULL;
++      struct ext2_xattr_entry *entry;
++      unsigned int block, size;
++      char *end;
++      int name_len, error;
++
++      ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
++                name_index, name, buffer, (long)buffer_size);
++
++      if (name == NULL)
++              return -EINVAL;
++      if (!EXT2_I(inode)->i_file_acl)
++              return -ENOATTR;
++      block = EXT2_I(inode)->i_file_acl;
++      ea_idebug(inode, "reading block %d", block);
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh)
++              return -EIO;
++      ea_bdebug(bh, "b_count=%d, refcount=%d",
++              atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++      end = bh->b_data + bh->b_size;
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block:    ext2_error(inode->i_sb, "ext2_xattr_get",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              error = -EIO;
++              goto cleanup;
++      }
++      /* find named attribute */
++      name_len = strlen(name);
++
++      error = -ERANGE;
++      if (name_len > 255)
++              goto cleanup;
++      entry = FIRST_ENTRY(bh);
++      while (!IS_LAST_ENTRY(entry)) {
++              struct ext2_xattr_entry *next =
++                      EXT2_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++              if (name_index == entry->e_name_index &&
++                  name_len == entry->e_name_len &&
++                  memcmp(name, entry->e_name, name_len) == 0)
++                      goto found;
++              entry = next;
++      }
++      /* Check the remaining name entries */
++      while (!IS_LAST_ENTRY(entry)) {
++              struct ext2_xattr_entry *next =
++                      EXT2_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++              entry = next;
++      }
++      if (ext2_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      error = -ENOATTR;
++      goto cleanup;
++found:
++      /* check the buffer size */
++      if (entry->e_value_block != 0)
++              goto bad_block;
++      size = le32_to_cpu(entry->e_value_size);
++      if (size > inode->i_sb->s_blocksize ||
++          le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
++              goto bad_block;
++
++      if (ext2_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      if (buffer) {
++              error = -ERANGE;
++              if (size > buffer_size)
++                      goto cleanup;
++              /* return value of attribute */
++              memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
++                      size);
++      }
++      error = size;
++
++cleanup:
++      brelse(bh);
++
++      return error;
++}
++
++/*
++ * ext2_xattr_list()
++ *
++ * Copy a list of attribute names into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
++{
++      struct buffer_head *bh = NULL;
++      struct ext2_xattr_entry *entry;
++      unsigned int block, size = 0;
++      char *buf, *end;
++      int error;
++
++      ea_idebug(inode, "buffer=%p, buffer_size=%ld",
++                buffer, (long)buffer_size);
++
++      if (!EXT2_I(inode)->i_file_acl)
++              return 0;
++      block = EXT2_I(inode)->i_file_acl;
++      ea_idebug(inode, "reading block %d", block);
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh)
++              return -EIO;
++      ea_bdebug(bh, "b_count=%d, refcount=%d",
++              atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++      end = bh->b_data + bh->b_size;
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block:    ext2_error(inode->i_sb, "ext2_xattr_list",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              error = -EIO;
++              goto cleanup;
++      }
++      /* compute the size required for the list of attribute names */
++      for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++           entry = EXT2_XATTR_NEXT(entry)) {
++              struct ext2_xattr_handler *handler;
++              struct ext2_xattr_entry *next =
++                      EXT2_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++
++              handler = ext2_xattr_handler(entry->e_name_index);
++              if (handler)
++                      size += handler->list(NULL, inode, entry->e_name,
++                                            entry->e_name_len);
++      }
++
++      if (ext2_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      if (!buffer) {
++              error = size;
++              goto cleanup;
++      } else {
++              error = -ERANGE;
++              if (size > buffer_size)
++                      goto cleanup;
++      }
++
++      /* list the attribute names */
++      buf = buffer;
++      for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++           entry = EXT2_XATTR_NEXT(entry)) {
++              struct ext2_xattr_handler *handler;
++              
++              handler = ext2_xattr_handler(entry->e_name_index);
++              if (handler)
++                      buf += handler->list(buf, inode, entry->e_name,
++                                           entry->e_name_len);
++      }
++      error = size;
++
++cleanup:
++      brelse(bh);
++
++      return error;
++}
++
++/*
++ * If the EXT2_FEATURE_COMPAT_EXT_ATTR feature of this file system is
++ * not set, set it.
++ */
++static void ext2_xattr_update_super_block(struct super_block *sb)
++{
++      if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR))
++              return;
++
++      lock_super(sb);
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++      EXT2_SB(sb)->s_feature_compat |= EXT2_FEATURE_COMPAT_EXT_ATTR;
++#endif
++      EXT2_SB(sb)->s_es->s_feature_compat |=
++              cpu_to_le32(EXT2_FEATURE_COMPAT_EXT_ATTR);
++      sb->s_dirt = 1;
++      mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
++      unlock_super(sb);
++}
++
++/*
++ * ext2_xattr_set()
++ *
++ * Create, replace or remove an extended attribute for this inode. Buffer
++ * is NULL to remove an existing extended attribute, and non-NULL to
++ * either replace an existing extended attribute, or create a new extended
++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
++ * specify that an extended attribute must exist and must not exist
++ * previous to the call, respectively.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++int
++ext2_xattr_set(struct inode *inode, int name_index, const char *name,
++             const void *value, size_t value_len, int flags)
++{
++      struct super_block *sb = inode->i_sb;
++      struct buffer_head *bh = NULL;
++      struct ext2_xattr_header *header = NULL;
++      struct ext2_xattr_entry *here, *last;
++      unsigned int name_len;
++      int block = EXT2_I(inode)->i_file_acl;
++      int min_offs = sb->s_blocksize, not_found = 1, free, error;
++      char *end;
++      
++      /*
++       * header -- Points either into bh, or to a temporarily
++       *           allocated buffer.
++       * here -- The named entry found, or the place for inserting, within
++       *         the block pointed to by header.
++       * last -- Points right after the last named entry within the block
++       *         pointed to by header.
++       * min_offs -- The offset of the first value (values are aligned
++       *             towards the end of the block).
++       * end -- Points right after the block pointed to by header.
++       */
++      
++      ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
++                name_index, name, value, (long)value_len);
++
++      if (IS_RDONLY(inode))
++              return -EROFS;
++      if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
++              return -EPERM;
++      if (value == NULL)
++              value_len = 0;
++      if (name == NULL)
++              return -EINVAL;
++      name_len = strlen(name);
++      if (name_len > 255 || value_len > sb->s_blocksize)
++              return -ERANGE;
++      down(&ext2_xattr_sem);
++
++      if (block) {
++              /* The inode already has an extended attribute block. */
++
++              bh = sb_bread(sb, block);
++              error = -EIO;
++              if (!bh)
++                      goto cleanup;
++              ea_bdebug(bh, "b_count=%d, refcount=%d",
++                      atomic_read(&(bh->b_count)),
++                      le32_to_cpu(HDR(bh)->h_refcount));
++              header = HDR(bh);
++              end = bh->b_data + bh->b_size;
++              if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++                  header->h_blocks != cpu_to_le32(1)) {
++bad_block:            ext2_error(sb, "ext2_xattr_set",
++                              "inode %ld: bad block %d", inode->i_ino, block);
++                      error = -EIO;
++                      goto cleanup;
++              }
++              /* Find the named attribute. */
++              here = FIRST_ENTRY(bh);
++              while (!IS_LAST_ENTRY(here)) {
++                      struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here);
++                      if ((char *)next >= end)
++                              goto bad_block;
++                      if (!here->e_value_block && here->e_value_size) {
++                              int offs = le16_to_cpu(here->e_value_offs);
++                              if (offs < min_offs)
++                                      min_offs = offs;
++                      }
++                      not_found = name_index - here->e_name_index;
++                      if (!not_found)
++                              not_found = name_len - here->e_name_len;
++                      if (!not_found)
++                              not_found = memcmp(name, here->e_name,name_len);
++                      if (not_found <= 0)
++                              break;
++                      here = next;
++              }
++              last = here;
++              /* We still need to compute min_offs and last. */
++              while (!IS_LAST_ENTRY(last)) {
++                      struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last);
++                      if ((char *)next >= end)
++                              goto bad_block;
++                      if (!last->e_value_block && last->e_value_size) {
++                              int offs = le16_to_cpu(last->e_value_offs);
++                              if (offs < min_offs)
++                                      min_offs = offs;
++                      }
++                      last = next;
++              }
++
++              /* Check whether we have enough space left. */
++              free = min_offs - ((char*)last - (char*)header) - sizeof(__u32);
++      } else {
++              /* We will use a new extended attribute block. */
++              free = sb->s_blocksize -
++                      sizeof(struct ext2_xattr_header) - sizeof(__u32);
++              here = last = NULL;  /* avoid gcc uninitialized warning. */
++      }
++
++      if (not_found) {
++              /* Request to remove a nonexistent attribute? */
++              error = -ENOATTR;
++              if (flags & XATTR_REPLACE)
++                      goto cleanup;
++              error = 0;
++              if (value == NULL)
++                      goto cleanup;
++              else
++                      free -= EXT2_XATTR_LEN(name_len);
++      } else {
++              /* Request to create an existing attribute? */
++              error = -EEXIST;
++              if (flags & XATTR_CREATE)
++                      goto cleanup;
++              if (!here->e_value_block && here->e_value_size) {
++                      unsigned int size = le32_to_cpu(here->e_value_size);
++
++                      if (le16_to_cpu(here->e_value_offs) + size > 
++                          sb->s_blocksize || size > sb->s_blocksize)
++                              goto bad_block;
++                      free += EXT2_XATTR_SIZE(size);
++              }
++      }
++      free -= EXT2_XATTR_SIZE(value_len);
++      error = -ENOSPC;
++      if (free < 0)
++              goto cleanup;
++
++      /* Here we know that we can set the new attribute. */
++
++      if (header) {
++              if (header->h_refcount == cpu_to_le32(1)) {
++                      ea_bdebug(bh, "modifying in-place");
++                      ext2_xattr_cache_remove(bh);
++              } else {
++                      int offset;
++
++                      ea_bdebug(bh, "cloning");
++                      header = kmalloc(bh->b_size, GFP_KERNEL);
++                      error = -ENOMEM;
++                      if (header == NULL)
++                              goto cleanup;
++                      memcpy(header, HDR(bh), bh->b_size);
++                      header->h_refcount = cpu_to_le32(1);
++                      offset = (char *)header - bh->b_data;
++                      here = ENTRY((char *)here + offset);
++                      last = ENTRY((char *)last + offset);
++              }
++      } else {
++              /* Allocate a buffer where we construct the new block. */
++              header = kmalloc(sb->s_blocksize, GFP_KERNEL);
++              error = -ENOMEM;
++              if (header == NULL)
++                      goto cleanup;
++              memset(header, 0, sb->s_blocksize);
++              end = (char *)header + sb->s_blocksize;
++              header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC);
++              header->h_blocks = header->h_refcount = cpu_to_le32(1);
++              last = here = ENTRY(header+1);
++      }
++
++      if (not_found) {
++              /* Insert the new name. */
++              int size = EXT2_XATTR_LEN(name_len);
++              int rest = (char *)last - (char *)here;
++              memmove((char *)here + size, here, rest);
++              memset(here, 0, size);
++              here->e_name_index = name_index;
++              here->e_name_len = name_len;
++              memcpy(here->e_name, name, name_len);
++      } else {
++              /* Remove the old value. */
++              if (!here->e_value_block && here->e_value_size) {
++                      char *first_val = (char *)header + min_offs;
++                      int offs = le16_to_cpu(here->e_value_offs);
++                      char *val = (char *)header + offs;
++                      size_t size = EXT2_XATTR_SIZE(
++                              le32_to_cpu(here->e_value_size));
++                      memmove(first_val + size, first_val, val - first_val);
++                      memset(first_val, 0, size);
++                      here->e_value_offs = 0;
++                      min_offs += size;
++
++                      /* Adjust all value offsets. */
++                      last = ENTRY(header+1);
++                      while (!IS_LAST_ENTRY(last)) {
++                              int o = le16_to_cpu(last->e_value_offs);
++                              if (!last->e_value_block && o < offs)
++                                      last->e_value_offs =
++                                              cpu_to_le16(o + size);
++                              last = EXT2_XATTR_NEXT(last);
++                      }
++              }
++              if (value == NULL) {
++                      /* Remove this attribute. */
++                      if (EXT2_XATTR_NEXT(ENTRY(header+1)) == last) {
++                              /* This block is now empty. */
++                              error = ext2_xattr_set2(inode, bh, NULL);
++                              goto cleanup;
++                      } else {
++                              /* Remove the old name. */
++                              int size = EXT2_XATTR_LEN(name_len);
++                              last = ENTRY((char *)last - size);
++                              memmove(here, (char*)here + size,
++                                      (char*)last - (char*)here);
++                              memset(last, 0, size);
++                      }
++              }
++      }
++
++      if (value != NULL) {
++              /* Insert the new value. */
++              here->e_value_size = cpu_to_le32(value_len);
++              if (value_len) {
++                      size_t size = EXT2_XATTR_SIZE(value_len);
++                      char *val = (char *)header + min_offs - size;
++                      here->e_value_offs =
++                              cpu_to_le16((char *)val - (char *)header);
++                      memset(val + size - EXT2_XATTR_PAD, 0,
++                             EXT2_XATTR_PAD); /* Clear the pad bytes. */
++                      memcpy(val, value, value_len);
++              }
++      }
++      ext2_xattr_rehash(header, here);
++
++      error = ext2_xattr_set2(inode, bh, header);
++
++cleanup:
++      brelse(bh);
++      if (!(bh && header == HDR(bh)))
++              kfree(header);
++      up(&ext2_xattr_sem);
++
++      return error;
++}
++
++/*
++ * Second half of ext2_xattr_set(): Update the file system.
++ */
++static int
++ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
++              struct ext2_xattr_header *header)
++{
++      struct super_block *sb = inode->i_sb;
++      struct buffer_head *new_bh = NULL;
++      int error;
++
++      if (header) {
++              new_bh = ext2_xattr_cache_find(inode, header);
++              if (new_bh) {
++                      /*
++                       * We found an identical block in the cache.
++                       * The old block will be released after updating
++                       * the inode.
++                       */
++                      ea_bdebug(old_bh, "reusing block %ld",
++                              new_bh->b_blocknr);
++                      
++                      error = -EDQUOT;
++                      if (ext2_xattr_quota_alloc(inode, 1))
++                              goto cleanup;
++                      
++                      HDR(new_bh)->h_refcount = cpu_to_le32(
++                              le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
++                      ea_bdebug(new_bh, "refcount now=%d",
++                              le32_to_cpu(HDR(new_bh)->h_refcount));
++              } else if (old_bh && header == HDR(old_bh)) {
++                      /* Keep this block. */
++                      new_bh = old_bh;
++                      ext2_xattr_cache_insert(new_bh);
++              } else {
++                      /* We need to allocate a new block */
++                      int force = EXT2_I(inode)->i_file_acl != 0;
++                      int block = ext2_xattr_new_block(inode, &error, force);
++                      if (error)
++                              goto cleanup;
++                      ea_idebug(inode, "creating block %d", block);
++
++                      new_bh = sb_getblk(sb, block);
++                      if (!new_bh) {
++                              ext2_xattr_free_block(inode, block);
++                              error = -EIO;
++                              goto cleanup;
++                      }
++                      lock_buffer(new_bh);
++                      memcpy(new_bh->b_data, header, new_bh->b_size);
++                      mark_buffer_uptodate(new_bh, 1);
++                      unlock_buffer(new_bh);
++                      ext2_xattr_cache_insert(new_bh);
++                      
++                      ext2_xattr_update_super_block(sb);
++              }
++              mark_buffer_dirty(new_bh);
++              if (IS_SYNC(inode)) {
++                      ll_rw_block(WRITE, 1, &new_bh);
++                      wait_on_buffer(new_bh); 
++                      error = -EIO;
++                      if (buffer_req(new_bh) && !buffer_uptodate(new_bh))
++                              goto cleanup;
++              }
++      }
++
++      /* Update the inode. */
++      EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
++      inode->i_ctime = CURRENT_TIME;
++      if (IS_SYNC(inode)) {
++              error = ext2_sync_inode (inode);
++              if (error)
++                      goto cleanup;
++      } else
++              mark_inode_dirty(inode);
++
++      error = 0;
++      if (old_bh && old_bh != new_bh) {
++              /*
++               * If there was an old block, and we are not still using it,
++               * we now release the old block.
++              */
++              unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
++
++              if (refcount == 1) {
++                      /* Free the old block. */
++                      ea_bdebug(old_bh, "freeing");
++                      ext2_xattr_free_block(inode, old_bh->b_blocknr);
++                      mark_buffer_clean(old_bh);
++              } else {
++                      /* Decrement the refcount only. */
++                      refcount--;
++                      HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
++                      ext2_xattr_quota_free(inode);
++                      mark_buffer_dirty(old_bh);
++                      ea_bdebug(old_bh, "refcount now=%d", refcount);
++              }
++      }
++
++cleanup:
++      if (old_bh != new_bh)
++              brelse(new_bh);
++
++      return error;
++}
++
++/*
++ * ext2_xattr_delete_inode()
++ *
++ * Free extended attribute resources associated with this inode. This
++ * is called immediately before an inode is freed.
++ */
++void
++ext2_xattr_delete_inode(struct inode *inode)
++{
++      struct buffer_head *bh;
++      unsigned int block = EXT2_I(inode)->i_file_acl;
++
++      if (!block)
++              return;
++      down(&ext2_xattr_sem);
++
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh) {
++              ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
++                      "inode %ld: block %d read error", inode->i_ino, block);
++              goto cleanup;
++      }
++      ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++              ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              goto cleanup;
++      }
++      ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
++      if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
++              ext2_xattr_cache_remove(bh);
++              ext2_xattr_free_block(inode, block);
++              bforget(bh);
++              bh = NULL;
++      } else {
++              HDR(bh)->h_refcount = cpu_to_le32(
++                      le32_to_cpu(HDR(bh)->h_refcount) - 1);
++              mark_buffer_dirty(bh);
++              if (IS_SYNC(inode)) {
++                      ll_rw_block(WRITE, 1, &bh);
++                      wait_on_buffer(bh);
++              }
++              ext2_xattr_quota_free(inode);
++      }
++      EXT2_I(inode)->i_file_acl = 0;
++
++cleanup:
++      brelse(bh);
++      up(&ext2_xattr_sem);
++}
++
++/*
++ * ext2_xattr_put_super()
++ *
++ * This is called when a file system is unmounted.
++ */
++void
++ext2_xattr_put_super(struct super_block *sb)
++{
++#ifdef CONFIG_EXT2_FS_XATTR_SHARING
++      mb_cache_shrink(ext2_xattr_cache, sb->s_dev);
++#endif
++}
++
++#ifdef CONFIG_EXT2_FS_XATTR_SHARING
++
++/*
++ * ext2_xattr_cache_insert()
++ *
++ * Create a new entry in the extended attribute cache, and insert
++ * it unless such an entry is already in the cache.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++static int
++ext2_xattr_cache_insert(struct buffer_head *bh)
++{
++      __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
++      struct mb_cache_entry *ce;
++      int error;
++
++      ce = mb_cache_entry_alloc(ext2_xattr_cache);
++      if (!ce)
++              return -ENOMEM;
++      error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash);
++      if (error) {
++              mb_cache_entry_free(ce);
++              if (error == -EBUSY) {
++                      ea_bdebug(bh, "already in cache (%d cache entries)",
++                              atomic_read(&ext2_xattr_cache->c_entry_count));
++                      error = 0;
++              }
++      } else {
++              ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
++                        atomic_read(&ext2_xattr_cache->c_entry_count));
++              mb_cache_entry_release(ce);
++      }
++      return error;
++}
++
++/*
++ * ext2_xattr_cmp()
++ *
++ * Compare two extended attribute blocks for equality.
++ *
++ * Returns 0 if the blocks are equal, 1 if they differ, and
++ * a negative error number on errors.
++ */
++static int
++ext2_xattr_cmp(struct ext2_xattr_header *header1,
++             struct ext2_xattr_header *header2)
++{
++      struct ext2_xattr_entry *entry1, *entry2;
++
++      entry1 = ENTRY(header1+1);
++      entry2 = ENTRY(header2+1);
++      while (!IS_LAST_ENTRY(entry1)) {
++              if (IS_LAST_ENTRY(entry2))
++                      return 1;
++              if (entry1->e_hash != entry2->e_hash ||
++                  entry1->e_name_len != entry2->e_name_len ||
++                  entry1->e_value_size != entry2->e_value_size ||
++                  memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
++                      return 1;
++              if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
++                      return -EIO;
++              if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
++                         (char *)header2 + le16_to_cpu(entry2->e_value_offs),
++                         le32_to_cpu(entry1->e_value_size)))
++                      return 1;
++
++              entry1 = EXT2_XATTR_NEXT(entry1);
++              entry2 = EXT2_XATTR_NEXT(entry2);
++      }
++      if (!IS_LAST_ENTRY(entry2))
++              return 1;
++      return 0;
++}
++
++/*
++ * ext2_xattr_cache_find()
++ *
++ * Find an identical extended attribute block.
++ *
++ * Returns a pointer to the block found, or NULL if such a block was
++ * not found or an error occurred.
++ */
++static struct buffer_head *
++ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
++{
++      __u32 hash = le32_to_cpu(header->h_hash);
++      struct mb_cache_entry *ce;
++
++      if (!header->h_hash)
++              return NULL;  /* never share */
++      ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
++      ce = mb_cache_entry_find_first(ext2_xattr_cache, 0, inode->i_dev, hash);
++      while (ce) {
++              struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
++
++              if (!bh) {
++                      ext2_error(inode->i_sb, "ext2_xattr_cache_find",
++                              "inode %ld: block %ld read error",
++                              inode->i_ino, ce->e_block);
++              } else if (le32_to_cpu(HDR(bh)->h_refcount) >
++                         EXT2_XATTR_REFCOUNT_MAX) {
++                      ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block,
++                              le32_to_cpu(HDR(bh)->h_refcount),
++                              EXT2_XATTR_REFCOUNT_MAX);
++              } else if (!ext2_xattr_cmp(header, HDR(bh))) {
++                      ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
++                      mb_cache_entry_release(ce);
++                      return bh;
++              }
++              brelse(bh);
++              ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash);
++      }
++      return NULL;
++}
++
++/*
++ * ext2_xattr_cache_remove()
++ *
++ * Remove the cache entry of a block from the cache. Called when a
++ * block becomes invalid.
++ */
++static void
++ext2_xattr_cache_remove(struct buffer_head *bh)
++{
++      struct mb_cache_entry *ce;
++
++      ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_dev, bh->b_blocknr);
++      if (ce) {
++              ea_bdebug(bh, "removing (%d cache entries remaining)",
++                        atomic_read(&ext2_xattr_cache->c_entry_count)-1);
++              mb_cache_entry_free(ce);
++      } else 
++              ea_bdebug(bh, "no cache entry");
++}
++
++#define NAME_HASH_SHIFT 5
++#define VALUE_HASH_SHIFT 16
++
++/*
++ * ext2_xattr_hash_entry()
++ *
++ * Compute the hash of an extended attribute.
++ */
++static inline void ext2_xattr_hash_entry(struct ext2_xattr_header *header,
++                                       struct ext2_xattr_entry *entry)
++{
++      __u32 hash = 0;
++      char *name = entry->e_name;
++      int n;
++
++      for (n=0; n < entry->e_name_len; n++) {
++              hash = (hash << NAME_HASH_SHIFT) ^
++                     (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
++                     *name++;
++      }
++
++      if (entry->e_value_block == 0 && entry->e_value_size != 0) {
++              __u32 *value = (__u32 *)((char *)header +
++                      le16_to_cpu(entry->e_value_offs));
++              for (n = (le32_to_cpu(entry->e_value_size) +
++                   EXT2_XATTR_ROUND) >> EXT2_XATTR_PAD_BITS; n; n--) {
++                      hash = (hash << VALUE_HASH_SHIFT) ^
++                             (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
++                             le32_to_cpu(*value++);
++              }
++      }
++      entry->e_hash = cpu_to_le32(hash);
++}
++
++#undef NAME_HASH_SHIFT
++#undef VALUE_HASH_SHIFT
++
++#define BLOCK_HASH_SHIFT 16
++
++/*
++ * ext2_xattr_rehash()
++ *
++ * Re-compute the extended attribute hash value after an entry has changed.
++ */
++static void ext2_xattr_rehash(struct ext2_xattr_header *header,
++                            struct ext2_xattr_entry *entry)
++{
++      struct ext2_xattr_entry *here;
++      __u32 hash = 0;
++      
++      ext2_xattr_hash_entry(header, entry);
++      here = ENTRY(header+1);
++      while (!IS_LAST_ENTRY(here)) {
++              if (!here->e_hash) {
++                      /* Block is not shared if an entry's hash value == 0 */
++                      hash = 0;
++                      break;
++              }
++              hash = (hash << BLOCK_HASH_SHIFT) ^
++                     (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
++                     le32_to_cpu(here->e_hash);
++              here = EXT2_XATTR_NEXT(here);
++      }
++      header->h_hash = cpu_to_le32(hash);
++}
++
++#undef BLOCK_HASH_SHIFT
++
++int __init
++init_ext2_xattr(void)
++{
++      ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL,
++              sizeof(struct mb_cache_entry) +
++              sizeof(struct mb_cache_entry_index), 1, 61);
++      if (!ext2_xattr_cache)
++              return -ENOMEM;
++
++      return 0;
++}
++
++void
++exit_ext2_xattr(void)
++{
++      mb_cache_destroy(ext2_xattr_cache);
++}
++
++#else  /* CONFIG_EXT2_FS_XATTR_SHARING */
++
++int __init
++init_ext2_xattr(void)
++{
++      return 0;
++}
++
++void
++exit_ext2_xattr(void)
++{
++}
++
++#endif  /* CONFIG_EXT2_FS_XATTR_SHARING */
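
The hash used above to decide when attribute blocks may be shared is a plain rotate-and-XOR: ext2_xattr_hash_entry() folds the name (shift 5) and the padded value words (shift 16) into e_hash, and ext2_xattr_rehash() folds the entry hashes into h_hash, zeroing it as soon as any entry has e_hash == 0 so such a block is never shared. A minimal userspace sketch of the name part, assuming only the NAME_HASH_SHIFT value of 5 defined above:

#include <stdio.h>

/* Mirrors the name loop of ext2_xattr_hash_entry(); for ASCII names this
 * matches the kernel computation (illustration only, not kernel code). */
static unsigned int xattr_name_hash(const char *name, int len)
{
        unsigned int hash = 0;
        int n;

        for (n = 0; n < len; n++) {
                hash = (hash << 5) ^                    /* NAME_HASH_SHIFT */
                       (hash >> (8 * sizeof(hash) - 5)) ^
                       name[n];
        }
        return hash;
}

int main(void)
{
        printf("hash(\"comment\") = 0x%x\n", xattr_name_hash("comment", 7));
        return 0;
}
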
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext2/xattr_user.c        2003-05-07 17:34:25.000000000 +0800
+@@ -0,0 +1,103 @@
++/*
++ * linux/fs/ext2/xattr_user.c
++ * Handler for extended user attributes.
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/fs.h>
++#include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
++
++#ifdef CONFIG_EXT2_FS_POSIX_ACL
++# include <linux/ext2_acl.h>
++#endif
++
++#define XATTR_USER_PREFIX "user."
++
++static size_t
++ext2_xattr_user_list(char *list, struct inode *inode,
++                   const char *name, int name_len)
++{
++      const int prefix_len = sizeof(XATTR_USER_PREFIX)-1;
++
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return 0;
++
++      if (list) {
++              memcpy(list, XATTR_USER_PREFIX, prefix_len);
++              memcpy(list+prefix_len, name, name_len);
++              list[prefix_len + name_len] = '\0';
++      }
++      return prefix_len + name_len + 1;
++}
++
++static int
++ext2_xattr_user_get(struct inode *inode, const char *name,
++                  void *buffer, size_t size)
++{
++      int error;
++
++      if (strcmp(name, "") == 0)
++              return -EINVAL;
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return -ENOTSUP;
++#ifdef CONFIG_EXT2_FS_POSIX_ACL
++      error = ext2_permission_locked(inode, MAY_READ);
++#else
++      error = permission(inode, MAY_READ);
++#endif
++      if (error)
++              return error;
++
++      return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name,
++                            buffer, size);
++}
++
++static int
++ext2_xattr_user_set(struct inode *inode, const char *name,
++                  const void *value, size_t size, int flags)
++{
++      int error;
++
++      if (strcmp(name, "") == 0)
++              return -EINVAL;
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return -ENOTSUP;
++      if ( !S_ISREG(inode->i_mode) &&
++          (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
++              return -EPERM;
++#ifdef CONFIG_EXT2_FS_POSIX_ACL
++      error = ext2_permission_locked(inode, MAY_WRITE);
++#else
++      error = permission(inode, MAY_WRITE);
++#endif
++      if (error)
++              return error;
++  
++      return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name,
++                            value, size, flags);
++}
++
++struct ext2_xattr_handler ext2_xattr_user_handler = {
++      prefix: XATTR_USER_PREFIX,
++      list:   ext2_xattr_user_list,
++      get:    ext2_xattr_user_get,
++      set:    ext2_xattr_user_set,
++};
++
++int __init
++init_ext2_xattr_user(void)
++{
++      return ext2_xattr_register(EXT2_XATTR_INDEX_USER,
++                                 &ext2_xattr_user_handler);
++}
++
++void
++exit_ext2_xattr_user(void)
++{
++      ext2_xattr_unregister(EXT2_XATTR_INDEX_USER,
++                            &ext2_xattr_user_handler);
++}
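
The handler above accepts only names under the "user." prefix, only when the filesystem is mounted with the user_xattr option, and only on regular files and non-sticky directories. A rough userspace sketch of exercising it, assuming the xattr syscalls from this patch series are wired up and that the prototypes come from <sys/xattr.h> (2.4-era systems used libattr's <attr/xattr.h>); the file name is a placeholder:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>
#include <unistd.h>

int main(void)
{
        const char *path = "testfile";          /* placeholder */
        const char *val  = "hello";
        char buf[64];
        ssize_t len;
        int fd;

        fd = open(path, O_CREAT | O_WRONLY, 0644);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        close(fd);

        /* Stored through ext2_xattr_user_set() above. */
        if (setxattr(path, "user.comment", val, strlen(val), 0) != 0) {
                perror("setxattr");
                return 1;
        }
        /* Read back through ext2_xattr_user_get(). */
        len = getxattr(path, "user.comment", buf, sizeof(buf));
        if (len < 0) {
                perror("getxattr");
                return 1;
        }
        printf("user.comment = %.*s\n", (int)len, buf);
        return 0;
}
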
+--- linux-rh-2.4.20-8/fs/ext3/Makefile~linux-2.4.20-xattr-0.8.54-chaos 2003-05-07 17:33:59.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext3/Makefile    2003-05-07 17:45:13.000000000 +0800
+@@ -1,5 +1,5 @@
+ #
+-# Makefile for the linux ext2-filesystem routines.
++# Makefile for the linux ext3-filesystem routines.
+ #
+ # Note! Dependencies are done automagically by 'make dep', which also
+ # removes any old dependencies. DON'T put your own dependencies here
+@@ -9,10 +9,14 @@
+ O_TARGET := ext3.o
+-export-objs :=        super.o inode.o
++export-objs := ext3-exports.o
+ obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+-              ioctl.o namei.o super.o symlink.o hash.o
++              ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o
+ obj-m    := $(O_TARGET)
++export-objs += xattr.o
++obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o
++obj-$(CONFIG_EXT3_FS_XATTR_USER) += xattr_user.o
++
+ include $(TOPDIR)/Rules.make
+--- linux-rh-2.4.20-8/fs/ext3/file.c~linux-2.4.20-xattr-0.8.54-chaos   2003-05-07 17:33:59.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext3/file.c      2003-05-07 17:34:25.000000000 +0800
+@@ -23,6 +23,7 @@
+ #include <linux/locks.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/smp_lock.h>
+@@ -126,5 +127,9 @@ struct file_operations ext3_file_operati
+ struct inode_operations ext3_file_inode_operations = {
+       truncate:       ext3_truncate,          /* BKL held */
+       setattr:        ext3_setattr,           /* BKL held */
++      setxattr:       ext3_setxattr,          /* BKL held */
++      getxattr:       ext3_getxattr,          /* BKL held */
++      listxattr:      ext3_listxattr,         /* BKL held */
++      removexattr:    ext3_removexattr,       /* BKL held */
+ };
+--- linux-rh-2.4.20-8/fs/ext3/ialloc.c~linux-2.4.20-xattr-0.8.54-chaos 2003-04-11 14:04:48.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext3/ialloc.c    2003-05-07 17:34:25.000000000 +0800
+@@ -17,6 +17,7 @@
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+ #include <linux/locks.h>
+@@ -216,6 +217,7 @@ void ext3_free_inode (handle_t *handle, 
+        * as writing the quota to disk may need the lock as well.
+        */
+       DQUOT_INIT(inode);
++      ext3_xattr_delete_inode(handle, inode);
+       DQUOT_FREE_INODE(inode);
+       DQUOT_DROP(inode);
+--- linux-rh-2.4.20-8/fs/ext3/inode.c~linux-2.4.20-xattr-0.8.54-chaos  2003-04-11 14:04:58.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext3/inode.c     2003-05-07 17:34:25.000000000 +0800
+@@ -39,6 +39,18 @@
+  */
+ #undef SEARCH_FROM_ZERO
++/*
++ * Test whether an inode is a fast symlink.
++ */
++static inline int ext3_inode_is_fast_symlink(struct inode *inode)
++{
++      int ea_blocks = inode->u.ext3_i.i_file_acl ?
++              (inode->i_sb->s_blocksize >> 9) : 0;
++
++      return (S_ISLNK(inode->i_mode) &&
++              inode->i_blocks - ea_blocks == 0);
++}
++
+ /* The ext3 forget function must perform a revoke if we are freeing data
+  * which has been journaled.  Metadata (eg. indirect blocks) must be
+  * revoked in all cases. 
+@@ -48,7 +60,7 @@
+  * still needs to be revoked.
+  */
+-static int ext3_forget(handle_t *handle, int is_metadata,
++int ext3_forget(handle_t *handle, int is_metadata,
+                      struct inode *inode, struct buffer_head *bh,
+                      int blocknr)
+ {
+@@ -179,9 +191,7 @@ void ext3_delete_inode (struct inode * i
+ {
+       handle_t *handle;
+       
+-      if (is_bad_inode(inode) ||
+-          inode->i_ino == EXT3_ACL_IDX_INO ||
+-          inode->i_ino == EXT3_ACL_DATA_INO)
++      if (is_bad_inode(inode))
+               goto no_delete;
+       lock_kernel();
+@@ -1874,6 +1884,8 @@ void ext3_truncate(struct inode * inode)
+       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+           S_ISLNK(inode->i_mode)))
+               return;
++      if (ext3_inode_is_fast_symlink(inode))
++              return;
+       if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+               return;
+@@ -2021,8 +2033,6 @@ int ext3_get_inode_loc (struct inode *in
+       struct ext3_group_desc * gdp;
+               
+       if ((inode->i_ino != EXT3_ROOT_INO &&
+-              inode->i_ino != EXT3_ACL_IDX_INO &&
+-              inode->i_ino != EXT3_ACL_DATA_INO &&
+               inode->i_ino != EXT3_JOURNAL_INO &&
+               inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
+               inode->i_ino > le32_to_cpu(
+@@ -2149,10 +2159,7 @@ void ext3_read_inode(struct inode * inod
+       brelse (iloc.bh);
+-      if (inode->i_ino == EXT3_ACL_IDX_INO ||
+-          inode->i_ino == EXT3_ACL_DATA_INO)
+-              /* Nothing to do */ ;
+-      else if (S_ISREG(inode->i_mode)) {
++      if (S_ISREG(inode->i_mode)) {
+               inode->i_op = &ext3_file_inode_operations;
+               inode->i_fop = &ext3_file_operations;
+               inode->i_mapping->a_ops = &ext3_aops;
+@@ -2160,15 +2167,17 @@ void ext3_read_inode(struct inode * inod
+               inode->i_op = &ext3_dir_inode_operations;
+               inode->i_fop = &ext3_dir_operations;
+       } else if (S_ISLNK(inode->i_mode)) {
+-              if (!inode->i_blocks)
++              if (ext3_inode_is_fast_symlink(inode))
+                       inode->i_op = &ext3_fast_symlink_inode_operations;
+               else {
+-                      inode->i_op = &page_symlink_inode_operations;
++                      inode->i_op = &ext3_symlink_inode_operations;
+                       inode->i_mapping->a_ops = &ext3_aops;
+               }
+-      } else 
++      } else {
++              inode->i_op = &ext3_special_inode_operations;
+               init_special_inode(inode, inode->i_mode,
+                                  le32_to_cpu(iloc.raw_inode->i_block[0]));
++      }
+       /* inode->i_attr_flags = 0;                             unused */
+       if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */
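
The ext3_inode_is_fast_symlink() helper added above is needed because a fast symlink that also owns an extended attribute block no longer has i_blocks == 0, so the old !inode->i_blocks test in ext3_read_inode() would misclassify it as a page symlink. A standalone sketch of the arithmetic, assuming a 1024-byte block size:

#include <stdio.h>

int main(void)
{
        unsigned long blocksize = 1024;             /* assumed fs block size */
        unsigned long ea_blocks = blocksize >> 9;   /* i_file_acl != 0: 2 sectors */
        unsigned long i_blocks  = ea_blocks;        /* fast symlink + one EA block */

        printf("old test (!i_blocks):             %s\n",
               !i_blocks ? "fast symlink" : "page symlink");
        printf("new test (i_blocks - ea_blocks):  %s\n",
               i_blocks - ea_blocks == 0 ? "fast symlink" : "page symlink");
        return 0;
}
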
+--- linux-rh-2.4.20-8/fs/ext3/namei.c~linux-2.4.20-xattr-0.8.54-chaos  2003-05-07 17:33:59.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext3/namei.c     2003-05-07 17:34:25.000000000 +0800
+@@ -29,6 +29,7 @@
+ #include <linux/sched.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/fcntl.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+@@ -1613,7 +1614,7 @@ static int ext3_mkdir(struct inode * dir
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+-      inode = ext3_new_inode (handle, dir, S_IFDIR);
++      inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_stop;
+@@ -1621,7 +1622,6 @@ static int ext3_mkdir(struct inode * dir
+       inode->i_op = &ext3_dir_inode_operations;
+       inode->i_fop = &ext3_dir_operations;
+       inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+-      inode->i_blocks = 0;    
+       dir_block = ext3_bread (handle, inode, 0, 1, &err);
+       if (!dir_block) {
+               inode->i_nlink--; /* is this nlink == 0? */
+@@ -1648,9 +1648,6 @@ static int ext3_mkdir(struct inode * dir
+       BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
+       ext3_journal_dirty_metadata(handle, dir_block);
+       brelse (dir_block);
+-      inode->i_mode = S_IFDIR | mode;
+-      if (dir->i_mode & S_ISGID)
+-              inode->i_mode |= S_ISGID;
+       ext3_mark_inode_dirty(handle, inode);
+       err = ext3_add_entry (handle, dentry, inode);
+       if (err) {
+@@ -2019,7 +2016,7 @@ static int ext3_symlink (struct inode * 
+               goto out_stop;
+       if (l > sizeof (EXT3_I(inode)->i_data)) {
+-              inode->i_op = &page_symlink_inode_operations;
++              inode->i_op = &ext3_symlink_inode_operations;
+               inode->i_mapping->a_ops = &ext3_aops;
+               /*
+                * block_symlink() calls back into ext3_prepare/commit_write.
+@@ -2245,4 +2242,16 @@ struct inode_operations ext3_dir_inode_o
+       rmdir:          ext3_rmdir,             /* BKL held */
+       mknod:          ext3_mknod,             /* BKL held */
+       rename:         ext3_rename,            /* BKL held */
++      setxattr:       ext3_setxattr,          /* BKL held */
++      getxattr:       ext3_getxattr,          /* BKL held */
++      listxattr:      ext3_listxattr,         /* BKL held */
++      removexattr:    ext3_removexattr,       /* BKL held */
+ };
++
++struct inode_operations ext3_special_inode_operations = {
++      setxattr:       ext3_setxattr,          /* BKL held */
++      getxattr:       ext3_getxattr,          /* BKL held */
++      listxattr:      ext3_listxattr,         /* BKL held */
++      removexattr:    ext3_removexattr,       /* BKL held */
++};
++
+--- linux-rh-2.4.20-8/fs/ext3/super.c~linux-2.4.20-xattr-0.8.54-chaos  2003-05-07 17:33:59.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext3/super.c     2003-05-07 17:40:45.000000000 +0800
+@@ -24,6 +24,7 @@
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/slab.h>
+ #include <linux/init.h>
+ #include <linux/locks.h>
+@@ -406,6 +407,7 @@ void ext3_put_super (struct super_block 
+       kdev_t j_dev = sbi->s_journal->j_dev;
+       int i;
++      ext3_xattr_put_super(sb);
+       journal_destroy(sbi->s_journal);
+       if (!(sb->s_flags & MS_RDONLY)) {
+               EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+@@ -502,6 +504,7 @@ static int parse_options (char * options
+                         int is_remount)
+ {
+       unsigned long *mount_options = &sbi->s_mount_opt;
++      
+       uid_t *resuid = &sbi->s_resuid;
+       gid_t *resgid = &sbi->s_resgid;
+       char * this_char;
+@@ -514,6 +517,13 @@ static int parse_options (char * options
+            this_char = strtok (NULL, ",")) {
+               if ((value = strchr (this_char, '=')) != NULL)
+                       *value++ = 0;
++#ifdef CONFIG_EXT3_FS_XATTR_USER
++              if (!strcmp (this_char, "user_xattr"))
++                      set_opt (*mount_options, XATTR_USER);
++              else if (!strcmp (this_char, "nouser_xattr"))
++                      clear_opt (*mount_options, XATTR_USER);
++              else
++#endif
+               if (!strcmp (this_char, "bsddf"))
+                       clear_opt (*mount_options, MINIX_DF);
+               else if (!strcmp (this_char, "nouid32")) {
+@@ -931,6 +941,12 @@ struct super_block * ext3_read_super (st
+       sbi->s_mount_opt = 0;
+       sbi->s_resuid = EXT3_DEF_RESUID;
+       sbi->s_resgid = EXT3_DEF_RESGID;
++
++      /* Default extended attribute flags */
++#ifdef CONFIG_EXT3_FS_XATTR_USER
++      /* set_opt(sbi->s_mount_opt, XATTR_USER); */
++#endif
++
+       if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) {
+               sb->s_dev = 0;
+               goto out_fail;
+@@ -1768,17 +1784,29 @@ static DECLARE_FSTYPE_DEV(ext3_fs_type, 
+ static int __init init_ext3_fs(void)
+ {
+-        return register_filesystem(&ext3_fs_type);
++      int error = init_ext3_xattr();
++      if (error)
++              return error;
++      error = init_ext3_xattr_user();
++      if (error)
++              goto fail;
++      error = register_filesystem(&ext3_fs_type);
++      if (!error)
++              return 0;
++      
++      exit_ext3_xattr_user();
++fail:
++      exit_ext3_xattr();
++      return error;
+ }
+ static void __exit exit_ext3_fs(void)
+ {
+       unregister_filesystem(&ext3_fs_type);
++      exit_ext3_xattr_user();
++      exit_ext3_xattr();
+ }
+-EXPORT_SYMBOL(ext3_force_commit);
+-EXPORT_SYMBOL(ext3_bread);
+-
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
+ MODULE_LICENSE("GPL");
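
parse_options() above accepts user_xattr and nouser_xattr, and the commented-out set_opt() in ext3_read_super() shows that user attributes stay disabled unless requested. A minimal sketch of enabling them at mount time through mount(2); the device and mount point are placeholders:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* "/dev/hda1" and "/mnt" are placeholders for a real ext3 volume. */
        if (mount("/dev/hda1", "/mnt", "ext3", 0, "user_xattr") != 0) {
                perror("mount");
                return 1;
        }
        return 0;
}
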
+--- linux-rh-2.4.20-8/fs/ext3/symlink.c~linux-2.4.20-xattr-0.8.54-chaos        2001-11-10 06:25:04.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext3/symlink.c   2003-05-07 17:34:25.000000000 +0800
+@@ -20,6 +20,7 @@
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
+ static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen)
+ {
+@@ -33,7 +34,20 @@ static int ext3_follow_link(struct dentr
+       return vfs_follow_link(nd, s);
+ }
++struct inode_operations ext3_symlink_inode_operations = {
++      readlink:       page_readlink,          /* BKL not held.  Don't need */
++      follow_link:    page_follow_link,       /* BKL not held.  Don't need */
++      setxattr:       ext3_setxattr,          /* BKL held */
++      getxattr:       ext3_getxattr,          /* BKL held */
++      listxattr:      ext3_listxattr,         /* BKL held */
++      removexattr:    ext3_removexattr,       /* BKL held */
++};
++
+ struct inode_operations ext3_fast_symlink_inode_operations = {
+       readlink:       ext3_readlink,          /* BKL not held.  Don't need */
+       follow_link:    ext3_follow_link,       /* BKL not held.  Don't need */
++      setxattr:       ext3_setxattr,          /* BKL held */
++      getxattr:       ext3_getxattr,          /* BKL held */
++      listxattr:      ext3_listxattr,         /* BKL held */
++      removexattr:    ext3_removexattr,       /* BKL held */
+ };
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext3/xattr.c     2003-05-07 17:42:06.000000000 +0800
+@@ -0,0 +1,1225 @@
++/*
++ * linux/fs/ext3/xattr.c
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ *
++ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
++ * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>.
++ * Extended attributes for symlinks and special files added per
++ *  suggestion of Luka Renko <luka.renko@hermes.si>.
++ */
++
++/*
++ * Extended attributes are stored on disk blocks allocated outside of
++ * any inode. The i_file_acl field is then made to point to this allocated
++ * block. If several inodes have identical sets of extended attributes,
++ * they may share the same extended attribute block. Such situations
++ * are automatically detected by keeping a cache of recent attribute block
++ * numbers and hashes over the block's contents in memory.
++ *
++ *
++ * Extended attribute block layout:
++ *
++ *   +------------------+
++ *   | header           |
++ *   | entry 1          | |
++ *   | entry 2          | | growing downwards
++ *   | entry 3          | v
++ *   | four null bytes  |
++ *   | . . .            |
++ *   | value 1          | ^
++ *   | value 3          | | growing upwards
++ *   | value 2          | |
++ *   +------------------+
++ *
++ * The block header is followed by multiple entry descriptors. These entry
++ * descriptors are variable in size, and aligned to EXT3_XATTR_PAD
++ * byte boundaries. The entry descriptors are sorted by attribute name,
++ * so that two extended attribute blocks can be compared efficiently.
++ *
++ * Attribute values are aligned to the end of the block, stored in
++ * no specific order. They are also padded to EXT3_XATTR_PAD byte
++ * boundaries. No additional gaps are left between them.
++ *
++ * Locking strategy
++ * ----------------
++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of
++ * the xattr inode operations are called, so we are guaranteed that only one
++ * process accesses extended attributes of an inode at any time.
++ *
++ * For writing we also grab the ext3_xattr_sem semaphore. This ensures that
++ * only a single process is modifying an extended attribute block, even
++ * if the block is shared among inodes.
++ *
++ * Note for porting to 2.5
++ * -----------------------
++ * The BKL will no longer be held in the xattr inode operations.
++ */
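
For orientation, the header and entry descriptors that this layout comment refers to look roughly as sketched below. The fields are reconstructed from their uses in this file; the authoritative definitions live in include/linux/ext3_xattr.h, which is outside this excerpt, so treat the exact layout as an assumption:

#include <stdint.h>

typedef uint8_t  __u8;          /* stand-ins for the kernel's fixed-width types */
typedef uint16_t __u16;
typedef uint32_t __u32;

struct ext3_xattr_header {      /* at the start of the attribute block */
        __u32   h_magic;        /* EXT3_XATTR_MAGIC */
        __u32   h_refcount;     /* number of inodes sharing this block */
        __u32   h_blocks;       /* always 1 in this code */
        __u32   h_hash;         /* hash of all entries; 0 means never share */
        /* the real header ends in reserved/padding words */
};

struct ext3_xattr_entry {       /* one descriptor per attribute */
        __u8    e_name_len;     /* length of e_name */
        __u8    e_name_index;   /* handler index, e.g. EXT3_XATTR_INDEX_USER */
        __u16   e_value_offs;   /* offset of the value from the header */
        __u32   e_value_block;  /* unused here; must be 0 */
        __u32   e_value_size;   /* value length in bytes */
        __u32   e_hash;         /* hash of name and value */
        char    e_name[0];      /* name, not null-terminated */
};
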
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/locks.h>
++#include <linux/slab.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
++#include <linux/mbcache.h>
++#include <linux/quotaops.h>
++#include <asm/semaphore.h>
++#include <linux/compatmac.h>
++
++#define EXT3_EA_USER "user."
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1)
++#endif
++
++#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data))
++#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr))
++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
++
++#ifdef EXT3_XATTR_DEBUG
++# define ea_idebug(inode, f...) do { \
++              printk(KERN_DEBUG "inode %s:%ld: ", \
++                      kdevname(inode->i_dev), inode->i_ino); \
++              printk(f); \
++              printk("\n"); \
++      } while (0)
++# define ea_bdebug(bh, f...) do { \
++              printk(KERN_DEBUG "block %s:%ld: ", \
++                      kdevname(bh->b_dev), bh->b_blocknr); \
++              printk(f); \
++              printk("\n"); \
++      } while (0)
++#else
++# define ea_idebug(f...)
++# define ea_bdebug(f...)
++#endif
++
++static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *,
++                         struct ext3_xattr_header *);
++
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++
++static int ext3_xattr_cache_insert(struct buffer_head *);
++static struct buffer_head *ext3_xattr_cache_find(struct inode *,
++                                               struct ext3_xattr_header *);
++static void ext3_xattr_cache_remove(struct buffer_head *);
++static void ext3_xattr_rehash(struct ext3_xattr_header *,
++                            struct ext3_xattr_entry *);
++
++static struct mb_cache *ext3_xattr_cache;
++
++#else
++# define ext3_xattr_cache_insert(bh) 0
++# define ext3_xattr_cache_find(inode, header) NULL
++# define ext3_xattr_cache_remove(bh) while(0) {}
++# define ext3_xattr_rehash(header, entry) while(0) {}
++#endif
++
++/*
++ * If a file system does not share extended attributes among inodes,
++ * we should not need the ext3_xattr_sem semaphore. However, the
++ * filesystem may still contain shared blocks, so we always take
++ * the lock.
++ */
++
++DECLARE_MUTEX(ext3_xattr_sem);
++
++static inline int
++ext3_xattr_new_block(handle_t *handle, struct inode *inode,
++                   int * errp, int force)
++{
++      struct super_block *sb = inode->i_sb;
++      int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
++              EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb);
++
++      /* How can we enforce the allocation? */
++      int block = ext3_new_block(handle, inode, goal, 0, 0, errp);
++#ifdef OLD_QUOTAS
++      if (!*errp)
++              inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#endif
++      return block;
++}
++
++static inline int
++ext3_xattr_quota_alloc(struct inode *inode, int force)
++{
++      /* How can we enforce the allocation? */
++#ifdef OLD_QUOTAS
++      int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
++      if (!error)
++              inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#else
++      int error = DQUOT_ALLOC_BLOCK(inode, 1);
++#endif
++      return error;
++}
++
++#ifdef OLD_QUOTAS
++
++static inline void
++ext3_xattr_quota_free(struct inode *inode)
++{
++      DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
++      inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++static inline void
++ext3_xattr_free_block(handle_t *handle, struct inode * inode,
++                    unsigned long block)
++{
++      ext3_free_blocks(handle, inode, block, 1);
++      inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++#else
++# define ext3_xattr_quota_free(inode) \
++      DQUOT_FREE_BLOCK(inode, 1)
++# define ext3_xattr_free_block(handle, inode, block) \
++      ext3_free_blocks(handle, inode, block, 1)
++#endif
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
++
++static inline struct buffer_head *
++sb_bread(struct super_block *sb, int block)
++{
++      return bread(sb->s_dev, block, sb->s_blocksize);
++}
++
++static inline struct buffer_head *
++sb_getblk(struct super_block *sb, int block)
++{
++      return getblk(sb->s_dev, block, sb->s_blocksize);
++}
++
++#endif
++
++struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX];
++rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED;
++
++int
++ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler)
++{
++      int error = -EINVAL;
++
++      if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++              write_lock(&ext3_handler_lock);
++              if (!ext3_xattr_handlers[name_index-1]) {
++                      ext3_xattr_handlers[name_index-1] = handler;
++                      error = 0;
++              }
++              write_unlock(&ext3_handler_lock);
++      }
++      return error;
++}
++
++void
++ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler)
++{
++      if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++              write_lock(&ext3_handler_lock);
++              ext3_xattr_handlers[name_index-1] = NULL;
++              write_unlock(&ext3_handler_lock);
++      }
++}
++
++static inline const char *
++strcmp_prefix(const char *a, const char *a_prefix)
++{
++      while (*a_prefix && *a == *a_prefix) {
++              a++;
++              a_prefix++;
++      }
++      return *a_prefix ? NULL : a;
++}
++
++/*
++ * Decode the extended attribute name, and translate it into
++ * the name_index and name suffix.
++ */
++static inline struct ext3_xattr_handler *
++ext3_xattr_resolve_name(const char **name)
++{
++      struct ext3_xattr_handler *handler = NULL;
++      int i;
++
++      if (!*name)
++              return NULL;
++      read_lock(&ext3_handler_lock);
++      for (i=0; i<EXT3_XATTR_INDEX_MAX; i++) {
++              if (ext3_xattr_handlers[i]) {
++                      const char *n = strcmp_prefix(*name,
++                              ext3_xattr_handlers[i]->prefix);
++                      if (n) {
++                              handler = ext3_xattr_handlers[i];
++                              *name = n;
++                              break;
++                      }
++              }
++      }
++      read_unlock(&ext3_handler_lock);
++      return handler;
++}
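
Name resolution above is simple prefix stripping: ext3_xattr_resolve_name() walks the handler table and, on a match, hands the handler the remainder of the name. A standalone sketch of that step, reusing the strcmp_prefix() logic and the "user." prefix from this patch:

#include <stdio.h>

/* Same logic as strcmp_prefix() above: returns the suffix on a match. */
static const char *strcmp_prefix(const char *a, const char *a_prefix)
{
        while (*a_prefix && *a == *a_prefix) {
                a++;
                a_prefix++;
        }
        return *a_prefix ? NULL : a;
}

int main(void)
{
        const char *suffix = strcmp_prefix("user.comment", "user.");

        printf("%s\n", suffix ? suffix : "(no match)");  /* prints "comment" */
        return 0;
}
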
++
++static inline struct ext3_xattr_handler *
++ext3_xattr_handler(int name_index)
++{
++      struct ext3_xattr_handler *handler = NULL;
++      if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++              read_lock(&ext3_handler_lock);
++              handler = ext3_xattr_handlers[name_index-1];
++              read_unlock(&ext3_handler_lock);
++      }
++      return handler;
++}
++
++/*
++ * Inode operation getxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_getxattr(struct dentry *dentry, const char *name,
++            void *buffer, size_t size)
++{
++      struct ext3_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      handler = ext3_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->get(inode, name, buffer, size);
++}
++
++/*
++ * Inode operation listxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++      return ext3_xattr_list(dentry->d_inode, buffer, size);
++}
++
++/*
++ * Inode operation setxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext3_setxattr(struct dentry *dentry, const char *name,
++            const void *value, size_t size, int flags)
++{
++      struct ext3_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      if (size == 0)
++              value = "";  /* empty EA, do not remove */
++      handler = ext3_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->set(inode, name, value, size, flags);
++}
++
++/*
++ * Inode operation removexattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext3_removexattr(struct dentry *dentry, const char *name)
++{
++      struct ext3_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      handler = ext3_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
++}
++
++/*
++ * ext3_xattr_get()
++ *
++ * Copy an extended attribute into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext3_xattr_get(struct inode *inode, int name_index, const char *name,
++             void *buffer, size_t buffer_size)
++{
++      struct buffer_head *bh = NULL;
++      struct ext3_xattr_entry *entry;
++      unsigned int block, size;
++      char *end;
++      int name_len, error;
++
++      ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
++                name_index, name, buffer, (long)buffer_size);
++
++      if (name == NULL)
++              return -EINVAL;
++      if (!EXT3_I(inode)->i_file_acl)
++              return -ENOATTR;
++      block = EXT3_I(inode)->i_file_acl;
++      ea_idebug(inode, "reading block %d", block);
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh)
++              return -EIO;
++      ea_bdebug(bh, "b_count=%d, refcount=%d",
++              atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++      end = bh->b_data + bh->b_size;
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block:    ext3_error(inode->i_sb, "ext3_xattr_get",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              error = -EIO;
++              goto cleanup;
++      }
++      /* find named attribute */
++      name_len = strlen(name);
++
++      error = -ERANGE;
++      if (name_len > 255)
++              goto cleanup;
++      entry = FIRST_ENTRY(bh);
++      while (!IS_LAST_ENTRY(entry)) {
++              struct ext3_xattr_entry *next =
++                      EXT3_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++              if (name_index == entry->e_name_index &&
++                  name_len == entry->e_name_len &&
++                  memcmp(name, entry->e_name, name_len) == 0)
++                      goto found;
++              entry = next;
++      }
++      /* Check the remaining name entries */
++      while (!IS_LAST_ENTRY(entry)) {
++              struct ext3_xattr_entry *next =
++                      EXT3_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++              entry = next;
++      }
++      if (ext3_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      error = -ENOATTR;
++      goto cleanup;
++found:
++      /* check the buffer size */
++      if (entry->e_value_block != 0)
++              goto bad_block;
++      size = le32_to_cpu(entry->e_value_size);
++      if (size > inode->i_sb->s_blocksize ||
++          le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
++              goto bad_block;
++
++      if (ext3_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      if (buffer) {
++              error = -ERANGE;
++              if (size > buffer_size)
++                      goto cleanup;
++              /* return value of attribute */
++              memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
++                      size);
++      }
++      error = size;
++
++cleanup:
++      brelse(bh);
++
++      return error;
++}
++
++/*
++ * ext3_xattr_list()
++ *
++ * Copy a list of attribute names into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
++{
++      struct buffer_head *bh = NULL;
++      struct ext3_xattr_entry *entry;
++      unsigned int block, size = 0;
++      char *buf, *end;
++      int error;
++
++      ea_idebug(inode, "buffer=%p, buffer_size=%ld",
++                buffer, (long)buffer_size);
++
++      if (!EXT3_I(inode)->i_file_acl)
++              return 0;
++      block = EXT3_I(inode)->i_file_acl;
++      ea_idebug(inode, "reading block %d", block);
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh)
++              return -EIO;
++      ea_bdebug(bh, "b_count=%d, refcount=%d",
++              atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++      end = bh->b_data + bh->b_size;
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block:    ext3_error(inode->i_sb, "ext3_xattr_list",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              error = -EIO;
++              goto cleanup;
++      }
++      /* compute the size required for the list of attribute names */
++      for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++           entry = EXT3_XATTR_NEXT(entry)) {
++              struct ext3_xattr_handler *handler;
++              struct ext3_xattr_entry *next =
++                      EXT3_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++
++              handler = ext3_xattr_handler(entry->e_name_index);
++              if (handler)
++                      size += handler->list(NULL, inode, entry->e_name,
++                                            entry->e_name_len);
++      }
++
++      if (ext3_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      if (!buffer) {
++              error = size;
++              goto cleanup;
++      } else {
++              error = -ERANGE;
++              if (size > buffer_size)
++                      goto cleanup;
++      }
++
++      /* list the attribute names */
++      buf = buffer;
++      for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++           entry = EXT3_XATTR_NEXT(entry)) {
++              struct ext3_xattr_handler *handler;
++
++              handler = ext3_xattr_handler(entry->e_name_index);
++              if (handler)
++                      buf += handler->list(buf, inode, entry->e_name,
++                                           entry->e_name_len);
++      }
++      error = size;
++
++cleanup:
++      brelse(bh);
++
++      return error;
++}
++
++/*
++ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is
++ * not set, set it.
++ */
++static void ext3_xattr_update_super_block(handle_t *handle,
++                                        struct super_block *sb)
++{
++      if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
++              return;
++
++      lock_super(sb);
++      ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++      EXT3_SB(sb)->s_feature_compat |= EXT3_FEATURE_COMPAT_EXT_ATTR;
++#endif
++      EXT3_SB(sb)->s_es->s_feature_compat |=
++              cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR);
++      sb->s_dirt = 1;
++      ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
++      unlock_super(sb);
++}
++
++/*
++ * ext3_xattr_set()
++ *
++ * Create, replace or remove an extended attribute for this inode. Buffer
++ * is NULL to remove an existing extended attribute, and non-NULL to
++ * either replace an existing extended attribute, or create a new extended
++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
++ * specify that an extended attribute must exist and must not exist
++ * previous to the call, respectively.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++int
++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
++             const char *name, const void *value, size_t value_len, int flags)
++{
++      struct super_block *sb = inode->i_sb;
++      struct buffer_head *bh = NULL;
++      struct ext3_xattr_header *header = NULL;
++      struct ext3_xattr_entry *here, *last;
++      unsigned int name_len;
++      int block = EXT3_I(inode)->i_file_acl;
++      int min_offs = sb->s_blocksize, not_found = 1, free, error;
++      char *end;
++      
++      /*
++       * header -- Points either into bh, or to a temporarily
++       *           allocated buffer.
++       * here -- The named entry found, or the place for inserting, within
++       *         the block pointed to by header.
++       * last -- Points right after the last named entry within the block
++       *         pointed to by header.
++       * min_offs -- The offset of the first value (values are aligned
++       *             towards the end of the block).
++       * end -- Points right after the block pointed to by header.
++       */
++      
++      ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
++                name_index, name, value, (long)value_len);
++
++      if (IS_RDONLY(inode))
++              return -EROFS;
++      if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
++              return -EPERM;
++      if (value == NULL)
++              value_len = 0;
++      if (name == NULL)
++              return -EINVAL;
++      name_len = strlen(name);
++      if (name_len > 255 || value_len > sb->s_blocksize)
++              return -ERANGE;
++      down(&ext3_xattr_sem);
++
++      if (block) {
++              /* The inode already has an extended attribute block. */
++              bh = sb_bread(sb, block);
++              error = -EIO;
++              if (!bh)
++                      goto cleanup;
++              ea_bdebug(bh, "b_count=%d, refcount=%d",
++                      atomic_read(&(bh->b_count)),
++                      le32_to_cpu(HDR(bh)->h_refcount));
++              header = HDR(bh);
++              end = bh->b_data + bh->b_size;
++              if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++                  header->h_blocks != cpu_to_le32(1)) {
++bad_block:            ext3_error(sb, "ext3_xattr_set",
++                              "inode %ld: bad block %d", inode->i_ino, block);
++                      error = -EIO;
++                      goto cleanup;
++              }
++              /* Find the named attribute. */
++              here = FIRST_ENTRY(bh);
++              while (!IS_LAST_ENTRY(here)) {
++                      struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here);
++                      if ((char *)next >= end)
++                              goto bad_block;
++                      if (!here->e_value_block && here->e_value_size) {
++                              int offs = le16_to_cpu(here->e_value_offs);
++                              if (offs < min_offs)
++                                      min_offs = offs;
++                      }
++                      not_found = name_index - here->e_name_index;
++                      if (!not_found)
++                              not_found = name_len - here->e_name_len;
++                      if (!not_found)
++                              not_found = memcmp(name, here->e_name,name_len);
++                      if (not_found <= 0)
++                              break;
++                      here = next;
++              }
++              last = here;
++              /* We still need to compute min_offs and last. */
++              while (!IS_LAST_ENTRY(last)) {
++                      struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last);
++                      if ((char *)next >= end)
++                              goto bad_block;
++                      if (!last->e_value_block && last->e_value_size) {
++                              int offs = le16_to_cpu(last->e_value_offs);
++                              if (offs < min_offs)
++                                      min_offs = offs;
++                      }
++                      last = next;
++              }
++
++              /* Check whether we have enough space left. */
++              free = min_offs - ((char*)last - (char*)header) - sizeof(__u32);
++      } else {
++              /* We will use a new extended attribute block. */
++              free = sb->s_blocksize -
++                      sizeof(struct ext3_xattr_header) - sizeof(__u32);
++              here = last = NULL;  /* avoid gcc uninitialized warning. */
++      }
++
++      if (not_found) {
++              /* Request to remove a nonexistent attribute? */
++              error = -ENOATTR;
++              if (flags & XATTR_REPLACE)
++                      goto cleanup;
++              error = 0;
++              if (value == NULL)
++                      goto cleanup;
++              else
++                      free -= EXT3_XATTR_LEN(name_len);
++      } else {
++              /* Request to create an existing attribute? */
++              error = -EEXIST;
++              if (flags & XATTR_CREATE)
++                      goto cleanup;
++              if (!here->e_value_block && here->e_value_size) {
++                      unsigned int size = le32_to_cpu(here->e_value_size);
++
++                      if (le16_to_cpu(here->e_value_offs) + size > 
++                          sb->s_blocksize || size > sb->s_blocksize)
++                              goto bad_block;
++                      free += EXT3_XATTR_SIZE(size);
++              }
++      }
++      free -= EXT3_XATTR_SIZE(value_len);
++      error = -ENOSPC;
++      if (free < 0)
++              goto cleanup;
++
++      /* Here we know that we can set the new attribute. */
++
++      if (header) {
++              if (header->h_refcount == cpu_to_le32(1)) {
++                      ea_bdebug(bh, "modifying in-place");
++                      ext3_xattr_cache_remove(bh);
++                      error = ext3_journal_get_write_access(handle, bh);
++                      if (error)
++                              goto cleanup;
++              } else {
++                      int offset;
++
++                      ea_bdebug(bh, "cloning");
++                      header = kmalloc(bh->b_size, GFP_KERNEL);
++                      error = -ENOMEM;
++                      if (header == NULL)
++                              goto cleanup;
++                      memcpy(header, HDR(bh), bh->b_size);
++                      header->h_refcount = cpu_to_le32(1);
++                      offset = (char *)header - bh->b_data;
++                      here = ENTRY((char *)here + offset);
++                      last = ENTRY((char *)last + offset);
++              }
++      } else {
++              /* Allocate a buffer where we construct the new block. */
++              header = kmalloc(sb->s_blocksize, GFP_KERNEL);
++              error = -ENOMEM;
++              if (header == NULL)
++                      goto cleanup;
++              memset(header, 0, sb->s_blocksize);
++              end = (char *)header + sb->s_blocksize;
++              header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
++              header->h_blocks = header->h_refcount = cpu_to_le32(1);
++              last = here = ENTRY(header+1);
++      }
++
++      if (not_found) {
++              /* Insert the new name. */
++              int size = EXT3_XATTR_LEN(name_len);
++              int rest = (char *)last - (char *)here;
++              memmove((char *)here + size, here, rest);
++              memset(here, 0, size);
++              here->e_name_index = name_index;
++              here->e_name_len = name_len;
++              memcpy(here->e_name, name, name_len);
++      } else {
++              /* Remove the old value. */
++              if (!here->e_value_block && here->e_value_size) {
++                      char *first_val = (char *)header + min_offs;
++                      int offs = le16_to_cpu(here->e_value_offs);
++                      char *val = (char *)header + offs;
++                      size_t size = EXT3_XATTR_SIZE(
++                              le32_to_cpu(here->e_value_size));
++                      memmove(first_val + size, first_val, val - first_val);
++                      memset(first_val, 0, size);
++                      here->e_value_offs = 0;
++                      min_offs += size;
++
++                      /* Adjust all value offsets. */
++                      last = ENTRY(header+1);
++                      while (!IS_LAST_ENTRY(last)) {
++                              int o = le16_to_cpu(last->e_value_offs);
++                              if (!last->e_value_block && o < offs)
++                                      last->e_value_offs =
++                                              cpu_to_le16(o + size);
++                              last = EXT3_XATTR_NEXT(last);
++                      }
++              }
++              if (value == NULL) {
++                      /* Remove this attribute. */
++                      if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) {
++                              /* This block is now empty. */
++                              error = ext3_xattr_set2(handle, inode, bh,NULL);
++                              goto cleanup;
++                      } else {
++                              /* Remove the old name. */
++                              int size = EXT3_XATTR_LEN(name_len);
++                              last = ENTRY((char *)last - size);
++                              memmove(here, (char*)here + size,
++                                      (char*)last - (char*)here);
++                              memset(last, 0, size);
++                      }
++              }
++      }
++
++      if (value != NULL) {
++              /* Insert the new value. */
++              here->e_value_size = cpu_to_le32(value_len);
++              if (value_len) {
++                      size_t size = EXT3_XATTR_SIZE(value_len);
++                      char *val = (char *)header + min_offs - size;
++                      here->e_value_offs =
++                              cpu_to_le16((char *)val - (char *)header);
++                      memset(val + size - EXT3_XATTR_PAD, 0,
++                             EXT3_XATTR_PAD); /* Clear the pad bytes. */
++                      memcpy(val, value, value_len);
++              }
++      }
++      ext3_xattr_rehash(header, here);
++
++      error = ext3_xattr_set2(handle, inode, bh, header);
++
++cleanup:
++      brelse(bh);
++      if (!(bh && header == HDR(bh)))
++              kfree(header);
++      up(&ext3_xattr_sem);
++
++      return error;
++}
++
++/*
++ * Second half of ext3_xattr_set(): Update the file system.
++ */
++static int
++ext3_xattr_set2(handle_t *handle, struct inode *inode,
++              struct buffer_head *old_bh, struct ext3_xattr_header *header)
++{
++      struct super_block *sb = inode->i_sb;
++      struct buffer_head *new_bh = NULL;
++      int error;
++
++      if (header) {
++              new_bh = ext3_xattr_cache_find(inode, header);
++              if (new_bh) {
++                      /*
++                       * We found an identical block in the cache.
++                       * The old block will be released after updating
++                       * the inode.
++                       */
++                      ea_bdebug(old_bh, "reusing block %ld",
++                              new_bh->b_blocknr);
++                      
++                      error = -EDQUOT;
++                      if (ext3_xattr_quota_alloc(inode, 1))
++                              goto cleanup;
++                      
++                      error = ext3_journal_get_write_access(handle, new_bh);
++                      if (error)
++                              goto cleanup;
++                      HDR(new_bh)->h_refcount = cpu_to_le32(
++                              le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
++                      ea_bdebug(new_bh, "refcount now=%d",
++                              le32_to_cpu(HDR(new_bh)->h_refcount));
++              } else if (old_bh && header == HDR(old_bh)) {
++                      /* Keep this block. */
++                      new_bh = old_bh;
++                      ext3_xattr_cache_insert(new_bh);
++              } else {
++                      /* We need to allocate a new block */
++                      int force = EXT3_I(inode)->i_file_acl != 0;
++                      int block = ext3_xattr_new_block(handle, inode,
++                                                       &error, force);
++                      if (error)
++                              goto cleanup;
++                      ea_idebug(inode, "creating block %d", block);
++
++                      new_bh = sb_getblk(sb, block);
++                      if (!new_bh) {
++getblk_failed:                        ext3_xattr_free_block(handle, inode, block);
++                              error = -EIO;
++                              goto cleanup;
++                      }
++                      lock_buffer(new_bh);
++                      error = ext3_journal_get_create_access(handle, new_bh);
++                      if (error) {
++                              unlock_buffer(new_bh);
++                              goto getblk_failed;
++                      }
++                      memcpy(new_bh->b_data, header, new_bh->b_size);
++                      mark_buffer_uptodate(new_bh, 1);
++                      unlock_buffer(new_bh);
++                      ext3_xattr_cache_insert(new_bh);
++                      
++                      ext3_xattr_update_super_block(handle, sb);
++              }
++              error = ext3_journal_dirty_metadata(handle, new_bh);
++              if (error)
++                      goto cleanup;
++      }
++
++      /* Update the inode. */
++      EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
++      inode->i_ctime = CURRENT_TIME;
++      ext3_mark_inode_dirty(handle, inode);
++      if (IS_SYNC(inode))
++              handle->h_sync = 1;
++
++      error = 0;
++      if (old_bh && old_bh != new_bh) {
++              /*
++               * If there was an old block, and we are not still using it,
++               * we now release the old block.
++              */
++              unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
++
++              error = ext3_journal_get_write_access(handle, old_bh);
++              if (error)
++                      goto cleanup;
++              if (refcount == 1) {
++                      /* Free the old block. */
++                      ea_bdebug(old_bh, "freeing");
++                      ext3_xattr_free_block(handle, inode, old_bh->b_blocknr);
++
++                      /* ext3_forget() calls bforget() for us, but we
++                         let our caller release old_bh, so we take an
++                         extra buffer reference (get_bh) beforehand. */
++                      get_bh(old_bh);
++                      ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr);
++              } else {
++                      /* Decrement the refcount only. */
++                      refcount--;
++                      HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
++                      ext3_xattr_quota_free(inode);
++                      ext3_journal_dirty_metadata(handle, old_bh);
++                      ea_bdebug(old_bh, "refcount now=%d", refcount);
++              }
++      }
++
++cleanup:
++      if (old_bh != new_bh)
++              brelse(new_bh);
++
++      return error;
++}
++
++/*
++ * ext3_xattr_delete_inode()
++ *
++ * Free extended attribute resources associated with this inode. This
++ * is called immediately before an inode is freed.
++ */
++void
++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
++{
++      struct buffer_head *bh;
++      unsigned int block = EXT3_I(inode)->i_file_acl;
++
++      if (!block)
++              return;
++      down(&ext3_xattr_sem);
++
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh) {
++              ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
++                      "inode %ld: block %d read error", inode->i_ino, block);
++              goto cleanup;
++      }
++      ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++              ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              goto cleanup;
++      }
++      ext3_journal_get_write_access(handle, bh);
++      ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
++      if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
++              ext3_xattr_cache_remove(bh);
++              ext3_xattr_free_block(handle, inode, block);
++              ext3_forget(handle, 1, inode, bh, block);
++              bh = NULL;
++      } else {
++              HDR(bh)->h_refcount = cpu_to_le32(
++                      le32_to_cpu(HDR(bh)->h_refcount) - 1);
++              ext3_journal_dirty_metadata(handle, bh);
++              if (IS_SYNC(inode))
++                      handle->h_sync = 1;
++              ext3_xattr_quota_free(inode);
++      }
++      EXT3_I(inode)->i_file_acl = 0;
++
++cleanup:
++      brelse(bh);
++      up(&ext3_xattr_sem);
++}
++
++/*
++ * ext3_xattr_put_super()
++ *
++ * This is called when a file system is unmounted.
++ */
++void
++ext3_xattr_put_super(struct super_block *sb)
++{
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++      mb_cache_shrink(ext3_xattr_cache, sb->s_dev);
++#endif
++}
++
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++
++/*
++ * ext3_xattr_cache_insert()
++ *
++ * Create a new entry in the extended attribute cache, and insert
++ * it unless such an entry is already in the cache.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++static int
++ext3_xattr_cache_insert(struct buffer_head *bh)
++{
++      __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
++      struct mb_cache_entry *ce;
++      int error;
++
++      ce = mb_cache_entry_alloc(ext3_xattr_cache);
++      if (!ce)
++              return -ENOMEM;
++      error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash);
++      if (error) {
++              mb_cache_entry_free(ce);
++              if (error == -EBUSY) {
++                      ea_bdebug(bh, "already in cache (%d cache entries)",
++                              atomic_read(&ext3_xattr_cache->c_entry_count));
++                      error = 0;
++              }
++      } else {
++              ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
++                        atomic_read(&ext3_xattr_cache->c_entry_count));
++              mb_cache_entry_release(ce);
++      }
++      return error;
++}
++
++/*
++ * ext3_xattr_cmp()
++ *
++ * Compare two extended attribute blocks for equality.
++ *
++ * Returns 0 if the blocks are equal, 1 if they differ, and
++ * a negative error number on errors.
++ */
++static int
++ext3_xattr_cmp(struct ext3_xattr_header *header1,
++             struct ext3_xattr_header *header2)
++{
++      struct ext3_xattr_entry *entry1, *entry2;
++
++      entry1 = ENTRY(header1+1);
++      entry2 = ENTRY(header2+1);
++      while (!IS_LAST_ENTRY(entry1)) {
++              if (IS_LAST_ENTRY(entry2))
++                      return 1;
++              if (entry1->e_hash != entry2->e_hash ||
++                  entry1->e_name_len != entry2->e_name_len ||
++                  entry1->e_value_size != entry2->e_value_size ||
++                  memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
++                      return 1;
++              if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
++                      return -EIO;
++              if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
++                         (char *)header2 + le16_to_cpu(entry2->e_value_offs),
++                         le32_to_cpu(entry1->e_value_size)))
++                      return 1;
++
++              entry1 = EXT3_XATTR_NEXT(entry1);
++              entry2 = EXT3_XATTR_NEXT(entry2);
++      }
++      if (!IS_LAST_ENTRY(entry2))
++              return 1;
++      return 0;
++}
++
++/*
++ * ext3_xattr_cache_find()
++ *
++ * Find an identical extended attribute block.
++ *
++ * Returns a pointer to the block found, or NULL if such a block was
++ * not found or an error occurred.
++ */
++static struct buffer_head *
++ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header)
++{
++      __u32 hash = le32_to_cpu(header->h_hash);
++      struct mb_cache_entry *ce;
++
++      if (!header->h_hash)
++              return NULL;  /* never share */
++      ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
++      ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_dev, hash);
++      while (ce) {
++              struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
++
++              if (!bh) {
++                      ext3_error(inode->i_sb, "ext3_xattr_cache_find",
++                              "inode %ld: block %ld read error",
++                              inode->i_ino, ce->e_block);
++              } else if (le32_to_cpu(HDR(bh)->h_refcount) >
++                         EXT3_XATTR_REFCOUNT_MAX) {
++                      ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block,
++                              le32_to_cpu(HDR(bh)->h_refcount),
++                              EXT3_XATTR_REFCOUNT_MAX);
++              } else if (!ext3_xattr_cmp(header, HDR(bh))) {
++                      ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
++                      mb_cache_entry_release(ce);
++                      return bh;
++              }
++              brelse(bh);
++              ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash);
++      }
++      return NULL;
++}
++
++/*
++ * ext3_xattr_cache_remove()
++ *
++ * Remove the cache entry of a block from the cache. Called when a
++ * block becomes invalid.
++ */
++static void
++ext3_xattr_cache_remove(struct buffer_head *bh)
++{
++      struct mb_cache_entry *ce;
++
++      ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_dev, bh->b_blocknr);
++      if (ce) {
++              ea_bdebug(bh, "removing (%d cache entries remaining)",
++                        atomic_read(&ext3_xattr_cache->c_entry_count)-1);
++              mb_cache_entry_free(ce);
++      } else 
++              ea_bdebug(bh, "no cache entry");
++}
++
++#define NAME_HASH_SHIFT 5
++#define VALUE_HASH_SHIFT 16
++
++/*
++ * ext3_xattr_hash_entry()
++ *
++ * Compute the hash of an extended attribute.
++ */
++static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header,
++                                       struct ext3_xattr_entry *entry)
++{
++      __u32 hash = 0;
++      char *name = entry->e_name;
++      int n;
++
++      for (n=0; n < entry->e_name_len; n++) {
++              hash = (hash << NAME_HASH_SHIFT) ^
++                     (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
++                     *name++;
++      }
++
++      if (entry->e_value_block == 0 && entry->e_value_size != 0) {
++              __u32 *value = (__u32 *)((char *)header +
++                      le16_to_cpu(entry->e_value_offs));
++              for (n = (le32_to_cpu(entry->e_value_size) +
++                   EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) {
++                      hash = (hash << VALUE_HASH_SHIFT) ^
++                             (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
++                             le32_to_cpu(*value++);
++              }
++      }
++      entry->e_hash = cpu_to_le32(hash);
++}
++
++#undef NAME_HASH_SHIFT
++#undef VALUE_HASH_SHIFT
++
++#define BLOCK_HASH_SHIFT 16
++
++/*
++ * ext3_xattr_rehash()
++ *
++ * Re-compute the extended attribute hash value after an entry has changed.
++ */
++static void ext3_xattr_rehash(struct ext3_xattr_header *header,
++                            struct ext3_xattr_entry *entry)
++{
++      struct ext3_xattr_entry *here;
++      __u32 hash = 0;
++      
++      ext3_xattr_hash_entry(header, entry);
++      here = ENTRY(header+1);
++      while (!IS_LAST_ENTRY(here)) {
++              if (!here->e_hash) {
++                      /* Block is not shared if an entry's hash value == 0 */
++                      hash = 0;
++                      break;
++              }
++              hash = (hash << BLOCK_HASH_SHIFT) ^
++                     (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
++                     le32_to_cpu(here->e_hash);
++              here = EXT3_XATTR_NEXT(here);
++      }
++      header->h_hash = cpu_to_le32(hash);
++}
++
++#undef BLOCK_HASH_SHIFT
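For reference, the two hash helpers above use a simple rotate-and-xor scheme. The user-space sketch below (ours, not part of the patch) reproduces just the name portion of ext3_xattr_hash_entry() with NAME_HASH_SHIFT = 5 so the constants can be checked in isolation; the function name and test string are made up.

/* Stand-alone sketch of the name hash used by ext3_xattr_hash_entry().
 * Assumes the same NAME_HASH_SHIFT = 5 rotate-and-xor scheme as above. */
#include <stdio.h>

static unsigned int xattr_name_hash(const char *name, int len)
{
        unsigned int hash = 0;
        int n;

        for (n = 0; n < len; n++)
                hash = (hash << 5) ^
                       (hash >> (8 * sizeof(hash) - 5)) ^
                       name[n];
        return hash;
}

int main(void)
{
        /* e.g. the name part of "user.foo" as stored in e_name */
        printf("hash(\"foo\") = %#x\n", xattr_name_hash("foo", 3));
        return 0;
}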
++
++int __init
++init_ext3_xattr(void)
++{
++      ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
++              sizeof(struct mb_cache_entry) +
++              sizeof(struct mb_cache_entry_index), 1, 61);
++      if (!ext3_xattr_cache)
++              return -ENOMEM;
++
++      return 0;
++}
++
++void
++exit_ext3_xattr(void)
++{
++      if (ext3_xattr_cache)
++              mb_cache_destroy(ext3_xattr_cache);
++      ext3_xattr_cache = NULL;
++}
++
++#else  /* CONFIG_EXT3_FS_XATTR_SHARING */
++
++int __init
++init_ext3_xattr(void)
++{
++      return 0;
++}
++
++void
++exit_ext3_xattr(void)
++{
++}
++
++#endif  /* CONFIG_EXT3_FS_XATTR_SHARING */
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/ext3/xattr_user.c        2003-05-07 17:34:25.000000000 +0800
+@@ -0,0 +1,111 @@
++/*
++ * linux/fs/ext3/xattr_user.c
++ * Handler for extended user attributes.
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
++
++#ifdef CONFIG_EXT3_FS_POSIX_ACL
++# include <linux/ext3_acl.h>
++#endif
++
++#define XATTR_USER_PREFIX "user."
++
++static size_t
++ext3_xattr_user_list(char *list, struct inode *inode,
++                   const char *name, int name_len)
++{
++      const int prefix_len = sizeof(XATTR_USER_PREFIX)-1;
++
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return 0;
++
++      if (list) {
++              memcpy(list, XATTR_USER_PREFIX, prefix_len);
++              memcpy(list+prefix_len, name, name_len);
++              list[prefix_len + name_len] = '\0';
++      }
++      return prefix_len + name_len + 1;
++}
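To make the list format concrete: for an attribute named "foo", the handler above emits the bytes "user.foo" followed by a NUL and reports a length of 5 + 3 + 1 = 9. The user-space sketch below (ours, not part of the patch) mirrors that layout; list_one() is a hypothetical name.

/* Sketch (ours) of the entry layout produced by ext3_xattr_user_list(). */
#include <stdio.h>
#include <string.h>

#define XATTR_USER_PREFIX "user."

static size_t list_one(char *list, const char *name)
{
        const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1;
        size_t name_len = strlen(name);

        if (list) {
                memcpy(list, XATTR_USER_PREFIX, prefix_len);
                memcpy(list + prefix_len, name, name_len);
                list[prefix_len + name_len] = '\0';
        }
        return prefix_len + name_len + 1;
}

int main(void)
{
        char buf[32];

        printf("entry length = %zu, bytes = \"%s\"\n",
               list_one(buf, "foo"), buf);
        return 0;
}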
++
++static int
++ext3_xattr_user_get(struct inode *inode, const char *name,
++                  void *buffer, size_t size)
++{
++      int error;
++
++      if (strcmp(name, "") == 0)
++              return -EINVAL;
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return -ENOTSUP;
++#ifdef CONFIG_EXT3_FS_POSIX_ACL
++      error = ext3_permission_locked(inode, MAY_READ);
++#else
++      error = permission(inode, MAY_READ);
++#endif
++      if (error)
++              return error;
++
++      return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name,
++                            buffer, size);
++}
++
++static int
++ext3_xattr_user_set(struct inode *inode, const char *name,
++                  const void *value, size_t size, int flags)
++{
++      handle_t *handle;
++      int error;
++
++      if (strcmp(name, "") == 0)
++              return -EINVAL;
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return -ENOTSUP;
++      if ( !S_ISREG(inode->i_mode) &&
++          (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
++              return -EPERM;
++#ifdef CONFIG_EXT3_FS_POSIX_ACL
++      error = ext3_permission_locked(inode, MAY_WRITE);
++#else
++      error = permission(inode, MAY_WRITE);
++#endif
++      if (error)
++              return error;
++  
++      handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS);
++      if (IS_ERR(handle))
++              return PTR_ERR(handle);
++      error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER, name,
++                             value, size, flags);
++      ext3_journal_stop(handle, inode);
++
++      return error;
++}
++
++struct ext3_xattr_handler ext3_xattr_user_handler = {
++      prefix: XATTR_USER_PREFIX,
++      list:   ext3_xattr_user_list,
++      get:    ext3_xattr_user_get,
++      set:    ext3_xattr_user_set,
++};
++
++int __init
++init_ext3_xattr_user(void)
++{
++      return ext3_xattr_register(EXT3_XATTR_INDEX_USER,
++                                 &ext3_xattr_user_handler);
++}
++
++void
++exit_ext3_xattr_user(void)
++{
++      ext3_xattr_unregister(EXT3_XATTR_INDEX_USER,
++                            &ext3_xattr_user_handler);
++}
+--- linux-rh-2.4.20-8/fs/jfs/jfs_xattr.h~linux-2.4.20-xattr-0.8.54-chaos       2002-11-29 07:53:15.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/jfs/jfs_xattr.h  2003-05-07 17:34:25.000000000 +0800
+@@ -52,8 +52,10 @@ struct jfs_ea_list {
+ #define       END_EALIST(ealist) \
+       ((struct jfs_ea *) (((char *) (ealist)) + EALIST_SIZE(ealist)))
+-extern int __jfs_setxattr(struct inode *, const char *, void *, size_t, int);
+-extern int jfs_setxattr(struct dentry *, const char *, void *, size_t, int);
++extern int __jfs_setxattr(struct inode *, const char *, const void *, size_t,
++                        int);
++extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t,
++                      int);
+ extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t);
+ extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t);
+ extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
+--- linux-rh-2.4.20-8/fs/jfs/xattr.c~linux-2.4.20-xattr-0.8.54-chaos   2002-11-29 07:53:15.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/jfs/xattr.c      2003-05-07 17:34:25.000000000 +0800
+@@ -641,7 +641,7 @@ static int ea_put(struct inode *inode, s
+ }
+ static int can_set_xattr(struct inode *inode, const char *name,
+-                       void *value, size_t value_len)
++                       const void *value, size_t value_len)
+ {
+       if (IS_RDONLY(inode))
+               return -EROFS;
+@@ -660,7 +660,7 @@ static int can_set_xattr(struct inode *i
+       return permission(inode, MAY_WRITE);
+ }
+-int __jfs_setxattr(struct inode *inode, const char *name, void *value,
++int __jfs_setxattr(struct inode *inode, const char *name, const void *value,
+                  size_t value_len, int flags)
+ {
+       struct jfs_ea_list *ealist;
+@@ -799,7 +799,7 @@ int __jfs_setxattr(struct inode *inode, 
+       return rc;
+ }
+-int jfs_setxattr(struct dentry *dentry, const char *name, void *value,
++int jfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+                size_t value_len, int flags)
+ {
+       if (value == NULL) {    /* empty EA, do not remove */
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-rh-2.4.20-8-root/fs/mbcache.c        2003-05-07 17:34:25.000000000 +0800
+@@ -0,0 +1,648 @@
++/*
++ * linux/fs/mbcache.c
++ * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++/*
++ * Filesystem Meta Information Block Cache (mbcache)
++ *
++ * The mbcache caches blocks of block devices that need to be located
++ * by their device/block number, as well as by other criteria (such
++ * as the block's contents).
++ *
++ * There can only be one cache entry in a cache per device and block number.
++ * Additional indexes need not be unique in this sense. The number of
++ * additional indexes (=other criteria) can be hardwired at compile time
++ * or specified at cache create time.
++ *
++ * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
++ * in the cache. A valid entry is in the main hash tables of the cache,
++ * and may also be in the lru list. An invalid entry is not in any hashes
++ * or lists.
++ *
++ * A valid cache entry is only in the lru list if no handles refer to it.
++ * Invalid cache entries will be freed when the last handle to the cache
++ * entry is released. Entries that cannot be freed immediately are put
++ * back on the lru list.
++ */
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++#include <linux/fs.h>
++#include <linux/slab.h>
++#include <linux/sched.h>
++#include <linux/cache_def.h>
++#include <linux/version.h>
++#include <linux/init.h>
++#include <linux/mbcache.h>
++
++
++#ifdef MB_CACHE_DEBUG
++# define mb_debug(f...) do { \
++              printk(KERN_DEBUG f); \
++              printk("\n"); \
++      } while (0)
++#define mb_assert(c) do { if (!(c)) \
++              printk(KERN_ERR "assertion " #c " failed\n"); \
++      } while(0)
++#else
++# define mb_debug(f...) do { } while(0)
++# define mb_assert(c) do { } while(0)
++#endif
++#define mb_error(f...) do { \
++              printk(KERN_ERR f); \
++              printk("\n"); \
++      } while(0)
++              
++MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
++MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
++MODULE_LICENSE("GPL");
++#endif
++
++EXPORT_SYMBOL(mb_cache_create);
++EXPORT_SYMBOL(mb_cache_shrink);
++EXPORT_SYMBOL(mb_cache_destroy);
++EXPORT_SYMBOL(mb_cache_entry_alloc);
++EXPORT_SYMBOL(mb_cache_entry_insert);
++EXPORT_SYMBOL(mb_cache_entry_release);
++EXPORT_SYMBOL(mb_cache_entry_takeout);
++EXPORT_SYMBOL(mb_cache_entry_free);
++EXPORT_SYMBOL(mb_cache_entry_dup);
++EXPORT_SYMBOL(mb_cache_entry_get);
++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
++EXPORT_SYMBOL(mb_cache_entry_find_first);
++EXPORT_SYMBOL(mb_cache_entry_find_next);
++#endif
++
++
++/*
++ * Global data: list of all mbcache's, lru list, and a spinlock for
++ * accessing cache data structures on SMP machines. The lru list is
++ * global across all mbcaches.
++ */
++
++static LIST_HEAD(mb_cache_list);
++static LIST_HEAD(mb_cache_lru_list);
++static spinlock_t mb_cache_spinlock = SPIN_LOCK_UNLOCKED;
++
++static inline int
++mb_cache_indexes(struct mb_cache *cache)
++{
++#ifdef MB_CACHE_INDEXES_COUNT
++      return MB_CACHE_INDEXES_COUNT;
++#else
++      return cache->c_indexes_count;
++#endif
++}
++
++/*
++ * The descriptor that the mbcache registers with the kernel memory
++ * management, so that cache entries can be shrunk dynamically.
++ */
++
++static void
++mb_cache_memory_pressure(int priority, unsigned int gfp_mask);
++
++static struct cache_definition mb_cache_definition = {
++      "mb_cache",
++      mb_cache_memory_pressure
++};
++
++
++static inline int
++__mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
++{
++      return !list_empty(&ce->e_block_list);
++}
++
++
++static inline void
++__mb_cache_entry_unhash(struct mb_cache_entry *ce)
++{
++      int n;
++
++      if (__mb_cache_entry_is_hashed(ce)) {
++              list_del_init(&ce->e_block_list);
++              for (n=0; n<mb_cache_indexes(ce->e_cache); n++)
++                      list_del(&ce->e_indexes[n].o_list);
++      }
++}
++
++
++static inline void
++__mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask)
++{
++      struct mb_cache *cache = ce->e_cache;
++
++      mb_assert(atomic_read(&ce->e_used) == 0);
++      if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) {
++              /* free failed -- put back on the lru list
++                 for freeing later. */
++              spin_lock(&mb_cache_spinlock);
++              list_add(&ce->e_lru_list, &mb_cache_lru_list);
++              spin_unlock(&mb_cache_spinlock);
++      } else {
++              kmem_cache_free(cache->c_entry_cache, ce);
++              atomic_dec(&cache->c_entry_count);
++      }
++}
++
++
++static inline void
++__mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
++{
++      if (atomic_dec_and_test(&ce->e_used)) {
++              if (__mb_cache_entry_is_hashed(ce))
++                      list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
++              else {
++                      spin_unlock(&mb_cache_spinlock);
++                      __mb_cache_entry_forget(ce, GFP_KERNEL);
++                      return;
++              }
++      }
++      spin_unlock(&mb_cache_spinlock);
++}
++
++
++/*
++ * mb_cache_memory_pressure()  memory pressure callback
++ *
++ * This function is called by the kernel memory management when memory
++ * gets low.
++ *
++ * @priority: Amount by which to shrink the cache (0 = highest priority)
++ * @gfp_mask: (ignored)
++ */
++static void
++mb_cache_memory_pressure(int priority, unsigned int gfp_mask)
++{
++      LIST_HEAD(free_list);
++      struct list_head *l, *ltmp;
++      int count = 0;
++
++      spin_lock(&mb_cache_spinlock);
++      list_for_each(l, &mb_cache_list) {
++              struct mb_cache *cache =
++                      list_entry(l, struct mb_cache, c_cache_list);
++              mb_debug("cache %s (%d)", cache->c_name,
++                        atomic_read(&cache->c_entry_count));
++              count += atomic_read(&cache->c_entry_count);
++      }
++      mb_debug("trying to free %d of %d entries",
++                count / (priority ? priority : 1), count);
++      if (priority)
++              count /= priority;
++      while (count-- && !list_empty(&mb_cache_lru_list)) {
++              struct mb_cache_entry *ce =
++                      list_entry(mb_cache_lru_list.next,
++                                 struct mb_cache_entry, e_lru_list);
++              list_del(&ce->e_lru_list);
++              __mb_cache_entry_unhash(ce);
++              list_add_tail(&ce->e_lru_list, &free_list);
++      }
++      spin_unlock(&mb_cache_spinlock);
++      list_for_each_safe(l, ltmp, &free_list) {
++              __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++                                                 e_lru_list), gfp_mask);
++      }
++}
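As a rough illustration of the arithmetic above (ours, not part of the patch): with 600 entries cached in total and priority 6, about 100 entries are taken from the head of the global lru list, while priority 0 is treated as "free as many as possible".

/* Sketch (ours) of the shrink target computed above: with `count` entries
 * cached in total and a given `priority`, roughly count / priority entries
 * are freed from the head of the lru list. */
static int shrink_target(int count, int priority)
{
        return priority ? count / priority : count;
}
/* e.g. shrink_target(600, 6) == 100 */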
++
++
++/*
++ * mb_cache_create()  create a new cache
++ *
++ * All entries in one cache are equal size. Cache entries may be from
++ * multiple devices. If this is the first mbcache created, registers
++ * the cache with kernel memory management. Returns NULL if no more
++ * memory was available.
++ *
++ * @name: name of the cache (informal)
++ * @cache_op: contains the callback called when freeing a cache entry
++ * @entry_size: The size of a cache entry, including
++ *              struct mb_cache_entry
++ * @indexes_count: number of additional indexes in the cache. Must equal
++ *                 MB_CACHE_INDEXES_COUNT if the number of indexes is
++ *                 hardwired.
++ * @bucket_count: number of hash buckets
++ */
++struct mb_cache *
++mb_cache_create(const char *name, struct mb_cache_op *cache_op,
++              size_t entry_size, int indexes_count, int bucket_count)
++{
++      int m=0, n;
++      struct mb_cache *cache = NULL;
++
++      if(entry_size < sizeof(struct mb_cache_entry) +
++         indexes_count * sizeof(struct mb_cache_entry_index))
++              return NULL;
++
++      MOD_INC_USE_COUNT;
++      cache = kmalloc(sizeof(struct mb_cache) +
++                      indexes_count * sizeof(struct list_head), GFP_KERNEL);
++      if (!cache)
++              goto fail;
++      cache->c_name = name;
++      cache->c_op.free = NULL;
++      if (cache_op)
++              cache->c_op.free = cache_op->free;
++      atomic_set(&cache->c_entry_count, 0);
++      cache->c_bucket_count = bucket_count;
++#ifdef MB_CACHE_INDEXES_COUNT
++      mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT);
++#else
++      cache->c_indexes_count = indexes_count;
++#endif
++      cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
++                                    GFP_KERNEL);
++      if (!cache->c_block_hash)
++              goto fail;
++      for (n=0; n<bucket_count; n++)
++              INIT_LIST_HEAD(&cache->c_block_hash[n]);
++      for (m=0; m<indexes_count; m++) {
++              cache->c_indexes_hash[m] = kmalloc(bucket_count *
++                                               sizeof(struct list_head),
++                                               GFP_KERNEL);
++              if (!cache->c_indexes_hash[m])
++                      goto fail;
++              for (n=0; n<bucket_count; n++)
++                      INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]);
++      }
++      cache->c_entry_cache = kmem_cache_create(name, entry_size, 0,
++              0 /*SLAB_POISON | SLAB_RED_ZONE*/, NULL, NULL);
++      if (!cache->c_entry_cache)
++              goto fail;
++
++      spin_lock(&mb_cache_spinlock);
++      list_add(&cache->c_cache_list, &mb_cache_list);
++      spin_unlock(&mb_cache_spinlock);
++      return cache;
++
++fail:
++      if (cache) {
++              while (--m >= 0)
++                      kfree(cache->c_indexes_hash[m]);
++              if (cache->c_block_hash)
++                      kfree(cache->c_block_hash);
++              kfree(cache);
++      }
++      MOD_DEC_USE_COUNT;
++      return NULL;
++}
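A minimal kernel-context sketch (ours) of the create/destroy pair documented above, modelled on the init_ext3_xattr()/exit_ext3_xattr() calls earlier in this patch; example_cache and the one-index, 61-bucket layout are illustrative only and assume the <linux/mbcache.h> header added by this patch.

/* Sketch (ours): create a cache with one additional index and 61 hash
 * buckets, as the ext3 xattr code in this patch does, then tear it down. */
#include <linux/errno.h>
#include <linux/mbcache.h>

static struct mb_cache *example_cache;

static int example_init(void)
{
        example_cache = mb_cache_create("example_cache", NULL,
                sizeof(struct mb_cache_entry) +
                sizeof(struct mb_cache_entry_index), 1, 61);
        return example_cache ? 0 : -ENOMEM;
}

static void example_exit(void)
{
        if (example_cache)
                mb_cache_destroy(example_cache);
}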
++
++
++/*
++ * mb_cache_shrink()
++ *
++ * Removes all cache entries of a device from the cache. Entries that
++ * are currently in use cannot be freed and thus remain in the cache.
++ *
++ * @cache: which cache to shrink
++ * @dev: which device's cache entries to shrink
++ */
++void
++mb_cache_shrink(struct mb_cache *cache, kdev_t dev)
++{
++      LIST_HEAD(free_list);
++      struct list_head *l, *ltmp;
++
++      spin_lock(&mb_cache_spinlock);
++      list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
++              struct mb_cache_entry *ce =
++                      list_entry(l, struct mb_cache_entry, e_lru_list);
++              if (ce->e_dev == dev) {
++                      list_del(&ce->e_lru_list);
++                      list_add_tail(&ce->e_lru_list, &free_list);
++                      __mb_cache_entry_unhash(ce);
++              }
++      }
++      spin_unlock(&mb_cache_spinlock);
++      list_for_each_safe(l, ltmp, &free_list) {
++              __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++                                                 e_lru_list), GFP_KERNEL);
++      }
++}
++
++
++/*
++ * mb_cache_destroy()
++ *
++ * Shrinks the cache to its minimum possible size (hopefully 0 entries),
++ * and then destroys it. If this was the last mbcache, un-registers the
++ * mbcache from kernel memory management.
++ */
++void
++mb_cache_destroy(struct mb_cache *cache)
++{
++      LIST_HEAD(free_list);
++      struct list_head *l, *ltmp;
++      int n;
++
++      spin_lock(&mb_cache_spinlock);
++      list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
++              struct mb_cache_entry *ce =
++                      list_entry(l, struct mb_cache_entry, e_lru_list);
++              if (ce->e_cache == cache) {
++                      list_del(&ce->e_lru_list);
++                      list_add_tail(&ce->e_lru_list, &free_list);
++                      __mb_cache_entry_unhash(ce);
++              }
++      }
++      list_del(&cache->c_cache_list);
++      spin_unlock(&mb_cache_spinlock);
++      list_for_each_safe(l, ltmp, &free_list) {
++              __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++                                                 e_lru_list), GFP_KERNEL);
++      }
++
++      if (atomic_read(&cache->c_entry_count) > 0) {
++              mb_error("cache %s: %d orphaned entries",
++                        cache->c_name,
++                        atomic_read(&cache->c_entry_count));
++      }
++
++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0))
++      /* We don't have kmem_cache_destroy() in 2.2.x */
++      kmem_cache_shrink(cache->c_entry_cache);
++#else
++      kmem_cache_destroy(cache->c_entry_cache);
++#endif
++      for (n=0; n < mb_cache_indexes(cache); n++)
++              kfree(cache->c_indexes_hash[n]);
++      kfree(cache->c_block_hash);
++      kfree(cache);
++
++      MOD_DEC_USE_COUNT;
++}
++
++
++/*
++ * mb_cache_entry_alloc()
++ *
++ * Allocates a new cache entry. The new entry will not be valid initially,
++ * and thus cannot be looked up yet. It should be filled with data, and
++ * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
++ * if no more memory was available.
++ */
++struct mb_cache_entry *
++mb_cache_entry_alloc(struct mb_cache *cache)
++{
++      struct mb_cache_entry *ce;
++
++      atomic_inc(&cache->c_entry_count);
++      ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL);
++      if (ce) {
++              INIT_LIST_HEAD(&ce->e_lru_list);
++              INIT_LIST_HEAD(&ce->e_block_list);
++              ce->e_cache = cache;
++              atomic_set(&ce->e_used, 1);
++      }
++      return ce;
++}
++
++
++/*
++ * mb_cache_entry_insert()
++ *
++ * Inserts an entry that was allocated using mb_cache_entry_alloc() into
++ * the cache. After this, the cache entry can be looked up, but is not yet
++ * in the lru list as the caller still holds a handle to it. Returns 0 on
++ * success, or -EBUSY if a cache entry for that device + inode exists
++ * already (this may happen after a failed lookup, if another process has
++ * inserted the same cache entry in the meantime).
++ *
++ * @dev: device the cache entry belongs to
++ * @block: block number
++ * @keys: array of additional keys. There must be indexes_count entries
++ *        in the array (as specified when creating the cache).
++ */
++int
++mb_cache_entry_insert(struct mb_cache_entry *ce, kdev_t dev,
++                    unsigned long block, unsigned int keys[])
++{
++      struct mb_cache *cache = ce->e_cache;
++      unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count;
++      struct list_head *l;
++      int error = -EBUSY, n;
++
++      spin_lock(&mb_cache_spinlock);
++      list_for_each(l, &cache->c_block_hash[bucket]) {
++              struct mb_cache_entry *ce =
++                      list_entry(l, struct mb_cache_entry, e_block_list);
++              if (ce->e_dev == dev && ce->e_block == block)
++                      goto out;
++      }
++      __mb_cache_entry_unhash(ce);
++      ce->e_dev = dev;
++      ce->e_block = block;
++      list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
++      for (n=0; n<mb_cache_indexes(cache); n++) {
++              ce->e_indexes[n].o_key = keys[n];
++              bucket = keys[n] % cache->c_bucket_count;
++              list_add(&ce->e_indexes[n].o_list,
++                       &cache->c_indexes_hash[n][bucket]);
++      }
++      error = 0;
++out:
++      spin_unlock(&mb_cache_spinlock);
++      return error;
++}
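Putting the alloc/insert/handle rules above together, here is a hedged kernel-context sketch (ours, not part of the patch) of the pattern a caller such as ext3_xattr_cache_insert() earlier in this patch follows; example_cache_block() and its key argument are made-up names, and <linux/mbcache.h> from this patch is assumed.

/* Sketch (ours): allocate an entry, try to insert it under one index key,
 * and drop the handle.  -EBUSY means another entry for this dev/block was
 * inserted in the meantime, so the new entry is simply discarded. */
#include <linux/errno.h>
#include <linux/mbcache.h>

static int example_cache_block(struct mb_cache *cache, kdev_t dev,
                               unsigned long block, unsigned int key)
{
        struct mb_cache_entry *ce;
        int error;

        ce = mb_cache_entry_alloc(cache);
        if (!ce)
                return -ENOMEM;
        error = mb_cache_entry_insert(ce, dev, block, &key);
        if (error) {
                mb_cache_entry_free(ce);        /* invalid entry is discarded */
                if (error == -EBUSY)
                        error = 0;              /* already cached; not a failure */
        } else {
                mb_cache_entry_release(ce);     /* valid entry goes to the lru */
        }
        return error;
}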
++
++
++/*
++ * mb_cache_entry_release()
++ *
++ * Release a handle to a cache entry. When the last handle to a cache entry
++ * is released, the entry is either freed (if it is invalid) or else
++ * placed on the lru list.
++ */
++void
++mb_cache_entry_release(struct mb_cache_entry *ce)
++{
++      spin_lock(&mb_cache_spinlock);
++      __mb_cache_entry_release_unlock(ce);
++}
++
++
++/*
++ * mb_cache_entry_takeout()
++ *
++ * Take a cache entry out of the cache, making it invalid. The entry can later
++ * be re-inserted using mb_cache_entry_insert(), or released using
++ * mb_cache_entry_release().
++ */
++void
++mb_cache_entry_takeout(struct mb_cache_entry *ce)
++{
++      spin_lock(&mb_cache_spinlock);
++      mb_assert(list_empty(&ce->e_lru_list));
++      __mb_cache_entry_unhash(ce);
++      spin_unlock(&mb_cache_spinlock);
++}
++
++
++/*
++ * mb_cache_entry_free()
++ *
++ * This is equivalent to the sequence mb_cache_entry_takeout() --
++ * mb_cache_entry_release().
++ */
++void
++mb_cache_entry_free(struct mb_cache_entry *ce)
++{
++      spin_lock(&mb_cache_spinlock);
++      mb_assert(list_empty(&ce->e_lru_list));
++      __mb_cache_entry_unhash(ce);
++      __mb_cache_entry_release_unlock(ce);
++}
++
++
++/*
++ * mb_cache_entry_dup()
++ *
++ * Duplicate a handle to a cache entry (does not duplicate the cache entry
++ * itself). After the call, both the old and the new handle must be released.
++ */
++struct mb_cache_entry *
++mb_cache_entry_dup(struct mb_cache_entry *ce)
++{
++      atomic_inc(&ce->e_used);
++      return ce;
++}
++
++
++/*
++ * mb_cache_entry_get()
++ *
++ * Get a cache entry by device / block number. (There can only be one entry
++ * in the cache per device and block.) Returns NULL if no such cache entry
++ * exists.
++ */
++struct mb_cache_entry *
++mb_cache_entry_get(struct mb_cache *cache, kdev_t dev, unsigned long block)
++{
++      unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count;
++      struct list_head *l;
++      struct mb_cache_entry *ce;
++
++      spin_lock(&mb_cache_spinlock);
++      list_for_each(l, &cache->c_block_hash[bucket]) {
++              ce = list_entry(l, struct mb_cache_entry, e_block_list);
++              if (ce->e_dev == dev && ce->e_block == block) {
++                      if (!list_empty(&ce->e_lru_list))
++                              list_del_init(&ce->e_lru_list);
++                      atomic_inc(&ce->e_used);
++                      goto cleanup;
++              }
++      }
++      ce = NULL;
++
++cleanup:
++      spin_unlock(&mb_cache_spinlock);
++      return ce;
++}
++
++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
++
++static struct mb_cache_entry *
++__mb_cache_entry_find(struct list_head *l, struct list_head *head,
++                    int index, kdev_t dev, unsigned int key)
++{
++      while (l != head) {
++              struct mb_cache_entry *ce =
++                      list_entry(l, struct mb_cache_entry,
++                                 e_indexes[index].o_list);
++              if (ce->e_dev == dev && ce->e_indexes[index].o_key == key) {
++                      if (!list_empty(&ce->e_lru_list))
++                              list_del_init(&ce->e_lru_list);
++                      atomic_inc(&ce->e_used);
++                      return ce;
++              }
++              l = l->next;
++      }
++      return NULL;
++}
++
++
++/*
++ * mb_cache_entry_find_first()
++ *
++ * Find the first cache entry on a given device with a certain key in
++ * an additional index. Additional matches can be found with
++ * mb_cache_entry_find_next(). Returns NULL if no match was found.
++ *
++ * @cache: the cache to search
++ * @index: the number of the additional index to search (0<=index<indexes_count)
++ * @dev: the device the cache entry should belong to
++ * @key: the key in the index
++ */
++struct mb_cache_entry *
++mb_cache_entry_find_first(struct mb_cache *cache, int index, kdev_t dev,
++                        unsigned int key)
++{
++      unsigned int bucket = key % cache->c_bucket_count;
++      struct list_head *l;
++      struct mb_cache_entry *ce;
++
++      mb_assert(index < mb_cache_indexes(cache));
++      spin_lock(&mb_cache_spinlock);
++      l = cache->c_indexes_hash[index][bucket].next;
++      ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
++                                 index, dev, key);
++      spin_unlock(&mb_cache_spinlock);
++      return ce;
++}
++
++
++/*
++ * mb_cache_entry_find_next()
++ *
++ * Find the next cache entry on a given device with a certain key in an
++ * additional index. Returns NULL if no match could be found. The previous
++ * entry is automatically released, so that mb_cache_entry_find_next() can
++ * be called like this:
++ *
++ * entry = mb_cache_entry_find_first();
++ * while (entry) {
++ *    ...
++ *    entry = mb_cache_entry_find_next(entry, ...);
++ * }
++ *
++ * @prev: The previous match
++ * @index: the number of the additional index to search (0<=index<indexes_count)
++ * @dev: the device the cache entry should belong to
++ * @key: the key in the index
++ */
++struct mb_cache_entry *
++mb_cache_entry_find_next(struct mb_cache_entry *prev, int index, kdev_t dev,
++                       unsigned int key)
++{
++      struct mb_cache *cache = prev->e_cache;
++      unsigned int bucket = key % cache->c_bucket_count;
++      struct list_head *l;
++      struct mb_cache_entry *ce;
++
++      mb_assert(index < mb_cache_indexes(cache));
++      spin_lock(&mb_cache_spinlock);
++      l = prev->e_indexes[index].o_list.next;
++      ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
++                                 index, dev, key);
++      __mb_cache_entry_release_unlock(prev);
++      return ce;
++}
++
++#endif  /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */
++
++static int __init init_mbcache(void)
++{
++      register_cache(&mb_cache_definition);
++      return 0;
++}
++
++static void __exit exit_mbcache(void)
++{
++      unregister_cache(&mb_cache_definition);
++}
++
++module_init(init_mbcache)
++module_exit(exit_mbcache)
++
+--- linux-rh-2.4.20-8/include/asm-arm/unistd.h~linux-2.4.20-xattr-0.8.54-chaos 2003-04-11 14:04:53.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/asm-arm/unistd.h    2003-05-07 17:34:25.000000000 +0800
+@@ -244,7 +244,6 @@
+ #define __NR_security                 (__NR_SYSCALL_BASE+223)
+ #define __NR_gettid                   (__NR_SYSCALL_BASE+224)
+ #define __NR_readahead                        (__NR_SYSCALL_BASE+225)
+-#if 0 /* allocated in 2.5 */
+ #define __NR_setxattr                 (__NR_SYSCALL_BASE+226)
+ #define __NR_lsetxattr                        (__NR_SYSCALL_BASE+227)
+ #define __NR_fsetxattr                        (__NR_SYSCALL_BASE+228)
+@@ -257,7 +256,6 @@
+ #define __NR_removexattr              (__NR_SYSCALL_BASE+235)
+ #define __NR_lremovexattr             (__NR_SYSCALL_BASE+236)
+ #define __NR_fremovexattr             (__NR_SYSCALL_BASE+237)
+-#endif
+ #define __NR_tkill                    (__NR_SYSCALL_BASE+238)
+ /*
+  * Please check 2.5 _before_ adding calls here,
+--- linux-rh-2.4.20-8/include/asm-ppc64/unistd.h~linux-2.4.20-xattr-0.8.54-chaos       2002-08-03 08:39:45.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/asm-ppc64/unistd.h  2003-05-07 17:34:25.000000000 +0800
+@@ -218,6 +218,7 @@
+ #define __NR_gettid           207
+ #if 0 /* Reserved syscalls */
+ #define __NR_tkill            208
++#endif
+ #define __NR_setxattr         209
+ #define __NR_lsetxattr                210
+ #define __NR_fsetxattr                211
+@@ -230,6 +231,7 @@
+ #define __NR_removexattr      218
+ #define __NR_lremovexattr     219
+ #define __NR_fremovexattr     220
++#if 0 /* Reserved syscalls */
+ #define __NR_futex            221
+ #endif
+--- linux-rh-2.4.20-8/include/asm-s390/unistd.h~linux-2.4.20-xattr-0.8.54-chaos        2002-08-03 08:39:45.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/asm-s390/unistd.h   2003-05-07 17:34:25.000000000 +0800
+@@ -212,9 +212,18 @@
+ #define __NR_madvise            219
+ #define __NR_getdents64               220
+ #define __NR_fcntl64          221
+-/*
+- * Numbers 224-235 are reserved for posix acl
+- */
++#define __NR_setxattr         224
++#define __NR_lsetxattr                225
++#define __NR_fsetxattr                226
++#define __NR_getxattr         227
++#define __NR_lgetxattr                228
++#define __NR_fgetxattr                229
++#define __NR_listxattr                230
++#define __NR_llistxattr               231
++#define __NR_flistxattr               232
++#define __NR_removexattr      233
++#define __NR_lremovexattr     234
++#define __NR_fremovexattr     235
+ #define __NR_gettid           236
+ #define __NR_tkill            237
+--- linux-rh-2.4.20-8/include/asm-s390x/unistd.h~linux-2.4.20-xattr-0.8.54-chaos       2002-08-03 08:39:45.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/asm-s390x/unistd.h  2003-05-07 17:34:25.000000000 +0800
+@@ -180,9 +180,18 @@
+ #define __NR_pivot_root         217
+ #define __NR_mincore            218
+ #define __NR_madvise            219
+-/*
+- * Numbers 224-235 are reserved for posix acl
+- */
++#define __NR_setxattr         224
++#define __NR_lsetxattr                225
++#define __NR_fsetxattr                226
++#define __NR_getxattr         227
++#define __NR_lgetxattr                228
++#define __NR_fgetxattr                229
++#define __NR_listxattr                230
++#define __NR_llistxattr               231
++#define __NR_flistxattr               232
++#define __NR_removexattr      233
++#define __NR_lremovexattr     234
++#define __NR_fremovexattr     235
+ #define __NR_gettid           236
+ #define __NR_tkill            237
+--- linux-rh-2.4.20-8/include/asm-sparc/unistd.h~linux-2.4.20-xattr-0.8.54-chaos       2002-08-03 08:39:45.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/asm-sparc/unistd.h  2003-05-07 17:34:25.000000000 +0800
+@@ -184,24 +184,24 @@
+ /* #define __NR_exportfs        166    SunOS Specific                              */
+ #define __NR_mount              167 /* Common                                      */
+ #define __NR_ustat              168 /* Common                                      */
+-/* #define __NR_semsys          169    SunOS Specific                              */
+-/* #define __NR_msgsys          170    SunOS Specific                              */
+-/* #define __NR_shmsys          171    SunOS Specific                              */
+-/* #define __NR_auditsys        172    SunOS Specific                              */
+-/* #define __NR_rfssys          173    SunOS Specific                              */
++#define __NR_setxattr           169 /* SunOS: semsys                               */
++#define __NR_lsetxattr          170 /* SunOS: msgsys                               */
++#define __NR_fsetxattr          171 /* SunOS: shmsys                               */
++#define __NR_getxattr           172 /* SunOS: auditsys                             */
++#define __NR_lgetxattr          173 /* SunOS: rfssys                               */
+ #define __NR_getdents           174 /* Common                                      */
+ #define __NR_setsid             175 /* Common                                      */
+ #define __NR_fchdir             176 /* Common                                      */
+-/* #define __NR_fchroot         177    SunOS Specific                              */
+-/* #define __NR_vpixsys         178    SunOS Specific                              */
+-/* #define __NR_aioread         179    SunOS Specific                              */
+-/* #define __NR_aiowrite        180    SunOS Specific                              */
+-/* #define __NR_aiowait         181    SunOS Specific                              */
+-/* #define __NR_aiocancel       182    SunOS Specific                              */
++#define __NR_fgetxattr          177 /* SunOS: fchroot                              */
++#define __NR_listxattr          178 /* SunOS: vpixsys                              */
++#define __NR_llistxattr         179 /* SunOS: aioread                              */
++#define __NR_flistxattr         180 /* SunOS: aiowrite                             */
++#define __NR_removexattr        181 /* SunOS: aiowait                              */
++#define __NR_lremovexattr       182 /* SunOS: aiocancel                            */
+ #define __NR_sigpending         183 /* Common                                      */
+ #define __NR_query_module     184 /* Linux Specific                              */
+ #define __NR_setpgid            185 /* Common                                      */
+-/* #define __NR_pathconf        186    SunOS Specific                              */
++#define __NR_fremovexattr       186 /* SunOS: pathconf                             */
+ #define __NR_tkill              187 /* SunOS: fpathconf                            */
+ /* #define __NR_sysconf         188    SunOS Specific                              */
+ #define __NR_uname              189 /* Linux Specific                              */
+--- linux-rh-2.4.20-8/include/asm-sparc64/unistd.h~linux-2.4.20-xattr-0.8.54-chaos     2002-08-03 08:39:45.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/asm-sparc64/unistd.h        2003-05-07 17:34:25.000000000 +0800
+@@ -184,24 +184,24 @@
+ /* #define __NR_exportfs        166    SunOS Specific                              */
+ #define __NR_mount              167 /* Common                                      */
+ #define __NR_ustat              168 /* Common                                      */
+-/* #define __NR_semsys          169    SunOS Specific                              */
+-/* #define __NR_msgsys          170    SunOS Specific                              */
+-/* #define __NR_shmsys          171    SunOS Specific                              */
+-/* #define __NR_auditsys        172    SunOS Specific                              */
+-/* #define __NR_rfssys          173    SunOS Specific                              */
++#define __NR_setxattr           169 /* SunOS: semsys                               */
++#define __NR_lsetxattr          170 /* SunOS: msgsys                               */
++#define __NR_fsetxattr          171 /* SunOS: shmsys                               */
++#define __NR_getxattr           172 /* SunOS: auditsys                             */
++#define __NR_lgetxattr          173 /* SunOS: rfssys                               */
+ #define __NR_getdents           174 /* Common                                      */
+ #define __NR_setsid             175 /* Common                                      */
+ #define __NR_fchdir             176 /* Common                                      */
+-/* #define __NR_fchroot         177    SunOS Specific                              */
+-/* #define __NR_vpixsys         178    SunOS Specific                              */
+-/* #define __NR_aioread         179    SunOS Specific                              */
+-/* #define __NR_aiowrite        180    SunOS Specific                              */
+-/* #define __NR_aiowait         181    SunOS Specific                              */
+-/* #define __NR_aiocancel       182    SunOS Specific                              */
++#define __NR_fgetxattr          177 /* SunOS: fchroot                              */
++#define __NR_listxattr          178 /* SunOS: vpixsys                              */
++#define __NR_llistxattr         179 /* SunOS: aioread                              */
++#define __NR_flistxattr         180 /* SunOS: aiowrite                             */
++#define __NR_removexattr        181 /* SunOS: aiowait                              */
++#define __NR_lremovexattr       182 /* SunOS: aiocancel                            */
+ #define __NR_sigpending         183 /* Common                                      */
+ #define __NR_query_module     184 /* Linux Specific                              */
+ #define __NR_setpgid            185 /* Common                                      */
+-/* #define __NR_pathconf        186    SunOS Specific                              */
++#define __NR_fremovexattr       186 /* SunOS: pathconf                             */
+ #define __NR_tkill              187 /* SunOS: fpathconf                            */
+ /* #define __NR_sysconf         188    SunOS Specific                              */
+ #define __NR_uname              189 /* Linux Specific                              */
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/linux/cache_def.h   2003-05-07 17:34:25.000000000 +0800
+@@ -0,0 +1,15 @@
++/*
++ * linux/cache_def.h
++ * Handling of caches defined in drivers, filesystems, ...
++ *
++ * Copyright (C) 2002 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++struct cache_definition {
++      const char *name;
++      void (*shrink)(int, unsigned int);
++      struct list_head link;
++};
++
++extern void register_cache(struct cache_definition *);
++extern void unregister_cache(struct cache_definition *);
+--- linux-rh-2.4.20-8/include/linux/errno.h~linux-2.4.20-xattr-0.8.54-chaos    2003-04-11 14:04:53.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/linux/errno.h       2003-05-07 17:34:25.000000000 +0800
+@@ -26,4 +26,8 @@
+ #endif
++/* Defined for extended attributes */
++#define ENOATTR ENODATA               /* No such attribute */
++#define ENOTSUP EOPNOTSUPP    /* Operation not supported */
++
+ #endif
+--- linux-rh-2.4.20-8/include/linux/ext2_fs.h~linux-2.4.20-xattr-0.8.54-chaos  2003-04-12 15:46:42.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/linux/ext2_fs.h     2003-05-07 17:34:25.000000000 +0800
+@@ -57,8 +57,6 @@
+  */
+ #define       EXT2_BAD_INO             1      /* Bad blocks inode */
+ #define EXT2_ROOT_INO          2      /* Root inode */
+-#define EXT2_ACL_IDX_INO       3      /* ACL inode */
+-#define EXT2_ACL_DATA_INO      4      /* ACL inode */
+ #define EXT2_BOOT_LOADER_INO   5      /* Boot loader inode */
+ #define EXT2_UNDEL_DIR_INO     6      /* Undelete directory inode */
+@@ -86,7 +84,6 @@
+ #else
+ # define EXT2_BLOCK_SIZE(s)           (EXT2_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+ #endif
+-#define EXT2_ACLE_PER_BLOCK(s)                (EXT2_BLOCK_SIZE(s) / sizeof (struct ext2_acl_entry))
+ #define       EXT2_ADDR_PER_BLOCK(s)          (EXT2_BLOCK_SIZE(s) / sizeof (__u32))
+ #ifdef __KERNEL__
+ # define EXT2_BLOCK_SIZE_BITS(s)      ((s)->s_blocksize_bits)
+@@ -121,28 +118,6 @@
+ #endif
+ /*
+- * ACL structures
+- */
+-struct ext2_acl_header        /* Header of Access Control Lists */
+-{
+-      __u32   aclh_size;
+-      __u32   aclh_file_count;
+-      __u32   aclh_acle_count;
+-      __u32   aclh_first_acle;
+-};
+-
+-struct ext2_acl_entry /* Access Control List Entry */
+-{
+-      __u32   acle_size;
+-      __u16   acle_perms;     /* Access permissions */
+-      __u16   acle_type;      /* Type of entry */
+-      __u16   acle_tag;       /* User or group identity */
+-      __u16   acle_pad1;
+-      __u32   acle_next;      /* Pointer on next entry for the */
+-                                      /* same inode or on next free entry */
+-};
+-
+-/*
+  * Structure of a blocks group descriptor
+  */
+ struct ext2_group_desc
+@@ -314,6 +289,7 @@ struct ext2_inode {
+ #define EXT2_MOUNT_ERRORS_PANIC               0x0040  /* Panic on errors */
+ #define EXT2_MOUNT_MINIX_DF           0x0080  /* Mimics the Minix statfs */
+ #define EXT2_MOUNT_NO_UID32           0x0200  /* Disable 32-bit UIDs */
++#define EXT2_MOUNT_XATTR_USER         0x4000  /* Extended user attributes */
+ #define clear_opt(o, opt)             o &= ~EXT2_MOUNT_##opt
+ #define set_opt(o, opt)                       o |= EXT2_MOUNT_##opt
+@@ -397,6 +373,7 @@ struct ext2_super_block {
+ #ifdef __KERNEL__
+ #define EXT2_SB(sb)   (&((sb)->u.ext2_sb))
++#define EXT2_I(inode) (&((inode)->u.ext2_i))
+ #else
+ /* Assume that user mode programs are passing in an ext2fs superblock, not
+  * a kernel struct super_block.  This will allow us to call the feature-test
+@@ -466,7 +443,7 @@ struct ext2_super_block {
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV     0x0008
+ #define EXT2_FEATURE_INCOMPAT_ANY             0xffffffff
+-#define EXT2_FEATURE_COMPAT_SUPP      0
++#define EXT2_FEATURE_COMPAT_SUPP      EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT2_FEATURE_INCOMPAT_SUPP    EXT2_FEATURE_INCOMPAT_FILETYPE
+ #define EXT2_FEATURE_RO_COMPAT_SUPP   (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+                                        EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \
+@@ -623,8 +600,10 @@ extern struct address_space_operations e
+ /* namei.c */
+ extern struct inode_operations ext2_dir_inode_operations;
++extern struct inode_operations ext2_special_inode_operations;
+ /* symlink.c */
++extern struct inode_operations ext2_symlink_inode_operations;
+ extern struct inode_operations ext2_fast_symlink_inode_operations;
+ #endif        /* __KERNEL__ */
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/linux/ext2_xattr.h  2003-05-07 17:34:25.000000000 +0800
+@@ -0,0 +1,157 @@
++/*
++  File: linux/ext2_xattr.h
++
++  On-disk format of extended attributes for the ext2 filesystem.
++
++  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++#include <linux/config.h>
++#include <linux/init.h>
++#include <linux/xattr.h>
++
++/* Magic value in attribute blocks */
++#define EXT2_XATTR_MAGIC              0xEA020000
++
++/* Maximum number of references to one attribute block */
++#define EXT2_XATTR_REFCOUNT_MAX               1024
++
++/* Name indexes */
++#define EXT2_XATTR_INDEX_MAX                  10
++#define EXT2_XATTR_INDEX_USER                 1
++#define EXT2_XATTR_INDEX_POSIX_ACL_ACCESS     2
++#define EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT    3
++
++struct ext2_xattr_header {
++      __u32   h_magic;        /* magic number for identification */
++      __u32   h_refcount;     /* reference count */
++      __u32   h_blocks;       /* number of disk blocks used */
++      __u32   h_hash;         /* hash value of all attributes */
++      __u32   h_reserved[4];  /* zero right now */
++};
++
++struct ext2_xattr_entry {
++      __u8    e_name_len;     /* length of name */
++      __u8    e_name_index;   /* attribute name index */
++      __u16   e_value_offs;   /* offset in disk block of value */
++      __u32   e_value_block;  /* disk block attribute is stored on (n/i) */
++      __u32   e_value_size;   /* size of attribute value */
++      __u32   e_hash;         /* hash value of name and value */
++      char    e_name[0];      /* attribute name */
++};
++
++#define EXT2_XATTR_PAD_BITS           2
++#define EXT2_XATTR_PAD                (1<<EXT2_XATTR_PAD_BITS)
++#define EXT2_XATTR_ROUND              (EXT2_XATTR_PAD-1)
++#define EXT2_XATTR_LEN(name_len) \
++      (((name_len) + EXT2_XATTR_ROUND + \
++      sizeof(struct ext2_xattr_entry)) & ~EXT2_XATTR_ROUND)
++#define EXT2_XATTR_NEXT(entry) \
++      ( (struct ext2_xattr_entry *)( \
++        (char *)(entry) + EXT2_XATTR_LEN((entry)->e_name_len)) )
++#define EXT2_XATTR_SIZE(size) \
++      (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)
++
++#ifdef __KERNEL__
++
++# ifdef CONFIG_EXT2_FS_XATTR
++
++struct ext2_xattr_handler {
++      char *prefix;
++      size_t (*list)(char *list, struct inode *inode, const char *name,
++                     int name_len);
++      int (*get)(struct inode *inode, const char *name, void *buffer,
++                 size_t size);
++      int (*set)(struct inode *inode, const char *name, const void *buffer,
++                 size_t size, int flags);
++};
++
++extern int ext2_xattr_register(int, struct ext2_xattr_handler *);
++extern void ext2_xattr_unregister(int, struct ext2_xattr_handler *);
++
++extern int ext2_setxattr(struct dentry *, const char *, const void *, size_t, int);
++extern ssize_t ext2_getxattr(struct dentry *, const char *, void *, size_t);
++extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
++extern int ext2_removexattr(struct dentry *, const char *);
++
++extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t);
++extern int ext2_xattr_list(struct inode *, char *, size_t);
++extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
++
++extern void ext2_xattr_delete_inode(struct inode *);
++extern void ext2_xattr_put_super(struct super_block *);
++
++extern int init_ext2_xattr(void) __init;
++extern void exit_ext2_xattr(void);
++
++# else  /* CONFIG_EXT2_FS_XATTR */
++#  define ext2_setxattr               NULL
++#  define ext2_getxattr               NULL
++#  define ext2_listxattr      NULL
++#  define ext2_removexattr    NULL
++
++static inline int
++ext2_xattr_get(struct inode *inode, int name_index,
++             const char *name, void *buffer, size_t size)
++{
++      return -ENOTSUP;
++}
++
++static inline int
++ext2_xattr_list(struct inode *inode, char *buffer, size_t size)
++{
++      return -ENOTSUP;
++}
++
++static inline int
++ext2_xattr_set(struct inode *inode, int name_index, const char *name,
++             const void *value, size_t size, int flags)
++{
++      return -ENOTSUP;
++}
++
++static inline void
++ext2_xattr_delete_inode(struct inode *inode)
++{
++}
++
++static inline void
++ext2_xattr_put_super(struct super_block *sb)
++{
++}
++
++static inline int
++init_ext2_xattr(void)
++{
++      return 0;
++}
++
++static inline void
++exit_ext2_xattr(void)
++{
++}
++
++# endif  /* CONFIG_EXT2_FS_XATTR */
++
++# ifdef CONFIG_EXT2_FS_XATTR_USER
++
++extern int init_ext2_xattr_user(void) __init;
++extern void exit_ext2_xattr_user(void);
++
++# else  /* CONFIG_EXT2_FS_XATTR_USER */
++
++static inline int
++init_ext2_xattr_user(void)
++{
++      return 0;
++}
++
++static inline void
++exit_ext2_xattr_user(void)
++{
++}
++
++# endif  /* CONFIG_EXT2_FS_XATTR_USER */
++
++#endif  /* __KERNEL__ */
++
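The header above fixes the on-disk layout: a 32-byte ext2_xattr_header followed by packed ext2_xattr_entry records, each name rounded up to a 4-byte boundary by EXT2_XATTR_LEN(). As a rough illustration of how those macros fit together, a hypothetical debug helper (not part of the patch; it assumes a zero-length name terminates the entry list and that the fields are stored little-endian on disk) could walk a loaded attribute block like this:

#include <linux/kernel.h>
#include <linux/ext2_xattr.h>
#include <asm/byteorder.h>

static void demo_dump_xattr_block(char *block)
{
	struct ext2_xattr_header *hdr = (struct ext2_xattr_header *)block;
	struct ext2_xattr_entry *entry;

	if (hdr->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC))
		return;				/* not an attribute block */

	/* the first entry starts right after the header */
	entry = (struct ext2_xattr_entry *)(hdr + 1);
	while (entry->e_name_len != 0) {	/* assumed end-of-list marker */
		printk(KERN_DEBUG "xattr index %d, name %.*s, %u bytes\n",
		       entry->e_name_index, entry->e_name_len, entry->e_name,
		       le32_to_cpu(entry->e_value_size));
		entry = EXT2_XATTR_NEXT(entry);
	}
}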
+--- linux-rh-2.4.20-8/include/linux/ext3_fs.h~linux-2.4.20-xattr-0.8.54-chaos  2003-05-07 17:33:59.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/linux/ext3_fs.h     2003-05-07 17:34:25.000000000 +0800
+@@ -63,8 +63,6 @@
+  */
+ #define       EXT3_BAD_INO             1      /* Bad blocks inode */
+ #define EXT3_ROOT_INO          2      /* Root inode */
+-#define EXT3_ACL_IDX_INO       3      /* ACL inode */
+-#define EXT3_ACL_DATA_INO      4      /* ACL inode */
+ #define EXT3_BOOT_LOADER_INO   5      /* Boot loader inode */
+ #define EXT3_UNDEL_DIR_INO     6      /* Undelete directory inode */
+ #define EXT3_RESIZE_INO                7      /* Reserved group descriptors inode */
+@@ -94,7 +92,6 @@
+ #else
+ # define EXT3_BLOCK_SIZE(s)           (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+ #endif
+-#define EXT3_ACLE_PER_BLOCK(s)                (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry))
+ #define       EXT3_ADDR_PER_BLOCK(s)          (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
+ #ifdef __KERNEL__
+ # define EXT3_BLOCK_SIZE_BITS(s)      ((s)->s_blocksize_bits)
+@@ -129,28 +126,6 @@
+ #endif
+ /*
+- * ACL structures
+- */
+-struct ext3_acl_header        /* Header of Access Control Lists */
+-{
+-      __u32   aclh_size;
+-      __u32   aclh_file_count;
+-      __u32   aclh_acle_count;
+-      __u32   aclh_first_acle;
+-};
+-
+-struct ext3_acl_entry /* Access Control List Entry */
+-{
+-      __u32   acle_size;
+-      __u16   acle_perms;     /* Access permissions */
+-      __u16   acle_type;      /* Type of entry */
+-      __u16   acle_tag;       /* User or group identity */
+-      __u16   acle_pad1;
+-      __u32   acle_next;      /* Pointer on next entry for the */
+-                                      /* same inode or on next free entry */
+-};
+-
+-/*
+  * Structure of a blocks group descriptor
+  */
+ struct ext3_group_desc
+@@ -344,6 +319,7 @@ struct ext3_inode {
+   #define EXT3_MOUNT_WRITEBACK_DATA   0x0C00  /* No data ordering */
+ #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
++#define EXT3_MOUNT_XATTR_USER         0x4000  /* Extended user attributes */
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+@@ -520,7 +496,7 @@ struct ext3_super_block {
+ #define EXT3_FEATURE_INCOMPAT_RECOVER         0x0004 /* Needs recovery */
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV     0x0008 /* Journal device */
+-#define EXT3_FEATURE_COMPAT_SUPP      0
++#define EXT3_FEATURE_COMPAT_SUPP      EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT3_FEATURE_INCOMPAT_SUPP    (EXT3_FEATURE_INCOMPAT_FILETYPE| \
+                                        EXT3_FEATURE_INCOMPAT_RECOVER)
+ #define EXT3_FEATURE_RO_COMPAT_SUPP   (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+@@ -703,6 +679,7 @@ extern void ext3_check_inodes_bitmap (st
+ extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
+ /* inode.c */
++extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+@@ -771,8 +748,10 @@ extern struct address_space_operations e
+ /* namei.c */
+ extern struct inode_operations ext3_dir_inode_operations;
++extern struct inode_operations ext3_special_inode_operations;
+ /* symlink.c */
++extern struct inode_operations ext3_symlink_inode_operations;
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
+--- linux-rh-2.4.20-8/include/linux/ext3_jbd.h~linux-2.4.20-xattr-0.8.54-chaos 2003-05-07 17:33:59.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/linux/ext3_jbd.h    2003-05-07 17:34:25.000000000 +0800
+@@ -30,13 +30,19 @@
+ #define EXT3_SINGLEDATA_TRANS_BLOCKS  8U
++/* Extended attributes may touch two data buffers, two bitmap buffers,
++ * and two group and summaries. */
++
++#define EXT3_XATTR_TRANS_BLOCKS               8
++
+ /* Define the minimum size for a transaction which modifies data.  This
+  * needs to take into account the fact that we may end up modifying two
+  * quota files too (one for the group, one for the user quota).  The
+  * superblock only gets updated once, of course, so don't bother
+  * counting that again for the quota updates. */
+-#define EXT3_DATA_TRANS_BLOCKS                (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2)
++#define EXT3_DATA_TRANS_BLOCKS                (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \
++                                       EXT3_XATTR_TRANS_BLOCKS - 2)
+ extern int ext3_writepage_trans_blocks(struct inode *inode);
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/linux/ext3_xattr.h  2003-05-07 17:34:25.000000000 +0800
+@@ -0,0 +1,157 @@
++/*
++  File: linux/ext3_xattr.h
++
++  On-disk format of extended attributes for the ext3 filesystem.
++
++  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++#include <linux/config.h>
++#include <linux/init.h>
++#include <linux/xattr.h>
++
++/* Magic value in attribute blocks */
++#define EXT3_XATTR_MAGIC              0xEA020000
++
++/* Maximum number of references to one attribute block */
++#define EXT3_XATTR_REFCOUNT_MAX               1024
++
++/* Name indexes */
++#define EXT3_XATTR_INDEX_MAX                  10
++#define EXT3_XATTR_INDEX_USER                 1
++#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS     2
++#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT    3
++
++struct ext3_xattr_header {
++      __u32   h_magic;        /* magic number for identification */
++      __u32   h_refcount;     /* reference count */
++      __u32   h_blocks;       /* number of disk blocks used */
++      __u32   h_hash;         /* hash value of all attributes */
++      __u32   h_reserved[4];  /* zero right now */
++};
++
++struct ext3_xattr_entry {
++      __u8    e_name_len;     /* length of name */
++      __u8    e_name_index;   /* attribute name index */
++      __u16   e_value_offs;   /* offset in disk block of value */
++      __u32   e_value_block;  /* disk block attribute is stored on (n/i) */
++      __u32   e_value_size;   /* size of attribute value */
++      __u32   e_hash;         /* hash value of name and value */
++      char    e_name[0];      /* attribute name */
++};
++
++#define EXT3_XATTR_PAD_BITS           2
++#define EXT3_XATTR_PAD                (1<<EXT3_XATTR_PAD_BITS)
++#define EXT3_XATTR_ROUND              (EXT3_XATTR_PAD-1)
++#define EXT3_XATTR_LEN(name_len) \
++      (((name_len) + EXT3_XATTR_ROUND + \
++      sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND)
++#define EXT3_XATTR_NEXT(entry) \
++      ( (struct ext3_xattr_entry *)( \
++        (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) )
++#define EXT3_XATTR_SIZE(size) \
++      (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)
++
++#ifdef __KERNEL__
++
++# ifdef CONFIG_EXT3_FS_XATTR
++
++struct ext3_xattr_handler {
++      char *prefix;
++      size_t (*list)(char *list, struct inode *inode, const char *name,
++                     int name_len);
++      int (*get)(struct inode *inode, const char *name, void *buffer,
++                 size_t size);
++      int (*set)(struct inode *inode, const char *name, const void *buffer,
++                 size_t size, int flags);
++};
++
++extern int ext3_xattr_register(int, struct ext3_xattr_handler *);
++extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *);
++
++extern int ext3_setxattr(struct dentry *, const char *, const void *, size_t, int);
++extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t);
++extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
++extern int ext3_removexattr(struct dentry *, const char *);
++
++extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
++extern int ext3_xattr_list(struct inode *, char *, size_t);
++extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, const void *, size_t, int);
++
++extern void ext3_xattr_delete_inode(handle_t *, struct inode *);
++extern void ext3_xattr_put_super(struct super_block *);
++
++extern int init_ext3_xattr(void) __init;
++extern void exit_ext3_xattr(void);
++
++# else  /* CONFIG_EXT3_FS_XATTR */
++#  define ext3_setxattr               NULL
++#  define ext3_getxattr               NULL
++#  define ext3_listxattr      NULL
++#  define ext3_removexattr    NULL
++
++static inline int
++ext3_xattr_get(struct inode *inode, int name_index, const char *name,
++             void *buffer, size_t size)
++{
++      return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_list(struct inode *inode, void *buffer, size_t size)
++{
++      return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
++             const char *name, const void *value, size_t size, int flags)
++{
++      return -ENOTSUP;
++}
++
++static inline void
++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
++{
++}
++
++static inline void
++ext3_xattr_put_super(struct super_block *sb)
++{
++}
++
++static inline int
++init_ext3_xattr(void)
++{
++      return 0;
++}
++
++static inline void
++exit_ext3_xattr(void)
++{
++}
++
++# endif  /* CONFIG_EXT3_FS_XATTR */
++
++# ifdef CONFIG_EXT3_FS_XATTR_USER
++
++extern int init_ext3_xattr_user(void) __init;
++extern void exit_ext3_xattr_user(void);
++
++# else  /* CONFIG_EXT3_FS_XATTR_USER */
++
++static inline int
++init_ext3_xattr_user(void)
++{
++      return 0;
++}
++
++static inline void
++exit_ext3_xattr_user(void)
++{
++}
++
++#endif  /* CONFIG_EXT3_FS_XATTR_USER */
++
++#endif  /* __KERNEL__ */
++
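ext3's copy of the header mirrors ext2's, except that ext3_xattr_set() and ext3_xattr_delete_inode() take a journal handle. Per-namespace handlers hang off ext3_xattr_register(); purely as an illustration of that API (the "demo." prefix, the callback bodies, and the reuse of EXT3_XATTR_INDEX_USER are invented here, and the real handlers live in xattr_user.c, which is not shown in this hunk), a read-only handler might be registered like so:

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/ext3_xattr.h>

static int demo_xattr_get(struct inode *inode, const char *name,
			  void *buffer, size_t size)
{
	/* delegate the on-disk lookup under an existing name index */
	return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, buffer, size);
}

static int demo_xattr_set(struct inode *inode, const char *name,
			  const void *value, size_t size, int flags)
{
	return -EPERM;	/* keep the sketch read-only: no journal handle needed */
}

static size_t demo_xattr_list(char *list, struct inode *inode,
			      const char *name, int name_len)
{
	return 0;	/* a real handler would emit "demo.<name>" into list */
}

static struct ext3_xattr_handler demo_xattr_handler = {
	prefix:	"demo.",
	list:	demo_xattr_list,
	get:	demo_xattr_get,
	set:	demo_xattr_set,
};

static int __init init_demo_xattr(void)
{
	return ext3_xattr_register(EXT3_XATTR_INDEX_USER, &demo_xattr_handler);
}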
+--- linux-rh-2.4.20-8/include/linux/fs.h~linux-2.4.20-xattr-0.8.54-chaos       2003-05-07 17:33:58.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/linux/fs.h  2003-05-07 17:34:25.000000000 +0800
+@@ -915,7 +915,7 @@ struct inode_operations {
+       int (*setattr) (struct dentry *, struct iattr *);
+       int (*setattr_raw) (struct inode *, struct iattr *);
+       int (*getattr) (struct dentry *, struct iattr *);
+-      int (*setxattr) (struct dentry *, const char *, void *, size_t, int);
++      int (*setxattr) (struct dentry *, const char *, const void *, size_t, int);
+       ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+       ssize_t (*listxattr) (struct dentry *, char *, size_t);
+       int (*removexattr) (struct dentry *, const char *);
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-rh-2.4.20-8-root/include/linux/mbcache.h     2003-05-07 17:34:25.000000000 +0800
+@@ -0,0 +1,69 @@
++/*
++  File: linux/mbcache.h
++
++  (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++/* Hardwire the number of additional indexes */
++#define MB_CACHE_INDEXES_COUNT 1
++
++struct mb_cache_entry;
++
++struct mb_cache_op {
++      int (*free)(struct mb_cache_entry *, int);
++};
++
++struct mb_cache {
++      struct list_head                c_cache_list;
++      const char                      *c_name;
++      struct mb_cache_op              c_op;
++      atomic_t                        c_entry_count;
++      int                             c_bucket_count;
++#ifndef MB_CACHE_INDEXES_COUNT
++      int                             c_indexes_count;
++#endif
++      kmem_cache_t                    *c_entry_cache;
++      struct list_head                *c_block_hash;
++      struct list_head                *c_indexes_hash[0];
++};
++
++struct mb_cache_entry_index {
++      struct list_head                o_list;
++      unsigned int                    o_key;
++};
++
++struct mb_cache_entry {
++      struct list_head                e_lru_list;
++      struct mb_cache                 *e_cache;
++      atomic_t                        e_used;
++      kdev_t                          e_dev;
++      unsigned long                   e_block;
++      struct list_head                e_block_list;
++      struct mb_cache_entry_index     e_indexes[0];
++};
++
++/* Functions on caches */
++
++struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t,
++                                int, int);
++void mb_cache_shrink(struct mb_cache *, kdev_t);
++void mb_cache_destroy(struct mb_cache *);
++
++/* Functions on cache entries */
++
++struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *);
++int mb_cache_entry_insert(struct mb_cache_entry *, kdev_t, unsigned long,
++                        unsigned int[]);
++void mb_cache_entry_rehash(struct mb_cache_entry *, unsigned int[]);
++void mb_cache_entry_release(struct mb_cache_entry *);
++void mb_cache_entry_takeout(struct mb_cache_entry *);
++void mb_cache_entry_free(struct mb_cache_entry *);
++struct mb_cache_entry *mb_cache_entry_dup(struct mb_cache_entry *);
++struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, kdev_t,
++                                        unsigned long);
++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
++struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, int,
++                                               kdev_t, unsigned int);
++struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, int,
++                                              kdev_t, unsigned int);
++#endif
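mbcache gives ext2/ext3 a way to remember which attribute blocks already exist on disk so that identical blocks can be shared rather than duplicated. A rough usage sketch follows; the argument meanings of mb_cache_create() (name, optional free op, per-entry size, number of indexes, number of hash buckets) are inferred from the prototypes above rather than stated by the patch, and all "demo" names are invented here:

#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/mbcache.h>

static struct mb_cache *demo_cache;

static int __init init_demo_cache(void)
{
	demo_cache = mb_cache_create("demo_cache", NULL,
				     sizeof(struct mb_cache_entry) +
				     sizeof(struct mb_cache_entry_index),
				     1 /* indexes */, 61 /* hash buckets */);
	return demo_cache ? 0 : -ENOMEM;
}

static void demo_cache_remember(kdev_t dev, unsigned long block, unsigned int key)
{
	struct mb_cache_entry *ce = mb_cache_entry_alloc(demo_cache);
	unsigned int keys[1] = { key };

	if (!ce)
		return;
	if (mb_cache_entry_insert(ce, dev, block, keys) == 0)
		mb_cache_entry_release(ce);	/* the cache now holds its own reference */
	else
		mb_cache_entry_free(ce);	/* duplicate or error: drop the entry */
}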
+--- linux-rh-2.4.20-8/kernel/ksyms.c~linux-2.4.20-xattr-0.8.54-chaos   2003-05-07 17:33:58.000000000 +0800
++++ linux-rh-2.4.20-8-root/kernel/ksyms.c      2003-05-07 17:34:25.000000000 +0800
+@@ -12,6 +12,7 @@
+ #define __KERNEL_SYSCALLS__
+ #include <linux/config.h>
+ #include <linux/slab.h>
++#include <linux/cache_def.h>
+ #include <linux/smp.h>
+ #include <linux/module.h>
+ #include <linux/blkdev.h>
+@@ -107,6 +108,7 @@ EXPORT_SYMBOL(exit_mm);
+ EXPORT_SYMBOL(exit_files);
+ EXPORT_SYMBOL(exit_fs);
+ EXPORT_SYMBOL(exit_sighand);
++EXPORT_SYMBOL(copy_fs_struct);
+ /* internal kernel memory management */
+ EXPORT_SYMBOL(_alloc_pages);
+@@ -125,6 +127,8 @@ EXPORT_SYMBOL(kmem_cache_alloc);
+ EXPORT_SYMBOL(kmem_cache_free);
+ EXPORT_SYMBOL(kmem_cache_validate);
+ EXPORT_SYMBOL(kmem_cache_size);
++EXPORT_SYMBOL(register_cache);
++EXPORT_SYMBOL(unregister_cache);
+ EXPORT_SYMBOL(kmalloc);
+ EXPORT_SYMBOL(kfree);
+ EXPORT_SYMBOL(vfree);
+--- linux-rh-2.4.20-8/mm/vmscan.c~linux-2.4.20-xattr-0.8.54-chaos      2003-05-07 17:33:58.000000000 +0800
++++ linux-rh-2.4.20-8-root/mm/vmscan.c 2003-05-07 17:34:25.000000000 +0800
+@@ -21,6 +21,7 @@
+ #include <linux/kernel_stat.h>
+ #include <linux/swap.h>
+ #include <linux/swapctl.h>
++#include <linux/cache_def.h>
+ #include <linux/smp_lock.h>
+ #include <linux/pagemap.h>
+ #include <linux/init.h>
+@@ -444,6 +445,39 @@ static inline void kachunk_cache(struct 
+ #define BATCH_WORK_AMOUNT     64
++static DECLARE_MUTEX(other_caches_sem);
++static LIST_HEAD(cache_definitions);
++
++void register_cache(struct cache_definition *cache)
++{
++      down(&other_caches_sem);
++      list_add(&cache->link, &cache_definitions);
++      up(&other_caches_sem);
++}
++
++void unregister_cache(struct cache_definition *cache)
++{
++      down(&other_caches_sem);
++      list_del(&cache->link);
++      up(&other_caches_sem);
++}
++
++static void shrink_other_caches(unsigned int priority, int gfp_mask)
++{
++      struct list_head *p;
++
++      if (down_trylock(&other_caches_sem))
++              return;
++
++      list_for_each_prev(p, &cache_definitions) {
++              struct cache_definition *cache =
++                      list_entry(p, struct cache_definition, link);
++
++              cache->shrink(priority, gfp_mask);
++      }
++      up(&other_caches_sem);
++}
++
+ /*
+  * returns the active cache ratio relative to the total active list
+  * times 10 (eg. 30% cache returns 3)
+@@ -887,7 +921,7 @@ static int do_try_to_free_pages_kswapd(u
+       ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask);
+       ret += shrink_icache_memory(DEF_PRIORITY, gfp_mask);
+-      // ret += shrink_other_caches(DEF_PRIORITY, gfp_mask); 
++      shrink_other_caches(DEF_PRIORITY, gfp_mask); 
+ #ifdef CONFIG_QUOTA
+       ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
+ #endif
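The register_cache()/shrink_other_caches() pair above is the hook mbcache uses to shed entries when kswapd runs short of memory: any module can put a cache_definition on the list, and its shrink callback is then invoked from do_try_to_free_pages_kswapd(). A minimal, hypothetical client (names invented here) would look like:

#include <linux/init.h>
#include <linux/cache_def.h>

static void demo_shrink(int priority, unsigned int gfp_mask)
{
	/* free some cached objects in response to memory pressure */
}

static struct cache_definition demo_cache_def = {
	name:	"demo",
	shrink:	demo_shrink,
};

static int __init demo_register(void)
{
	register_cache(&demo_cache_def);
	return 0;
}

static void __exit demo_unregister(void)
{
	unregister_cache(&demo_cache_def);
}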
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-root/fs/ext3/ext3-exports.c  2003-05-05 18:19:11.000000000 +0800
+@@ -0,0 +1,13 @@
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
++
++EXPORT_SYMBOL(ext3_force_commit);
++EXPORT_SYMBOL(ext3_bread);
++EXPORT_SYMBOL(ext3_xattr_register);
++EXPORT_SYMBOL(ext3_xattr_unregister);
++EXPORT_SYMBOL(ext3_xattr_get);
++EXPORT_SYMBOL(ext3_xattr_list);
++EXPORT_SYMBOL(ext3_xattr_set);
+
+_
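With the syscall tables in these patches wired up and one of the CONFIG_EXT*_FS_XATTR_USER options enabled, the interface is exercised from user space through the new setxattr()/getxattr() family. A minimal sketch, assuming a libc that exposes <sys/xattr.h> (older installs use the equivalent <attr/xattr.h> from the attr package) and assuming the filesystem is mounted with the user_xattr option:

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "testfile";
	const char *value = "demo value";
	char buf[64];
	ssize_t len;

	if (setxattr(path, "user.demo", value, strlen(value), 0) != 0) {
		perror("setxattr");
		return 1;
	}
	len = getxattr(path, "user.demo", buf, sizeof(buf) - 1);
	if (len < 0) {
		perror("getxattr");
		return 1;
	}
	buf[len] = '\0';
	printf("user.demo = %s\n", buf);
	return 0;
}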
diff --git a/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-hp.patch b/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-hp.patch
new file mode 100644 (file)
index 0000000..f0f92e5
--- /dev/null
@@ -0,0 +1,5536 @@
+ Documentation/Configure.help  |   66 ++
+ arch/alpha/defconfig          |    7 
+ arch/alpha/kernel/entry.S     |   12 
+ arch/arm/defconfig            |    7 
+ arch/arm/kernel/calls.S       |   24 
+ arch/i386/defconfig           |    7 
+ arch/ia64/defconfig           |    7 
+ arch/m68k/defconfig           |    7 
+ arch/mips/defconfig           |    7 
+ arch/mips64/defconfig         |    7 
+ arch/ppc/defconfig            |   14 
+ arch/ppc64/kernel/misc.S      |    2 
+ arch/s390/defconfig           |    7 
+ arch/s390/kernel/entry.S      |   24 
+ arch/s390x/defconfig          |    7 
+ arch/s390x/kernel/entry.S     |   24 
+ arch/s390x/kernel/wrapper32.S |   92 +++
+ arch/sparc/defconfig          |    7 
+ arch/sparc/kernel/systbls.S   |   10 
+ arch/sparc64/defconfig        |    7 
+ arch/sparc64/kernel/systbls.S |   20 
+ fs/Config.in                  |   14 
+ fs/Makefile                   |    3 
+ fs/ext2/Makefile              |    4 
+ fs/ext2/file.c                |    5 
+ fs/ext2/ialloc.c              |    2 
+ fs/ext2/inode.c               |   34 -
+ fs/ext2/namei.c               |   14 
+ fs/ext2/super.c               |   29 
+ fs/ext2/symlink.c             |   14 
+ fs/ext2/xattr.c               | 1212 +++++++++++++++++++++++++++++++++++++++++
+ fs/ext2/xattr_user.c          |  103 +++
+ fs/ext3/Makefile              |    9 
+ fs/ext3/ext3-exports.c        |   13 
+ fs/ext3/file.c                |    5 
+ fs/ext3/ialloc.c              |    2 
+ fs/ext3/inode.c               |   35 -
+ fs/ext3/namei.c               |   21 
+ fs/ext3/super.c               |   36 +
+ fs/ext3/symlink.c             |   14 
+ fs/ext3/xattr.c               | 1225 ++++++++++++++++++++++++++++++++++++++++++
+ fs/ext3/xattr_user.c          |  111 +++
+ fs/jfs/jfs_xattr.h            |    6 
+ fs/jfs/xattr.c                |    6 
+ fs/mbcache.c                  |  648 ++++++++++++++++++++++
+ include/asm-arm/unistd.h      |    2 
+ include/asm-ppc64/unistd.h    |    2 
+ include/asm-s390/unistd.h     |   15 
+ include/asm-s390x/unistd.h    |   15 
+ include/asm-sparc/unistd.h    |   24 
+ include/asm-sparc64/unistd.h  |   24 
+ include/linux/cache_def.h     |   15 
+ include/linux/errno.h         |    4 
+ include/linux/ext2_fs.h       |   31 -
+ include/linux/ext2_xattr.h    |  157 +++++
+ include/linux/ext3_fs.h       |   31 -
+ include/linux/ext3_jbd.h      |    8 
+ include/linux/ext3_xattr.h    |  157 +++++
+ include/linux/fs.h            |    2 
+ include/linux/mbcache.h       |   69 ++
+ kernel/ksyms.c                |    4 
+ mm/vmscan.c                   |   35 +
+ 62 files changed, 4343 insertions(+), 182 deletions(-)
+
+--- linux/Documentation/Configure.help~linux-2.4.20-xattr-0.8.54-hp    Fri May 16 08:39:23 2003
++++ linux-mmonroe/Documentation/Configure.help Fri May 16 08:43:00 2003
+@@ -15309,6 +15309,39 @@ CONFIG_EXT2_FS
+   be compiled as a module, and so this could be dangerous.  Most
+   everyone wants to say Y here.
++Ext2 extended attributes
++CONFIG_EXT2_FS_XATTR
++  Extended attributes are name:value pairs associated with inodes by
++  the kernel or by users (see the attr(5) manual page, or visit
++  <http://acl.bestbits.at/> for details).
++
++  If unsure, say N.
++
++Ext2 extended attribute block sharing
++CONFIG_EXT2_FS_XATTR_SHARING
++  This option enables code for sharing identical extended attribute
++  blocks among multiple inodes.
++
++  Usually, say Y.
++
++Ext2 extended user attributes
++CONFIG_EXT2_FS_XATTR_USER
++  This option enables extended user attributes on ext2. Processes can
++  associate extended user attributes with inodes to store additional
++  information such as the character encoding of files, etc. (see the
++  attr(5) manual page, or visit <http://acl.bestbits.at/> for details).
++
++  If unsure, say N.
++
++Ext2 trusted extended attributes
++CONFIG_EXT2_FS_XATTR_TRUSTED
++  This option enables extended attributes on ext2 that are accessible
++  (and visible) only to users capable of CAP_SYS_ADMIN. Usually this
++  is only the super user. Trusted extended attributes are meant for
++  implementing system/security services.
++
++  If unsure, say N.
++
+ Ext3 journalling file system support (EXPERIMENTAL)
+ CONFIG_EXT3_FS
+   This is the journalling version of the Second extended file system
+@@ -15341,6 +15374,39 @@ CONFIG_EXT3_FS
+   of your root partition (the one containing the directory /) cannot
+   be compiled as a module, and so this may be dangerous.
++Ext3 extended attributes
++CONFIG_EXT3_FS_XATTR
++  Extended attributes are name:value pairs associated with inodes by
++  the kernel or by users (see the attr(5) manual page, or visit
++  <http://acl.bestbits.at/> for details).
++
++  If unsure, say N.
++
++Ext3 extended attribute block sharing
++CONFIG_EXT3_FS_XATTR_SHARING
++  This option enables code for sharing identical extended attribute
++  blocks among multiple inodes.
++
++  Usually, say Y.
++
++Ext3 extended user attributes
++CONFIG_EXT3_FS_XATTR_USER
++  This option enables extended user attributes on ext3. Processes can
++  associate extended user attributes with inodes to store additional
++  information such as the character encoding of files, etc. (see the
++  attr(5) manual page, or visit <http://acl.bestbits.at/> for details).
++
++  If unsure, say N.
++
++Ext3 trusted extended attributes
++CONFIG_EXT3_FS_XATTR_TRUSTED
++  This option enables extended attributes on ext3 that are accessible
++  (and visible) only to users capable of CAP_SYS_ADMIN. Usually this
++  is only the super user. Trusted extended attributes are meant for
++  implementing system/security services.
++
++  If unsure, say N.
++
+ Journal Block Device support (JBD for ext3) (EXPERIMENTAL)
+ CONFIG_JBD
+   This is a generic journalling layer for block devices.  It is
+--- linux/arch/alpha/defconfig~linux-2.4.20-xattr-0.8.54-hp    Fri May 16 08:39:14 2003
++++ linux-mmonroe/arch/alpha/defconfig Fri May 16 08:43:00 2003
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_ALPHA=y
+ # CONFIG_UID16 is not set
+ # CONFIG_RWSEM_GENERIC_SPINLOCK is not set
+--- linux/arch/alpha/kernel/entry.S~linux-2.4.20-xattr-0.8.54-hp       Fri Aug  2 17:39:42 2002
++++ linux-mmonroe/arch/alpha/kernel/entry.S    Fri May 16 08:43:00 2003
+@@ -1154,6 +1154,18 @@ sys_call_table:
+       .quad sys_readahead
+       .quad sys_ni_syscall                    /* 380, sys_security */
+       .quad sys_tkill
++      .quad sys_setxattr
++      .quad sys_lsetxattr
++      .quad sys_fsetxattr
++      .quad sys_getxattr                      /* 385 */
++      .quad sys_lgetxattr
++      .quad sys_fgetxattr
++      .quad sys_listxattr
++      .quad sys_llistxattr
++      .quad sys_flistxattr                    /* 390 */
++      .quad sys_removexattr
++      .quad sys_lremovexattr
++      .quad sys_fremovexattr
+ /* Remember to update everything, kids.  */
+ .ifne (. - sys_call_table) - (NR_SYSCALLS * 8)
+--- linux/arch/arm/defconfig~linux-2.4.20-xattr-0.8.54-hp      Fri May 16 08:39:14 2003
++++ linux-mmonroe/arch/arm/defconfig   Fri May 16 08:43:00 2003
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_ARM=y
+ # CONFIG_EISA is not set
+ # CONFIG_SBUS is not set
+--- linux/arch/arm/kernel/calls.S~linux-2.4.20-xattr-0.8.54-hp Fri Aug  2 17:39:42 2002
++++ linux-mmonroe/arch/arm/kernel/calls.S      Fri May 16 08:43:00 2003
+@@ -240,18 +240,18 @@ __syscall_start:
+               .long   SYMBOL_NAME(sys_ni_syscall) /* Security */
+               .long   SYMBOL_NAME(sys_gettid)
+ /* 225 */     .long   SYMBOL_NAME(sys_readahead)
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_setxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_lsetxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_fsetxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_getxattr */
+-/* 230 */     .long   SYMBOL_NAME(sys_ni_syscall) /* sys_lgetxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_fgetxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_listxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_llistxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_flistxattr */
+-/* 235 */     .long   SYMBOL_NAME(sys_ni_syscall) /* sys_removexattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_lremovexattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_fremovexattr */
++              .long   SYMBOL_NAME(sys_setxattr)
++              .long   SYMBOL_NAME(sys_lsetxattr)
++              .long   SYMBOL_NAME(sys_fsetxattr)
++              .long   SYMBOL_NAME(sys_getxattr)
++/* 230 */     .long   SYMBOL_NAME(sys_lgetxattr)
++              .long   SYMBOL_NAME(sys_fgetxattr)
++              .long   SYMBOL_NAME(sys_listxattr)
++              .long   SYMBOL_NAME(sys_llistxattr)
++              .long   SYMBOL_NAME(sys_flistxattr)
++/* 235 */     .long   SYMBOL_NAME(sys_removexattr)
++              .long   SYMBOL_NAME(sys_lremovexattr)
++              .long   SYMBOL_NAME(sys_fremovexattr)
+               .long   SYMBOL_NAME(sys_tkill)
+               /*
+                * Please check 2.5 _before_ adding calls here,
+--- linux/arch/i386/defconfig~linux-2.4.20-xattr-0.8.54-hp     Fri May 16 08:39:14 2003
++++ linux-mmonroe/arch/i386/defconfig  Fri May 16 08:43:00 2003
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_X86=y
+ CONFIG_ISA=y
+ # CONFIG_SBUS is not set
+--- linux/arch/ia64/defconfig~linux-2.4.20-xattr-0.8.54-hp     Fri May 16 08:39:14 2003
++++ linux-mmonroe/arch/ia64/defconfig  Fri May 16 08:43:00 2003
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ #
+ # Code maturity level options
+--- linux/arch/m68k/defconfig~linux-2.4.20-xattr-0.8.54-hp     Fri May 16 08:39:14 2003
++++ linux-mmonroe/arch/m68k/defconfig  Fri May 16 08:43:00 2003
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_UID16=y
+ #
+--- linux/arch/mips/defconfig~linux-2.4.20-xattr-0.8.54-hp     Fri May 16 08:39:14 2003
++++ linux-mmonroe/arch/mips/defconfig  Fri May 16 08:43:01 2003
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_MIPS=y
+ CONFIG_MIPS32=y
+ # CONFIG_MIPS64 is not set
+--- linux/arch/mips64/defconfig~linux-2.4.20-xattr-0.8.54-hp   Fri May 16 08:39:14 2003
++++ linux-mmonroe/arch/mips64/defconfig        Fri May 16 08:43:01 2003
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_MIPS=y
+ # CONFIG_MIPS32 is not set
+ CONFIG_MIPS64=y
+--- linux/arch/ppc/defconfig~linux-2.4.20-xattr-0.8.54-hp      Fri May 16 08:39:14 2003
++++ linux-mmonroe/arch/ppc/defconfig   Fri May 16 08:43:01 2003
+@@ -1,6 +1,20 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ # CONFIG_UID16 is not set
+ # CONFIG_RWSEM_GENERIC_SPINLOCK is not set
+ CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+--- linux/arch/ppc64/kernel/misc.S~linux-2.4.20-xattr-0.8.54-hp        Thu Nov 28 15:53:11 2002
++++ linux-mmonroe/arch/ppc64/kernel/misc.S     Fri May 16 08:43:01 2003
+@@ -731,6 +731,7 @@ _GLOBAL(sys_call_table32)
+       .llong .sys_gettid              /* 207 */
+ #if 0 /* Reserved syscalls */
+       .llong .sys_tkill               /* 208 */
++#endif
+       .llong .sys_setxattr
+       .llong .sys_lsetxattr   /* 210 */
+       .llong .sys_fsetxattr
+@@ -743,6 +744,7 @@ _GLOBAL(sys_call_table32)
+       .llong .sys_removexattr
+       .llong .sys_lremovexattr
+       .llong .sys_fremovexattr        /* 220 */
++#if 0 /* Reserved syscalls */
+       .llong .sys_futex
+ #endif
+       .llong .sys_perfmonctl   /* Put this here for now ... */
+--- linux/arch/s390/defconfig~linux-2.4.20-xattr-0.8.54-hp     Fri May 16 08:39:14 2003
++++ linux-mmonroe/arch/s390/defconfig  Fri May 16 08:43:01 2003
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ # CONFIG_ISA is not set
+ # CONFIG_EISA is not set
+ # CONFIG_MCA is not set
+--- linux/arch/s390/kernel/entry.S~linux-2.4.20-xattr-0.8.54-hp        Thu Nov 28 15:53:11 2002
++++ linux-mmonroe/arch/s390/kernel/entry.S     Fri May 16 08:43:01 2003
+@@ -558,18 +558,18 @@ sys_call_table:
+         .long  sys_fcntl64 
+       .long  sys_ni_syscall
+       .long  sys_ni_syscall
+-      .long  sys_ni_syscall            /* 224 - reserved for setxattr  */
+-      .long  sys_ni_syscall            /* 225 - reserved for lsetxattr */
+-      .long  sys_ni_syscall            /* 226 - reserved for fsetxattr */
+-      .long  sys_ni_syscall            /* 227 - reserved for getxattr  */
+-      .long  sys_ni_syscall            /* 228 - reserved for lgetxattr */
+-      .long  sys_ni_syscall            /* 229 - reserved for fgetxattr */
+-      .long  sys_ni_syscall            /* 230 - reserved for listxattr */
+-      .long  sys_ni_syscall            /* 231 - reserved for llistxattr */
+-      .long  sys_ni_syscall            /* 232 - reserved for flistxattr */
+-      .long  sys_ni_syscall            /* 233 - reserved for removexattr */
+-      .long  sys_ni_syscall            /* 234 - reserved for lremovexattr */
+-      .long  sys_ni_syscall            /* 235 - reserved for fremovexattr */
++      .long  sys_setxattr
++      .long  sys_lsetxattr            /* 225 */
++      .long  sys_fsetxattr
++      .long  sys_getxattr
++      .long  sys_lgetxattr
++      .long  sys_fgetxattr
++      .long  sys_listxattr            /* 230 */
++      .long  sys_llistxattr
++      .long  sys_flistxattr
++      .long  sys_removexattr
++      .long  sys_lremovexattr
++      .long  sys_fremovexattr         /* 235 */
+       .long  sys_gettid
+       .long  sys_tkill
+       .rept  255-237
+--- linux/arch/s390x/defconfig~linux-2.4.20-xattr-0.8.54-hp    Fri May 16 08:39:14 2003
++++ linux-mmonroe/arch/s390x/defconfig Fri May 16 08:43:01 2003
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ # CONFIG_ISA is not set
+ # CONFIG_EISA is not set
+ # CONFIG_MCA is not set
+--- linux/arch/s390x/kernel/entry.S~linux-2.4.20-xattr-0.8.54-hp       Thu Nov 28 15:53:11 2002
++++ linux-mmonroe/arch/s390x/kernel/entry.S    Fri May 16 08:43:01 2003
+@@ -591,18 +591,18 @@ sys_call_table:
+       .long  SYSCALL(sys_ni_syscall,sys32_fcntl64_wrapper)
+       .long  SYSCALL(sys_ni_syscall,sys_ni_syscall)
+       .long  SYSCALL(sys_ni_syscall,sys_ni_syscall)
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 224 - reserved for setxattr  */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 225 - reserved for lsetxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 226 - reserved for fsetxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 227 - reserved for getxattr  */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 228 - reserved for lgetxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 229 - reserved for fgetxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 230 - reserved for listxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 231 - reserved for llistxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 232 - reserved for flistxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 233 - reserved for removexattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 234 - reserved for lremovexattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 235 - reserved for fremovexattr */
++      .long  SYSCALL(sys_setxattr,sys32_setxattr_wrapper)
++      .long  SYSCALL(sys_lsetxattr,sys32_lsetxattr_wrapper)   /* 225 */
++      .long  SYSCALL(sys_fsetxattr,sys32_fsetxattr_wrapper)
++      .long  SYSCALL(sys_getxattr,sys32_getxattr_wrapper)
++      .long  SYSCALL(sys_lgetxattr,sys32_lgetxattr_wrapper)
++      .long  SYSCALL(sys_fgetxattr,sys32_fgetxattr_wrapper)
++      .long  SYSCALL(sys_listxattr,sys32_listxattr_wrapper)   /* 230 */
++      .long  SYSCALL(sys_llistxattr,sys32_llistxattr_wrapper)
++      .long  SYSCALL(sys_flistxattr,sys32_flistxattr_wrapper)
++      .long  SYSCALL(sys_removexattr,sys32_removexattr_wrapper)
++      .long  SYSCALL(sys_lremovexattr,sys32_lremovexattr_wrapper)
++      .long  SYSCALL(sys_fremovexattr,sys32_fremovexattr_wrapper)/* 235 */
+       .long  SYSCALL(sys_gettid,sys_gettid)
+       .long  SYSCALL(sys_tkill,sys_tkill)
+       .rept  255-237
+--- linux/arch/s390x/kernel/wrapper32.S~linux-2.4.20-xattr-0.8.54-hp   Mon Feb 25 11:37:56 2002
++++ linux-mmonroe/arch/s390x/kernel/wrapper32.S        Fri May 16 08:43:01 2003
+@@ -1091,3 +1091,95 @@ sys32_fstat64_wrapper:
+       llgtr   %r3,%r3                 # struct stat64 *
+       llgfr   %r4,%r4                 # long
+       jg      sys32_fstat64           # branch to system call
++
++      .globl  sys32_setxattr_wrapper
++sys32_setxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      lgfr    %r6,%r6                 # int
++      jg      sys_setxattr
++
++      .globl  sys32_lsetxattr_wrapper
++sys32_lsetxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      lgfr    %r6,%r6                 # int
++      jg      sys_lsetxattr
++
++      .globl  sys32_fsetxattr_wrapper
++sys32_fsetxattr_wrapper:
++      lgfr    %r2,%r2                 # int
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      lgfr    %r6,%r6                 # int
++      jg      sys_fsetxattr
++
++      .globl  sys32_getxattr_wrapper
++sys32_getxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      jg      sys_getxattr
++
++      .globl  sys32_lgetxattr_wrapper
++sys32_lgetxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      jg      sys_lgetxattr
++
++      .globl  sys32_fgetxattr_wrapper
++sys32_fgetxattr_wrapper:
++      lgfr    %r2,%r2                 # int
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      jg      sys_fgetxattr
++
++      .globl  sys32_listxattr_wrapper
++sys32_listxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgfr   %r4,%r4                 # size_t
++      jg      sys_listxattr
++
++      .globl  sys32_llistxattr_wrapper
++sys32_llistxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgfr   %r4,%r4                 # size_t
++      jg      sys_llistxattr
++
++      .globl  sys32_flistxattr_wrapper
++sys32_flistxattr_wrapper:
++      lgfr    %r2,%r2                 # int
++      llgtr   %r3,%r3                 # char *
++      llgfr   %r4,%r4                 # size_t
++      jg      sys_flistxattr
++
++      .globl  sys32_removexattr_wrapper
++sys32_removexattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      jg      sys_removexattr
++
++      .globl  sys32_lremovexattr_wrapper
++sys32_lremovexattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      jg      sys_lremovexattr
++
++      .globl  sys32_fremovexattr_wrapper
++sys32_fremovexattr_wrapper:
++      lgfr    %r2,%r2                 # int
++      llgtr   %r3,%r3                 # char *
++      jg      sys_fremovexattr
++
++
+--- linux/arch/sparc/defconfig~linux-2.4.20-xattr-0.8.54-hp    Fri May 16 08:39:14 2003
++++ linux-mmonroe/arch/sparc/defconfig Fri May 16 08:43:01 2003
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_UID16=y
+ CONFIG_HIGHMEM=y
+--- linux/arch/sparc/kernel/systbls.S~linux-2.4.20-xattr-0.8.54-hp     Fri Aug  2 17:39:43 2002
++++ linux-mmonroe/arch/sparc/kernel/systbls.S  Fri May 16 08:43:01 2003
+@@ -51,11 +51,11 @@ sys_call_table:
+ /*150*/       .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64
+ /*155*/       .long sys_fcntl64, sys_nis_syscall, sys_statfs, sys_fstatfs, sys_oldumount
+ /*160*/       .long sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_nis_syscall
+-/*165*/       .long sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_nis_syscall
+-/*170*/       .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_getdents
+-/*175*/       .long sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+-/*180*/       .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_sigpending, sys_query_module
+-/*185*/       .long sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sys_newuname
++/*165*/       .long sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_setxattr
++/*170*/       .long sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys_getdents
++/*175*/       .long sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr
++/*180*/       .long sys_flistxattr, sys_removexattr, sys_lremovexattr, sys_sigpending, sys_query_module
++/*185*/       .long sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sys_newuname
+ /*190*/       .long sys_init_module, sys_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+ /*195*/       .long sys_nis_syscall, sys_nis_syscall, sys_getppid, sparc_sigaction, sys_sgetmask
+ /*200*/       .long sys_ssetmask, sys_sigsuspend, sys_newlstat, sys_uselib, old_readdir
+--- linux/arch/sparc64/defconfig~linux-2.4.20-xattr-0.8.54-hp  Fri May 16 08:39:14 2003
++++ linux-mmonroe/arch/sparc64/defconfig       Fri May 16 08:43:01 2003
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ #
+ # Code maturity level options
+--- linux/arch/sparc64/kernel/systbls.S~linux-2.4.20-xattr-0.8.54-hp   Fri May 16 08:39:14 2003
++++ linux-mmonroe/arch/sparc64/kernel/systbls.S        Fri May 16 08:43:01 2003
+@@ -52,11 +52,11 @@ sys_call_table32:
+ /*150*/       .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64
+       .word sys32_fcntl64, sys_nis_syscall, sys32_statfs, sys32_fstatfs, sys_oldumount
+ /*160*/       .word sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_nis_syscall
+-      .word sys32_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_nis_syscall
+-/*170*/       .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys32_getdents
+-      .word sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+-/*180*/       .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys32_sigpending, sys32_query_module
+-      .word sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sparc64_newuname
++      .word sys32_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_setxattr
++/*170*/       .word sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys32_getdents
++      .word sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr
++/*180*/       .word sys_flistxattr, sys_removexattr, sys_lremovexattr, sys32_sigpending, sys32_query_module
++      .word sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sparc64_newuname
+ /*190*/       .word sys32_init_module, sparc64_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+       .word sys_nis_syscall, sys_nis_syscall, sys_getppid, sys32_sigaction, sys_sgetmask
+ /*200*/       .word sys_ssetmask, sys_sigsuspend, sys32_newlstat, sys_uselib, old32_readdir
+@@ -111,11 +111,11 @@ sys_call_table:
+ /*150*/       .word sys_getsockname, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64
+       .word sys_nis_syscall, sys_nis_syscall, sys_statfs, sys_fstatfs, sys_oldumount
+ /*160*/       .word sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_utrap_install
+-      .word sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_nis_syscall
+-/*170*/       .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_getdents
+-      .word sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+-/*180*/       .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_query_module
+-      .word sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sparc64_newuname
++      .word sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_setxattr
++/*170*/       .word sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys_getdents
++      .word sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr
++/*180*/       .word sys_flistxattr, sys_removexattr, sys_lremovexattr, sys_nis_syscall, sys_query_module
++      .word sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sparc64_newuname
+ /*190*/       .word sys_init_module, sparc64_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+       .word sys_nis_syscall, sys_nis_syscall, sys_getppid, sys_nis_syscall, sys_sgetmask
+ /*200*/       .word sys_ssetmask, sys_nis_syscall, sys_newlstat, sys_uselib, sys_nis_syscall
+--- linux/fs/Config.in~linux-2.4.20-xattr-0.8.54-hp    Fri May 16 08:39:14 2003
++++ linux-mmonroe/fs/Config.in Fri May 16 08:43:01 2003
+@@ -35,6 +35,11 @@ dep_mbool '  Debug Befs' CONFIG_BEFS_DEB
+ dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL
+ tristate 'Ext3 journalling file system support' CONFIG_EXT3_FS
++dep_mbool '  Ext3 extended attributes' CONFIG_EXT3_FS_XATTR $CONFIG_EXT3_FS
++dep_bool '    Ext3 extended attribute block sharing' \
++    CONFIG_EXT3_FS_XATTR_SHARING $CONFIG_EXT3_FS_XATTR
++dep_bool '    Ext3 extended user attributes' \
++    CONFIG_EXT3_FS_XATTR_USER $CONFIG_EXT3_FS_XATTR
+ # CONFIG_JBD could be its own option (even modular), but until there are
+ # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
+ # dep_tristate '  Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
+@@ -98,6 +103,11 @@ dep_mbool '  QNX4FS write support (DANGE
+ tristate 'ROM file system support' CONFIG_ROMFS_FS
+ tristate 'Second extended fs support' CONFIG_EXT2_FS
++dep_mbool '  Ext2 extended attributes' CONFIG_EXT2_FS_XATTR $CONFIG_EXT2_FS
++dep_bool '    Ext2 extended attribute block sharing' \
++    CONFIG_EXT2_FS_XATTR_SHARING $CONFIG_EXT2_FS_XATTR
++dep_bool '    Ext2 extended user attributes' \
++    CONFIG_EXT2_FS_XATTR_USER $CONFIG_EXT2_FS_XATTR
+ tristate 'System V/Xenix/V7/Coherent file system support' CONFIG_SYSV_FS
+@@ -176,6 +186,10 @@ else
+    define_tristate CONFIG_ZISOFS_FS n
+ fi
++# Meta block cache for Extended Attributes (ext2/ext3)
++#tristate 'Meta block cache' CONFIG_FS_MBCACHE
++define_tristate CONFIG_FS_MBCACHE y
++
+ mainmenu_option next_comment
+ comment 'Partition Types'
+ source fs/partitions/Config.in
+--- linux/fs/Makefile~linux-2.4.20-xattr-0.8.54-hp     Fri May 16 08:42:46 2003
++++ linux-mmonroe/fs/Makefile  Fri May 16 08:43:01 2003
+@@ -80,6 +80,9 @@ obj-y                                += binfmt_script.o
+ obj-$(CONFIG_BINFMT_ELF)      += binfmt_elf.o
++export-objs += mbcache.o
++obj-$(CONFIG_FS_MBCACHE)      += mbcache.o
++
+ # persistent filesystems
+ obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o))
+--- linux/fs/ext2/Makefile~linux-2.4.20-xattr-0.8.54-hp        Thu Oct 11 08:05:18 2001
++++ linux-mmonroe/fs/ext2/Makefile     Fri May 16 08:43:01 2003
+@@ -13,4 +13,8 @@ obj-y    := balloc.o bitmap.o dir.o file
+               ioctl.o namei.o super.o symlink.o
+ obj-m    := $(O_TARGET)
++export-objs += xattr.o
++obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o
++obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o
++
+ include $(TOPDIR)/Rules.make
+--- linux/fs/ext2/file.c~linux-2.4.20-xattr-0.8.54-hp  Thu Oct 11 08:05:18 2001
++++ linux-mmonroe/fs/ext2/file.c       Fri May 16 08:43:01 2003
+@@ -20,6 +20,7 @@
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/sched.h>
+ /*
+@@ -51,4 +52,8 @@ struct file_operations ext2_file_operati
+ struct inode_operations ext2_file_inode_operations = {
+       truncate:       ext2_truncate,
++      setxattr:       ext2_setxattr,
++      getxattr:       ext2_getxattr,
++      listxattr:      ext2_listxattr,
++      removexattr:    ext2_removexattr,
+ };
+--- linux/fs/ext2/ialloc.c~linux-2.4.20-xattr-0.8.54-hp        Thu Nov 28 15:53:15 2002
++++ linux-mmonroe/fs/ext2/ialloc.c     Fri May 16 08:43:01 2003
+@@ -15,6 +15,7 @@
+ #include <linux/config.h>
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+@@ -167,6 +168,7 @@ void ext2_free_inode (struct inode * ino
+        */
+       if (!is_bad_inode(inode)) {
+               /* Quota is already initialized in iput() */
++              ext2_xattr_delete_inode(inode);
+               DQUOT_FREE_INODE(inode);
+               DQUOT_DROP(inode);
+       }
+--- linux/fs/ext2/inode.c~linux-2.4.20-xattr-0.8.54-hp Thu Nov 28 15:53:15 2002
++++ linux-mmonroe/fs/ext2/inode.c      Fri May 16 08:43:01 2003
+@@ -39,6 +39,18 @@ MODULE_LICENSE("GPL");
+ static int ext2_update_inode(struct inode * inode, int do_sync);
+ /*
++ * Test whether an inode is a fast symlink.
++ */
++static inline int ext2_inode_is_fast_symlink(struct inode *inode)
++{
++      int ea_blocks = inode->u.ext2_i.i_file_acl ?
++              (inode->i_sb->s_blocksize >> 9) : 0;
++
++      return (S_ISLNK(inode->i_mode) &&
++              inode->i_blocks - ea_blocks == 0);
++}
++
++/*
+  * Called at each iput()
+  */
+ void ext2_put_inode (struct inode * inode)
+@@ -53,9 +65,7 @@ void ext2_delete_inode (struct inode * i
+ {
+       lock_kernel();
+-      if (is_bad_inode(inode) ||
+-          inode->i_ino == EXT2_ACL_IDX_INO ||
+-          inode->i_ino == EXT2_ACL_DATA_INO)
++      if (is_bad_inode(inode))
+               goto no_delete;
+       inode->u.ext2_i.i_dtime = CURRENT_TIME;
+       mark_inode_dirty(inode);
+@@ -801,6 +811,8 @@ void ext2_truncate (struct inode * inode
+       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+           S_ISLNK(inode->i_mode)))
+               return;
++      if (ext2_inode_is_fast_symlink(inode))
++              return;
+       if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+               return;
+@@ -888,8 +900,7 @@ void ext2_read_inode (struct inode * ino
+       unsigned long offset;
+       struct ext2_group_desc * gdp;
+-      if ((inode->i_ino != EXT2_ROOT_INO && inode->i_ino != EXT2_ACL_IDX_INO &&
+-           inode->i_ino != EXT2_ACL_DATA_INO &&
++      if ((inode->i_ino != EXT2_ROOT_INO &&
+            inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) ||
+           inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) {
+               ext2_error (inode->i_sb, "ext2_read_inode",
+@@ -974,10 +985,7 @@ void ext2_read_inode (struct inode * ino
+       for (block = 0; block < EXT2_N_BLOCKS; block++)
+               inode->u.ext2_i.i_data[block] = raw_inode->i_block[block];
+-      if (inode->i_ino == EXT2_ACL_IDX_INO ||
+-          inode->i_ino == EXT2_ACL_DATA_INO)
+-              /* Nothing to do */ ;
+-      else if (S_ISREG(inode->i_mode)) {
++      if (S_ISREG(inode->i_mode)) {
+               inode->i_op = &ext2_file_inode_operations;
+               inode->i_fop = &ext2_file_operations;
+               inode->i_mapping->a_ops = &ext2_aops;
+@@ -986,15 +994,17 @@ void ext2_read_inode (struct inode * ino
+               inode->i_fop = &ext2_dir_operations;
+               inode->i_mapping->a_ops = &ext2_aops;
+       } else if (S_ISLNK(inode->i_mode)) {
+-              if (!inode->i_blocks)
++              if (ext2_inode_is_fast_symlink(inode))
+                       inode->i_op = &ext2_fast_symlink_inode_operations;
+               else {
+-                      inode->i_op = &page_symlink_inode_operations;
++                      inode->i_op = &ext2_symlink_inode_operations;
+                       inode->i_mapping->a_ops = &ext2_aops;
+               }
+-      } else 
++      } else {
++              inode->i_op = &ext2_special_inode_operations;
+               init_special_inode(inode, inode->i_mode,
+                                  le32_to_cpu(raw_inode->i_block[0]));
++      }
+       brelse (bh);
+       inode->i_attr_flags = 0;
+       if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL) {
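
The new ext2_inode_is_fast_symlink() helper replaces the old `!inode->i_blocks` test because a fast symlink may now own one extended attribute block. Since i_blocks counts 512-byte sectors, that block contributes s_blocksize >> 9 sectors, which the helper subtracts before deciding. A standalone sketch of just that arithmetic (illustration only; the S_ISLNK() check is omitted and 4096 is an assumed block size):

/*
 * Illustration of the block arithmetic behind ext2_inode_is_fast_symlink().
 * i_blocks counts 512-byte sectors, so one xattr block of s_blocksize bytes
 * contributes s_blocksize >> 9 sectors.
 */
#include <assert.h>

static int fast_symlink_blocks(unsigned long i_blocks,
                               unsigned long s_blocksize, int has_ea_block)
{
    unsigned long ea_sectors = has_ea_block ? s_blocksize >> 9 : 0;

    return i_blocks - ea_sectors == 0;
}

int main(void)
{
    /* Fast symlink, no xattrs: the target lives in i_data, no blocks. */
    assert(fast_symlink_blocks(0, 4096, 0));
    /* Fast symlink that gained an xattr block: 4096 >> 9 == 8 sectors. */
    assert(fast_symlink_blocks(8, 4096, 1));
    /* Slow (page) symlink: one 4 KB data block, no xattr block. */
    assert(!fast_symlink_blocks(8, 4096, 0));
    return 0;
}
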
+--- linux/fs/ext2/namei.c~linux-2.4.20-xattr-0.8.54-hp Wed Oct  3 22:57:36 2001
++++ linux-mmonroe/fs/ext2/namei.c      Fri May 16 08:43:01 2003
+@@ -31,6 +31,7 @@
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/pagemap.h>
+ /*
+@@ -136,7 +137,7 @@ static int ext2_symlink (struct inode * 
+       if (l > sizeof (inode->u.ext2_i.i_data)) {
+               /* slow symlink */
+-              inode->i_op = &page_symlink_inode_operations;
++              inode->i_op = &ext2_symlink_inode_operations;
+               inode->i_mapping->a_ops = &ext2_aops;
+               err = block_symlink(inode, symname, l);
+               if (err)
+@@ -345,4 +346,15 @@ struct inode_operations ext2_dir_inode_o
+       rmdir:          ext2_rmdir,
+       mknod:          ext2_mknod,
+       rename:         ext2_rename,
++      setxattr:       ext2_setxattr,
++      getxattr:       ext2_getxattr,
++      listxattr:      ext2_listxattr,
++      removexattr:    ext2_removexattr,
++};
++
++struct inode_operations ext2_special_inode_operations = {
++      setxattr:       ext2_setxattr,
++      getxattr:       ext2_getxattr,
++      listxattr:      ext2_listxattr,
++      removexattr:    ext2_removexattr,
+ };
+--- linux/fs/ext2/super.c~linux-2.4.20-xattr-0.8.54-hp Thu Nov 28 15:53:15 2002
++++ linux-mmonroe/fs/ext2/super.c      Fri May 16 08:43:01 2003
+@@ -21,6 +21,7 @@
+ #include <linux/string.h>
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/slab.h>
+ #include <linux/init.h>
+ #include <linux/locks.h>
+@@ -125,6 +126,7 @@ void ext2_put_super (struct super_block 
+       int db_count;
+       int i;
++      ext2_xattr_put_super(sb);
+       if (!(sb->s_flags & MS_RDONLY)) {
+               struct ext2_super_block *es = EXT2_SB(sb)->s_es;
+@@ -175,6 +177,13 @@ static int parse_options (char * options
+            this_char = strtok (NULL, ",")) {
+               if ((value = strchr (this_char, '=')) != NULL)
+                       *value++ = 0;
++#ifdef CONFIG_EXT2_FS_XATTR_USER
++              if (!strcmp (this_char, "user_xattr"))
++                      set_opt (*mount_options, XATTR_USER);
++              else if (!strcmp (this_char, "nouser_xattr"))
++                      clear_opt (*mount_options, XATTR_USER);
++              else
++#endif
+               if (!strcmp (this_char, "bsddf"))
+                       clear_opt (*mount_options, MINIX_DF);
+               else if (!strcmp (this_char, "nouid32")) {
+@@ -424,6 +433,9 @@ struct super_block * ext2_read_super (st
+           blocksize = BLOCK_SIZE;
+       sb->u.ext2_sb.s_mount_opt = 0;
++#ifdef CONFIG_EXT2_FS_XATTR_USER
++      /* set_opt (sb->u.ext2_sb.s_mount_opt, XATTR_USER); */
++#endif
+       if (!parse_options ((char *) data, &sb_block, &resuid, &resgid,
+           &sb->u.ext2_sb.s_mount_opt)) {
+               return NULL;
+@@ -813,12 +825,27 @@ static DECLARE_FSTYPE_DEV(ext2_fs_type, 
+ static int __init init_ext2_fs(void)
+ {
+-        return register_filesystem(&ext2_fs_type);
++      int error = init_ext2_xattr();
++      if (error)
++              return error;
++      error = init_ext2_xattr_user();
++      if (error)
++              goto fail;
++      error = register_filesystem(&ext2_fs_type);
++      if (!error)
++              return 0;
++
++      exit_ext2_xattr_user();
++fail:
++      exit_ext2_xattr();
++      return error;
+ }
+ static void __exit exit_ext2_fs(void)
+ {
+       unregister_filesystem(&ext2_fs_type);
++      exit_ext2_xattr_user();
++      exit_ext2_xattr();
+ }
+ EXPORT_NO_SYMBOLS;
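
parse_options() now understands user_xattr/nouser_xattr, and init_ext2_fs() registers the xattr machinery before the filesystem itself, unwinding in reverse order on failure. A sketch, not from the patch, of turning the option on at mount time via mount(2); it needs root, and the device and mount point are placeholders:

/*
 * Sketch (illustration only): enabling the user_xattr option recognized
 * by the new parse_options() branch.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
    /* The data string reaches ext2_read_super() and sets XATTR_USER. */
    if (mount("/dev/hda2", "/mnt/test", "ext2", 0, "user_xattr") != 0) {
        perror("mount");
        return 1;
    }
    return 0;
}
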
+--- linux/fs/ext2/symlink.c~linux-2.4.20-xattr-0.8.54-hp       Wed Sep 27 13:41:33 2000
++++ linux-mmonroe/fs/ext2/symlink.c    Fri May 16 08:43:01 2003
+@@ -19,6 +19,7 @@
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ static int ext2_readlink(struct dentry *dentry, char *buffer, int buflen)
+ {
+@@ -32,7 +33,20 @@ static int ext2_follow_link(struct dentr
+       return vfs_follow_link(nd, s);
+ }
++struct inode_operations ext2_symlink_inode_operations = {
++      readlink:       page_readlink,
++      follow_link:    page_follow_link,
++      setxattr:       ext2_setxattr,
++      getxattr:       ext2_getxattr,
++      listxattr:      ext2_listxattr,
++      removexattr:    ext2_removexattr,
++};
++
+ struct inode_operations ext2_fast_symlink_inode_operations = {
+       readlink:       ext2_readlink,
+       follow_link:    ext2_follow_link,
++      setxattr:       ext2_setxattr,
++      getxattr:       ext2_getxattr,
++      listxattr:      ext2_listxattr,
++      removexattr:    ext2_removexattr,
+ };
+--- /dev/null  Mon May 20 21:11:23 2002
++++ linux-mmonroe/fs/ext2/xattr.c      Fri May 16 08:43:01 2003
+@@ -0,0 +1,1212 @@
++/*
++ * linux/fs/ext2/xattr.c
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ *
++ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
++ * Extended attributes for symlinks and special files added per
++ *  suggestion of Luka Renko <luka.renko@hermes.si>.
++ */
++
++/*
++ * Extended attributes are stored on disk blocks allocated outside of
++ * any inode. The i_file_acl field is then made to point to this allocated
++ * block. If several inodes have identical sets of extended attributes,
++ * they may share the same extended attribute block. Such situations
++ * are automatically detected by keeping a cache of recent attribute block
++ * numbers and hashes over the block's contents in memory.
++ *
++ *
++ * Extended attribute block layout:
++ *
++ *   +------------------+
++ *   | header           |
++ *   | entry 1          | |
++ *   | entry 2          | | growing downwards
++ *   | entry 3          | v
++ *   | four null bytes  |
++ *   | . . .            |
++ *   | value 1          | ^
++ *   | value 3          | | growing upwards
++ *   | value 2          | |
++ *   +------------------+
++ *
++ * The block header is followed by multiple entry descriptors. These entry
++ * descriptors are variable in size, and aligned to EXT2_XATTR_PAD
++ * byte boundaries. The entry descriptors are sorted by attribute name,
++ * so that two extended attribute blocks can be compared efficiently.
++ *
++ * Attribute values are aligned to the end of the block, stored in
++ * no specific order. They are also padded to EXT2_XATTR_PAD byte
++ * boundaries. No additional gaps are left between them.
++ *
++ * Locking strategy
++ * ----------------
++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of
++ * the xattr inode operations are called, so we are guaranteed that only one
++ * process accesses extended attributes of an inode at any time.
++ *
++ * For writing we also grab the ext2_xattr_sem semaphore. This ensures that
++ * only a single process is modifying an extended attribute block, even
++ * if the block is shared among inodes.
++ *
++ * Note for porting to 2.5
++ * -----------------------
++ * The BKL will no longer be held in the xattr inode operations.
++ */
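
The structures behind HDR() and ENTRY() are declared in ext2_xattr.h, which is not part of this hunk. The sketch below is only an inference from the fields this file references; the exact widths, ordering and any reserved padding are assumptions, and plain stdint types stand in for the kernel's __u8/__u16/__u32:

/*
 * Assumed shape of the on-disk structures (sketch only, inferred from
 * the accesses made in xattr.c).
 */
#include <stdint.h>

struct ext2_xattr_header {
    uint32_t h_magic;       /* EXT2_XATTR_MAGIC, checked on every read */
    uint32_t h_refcount;    /* number of inodes sharing this block */
    uint32_t h_blocks;      /* always 1 in this version */
    uint32_t h_hash;        /* hash of all entries; 0 disables sharing */
    /* the real header most likely reserves additional space here */
};

struct ext2_xattr_entry {
    uint8_t  e_name_len;    /* length of the name suffix */
    uint8_t  e_name_index;  /* handler index, e.g. EXT2_XATTR_INDEX_USER */
    uint16_t e_value_offs;  /* offset of the value within the block */
    uint32_t e_value_block; /* unused in this version, must be 0 */
    uint32_t e_value_size;  /* value length in bytes */
    uint32_t e_hash;        /* hash of name and value */
    char     e_name[];      /* name suffix, not NUL-terminated */
};
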
++
++#include <linux/module.h>
++#include <linux/locks.h>
++#include <linux/slab.h>
++#include <linux/fs.h>
++#include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
++#include <linux/mbcache.h>
++#include <linux/quotaops.h>
++#include <asm/semaphore.h>
++#include <linux/compatmac.h>
++
++/* These symbols may be needed by a module. */
++EXPORT_SYMBOL(ext2_xattr_register);
++EXPORT_SYMBOL(ext2_xattr_unregister);
++EXPORT_SYMBOL(ext2_xattr_get);
++EXPORT_SYMBOL(ext2_xattr_list);
++EXPORT_SYMBOL(ext2_xattr_set);
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1)
++#endif
++
++#define HDR(bh) ((struct ext2_xattr_header *)((bh)->b_data))
++#define ENTRY(ptr) ((struct ext2_xattr_entry *)(ptr))
++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
++
++#ifdef EXT2_XATTR_DEBUG
++# define ea_idebug(inode, f...) do { \
++              printk(KERN_DEBUG "inode %s:%ld: ", \
++                      kdevname(inode->i_dev), inode->i_ino); \
++              printk(f); \
++              printk("\n"); \
++      } while (0)
++# define ea_bdebug(bh, f...) do { \
++              printk(KERN_DEBUG "block %s:%ld: ", \
++                      kdevname(bh->b_dev), bh->b_blocknr); \
++              printk(f); \
++              printk("\n"); \
++      } while (0)
++#else
++# define ea_idebug(f...)
++# define ea_bdebug(f...)
++#endif
++
++static int ext2_xattr_set2(struct inode *, struct buffer_head *,
++                         struct ext2_xattr_header *);
++
++#ifdef CONFIG_EXT2_FS_XATTR_SHARING
++
++static int ext2_xattr_cache_insert(struct buffer_head *);
++static struct buffer_head *ext2_xattr_cache_find(struct inode *,
++                                               struct ext2_xattr_header *);
++static void ext2_xattr_cache_remove(struct buffer_head *);
++static void ext2_xattr_rehash(struct ext2_xattr_header *,
++                            struct ext2_xattr_entry *);
++
++static struct mb_cache *ext2_xattr_cache;
++
++#else
++# define ext2_xattr_cache_insert(bh) 0
++# define ext2_xattr_cache_find(inode, header) NULL
++# define ext2_xattr_cache_remove(bh) while(0) {}
++# define ext2_xattr_rehash(header, entry) while(0) {}
++#endif
++
++/*
++ * If a file system does not share extended attributes among inodes,
++ * we should not need the ext2_xattr_sem semaphore. However, the
++ * filesystem may still contain shared blocks, so we always take
++ * the lock.
++ */
++
++DECLARE_MUTEX(ext2_xattr_sem);
++
++static inline int
++ext2_xattr_new_block(struct inode *inode, int * errp, int force)
++{
++      struct super_block *sb = inode->i_sb;
++      int goal = le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block) +
++              EXT2_I(inode)->i_block_group * EXT2_BLOCKS_PER_GROUP(sb);
++
++      /* How can we enforce the allocation? */
++      int block = ext2_new_block(inode, goal, 0, 0, errp);
++#ifdef OLD_QUOTAS
++      if (!*errp)
++              inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#endif
++      return block;
++}
++
++static inline int
++ext2_xattr_quota_alloc(struct inode *inode, int force)
++{
++      /* How can we enforce the allocation? */
++#ifdef OLD_QUOTAS
++      int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
++      if (!error)
++              inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#else
++      int error = DQUOT_ALLOC_BLOCK(inode, 1);
++#endif
++      return error;
++}
++
++#ifdef OLD_QUOTAS
++
++static inline void
++ext2_xattr_quota_free(struct inode *inode)
++{
++      DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
++      inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++static inline void
++ext2_xattr_free_block(struct inode * inode, unsigned long block)
++{
++      ext2_free_blocks(inode, block, 1);
++      inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++#else
++# define ext2_xattr_quota_free(inode) \
++      DQUOT_FREE_BLOCK(inode, 1)
++# define ext2_xattr_free_block(inode, block) \
++      ext2_free_blocks(inode, block, 1)
++#endif
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
++
++static inline struct buffer_head *
++sb_bread(struct super_block *sb, int block)
++{
++      return bread(sb->s_dev, block, sb->s_blocksize);
++}
++
++static inline struct buffer_head *
++sb_getblk(struct super_block *sb, int block)
++{
++      return getblk(sb->s_dev, block, sb->s_blocksize);
++}
++
++#endif
++
++struct ext2_xattr_handler *ext2_xattr_handlers[EXT2_XATTR_INDEX_MAX];
++rwlock_t ext2_handler_lock = RW_LOCK_UNLOCKED;
++
++int
++ext2_xattr_register(int name_index, struct ext2_xattr_handler *handler)
++{
++      int error = -EINVAL;
++
++      if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
++              write_lock(&ext2_handler_lock);
++              if (!ext2_xattr_handlers[name_index-1]) {
++                      ext2_xattr_handlers[name_index-1] = handler;
++                      error = 0;
++              }
++              write_unlock(&ext2_handler_lock);
++      }
++      return error;
++}
++
++void
++ext2_xattr_unregister(int name_index, struct ext2_xattr_handler *handler)
++{
++      if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
++              write_lock(&ext2_handler_lock);
++              ext2_xattr_handlers[name_index-1] = NULL;
++              write_unlock(&ext2_handler_lock);
++      }
++}
++
++static inline const char *
++strcmp_prefix(const char *a, const char *a_prefix)
++{
++      while (*a_prefix && *a == *a_prefix) {
++              a++;
++              a_prefix++;
++      }
++      return *a_prefix ? NULL : a;
++}
++
++/*
++ * Decode the extended attribute name, and translate it into
++ * the name_index and name suffix.
++ */
++static struct ext2_xattr_handler *
++ext2_xattr_resolve_name(const char **name)
++{
++      struct ext2_xattr_handler *handler = NULL;
++      int i;
++
++      if (!*name)
++              return NULL;
++      read_lock(&ext2_handler_lock);
++      for (i=0; i<EXT2_XATTR_INDEX_MAX; i++) {
++              if (ext2_xattr_handlers[i]) {
++                      const char *n = strcmp_prefix(*name,
++                              ext2_xattr_handlers[i]->prefix);
++                      if (n) {
++                              handler = ext2_xattr_handlers[i];
++                              *name = n;
++                              break;
++                      }
++              }
++      }
++      read_unlock(&ext2_handler_lock);
++      return handler;
++}
++
++static inline struct ext2_xattr_handler *
++ext2_xattr_handler(int name_index)
++{
++      struct ext2_xattr_handler *handler = NULL;
++      if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
++              read_lock(&ext2_handler_lock);
++              handler = ext2_xattr_handlers[name_index-1];
++              read_unlock(&ext2_handler_lock);
++      }
++      return handler;
++}
++
++/*
++ * Inode operation getxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext2_getxattr(struct dentry *dentry, const char *name,
++            void *buffer, size_t size)
++{
++      struct ext2_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      handler = ext2_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->get(inode, name, buffer, size);
++}
++
++/*
++ * Inode operation listxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++      return ext2_xattr_list(dentry->d_inode, buffer, size);
++}
++
++/*
++ * Inode operation setxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext2_setxattr(struct dentry *dentry, const char *name,
++            const void *value, size_t size, int flags)
++{
++      struct ext2_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      if (size == 0)
++              value = "";  /* empty EA, do not remove */
++      handler = ext2_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->set(inode, name, value, size, flags);
++}
++
++/*
++ * Inode operation removexattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext2_removexattr(struct dentry *dentry, const char *name)
++{
++      struct ext2_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      handler = ext2_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
++}
++
++/*
++ * ext2_xattr_get()
++ *
++ * Copy an extended attribute into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext2_xattr_get(struct inode *inode, int name_index, const char *name,
++             void *buffer, size_t buffer_size)
++{
++      struct buffer_head *bh = NULL;
++      struct ext2_xattr_entry *entry;
++      unsigned int block, size;
++      char *end;
++      int name_len, error;
++
++      ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
++                name_index, name, buffer, (long)buffer_size);
++
++      if (name == NULL)
++              return -EINVAL;
++      if (!EXT2_I(inode)->i_file_acl)
++              return -ENOATTR;
++      block = EXT2_I(inode)->i_file_acl;
++      ea_idebug(inode, "reading block %d", block);
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh)
++              return -EIO;
++      ea_bdebug(bh, "b_count=%d, refcount=%d",
++              atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++      end = bh->b_data + bh->b_size;
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block:    ext2_error(inode->i_sb, "ext2_xattr_get",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              error = -EIO;
++              goto cleanup;
++      }
++      /* find named attribute */
++      name_len = strlen(name);
++
++      error = -ERANGE;
++      if (name_len > 255)
++              goto cleanup;
++      entry = FIRST_ENTRY(bh);
++      while (!IS_LAST_ENTRY(entry)) {
++              struct ext2_xattr_entry *next =
++                      EXT2_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++              if (name_index == entry->e_name_index &&
++                  name_len == entry->e_name_len &&
++                  memcmp(name, entry->e_name, name_len) == 0)
++                      goto found;
++              entry = next;
++      }
++      /* Check the remaining name entries */
++      while (!IS_LAST_ENTRY(entry)) {
++              struct ext2_xattr_entry *next =
++                      EXT2_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++              entry = next;
++      }
++      if (ext2_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      error = -ENOATTR;
++      goto cleanup;
++found:
++      /* check the buffer size */
++      if (entry->e_value_block != 0)
++              goto bad_block;
++      size = le32_to_cpu(entry->e_value_size);
++      if (size > inode->i_sb->s_blocksize ||
++          le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
++              goto bad_block;
++
++      if (ext2_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      if (buffer) {
++              error = -ERANGE;
++              if (size > buffer_size)
++                      goto cleanup;
++              /* return value of attribute */
++              memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
++                      size);
++      }
++      error = size;
++
++cleanup:
++      brelse(bh);
++
++      return error;
++}
++
++/*
++ * ext2_xattr_list()
++ *
++ * Copy a list of attribute names into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
++{
++      struct buffer_head *bh = NULL;
++      struct ext2_xattr_entry *entry;
++      unsigned int block, size = 0;
++      char *buf, *end;
++      int error;
++
++      ea_idebug(inode, "buffer=%p, buffer_size=%ld",
++                buffer, (long)buffer_size);
++
++      if (!EXT2_I(inode)->i_file_acl)
++              return 0;
++      block = EXT2_I(inode)->i_file_acl;
++      ea_idebug(inode, "reading block %d", block);
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh)
++              return -EIO;
++      ea_bdebug(bh, "b_count=%d, refcount=%d",
++              atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++      end = bh->b_data + bh->b_size;
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block:    ext2_error(inode->i_sb, "ext2_xattr_list",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              error = -EIO;
++              goto cleanup;
++      }
++      /* compute the size required for the list of attribute names */
++      for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++           entry = EXT2_XATTR_NEXT(entry)) {
++              struct ext2_xattr_handler *handler;
++              struct ext2_xattr_entry *next =
++                      EXT2_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++
++              handler = ext2_xattr_handler(entry->e_name_index);
++              if (handler)
++                      size += handler->list(NULL, inode, entry->e_name,
++                                            entry->e_name_len);
++      }
++
++      if (ext2_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      if (!buffer) {
++              error = size;
++              goto cleanup;
++      } else {
++              error = -ERANGE;
++              if (size > buffer_size)
++                      goto cleanup;
++      }
++
++      /* list the attribute names */
++      buf = buffer;
++      for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++           entry = EXT2_XATTR_NEXT(entry)) {
++              struct ext2_xattr_handler *handler;
++              
++              handler = ext2_xattr_handler(entry->e_name_index);
++              if (handler)
++                      buf += handler->list(buf, inode, entry->e_name,
++                                           entry->e_name_len);
++      }
++      error = size;
++
++cleanup:
++      brelse(bh);
++
++      return error;
++}
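
ext2_xattr_list() sizes and fills a buffer of NUL-terminated names, one per handler->list() call. A userspace sketch, not part of the patch, that performs the usual size query and then walks that list; it assumes <sys/xattr.h> and a placeholder file name:

/*
 * Sketch (illustration only): size query plus list walk matching the
 * output format produced by ext2_xattr_list().
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
    const char *path = argc > 1 ? argv[1] : "testfile";
    ssize_t len = listxattr(path, NULL, 0);     /* size query */
    char *list, *p;

    if (len <= 0)
        return 1;
    list = malloc(len);
    if (!list)
        return 1;
    len = listxattr(path, list, len);
    if (len < 0) {
        free(list);
        return 1;
    }
    /* Each handler->list() call appended one "prefix.name\0" string. */
    for (p = list; p < list + len; p += strlen(p) + 1)
        printf("%s\n", p);
    free(list);
    return 0;
}
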
++
++/*
++ * If the EXT2_FEATURE_COMPAT_EXT_ATTR feature of this file system is
++ * not set, set it.
++ */
++static void ext2_xattr_update_super_block(struct super_block *sb)
++{
++      if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR))
++              return;
++
++      lock_super(sb);
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++      EXT2_SB(sb)->s_feature_compat |= EXT2_FEATURE_COMPAT_EXT_ATTR;
++#endif
++      EXT2_SB(sb)->s_es->s_feature_compat |=
++              cpu_to_le32(EXT2_FEATURE_COMPAT_EXT_ATTR);
++      sb->s_dirt = 1;
++      mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
++      unlock_super(sb);
++}
++
++/*
++ * ext2_xattr_set()
++ *
++ * Create, replace or remove an extended attribute for this inode. Value
++ * is NULL to remove an existing extended attribute, and non-NULL to
++ * either replace an existing extended attribute, or create a new extended
++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
++ * specify that an extended attribute must exist and must not exist
++ * prior to the call, respectively.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++int
++ext2_xattr_set(struct inode *inode, int name_index, const char *name,
++             const void *value, size_t value_len, int flags)
++{
++      struct super_block *sb = inode->i_sb;
++      struct buffer_head *bh = NULL;
++      struct ext2_xattr_header *header = NULL;
++      struct ext2_xattr_entry *here, *last;
++      unsigned int name_len;
++      int block = EXT2_I(inode)->i_file_acl;
++      int min_offs = sb->s_blocksize, not_found = 1, free, error;
++      char *end;
++      
++      /*
++       * header -- Points either into bh, or to a temporarily
++       *           allocated buffer.
++       * here -- The named entry found, or the place for inserting, within
++       *         the block pointed to by header.
++       * last -- Points right after the last named entry within the block
++       *         pointed to by header.
++       * min_offs -- The offset of the first value (values are aligned
++       *             towards the end of the block).
++       * end -- Points right after the block pointed to by header.
++       */
++      
++      ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
++                name_index, name, value, (long)value_len);
++
++      if (IS_RDONLY(inode))
++              return -EROFS;
++      if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
++              return -EPERM;
++      if (value == NULL)
++              value_len = 0;
++      if (name == NULL)
++              return -EINVAL;
++      name_len = strlen(name);
++      if (name_len > 255 || value_len > sb->s_blocksize)
++              return -ERANGE;
++      down(&ext2_xattr_sem);
++
++      if (block) {
++              /* The inode already has an extended attribute block. */
++
++              bh = sb_bread(sb, block);
++              error = -EIO;
++              if (!bh)
++                      goto cleanup;
++              ea_bdebug(bh, "b_count=%d, refcount=%d",
++                      atomic_read(&(bh->b_count)),
++                      le32_to_cpu(HDR(bh)->h_refcount));
++              header = HDR(bh);
++              end = bh->b_data + bh->b_size;
++              if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++                  header->h_blocks != cpu_to_le32(1)) {
++bad_block:            ext2_error(sb, "ext2_xattr_set",
++                              "inode %ld: bad block %d", inode->i_ino, block);
++                      error = -EIO;
++                      goto cleanup;
++              }
++              /* Find the named attribute. */
++              here = FIRST_ENTRY(bh);
++              while (!IS_LAST_ENTRY(here)) {
++                      struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here);
++                      if ((char *)next >= end)
++                              goto bad_block;
++                      if (!here->e_value_block && here->e_value_size) {
++                              int offs = le16_to_cpu(here->e_value_offs);
++                              if (offs < min_offs)
++                                      min_offs = offs;
++                      }
++                      not_found = name_index - here->e_name_index;
++                      if (!not_found)
++                              not_found = name_len - here->e_name_len;
++                      if (!not_found)
++                              not_found = memcmp(name, here->e_name,name_len);
++                      if (not_found <= 0)
++                              break;
++                      here = next;
++              }
++              last = here;
++              /* We still need to compute min_offs and last. */
++              while (!IS_LAST_ENTRY(last)) {
++                      struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last);
++                      if ((char *)next >= end)
++                              goto bad_block;
++                      if (!last->e_value_block && last->e_value_size) {
++                              int offs = le16_to_cpu(last->e_value_offs);
++                              if (offs < min_offs)
++                                      min_offs = offs;
++                      }
++                      last = next;
++              }
++
++              /* Check whether we have enough space left. */
++              free = min_offs - ((char*)last - (char*)header) - sizeof(__u32);
++      } else {
++              /* We will use a new extended attribute block. */
++              free = sb->s_blocksize -
++                      sizeof(struct ext2_xattr_header) - sizeof(__u32);
++              here = last = NULL;  /* avoid gcc uninitialized warning. */
++      }
++
++      if (not_found) {
++              /* Request to remove a nonexistent attribute? */
++              error = -ENOATTR;
++              if (flags & XATTR_REPLACE)
++                      goto cleanup;
++              error = 0;
++              if (value == NULL)
++                      goto cleanup;
++              else
++                      free -= EXT2_XATTR_LEN(name_len);
++      } else {
++              /* Request to create an existing attribute? */
++              error = -EEXIST;
++              if (flags & XATTR_CREATE)
++                      goto cleanup;
++              if (!here->e_value_block && here->e_value_size) {
++                      unsigned int size = le32_to_cpu(here->e_value_size);
++
++                      if (le16_to_cpu(here->e_value_offs) + size > 
++                          sb->s_blocksize || size > sb->s_blocksize)
++                              goto bad_block;
++                      free += EXT2_XATTR_SIZE(size);
++              }
++      }
++      free -= EXT2_XATTR_SIZE(value_len);
++      error = -ENOSPC;
++      if (free < 0)
++              goto cleanup;
++
++      /* Here we know that we can set the new attribute. */
++
++      if (header) {
++              if (header->h_refcount == cpu_to_le32(1)) {
++                      ea_bdebug(bh, "modifying in-place");
++                      ext2_xattr_cache_remove(bh);
++              } else {
++                      int offset;
++
++                      ea_bdebug(bh, "cloning");
++                      header = kmalloc(bh->b_size, GFP_KERNEL);
++                      error = -ENOMEM;
++                      if (header == NULL)
++                              goto cleanup;
++                      memcpy(header, HDR(bh), bh->b_size);
++                      header->h_refcount = cpu_to_le32(1);
++                      offset = (char *)header - bh->b_data;
++                      here = ENTRY((char *)here + offset);
++                      last = ENTRY((char *)last + offset);
++              }
++      } else {
++              /* Allocate a buffer where we construct the new block. */
++              header = kmalloc(sb->s_blocksize, GFP_KERNEL);
++              error = -ENOMEM;
++              if (header == NULL)
++                      goto cleanup;
++              memset(header, 0, sb->s_blocksize);
++              end = (char *)header + sb->s_blocksize;
++              header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC);
++              header->h_blocks = header->h_refcount = cpu_to_le32(1);
++              last = here = ENTRY(header+1);
++      }
++
++      if (not_found) {
++              /* Insert the new name. */
++              int size = EXT2_XATTR_LEN(name_len);
++              int rest = (char *)last - (char *)here;
++              memmove((char *)here + size, here, rest);
++              memset(here, 0, size);
++              here->e_name_index = name_index;
++              here->e_name_len = name_len;
++              memcpy(here->e_name, name, name_len);
++      } else {
++              /* Remove the old value. */
++              if (!here->e_value_block && here->e_value_size) {
++                      char *first_val = (char *)header + min_offs;
++                      int offs = le16_to_cpu(here->e_value_offs);
++                      char *val = (char *)header + offs;
++                      size_t size = EXT2_XATTR_SIZE(
++                              le32_to_cpu(here->e_value_size));
++                      memmove(first_val + size, first_val, val - first_val);
++                      memset(first_val, 0, size);
++                      here->e_value_offs = 0;
++                      min_offs += size;
++
++                      /* Adjust all value offsets. */
++                      last = ENTRY(header+1);
++                      while (!IS_LAST_ENTRY(last)) {
++                              int o = le16_to_cpu(last->e_value_offs);
++                              if (!last->e_value_block && o < offs)
++                                      last->e_value_offs =
++                                              cpu_to_le16(o + size);
++                              last = EXT2_XATTR_NEXT(last);
++                      }
++              }
++              if (value == NULL) {
++                      /* Remove this attribute. */
++                      if (EXT2_XATTR_NEXT(ENTRY(header+1)) == last) {
++                              /* This block is now empty. */
++                              error = ext2_xattr_set2(inode, bh, NULL);
++                              goto cleanup;
++                      } else {
++                              /* Remove the old name. */
++                              int size = EXT2_XATTR_LEN(name_len);
++                              last = ENTRY((char *)last - size);
++                              memmove(here, (char*)here + size,
++                                      (char*)last - (char*)here);
++                              memset(last, 0, size);
++                      }
++              }
++      }
++
++      if (value != NULL) {
++              /* Insert the new value. */
++              here->e_value_size = cpu_to_le32(value_len);
++              if (value_len) {
++                      size_t size = EXT2_XATTR_SIZE(value_len);
++                      char *val = (char *)header + min_offs - size;
++                      here->e_value_offs =
++                              cpu_to_le16((char *)val - (char *)header);
++                      memset(val + size - EXT2_XATTR_PAD, 0,
++                             EXT2_XATTR_PAD); /* Clear the pad bytes. */
++                      memcpy(val, value, value_len);
++              }
++      }
++      ext2_xattr_rehash(header, here);
++
++      error = ext2_xattr_set2(inode, bh, header);
++
++cleanup:
++      brelse(bh);
++      if (!(bh && header == HDR(bh)))
++              kfree(header);
++      up(&ext2_xattr_sem);
++
++      return error;
++}
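
The XATTR_CREATE / XATTR_REPLACE and empty-value semantics described above are visible directly from userspace. A sketch, not from the patch, assuming <sys/xattr.h> (which defines both flags) and a placeholder file name:

/*
 * Sketch (illustration only): flag semantics of ext2_xattr_set() as
 * seen through setxattr(2).
 */
#include <errno.h>
#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
    const char *path = "testfile";

    /* XATTR_CREATE fails with EEXIST if user.once is already set. */
    if (setxattr(path, "user.once", "1", 1, XATTR_CREATE) != 0)
        perror("XATTR_CREATE");

    /* XATTR_REPLACE fails if the attribute does not exist yet
     * (ENOATTR in the kernel, ENODATA to userspace). */
    if (setxattr(path, "user.missing", "1", 1, XATTR_REPLACE) != 0 &&
        errno == ENODATA)
        printf("user.missing does not exist, as expected\n");

    /* A zero-length value is stored as an empty EA, not removed. */
    setxattr(path, "user.empty", "", 0, 0);
    return 0;
}
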
++
++/*
++ * Second half of ext2_xattr_set(): Update the file system.
++ */
++static int
++ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
++              struct ext2_xattr_header *header)
++{
++      struct super_block *sb = inode->i_sb;
++      struct buffer_head *new_bh = NULL;
++      int error;
++
++      if (header) {
++              new_bh = ext2_xattr_cache_find(inode, header);
++              if (new_bh) {
++                      /*
++                       * We found an identical block in the cache.
++                       * The old block will be released after updating
++                       * the inode.
++                       */
++                      ea_bdebug(old_bh, "reusing block %ld",
++                              new_bh->b_blocknr);
++                      
++                      error = -EDQUOT;
++                      if (ext2_xattr_quota_alloc(inode, 1))
++                              goto cleanup;
++                      
++                      HDR(new_bh)->h_refcount = cpu_to_le32(
++                              le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
++                      ea_bdebug(new_bh, "refcount now=%d",
++                              le32_to_cpu(HDR(new_bh)->h_refcount));
++              } else if (old_bh && header == HDR(old_bh)) {
++                      /* Keep this block. */
++                      new_bh = old_bh;
++                      ext2_xattr_cache_insert(new_bh);
++              } else {
++                      /* We need to allocate a new block */
++                      int force = EXT2_I(inode)->i_file_acl != 0;
++                      int block = ext2_xattr_new_block(inode, &error, force);
++                      if (error)
++                              goto cleanup;
++                      ea_idebug(inode, "creating block %d", block);
++
++                      new_bh = sb_getblk(sb, block);
++                      if (!new_bh) {
++                              ext2_xattr_free_block(inode, block);
++                              error = -EIO;
++                              goto cleanup;
++                      }
++                      lock_buffer(new_bh);
++                      memcpy(new_bh->b_data, header, new_bh->b_size);
++                      mark_buffer_uptodate(new_bh, 1);
++                      unlock_buffer(new_bh);
++                      ext2_xattr_cache_insert(new_bh);
++                      
++                      ext2_xattr_update_super_block(sb);
++              }
++              mark_buffer_dirty(new_bh);
++              if (IS_SYNC(inode)) {
++                      ll_rw_block(WRITE, 1, &new_bh);
++                      wait_on_buffer(new_bh); 
++                      error = -EIO;
++                      if (buffer_req(new_bh) && !buffer_uptodate(new_bh))
++                              goto cleanup;
++              }
++      }
++
++      /* Update the inode. */
++      EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
++      inode->i_ctime = CURRENT_TIME;
++      if (IS_SYNC(inode)) {
++              error = ext2_sync_inode (inode);
++              if (error)
++                      goto cleanup;
++      } else
++              mark_inode_dirty(inode);
++
++      error = 0;
++      if (old_bh && old_bh != new_bh) {
++              /*
++               * If there was an old block, and we are not still using it,
++               * we now release the old block.
++              */
++              unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
++
++              if (refcount == 1) {
++                      /* Free the old block. */
++                      ea_bdebug(old_bh, "freeing");
++                      ext2_xattr_free_block(inode, old_bh->b_blocknr);
++                      mark_buffer_clean(old_bh);
++              } else {
++                      /* Decrement the refcount only. */
++                      refcount--;
++                      HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
++                      ext2_xattr_quota_free(inode);
++                      mark_buffer_dirty(old_bh);
++                      ea_bdebug(old_bh, "refcount now=%d", refcount);
++              }
++      }
++
++cleanup:
++      if (old_bh != new_bh)
++              brelse(new_bh);
++
++      return error;
++}
++
++/*
++ * ext2_xattr_delete_inode()
++ *
++ * Free extended attribute resources associated with this inode. This
++ * is called immediately before an inode is freed.
++ */
++void
++ext2_xattr_delete_inode(struct inode *inode)
++{
++      struct buffer_head *bh;
++      unsigned int block = EXT2_I(inode)->i_file_acl;
++
++      if (!block)
++              return;
++      down(&ext2_xattr_sem);
++
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh) {
++              ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
++                      "inode %ld: block %d read error", inode->i_ino, block);
++              goto cleanup;
++      }
++      ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++              ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              goto cleanup;
++      }
++      ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
++      if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
++              ext2_xattr_cache_remove(bh);
++              ext2_xattr_free_block(inode, block);
++              bforget(bh);
++              bh = NULL;
++      } else {
++              HDR(bh)->h_refcount = cpu_to_le32(
++                      le32_to_cpu(HDR(bh)->h_refcount) - 1);
++              mark_buffer_dirty(bh);
++              if (IS_SYNC(inode)) {
++                      ll_rw_block(WRITE, 1, &bh);
++                      wait_on_buffer(bh);
++              }
++              ext2_xattr_quota_free(inode);
++      }
++      EXT2_I(inode)->i_file_acl = 0;
++
++cleanup:
++      brelse(bh);
++      up(&ext2_xattr_sem);
++}
++
++/*
++ * ext2_xattr_put_super()
++ *
++ * This is called when a file system is unmounted.
++ */
++void
++ext2_xattr_put_super(struct super_block *sb)
++{
++#ifdef CONFIG_EXT2_FS_XATTR_SHARING
++      mb_cache_shrink(ext2_xattr_cache, sb->s_dev);
++#endif
++}
++
++#ifdef CONFIG_EXT2_FS_XATTR_SHARING
++
++/*
++ * ext2_xattr_cache_insert()
++ *
++ * Create a new entry in the extended attribute cache, and insert
++ * it unless such an entry is already in the cache.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++static int
++ext2_xattr_cache_insert(struct buffer_head *bh)
++{
++      __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
++      struct mb_cache_entry *ce;
++      int error;
++
++      ce = mb_cache_entry_alloc(ext2_xattr_cache);
++      if (!ce)
++              return -ENOMEM;
++      error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash);
++      if (error) {
++              mb_cache_entry_free(ce);
++              if (error == -EBUSY) {
++                      ea_bdebug(bh, "already in cache (%d cache entries)",
++                              atomic_read(&ext2_xattr_cache->c_entry_count));
++                      error = 0;
++              }
++      } else {
++              ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
++                        atomic_read(&ext2_xattr_cache->c_entry_count));
++              mb_cache_entry_release(ce);
++      }
++      return error;
++}
++
++/*
++ * ext2_xattr_cmp()
++ *
++ * Compare two extended attribute blocks for equality.
++ *
++ * Returns 0 if the blocks are equal, 1 if they differ, and
++ * a negative error number on errors.
++ */
++static int
++ext2_xattr_cmp(struct ext2_xattr_header *header1,
++             struct ext2_xattr_header *header2)
++{
++      struct ext2_xattr_entry *entry1, *entry2;
++
++      entry1 = ENTRY(header1+1);
++      entry2 = ENTRY(header2+1);
++      while (!IS_LAST_ENTRY(entry1)) {
++              if (IS_LAST_ENTRY(entry2))
++                      return 1;
++              if (entry1->e_hash != entry2->e_hash ||
++                  entry1->e_name_len != entry2->e_name_len ||
++                  entry1->e_value_size != entry2->e_value_size ||
++                  memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
++                      return 1;
++              if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
++                      return -EIO;
++              if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
++                         (char *)header2 + le16_to_cpu(entry2->e_value_offs),
++                         le32_to_cpu(entry1->e_value_size)))
++                      return 1;
++
++              entry1 = EXT2_XATTR_NEXT(entry1);
++              entry2 = EXT2_XATTR_NEXT(entry2);
++      }
++      if (!IS_LAST_ENTRY(entry2))
++              return 1;
++      return 0;
++}
++
++/*
++ * ext2_xattr_cache_find()
++ *
++ * Find an identical extended attribute block.
++ *
++ * Returns a pointer to the block found, or NULL if such a block was
++ * not found or an error occurred.
++ */
++static struct buffer_head *
++ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
++{
++      __u32 hash = le32_to_cpu(header->h_hash);
++      struct mb_cache_entry *ce;
++
++      if (!header->h_hash)
++              return NULL;  /* never share */
++      ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
++      ce = mb_cache_entry_find_first(ext2_xattr_cache, 0, inode->i_dev, hash);
++      while (ce) {
++              struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
++
++              if (!bh) {
++                      ext2_error(inode->i_sb, "ext2_xattr_cache_find",
++                              "inode %ld: block %ld read error",
++                              inode->i_ino, ce->e_block);
++              } else if (le32_to_cpu(HDR(bh)->h_refcount) >
++                         EXT2_XATTR_REFCOUNT_MAX) {
++                      ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block,
++                              le32_to_cpu(HDR(bh)->h_refcount),
++                              EXT2_XATTR_REFCOUNT_MAX);
++              } else if (!ext2_xattr_cmp(header, HDR(bh))) {
++                      ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
++                      mb_cache_entry_release(ce);
++                      return bh;
++              }
++              brelse(bh);
++              ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash);
++      }
++      return NULL;
++}
++
++/*
++ * ext2_xattr_cache_remove()
++ *
++ * Remove the cache entry of a block from the cache. Called when a
++ * block becomes invalid.
++ */
++static void
++ext2_xattr_cache_remove(struct buffer_head *bh)
++{
++      struct mb_cache_entry *ce;
++
++      ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_dev, bh->b_blocknr);
++      if (ce) {
++              ea_bdebug(bh, "removing (%d cache entries remaining)",
++                        atomic_read(&ext2_xattr_cache->c_entry_count)-1);
++              mb_cache_entry_free(ce);
++      } else 
++              ea_bdebug(bh, "no cache entry");
++}
++
++#define NAME_HASH_SHIFT 5
++#define VALUE_HASH_SHIFT 16
++
++/*
++ * ext2_xattr_hash_entry()
++ *
++ * Compute the hash of an extended attribute.
++ */
++static inline void ext2_xattr_hash_entry(struct ext2_xattr_header *header,
++                                       struct ext2_xattr_entry *entry)
++{
++      __u32 hash = 0;
++      char *name = entry->e_name;
++      int n;
++
++      for (n=0; n < entry->e_name_len; n++) {
++              hash = (hash << NAME_HASH_SHIFT) ^
++                     (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
++                     *name++;
++      }
++
++      if (entry->e_value_block == 0 && entry->e_value_size != 0) {
++              __u32 *value = (__u32 *)((char *)header +
++                      le16_to_cpu(entry->e_value_offs));
++              for (n = (le32_to_cpu(entry->e_value_size) +
++                   EXT2_XATTR_ROUND) >> EXT2_XATTR_PAD_BITS; n; n--) {
++                      hash = (hash << VALUE_HASH_SHIFT) ^
++                             (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
++                             le32_to_cpu(*value++);
++              }
++      }
++      entry->e_hash = cpu_to_le32(hash);
++}
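
For experimenting with the hash outside the kernel, here is a standalone restatement of the name rotation above (illustration only; the value words are folded the same way using VALUE_HASH_SHIFT, which this sketch omits):

/*
 * Standalone restatement of the NAME_HASH_SHIFT rotation used by
 * ext2_xattr_hash_entry(); the cast avoids sign extension for
 * non-ASCII name bytes.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NAME_HASH_SHIFT 5

static uint32_t name_hash(const char *name, size_t len)
{
    uint32_t hash = 0;
    size_t n;

    for (n = 0; n < len; n++)
        hash = (hash << NAME_HASH_SHIFT) ^
               (hash >> (8 * sizeof(hash) - NAME_HASH_SHIFT)) ^
               (unsigned char)name[n];
    return hash;
}

int main(void)
{
    const char *name = "demo";  /* the suffix stored in e_name */

    printf("name hash of \"%s\" = %#x\n", name, name_hash(name, strlen(name)));
    return 0;
}
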
++
++#undef NAME_HASH_SHIFT
++#undef VALUE_HASH_SHIFT
++
++#define BLOCK_HASH_SHIFT 16
++
++/*
++ * ext2_xattr_rehash()
++ *
++ * Re-compute the extended attribute hash value after an entry has changed.
++ */
++static void ext2_xattr_rehash(struct ext2_xattr_header *header,
++                            struct ext2_xattr_entry *entry)
++{
++      struct ext2_xattr_entry *here;
++      __u32 hash = 0;
++      
++      ext2_xattr_hash_entry(header, entry);
++      here = ENTRY(header+1);
++      while (!IS_LAST_ENTRY(here)) {
++              if (!here->e_hash) {
++                      /* Block is not shared if an entry's hash value == 0 */
++                      hash = 0;
++                      break;
++              }
++              hash = (hash << BLOCK_HASH_SHIFT) ^
++                     (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
++                     le32_to_cpu(here->e_hash);
++              here = EXT2_XATTR_NEXT(here);
++      }
++      header->h_hash = cpu_to_le32(hash);
++}
++
++#undef BLOCK_HASH_SHIFT
++
++int __init
++init_ext2_xattr(void)
++{
++      ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL,
++              sizeof(struct mb_cache_entry) +
++              sizeof(struct mb_cache_entry_index), 1, 61);
++      if (!ext2_xattr_cache)
++              return -ENOMEM;
++
++      return 0;
++}
++
++void
++exit_ext2_xattr(void)
++{
++      mb_cache_destroy(ext2_xattr_cache);
++}
++
++#else  /* CONFIG_EXT2_FS_XATTR_SHARING */
++
++int __init
++init_ext2_xattr(void)
++{
++      return 0;
++}
++
++void
++exit_ext2_xattr(void)
++{
++}
++
++#endif  /* CONFIG_EXT2_FS_XATTR_SHARING */
+--- /dev/null  Mon May 20 21:11:23 2002
++++ linux-mmonroe/fs/ext2/xattr_user.c Fri May 16 08:43:01 2003
+@@ -0,0 +1,103 @@
++/*
++ * linux/fs/ext2/xattr_user.c
++ * Handler for extended user attributes.
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/fs.h>
++#include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
++
++#ifdef CONFIG_EXT2_FS_POSIX_ACL
++# include <linux/ext2_acl.h>
++#endif
++
++#define XATTR_USER_PREFIX "user."
++
++static size_t
++ext2_xattr_user_list(char *list, struct inode *inode,
++                   const char *name, int name_len)
++{
++      const int prefix_len = sizeof(XATTR_USER_PREFIX)-1;
++
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return 0;
++
++      if (list) {
++              memcpy(list, XATTR_USER_PREFIX, prefix_len);
++              memcpy(list+prefix_len, name, name_len);
++              list[prefix_len + name_len] = '\0';
++      }
++      return prefix_len + name_len + 1;
++}
++
++static int
++ext2_xattr_user_get(struct inode *inode, const char *name,
++                  void *buffer, size_t size)
++{
++      int error;
++
++      if (strcmp(name, "") == 0)
++              return -EINVAL;
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return -ENOTSUP;
++#ifdef CONFIG_EXT2_FS_POSIX_ACL
++      error = ext2_permission_locked(inode, MAY_READ);
++#else
++      error = permission(inode, MAY_READ);
++#endif
++      if (error)
++              return error;
++
++      return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name,
++                            buffer, size);
++}
++
++static int
++ext2_xattr_user_set(struct inode *inode, const char *name,
++                  const void *value, size_t size, int flags)
++{
++      int error;
++
++      if (strcmp(name, "") == 0)
++              return -EINVAL;
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return -ENOTSUP;
++      if ( !S_ISREG(inode->i_mode) &&
++          (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
++              return -EPERM;
++#ifdef CONFIG_EXT2_FS_POSIX_ACL
++      error = ext2_permission_locked(inode, MAY_WRITE);
++#else
++      error = permission(inode, MAY_WRITE);
++#endif
++      if (error)
++              return error;
++  
++      return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name,
++                            value, size, flags);
++}
++
++struct ext2_xattr_handler ext2_xattr_user_handler = {
++      prefix: XATTR_USER_PREFIX,
++      list:   ext2_xattr_user_list,
++      get:    ext2_xattr_user_get,
++      set:    ext2_xattr_user_set,
++};
++
++int __init
++init_ext2_xattr_user(void)
++{
++      return ext2_xattr_register(EXT2_XATTR_INDEX_USER,
++                                 &ext2_xattr_user_handler);
++}
++
++void
++exit_ext2_xattr_user(void)
++{
++      ext2_xattr_unregister(EXT2_XATTR_INDEX_USER,
++                            &ext2_xattr_user_handler);
++}
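
ext2_xattr_user_set() refuses user.* attributes unless the target is a regular file or a non-sticky directory, and both the get and set paths bail out with ENOTSUP when the filesystem is not mounted with user_xattr. A userspace sketch of that observable behaviour, not part of the patch; the paths are placeholders, with /tmp standing in for any sticky (mode 1777) directory on an ext2/ext3 filesystem:

/*
 * Sketch (illustration only): error behaviour of the user.* handler.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
    /* Not a regular file and the directory is sticky: EPERM. */
    if (setxattr("/tmp", "user.note", "x", 1, 0) != 0)
        printf("sticky dir: %s\n", strerror(errno));

    /* Mounted without user_xattr: the handler returns ENOTSUP,
     * which userspace sees as EOPNOTSUPP. */
    if (setxattr("/mnt/test/file", "user.note", "x", 1, 0) != 0)
        printf("no user_xattr: %s\n", strerror(errno));
    return 0;
}
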
+--- linux/fs/ext3/Makefile~linux-2.4.20-xattr-0.8.54-hp        Fri May 16 08:42:46 2003
++++ linux-mmonroe/fs/ext3/Makefile     Fri May 16 08:43:01 2003
+@@ -1,5 +1,5 @@
+ #
+-# Makefile for the linux ext2-filesystem routines.
++# Makefile for the linux ext3-filesystem routines.
+ #
+ # Note! Dependencies are done automagically by 'make dep', which also
+ # removes any old dependencies. DON'T put your own dependencies here
+@@ -9,10 +9,13 @@
+ O_TARGET := ext3.o
+-export-objs :=        super.o inode.o
++export-objs := ext3-exports.o
+ obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+-              ioctl.o namei.o super.o symlink.o hash.o
++              ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o
+ obj-m    := $(O_TARGET)
++obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o
++obj-$(CONFIG_EXT3_FS_XATTR_USER) += xattr_user.o
++
+ include $(TOPDIR)/Rules.make
+--- linux/fs/ext3/file.c~linux-2.4.20-xattr-0.8.54-hp  Fri May 16 08:42:46 2003
++++ linux-mmonroe/fs/ext3/file.c       Fri May 16 08:43:01 2003
+@@ -23,6 +23,7 @@
+ #include <linux/locks.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/smp_lock.h>
+@@ -126,5 +127,9 @@ struct file_operations ext3_file_operati
+ struct inode_operations ext3_file_inode_operations = {
+       truncate:       ext3_truncate,          /* BKL held */
+       setattr:        ext3_setattr,           /* BKL held */
++      setxattr:       ext3_setxattr,          /* BKL held */
++      getxattr:       ext3_getxattr,          /* BKL held */
++      listxattr:      ext3_listxattr,         /* BKL held */
++      removexattr:    ext3_removexattr,       /* BKL held */
+ };
+--- linux/fs/ext3/ialloc.c~linux-2.4.20-xattr-0.8.54-hp        Thu Nov 28 15:53:15 2002
++++ linux-mmonroe/fs/ext3/ialloc.c     Fri May 16 08:43:01 2003
+@@ -17,6 +17,7 @@
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+ #include <linux/locks.h>
+@@ -216,6 +217,7 @@ void ext3_free_inode (handle_t *handle, 
+        * as writing the quota to disk may need the lock as well.
+        */
+       DQUOT_INIT(inode);
++      ext3_xattr_delete_inode(handle, inode);
+       DQUOT_FREE_INODE(inode);
+       DQUOT_DROP(inode);
+--- linux/fs/ext3/inode.c~linux-2.4.20-xattr-0.8.54-hp Thu Nov 28 15:53:15 2002
++++ linux-mmonroe/fs/ext3/inode.c      Fri May 16 08:43:01 2003
+@@ -39,6 +39,18 @@
+  */
+ #undef SEARCH_FROM_ZERO
++/*
++ * Test whether an inode is a fast symlink.
++ */
++static inline int ext3_inode_is_fast_symlink(struct inode *inode)
++{
++      int ea_blocks = inode->u.ext3_i.i_file_acl ?
++              (inode->i_sb->s_blocksize >> 9) : 0;
++
++      return (S_ISLNK(inode->i_mode) &&
++              inode->i_blocks - ea_blocks == 0);
++}
++
+ /* The ext3 forget function must perform a revoke if we are freeing data
+  * which has been journaled.  Metadata (eg. indirect blocks) must be
+  * revoked in all cases. 
+@@ -48,7 +60,7 @@
+  * still needs to be revoked.
+  */
+-static int ext3_forget(handle_t *handle, int is_metadata,
++int ext3_forget(handle_t *handle, int is_metadata,
+                      struct inode *inode, struct buffer_head *bh,
+                      int blocknr)
+ {
+@@ -164,9 +176,7 @@ void ext3_delete_inode (struct inode * i
+ {
+       handle_t *handle;
+       
+-      if (is_bad_inode(inode) ||
+-          inode->i_ino == EXT3_ACL_IDX_INO ||
+-          inode->i_ino == EXT3_ACL_DATA_INO)
++      if (is_bad_inode(inode))
+               goto no_delete;
+       lock_kernel();
+@@ -1855,6 +1865,8 @@ void ext3_truncate(struct inode * inode)
+       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+           S_ISLNK(inode->i_mode)))
+               return;
++      if (ext3_inode_is_fast_symlink(inode))
++              return;
+       if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+               return;
+@@ -2002,8 +2014,6 @@ int ext3_get_inode_loc (struct inode *in
+       struct ext3_group_desc * gdp;
+               
+       if ((inode->i_ino != EXT3_ROOT_INO &&
+-              inode->i_ino != EXT3_ACL_IDX_INO &&
+-              inode->i_ino != EXT3_ACL_DATA_INO &&
+               inode->i_ino != EXT3_JOURNAL_INO &&
+               inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
+               inode->i_ino > le32_to_cpu(
+@@ -2130,10 +2140,7 @@ void ext3_read_inode(struct inode * inod
+       brelse (iloc.bh);
+-      if (inode->i_ino == EXT3_ACL_IDX_INO ||
+-          inode->i_ino == EXT3_ACL_DATA_INO)
+-              /* Nothing to do */ ;
+-      else if (S_ISREG(inode->i_mode)) {
++      if (S_ISREG(inode->i_mode)) {
+               inode->i_op = &ext3_file_inode_operations;
+               inode->i_fop = &ext3_file_operations;
+               inode->i_mapping->a_ops = &ext3_aops;
+@@ -2141,15 +2148,17 @@ void ext3_read_inode(struct inode * inod
+               inode->i_op = &ext3_dir_inode_operations;
+               inode->i_fop = &ext3_dir_operations;
+       } else if (S_ISLNK(inode->i_mode)) {
+-              if (!inode->i_blocks)
++              if (ext3_inode_is_fast_symlink(inode))
+                       inode->i_op = &ext3_fast_symlink_inode_operations;
+               else {
+-                      inode->i_op = &page_symlink_inode_operations;
++                      inode->i_op = &ext3_symlink_inode_operations;
+                       inode->i_mapping->a_ops = &ext3_aops;
+               }
+-      } else 
++      } else {
++              inode->i_op = &ext3_special_inode_operations;
+               init_special_inode(inode, inode->i_mode,
+                                  le32_to_cpu(iloc.raw_inode->i_block[0]));
++      }
+       /* inode->i_attr_flags = 0;                             unused */
+       if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */
+--- linux/fs/ext3/namei.c~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:42:47 2003
++++ linux-mmonroe/fs/ext3/namei.c      Fri May 16 08:43:01 2003
+@@ -29,6 +29,7 @@
+ #include <linux/sched.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/fcntl.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+@@ -1611,7 +1612,7 @@ static int ext3_mkdir(struct inode * dir
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+-      inode = ext3_new_inode (handle, dir, S_IFDIR);
++      inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_stop;
+@@ -1619,7 +1620,6 @@ static int ext3_mkdir(struct inode * dir
+       inode->i_op = &ext3_dir_inode_operations;
+       inode->i_fop = &ext3_dir_operations;
+       inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+-      inode->i_blocks = 0;    
+       dir_block = ext3_bread (handle, inode, 0, 1, &err);
+       if (!dir_block) {
+               inode->i_nlink--; /* is this nlink == 0? */
+@@ -1646,9 +1646,6 @@ static int ext3_mkdir(struct inode * dir
+       BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
+       ext3_journal_dirty_metadata(handle, dir_block);
+       brelse (dir_block);
+-      inode->i_mode = S_IFDIR | mode;
+-      if (dir->i_mode & S_ISGID)
+-              inode->i_mode |= S_ISGID;
+       ext3_mark_inode_dirty(handle, inode);
+       err = ext3_add_entry (handle, dentry, inode);
+       if (err) {
+@@ -2017,7 +2014,7 @@ static int ext3_symlink (struct inode * 
+               goto out_stop;
+       if (l > sizeof (EXT3_I(inode)->i_data)) {
+-              inode->i_op = &page_symlink_inode_operations;
++              inode->i_op = &ext3_symlink_inode_operations;
+               inode->i_mapping->a_ops = &ext3_aops;
+               /*
+                * block_symlink() calls back into ext3_prepare/commit_write.
+@@ -2244,4 +2241,16 @@ struct inode_operations ext3_dir_inode_o
+       rmdir:          ext3_rmdir,             /* BKL held */
+       mknod:          ext3_mknod,             /* BKL held */
+       rename:         ext3_rename,            /* BKL held */
++      setxattr:       ext3_setxattr,          /* BKL held */
++      getxattr:       ext3_getxattr,          /* BKL held */
++      listxattr:      ext3_listxattr,         /* BKL held */
++      removexattr:    ext3_removexattr,       /* BKL held */
+ };
++
++struct inode_operations ext3_special_inode_operations = {
++      setxattr:       ext3_setxattr,          /* BKL held */
++      getxattr:       ext3_getxattr,          /* BKL held */
++      listxattr:      ext3_listxattr,         /* BKL held */
++      removexattr:    ext3_removexattr,       /* BKL held */
++};
++
+--- linux/fs/ext3/super.c~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:42:46 2003
++++ linux-mmonroe/fs/ext3/super.c      Fri May 16 08:43:01 2003
+@@ -24,6 +24,7 @@
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/slab.h>
+ #include <linux/init.h>
+ #include <linux/locks.h>
+@@ -406,6 +407,7 @@ void ext3_put_super (struct super_block 
+       kdev_t j_dev = sbi->s_journal->j_dev;
+       int i;
++      ext3_xattr_put_super(sb);
+       journal_destroy(sbi->s_journal);
+       if (!(sb->s_flags & MS_RDONLY)) {
+               EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+@@ -502,6 +504,7 @@ static int parse_options (char * options
+                         int is_remount)
+ {
+       unsigned long *mount_options = &sbi->s_mount_opt;
++      
+       uid_t *resuid = &sbi->s_resuid;
+       gid_t *resgid = &sbi->s_resgid;
+       char * this_char;
+@@ -514,6 +517,13 @@ static int parse_options (char * options
+            this_char = strtok (NULL, ",")) {
+               if ((value = strchr (this_char, '=')) != NULL)
+                       *value++ = 0;
++#ifdef CONFIG_EXT3_FS_XATTR_USER
++              if (!strcmp (this_char, "user_xattr"))
++                      set_opt (*mount_options, XATTR_USER);
++              else if (!strcmp (this_char, "nouser_xattr"))
++                      clear_opt (*mount_options, XATTR_USER);
++              else
++#endif
+               if (!strcmp (this_char, "bsddf"))
+                       clear_opt (*mount_options, MINIX_DF);
+               else if (!strcmp (this_char, "nouid32")) {
+@@ -931,6 +941,12 @@ struct super_block * ext3_read_super (st
+       sbi->s_mount_opt = 0;
+       sbi->s_resuid = EXT3_DEF_RESUID;
+       sbi->s_resgid = EXT3_DEF_RESGID;
++
++      /* Default extended attribute flags */
++#ifdef CONFIG_EXT3_FS_XATTR_USER
++      /* set_opt(sbi->s_mount_opt, XATTR_USER); */
++#endif
++
+       if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) {
+               sb->s_dev = 0;
+               goto out_fail;
+@@ -1768,17 +1784,29 @@ static DECLARE_FSTYPE_DEV(ext3_fs_type, 
+ static int __init init_ext3_fs(void)
+ {
+-        return register_filesystem(&ext3_fs_type);
++      int error = init_ext3_xattr();
++      if (error)
++              return error;
++      error = init_ext3_xattr_user();
++      if (error)
++              goto fail;
++      error = register_filesystem(&ext3_fs_type);
++      if (!error)
++              return 0;
++      
++      exit_ext3_xattr_user();
++fail:
++      exit_ext3_xattr();
++      return error;
+ }
+ static void __exit exit_ext3_fs(void)
+ {
+       unregister_filesystem(&ext3_fs_type);
++      exit_ext3_xattr_user();
++      exit_ext3_xattr();
+ }
+-EXPORT_SYMBOL(ext3_force_commit);
+-EXPORT_SYMBOL(ext3_bread);
+-
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
+ MODULE_LICENSE("GPL");
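
The super.c changes do three things: recognize the user_xattr/nouser_xattr options in parse_options(), flush the attribute-block cache in ext3_put_super(), and register the xattr handlers around register_filesystem(), unwinding in reverse order if registration fails. The option is normally passed as "-o user_xattr" to mount(8); the equivalent through the mount(2) data string is roughly the sketch below (device and mount point are placeholders).

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* "user_xattr" in the data string is the token parse_options()
             * above recognizes; device and mount point are placeholders. */
            if (mount("/dev/hda2", "/mnt/test", "ext3", 0, "user_xattr") == -1) {
                    perror("mount");
                    return 1;
            }
            return 0;
    }
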
+--- linux/fs/ext3/symlink.c~linux-2.4.20-xattr-0.8.54-hp       Fri Nov  9 14:25:04 2001
++++ linux-mmonroe/fs/ext3/symlink.c    Fri May 16 08:43:01 2003
+@@ -20,6 +20,7 @@
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
+ static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen)
+ {
+@@ -33,7 +34,20 @@ static int ext3_follow_link(struct dentr
+       return vfs_follow_link(nd, s);
+ }
++struct inode_operations ext3_symlink_inode_operations = {
++      readlink:       page_readlink,          /* BKL not held.  Don't need */
++      follow_link:    page_follow_link,       /* BKL not held.  Don't need */
++      setxattr:       ext3_setxattr,          /* BKL held */
++      getxattr:       ext3_getxattr,          /* BKL held */
++      listxattr:      ext3_listxattr,         /* BKL held */
++      removexattr:    ext3_removexattr,       /* BKL held */
++};
++
+ struct inode_operations ext3_fast_symlink_inode_operations = {
+       readlink:       ext3_readlink,          /* BKL not held.  Don't need */
+       follow_link:    ext3_follow_link,       /* BKL not held.  Don't need */
++      setxattr:       ext3_setxattr,          /* BKL held */
++      getxattr:       ext3_getxattr,          /* BKL held */
++      listxattr:      ext3_listxattr,         /* BKL held */
++      removexattr:    ext3_removexattr,       /* BKL held */
+ };
+--- /dev/null  Mon May 20 21:11:23 2002
++++ linux-mmonroe/fs/ext3/xattr.c      Fri May 16 08:43:01 2003
+@@ -0,0 +1,1225 @@
++/*
++ * linux/fs/ext3/xattr.c
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ *
++ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
++ * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>.
++ * Extended attributes for symlinks and special files added per
++ *  suggestion of Luka Renko <luka.renko@hermes.si>.
++ */
++
++/*
++ * Extended attributes are stored on disk blocks allocated outside of
++ * any inode. The i_file_acl field is then made to point to this allocated
++ * block. If all extended attributes of an inode are identical, these
++ * inodes may share the same extended attribute block. Such situations
++ * are automatically detected by keeping a cache of recent attribute block
++ * numbers and hashes over the block's contents in memory.
++ *
++ *
++ * Extended attribute block layout:
++ *
++ *   +------------------+
++ *   | header           |
++ *   | entry 1          | |
++ *   | entry 2          | | growing downwards
++ *   | entry 3          | v
++ *   | four null bytes  |
++ *   | . . .            |
++ *   | value 1          | ^
++ *   | value 3          | | growing upwards
++ *   | value 2          | |
++ *   +------------------+
++ *
++ * The block header is followed by multiple entry descriptors. These entry
++ * descriptors are variable in size, and aligned to EXT3_XATTR_PAD
++ * byte boundaries. The entry descriptors are sorted by attribute name,
++ * so that two extended attribute blocks can be compared efficiently.
++ *
++ * Attribute values are aligned to the end of the block, stored in
++ * no specific order. They are also padded to EXT3_XATTR_PAD byte
++ * boundaries. No additional gaps are left between them.
++ *
++ * Locking strategy
++ * ----------------
++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of
++ * the xattr inode operations are called, so we are guaranteed that only one
++ * process accesses extended attributes of an inode at any time.
++ *
++ * For writing we also grab the ext3_xattr_sem semaphore. This ensures that
++ * only a single process is modifying an extended attribute block, even
++ * if the block is shared among inodes.
++ *
++ * Note for porting to 2.5
++ * -----------------------
++ * The BKL will no longer be held in the xattr inode operations.
++ */
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/locks.h>
++#include <linux/slab.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
++#include <linux/mbcache.h>
++#include <linux/quotaops.h>
++#include <asm/semaphore.h>
++#include <linux/compatmac.h>
++
++#define EXT3_EA_USER "user."
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1)
++#endif
++
++#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data))
++#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr))
++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
++
++#ifdef EXT3_XATTR_DEBUG
++# define ea_idebug(inode, f...) do { \
++              printk(KERN_DEBUG "inode %s:%ld: ", \
++                      kdevname(inode->i_dev), inode->i_ino); \
++              printk(f); \
++              printk("\n"); \
++      } while (0)
++# define ea_bdebug(bh, f...) do { \
++              printk(KERN_DEBUG "block %s:%ld: ", \
++                      kdevname(bh->b_dev), bh->b_blocknr); \
++              printk(f); \
++              printk("\n"); \
++      } while (0)
++#else
++# define ea_idebug(f...)
++# define ea_bdebug(f...)
++#endif
++
++static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *,
++                         struct ext3_xattr_header *);
++
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++
++static int ext3_xattr_cache_insert(struct buffer_head *);
++static struct buffer_head *ext3_xattr_cache_find(struct inode *,
++                                               struct ext3_xattr_header *);
++static void ext3_xattr_cache_remove(struct buffer_head *);
++static void ext3_xattr_rehash(struct ext3_xattr_header *,
++                            struct ext3_xattr_entry *);
++
++static struct mb_cache *ext3_xattr_cache;
++
++#else
++# define ext3_xattr_cache_insert(bh) 0
++# define ext3_xattr_cache_find(inode, header) NULL
++# define ext3_xattr_cache_remove(bh) while(0) {}
++# define ext3_xattr_rehash(header, entry) while(0) {}
++#endif
++
++/*
++ * If a file system does not share extended attributes among inodes,
++ * we should not need the ext3_xattr_sem semaphore. However, the
++ * filesystem may still contain shared blocks, so we always take
++ * the lock.
++ */
++
++DECLARE_MUTEX(ext3_xattr_sem);
++
++static inline int
++ext3_xattr_new_block(handle_t *handle, struct inode *inode,
++                   int * errp, int force)
++{
++      struct super_block *sb = inode->i_sb;
++      int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
++              EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb);
++
++      /* How can we enforce the allocation? */
++      int block = ext3_new_block(handle, inode, goal, 0, 0, errp);
++#ifdef OLD_QUOTAS
++      if (!*errp)
++              inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#endif
++      return block;
++}
++
++static inline int
++ext3_xattr_quota_alloc(struct inode *inode, int force)
++{
++      /* How can we enforce the allocation? */
++#ifdef OLD_QUOTAS
++      int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
++      if (!error)
++              inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#else
++      int error = DQUOT_ALLOC_BLOCK(inode, 1);
++#endif
++      return error;
++}
++
++#ifdef OLD_QUOTAS
++
++static inline void
++ext3_xattr_quota_free(struct inode *inode)
++{
++      DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
++      inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++static inline void
++ext3_xattr_free_block(handle_t *handle, struct inode * inode,
++                    unsigned long block)
++{
++      ext3_free_blocks(handle, inode, block, 1);
++      inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++#else
++# define ext3_xattr_quota_free(inode) \
++      DQUOT_FREE_BLOCK(inode, 1)
++# define ext3_xattr_free_block(handle, inode, block) \
++      ext3_free_blocks(handle, inode, block, 1)
++#endif
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
++
++static inline struct buffer_head *
++sb_bread(struct super_block *sb, int block)
++{
++      return bread(sb->s_dev, block, sb->s_blocksize);
++}
++
++static inline struct buffer_head *
++sb_getblk(struct super_block *sb, int block)
++{
++      return getblk(sb->s_dev, block, sb->s_blocksize);
++}
++
++#endif
++
++struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX];
++rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED;
++
++int
++ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler)
++{
++      int error = -EINVAL;
++
++      if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++              write_lock(&ext3_handler_lock);
++              if (!ext3_xattr_handlers[name_index-1]) {
++                      ext3_xattr_handlers[name_index-1] = handler;
++                      error = 0;
++              }
++              write_unlock(&ext3_handler_lock);
++      }
++      return error;
++}
++
++void
++ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler)
++{
++      if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++              write_lock(&ext3_handler_lock);
++              ext3_xattr_handlers[name_index-1] = NULL;
++              write_unlock(&ext3_handler_lock);
++      }
++}
++
++static inline const char *
++strcmp_prefix(const char *a, const char *a_prefix)
++{
++      while (*a_prefix && *a == *a_prefix) {
++              a++;
++              a_prefix++;
++      }
++      return *a_prefix ? NULL : a;
++}
++
++/*
++ * Decode the extended attribute name, and translate it into
++ * the name_index and name suffix.
++ */
++static inline struct ext3_xattr_handler *
++ext3_xattr_resolve_name(const char **name)
++{
++      struct ext3_xattr_handler *handler = NULL;
++      int i;
++
++      if (!*name)
++              return NULL;
++      read_lock(&ext3_handler_lock);
++      for (i=0; i<EXT3_XATTR_INDEX_MAX; i++) {
++              if (ext3_xattr_handlers[i]) {
++                      const char *n = strcmp_prefix(*name,
++                              ext3_xattr_handlers[i]->prefix);
++                      if (n) {
++                              handler = ext3_xattr_handlers[i];
++                              *name = n;
++                              break;
++                      }
++              }
++      }
++      read_unlock(&ext3_handler_lock);
++      return handler;
++}
++
++static inline struct ext3_xattr_handler *
++ext3_xattr_handler(int name_index)
++{
++      struct ext3_xattr_handler *handler = NULL;
++      if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++              read_lock(&ext3_handler_lock);
++              handler = ext3_xattr_handlers[name_index-1];
++              read_unlock(&ext3_handler_lock);
++      }
++      return handler;
++}
++
++/*
++ * Inode operation getxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_getxattr(struct dentry *dentry, const char *name,
++            void *buffer, size_t size)
++{
++      struct ext3_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      handler = ext3_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->get(inode, name, buffer, size);
++}
++
++/*
++ * Inode operation listxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++      return ext3_xattr_list(dentry->d_inode, buffer, size);
++}
++
++/*
++ * Inode operation setxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext3_setxattr(struct dentry *dentry, const char *name,
++            const void *value, size_t size, int flags)
++{
++      struct ext3_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      if (size == 0)
++              value = "";  /* empty EA, do not remove */
++      handler = ext3_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->set(inode, name, value, size, flags);
++}
++
++/*
++ * Inode operation removexattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext3_removexattr(struct dentry *dentry, const char *name)
++{
++      struct ext3_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      handler = ext3_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
++}
++
++/*
++ * ext3_xattr_get()
++ *
++ * Copy an extended attribute into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext3_xattr_get(struct inode *inode, int name_index, const char *name,
++             void *buffer, size_t buffer_size)
++{
++      struct buffer_head *bh = NULL;
++      struct ext3_xattr_entry *entry;
++      unsigned int block, size;
++      char *end;
++      int name_len, error;
++
++      ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
++                name_index, name, buffer, (long)buffer_size);
++
++      if (name == NULL)
++              return -EINVAL;
++      if (!EXT3_I(inode)->i_file_acl)
++              return -ENOATTR;
++      block = EXT3_I(inode)->i_file_acl;
++      ea_idebug(inode, "reading block %d", block);
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh)
++              return -EIO;
++      ea_bdebug(bh, "b_count=%d, refcount=%d",
++              atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++      end = bh->b_data + bh->b_size;
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block:    ext3_error(inode->i_sb, "ext3_xattr_get",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              error = -EIO;
++              goto cleanup;
++      }
++      /* find named attribute */
++      name_len = strlen(name);
++
++      error = -ERANGE;
++      if (name_len > 255)
++              goto cleanup;
++      entry = FIRST_ENTRY(bh);
++      while (!IS_LAST_ENTRY(entry)) {
++              struct ext3_xattr_entry *next =
++                      EXT3_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++              if (name_index == entry->e_name_index &&
++                  name_len == entry->e_name_len &&
++                  memcmp(name, entry->e_name, name_len) == 0)
++                      goto found;
++              entry = next;
++      }
++      /* Check the remaining name entries */
++      while (!IS_LAST_ENTRY(entry)) {
++              struct ext3_xattr_entry *next =
++                      EXT3_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++              entry = next;
++      }
++      if (ext3_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      error = -ENOATTR;
++      goto cleanup;
++found:
++      /* check the buffer size */
++      if (entry->e_value_block != 0)
++              goto bad_block;
++      size = le32_to_cpu(entry->e_value_size);
++      if (size > inode->i_sb->s_blocksize ||
++          le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
++              goto bad_block;
++
++      if (ext3_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      if (buffer) {
++              error = -ERANGE;
++              if (size > buffer_size)
++                      goto cleanup;
++              /* return value of attribute */
++              memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
++                      size);
++      }
++      error = size;
++
++cleanup:
++      brelse(bh);
++
++      return error;
++}
++
++/*
++ * ext3_xattr_list()
++ *
++ * Copy a list of attribute names into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
++{
++      struct buffer_head *bh = NULL;
++      struct ext3_xattr_entry *entry;
++      unsigned int block, size = 0;
++      char *buf, *end;
++      int error;
++
++      ea_idebug(inode, "buffer=%p, buffer_size=%ld",
++                buffer, (long)buffer_size);
++
++      if (!EXT3_I(inode)->i_file_acl)
++              return 0;
++      block = EXT3_I(inode)->i_file_acl;
++      ea_idebug(inode, "reading block %d", block);
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh)
++              return -EIO;
++      ea_bdebug(bh, "b_count=%d, refcount=%d",
++              atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++      end = bh->b_data + bh->b_size;
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block:    ext3_error(inode->i_sb, "ext3_xattr_list",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              error = -EIO;
++              goto cleanup;
++      }
++      /* compute the size required for the list of attribute names */
++      for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++           entry = EXT3_XATTR_NEXT(entry)) {
++              struct ext3_xattr_handler *handler;
++              struct ext3_xattr_entry *next =
++                      EXT3_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++
++              handler = ext3_xattr_handler(entry->e_name_index);
++              if (handler)
++                      size += handler->list(NULL, inode, entry->e_name,
++                                            entry->e_name_len);
++      }
++
++      if (ext3_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      if (!buffer) {
++              error = size;
++              goto cleanup;
++      } else {
++              error = -ERANGE;
++              if (size > buffer_size)
++                      goto cleanup;
++      }
++
++      /* list the attribute names */
++      buf = buffer;
++      for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++           entry = EXT3_XATTR_NEXT(entry)) {
++              struct ext3_xattr_handler *handler;
++
++              handler = ext3_xattr_handler(entry->e_name_index);
++              if (handler)
++                      buf += handler->list(buf, inode, entry->e_name,
++                                           entry->e_name_len);
++      }
++      error = size;
++
++cleanup:
++      brelse(bh);
++
++      return error;
++}
++
++/*
++ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is
++ * not set, set it.
++ */
++static void ext3_xattr_update_super_block(handle_t *handle,
++                                        struct super_block *sb)
++{
++      if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
++              return;
++
++      lock_super(sb);
++      ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++      EXT3_SB(sb)->s_feature_compat |= EXT3_FEATURE_COMPAT_EXT_ATTR;
++#endif
++      EXT3_SB(sb)->s_es->s_feature_compat |=
++              cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR);
++      sb->s_dirt = 1;
++      ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
++      unlock_super(sb);
++}
++
++/*
++ * ext3_xattr_set()
++ *
++ * Create, replace or remove an extended attribute for this inode. Buffer
++ * is NULL to remove an existing extended attribute, and non-NULL to
++ * either replace an existing extended attribute, or create a new extended
++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
++ * specify that an extended attribute must exist and must not exist
++ * previous to the call, respectively.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++int
++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
++             const char *name, const void *value, size_t value_len, int flags)
++{
++      struct super_block *sb = inode->i_sb;
++      struct buffer_head *bh = NULL;
++      struct ext3_xattr_header *header = NULL;
++      struct ext3_xattr_entry *here, *last;
++      unsigned int name_len;
++      int block = EXT3_I(inode)->i_file_acl;
++      int min_offs = sb->s_blocksize, not_found = 1, free, error;
++      char *end;
++      
++      /*
++       * header -- Points either into bh, or to a temporarily
++       *           allocated buffer.
++       * here -- The named entry found, or the place for inserting, within
++       *         the block pointed to by header.
++       * last -- Points right after the last named entry within the block
++       *         pointed to by header.
++       * min_offs -- The offset of the first value (values are aligned
++       *             towards the end of the block).
++       * end -- Points right after the block pointed to by header.
++       */
++      
++      ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
++                name_index, name, value, (long)value_len);
++
++      if (IS_RDONLY(inode))
++              return -EROFS;
++      if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
++              return -EPERM;
++      if (value == NULL)
++              value_len = 0;
++      if (name == NULL)
++              return -EINVAL;
++      name_len = strlen(name);
++      if (name_len > 255 || value_len > sb->s_blocksize)
++              return -ERANGE;
++      down(&ext3_xattr_sem);
++
++      if (block) {
++              /* The inode already has an extended attribute block. */
++              bh = sb_bread(sb, block);
++              error = -EIO;
++              if (!bh)
++                      goto cleanup;
++              ea_bdebug(bh, "b_count=%d, refcount=%d",
++                      atomic_read(&(bh->b_count)),
++                      le32_to_cpu(HDR(bh)->h_refcount));
++              header = HDR(bh);
++              end = bh->b_data + bh->b_size;
++              if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++                  header->h_blocks != cpu_to_le32(1)) {
++bad_block:            ext3_error(sb, "ext3_xattr_set",
++                              "inode %ld: bad block %d", inode->i_ino, block);
++                      error = -EIO;
++                      goto cleanup;
++              }
++              /* Find the named attribute. */
++              here = FIRST_ENTRY(bh);
++              while (!IS_LAST_ENTRY(here)) {
++                      struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here);
++                      if ((char *)next >= end)
++                              goto bad_block;
++                      if (!here->e_value_block && here->e_value_size) {
++                              int offs = le16_to_cpu(here->e_value_offs);
++                              if (offs < min_offs)
++                                      min_offs = offs;
++                      }
++                      not_found = name_index - here->e_name_index;
++                      if (!not_found)
++                              not_found = name_len - here->e_name_len;
++                      if (!not_found)
++                              not_found = memcmp(name, here->e_name,name_len);
++                      if (not_found <= 0)
++                              break;
++                      here = next;
++              }
++              last = here;
++              /* We still need to compute min_offs and last. */
++              while (!IS_LAST_ENTRY(last)) {
++                      struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last);
++                      if ((char *)next >= end)
++                              goto bad_block;
++                      if (!last->e_value_block && last->e_value_size) {
++                              int offs = le16_to_cpu(last->e_value_offs);
++                              if (offs < min_offs)
++                                      min_offs = offs;
++                      }
++                      last = next;
++              }
++
++              /* Check whether we have enough space left. */
++              free = min_offs - ((char*)last - (char*)header) - sizeof(__u32);
++      } else {
++              /* We will use a new extended attribute block. */
++              free = sb->s_blocksize -
++                      sizeof(struct ext3_xattr_header) - sizeof(__u32);
++              here = last = NULL;  /* avoid gcc uninitialized warning. */
++      }
++
++      if (not_found) {
++              /* Request to remove a nonexistent attribute? */
++              error = -ENOATTR;
++              if (flags & XATTR_REPLACE)
++                      goto cleanup;
++              error = 0;
++              if (value == NULL)
++                      goto cleanup;
++              else
++                      free -= EXT3_XATTR_LEN(name_len);
++      } else {
++              /* Request to create an existing attribute? */
++              error = -EEXIST;
++              if (flags & XATTR_CREATE)
++                      goto cleanup;
++              if (!here->e_value_block && here->e_value_size) {
++                      unsigned int size = le32_to_cpu(here->e_value_size);
++
++                      if (le16_to_cpu(here->e_value_offs) + size > 
++                          sb->s_blocksize || size > sb->s_blocksize)
++                              goto bad_block;
++                      free += EXT3_XATTR_SIZE(size);
++              }
++      }
++      free -= EXT3_XATTR_SIZE(value_len);
++      error = -ENOSPC;
++      if (free < 0)
++              goto cleanup;
++
++      /* Here we know that we can set the new attribute. */
++
++      if (header) {
++              if (header->h_refcount == cpu_to_le32(1)) {
++                      ea_bdebug(bh, "modifying in-place");
++                      ext3_xattr_cache_remove(bh);
++                      error = ext3_journal_get_write_access(handle, bh);
++                      if (error)
++                              goto cleanup;
++              } else {
++                      int offset;
++
++                      ea_bdebug(bh, "cloning");
++                      header = kmalloc(bh->b_size, GFP_KERNEL);
++                      error = -ENOMEM;
++                      if (header == NULL)
++                              goto cleanup;
++                      memcpy(header, HDR(bh), bh->b_size);
++                      header->h_refcount = cpu_to_le32(1);
++                      offset = (char *)header - bh->b_data;
++                      here = ENTRY((char *)here + offset);
++                      last = ENTRY((char *)last + offset);
++              }
++      } else {
++              /* Allocate a buffer where we construct the new block. */
++              header = kmalloc(sb->s_blocksize, GFP_KERNEL);
++              error = -ENOMEM;
++              if (header == NULL)
++                      goto cleanup;
++              memset(header, 0, sb->s_blocksize);
++              end = (char *)header + sb->s_blocksize;
++              header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
++              header->h_blocks = header->h_refcount = cpu_to_le32(1);
++              last = here = ENTRY(header+1);
++      }
++
++      if (not_found) {
++              /* Insert the new name. */
++              int size = EXT3_XATTR_LEN(name_len);
++              int rest = (char *)last - (char *)here;
++              memmove((char *)here + size, here, rest);
++              memset(here, 0, size);
++              here->e_name_index = name_index;
++              here->e_name_len = name_len;
++              memcpy(here->e_name, name, name_len);
++      } else {
++              /* Remove the old value. */
++              if (!here->e_value_block && here->e_value_size) {
++                      char *first_val = (char *)header + min_offs;
++                      int offs = le16_to_cpu(here->e_value_offs);
++                      char *val = (char *)header + offs;
++                      size_t size = EXT3_XATTR_SIZE(
++                              le32_to_cpu(here->e_value_size));
++                      memmove(first_val + size, first_val, val - first_val);
++                      memset(first_val, 0, size);
++                      here->e_value_offs = 0;
++                      min_offs += size;
++
++                      /* Adjust all value offsets. */
++                      last = ENTRY(header+1);
++                      while (!IS_LAST_ENTRY(last)) {
++                              int o = le16_to_cpu(last->e_value_offs);
++                              if (!last->e_value_block && o < offs)
++                                      last->e_value_offs =
++                                              cpu_to_le16(o + size);
++                              last = EXT3_XATTR_NEXT(last);
++                      }
++              }
++              if (value == NULL) {
++                      /* Remove this attribute. */
++                      if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) {
++                              /* This block is now empty. */
++                              error = ext3_xattr_set2(handle, inode, bh,NULL);
++                              goto cleanup;
++                      } else {
++                              /* Remove the old name. */
++                              int size = EXT3_XATTR_LEN(name_len);
++                              last = ENTRY((char *)last - size);
++                              memmove(here, (char*)here + size,
++                                      (char*)last - (char*)here);
++                              memset(last, 0, size);
++                      }
++              }
++      }
++
++      if (value != NULL) {
++              /* Insert the new value. */
++              here->e_value_size = cpu_to_le32(value_len);
++              if (value_len) {
++                      size_t size = EXT3_XATTR_SIZE(value_len);
++                      char *val = (char *)header + min_offs - size;
++                      here->e_value_offs =
++                              cpu_to_le16((char *)val - (char *)header);
++                      memset(val + size - EXT3_XATTR_PAD, 0,
++                             EXT3_XATTR_PAD); /* Clear the pad bytes. */
++                      memcpy(val, value, value_len);
++              }
++      }
++      ext3_xattr_rehash(header, here);
++
++      error = ext3_xattr_set2(handle, inode, bh, header);
++
++cleanup:
++      brelse(bh);
++      if (!(bh && header == HDR(bh)))
++              kfree(header);
++      up(&ext3_xattr_sem);
++
++      return error;
++}
++
++/*
++ * Second half of ext3_xattr_set(): Update the file system.
++ */
++static int
++ext3_xattr_set2(handle_t *handle, struct inode *inode,
++              struct buffer_head *old_bh, struct ext3_xattr_header *header)
++{
++      struct super_block *sb = inode->i_sb;
++      struct buffer_head *new_bh = NULL;
++      int error;
++
++      if (header) {
++              new_bh = ext3_xattr_cache_find(inode, header);
++              if (new_bh) {
++                      /*
++                       * We found an identical block in the cache.
++                       * The old block will be released after updating
++                       * the inode.
++                       */
++                      ea_bdebug(old_bh, "reusing block %ld",
++                              new_bh->b_blocknr);
++                      
++                      error = -EDQUOT;
++                      if (ext3_xattr_quota_alloc(inode, 1))
++                              goto cleanup;
++                      
++                      error = ext3_journal_get_write_access(handle, new_bh);
++                      if (error)
++                              goto cleanup;
++                      HDR(new_bh)->h_refcount = cpu_to_le32(
++                              le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
++                      ea_bdebug(new_bh, "refcount now=%d",
++                              le32_to_cpu(HDR(new_bh)->h_refcount));
++              } else if (old_bh && header == HDR(old_bh)) {
++                      /* Keep this block. */
++                      new_bh = old_bh;
++                      ext3_xattr_cache_insert(new_bh);
++              } else {
++                      /* We need to allocate a new block */
++                      int force = EXT3_I(inode)->i_file_acl != 0;
++                      int block = ext3_xattr_new_block(handle, inode,
++                                                       &error, force);
++                      if (error)
++                              goto cleanup;
++                      ea_idebug(inode, "creating block %d", block);
++
++                      new_bh = sb_getblk(sb, block);
++                      if (!new_bh) {
++getblk_failed:                        ext3_xattr_free_block(handle, inode, block);
++                              error = -EIO;
++                              goto cleanup;
++                      }
++                      lock_buffer(new_bh);
++                      error = ext3_journal_get_create_access(handle, new_bh);
++                      if (error) {
++                              unlock_buffer(new_bh);
++                              goto getblk_failed;
++                      }
++                      memcpy(new_bh->b_data, header, new_bh->b_size);
++                      mark_buffer_uptodate(new_bh, 1);
++                      unlock_buffer(new_bh);
++                      ext3_xattr_cache_insert(new_bh);
++                      
++                      ext3_xattr_update_super_block(handle, sb);
++              }
++              error = ext3_journal_dirty_metadata(handle, new_bh);
++              if (error)
++                      goto cleanup;
++      }
++
++      /* Update the inode. */
++      EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
++      inode->i_ctime = CURRENT_TIME;
++      ext3_mark_inode_dirty(handle, inode);
++      if (IS_SYNC(inode))
++              handle->h_sync = 1;
++
++      error = 0;
++      if (old_bh && old_bh != new_bh) {
++              /*
++               * If there was an old block, and we are not still using it,
++               * we now release the old block.
++              */
++              unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
++
++              error = ext3_journal_get_write_access(handle, old_bh);
++              if (error)
++                      goto cleanup;
++              if (refcount == 1) {
++                      /* Free the old block. */
++                      ea_bdebug(old_bh, "freeing");
++                      ext3_xattr_free_block(handle, inode, old_bh->b_blocknr);
++
++                      /* ext3_forget() calls bforget() for us, but we
++                         let our caller release old_bh, so we need to
++                         duplicate the buffer reference (get_bh) first. */
++                      get_bh(old_bh);
++                      ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr);
++              } else {
++                      /* Decrement the refcount only. */
++                      refcount--;
++                      HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
++                      ext3_xattr_quota_free(inode);
++                      ext3_journal_dirty_metadata(handle, old_bh);
++                      ea_bdebug(old_bh, "refcount now=%d", refcount);
++              }
++      }
++
++cleanup:
++      if (old_bh != new_bh)
++              brelse(new_bh);
++
++      return error;
++}
++
++/*
++ * ext3_xattr_delete_inode()
++ *
++ * Free extended attribute resources associated with this inode. This
++ * is called immediately before an inode is freed.
++ */
++void
++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
++{
++      struct buffer_head *bh;
++      unsigned int block = EXT3_I(inode)->i_file_acl;
++
++      if (!block)
++              return;
++      down(&ext3_xattr_sem);
++
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh) {
++              ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
++                      "inode %ld: block %d read error", inode->i_ino, block);
++              goto cleanup;
++      }
++      ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++              ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              goto cleanup;
++      }
++      ext3_journal_get_write_access(handle, bh);
++      ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
++      if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
++              ext3_xattr_cache_remove(bh);
++              ext3_xattr_free_block(handle, inode, block);
++              ext3_forget(handle, 1, inode, bh, block);
++              bh = NULL;
++      } else {
++              HDR(bh)->h_refcount = cpu_to_le32(
++                      le32_to_cpu(HDR(bh)->h_refcount) - 1);
++              ext3_journal_dirty_metadata(handle, bh);
++              if (IS_SYNC(inode))
++                      handle->h_sync = 1;
++              ext3_xattr_quota_free(inode);
++      }
++      EXT3_I(inode)->i_file_acl = 0;
++
++cleanup:
++      brelse(bh);
++      up(&ext3_xattr_sem);
++}
++
++/*
++ * ext3_xattr_put_super()
++ *
++ * This is called when a file system is unmounted.
++ */
++void
++ext3_xattr_put_super(struct super_block *sb)
++{
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++      mb_cache_shrink(ext3_xattr_cache, sb->s_dev);
++#endif
++}
++
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++
++/*
++ * ext3_xattr_cache_insert()
++ *
++ * Create a new entry in the extended attribute cache, and insert
++ * it unless such an entry is already in the cache.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++static int
++ext3_xattr_cache_insert(struct buffer_head *bh)
++{
++      __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
++      struct mb_cache_entry *ce;
++      int error;
++
++      ce = mb_cache_entry_alloc(ext3_xattr_cache);
++      if (!ce)
++              return -ENOMEM;
++      error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash);
++      if (error) {
++              mb_cache_entry_free(ce);
++              if (error == -EBUSY) {
++                      ea_bdebug(bh, "already in cache (%d cache entries)",
++                              atomic_read(&ext3_xattr_cache->c_entry_count));
++                      error = 0;
++              }
++      } else {
++              ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
++                        atomic_read(&ext3_xattr_cache->c_entry_count));
++              mb_cache_entry_release(ce);
++      }
++      return error;
++}
++
++/*
++ * ext3_xattr_cmp()
++ *
++ * Compare two extended attribute blocks for equality.
++ *
++ * Returns 0 if the blocks are equal, 1 if they differ, and
++ * a negative error number on errors.
++ */
++static int
++ext3_xattr_cmp(struct ext3_xattr_header *header1,
++             struct ext3_xattr_header *header2)
++{
++      struct ext3_xattr_entry *entry1, *entry2;
++
++      entry1 = ENTRY(header1+1);
++      entry2 = ENTRY(header2+1);
++      while (!IS_LAST_ENTRY(entry1)) {
++              if (IS_LAST_ENTRY(entry2))
++                      return 1;
++              if (entry1->e_hash != entry2->e_hash ||
++                  entry1->e_name_len != entry2->e_name_len ||
++                  entry1->e_value_size != entry2->e_value_size ||
++                  memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
++                      return 1;
++              if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
++                      return -EIO;
++              if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
++                         (char *)header2 + le16_to_cpu(entry2->e_value_offs),
++                         le32_to_cpu(entry1->e_value_size)))
++                      return 1;
++
++              entry1 = EXT3_XATTR_NEXT(entry1);
++              entry2 = EXT3_XATTR_NEXT(entry2);
++      }
++      if (!IS_LAST_ENTRY(entry2))
++              return 1;
++      return 0;
++}
++
++/*
++ * ext3_xattr_cache_find()
++ *
++ * Find an identical extended attribute block.
++ *
++ * Returns a pointer to the block found, or NULL if such a block was
++ * not found or an error occurred.
++ */
++static struct buffer_head *
++ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header)
++{
++      __u32 hash = le32_to_cpu(header->h_hash);
++      struct mb_cache_entry *ce;
++
++      if (!header->h_hash)
++              return NULL;  /* never share */
++      ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
++      ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_dev, hash);
++      while (ce) {
++              struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
++
++              if (!bh) {
++                      ext3_error(inode->i_sb, "ext3_xattr_cache_find",
++                              "inode %ld: block %ld read error",
++                              inode->i_ino, ce->e_block);
++              } else if (le32_to_cpu(HDR(bh)->h_refcount) >
++                         EXT3_XATTR_REFCOUNT_MAX) {
++                      ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block,
++                              le32_to_cpu(HDR(bh)->h_refcount),
++                              EXT3_XATTR_REFCOUNT_MAX);
++              } else if (!ext3_xattr_cmp(header, HDR(bh))) {
++                      ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
++                      mb_cache_entry_release(ce);
++                      return bh;
++              }
++              brelse(bh);
++              ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash);
++      }
++      return NULL;
++}
++
++/*
++ * ext3_xattr_cache_remove()
++ *
++ * Remove the cache entry of a block from the cache. Called when a
++ * block becomes invalid.
++ */
++static void
++ext3_xattr_cache_remove(struct buffer_head *bh)
++{
++      struct mb_cache_entry *ce;
++
++      ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_dev, bh->b_blocknr);
++      if (ce) {
++              ea_bdebug(bh, "removing (%d cache entries remaining)",
++                        atomic_read(&ext3_xattr_cache->c_entry_count)-1);
++              mb_cache_entry_free(ce);
++      } else 
++              ea_bdebug(bh, "no cache entry");
++}
++
++#define NAME_HASH_SHIFT 5
++#define VALUE_HASH_SHIFT 16
++
++/*
++ * ext3_xattr_hash_entry()
++ *
++ * Compute the hash of an extended attribute.
++ */
++static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header,
++                                       struct ext3_xattr_entry *entry)
++{
++      __u32 hash = 0;
++      char *name = entry->e_name;
++      int n;
++
++      for (n=0; n < entry->e_name_len; n++) {
++              hash = (hash << NAME_HASH_SHIFT) ^
++                     (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
++                     *name++;
++      }
++
++      if (entry->e_value_block == 0 && entry->e_value_size != 0) {
++              __u32 *value = (__u32 *)((char *)header +
++                      le16_to_cpu(entry->e_value_offs));
++              for (n = (le32_to_cpu(entry->e_value_size) +
++                   EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) {
++                      hash = (hash << VALUE_HASH_SHIFT) ^
++                             (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
++                             le32_to_cpu(*value++);
++              }
++      }
++      entry->e_hash = cpu_to_le32(hash);
++}
++
++#undef NAME_HASH_SHIFT
++#undef VALUE_HASH_SHIFT
++
++#define BLOCK_HASH_SHIFT 16
++
++/*
++ * ext3_xattr_rehash()
++ *
++ * Re-compute the extended attribute hash value after an entry has changed.
++ */
++static void ext3_xattr_rehash(struct ext3_xattr_header *header,
++                            struct ext3_xattr_entry *entry)
++{
++      struct ext3_xattr_entry *here;
++      __u32 hash = 0;
++      
++      ext3_xattr_hash_entry(header, entry);
++      here = ENTRY(header+1);
++      while (!IS_LAST_ENTRY(here)) {
++              if (!here->e_hash) {
++                      /* Block is not shared if an entry's hash value == 0 */
++                      hash = 0;
++                      break;
++              }
++              hash = (hash << BLOCK_HASH_SHIFT) ^
++                     (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
++                     le32_to_cpu(here->e_hash);
++              here = EXT3_XATTR_NEXT(here);
++      }
++      header->h_hash = cpu_to_le32(hash);
++}
++
++#undef BLOCK_HASH_SHIFT
++
++int __init
++init_ext3_xattr(void)
++{
++      ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
++              sizeof(struct mb_cache_entry) +
++              sizeof(struct mb_cache_entry_index), 1, 61);
++      if (!ext3_xattr_cache)
++              return -ENOMEM;
++
++      return 0;
++}
++
++void
++exit_ext3_xattr(void)
++{
++      if (ext3_xattr_cache)
++              mb_cache_destroy(ext3_xattr_cache);
++      ext3_xattr_cache = NULL;
++}
++
++#else  /* CONFIG_EXT3_FS_XATTR_SHARING */
++
++int __init
++init_ext3_xattr(void)
++{
++      return 0;
++}
++
++void
++exit_ext3_xattr(void)
++{
++}
++
++#endif  /* CONFIG_EXT3_FS_XATTR_SHARING */
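
The hashing scheme above is what makes attribute-block sharing cheap: each entry is hashed over its name and value (NAME_HASH_SHIFT / VALUE_HASH_SHIFT), and the block hash folds the entry hashes together (BLOCK_HASH_SHIFT), so ext3_xattr_cache_find() can reject non-identical blocks before doing a byte-for-byte compare. The user-space fragment below re-implements the same rotate-and-xor entry hash for a name/value pair held in ordinary buffers; it illustrates the arithmetic only, is not kernel code, and assumes an ASCII name and a value already zero-padded to a four-byte multiple as EXT3_XATTR_SIZE() guarantees on disk.

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>
    #include <stdio.h>

    #define NAME_HASH_SHIFT  5
    #define VALUE_HASH_SHIFT 16

    /* Same rotate-and-xor mixing as ext3_xattr_hash_entry(); the value is
     * assumed padded to a 4-byte multiple and in on-disk byte order. */
    static uint32_t xattr_entry_hash(const char *name, size_t name_len,
                                     const uint32_t *value, size_t value_words)
    {
            uint32_t hash = 0;
            size_t n;

            for (n = 0; n < name_len; n++)
                    hash = (hash << NAME_HASH_SHIFT) ^
                           (hash >> (32 - NAME_HASH_SHIFT)) ^
                           (unsigned char)name[n];   /* ASCII names assumed */

            for (n = 0; n < value_words; n++)
                    hash = (hash << VALUE_HASH_SHIFT) ^
                           (hash >> (32 - VALUE_HASH_SHIFT)) ^
                           value[n];

            return hash;
    }

    int main(void)
    {
            uint32_t value[1];

            memset(value, 0, sizeof(value));   /* zero padding, as on disk */
            memcpy(value, "42", 2);            /* value "42" for a "foo" entry */
            printf("entry hash = %08x\n", xattr_entry_hash("foo", 3, value, 1));
            return 0;
    }
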
+--- /dev/null  Mon May 20 21:11:23 2002
++++ linux-mmonroe/fs/ext3/xattr_user.c Fri May 16 08:43:01 2003
+@@ -0,0 +1,111 @@
++/*
++ * linux/fs/ext3/xattr_user.c
++ * Handler for extended user attributes.
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
++
++#ifdef CONFIG_EXT3_FS_POSIX_ACL
++# include <linux/ext3_acl.h>
++#endif
++
++#define XATTR_USER_PREFIX "user."
++
++static size_t
++ext3_xattr_user_list(char *list, struct inode *inode,
++                   const char *name, int name_len)
++{
++      const int prefix_len = sizeof(XATTR_USER_PREFIX)-1;
++
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return 0;
++
++      if (list) {
++              memcpy(list, XATTR_USER_PREFIX, prefix_len);
++              memcpy(list+prefix_len, name, name_len);
++              list[prefix_len + name_len] = '\0';
++      }
++      return prefix_len + name_len + 1;
++}
++
++static int
++ext3_xattr_user_get(struct inode *inode, const char *name,
++                  void *buffer, size_t size)
++{
++      int error;
++
++      if (strcmp(name, "") == 0)
++              return -EINVAL;
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return -ENOTSUP;
++#ifdef CONFIG_EXT3_FS_POSIX_ACL
++      error = ext3_permission_locked(inode, MAY_READ);
++#else
++      error = permission(inode, MAY_READ);
++#endif
++      if (error)
++              return error;
++
++      return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name,
++                            buffer, size);
++}
++
++static int
++ext3_xattr_user_set(struct inode *inode, const char *name,
++                  const void *value, size_t size, int flags)
++{
++      handle_t *handle;
++      int error;
++
++      if (strcmp(name, "") == 0)
++              return -EINVAL;
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return -ENOTSUP;
++      if (!S_ISREG(inode->i_mode) &&
++          (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
++              return -EPERM;
++#ifdef CONFIG_EXT3_FS_POSIX_ACL
++      error = ext3_permission_locked(inode, MAY_WRITE);
++#else
++      error = permission(inode, MAY_WRITE);
++#endif
++      if (error)
++              return error;
++  
++      handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS);
++      if (IS_ERR(handle))
++              return PTR_ERR(handle);
++      error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER, name,
++                             value, size, flags);
++      ext3_journal_stop(handle, inode);
++
++      return error;
++}
++
++struct ext3_xattr_handler ext3_xattr_user_handler = {
++      prefix: XATTR_USER_PREFIX,
++      list:   ext3_xattr_user_list,
++      get:    ext3_xattr_user_get,
++      set:    ext3_xattr_user_set,
++};
++
++int __init
++init_ext3_xattr_user(void)
++{
++      return ext3_xattr_register(EXT3_XATTR_INDEX_USER,
++                                 &ext3_xattr_user_handler);
++}
++
++void
++exit_ext3_xattr_user(void)
++{
++      ext3_xattr_unregister(EXT3_XATTR_INDEX_USER,
++                            &ext3_xattr_user_handler);
++}
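For illustration only, not part of the patch: xattr_user.c above shows the complete handler pattern. The hedged sketch below applies the same shape to a hypothetical "demo." namespace; the prefix, the name index 4, and every demo_* identifier are invented, while ext3_xattr_register(), ext3_xattr_get()/ext3_xattr_set() and the journalling calls are the ones this patch declares elsewhere.

/* Hypothetical "demo." namespace handler; mirrors xattr_user.c above. */
#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/ext3_jbd.h>
#include <linux/ext3_fs.h>
#include <linux/ext3_xattr.h>

#define XATTR_DEMO_PREFIX "demo."      /* invented for illustration        */
#define EXT3_XATTR_INDEX_DEMO 4        /* assumed-free index (1..3 taken)  */

static size_t
ext3_xattr_demo_list(char *list, struct inode *inode,
		     const char *name, int name_len)
{
	const int prefix_len = sizeof(XATTR_DEMO_PREFIX)-1;

	if (list) {
		memcpy(list, XATTR_DEMO_PREFIX, prefix_len);
		memcpy(list+prefix_len, name, name_len);
		list[prefix_len + name_len] = '\0';
	}
	return prefix_len + name_len + 1;
}

static int
ext3_xattr_demo_get(struct inode *inode, const char *name,
		    void *buffer, size_t size)
{
	if (strcmp(name, "") == 0)
		return -EINVAL;
	return ext3_xattr_get(inode, EXT3_XATTR_INDEX_DEMO, name,
			      buffer, size);
}

static int
ext3_xattr_demo_set(struct inode *inode, const char *name,
		    const void *value, size_t size, int flags)
{
	handle_t *handle;
	int error;

	if (strcmp(name, "") == 0)
		return -EINVAL;
	handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_DEMO, name,
			       value, size, flags);
	ext3_journal_stop(handle, inode);
	return error;
}

static struct ext3_xattr_handler ext3_xattr_demo_handler = {
	prefix:	XATTR_DEMO_PREFIX,
	list:	ext3_xattr_demo_list,
	get:	ext3_xattr_demo_get,
	set:	ext3_xattr_demo_set,
};

/* Registration and teardown would mirror init_ext3_xattr_user() and
 * exit_ext3_xattr_user() above:
 *	ext3_xattr_register(EXT3_XATTR_INDEX_DEMO, &ext3_xattr_demo_handler);
 *	ext3_xattr_unregister(EXT3_XATTR_INDEX_DEMO, &ext3_xattr_demo_handler);
 */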
+--- /dev/null  Mon May 20 21:11:23 2002
++++ linux-mmonroe/fs/ext3/ext3-exports.c       Fri May 16 08:43:01 2003
+@@ -0,0 +1,13 @@
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
++
++EXPORT_SYMBOL(ext3_force_commit);
++EXPORT_SYMBOL(ext3_bread);
++EXPORT_SYMBOL(ext3_xattr_register);
++EXPORT_SYMBOL(ext3_xattr_unregister);
++EXPORT_SYMBOL(ext3_xattr_get);
++EXPORT_SYMBOL(ext3_xattr_list);
++EXPORT_SYMBOL(ext3_xattr_set);
+--- linux/fs/jfs/jfs_xattr.h~linux-2.4.20-xattr-0.8.54-hp      Thu Nov 28 15:53:15 2002
++++ linux-mmonroe/fs/jfs/jfs_xattr.h   Fri May 16 08:43:01 2003
+@@ -52,8 +52,10 @@ struct jfs_ea_list {
+ #define       END_EALIST(ealist) \
+       ((struct jfs_ea *) (((char *) (ealist)) + EALIST_SIZE(ealist)))
+-extern int __jfs_setxattr(struct inode *, const char *, void *, size_t, int);
+-extern int jfs_setxattr(struct dentry *, const char *, void *, size_t, int);
++extern int __jfs_setxattr(struct inode *, const char *, const void *, size_t,
++                        int);
++extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t,
++                      int);
+ extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t);
+ extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t);
+ extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
+--- linux/fs/jfs/xattr.c~linux-2.4.20-xattr-0.8.54-hp  Thu Nov 28 15:53:15 2002
++++ linux-mmonroe/fs/jfs/xattr.c       Fri May 16 08:43:01 2003
+@@ -641,7 +641,7 @@ static int ea_put(struct inode *inode, s
+ }
+ static int can_set_xattr(struct inode *inode, const char *name,
+-                       void *value, size_t value_len)
++                       const void *value, size_t value_len)
+ {
+       if (IS_RDONLY(inode))
+               return -EROFS;
+@@ -660,7 +660,7 @@ static int can_set_xattr(struct inode *i
+       return permission(inode, MAY_WRITE);
+ }
+-int __jfs_setxattr(struct inode *inode, const char *name, void *value,
++int __jfs_setxattr(struct inode *inode, const char *name, const void *value,
+                  size_t value_len, int flags)
+ {
+       struct jfs_ea_list *ealist;
+@@ -799,7 +799,7 @@ int __jfs_setxattr(struct inode *inode, 
+       return rc;
+ }
+-int jfs_setxattr(struct dentry *dentry, const char *name, void *value,
++int jfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+                size_t value_len, int flags)
+ {
+       if (value == NULL) {    /* empty EA, do not remove */
+--- /dev/null  Mon May 20 21:11:23 2002
++++ linux-mmonroe/fs/mbcache.c Fri May 16 08:43:01 2003
+@@ -0,0 +1,648 @@
++/*
++ * linux/fs/mbcache.c
++ * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++/*
++ * Filesystem Meta Information Block Cache (mbcache)
++ *
++ * The mbcache caches blocks of block devices that need to be located
++ * by their device/block number, as well as by other criteria (such
++ * as the block's contents).
++ *
++ * There can only be one cache entry in a cache per device and block number.
++ * Additional indexes need not be unique in this sense. The number of
++ * additional indexes (=other criteria) can be hardwired at compile time
++ * or specified at cache create time.
++ *
++ * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
++ * in the cache. A valid entry is in the main hash tables of the cache,
++ * and may also be in the lru list. An invalid entry is not in any hashes
++ * or lists.
++ *
++ * A valid cache entry is only in the lru list if no handles refer to it.
++ * Invalid cache entries will be freed when the last handle to the cache
++ * entry is released. Entries that cannot be freed immediately are put
++ * back on the lru list.
++ */
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++#include <linux/fs.h>
++#include <linux/slab.h>
++#include <linux/sched.h>
++#include <linux/cache_def.h>
++#include <linux/version.h>
++#include <linux/init.h>
++#include <linux/mbcache.h>
++
++
++#ifdef MB_CACHE_DEBUG
++# define mb_debug(f...) do { \
++              printk(KERN_DEBUG f); \
++              printk("\n"); \
++      } while (0)
++#define mb_assert(c) do { if (!(c)) \
++              printk(KERN_ERR "assertion " #c " failed\n"); \
++      } while(0)
++#else
++# define mb_debug(f...) do { } while(0)
++# define mb_assert(c) do { } while(0)
++#endif
++#define mb_error(f...) do { \
++              printk(KERN_ERR f); \
++              printk("\n"); \
++      } while(0)
++              
++MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
++MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
++MODULE_LICENSE("GPL");
++#endif
++
++EXPORT_SYMBOL(mb_cache_create);
++EXPORT_SYMBOL(mb_cache_shrink);
++EXPORT_SYMBOL(mb_cache_destroy);
++EXPORT_SYMBOL(mb_cache_entry_alloc);
++EXPORT_SYMBOL(mb_cache_entry_insert);
++EXPORT_SYMBOL(mb_cache_entry_release);
++EXPORT_SYMBOL(mb_cache_entry_takeout);
++EXPORT_SYMBOL(mb_cache_entry_free);
++EXPORT_SYMBOL(mb_cache_entry_dup);
++EXPORT_SYMBOL(mb_cache_entry_get);
++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
++EXPORT_SYMBOL(mb_cache_entry_find_first);
++EXPORT_SYMBOL(mb_cache_entry_find_next);
++#endif
++
++
++/*
++ * Global data: list of all mbcache's, lru list, and a spinlock for
++ * accessing cache data structures on SMP machines. The lru list is
++ * global across all mbcaches.
++ */
++
++static LIST_HEAD(mb_cache_list);
++static LIST_HEAD(mb_cache_lru_list);
++static spinlock_t mb_cache_spinlock = SPIN_LOCK_UNLOCKED;
++
++static inline int
++mb_cache_indexes(struct mb_cache *cache)
++{
++#ifdef MB_CACHE_INDEXES_COUNT
++      return MB_CACHE_INDEXES_COUNT;
++#else
++      return cache->c_indexes_count;
++#endif
++}
++
++/*
++ * What the mbcache registers with the VM so that it can be shrunk dynamically.
++ */
++
++static void
++mb_cache_memory_pressure(int priority, unsigned int gfp_mask);
++
++static struct cache_definition mb_cache_definition = {
++      "mb_cache",
++      mb_cache_memory_pressure
++};
++
++
++static inline int
++__mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
++{
++      return !list_empty(&ce->e_block_list);
++}
++
++
++static inline void
++__mb_cache_entry_unhash(struct mb_cache_entry *ce)
++{
++      int n;
++
++      if (__mb_cache_entry_is_hashed(ce)) {
++              list_del_init(&ce->e_block_list);
++              for (n=0; n<mb_cache_indexes(ce->e_cache); n++)
++                      list_del(&ce->e_indexes[n].o_list);
++      }
++}
++
++
++static inline void
++__mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask)
++{
++      struct mb_cache *cache = ce->e_cache;
++
++      mb_assert(atomic_read(&ce->e_used) == 0);
++      if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) {
++              /* free failed -- put back on the lru list
++                 for freeing later. */
++              spin_lock(&mb_cache_spinlock);
++              list_add(&ce->e_lru_list, &mb_cache_lru_list);
++              spin_unlock(&mb_cache_spinlock);
++      } else {
++              kmem_cache_free(cache->c_entry_cache, ce);
++              atomic_dec(&cache->c_entry_count);
++      }
++}
++
++
++static inline void
++__mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
++{
++      if (atomic_dec_and_test(&ce->e_used)) {
++              if (__mb_cache_entry_is_hashed(ce))
++                      list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
++              else {
++                      spin_unlock(&mb_cache_spinlock);
++                      __mb_cache_entry_forget(ce, GFP_KERNEL);
++                      return;
++              }
++      }
++      spin_unlock(&mb_cache_spinlock);
++}
++
++
++/*
++ * mb_cache_memory_pressure()  memory pressure callback
++ *
++ * This function is called by the kernel memory management when memory
++ * gets low.
++ *
++ * @priority: Amount by which to shrink the cache (0 = highest priority)
++ * @gfp_mask: (ignored)
++ */
++static void
++mb_cache_memory_pressure(int priority, unsigned int gfp_mask)
++{
++      LIST_HEAD(free_list);
++      struct list_head *l, *ltmp;
++      int count = 0;
++
++      spin_lock(&mb_cache_spinlock);
++      list_for_each(l, &mb_cache_list) {
++              struct mb_cache *cache =
++                      list_entry(l, struct mb_cache, c_cache_list);
++              mb_debug("cache %s (%d)", cache->c_name,
++                        atomic_read(&cache->c_entry_count));
++              count += atomic_read(&cache->c_entry_count);
++      }
++      mb_debug("trying to free %d of %d entries",
++                count / (priority ? priority : 1), count);
++      if (priority)
++              count /= priority;
++      while (count-- && !list_empty(&mb_cache_lru_list)) {
++              struct mb_cache_entry *ce =
++                      list_entry(mb_cache_lru_list.next,
++                                 struct mb_cache_entry, e_lru_list);
++              list_del(&ce->e_lru_list);
++              __mb_cache_entry_unhash(ce);
++              list_add_tail(&ce->e_lru_list, &free_list);
++      }
++      spin_unlock(&mb_cache_spinlock);
++      list_for_each_safe(l, ltmp, &free_list) {
++              __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++                                                 e_lru_list), gfp_mask);
++      }
++}
++
++
++/*
++ * mb_cache_create()  create a new cache
++ *
++ * All entries in one cache are of equal size. Cache entries may be from
++ * multiple devices. If this is the first mbcache created, registers
++ * the cache with kernel memory management. Returns NULL if no more
++ * memory was available.
++ *
++ * @name: name of the cache (informal)
++ * @cache_op: contains the callback called when freeing a cache entry
++ * @entry_size: The size of a cache entry, including
++ *              struct mb_cache_entry
++ * @indexes_count: number of additional indexes in the cache. Must equal
++ *                 MB_CACHE_INDEXES_COUNT if the number of indexes is
++ *                 hardwired.
++ * @bucket_count: number of hash buckets
++ */
++struct mb_cache *
++mb_cache_create(const char *name, struct mb_cache_op *cache_op,
++              size_t entry_size, int indexes_count, int bucket_count)
++{
++      int m=0, n;
++      struct mb_cache *cache = NULL;
++
++      if (entry_size < sizeof(struct mb_cache_entry) +
++         indexes_count * sizeof(struct mb_cache_entry_index))
++              return NULL;
++
++      MOD_INC_USE_COUNT;
++      cache = kmalloc(sizeof(struct mb_cache) +
++                      indexes_count * sizeof(struct list_head), GFP_KERNEL);
++      if (!cache)
++              goto fail;
++      cache->c_name = name;
++      cache->c_op.free = NULL;
++      if (cache_op)
++              cache->c_op.free = cache_op->free;
++      atomic_set(&cache->c_entry_count, 0);
++      cache->c_bucket_count = bucket_count;
++#ifdef MB_CACHE_INDEXES_COUNT
++      mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT);
++#else
++      cache->c_indexes_count = indexes_count;
++#endif
++      cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
++                                    GFP_KERNEL);
++      if (!cache->c_block_hash)
++              goto fail;
++      for (n=0; n<bucket_count; n++)
++              INIT_LIST_HEAD(&cache->c_block_hash[n]);
++      for (m=0; m<indexes_count; m++) {
++              cache->c_indexes_hash[m] = kmalloc(bucket_count *
++                                               sizeof(struct list_head),
++                                               GFP_KERNEL);
++              if (!cache->c_indexes_hash[m])
++                      goto fail;
++              for (n=0; n<bucket_count; n++)
++                      INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]);
++      }
++      cache->c_entry_cache = kmem_cache_create(name, entry_size, 0,
++              0 /*SLAB_POISON | SLAB_RED_ZONE*/, NULL, NULL);
++      if (!cache->c_entry_cache)
++              goto fail;
++
++      spin_lock(&mb_cache_spinlock);
++      list_add(&cache->c_cache_list, &mb_cache_list);
++      spin_unlock(&mb_cache_spinlock);
++      return cache;
++
++fail:
++      if (cache) {
++              while (--m >= 0)
++                      kfree(cache->c_indexes_hash[m]);
++              if (cache->c_block_hash)
++                      kfree(cache->c_block_hash);
++              kfree(cache);
++      }
++      MOD_DEC_USE_COUNT;
++      return NULL;
++}
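For illustration only, not part of the patch: init_ext3_xattr() earlier in this diff passes a NULL cache_op, so entries are always freed with kmem_cache_free(). The hedged sketch below shows the other option, supplying a free callback through struct mb_cache_op; all demo_* names and the bucket count are invented.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/mbcache.h>

static struct mb_cache *demo_cache;

/* Hypothetical veto callback: return nonzero if the entry cannot be
 * freed right now; __mb_cache_entry_forget() then parks it on the lru
 * list again instead of calling kmem_cache_free(). */
static int demo_cache_entry_free(struct mb_cache_entry *ce, int gfp_mask)
{
	return 0;	/* nothing pins our entries, always allow freeing */
}

static struct mb_cache_op demo_cache_op = {
	free:	demo_cache_entry_free,
};

static int __init demo_cache_init(void)
{
	/* one additional index (MB_CACHE_INDEXES_COUNT) and 61 buckets,
	 * mirroring the values init_ext3_xattr() uses earlier in this diff */
	demo_cache = mb_cache_create("demo_cache", &demo_cache_op,
				     sizeof(struct mb_cache_entry) +
				     sizeof(struct mb_cache_entry_index),
				     1, 61);
	return demo_cache ? 0 : -ENOMEM;
}

static void __exit demo_cache_exit(void)
{
	if (demo_cache)
		mb_cache_destroy(demo_cache);
}

module_init(demo_cache_init);
module_exit(demo_cache_exit);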
++
++
++/*
++ * mb_cache_shrink()
++ *
++ * Removes all cache entries of a device from the cache. All cache entries
++ * currently in use cannot be freed, and thus remain in the cache.
++ *
++ * @cache: which cache to shrink
++ * @dev: which device's cache entries to shrink
++ */
++void
++mb_cache_shrink(struct mb_cache *cache, kdev_t dev)
++{
++      LIST_HEAD(free_list);
++      struct list_head *l, *ltmp;
++
++      spin_lock(&mb_cache_spinlock);
++      list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
++              struct mb_cache_entry *ce =
++                      list_entry(l, struct mb_cache_entry, e_lru_list);
++              if (ce->e_dev == dev) {
++                      list_del(&ce->e_lru_list);
++                      list_add_tail(&ce->e_lru_list, &free_list);
++                      __mb_cache_entry_unhash(ce);
++              }
++      }
++      spin_unlock(&mb_cache_spinlock);
++      list_for_each_safe(l, ltmp, &free_list) {
++              __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++                                                 e_lru_list), GFP_KERNEL);
++      }
++}
++
++
++/*
++ * mb_cache_destroy()
++ *
++ * Shrinks the cache to its minimum possible size (hopefully 0 entries),
++ * and then destroys it. If this was the last mbcache, un-registers the
++ * mbcache from kernel memory management.
++ */
++void
++mb_cache_destroy(struct mb_cache *cache)
++{
++      LIST_HEAD(free_list);
++      struct list_head *l, *ltmp;
++      int n;
++
++      spin_lock(&mb_cache_spinlock);
++      list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
++              struct mb_cache_entry *ce =
++                      list_entry(l, struct mb_cache_entry, e_lru_list);
++              if (ce->e_cache == cache) {
++                      list_del(&ce->e_lru_list);
++                      list_add_tail(&ce->e_lru_list, &free_list);
++                      __mb_cache_entry_unhash(ce);
++              }
++      }
++      list_del(&cache->c_cache_list);
++      spin_unlock(&mb_cache_spinlock);
++      list_for_each_safe(l, ltmp, &free_list) {
++              __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++                                                 e_lru_list), GFP_KERNEL);
++      }
++
++      if (atomic_read(&cache->c_entry_count) > 0) {
++              mb_error("cache %s: %d orphaned entries",
++                        cache->c_name,
++                        atomic_read(&cache->c_entry_count));
++      }
++
++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0))
++      /* We don't have kmem_cache_destroy() in 2.2.x */
++      kmem_cache_shrink(cache->c_entry_cache);
++#else
++      kmem_cache_destroy(cache->c_entry_cache);
++#endif
++      for (n=0; n < mb_cache_indexes(cache); n++)
++              kfree(cache->c_indexes_hash[n]);
++      kfree(cache->c_block_hash);
++      kfree(cache);
++
++      MOD_DEC_USE_COUNT;
++}
++
++
++/*
++ * mb_cache_entry_alloc()
++ *
++ * Allocates a new cache entry. The new entry will not be valid initially,
++ * and thus cannot be looked up yet. It should be filled with data, and
++ * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
++ * if no more memory was available.
++ */
++struct mb_cache_entry *
++mb_cache_entry_alloc(struct mb_cache *cache)
++{
++      struct mb_cache_entry *ce;
++
++      atomic_inc(&cache->c_entry_count);
++      ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL);
++      if (ce) {
++              INIT_LIST_HEAD(&ce->e_lru_list);
++              INIT_LIST_HEAD(&ce->e_block_list);
++              ce->e_cache = cache;
++              atomic_set(&ce->e_used, 1);
++      }
++      return ce;
++}
++
++
++/*
++ * mb_cache_entry_insert()
++ *
++ * Inserts an entry that was allocated using mb_cache_entry_alloc() into
++ * the cache. After this, the cache entry can be looked up, but is not yet
++ * in the lru list as the caller still holds a handle to it. Returns 0 on
++ * success, or -EBUSY if a cache entry for that device + inode exists
++ * already (this may happen after a failed lookup, if another process has
++ * inserted the same cache entry in the meantime).
++ *
++ * @dev: device the cache entry belongs to
++ * @block: block number
++ * @keys: array of additional keys. There must be indexes_count entries
++ *        in the array (as specified when creating the cache).
++ */
++int
++mb_cache_entry_insert(struct mb_cache_entry *ce, kdev_t dev,
++                    unsigned long block, unsigned int keys[])
++{
++      struct mb_cache *cache = ce->e_cache;
++      unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count;
++      struct list_head *l;
++      int error = -EBUSY, n;
++
++      spin_lock(&mb_cache_spinlock);
++      list_for_each(l, &cache->c_block_hash[bucket]) {
++              struct mb_cache_entry *ce =
++                      list_entry(l, struct mb_cache_entry, e_block_list);
++              if (ce->e_dev == dev && ce->e_block == block)
++                      goto out;
++      }
++      __mb_cache_entry_unhash(ce);
++      ce->e_dev = dev;
++      ce->e_block = block;
++      list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
++      for (n=0; n<mb_cache_indexes(cache); n++) {
++              ce->e_indexes[n].o_key = keys[n];
++              bucket = keys[n] % cache->c_bucket_count;
++              list_add(&ce->e_indexes[n].o_list,
++                       &cache->c_indexes_hash[n][bucket]);
++      }
++      error = 0;
++out:
++      spin_unlock(&mb_cache_spinlock);
++      return error;
++}
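For illustration only, not part of the patch: a hedged sketch of the allocate-and-insert path described in the comment above. All demo_* names are invented; on -EBUSY another thread inserted the same device/block first, so the freshly allocated entry is simply dropped again.

#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/mbcache.h>

/* Hedged helper (invented): cache one device/block pair under a single
 * index key. */
static int demo_cache_insert(struct mb_cache *cache, kdev_t dev,
			     unsigned long block, unsigned int key)
{
	struct mb_cache_entry *ce;
	unsigned int keys[1] = { key };	/* MB_CACHE_INDEXES_COUNT is 1 */
	int error;

	ce = mb_cache_entry_alloc(cache);
	if (!ce)
		return -ENOMEM;
	error = mb_cache_entry_insert(ce, dev, block, keys);
	if (error) {
		/* -EBUSY: someone inserted dev/block first; drop our copy */
		mb_cache_entry_free(ce);
		return error == -EBUSY ? 0 : error;
	}
	/* valid and findable now; releasing moves it onto the lru list */
	mb_cache_entry_release(ce);
	return 0;
}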
++
++
++/*
++ * mb_cache_entry_release()
++ *
++ * Release a handle to a cache entry. When the last handle to a cache entry
++ * is released it is either freed (if it is invalid) or otherwise inserted
++ * into the lru list.
++ */
++void
++mb_cache_entry_release(struct mb_cache_entry *ce)
++{
++      spin_lock(&mb_cache_spinlock);
++      __mb_cache_entry_release_unlock(ce);
++}
++
++
++/*
++ * mb_cache_entry_takeout()
++ *
++ * Take a cache entry out of the cache, making it invalid. The entry can later
++ * be re-inserted using mb_cache_entry_insert(), or released using
++ * mb_cache_entry_release().
++ */
++void
++mb_cache_entry_takeout(struct mb_cache_entry *ce)
++{
++      spin_lock(&mb_cache_spinlock);
++      mb_assert(list_empty(&ce->e_lru_list));
++      __mb_cache_entry_unhash(ce);
++      spin_unlock(&mb_cache_spinlock);
++}
++
++
++/*
++ * mb_cache_entry_free()
++ *
++ * This is equivalent to the sequence mb_cache_entry_takeout() --
++ * mb_cache_entry_release().
++ */
++void
++mb_cache_entry_free(struct mb_cache_entry *ce)
++{
++      spin_lock(&mb_cache_spinlock);
++      mb_assert(list_empty(&ce->e_lru_list));
++      __mb_cache_entry_unhash(ce);
++      __mb_cache_entry_release_unlock(ce);
++}
++
++
++/*
++ * mb_cache_entry_dup()
++ *
++ * Duplicate a handle to a cache entry (does not duplicate the cache entry
++ * itself). After the call, both the old and the new handle must be released.
++ */
++struct mb_cache_entry *
++mb_cache_entry_dup(struct mb_cache_entry *ce)
++{
++      atomic_inc(&ce->e_used);
++      return ce;
++}
++
++
++/*
++ * mb_cache_entry_get()
++ *
++ * Get a cache entry by device / block number. (There can only be one entry
++ * in the cache per device and block.) Returns NULL if no such cache entry
++ * exists.
++ */
++struct mb_cache_entry *
++mb_cache_entry_get(struct mb_cache *cache, kdev_t dev, unsigned long block)
++{
++      unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count;
++      struct list_head *l;
++      struct mb_cache_entry *ce;
++
++      spin_lock(&mb_cache_spinlock);
++      list_for_each(l, &cache->c_block_hash[bucket]) {
++              ce = list_entry(l, struct mb_cache_entry, e_block_list);
++              if (ce->e_dev == dev && ce->e_block == block) {
++                      if (!list_empty(&ce->e_lru_list))
++                              list_del_init(&ce->e_lru_list);
++                      atomic_inc(&ce->e_used);
++                      goto cleanup;
++              }
++      }
++      ce = NULL;
++
++cleanup:
++      spin_unlock(&mb_cache_spinlock);
++      return ce;
++}
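For illustration only, not part of the patch: a minimal lookup sketch for the device/block interface above; demo_cache_contains() is an invented wrapper.

#include <linux/fs.h>
#include <linux/mbcache.h>

/* Hedged helper (invented): report whether dev/block is currently cached. */
static int demo_cache_contains(struct mb_cache *cache, kdev_t dev,
			       unsigned long block)
{
	struct mb_cache_entry *ce;

	ce = mb_cache_entry_get(cache, dev, block);
	if (!ce)
		return 0;
	/* the returned handle pins the entry off the lru list; drop it */
	mb_cache_entry_release(ce);
	return 1;
}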
++
++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
++
++static struct mb_cache_entry *
++__mb_cache_entry_find(struct list_head *l, struct list_head *head,
++                    int index, kdev_t dev, unsigned int key)
++{
++      while (l != head) {
++              struct mb_cache_entry *ce =
++                      list_entry(l, struct mb_cache_entry,
++                                 e_indexes[index].o_list);
++              if (ce->e_dev == dev && ce->e_indexes[index].o_key == key) {
++                      if (!list_empty(&ce->e_lru_list))
++                              list_del_init(&ce->e_lru_list);
++                      atomic_inc(&ce->e_used);
++                      return ce;
++              }
++              l = l->next;
++      }
++      return NULL;
++}
++
++
++/*
++ * mb_cache_entry_find_first()
++ *
++ * Find the first cache entry on a given device with a certain key in
++ * an additional index. Additional matches can be found with
++ * mb_cache_entry_find_next(). Returns NULL if no match was found.
++ *
++ * @cache: the cache to search
++ * @index: the number of the additional index to search (0<=index<indexes_count)
++ * @dev: the device the cache entry should belong to
++ * @key: the key in the index
++ */
++struct mb_cache_entry *
++mb_cache_entry_find_first(struct mb_cache *cache, int index, kdev_t dev,
++                        unsigned int key)
++{
++      unsigned int bucket = key % cache->c_bucket_count;
++      struct list_head *l;
++      struct mb_cache_entry *ce;
++
++      mb_assert(index < mb_cache_indexes(cache));
++      spin_lock(&mb_cache_spinlock);
++      l = cache->c_indexes_hash[index][bucket].next;
++      ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
++                                 index, dev, key);
++      spin_unlock(&mb_cache_spinlock);
++      return ce;
++}
++
++
++/*
++ * mb_cache_entry_find_next()
++ *
++ * Find the next cache entry on a given device with a certain key in an
++ * additional index. Returns NULL if no match could be found. The previous
++ * entry is automatically released, so that mb_cache_entry_find_next() can
++ * be called like this:
++ *
++ * entry = mb_cache_entry_find_first();
++ * while (entry) {
++ *    ...
++ *    entry = mb_cache_entry_find_next(entry, ...);
++ * }
++ *
++ * @prev: The previous match
++ * @index: the number of the additional index to search (0<=index<indexes_count)
++ * @dev: the device the cache entry should belong to
++ * @key: the key in the index
++ */
++struct mb_cache_entry *
++mb_cache_entry_find_next(struct mb_cache_entry *prev, int index, kdev_t dev,
++                       unsigned int key)
++{
++      struct mb_cache *cache = prev->e_cache;
++      unsigned int bucket = key % cache->c_bucket_count;
++      struct list_head *l;
++      struct mb_cache_entry *ce;
++
++      mb_assert(index < mb_cache_indexes(cache));
++      spin_lock(&mb_cache_spinlock);
++      l = prev->e_indexes[index].o_list.next;
++      ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
++                                 index, dev, key);
++      __mb_cache_entry_release_unlock(prev);
++      return ce;
++}
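For illustration only, not part of the patch: the loop sketched in the comment above, fleshed out as a hedged helper. demo_cache_lookup() and the match predicate are invented; note that mb_cache_entry_find_next() releases the previous handle, so only a returned non-NULL entry still needs an explicit mb_cache_entry_release() by the caller.

#include <linux/fs.h>
#include <linux/mbcache.h>

/* Hedged helper (invented): return the first cached entry on `dev' whose
 * index-0 key equals `key' and which satisfies the caller's predicate.
 * The caller must release the returned handle. */
static struct mb_cache_entry *
demo_cache_lookup(struct mb_cache *cache, kdev_t dev, unsigned int key,
		  int (*match)(struct mb_cache_entry *))
{
	struct mb_cache_entry *ce;

	ce = mb_cache_entry_find_first(cache, 0, dev, key);
	while (ce) {
		if (match(ce))
			return ce;
		/* releases the handle on `ce' before moving on */
		ce = mb_cache_entry_find_next(ce, 0, dev, key);
	}
	return NULL;
}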
++
++#endif  /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */
++
++static int __init init_mbcache(void)
++{
++      register_cache(&mb_cache_definition);
++      return 0;
++}
++
++static void __exit exit_mbcache(void)
++{
++      unregister_cache(&mb_cache_definition);
++}
++
++module_init(init_mbcache)
++module_exit(exit_mbcache)
++
+--- linux/include/asm-arm/unistd.h~linux-2.4.20-xattr-0.8.54-hp        Fri Aug  2 17:39:45 2002
++++ linux-mmonroe/include/asm-arm/unistd.h     Fri May 16 08:43:01 2003
+@@ -244,7 +244,6 @@
+ #define __NR_security                 (__NR_SYSCALL_BASE+223)
+ #define __NR_gettid                   (__NR_SYSCALL_BASE+224)
+ #define __NR_readahead                        (__NR_SYSCALL_BASE+225)
+-#if 0 /* allocated in 2.5 */
+ #define __NR_setxattr                 (__NR_SYSCALL_BASE+226)
+ #define __NR_lsetxattr                        (__NR_SYSCALL_BASE+227)
+ #define __NR_fsetxattr                        (__NR_SYSCALL_BASE+228)
+@@ -257,7 +256,6 @@
+ #define __NR_removexattr              (__NR_SYSCALL_BASE+235)
+ #define __NR_lremovexattr             (__NR_SYSCALL_BASE+236)
+ #define __NR_fremovexattr             (__NR_SYSCALL_BASE+237)
+-#endif
+ #define __NR_tkill                    (__NR_SYSCALL_BASE+238)
+ /*
+  * Please check 2.5 _before_ adding calls here,
+--- linux/include/asm-ppc64/unistd.h~linux-2.4.20-xattr-0.8.54-hp      Fri Aug  2 17:39:45 2002
++++ linux-mmonroe/include/asm-ppc64/unistd.h   Fri May 16 08:43:01 2003
+@@ -218,6 +218,7 @@
+ #define __NR_gettid           207
+ #if 0 /* Reserved syscalls */
+ #define __NR_tkill            208
++#endif
+ #define __NR_setxattr         209
+ #define __NR_lsetxattr                210
+ #define __NR_fsetxattr                211
+@@ -230,6 +231,7 @@
+ #define __NR_removexattr      218
+ #define __NR_lremovexattr     219
+ #define __NR_fremovexattr     220
++#if 0 /* Reserved syscalls */
+ #define __NR_futex            221
+ #endif
+--- linux/include/asm-s390/unistd.h~linux-2.4.20-xattr-0.8.54-hp       Fri Aug  2 17:39:45 2002
++++ linux-mmonroe/include/asm-s390/unistd.h    Fri May 16 08:43:01 2003
+@@ -212,9 +212,18 @@
+ #define __NR_madvise            219
+ #define __NR_getdents64               220
+ #define __NR_fcntl64          221
+-/*
+- * Numbers 224-235 are reserved for posix acl
+- */
++#define __NR_setxattr         224
++#define __NR_lsetxattr                225
++#define __NR_fsetxattr                226
++#define __NR_getxattr         227
++#define __NR_lgetxattr                228
++#define __NR_fgetxattr                229
++#define __NR_listxattr                230
++#define __NR_llistxattr               231
++#define __NR_flistxattr               232
++#define __NR_removexattr      233
++#define __NR_lremovexattr     234
++#define __NR_fremovexattr     235
+ #define __NR_gettid           236
+ #define __NR_tkill            237
+--- linux/include/asm-s390x/unistd.h~linux-2.4.20-xattr-0.8.54-hp      Fri Aug  2 17:39:45 2002
++++ linux-mmonroe/include/asm-s390x/unistd.h   Fri May 16 08:43:01 2003
+@@ -180,9 +180,18 @@
+ #define __NR_pivot_root         217
+ #define __NR_mincore            218
+ #define __NR_madvise            219
+-/*
+- * Numbers 224-235 are reserved for posix acl
+- */
++#define __NR_setxattr         224
++#define __NR_lsetxattr                225
++#define __NR_fsetxattr                226
++#define __NR_getxattr         227
++#define __NR_lgetxattr                228
++#define __NR_fgetxattr                229
++#define __NR_listxattr                230
++#define __NR_llistxattr               231
++#define __NR_flistxattr               232
++#define __NR_removexattr      233
++#define __NR_lremovexattr     234
++#define __NR_fremovexattr     235
+ #define __NR_gettid           236
+ #define __NR_tkill            237
+--- linux/include/asm-sparc/unistd.h~linux-2.4.20-xattr-0.8.54-hp      Fri Aug  2 17:39:45 2002
++++ linux-mmonroe/include/asm-sparc/unistd.h   Fri May 16 08:43:01 2003
+@@ -184,24 +184,24 @@
+ /* #define __NR_exportfs        166    SunOS Specific                              */
+ #define __NR_mount              167 /* Common                                      */
+ #define __NR_ustat              168 /* Common                                      */
+-/* #define __NR_semsys          169    SunOS Specific                              */
+-/* #define __NR_msgsys          170    SunOS Specific                              */
+-/* #define __NR_shmsys          171    SunOS Specific                              */
+-/* #define __NR_auditsys        172    SunOS Specific                              */
+-/* #define __NR_rfssys          173    SunOS Specific                              */
++#define __NR_setxattr           169 /* SunOS: semsys                               */
++#define __NR_lsetxattr          170 /* SunOS: msgsys                               */
++#define __NR_fsetxattr          171 /* SunOS: shmsys                               */
++#define __NR_getxattr           172 /* SunOS: auditsys                             */
++#define __NR_lgetxattr          173 /* SunOS: rfssys                               */
+ #define __NR_getdents           174 /* Common                                      */
+ #define __NR_setsid             175 /* Common                                      */
+ #define __NR_fchdir             176 /* Common                                      */
+-/* #define __NR_fchroot         177    SunOS Specific                              */
+-/* #define __NR_vpixsys         178    SunOS Specific                              */
+-/* #define __NR_aioread         179    SunOS Specific                              */
+-/* #define __NR_aiowrite        180    SunOS Specific                              */
+-/* #define __NR_aiowait         181    SunOS Specific                              */
+-/* #define __NR_aiocancel       182    SunOS Specific                              */
++#define __NR_fgetxattr          177 /* SunOS: fchroot                              */
++#define __NR_listxattr          178 /* SunOS: vpixsys                              */
++#define __NR_llistxattr         179 /* SunOS: aioread                              */
++#define __NR_flistxattr         180 /* SunOS: aiowrite                             */
++#define __NR_removexattr        181 /* SunOS: aiowait                              */
++#define __NR_lremovexattr       182 /* SunOS: aiocancel                            */
+ #define __NR_sigpending         183 /* Common                                      */
+ #define __NR_query_module     184 /* Linux Specific                              */
+ #define __NR_setpgid            185 /* Common                                      */
+-/* #define __NR_pathconf        186    SunOS Specific                              */
++#define __NR_fremovexattr       186 /* SunOS: pathconf                             */
+ #define __NR_tkill              187 /* SunOS: fpathconf                            */
+ /* #define __NR_sysconf         188    SunOS Specific                              */
+ #define __NR_uname              189 /* Linux Specific                              */
+--- linux/include/asm-sparc64/unistd.h~linux-2.4.20-xattr-0.8.54-hp    Fri Aug  2 17:39:45 2002
++++ linux-mmonroe/include/asm-sparc64/unistd.h Fri May 16 08:43:01 2003
+@@ -184,24 +184,24 @@
+ /* #define __NR_exportfs        166    SunOS Specific                              */
+ #define __NR_mount              167 /* Common                                      */
+ #define __NR_ustat              168 /* Common                                      */
+-/* #define __NR_semsys          169    SunOS Specific                              */
+-/* #define __NR_msgsys          170    SunOS Specific                              */
+-/* #define __NR_shmsys          171    SunOS Specific                              */
+-/* #define __NR_auditsys        172    SunOS Specific                              */
+-/* #define __NR_rfssys          173    SunOS Specific                              */
++#define __NR_setxattr           169 /* SunOS: semsys                               */
++#define __NR_lsetxattr          170 /* SunOS: msgsys                               */
++#define __NR_fsetxattr          171 /* SunOS: shmsys                               */
++#define __NR_getxattr           172 /* SunOS: auditsys                             */
++#define __NR_lgetxattr          173 /* SunOS: rfssys                               */
+ #define __NR_getdents           174 /* Common                                      */
+ #define __NR_setsid             175 /* Common                                      */
+ #define __NR_fchdir             176 /* Common                                      */
+-/* #define __NR_fchroot         177    SunOS Specific                              */
+-/* #define __NR_vpixsys         178    SunOS Specific                              */
+-/* #define __NR_aioread         179    SunOS Specific                              */
+-/* #define __NR_aiowrite        180    SunOS Specific                              */
+-/* #define __NR_aiowait         181    SunOS Specific                              */
+-/* #define __NR_aiocancel       182    SunOS Specific                              */
++#define __NR_fgetxattr          177 /* SunOS: fchroot                              */
++#define __NR_listxattr          178 /* SunOS: vpixsys                              */
++#define __NR_llistxattr         179 /* SunOS: aioread                              */
++#define __NR_flistxattr         180 /* SunOS: aiowrite                             */
++#define __NR_removexattr        181 /* SunOS: aiowait                              */
++#define __NR_lremovexattr       182 /* SunOS: aiocancel                            */
+ #define __NR_sigpending         183 /* Common                                      */
+ #define __NR_query_module     184 /* Linux Specific                              */
+ #define __NR_setpgid            185 /* Common                                      */
+-/* #define __NR_pathconf        186    SunOS Specific                              */
++#define __NR_fremovexattr       186 /* SunOS: pathconf                             */
+ #define __NR_tkill              187 /* SunOS: fpathconf                            */
+ /* #define __NR_sysconf         188    SunOS Specific                              */
+ #define __NR_uname              189 /* Linux Specific                              */
+--- /dev/null  Mon May 20 21:11:23 2002
++++ linux-mmonroe/include/linux/cache_def.h    Fri May 16 08:43:01 2003
+@@ -0,0 +1,15 @@
++/*
++ * linux/cache_def.h
++ * Handling of caches defined in drivers, filesystems, ...
++ *
++ * Copyright (C) 2002 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++struct cache_definition {
++      const char *name;
++      void (*shrink)(int, unsigned int);
++      struct list_head link;
++};
++
++extern void register_cache(struct cache_definition *);
++extern void unregister_cache(struct cache_definition *);
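For illustration only, not part of the patch: a hedged sketch of how a subsystem would hook into this interface. The demo_* names and the empty shrink body are invented; the real user added by this patch is mbcache, which registers mb_cache_definition from init_mbcache() as shown earlier in this diff.

#include <linux/list.h>
#include <linux/cache_def.h>

/* Hypothetical shrink callback: a lower priority value asks for a more
 * aggressive shrink (mb_cache_memory_pressure() above divides its entry
 * count by priority, treating 0 as 1). */
static void demo_shrink(int priority, unsigned int gfp_mask)
{
	/* drop some fraction of the privately cached objects here */
}

static struct cache_definition demo_cache_def = {
	name:	"demo",
	shrink:	demo_shrink,
};

/* register_cache(&demo_cache_def) at module init,
 * unregister_cache(&demo_cache_def) at module exit. */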
+--- linux/include/linux/errno.h~linux-2.4.20-xattr-0.8.54-hp   Fri Feb  9 14:46:13 2001
++++ linux-mmonroe/include/linux/errno.h        Fri May 16 08:43:01 2003
+@@ -23,4 +23,8 @@
+ #endif
++/* Defined for extended attributes */
++#define ENOATTR ENODATA               /* No such attribute */
++#define ENOTSUP EOPNOTSUPP    /* Operation not supported */
++
+ #endif
+--- linux/include/linux/ext2_fs.h~linux-2.4.20-xattr-0.8.54-hp Thu Nov 22 11:46:52 2001
++++ linux-mmonroe/include/linux/ext2_fs.h      Fri May 16 08:43:01 2003
+@@ -57,8 +57,6 @@
+  */
+ #define       EXT2_BAD_INO             1      /* Bad blocks inode */
+ #define EXT2_ROOT_INO          2      /* Root inode */
+-#define EXT2_ACL_IDX_INO       3      /* ACL inode */
+-#define EXT2_ACL_DATA_INO      4      /* ACL inode */
+ #define EXT2_BOOT_LOADER_INO   5      /* Boot loader inode */
+ #define EXT2_UNDEL_DIR_INO     6      /* Undelete directory inode */
+@@ -86,7 +84,6 @@
+ #else
+ # define EXT2_BLOCK_SIZE(s)           (EXT2_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+ #endif
+-#define EXT2_ACLE_PER_BLOCK(s)                (EXT2_BLOCK_SIZE(s) / sizeof (struct ext2_acl_entry))
+ #define       EXT2_ADDR_PER_BLOCK(s)          (EXT2_BLOCK_SIZE(s) / sizeof (__u32))
+ #ifdef __KERNEL__
+ # define EXT2_BLOCK_SIZE_BITS(s)      ((s)->s_blocksize_bits)
+@@ -121,28 +118,6 @@
+ #endif
+ /*
+- * ACL structures
+- */
+-struct ext2_acl_header        /* Header of Access Control Lists */
+-{
+-      __u32   aclh_size;
+-      __u32   aclh_file_count;
+-      __u32   aclh_acle_count;
+-      __u32   aclh_first_acle;
+-};
+-
+-struct ext2_acl_entry /* Access Control List Entry */
+-{
+-      __u32   acle_size;
+-      __u16   acle_perms;     /* Access permissions */
+-      __u16   acle_type;      /* Type of entry */
+-      __u16   acle_tag;       /* User or group identity */
+-      __u16   acle_pad1;
+-      __u32   acle_next;      /* Pointer on next entry for the */
+-                                      /* same inode or on next free entry */
+-};
+-
+-/*
+  * Structure of a blocks group descriptor
+  */
+ struct ext2_group_desc
+@@ -314,6 +289,7 @@ struct ext2_inode {
+ #define EXT2_MOUNT_ERRORS_PANIC               0x0040  /* Panic on errors */
+ #define EXT2_MOUNT_MINIX_DF           0x0080  /* Mimics the Minix statfs */
+ #define EXT2_MOUNT_NO_UID32           0x0200  /* Disable 32-bit UIDs */
++#define EXT2_MOUNT_XATTR_USER         0x4000  /* Extended user attributes */
+ #define clear_opt(o, opt)             o &= ~EXT2_MOUNT_##opt
+ #define set_opt(o, opt)                       o |= EXT2_MOUNT_##opt
+@@ -397,6 +373,7 @@ struct ext2_super_block {
+ #ifdef __KERNEL__
+ #define EXT2_SB(sb)   (&((sb)->u.ext2_sb))
++#define EXT2_I(inode) (&((inode)->u.ext2_i))
+ #else
+ /* Assume that user mode programs are passing in an ext2fs superblock, not
+  * a kernel struct super_block.  This will allow us to call the feature-test
+@@ -466,7 +443,7 @@ struct ext2_super_block {
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV     0x0008
+ #define EXT2_FEATURE_INCOMPAT_ANY             0xffffffff
+-#define EXT2_FEATURE_COMPAT_SUPP      0
++#define EXT2_FEATURE_COMPAT_SUPP      EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT2_FEATURE_INCOMPAT_SUPP    EXT2_FEATURE_INCOMPAT_FILETYPE
+ #define EXT2_FEATURE_RO_COMPAT_SUPP   (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+                                        EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \
+@@ -623,8 +600,10 @@ extern struct address_space_operations e
+ /* namei.c */
+ extern struct inode_operations ext2_dir_inode_operations;
++extern struct inode_operations ext2_special_inode_operations;
+ /* symlink.c */
++extern struct inode_operations ext2_symlink_inode_operations;
+ extern struct inode_operations ext2_fast_symlink_inode_operations;
+ #endif        /* __KERNEL__ */
+--- /dev/null  Mon May 20 21:11:23 2002
++++ linux-mmonroe/include/linux/ext2_xattr.h   Fri May 16 08:43:01 2003
+@@ -0,0 +1,157 @@
++/*
++  File: linux/ext2_xattr.h
++
++  On-disk format of extended attributes for the ext2 filesystem.
++
++  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++#include <linux/config.h>
++#include <linux/init.h>
++#include <linux/xattr.h>
++
++/* Magic value in attribute blocks */
++#define EXT2_XATTR_MAGIC              0xEA020000
++
++/* Maximum number of references to one attribute block */
++#define EXT2_XATTR_REFCOUNT_MAX               1024
++
++/* Name indexes */
++#define EXT2_XATTR_INDEX_MAX                  10
++#define EXT2_XATTR_INDEX_USER                 1
++#define EXT2_XATTR_INDEX_POSIX_ACL_ACCESS     2
++#define EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT    3
++
++struct ext2_xattr_header {
++      __u32   h_magic;        /* magic number for identification */
++      __u32   h_refcount;     /* reference count */
++      __u32   h_blocks;       /* number of disk blocks used */
++      __u32   h_hash;         /* hash value of all attributes */
++      __u32   h_reserved[4];  /* zero right now */
++};
++
++struct ext2_xattr_entry {
++      __u8    e_name_len;     /* length of name */
++      __u8    e_name_index;   /* attribute name index */
++      __u16   e_value_offs;   /* offset in disk block of value */
++      __u32   e_value_block;  /* disk block attribute is stored on (n/i) */
++      __u32   e_value_size;   /* size of attribute value */
++      __u32   e_hash;         /* hash value of name and value */
++      char    e_name[0];      /* attribute name */
++};
++
++#define EXT2_XATTR_PAD_BITS           2
++#define EXT2_XATTR_PAD                (1<<EXT2_XATTR_PAD_BITS)
++#define EXT2_XATTR_ROUND              (EXT2_XATTR_PAD-1)
++#define EXT2_XATTR_LEN(name_len) \
++      (((name_len) + EXT2_XATTR_ROUND + \
++      sizeof(struct ext2_xattr_entry)) & ~EXT2_XATTR_ROUND)
++#define EXT2_XATTR_NEXT(entry) \
++      ( (struct ext2_xattr_entry *)( \
++        (char *)(entry) + EXT2_XATTR_LEN((entry)->e_name_len)) )
++#define EXT2_XATTR_SIZE(size) \
++      (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)
++
++#ifdef __KERNEL__
++
++# ifdef CONFIG_EXT2_FS_XATTR
++
++struct ext2_xattr_handler {
++      char *prefix;
++      size_t (*list)(char *list, struct inode *inode, const char *name,
++                     int name_len);
++      int (*get)(struct inode *inode, const char *name, void *buffer,
++                 size_t size);
++      int (*set)(struct inode *inode, const char *name, const void *buffer,
++                 size_t size, int flags);
++};
++
++extern int ext2_xattr_register(int, struct ext2_xattr_handler *);
++extern void ext2_xattr_unregister(int, struct ext2_xattr_handler *);
++
++extern int ext2_setxattr(struct dentry *, const char *, const void *, size_t, int);
++extern ssize_t ext2_getxattr(struct dentry *, const char *, void *, size_t);
++extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
++extern int ext2_removexattr(struct dentry *, const char *);
++
++extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t);
++extern int ext2_xattr_list(struct inode *, char *, size_t);
++extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
++
++extern void ext2_xattr_delete_inode(struct inode *);
++extern void ext2_xattr_put_super(struct super_block *);
++
++extern int init_ext2_xattr(void) __init;
++extern void exit_ext2_xattr(void);
++
++# else  /* CONFIG_EXT2_FS_XATTR */
++#  define ext2_setxattr               NULL
++#  define ext2_getxattr               NULL
++#  define ext2_listxattr      NULL
++#  define ext2_removexattr    NULL
++
++static inline int
++ext2_xattr_get(struct inode *inode, int name_index,
++             const char *name, void *buffer, size_t size)
++{
++      return -ENOTSUP;
++}
++
++static inline int
++ext2_xattr_list(struct inode *inode, char *buffer, size_t size)
++{
++      return -ENOTSUP;
++}
++
++static inline int
++ext2_xattr_set(struct inode *inode, int name_index, const char *name,
++             const void *value, size_t size, int flags)
++{
++      return -ENOTSUP;
++}
++
++static inline void
++ext2_xattr_delete_inode(struct inode *inode)
++{
++}
++
++static inline void
++ext2_xattr_put_super(struct super_block *sb)
++{
++}
++
++static inline int
++init_ext2_xattr(void)
++{
++      return 0;
++}
++
++static inline void
++exit_ext2_xattr(void)
++{
++}
++
++# endif  /* CONFIG_EXT2_FS_XATTR */
++
++# ifdef CONFIG_EXT2_FS_XATTR_USER
++
++extern int init_ext2_xattr_user(void) __init;
++extern void exit_ext2_xattr_user(void);
++
++# else  /* CONFIG_EXT2_FS_XATTR_USER */
++
++static inline int
++init_ext2_xattr_user(void)
++{
++      return 0;
++}
++
++static inline void
++exit_ext2_xattr_user(void)
++{
++}
++
++# endif  /* CONFIG_EXT2_FS_XATTR_USER */
++
++#endif  /* __KERNEL__ */
++
+--- linux/include/linux/ext3_fs.h~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:42:47 2003
++++ linux-mmonroe/include/linux/ext3_fs.h      Fri May 16 08:43:01 2003
+@@ -63,8 +63,6 @@
+  */
+ #define       EXT3_BAD_INO             1      /* Bad blocks inode */
+ #define EXT3_ROOT_INO          2      /* Root inode */
+-#define EXT3_ACL_IDX_INO       3      /* ACL inode */
+-#define EXT3_ACL_DATA_INO      4      /* ACL inode */
+ #define EXT3_BOOT_LOADER_INO   5      /* Boot loader inode */
+ #define EXT3_UNDEL_DIR_INO     6      /* Undelete directory inode */
+ #define EXT3_RESIZE_INO                7      /* Reserved group descriptors inode */
+@@ -94,7 +92,6 @@
+ #else
+ # define EXT3_BLOCK_SIZE(s)           (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+ #endif
+-#define EXT3_ACLE_PER_BLOCK(s)                (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry))
+ #define       EXT3_ADDR_PER_BLOCK(s)          (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
+ #ifdef __KERNEL__
+ # define EXT3_BLOCK_SIZE_BITS(s)      ((s)->s_blocksize_bits)
+@@ -129,28 +126,6 @@
+ #endif
+ /*
+- * ACL structures
+- */
+-struct ext3_acl_header        /* Header of Access Control Lists */
+-{
+-      __u32   aclh_size;
+-      __u32   aclh_file_count;
+-      __u32   aclh_acle_count;
+-      __u32   aclh_first_acle;
+-};
+-
+-struct ext3_acl_entry /* Access Control List Entry */
+-{
+-      __u32   acle_size;
+-      __u16   acle_perms;     /* Access permissions */
+-      __u16   acle_type;      /* Type of entry */
+-      __u16   acle_tag;       /* User or group identity */
+-      __u16   acle_pad1;
+-      __u32   acle_next;      /* Pointer on next entry for the */
+-                                      /* same inode or on next free entry */
+-};
+-
+-/*
+  * Structure of a blocks group descriptor
+  */
+ struct ext3_group_desc
+@@ -344,6 +319,7 @@ struct ext3_inode {
+   #define EXT3_MOUNT_WRITEBACK_DATA   0x0C00  /* No data ordering */
+ #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
++#define EXT3_MOUNT_XATTR_USER         0x4000  /* Extended user attributes */
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+@@ -520,7 +496,7 @@ struct ext3_super_block {
+ #define EXT3_FEATURE_INCOMPAT_RECOVER         0x0004 /* Needs recovery */
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV     0x0008 /* Journal device */
+-#define EXT3_FEATURE_COMPAT_SUPP      0
++#define EXT3_FEATURE_COMPAT_SUPP      EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT3_FEATURE_INCOMPAT_SUPP    (EXT3_FEATURE_INCOMPAT_FILETYPE| \
+                                        EXT3_FEATURE_INCOMPAT_RECOVER)
+ #define EXT3_FEATURE_RO_COMPAT_SUPP   (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+@@ -703,6 +679,7 @@ extern void ext3_check_inodes_bitmap (st
+ extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
+ /* inode.c */
++extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+@@ -771,8 +748,10 @@ extern struct address_space_operations e
+ /* namei.c */
+ extern struct inode_operations ext3_dir_inode_operations;
++extern struct inode_operations ext3_special_inode_operations;
+ /* symlink.c */
++extern struct inode_operations ext3_symlink_inode_operations;
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
+--- linux/include/linux/ext3_jbd.h~linux-2.4.20-xattr-0.8.54-hp        Fri May 16 08:42:46 2003
++++ linux-mmonroe/include/linux/ext3_jbd.h     Fri May 16 08:43:01 2003
+@@ -30,13 +30,19 @@
+ #define EXT3_SINGLEDATA_TRANS_BLOCKS  8
++/* Extended attributes may touch two data buffers, two bitmap buffers,
++ * and two group descriptors and summaries. */
++
++#define EXT3_XATTR_TRANS_BLOCKS               8
++
+ /* Define the minimum size for a transaction which modifies data.  This
+  * needs to take into account the fact that we may end up modifying two
+  * quota files too (one for the group, one for the user quota).  The
+  * superblock only gets updated once, of course, so don't bother
+  * counting that again for the quota updates. */
+-#define EXT3_DATA_TRANS_BLOCKS                (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2)
++#define EXT3_DATA_TRANS_BLOCKS                (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \
++                                       EXT3_XATTR_TRANS_BLOCKS - 2)
+ extern int ext3_writepage_trans_blocks(struct inode *inode);
+--- /dev/null  Mon May 20 21:11:23 2002
++++ linux-mmonroe/include/linux/ext3_xattr.h   Fri May 16 08:43:01 2003
+@@ -0,0 +1,157 @@
++/*
++  File: linux/ext3_xattr.h
++
++  On-disk format of extended attributes for the ext3 filesystem.
++
++  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++#include <linux/config.h>
++#include <linux/init.h>
++#include <linux/xattr.h>
++
++/* Magic value in attribute blocks */
++#define EXT3_XATTR_MAGIC              0xEA020000
++
++/* Maximum number of references to one attribute block */
++#define EXT3_XATTR_REFCOUNT_MAX               1024
++
++/* Name indexes */
++#define EXT3_XATTR_INDEX_MAX                  10
++#define EXT3_XATTR_INDEX_USER                 1
++#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS     2
++#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT    3
++
++struct ext3_xattr_header {
++      __u32   h_magic;        /* magic number for identification */
++      __u32   h_refcount;     /* reference count */
++      __u32   h_blocks;       /* number of disk blocks used */
++      __u32   h_hash;         /* hash value of all attributes */
++      __u32   h_reserved[4];  /* zero right now */
++};
++
++struct ext3_xattr_entry {
++      __u8    e_name_len;     /* length of name */
++      __u8    e_name_index;   /* attribute name index */
++      __u16   e_value_offs;   /* offset in disk block of value */
++      __u32   e_value_block;  /* disk block attribute is stored on (n/i) */
++      __u32   e_value_size;   /* size of attribute value */
++      __u32   e_hash;         /* hash value of name and value */
++      char    e_name[0];      /* attribute name */
++};
++
++#define EXT3_XATTR_PAD_BITS           2
++#define EXT3_XATTR_PAD                (1<<EXT3_XATTR_PAD_BITS)
++#define EXT3_XATTR_ROUND              (EXT3_XATTR_PAD-1)
++#define EXT3_XATTR_LEN(name_len) \
++      (((name_len) + EXT3_XATTR_ROUND + \
++      sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND)
++#define EXT3_XATTR_NEXT(entry) \
++      ( (struct ext3_xattr_entry *)( \
++        (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) )
++#define EXT3_XATTR_SIZE(size) \
++      (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)
++
++#ifdef __KERNEL__
++
++# ifdef CONFIG_EXT3_FS_XATTR
++
++struct ext3_xattr_handler {
++      char *prefix;
++      size_t (*list)(char *list, struct inode *inode, const char *name,
++                     int name_len);
++      int (*get)(struct inode *inode, const char *name, void *buffer,
++                 size_t size);
++      int (*set)(struct inode *inode, const char *name, const void *buffer,
++                 size_t size, int flags);
++};
++
++extern int ext3_xattr_register(int, struct ext3_xattr_handler *);
++extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *);
++
++extern int ext3_setxattr(struct dentry *, const char *, const void *, size_t, int);
++extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t);
++extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
++extern int ext3_removexattr(struct dentry *, const char *);
++
++extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
++extern int ext3_xattr_list(struct inode *, char *, size_t);
++extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, const void *, size_t, int);
++
++extern void ext3_xattr_delete_inode(handle_t *, struct inode *);
++extern void ext3_xattr_put_super(struct super_block *);
++
++extern int init_ext3_xattr(void) __init;
++extern void exit_ext3_xattr(void);
++
++# else  /* CONFIG_EXT3_FS_XATTR */
++#  define ext3_setxattr               NULL
++#  define ext3_getxattr               NULL
++#  define ext3_listxattr      NULL
++#  define ext3_removexattr    NULL
++
++static inline int
++ext3_xattr_get(struct inode *inode, int name_index, const char *name,
++             void *buffer, size_t size)
++{
++      return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_list(struct inode *inode, void *buffer, size_t size)
++{
++      return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
++             const char *name, const void *value, size_t size, int flags)
++{
++      return -ENOTSUP;
++}
++
++static inline void
++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
++{
++}
++
++static inline void
++ext3_xattr_put_super(struct super_block *sb)
++{
++}
++
++static inline int
++init_ext3_xattr(void)
++{
++      return 0;
++}
++
++static inline void
++exit_ext3_xattr(void)
++{
++}
++
++# endif  /* CONFIG_EXT3_FS_XATTR */
++
++# ifdef CONFIG_EXT3_FS_XATTR_USER
++
++extern int init_ext3_xattr_user(void) __init;
++extern void exit_ext3_xattr_user(void);
++
++# else  /* CONFIG_EXT3_FS_XATTR_USER */
++
++static inline int
++init_ext3_xattr_user(void)
++{
++      return 0;
++}
++
++static inline void
++exit_ext3_xattr_user(void)
++{
++}
++
++#endif  /* CONFIG_EXT3_FS_XATTR_USER */
++
++#endif  /* __KERNEL__ */
++
+--- linux/include/linux/fs.h~linux-2.4.20-xattr-0.8.54-hp      Fri May 16 08:42:46 2003
++++ linux-mmonroe/include/linux/fs.h   Fri May 16 08:43:01 2003
+@@ -909,7 +909,7 @@ struct inode_operations {
+       int (*setattr) (struct dentry *, struct iattr *);
+       int (*setattr_raw) (struct inode *, struct iattr *);
+       int (*getattr) (struct dentry *, struct iattr *);
+-      int (*setxattr) (struct dentry *, const char *, void *, size_t, int);
++      int (*setxattr) (struct dentry *, const char *, const void *, size_t, int);
+       ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+       ssize_t (*listxattr) (struct dentry *, char *, size_t);
+       int (*removexattr) (struct dentry *, const char *);
+--- /dev/null  Mon May 20 21:11:23 2002
++++ linux-mmonroe/include/linux/mbcache.h      Fri May 16 08:43:01 2003
+@@ -0,0 +1,69 @@
++/*
++  File: linux/mbcache.h
++
++  (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++/* Hardwire the number of additional indexes */
++#define MB_CACHE_INDEXES_COUNT 1
++
++struct mb_cache_entry;
++
++struct mb_cache_op {
++      int (*free)(struct mb_cache_entry *, int);
++};
++
++struct mb_cache {
++      struct list_head                c_cache_list;
++      const char                      *c_name;
++      struct mb_cache_op              c_op;
++      atomic_t                        c_entry_count;
++      int                             c_bucket_count;
++#ifndef MB_CACHE_INDEXES_COUNT
++      int                             c_indexes_count;
++#endif
++      kmem_cache_t                    *c_entry_cache;
++      struct list_head                *c_block_hash;
++      struct list_head                *c_indexes_hash[0];
++};
++
++struct mb_cache_entry_index {
++      struct list_head                o_list;
++      unsigned int                    o_key;
++};
++
++struct mb_cache_entry {
++      struct list_head                e_lru_list;
++      struct mb_cache                 *e_cache;
++      atomic_t                        e_used;
++      kdev_t                          e_dev;
++      unsigned long                   e_block;
++      struct list_head                e_block_list;
++      struct mb_cache_entry_index     e_indexes[0];
++};
++
++/* Functions on caches */
++
++struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t,
++                                int, int);
++void mb_cache_shrink(struct mb_cache *, kdev_t);
++void mb_cache_destroy(struct mb_cache *);
++
++/* Functions on cache entries */
++
++struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *);
++int mb_cache_entry_insert(struct mb_cache_entry *, kdev_t, unsigned long,
++                        unsigned int[]);
++void mb_cache_entry_rehash(struct mb_cache_entry *, unsigned int[]);
++void mb_cache_entry_release(struct mb_cache_entry *);
++void mb_cache_entry_takeout(struct mb_cache_entry *);
++void mb_cache_entry_free(struct mb_cache_entry *);
++struct mb_cache_entry *mb_cache_entry_dup(struct mb_cache_entry *);
++struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, kdev_t,
++                                        unsigned long);
++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
++struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, int,
++                                               kdev_t, unsigned int);
++struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, int,
++                                              kdev_t, unsigned int);
++#endif
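
The mbcache.h interface above is declaration-only. As a minimal, hedged sketch (not part of the patch) of how a filesystem might drive this meta block cache: every my_* name below is invented, the meaning of mb_cache_create()'s size and count arguments (entry size, index count, hash bucket count) is inferred from the structures above rather than stated in this header, and mb_cache_entry_insert() is assumed to return 0 on success.

#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/mbcache.h>

static struct mb_cache *my_block_cache;		/* hypothetical cache */

/* Called under memory pressure; returning nonzero lets mbcache drop the entry. */
static int my_block_cache_free(struct mb_cache_entry *ce, int gfp_mask)
{
	return 1;
}

static struct mb_cache_op my_block_cache_op = {
	free:	my_block_cache_free,
};

static int __init my_block_cache_init(void)
{
	my_block_cache = mb_cache_create("my_blocks", &my_block_cache_op,
			sizeof(struct mb_cache_entry) +
			sizeof(struct mb_cache_entry_index),
			1 /* indexes per entry */, 61 /* hash buckets */);
	return my_block_cache ? 0 : -ENOMEM;
}

/* Remember a block under a content hash, then drop the local reference. */
static void my_block_cache_note(kdev_t dev, unsigned long block,
				unsigned int hash)
{
	struct mb_cache_entry *ce = mb_cache_entry_alloc(my_block_cache);
	unsigned int keys[1] = { hash };

	if (!ce)
		return;
	if (mb_cache_entry_insert(ce, dev, block, keys) == 0)	/* 0 == success assumed */
		mb_cache_entry_release(ce);
	else
		mb_cache_entry_free(ce);
}
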
+--- linux/kernel/ksyms.c~linux-2.4.20-xattr-0.8.54-hp  Fri May 16 08:42:45 2003
++++ linux-mmonroe/kernel/ksyms.c       Fri May 16 08:43:52 2003
+@@ -11,6 +11,7 @@
+ #include <linux/config.h>
+ #include <linux/slab.h>
++#include <linux/cache_def.h>
+ #include <linux/module.h>
+ #include <linux/blkdev.h>
+ #include <linux/cdrom.h>
+@@ -106,6 +107,7 @@ EXPORT_SYMBOL(exit_mm);
+ EXPORT_SYMBOL(exit_files);
+ EXPORT_SYMBOL(exit_fs);
+ EXPORT_SYMBOL(exit_sighand);
++EXPORT_SYMBOL(copy_fs_struct);
+ EXPORT_SYMBOL_GPL(make_pages_present);
+ /* internal kernel memory management */
+@@ -126,6 +128,8 @@ EXPORT_SYMBOL(kmem_cache_validate);
+ EXPORT_SYMBOL(kmem_cache_alloc);
+ EXPORT_SYMBOL(kmem_cache_free);
+ EXPORT_SYMBOL(kmem_cache_size);
++EXPORT_SYMBOL(register_cache);
++EXPORT_SYMBOL(unregister_cache);
+ EXPORT_SYMBOL(kmalloc);
+ EXPORT_SYMBOL(kfree);
+ EXPORT_SYMBOL(vfree);
+--- linux/mm/vmscan.c~linux-2.4.20-xattr-0.8.54-hp     Fri May 16 08:39:23 2003
++++ linux-mmonroe/mm/vmscan.c  Fri May 16 08:43:01 2003
+@@ -18,6 +18,7 @@
+ #include <linux/kernel_stat.h>
+ #include <linux/swap.h>
+ #include <linux/swapctl.h>
++#include <linux/cache_def.h>
+ #include <linux/smp_lock.h>
+ #include <linux/pagemap.h>
+ #include <linux/init.h>
+@@ -35,6 +36,39 @@
+  */
+ #define DEF_PRIORITY (6)
++static DECLARE_MUTEX(other_caches_sem);
++static LIST_HEAD(cache_definitions);
++
++void register_cache(struct cache_definition *cache)
++{
++      down(&other_caches_sem);
++      list_add(&cache->link, &cache_definitions);
++      up(&other_caches_sem);
++}
++
++void unregister_cache(struct cache_definition *cache)
++{
++      down(&other_caches_sem);
++      list_del(&cache->link);
++      up(&other_caches_sem);
++}
++
++static void shrink_other_caches(unsigned int priority, int gfp_mask)
++{
++      struct list_head *p;
++
++      if (down_trylock(&other_caches_sem))
++              return;
++
++      list_for_each_prev(p, &cache_definitions) {
++              struct cache_definition *cache =
++                      list_entry(p, struct cache_definition, link);
++
++              cache->shrink(priority, gfp_mask);
++      }
++      up(&other_caches_sem);
++}
++
+ /*
+  * The swap-out function returns 1 if it successfully
+  * scanned all the pages it was asked to (`count').
+@@ -579,6 +613,7 @@ static int shrink_caches(zone_t * classz
+       shrink_dcache_memory(priority, gfp_mask);
+       shrink_icache_memory(priority, gfp_mask);
++      shrink_other_caches(priority, gfp_mask);
+ #ifdef CONFIG_QUOTA
+       shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
+ #endif
+
+_
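
The register_cache()/unregister_cache() exports and the shrink_other_caches() hook added above give external caches a way to join shrink_caches(). A hedged client sketch, assuming struct cache_definition in the cache_def.h header added by this patch carries the link list head and shrink callback that vmscan.c dereferences; the exact callback prototype lives in that header, so the signature below is only inferred from how shrink_other_caches() invokes it.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/cache_def.h>	/* added by this patch */

/* Called from shrink_other_caches(); lower priority means shrink harder. */
static void my_cache_shrink(int priority, unsigned int gfp_mask)
{
	/* release some of this cache's entries here */
}

static struct cache_definition my_cache_def = {
	shrink:	my_cache_shrink,
};

static int __init my_cache_start(void)
{
	register_cache(&my_cache_def);
	return 0;
}

static void __exit my_cache_stop(void)
{
	unregister_cache(&my_cache_def);
}

module_init(my_cache_start);
module_exit(my_cache_stop);
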
diff --git a/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54.patch b/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54.patch
new file mode 100644 (file)
index 0000000..1489989
--- /dev/null
@@ -0,0 +1,5595 @@
+ Documentation/Configure.help  |   66 ++
+ arch/alpha/defconfig          |    7 
+ arch/alpha/kernel/entry.S     |   12 
+ arch/arm/defconfig            |    7 
+ arch/arm/kernel/calls.S       |   24 
+ arch/i386/defconfig           |    7 
+ arch/ia64/defconfig           |    7 
+ arch/ia64/kernel/entry.S      |   24 
+ arch/m68k/defconfig           |    7 
+ arch/mips/defconfig           |    7 
+ arch/mips64/defconfig         |    7 
+ arch/ppc/defconfig            |   14 
+ arch/ppc64/kernel/misc.S      |    2 
+ arch/s390/defconfig           |    7 
+ arch/s390/kernel/entry.S      |   24 
+ arch/s390x/defconfig          |    7 
+ arch/s390x/kernel/entry.S     |   24 
+ arch/s390x/kernel/wrapper32.S |   92 +++
+ arch/sparc/defconfig          |    7 
+ arch/sparc/kernel/systbls.S   |   10 
+ arch/sparc64/defconfig        |    7 
+ arch/sparc64/kernel/systbls.S |   20 
+ fs/Config.in                  |   14 
+ fs/Makefile                   |    3 
+ fs/ext2/Makefile              |    4 
+ fs/ext2/file.c                |    5 
+ fs/ext2/ialloc.c              |    2 
+ fs/ext2/inode.c               |   34 -
+ fs/ext2/namei.c               |   14 
+ fs/ext2/super.c               |   29 
+ fs/ext2/symlink.c             |   14 
+ fs/ext2/xattr.c               | 1212 +++++++++++++++++++++++++++++++++++++++++
+ fs/ext2/xattr_user.c          |  103 +++
+ fs/ext3/Makefile              |   10 
+ fs/ext3/file.c                |    5 
+ fs/ext3/ialloc.c              |    2 
+ fs/ext3/inode.c               |   35 -
+ fs/ext3/namei.c               |   21 
+ fs/ext3/super.c               |   36 +
+ fs/ext3/symlink.c             |   14 
+ fs/ext3/xattr.c               | 1225 ++++++++++++++++++++++++++++++++++++++++++
+ fs/ext3/xattr_user.c          |  111 +++
+ fs/jfs/jfs_xattr.h            |    6 
+ fs/jfs/xattr.c                |    6 
+ fs/mbcache.c                  |  648 ++++++++++++++++++++++
+ include/asm-arm/unistd.h      |    2 
+ include/asm-ia64/unistd.h     |   13 
+ include/asm-ppc64/unistd.h    |    2 
+ include/asm-s390/unistd.h     |   15 
+ include/asm-s390x/unistd.h    |   15 
+ include/asm-sparc/unistd.h    |   24 
+ include/asm-sparc64/unistd.h  |   24 
+ include/linux/cache_def.h     |   15 
+ include/linux/errno.h         |    4 
+ include/linux/ext2_fs.h       |   31 -
+ include/linux/ext2_xattr.h    |  157 +++++
+ include/linux/ext3_fs.h       |   31 -
+ include/linux/ext3_jbd.h      |    8 
+ include/linux/ext3_xattr.h    |  157 +++++
+ include/linux/fs.h            |    2 
+ include/linux/mbcache.h       |   69 ++
+ kernel/ksyms.c                |    4 
+ mm/vmscan.c                   |   35 +
+ fs/ext3/ext3-exports.c        |   14 +  
+ 64 files changed, 4355 insertions(+), 195 deletions(-)
+
+--- linux-2.4.20/Documentation/Configure.help~linux-2.4.20-xattr-0.8.54        2003-05-05 17:43:06.000000000 +0800
++++ linux-2.4.20-root/Documentation/Configure.help     2003-05-07 18:08:03.000000000 +0800
+@@ -15242,6 +15242,39 @@ CONFIG_EXT2_FS
+   be compiled as a module, and so this could be dangerous.  Most
+   everyone wants to say Y here.
++Ext2 extended attributes
++CONFIG_EXT2_FS_XATTR
++  Extended attributes are name:value pairs associated with inodes by
++  the kernel or by users (see the attr(5) manual page, or visit
++  <http://acl.bestbits.at/> for details).
++
++  If unsure, say N.
++
++Ext2 extended attribute block sharing
++CONFIG_EXT2_FS_XATTR_SHARING
++  This option enables code for sharing identical extended attribute
++  blocks among multiple inodes.
++
++  Usually, say Y.
++
++Ext2 extended user attributes
++CONFIG_EXT2_FS_XATTR_USER
++  This option enables extended user attributes on ext2. Processes can
++  associate extended user attributes with inodes to store additional
++  information such as the character encoding of files, etc. (see the
++  attr(5) manual page, or visit <http://acl.bestbits.at/> for details).
++
++  If unsure, say N.
++
++Ext2 trusted extended attributes
++CONFIG_EXT2_FS_XATTR_TRUSTED
++  This option enables extended attributes on ext2 that are accessible
++  (and visible) only to users capable of CAP_SYS_ADMIN. Usually this
++  is only the super user. Trusted extended attributes are meant for
++  implementing system/security services.
++
++  If unsure, say N.
++
+ Ext3 journalling file system support (EXPERIMENTAL)
+ CONFIG_EXT3_FS
+   This is the journalling version of the Second extended file system
+@@ -15274,6 +15307,39 @@ CONFIG_EXT3_FS
+   of your root partition (the one containing the directory /) cannot
+   be compiled as a module, and so this may be dangerous.
++Ext3 extended attributes
++CONFIG_EXT3_FS_XATTR
++  Extended attributes are name:value pairs associated with inodes by
++  the kernel or by users (see the attr(5) manual page, or visit
++  <http://acl.bestbits.at/> for details).
++
++  If unsure, say N.
++
++Ext3 extended attribute block sharing
++CONFIG_EXT3_FS_XATTR_SHARING
++  This option enables code for sharing identical extended attribute
++  blocks among multiple inodes.
++
++  Usually, say Y.
++
++Ext3 extended user attributes
++CONFIG_EXT3_FS_XATTR_USER
++  This option enables extended user attributes on ext3. Processes can
++  associate extended user attributes with inodes to store additional
++  information such as the character encoding of files, etc. (see the
++  attr(5) manual page, or visit <http://acl.bestbits.at/> for details).
++
++  If unsure, say N.
++
++Ext3 trusted extended attributes
++CONFIG_EXT3_FS_XATTR_TRUSTED
++  This option enables extended attributes on ext3 that are accessible
++  (and visible) only to users capable of CAP_SYS_ADMIN. Usually this
++  is only the super user. Trusted extended attributes are meant for
++  implementing system/security services.
++
++  If unsure, say N.
++
+ Journal Block Device support (JBD for ext3) (EXPERIMENTAL)
+ CONFIG_JBD
+   This is a generic journalling layer for block devices.  It is
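
The EXT2/EXT3_FS_XATTR* options documented above only matter once userspace exercises the new *xattr system calls wired up in the architecture tables that follow. A hedged illustration, not part of the patch, assuming the era's attr package header <attr/xattr.h> (on current systems <sys/xattr.h>) and a file on an ext2/ext3 filesystem mounted with the user_xattr option introduced later in this patch:

/* Set and read back a user.* extended attribute. */
#include <stdio.h>
#include <sys/types.h>
#include <attr/xattr.h>

int main(void)
{
	const char *path = "testfile";	/* any file on the xattr-enabled fs */
	char value[64];
	ssize_t len;

	if (setxattr(path, "user.charset", "utf-8", 5, 0) != 0) {
		perror("setxattr");
		return 1;
	}
	len = getxattr(path, "user.charset", value, sizeof(value) - 1);
	if (len < 0) {
		perror("getxattr");
		return 1;
	}
	value[len] = '\0';
	printf("user.charset = %s\n", value);
	return 0;
}
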
+--- linux-2.4.20/arch/alpha/defconfig~linux-2.4.20-xattr-0.8.54        2001-11-20 07:19:42.000000000 +0800
++++ linux-2.4.20-root/arch/alpha/defconfig     2003-05-07 18:08:03.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_ALPHA=y
+ # CONFIG_UID16 is not set
+ # CONFIG_RWSEM_GENERIC_SPINLOCK is not set
+--- linux-2.4.20/arch/alpha/kernel/entry.S~linux-2.4.20-xattr-0.8.54   2002-08-03 08:39:42.000000000 +0800
++++ linux-2.4.20-root/arch/alpha/kernel/entry.S        2003-05-07 18:08:03.000000000 +0800
+@@ -1154,6 +1154,18 @@ sys_call_table:
+       .quad sys_readahead
+       .quad sys_ni_syscall                    /* 380, sys_security */
+       .quad sys_tkill
++      .quad sys_setxattr
++      .quad sys_lsetxattr
++      .quad sys_fsetxattr
++      .quad sys_getxattr                      /* 385 */
++      .quad sys_lgetxattr
++      .quad sys_fgetxattr
++      .quad sys_listxattr
++      .quad sys_llistxattr
++      .quad sys_flistxattr                    /* 390 */
++      .quad sys_removexattr
++      .quad sys_lremovexattr
++      .quad sys_fremovexattr
+ /* Remember to update everything, kids.  */
+ .ifne (. - sys_call_table) - (NR_SYSCALLS * 8)
+--- linux-2.4.20/arch/arm/defconfig~linux-2.4.20-xattr-0.8.54  2001-05-20 08:43:05.000000000 +0800
++++ linux-2.4.20-root/arch/arm/defconfig       2003-05-07 18:08:03.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_ARM=y
+ # CONFIG_EISA is not set
+ # CONFIG_SBUS is not set
+--- linux-2.4.20/arch/arm/kernel/calls.S~linux-2.4.20-xattr-0.8.54     2002-08-03 08:39:42.000000000 +0800
++++ linux-2.4.20-root/arch/arm/kernel/calls.S  2003-05-07 18:08:03.000000000 +0800
+@@ -240,18 +240,18 @@ __syscall_start:
+               .long   SYMBOL_NAME(sys_ni_syscall) /* Security */
+               .long   SYMBOL_NAME(sys_gettid)
+ /* 225 */     .long   SYMBOL_NAME(sys_readahead)
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_setxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_lsetxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_fsetxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_getxattr */
+-/* 230 */     .long   SYMBOL_NAME(sys_ni_syscall) /* sys_lgetxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_fgetxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_listxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_llistxattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_flistxattr */
+-/* 235 */     .long   SYMBOL_NAME(sys_ni_syscall) /* sys_removexattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_lremovexattr */
+-              .long   SYMBOL_NAME(sys_ni_syscall) /* sys_fremovexattr */
++              .long   SYMBOL_NAME(sys_setxattr)
++              .long   SYMBOL_NAME(sys_lsetxattr)
++              .long   SYMBOL_NAME(sys_fsetxattr)
++              .long   SYMBOL_NAME(sys_getxattr)
++/* 230 */     .long   SYMBOL_NAME(sys_lgetxattr)
++              .long   SYMBOL_NAME(sys_fgetxattr)
++              .long   SYMBOL_NAME(sys_listxattr)
++              .long   SYMBOL_NAME(sys_llistxattr)
++              .long   SYMBOL_NAME(sys_flistxattr)
++/* 235 */     .long   SYMBOL_NAME(sys_removexattr)
++              .long   SYMBOL_NAME(sys_lremovexattr)
++              .long   SYMBOL_NAME(sys_fremovexattr)
+               .long   SYMBOL_NAME(sys_tkill)
+               /*
+                * Please check 2.5 _before_ adding calls here,
+--- linux-2.4.20/arch/i386/defconfig~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:09.000000000 +0800
++++ linux-2.4.20-root/arch/i386/defconfig      2003-05-07 18:08:03.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_X86=y
+ CONFIG_ISA=y
+ # CONFIG_SBUS is not set
+--- linux-2.4.20/arch/ia64/defconfig~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:09.000000000 +0800
++++ linux-2.4.20-root/arch/ia64/defconfig      2003-05-07 18:08:03.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ #
+ # Code maturity level options
+--- linux-2.4.20/arch/ia64/kernel/entry.S~linux-2.4.20-xattr-0.8.54    2002-11-29 07:53:09.000000000 +0800
++++ linux-2.4.20-root/arch/ia64/kernel/entry.S 2003-05-07 18:08:03.000000000 +0800
+@@ -1170,18 +1170,18 @@ sys_call_table:
+       data8 sys_getdents64
+       data8 sys_getunwind                     // 1215
+       data8 sys_readahead
+-      data8 ia64_ni_syscall
+-      data8 ia64_ni_syscall
+-      data8 ia64_ni_syscall
+-      data8 ia64_ni_syscall                   // 1220
+-      data8 ia64_ni_syscall
+-      data8 ia64_ni_syscall
+-      data8 ia64_ni_syscall
+-      data8 ia64_ni_syscall
+-      data8 ia64_ni_syscall                   // 1225
+-      data8 ia64_ni_syscall
+-      data8 ia64_ni_syscall
+-      data8 ia64_ni_syscall
++      data8 sys_setxattr
++      data8 sys_lsetxattr
++      data8 sys_fsetxattr
++      data8 sys_getxattr                      // 1220
++      data8 sys_lgetxattr
++      data8 sys_fgetxattr
++      data8 sys_listxattr
++      data8 sys_llistxattr
++      data8 sys_flistxattr                    // 1225
++      data8 sys_removexattr
++      data8 sys_lremovexattr
++      data8 sys_fremovexattr
+       data8 sys_tkill
+       data8 ia64_ni_syscall                   // 1230
+       data8 ia64_ni_syscall
+--- linux-2.4.20/arch/m68k/defconfig~linux-2.4.20-xattr-0.8.54 2000-06-20 03:56:08.000000000 +0800
++++ linux-2.4.20-root/arch/m68k/defconfig      2003-05-07 18:08:03.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_UID16=y
+ #
+--- linux-2.4.20/arch/mips/defconfig~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:10.000000000 +0800
++++ linux-2.4.20-root/arch/mips/defconfig      2003-05-07 18:08:03.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_MIPS=y
+ CONFIG_MIPS32=y
+ # CONFIG_MIPS64 is not set
+--- linux-2.4.20/arch/mips64/defconfig~linux-2.4.20-xattr-0.8.54       2002-11-29 07:53:10.000000000 +0800
++++ linux-2.4.20-root/arch/mips64/defconfig    2003-05-07 18:08:03.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_MIPS=y
+ # CONFIG_MIPS32 is not set
+ CONFIG_MIPS64=y
+--- linux-2.4.20/arch/ppc/defconfig~linux-2.4.20-xattr-0.8.54  2002-11-29 07:53:11.000000000 +0800
++++ linux-2.4.20-root/arch/ppc/defconfig       2003-05-07 18:08:03.000000000 +0800
+@@ -1,6 +1,20 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ # CONFIG_UID16 is not set
+ # CONFIG_RWSEM_GENERIC_SPINLOCK is not set
+ CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+--- linux-2.4.20/arch/ppc64/kernel/misc.S~linux-2.4.20-xattr-0.8.54    2002-11-29 07:53:11.000000000 +0800
++++ linux-2.4.20-root/arch/ppc64/kernel/misc.S 2003-05-07 18:08:03.000000000 +0800
+@@ -731,6 +731,7 @@ _GLOBAL(sys_call_table32)
+       .llong .sys_gettid              /* 207 */
+ #if 0 /* Reserved syscalls */
+       .llong .sys_tkill               /* 208 */
++#endif
+       .llong .sys_setxattr
+       .llong .sys_lsetxattr   /* 210 */
+       .llong .sys_fsetxattr
+@@ -743,6 +744,7 @@ _GLOBAL(sys_call_table32)
+       .llong .sys_removexattr
+       .llong .sys_lremovexattr
+       .llong .sys_fremovexattr        /* 220 */
++#if 0 /* Reserved syscalls */
+       .llong .sys_futex
+ #endif
+       .llong .sys_perfmonctl   /* Put this here for now ... */
+--- linux-2.4.20/arch/s390/defconfig~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:11.000000000 +0800
++++ linux-2.4.20-root/arch/s390/defconfig      2003-05-07 18:08:03.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ # CONFIG_ISA is not set
+ # CONFIG_EISA is not set
+ # CONFIG_MCA is not set
+--- linux-2.4.20/arch/s390/kernel/entry.S~linux-2.4.20-xattr-0.8.54    2002-11-29 07:53:11.000000000 +0800
++++ linux-2.4.20-root/arch/s390/kernel/entry.S 2003-05-07 18:08:03.000000000 +0800
+@@ -558,18 +558,18 @@ sys_call_table:
+         .long  sys_fcntl64 
+       .long  sys_ni_syscall
+       .long  sys_ni_syscall
+-      .long  sys_ni_syscall            /* 224 - reserved for setxattr  */
+-      .long  sys_ni_syscall            /* 225 - reserved for lsetxattr */
+-      .long  sys_ni_syscall            /* 226 - reserved for fsetxattr */
+-      .long  sys_ni_syscall            /* 227 - reserved for getxattr  */
+-      .long  sys_ni_syscall            /* 228 - reserved for lgetxattr */
+-      .long  sys_ni_syscall            /* 229 - reserved for fgetxattr */
+-      .long  sys_ni_syscall            /* 230 - reserved for listxattr */
+-      .long  sys_ni_syscall            /* 231 - reserved for llistxattr */
+-      .long  sys_ni_syscall            /* 232 - reserved for flistxattr */
+-      .long  sys_ni_syscall            /* 233 - reserved for removexattr */
+-      .long  sys_ni_syscall            /* 234 - reserved for lremovexattr */
+-      .long  sys_ni_syscall            /* 235 - reserved for fremovexattr */
++      .long  sys_setxattr
++      .long  sys_lsetxattr            /* 225 */
++      .long  sys_fsetxattr
++      .long  sys_getxattr
++      .long  sys_lgetxattr
++      .long  sys_fgetxattr
++      .long  sys_listxattr            /* 230 */
++      .long  sys_llistxattr
++      .long  sys_flistxattr
++      .long  sys_removexattr
++      .long  sys_lremovexattr
++      .long  sys_fremovexattr         /* 235 */
+       .long  sys_gettid
+       .long  sys_tkill
+       .rept  255-237
+--- linux-2.4.20/arch/s390x/defconfig~linux-2.4.20-xattr-0.8.54        2002-11-29 07:53:11.000000000 +0800
++++ linux-2.4.20-root/arch/s390x/defconfig     2003-05-07 18:08:03.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ # CONFIG_ISA is not set
+ # CONFIG_EISA is not set
+ # CONFIG_MCA is not set
+--- linux-2.4.20/arch/s390x/kernel/entry.S~linux-2.4.20-xattr-0.8.54   2002-11-29 07:53:11.000000000 +0800
++++ linux-2.4.20-root/arch/s390x/kernel/entry.S        2003-05-07 18:08:03.000000000 +0800
+@@ -591,18 +591,18 @@ sys_call_table:
+       .long  SYSCALL(sys_ni_syscall,sys32_fcntl64_wrapper)
+       .long  SYSCALL(sys_ni_syscall,sys_ni_syscall)
+       .long  SYSCALL(sys_ni_syscall,sys_ni_syscall)
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 224 - reserved for setxattr  */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 225 - reserved for lsetxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 226 - reserved for fsetxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 227 - reserved for getxattr  */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 228 - reserved for lgetxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 229 - reserved for fgetxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 230 - reserved for listxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 231 - reserved for llistxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 232 - reserved for flistxattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 233 - reserved for removexattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 234 - reserved for lremovexattr */
+-      .long  SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 235 - reserved for fremovexattr */
++      .long  SYSCALL(sys_setxattr,sys32_setxattr_wrapper)
++      .long  SYSCALL(sys_lsetxattr,sys32_lsetxattr_wrapper)   /* 225 */
++      .long  SYSCALL(sys_fsetxattr,sys32_fsetxattr_wrapper)
++      .long  SYSCALL(sys_getxattr,sys32_getxattr_wrapper)
++      .long  SYSCALL(sys_lgetxattr,sys32_lgetxattr_wrapper)
++      .long  SYSCALL(sys_fgetxattr,sys32_fgetxattr_wrapper)
++      .long  SYSCALL(sys_listxattr,sys32_listxattr_wrapper)   /* 230 */
++      .long  SYSCALL(sys_llistxattr,sys32_llistxattr_wrapper)
++      .long  SYSCALL(sys_flistxattr,sys32_flistxattr_wrapper)
++      .long  SYSCALL(sys_removexattr,sys32_removexattr_wrapper)
++      .long  SYSCALL(sys_lremovexattr,sys32_lremovexattr_wrapper)
++      .long  SYSCALL(sys_fremovexattr,sys32_fremovexattr_wrapper)/* 235 */
+       .long  SYSCALL(sys_gettid,sys_gettid)
+       .long  SYSCALL(sys_tkill,sys_tkill)
+       .rept  255-237
+--- linux-2.4.20/arch/s390x/kernel/wrapper32.S~linux-2.4.20-xattr-0.8.54       2002-02-26 03:37:56.000000000 +0800
++++ linux-2.4.20-root/arch/s390x/kernel/wrapper32.S    2003-05-07 18:08:03.000000000 +0800
+@@ -1091,3 +1091,95 @@ sys32_fstat64_wrapper:
+       llgtr   %r3,%r3                 # struct stat64 *
+       llgfr   %r4,%r4                 # long
+       jg      sys32_fstat64           # branch to system call
++
++      .globl  sys32_setxattr_wrapper
++sys32_setxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      lgfr    %r6,%r6                 # int
++      jg      sys_setxattr
++
++      .globl  sys32_lsetxattr_wrapper
++sys32_lsetxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      lgfr    %r6,%r6                 # int
++      jg      sys_lsetxattr
++
++      .globl  sys32_fsetxattr_wrapper
++sys32_fsetxattr_wrapper:
++      lgfr    %r2,%r2                 # int
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      lgfr    %r6,%r6                 # int
++      jg      sys_fsetxattr
++
++      .globl  sys32_getxattr_wrapper
++sys32_getxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      jg      sys_getxattr
++
++      .globl  sys32_lgetxattr_wrapper
++sys32_lgetxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      jg      sys_lgetxattr
++
++      .globl  sys32_fgetxattr_wrapper
++sys32_fgetxattr_wrapper:
++      lgfr    %r2,%r2                 # int
++      llgtr   %r3,%r3                 # char *
++      llgtr   %r4,%r4                 # void *
++      llgfr   %r5,%r5                 # size_t
++      jg      sys_fgetxattr
++
++      .globl  sys32_listxattr_wrapper
++sys32_listxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgfr   %r4,%r4                 # size_t
++      jg      sys_listxattr
++
++      .globl  sys32_llistxattr_wrapper
++sys32_llistxattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      llgfr   %r4,%r4                 # size_t
++      jg      sys_llistxattr
++
++      .globl  sys32_flistxattr_wrapper
++sys32_flistxattr_wrapper:
++      lgfr    %r2,%r2                 # int
++      llgtr   %r3,%r3                 # char *
++      llgfr   %r4,%r4                 # size_t
++      jg      sys_flistxattr
++
++      .globl  sys32_removexattr_wrapper
++sys32_removexattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      jg      sys_removexattr
++
++      .globl  sys32_lremovexattr_wrapper
++sys32_lremovexattr_wrapper:
++      llgtr   %r2,%r2                 # char *
++      llgtr   %r3,%r3                 # char *
++      jg      sys_lremovexattr
++
++      .globl  sys32_fremovexattr_wrapper
++sys32_fremovexattr_wrapper:
++      lgfr    %r2,%r2                 # int
++      llgtr   %r3,%r3                 # char *
++      jg      sys_fremovexattr
++
++
+--- linux-2.4.20/arch/sparc/defconfig~linux-2.4.20-xattr-0.8.54        2002-08-03 08:39:43.000000000 +0800
++++ linux-2.4.20-root/arch/sparc/defconfig     2003-05-07 18:08:03.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_UID16=y
+ CONFIG_HIGHMEM=y
+--- linux-2.4.20/arch/sparc/kernel/systbls.S~linux-2.4.20-xattr-0.8.54 2002-08-03 08:39:43.000000000 +0800
++++ linux-2.4.20-root/arch/sparc/kernel/systbls.S      2003-05-07 18:08:03.000000000 +0800
+@@ -51,11 +51,11 @@ sys_call_table:
+ /*150*/       .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64
+ /*155*/       .long sys_fcntl64, sys_nis_syscall, sys_statfs, sys_fstatfs, sys_oldumount
+ /*160*/       .long sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_nis_syscall
+-/*165*/       .long sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_nis_syscall
+-/*170*/       .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_getdents
+-/*175*/       .long sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+-/*180*/       .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_sigpending, sys_query_module
+-/*185*/       .long sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sys_newuname
++/*165*/       .long sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_setxattr
++/*170*/       .long sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys_getdents
++/*175*/       .long sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr
++/*180*/       .long sys_flistxattr, sys_removexattr, sys_lremovexattr, sys_sigpending, sys_query_module
++/*185*/       .long sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sys_newuname
+ /*190*/       .long sys_init_module, sys_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+ /*195*/       .long sys_nis_syscall, sys_nis_syscall, sys_getppid, sparc_sigaction, sys_sgetmask
+ /*200*/       .long sys_ssetmask, sys_sigsuspend, sys_newlstat, sys_uselib, old_readdir
+--- linux-2.4.20/arch/sparc64/defconfig~linux-2.4.20-xattr-0.8.54      2002-11-29 07:53:12.000000000 +0800
++++ linux-2.4.20-root/arch/sparc64/defconfig   2003-05-07 18:08:03.000000000 +0800
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ #
+ # Code maturity level options
+--- linux-2.4.20/arch/sparc64/kernel/systbls.S~linux-2.4.20-xattr-0.8.54       2002-08-03 08:39:43.000000000 +0800
++++ linux-2.4.20-root/arch/sparc64/kernel/systbls.S    2003-05-07 18:08:03.000000000 +0800
+@@ -52,11 +52,11 @@ sys_call_table32:
+ /*150*/       .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64
+       .word sys32_fcntl64, sys_nis_syscall, sys32_statfs, sys32_fstatfs, sys_oldumount
+ /*160*/       .word sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_nis_syscall
+-      .word sys32_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_nis_syscall
+-/*170*/       .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys32_getdents
+-      .word sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+-/*180*/       .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys32_sigpending, sys32_query_module
+-      .word sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sparc64_newuname
++      .word sys32_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_setxattr
++/*170*/       .word sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys32_getdents
++      .word sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr
++/*180*/       .word sys_flistxattr, sys_removexattr, sys_lremovexattr, sys32_sigpending, sys32_query_module
++      .word sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sparc64_newuname
+ /*190*/       .word sys32_init_module, sparc64_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+       .word sys_nis_syscall, sys_nis_syscall, sys_getppid, sys32_sigaction, sys_sgetmask
+ /*200*/       .word sys_ssetmask, sys_sigsuspend, sys32_newlstat, sys_uselib, old32_readdir
+@@ -111,11 +111,11 @@ sys_call_table:
+ /*150*/       .word sys_getsockname, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64
+       .word sys_nis_syscall, sys_nis_syscall, sys_statfs, sys_fstatfs, sys_oldumount
+ /*160*/       .word sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_utrap_install
+-      .word sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_nis_syscall
+-/*170*/       .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_getdents
+-      .word sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+-/*180*/       .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_query_module
+-      .word sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sparc64_newuname
++      .word sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_setxattr
++/*170*/       .word sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys_getdents
++      .word sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr
++/*180*/       .word sys_flistxattr, sys_removexattr, sys_lremovexattr, sys_nis_syscall, sys_query_module
++      .word sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sparc64_newuname
+ /*190*/       .word sys_init_module, sparc64_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall
+       .word sys_nis_syscall, sys_nis_syscall, sys_getppid, sys_nis_syscall, sys_sgetmask
+ /*200*/       .word sys_ssetmask, sys_nis_syscall, sys_newlstat, sys_uselib, sys_nis_syscall
+--- linux-2.4.20/fs/Config.in~linux-2.4.20-xattr-0.8.54        2002-11-29 07:53:15.000000000 +0800
++++ linux-2.4.20-root/fs/Config.in     2003-05-07 18:08:03.000000000 +0800
+@@ -25,6 +25,11 @@ dep_mbool '  Debug Befs' CONFIG_BEFS_DEB
+ dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL
+ tristate 'Ext3 journalling file system support' CONFIG_EXT3_FS
++dep_mbool '  Ext3 extended attributes' CONFIG_EXT3_FS_XATTR $CONFIG_EXT3_FS
++dep_bool '    Ext3 extended attribute block sharing' \
++    CONFIG_EXT3_FS_XATTR_SHARING $CONFIG_EXT3_FS_XATTR
++dep_bool '    Ext3 extended user attributes' \
++    CONFIG_EXT3_FS_XATTR_USER $CONFIG_EXT3_FS_XATTR
+ # CONFIG_JBD could be its own option (even modular), but until there are
+ # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
+ # dep_tristate '  Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
+@@ -84,6 +89,11 @@ dep_mbool '  QNX4FS write support (DANGE
+ tristate 'ROM file system support' CONFIG_ROMFS_FS
+ tristate 'Second extended fs support' CONFIG_EXT2_FS
++dep_mbool '  Ext2 extended attributes' CONFIG_EXT2_FS_XATTR $CONFIG_EXT2_FS
++dep_bool '    Ext2 extended attribute block sharing' \
++    CONFIG_EXT2_FS_XATTR_SHARING $CONFIG_EXT2_FS_XATTR
++dep_bool '    Ext2 extended user attributes' \
++    CONFIG_EXT2_FS_XATTR_USER $CONFIG_EXT2_FS_XATTR
+ tristate 'System V/Xenix/V7/Coherent file system support' CONFIG_SYSV_FS
+@@ -155,6 +165,10 @@ else
+    define_tristate CONFIG_ZISOFS_FS n
+ fi
++# Meta block cache for Extended Attributes (ext2/ext3)
++#tristate 'Meta block cache' CONFIG_FS_MBCACHE
++define_tristate CONFIG_FS_MBCACHE y 
++
+ mainmenu_option next_comment
+ comment 'Partition Types'
+ source fs/partitions/Config.in
+--- linux-2.4.20/fs/Makefile~linux-2.4.20-xattr-0.8.54 2003-05-05 19:00:58.000000000 +0800
++++ linux-2.4.20-root/fs/Makefile      2003-05-07 18:08:03.000000000 +0800
+@@ -79,6 +79,9 @@ obj-y                                += binfmt_script.o
+ obj-$(CONFIG_BINFMT_ELF)      += binfmt_elf.o
++export-objs += mbcache.o
++obj-$(CONFIG_FS_MBCACHE)      += mbcache.o
++
+ # persistent filesystems
+ obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o))
+--- linux-2.4.20/fs/ext2/Makefile~linux-2.4.20-xattr-0.8.54    2001-10-11 23:05:18.000000000 +0800
++++ linux-2.4.20-root/fs/ext2/Makefile 2003-05-07 18:08:03.000000000 +0800
+@@ -13,4 +13,8 @@ obj-y    := balloc.o bitmap.o dir.o file
+               ioctl.o namei.o super.o symlink.o
+ obj-m    := $(O_TARGET)
++export-objs += xattr.o
++obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o
++obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o
++
+ include $(TOPDIR)/Rules.make
+--- linux-2.4.20/fs/ext2/file.c~linux-2.4.20-xattr-0.8.54      2001-10-11 23:05:18.000000000 +0800
++++ linux-2.4.20-root/fs/ext2/file.c   2003-05-07 18:08:03.000000000 +0800
+@@ -20,6 +20,7 @@
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/sched.h>
+ /*
+@@ -51,4 +52,8 @@ struct file_operations ext2_file_operati
+ struct inode_operations ext2_file_inode_operations = {
+       truncate:       ext2_truncate,
++      setxattr:       ext2_setxattr,
++      getxattr:       ext2_getxattr,
++      listxattr:      ext2_listxattr,
++      removexattr:    ext2_removexattr,
+ };
+--- linux-2.4.20/fs/ext2/ialloc.c~linux-2.4.20-xattr-0.8.54    2002-11-29 07:53:15.000000000 +0800
++++ linux-2.4.20-root/fs/ext2/ialloc.c 2003-05-07 18:08:03.000000000 +0800
+@@ -15,6 +15,7 @@
+ #include <linux/config.h>
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+@@ -167,6 +168,7 @@ void ext2_free_inode (struct inode * ino
+        */
+       if (!is_bad_inode(inode)) {
+               /* Quota is already initialized in iput() */
++              ext2_xattr_delete_inode(inode);
+               DQUOT_FREE_INODE(inode);
+               DQUOT_DROP(inode);
+       }
+--- linux-2.4.20/fs/ext2/inode.c~linux-2.4.20-xattr-0.8.54     2002-11-29 07:53:15.000000000 +0800
++++ linux-2.4.20-root/fs/ext2/inode.c  2003-05-07 18:08:03.000000000 +0800
+@@ -39,6 +39,18 @@ MODULE_LICENSE("GPL");
+ static int ext2_update_inode(struct inode * inode, int do_sync);
+ /*
++ * Test whether an inode is a fast symlink.
++ */
++static inline int ext2_inode_is_fast_symlink(struct inode *inode)
++{
++      int ea_blocks = inode->u.ext2_i.i_file_acl ?
++              (inode->i_sb->s_blocksize >> 9) : 0;
++
++      return (S_ISLNK(inode->i_mode) &&
++              inode->i_blocks - ea_blocks == 0);
++}
++
++/*
+  * Called at each iput()
+  */
+ void ext2_put_inode (struct inode * inode)
+@@ -53,9 +65,7 @@ void ext2_delete_inode (struct inode * i
+ {
+       lock_kernel();
+-      if (is_bad_inode(inode) ||
+-          inode->i_ino == EXT2_ACL_IDX_INO ||
+-          inode->i_ino == EXT2_ACL_DATA_INO)
++      if (is_bad_inode(inode))
+               goto no_delete;
+       inode->u.ext2_i.i_dtime = CURRENT_TIME;
+       mark_inode_dirty(inode);
+@@ -801,6 +811,8 @@ void ext2_truncate (struct inode * inode
+       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+           S_ISLNK(inode->i_mode)))
+               return;
++      if (ext2_inode_is_fast_symlink(inode))
++              return;
+       if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+               return;
+@@ -888,8 +900,7 @@ void ext2_read_inode (struct inode * ino
+       unsigned long offset;
+       struct ext2_group_desc * gdp;
+-      if ((inode->i_ino != EXT2_ROOT_INO && inode->i_ino != EXT2_ACL_IDX_INO &&
+-           inode->i_ino != EXT2_ACL_DATA_INO &&
++      if ((inode->i_ino != EXT2_ROOT_INO &&
+            inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) ||
+           inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) {
+               ext2_error (inode->i_sb, "ext2_read_inode",
+@@ -974,10 +985,7 @@ void ext2_read_inode (struct inode * ino
+       for (block = 0; block < EXT2_N_BLOCKS; block++)
+               inode->u.ext2_i.i_data[block] = raw_inode->i_block[block];
+-      if (inode->i_ino == EXT2_ACL_IDX_INO ||
+-          inode->i_ino == EXT2_ACL_DATA_INO)
+-              /* Nothing to do */ ;
+-      else if (S_ISREG(inode->i_mode)) {
++      if (S_ISREG(inode->i_mode)) {
+               inode->i_op = &ext2_file_inode_operations;
+               inode->i_fop = &ext2_file_operations;
+               inode->i_mapping->a_ops = &ext2_aops;
+@@ -986,15 +994,17 @@ void ext2_read_inode (struct inode * ino
+               inode->i_fop = &ext2_dir_operations;
+               inode->i_mapping->a_ops = &ext2_aops;
+       } else if (S_ISLNK(inode->i_mode)) {
+-              if (!inode->i_blocks)
++              if (ext2_inode_is_fast_symlink(inode))
+                       inode->i_op = &ext2_fast_symlink_inode_operations;
+               else {
+-                      inode->i_op = &page_symlink_inode_operations;
++                      inode->i_op = &ext2_symlink_inode_operations;
+                       inode->i_mapping->a_ops = &ext2_aops;
+               }
+-      } else 
++      } else {
++              inode->i_op = &ext2_special_inode_operations;
+               init_special_inode(inode, inode->i_mode,
+                                  le32_to_cpu(raw_inode->i_block[0]));
++      }
+       brelse (bh);
+       inode->i_attr_flags = 0;
+       if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL) {
+--- linux-2.4.20/fs/ext2/namei.c~linux-2.4.20-xattr-0.8.54     2001-10-04 13:57:36.000000000 +0800
++++ linux-2.4.20-root/fs/ext2/namei.c  2003-05-07 18:08:03.000000000 +0800
+@@ -31,6 +31,7 @@
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/pagemap.h>
+ /*
+@@ -136,7 +137,7 @@ static int ext2_symlink (struct inode * 
+       if (l > sizeof (inode->u.ext2_i.i_data)) {
+               /* slow symlink */
+-              inode->i_op = &page_symlink_inode_operations;
++              inode->i_op = &ext2_symlink_inode_operations;
+               inode->i_mapping->a_ops = &ext2_aops;
+               err = block_symlink(inode, symname, l);
+               if (err)
+@@ -345,4 +346,15 @@ struct inode_operations ext2_dir_inode_o
+       rmdir:          ext2_rmdir,
+       mknod:          ext2_mknod,
+       rename:         ext2_rename,
++      setxattr:       ext2_setxattr,
++      getxattr:       ext2_getxattr,
++      listxattr:      ext2_listxattr,
++      removexattr:    ext2_removexattr,
++};
++
++struct inode_operations ext2_special_inode_operations = {
++      setxattr:       ext2_setxattr,
++      getxattr:       ext2_getxattr,
++      listxattr:      ext2_listxattr,
++      removexattr:    ext2_removexattr,
+ };
+--- linux-2.4.20/fs/ext2/super.c~linux-2.4.20-xattr-0.8.54     2002-11-29 07:53:15.000000000 +0800
++++ linux-2.4.20-root/fs/ext2/super.c  2003-05-07 18:08:03.000000000 +0800
+@@ -21,6 +21,7 @@
+ #include <linux/string.h>
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/slab.h>
+ #include <linux/init.h>
+ #include <linux/locks.h>
+@@ -125,6 +126,7 @@ void ext2_put_super (struct super_block 
+       int db_count;
+       int i;
++      ext2_xattr_put_super(sb);
+       if (!(sb->s_flags & MS_RDONLY)) {
+               struct ext2_super_block *es = EXT2_SB(sb)->s_es;
+@@ -175,6 +177,13 @@ static int parse_options (char * options
+            this_char = strtok (NULL, ",")) {
+               if ((value = strchr (this_char, '=')) != NULL)
+                       *value++ = 0;
++#ifdef CONFIG_EXT2_FS_XATTR_USER
++              if (!strcmp (this_char, "user_xattr"))
++                      set_opt (*mount_options, XATTR_USER);
++              else if (!strcmp (this_char, "nouser_xattr"))
++                      clear_opt (*mount_options, XATTR_USER);
++              else
++#endif
+               if (!strcmp (this_char, "bsddf"))
+                       clear_opt (*mount_options, MINIX_DF);
+               else if (!strcmp (this_char, "nouid32")) {
+@@ -424,6 +433,9 @@ struct super_block * ext2_read_super (st
+           blocksize = BLOCK_SIZE;
+       sb->u.ext2_sb.s_mount_opt = 0;
++#ifdef CONFIG_EXT2_FS_XATTR_USER
++      /* set_opt (sb->u.ext2_sb.s_mount_opt, XATTR_USER); */
++#endif
+       if (!parse_options ((char *) data, &sb_block, &resuid, &resgid,
+           &sb->u.ext2_sb.s_mount_opt)) {
+               return NULL;
+@@ -813,12 +825,27 @@ static DECLARE_FSTYPE_DEV(ext2_fs_type, 
+ static int __init init_ext2_fs(void)
+ {
+-        return register_filesystem(&ext2_fs_type);
++      int error = init_ext2_xattr();
++      if (error)
++              return error;
++      error = init_ext2_xattr_user();
++      if (error)
++              goto fail;
++      error = register_filesystem(&ext2_fs_type);
++      if (!error)
++              return 0;
++
++      exit_ext2_xattr_user();
++fail:
++      exit_ext2_xattr();
++      return error;
+ }
+ static void __exit exit_ext2_fs(void)
+ {
+       unregister_filesystem(&ext2_fs_type);
++      exit_ext2_xattr_user();
++      exit_ext2_xattr();
+ }
+ EXPORT_NO_SYMBOLS;
+--- linux-2.4.20/fs/ext2/symlink.c~linux-2.4.20-xattr-0.8.54   2000-09-28 04:41:33.000000000 +0800
++++ linux-2.4.20-root/fs/ext2/symlink.c        2003-05-07 18:08:03.000000000 +0800
+@@ -19,6 +19,7 @@
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ static int ext2_readlink(struct dentry *dentry, char *buffer, int buflen)
+ {
+@@ -32,7 +33,20 @@ static int ext2_follow_link(struct dentr
+       return vfs_follow_link(nd, s);
+ }
++struct inode_operations ext2_symlink_inode_operations = {
++      readlink:       page_readlink,
++      follow_link:    page_follow_link,
++      setxattr:       ext2_setxattr,
++      getxattr:       ext2_getxattr,
++      listxattr:      ext2_listxattr,
++      removexattr:    ext2_removexattr,
++};
++
+ struct inode_operations ext2_fast_symlink_inode_operations = {
+       readlink:       ext2_readlink,
+       follow_link:    ext2_follow_link,
++      setxattr:       ext2_setxattr,
++      getxattr:       ext2_getxattr,
++      listxattr:      ext2_listxattr,
++      removexattr:    ext2_removexattr,
+ };
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-2.4.20-root/fs/ext2/xattr.c  2003-05-07 18:08:03.000000000 +0800
+@@ -0,0 +1,1212 @@
++/*
++ * linux/fs/ext2/xattr.c
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ *
++ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
++ * Extended attributes for symlinks and special files added per
++ *  suggestion of Luka Renko <luka.renko@hermes.si>.
++ */
++
++/*
++ * Extended attributes are stored on disk blocks allocated outside of
++ * any inode. The i_file_acl field is then made to point to this allocated
++ * block. If all extended attributes of an inode are identical, these
++ * inodes may share the same extended attribute block. Such situations
++ * are automatically detected by keeping a cache of recent attribute block
++ * numbers and hashes over the block's contents in memory.
++ *
++ *
++ * Extended attribute block layout:
++ *
++ *   +------------------+
++ *   | header           |
++ *   | entry 1          | |
++ *   | entry 2          | | growing downwards
++ *   | entry 3          | v
++ *   | four null bytes  |
++ *   | . . .            |
++ *   | value 1          | ^
++ *   | value 3          | | growing upwards
++ *   | value 2          | |
++ *   +------------------+
++ *
++ * The block header is followed by multiple entry descriptors. These entry
++ * descriptors are variable in size, and aligned to EXT2_XATTR_PAD
++ * byte boundaries. The entry descriptors are sorted by attribute name,
++ * so that two extended attribute blocks can be compared efficiently.
++ *
++ * Attribute values are aligned to the end of the block, stored in
++ * no specific order. They are also padded to EXT2_XATTR_PAD byte
++ * boundaries. No additional gaps are left between them.
++ *
++ * Locking strategy
++ * ----------------
++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of
++ * the xattr inode operations are called, so we are guaranteed that only one
++ * process accesses extended attributes of an inode at any time.
++ *
++ * For writing we also grab the ext2_xattr_sem semaphore. This ensures that
++ * only a single process is modifying an extended attribute block, even
++ * if the block is shared among inodes.
++ *
++ * Note for porting to 2.5
++ * -----------------------
++ * The BKL will no longer be held in the xattr inode operations.
++ */
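
(The on-disk structures that the layout above describes are declared in the include/linux/ext2_xattr.h header added elsewhere in this patch; the sketch below is an editorial approximation of their shape, so treat the exact field names as illustrative rather than authoritative.)

struct ext2_xattr_header {		/* at offset 0 of the EA block */
	__u32	h_magic;		/* identifies an EA block */
	__u32	h_refcount;		/* number of inodes sharing the block */
	__u32	h_blocks;		/* disk blocks used, always 1 here */
	__u32	h_hash;			/* hash of all entries, for sharing */
	__u32	h_reserved[4];
};

struct ext2_xattr_entry {		/* descriptors growing downwards */
	__u8	e_name_len;		/* length of the name suffix */
	__u8	e_name_index;		/* which handler owns the name prefix */
	__u16	e_value_offs;		/* offset of the value within the block */
	__u32	e_value_block;		/* unused: values stay in this block */
	__u32	e_value_size;		/* value length in bytes */
	__u32	e_hash;			/* hash of name and value */
	char	e_name[0];		/* name suffix, EXT2_XATTR_PAD aligned */
};
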
++
++#include <linux/module.h>
++#include <linux/locks.h>
++#include <linux/slab.h>
++#include <linux/fs.h>
++#include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
++#include <linux/mbcache.h>
++#include <linux/quotaops.h>
++#include <asm/semaphore.h>
++#include <linux/compatmac.h>
++
++/* These symbols may be needed by a module. */
++EXPORT_SYMBOL(ext2_xattr_register);
++EXPORT_SYMBOL(ext2_xattr_unregister);
++EXPORT_SYMBOL(ext2_xattr_get);
++EXPORT_SYMBOL(ext2_xattr_list);
++EXPORT_SYMBOL(ext2_xattr_set);
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1)
++#endif
++
++#define HDR(bh) ((struct ext2_xattr_header *)((bh)->b_data))
++#define ENTRY(ptr) ((struct ext2_xattr_entry *)(ptr))
++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
++
++#ifdef EXT2_XATTR_DEBUG
++# define ea_idebug(inode, f...) do { \
++              printk(KERN_DEBUG "inode %s:%ld: ", \
++                      kdevname(inode->i_dev), inode->i_ino); \
++              printk(f); \
++              printk("\n"); \
++      } while (0)
++# define ea_bdebug(bh, f...) do { \
++              printk(KERN_DEBUG "block %s:%ld: ", \
++                      kdevname(bh->b_dev), bh->b_blocknr); \
++              printk(f); \
++              printk("\n"); \
++      } while (0)
++#else
++# define ea_idebug(f...)
++# define ea_bdebug(f...)
++#endif
++
++static int ext2_xattr_set2(struct inode *, struct buffer_head *,
++                         struct ext2_xattr_header *);
++
++#ifdef CONFIG_EXT2_FS_XATTR_SHARING
++
++static int ext2_xattr_cache_insert(struct buffer_head *);
++static struct buffer_head *ext2_xattr_cache_find(struct inode *,
++                                               struct ext2_xattr_header *);
++static void ext2_xattr_cache_remove(struct buffer_head *);
++static void ext2_xattr_rehash(struct ext2_xattr_header *,
++                            struct ext2_xattr_entry *);
++
++static struct mb_cache *ext2_xattr_cache;
++
++#else
++# define ext2_xattr_cache_insert(bh) 0
++# define ext2_xattr_cache_find(inode, header) NULL
++# define ext2_xattr_cache_remove(bh) while(0) {}
++# define ext2_xattr_rehash(header, entry) while(0) {}
++#endif
++
++/*
++ * If a file system does not share extended attributes among inodes,
++ * we should not need the ext2_xattr_sem semaphore. However, the
++ * filesystem may still contain shared blocks, so we always take
++ * the lock.
++ */
++
++DECLARE_MUTEX(ext2_xattr_sem);
++
++static inline int
++ext2_xattr_new_block(struct inode *inode, int * errp, int force)
++{
++      struct super_block *sb = inode->i_sb;
++      int goal = le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block) +
++              EXT2_I(inode)->i_block_group * EXT2_BLOCKS_PER_GROUP(sb);
++
++      /* How can we enforce the allocation? */
++      int block = ext2_new_block(inode, goal, 0, 0, errp);
++#ifdef OLD_QUOTAS
++      if (!*errp)
++              inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#endif
++      return block;
++}
++
++static inline int
++ext2_xattr_quota_alloc(struct inode *inode, int force)
++{
++      /* How can we enforce the allocation? */
++#ifdef OLD_QUOTAS
++      int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
++      if (!error)
++              inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#else
++      int error = DQUOT_ALLOC_BLOCK(inode, 1);
++#endif
++      return error;
++}
++
++#ifdef OLD_QUOTAS
++
++static inline void
++ext2_xattr_quota_free(struct inode *inode)
++{
++      DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
++      inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++static inline void
++ext2_xattr_free_block(struct inode * inode, unsigned long block)
++{
++      ext2_free_blocks(inode, block, 1);
++      inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++#else
++# define ext2_xattr_quota_free(inode) \
++      DQUOT_FREE_BLOCK(inode, 1)
++# define ext2_xattr_free_block(inode, block) \
++      ext2_free_blocks(inode, block, 1)
++#endif
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
++
++static inline struct buffer_head *
++sb_bread(struct super_block *sb, int block)
++{
++      return bread(sb->s_dev, block, sb->s_blocksize);
++}
++
++static inline struct buffer_head *
++sb_getblk(struct super_block *sb, int block)
++{
++      return getblk(sb->s_dev, block, sb->s_blocksize);
++}
++
++#endif
++
++struct ext2_xattr_handler *ext2_xattr_handlers[EXT2_XATTR_INDEX_MAX];
++rwlock_t ext2_handler_lock = RW_LOCK_UNLOCKED;
++
++int
++ext2_xattr_register(int name_index, struct ext2_xattr_handler *handler)
++{
++      int error = -EINVAL;
++
++      if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
++              write_lock(&ext2_handler_lock);
++              if (!ext2_xattr_handlers[name_index-1]) {
++                      ext2_xattr_handlers[name_index-1] = handler;
++                      error = 0;
++              }
++              write_unlock(&ext2_handler_lock);
++      }
++      return error;
++}
++
++void
++ext2_xattr_unregister(int name_index, struct ext2_xattr_handler *handler)
++{
++      if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
++              write_lock(&ext2_handler_lock);
++              ext2_xattr_handlers[name_index-1] = NULL;
++              write_unlock(&ext2_handler_lock);
++      }
++}
++
++static inline const char *
++strcmp_prefix(const char *a, const char *a_prefix)
++{
++      while (*a_prefix && *a == *a_prefix) {
++              a++;
++              a_prefix++;
++      }
++      return *a_prefix ? NULL : a;
++}
++
++/*
++ * Decode the extended attribute name, and translate it into
++ * the name_index and name suffix.
++ */
++static struct ext2_xattr_handler *
++ext2_xattr_resolve_name(const char **name)
++{
++      struct ext2_xattr_handler *handler = NULL;
++      int i;
++
++      if (!*name)
++              return NULL;
++      read_lock(&ext2_handler_lock);
++      for (i=0; i<EXT2_XATTR_INDEX_MAX; i++) {
++              if (ext2_xattr_handlers[i]) {
++                      const char *n = strcmp_prefix(*name,
++                              ext2_xattr_handlers[i]->prefix);
++                      if (n) {
++                              handler = ext2_xattr_handlers[i];
++                              *name = n;
++                              break;
++                      }
++              }
++      }
++      read_unlock(&ext2_handler_lock);
++      return handler;
++}
++
++static inline struct ext2_xattr_handler *
++ext2_xattr_handler(int name_index)
++{
++      struct ext2_xattr_handler *handler = NULL;
++      if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
++              read_lock(&ext2_handler_lock);
++              handler = ext2_xattr_handlers[name_index-1];
++              read_unlock(&ext2_handler_lock);
++      }
++      return handler;
++}
++
++/*
++ * Inode operation getxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext2_getxattr(struct dentry *dentry, const char *name,
++            void *buffer, size_t size)
++{
++      struct ext2_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      handler = ext2_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->get(inode, name, buffer, size);
++}
++
++/*
++ * Inode operation listxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++      return ext2_xattr_list(dentry->d_inode, buffer, size);
++}
++
++/*
++ * Inode operation setxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext2_setxattr(struct dentry *dentry, const char *name,
++            const void *value, size_t size, int flags)
++{
++      struct ext2_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      if (size == 0)
++              value = "";  /* empty EA, do not remove */
++      handler = ext2_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->set(inode, name, value, size, flags);
++}
++
++/*
++ * Inode operation removexattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext2_removexattr(struct dentry *dentry, const char *name)
++{
++      struct ext2_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      handler = ext2_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
++}
++
++/*
++ * ext2_xattr_get()
++ *
++ * Copy an extended attribute into the buffer
++ * provided, or compute the buffer size required.
++ * If buffer is NULL, only the required buffer size is computed.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext2_xattr_get(struct inode *inode, int name_index, const char *name,
++             void *buffer, size_t buffer_size)
++{
++      struct buffer_head *bh = NULL;
++      struct ext2_xattr_entry *entry;
++      unsigned int block, size;
++      char *end;
++      int name_len, error;
++
++      ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
++                name_index, name, buffer, (long)buffer_size);
++
++      if (name == NULL)
++              return -EINVAL;
++      if (!EXT2_I(inode)->i_file_acl)
++              return -ENOATTR;
++      block = EXT2_I(inode)->i_file_acl;
++      ea_idebug(inode, "reading block %d", block);
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh)
++              return -EIO;
++      ea_bdebug(bh, "b_count=%d, refcount=%d",
++              atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++      end = bh->b_data + bh->b_size;
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block:    ext2_error(inode->i_sb, "ext2_xattr_get",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              error = -EIO;
++              goto cleanup;
++      }
++      /* find named attribute */
++      name_len = strlen(name);
++
++      error = -ERANGE;
++      if (name_len > 255)
++              goto cleanup;
++      entry = FIRST_ENTRY(bh);
++      while (!IS_LAST_ENTRY(entry)) {
++              struct ext2_xattr_entry *next =
++                      EXT2_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++              if (name_index == entry->e_name_index &&
++                  name_len == entry->e_name_len &&
++                  memcmp(name, entry->e_name, name_len) == 0)
++                      goto found;
++              entry = next;
++      }
++      /* Check the remaining name entries */
++      while (!IS_LAST_ENTRY(entry)) {
++              struct ext2_xattr_entry *next =
++                      EXT2_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++              entry = next;
++      }
++      if (ext2_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      error = -ENOATTR;
++      goto cleanup;
++found:
++      /* check the buffer size */
++      if (entry->e_value_block != 0)
++              goto bad_block;
++      size = le32_to_cpu(entry->e_value_size);
++      if (size > inode->i_sb->s_blocksize ||
++          le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
++              goto bad_block;
++
++      if (ext2_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      if (buffer) {
++              error = -ERANGE;
++              if (size > buffer_size)
++                      goto cleanup;
++              /* return value of attribute */
++              memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
++                      size);
++      }
++      error = size;
++
++cleanup:
++      brelse(bh);
++
++      return error;
++}
++
++/*
++ * ext2_xattr_list()
++ *
++ * Copy a list of attribute names into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
++{
++      struct buffer_head *bh = NULL;
++      struct ext2_xattr_entry *entry;
++      unsigned int block, size = 0;
++      char *buf, *end;
++      int error;
++
++      ea_idebug(inode, "buffer=%p, buffer_size=%ld",
++                buffer, (long)buffer_size);
++
++      if (!EXT2_I(inode)->i_file_acl)
++              return 0;
++      block = EXT2_I(inode)->i_file_acl;
++      ea_idebug(inode, "reading block %d", block);
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh)
++              return -EIO;
++      ea_bdebug(bh, "b_count=%d, refcount=%d",
++              atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++      end = bh->b_data + bh->b_size;
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block:    ext2_error(inode->i_sb, "ext2_xattr_list",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              error = -EIO;
++              goto cleanup;
++      }
++      /* compute the size required for the list of attribute names */
++      for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++           entry = EXT2_XATTR_NEXT(entry)) {
++              struct ext2_xattr_handler *handler;
++              struct ext2_xattr_entry *next =
++                      EXT2_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++
++              handler = ext2_xattr_handler(entry->e_name_index);
++              if (handler)
++                      size += handler->list(NULL, inode, entry->e_name,
++                                            entry->e_name_len);
++      }
++
++      if (ext2_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      if (!buffer) {
++              error = size;
++              goto cleanup;
++      } else {
++              error = -ERANGE;
++              if (size > buffer_size)
++                      goto cleanup;
++      }
++
++      /* list the attribute names */
++      buf = buffer;
++      for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++           entry = EXT2_XATTR_NEXT(entry)) {
++              struct ext2_xattr_handler *handler;
++              
++              handler = ext2_xattr_handler(entry->e_name_index);
++              if (handler)
++                      buf += handler->list(buf, inode, entry->e_name,
++                                           entry->e_name_len);
++      }
++      error = size;
++
++cleanup:
++      brelse(bh);
++
++      return error;
++}
++
++/*
++ * If the EXT2_FEATURE_COMPAT_EXT_ATTR feature of this file system is
++ * not set, set it.
++ */
++static void ext2_xattr_update_super_block(struct super_block *sb)
++{
++      if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR))
++              return;
++
++      lock_super(sb);
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++      EXT2_SB(sb)->s_feature_compat |= EXT2_FEATURE_COMPAT_EXT_ATTR;
++#endif
++      EXT2_SB(sb)->s_es->s_feature_compat |=
++              cpu_to_le32(EXT2_FEATURE_COMPAT_EXT_ATTR);
++      sb->s_dirt = 1;
++      mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
++      unlock_super(sb);
++}
++
++/*
++ * ext2_xattr_set()
++ *
++ * Create, replace or remove an extended attribute for this inode. Buffer
++ * is NULL to remove an existing extended attribute, and non-NULL to
++ * either replace an existing extended attribute, or create a new extended
++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE specify,
++ * respectively, that the extended attribute must already exist or
++ * must not yet exist before the call.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++int
++ext2_xattr_set(struct inode *inode, int name_index, const char *name,
++             const void *value, size_t value_len, int flags)
++{
++      struct super_block *sb = inode->i_sb;
++      struct buffer_head *bh = NULL;
++      struct ext2_xattr_header *header = NULL;
++      struct ext2_xattr_entry *here, *last;
++      unsigned int name_len;
++      int block = EXT2_I(inode)->i_file_acl;
++      int min_offs = sb->s_blocksize, not_found = 1, free, error;
++      char *end;
++      
++      /*
++       * header -- Points either into bh, or to a temporarily
++       *           allocated buffer.
++       * here -- The named entry found, or the place for inserting, within
++       *         the block pointed to by header.
++       * last -- Points right after the last named entry within the block
++       *         pointed to by header.
++       * min_offs -- The offset of the first value (values are aligned
++       *             towards the end of the block).
++       * end -- Points right after the block pointed to by header.
++       */
++      
++      ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
++                name_index, name, value, (long)value_len);
++
++      if (IS_RDONLY(inode))
++              return -EROFS;
++      if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
++              return -EPERM;
++      if (value == NULL)
++              value_len = 0;
++      if (name == NULL)
++              return -EINVAL;
++      name_len = strlen(name);
++      if (name_len > 255 || value_len > sb->s_blocksize)
++              return -ERANGE;
++      down(&ext2_xattr_sem);
++
++      if (block) {
++              /* The inode already has an extended attribute block. */
++
++              bh = sb_bread(sb, block);
++              error = -EIO;
++              if (!bh)
++                      goto cleanup;
++              ea_bdebug(bh, "b_count=%d, refcount=%d",
++                      atomic_read(&(bh->b_count)),
++                      le32_to_cpu(HDR(bh)->h_refcount));
++              header = HDR(bh);
++              end = bh->b_data + bh->b_size;
++              if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++                  header->h_blocks != cpu_to_le32(1)) {
++bad_block:            ext2_error(sb, "ext2_xattr_set",
++                              "inode %ld: bad block %d", inode->i_ino, block);
++                      error = -EIO;
++                      goto cleanup;
++              }
++              /* Find the named attribute. */
++              here = FIRST_ENTRY(bh);
++              while (!IS_LAST_ENTRY(here)) {
++                      struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here);
++                      if ((char *)next >= end)
++                              goto bad_block;
++                      if (!here->e_value_block && here->e_value_size) {
++                              int offs = le16_to_cpu(here->e_value_offs);
++                              if (offs < min_offs)
++                                      min_offs = offs;
++                      }
++                      not_found = name_index - here->e_name_index;
++                      if (!not_found)
++                              not_found = name_len - here->e_name_len;
++                      if (!not_found)
++                              not_found = memcmp(name, here->e_name,name_len);
++                      if (not_found <= 0)
++                              break;
++                      here = next;
++              }
++              last = here;
++              /* We still need to compute min_offs and last. */
++              while (!IS_LAST_ENTRY(last)) {
++                      struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last);
++                      if ((char *)next >= end)
++                              goto bad_block;
++                      if (!last->e_value_block && last->e_value_size) {
++                              int offs = le16_to_cpu(last->e_value_offs);
++                              if (offs < min_offs)
++                                      min_offs = offs;
++                      }
++                      last = next;
++              }
++
++              /* Check whether we have enough space left. */
++              free = min_offs - ((char*)last - (char*)header) - sizeof(__u32);
++      } else {
++              /* We will use a new extended attribute block. */
++              free = sb->s_blocksize -
++                      sizeof(struct ext2_xattr_header) - sizeof(__u32);
++              here = last = NULL;  /* avoid gcc uninitialized warning. */
++      }
++
++      if (not_found) {
++              /* Request to remove a nonexistent attribute? */
++              error = -ENOATTR;
++              if (flags & XATTR_REPLACE)
++                      goto cleanup;
++              error = 0;
++              if (value == NULL)
++                      goto cleanup;
++              else
++                      free -= EXT2_XATTR_LEN(name_len);
++      } else {
++              /* Request to create an existing attribute? */
++              error = -EEXIST;
++              if (flags & XATTR_CREATE)
++                      goto cleanup;
++              if (!here->e_value_block && here->e_value_size) {
++                      unsigned int size = le32_to_cpu(here->e_value_size);
++
++                      if (le16_to_cpu(here->e_value_offs) + size > 
++                          sb->s_blocksize || size > sb->s_blocksize)
++                              goto bad_block;
++                      free += EXT2_XATTR_SIZE(size);
++              }
++      }
++      free -= EXT2_XATTR_SIZE(value_len);
++      error = -ENOSPC;
++      if (free < 0)
++              goto cleanup;
++
++      /* Here we know that we can set the new attribute. */
++
++      if (header) {
++              if (header->h_refcount == cpu_to_le32(1)) {
++                      ea_bdebug(bh, "modifying in-place");
++                      ext2_xattr_cache_remove(bh);
++              } else {
++                      int offset;
++
++                      ea_bdebug(bh, "cloning");
++                      header = kmalloc(bh->b_size, GFP_KERNEL);
++                      error = -ENOMEM;
++                      if (header == NULL)
++                              goto cleanup;
++                      memcpy(header, HDR(bh), bh->b_size);
++                      header->h_refcount = cpu_to_le32(1);
++                      offset = (char *)header - bh->b_data;
++                      here = ENTRY((char *)here + offset);
++                      last = ENTRY((char *)last + offset);
++              }
++      } else {
++              /* Allocate a buffer where we construct the new block. */
++              header = kmalloc(sb->s_blocksize, GFP_KERNEL);
++              error = -ENOMEM;
++              if (header == NULL)
++                      goto cleanup;
++              memset(header, 0, sb->s_blocksize);
++              end = (char *)header + sb->s_blocksize;
++              header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC);
++              header->h_blocks = header->h_refcount = cpu_to_le32(1);
++              last = here = ENTRY(header+1);
++      }
++
++      if (not_found) {
++              /* Insert the new name. */
++              int size = EXT2_XATTR_LEN(name_len);
++              int rest = (char *)last - (char *)here;
++              memmove((char *)here + size, here, rest);
++              memset(here, 0, size);
++              here->e_name_index = name_index;
++              here->e_name_len = name_len;
++              memcpy(here->e_name, name, name_len);
++      } else {
++              /* Remove the old value. */
++              if (!here->e_value_block && here->e_value_size) {
++                      char *first_val = (char *)header + min_offs;
++                      int offs = le16_to_cpu(here->e_value_offs);
++                      char *val = (char *)header + offs;
++                      size_t size = EXT2_XATTR_SIZE(
++                              le32_to_cpu(here->e_value_size));
++                      memmove(first_val + size, first_val, val - first_val);
++                      memset(first_val, 0, size);
++                      here->e_value_offs = 0;
++                      min_offs += size;
++
++                      /* Adjust all value offsets. */
++                      last = ENTRY(header+1);
++                      while (!IS_LAST_ENTRY(last)) {
++                              int o = le16_to_cpu(last->e_value_offs);
++                              if (!last->e_value_block && o < offs)
++                                      last->e_value_offs =
++                                              cpu_to_le16(o + size);
++                              last = EXT2_XATTR_NEXT(last);
++                      }
++              }
++              if (value == NULL) {
++                      /* Remove this attribute. */
++                      if (EXT2_XATTR_NEXT(ENTRY(header+1)) == last) {
++                              /* This block is now empty. */
++                              error = ext2_xattr_set2(inode, bh, NULL);
++                              goto cleanup;
++                      } else {
++                              /* Remove the old name. */
++                              int size = EXT2_XATTR_LEN(name_len);
++                              last = ENTRY((char *)last - size);
++                              memmove(here, (char*)here + size,
++                                      (char*)last - (char*)here);
++                              memset(last, 0, size);
++                      }
++              }
++      }
++
++      if (value != NULL) {
++              /* Insert the new value. */
++              here->e_value_size = cpu_to_le32(value_len);
++              if (value_len) {
++                      size_t size = EXT2_XATTR_SIZE(value_len);
++                      char *val = (char *)header + min_offs - size;
++                      here->e_value_offs =
++                              cpu_to_le16((char *)val - (char *)header);
++                      memset(val + size - EXT2_XATTR_PAD, 0,
++                             EXT2_XATTR_PAD); /* Clear the pad bytes. */
++                      memcpy(val, value, value_len);
++              }
++      }
++      ext2_xattr_rehash(header, here);
++
++      error = ext2_xattr_set2(inode, bh, header);
++
++cleanup:
++      brelse(bh);
++      if (!(bh && header == HDR(bh)))
++              kfree(header);
++      up(&ext2_xattr_sem);
++
++      return error;
++}
++
++/*
++ * Second half of ext2_xattr_set(): Update the file system.
++ */
++static int
++ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
++              struct ext2_xattr_header *header)
++{
++      struct super_block *sb = inode->i_sb;
++      struct buffer_head *new_bh = NULL;
++      int error;
++
++      if (header) {
++              new_bh = ext2_xattr_cache_find(inode, header);
++              if (new_bh) {
++                      /*
++                       * We found an identical block in the cache.
++                       * The old block will be released after updating
++                       * the inode.
++                       */
++                      ea_bdebug(old_bh, "reusing block %ld",
++                              new_bh->b_blocknr);
++                      
++                      error = -EDQUOT;
++                      if (ext2_xattr_quota_alloc(inode, 1))
++                              goto cleanup;
++                      
++                      HDR(new_bh)->h_refcount = cpu_to_le32(
++                              le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
++                      ea_bdebug(new_bh, "refcount now=%d",
++                              le32_to_cpu(HDR(new_bh)->h_refcount));
++              } else if (old_bh && header == HDR(old_bh)) {
++                      /* Keep this block. */
++                      new_bh = old_bh;
++                      ext2_xattr_cache_insert(new_bh);
++              } else {
++                      /* We need to allocate a new block */
++                      int force = EXT2_I(inode)->i_file_acl != 0;
++                      int block = ext2_xattr_new_block(inode, &error, force);
++                      if (error)
++                              goto cleanup;
++                      ea_idebug(inode, "creating block %d", block);
++
++                      new_bh = sb_getblk(sb, block);
++                      if (!new_bh) {
++                              ext2_xattr_free_block(inode, block);
++                              error = -EIO;
++                              goto cleanup;
++                      }
++                      lock_buffer(new_bh);
++                      memcpy(new_bh->b_data, header, new_bh->b_size);
++                      mark_buffer_uptodate(new_bh, 1);
++                      unlock_buffer(new_bh);
++                      ext2_xattr_cache_insert(new_bh);
++                      
++                      ext2_xattr_update_super_block(sb);
++              }
++              mark_buffer_dirty(new_bh);
++              if (IS_SYNC(inode)) {
++                      ll_rw_block(WRITE, 1, &new_bh);
++                      wait_on_buffer(new_bh); 
++                      error = -EIO;
++                      if (buffer_req(new_bh) && !buffer_uptodate(new_bh))
++                              goto cleanup;
++              }
++      }
++
++      /* Update the inode. */
++      EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
++      inode->i_ctime = CURRENT_TIME;
++      if (IS_SYNC(inode)) {
++              error = ext2_sync_inode (inode);
++              if (error)
++                      goto cleanup;
++      } else
++              mark_inode_dirty(inode);
++
++      error = 0;
++      if (old_bh && old_bh != new_bh) {
++              /*
++               * If there was an old block, and we are not still using it,
++               * we now release the old block.
++              */
++              unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
++
++              if (refcount == 1) {
++                      /* Free the old block. */
++                      ea_bdebug(old_bh, "freeing");
++                      ext2_xattr_free_block(inode, old_bh->b_blocknr);
++                      mark_buffer_clean(old_bh);
++              } else {
++                      /* Decrement the refcount only. */
++                      refcount--;
++                      HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
++                      ext2_xattr_quota_free(inode);
++                      mark_buffer_dirty(old_bh);
++                      ea_bdebug(old_bh, "refcount now=%d", refcount);
++              }
++      }
++
++cleanup:
++      if (old_bh != new_bh)
++              brelse(new_bh);
++
++      return error;
++}
++
++/*
++ * ext2_xattr_delete_inode()
++ *
++ * Free extended attribute resources associated with this inode. This
++ * is called immediately before an inode is freed.
++ */
++void
++ext2_xattr_delete_inode(struct inode *inode)
++{
++      struct buffer_head *bh;
++      unsigned int block = EXT2_I(inode)->i_file_acl;
++
++      if (!block)
++              return;
++      down(&ext2_xattr_sem);
++
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh) {
++              ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
++                      "inode %ld: block %d read error", inode->i_ino, block);
++              goto cleanup;
++      }
++      ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++              ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              goto cleanup;
++      }
++      ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
++      if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
++              ext2_xattr_cache_remove(bh);
++              ext2_xattr_free_block(inode, block);
++              bforget(bh);
++              bh = NULL;
++      } else {
++              HDR(bh)->h_refcount = cpu_to_le32(
++                      le32_to_cpu(HDR(bh)->h_refcount) - 1);
++              mark_buffer_dirty(bh);
++              if (IS_SYNC(inode)) {
++                      ll_rw_block(WRITE, 1, &bh);
++                      wait_on_buffer(bh);
++              }
++              ext2_xattr_quota_free(inode);
++      }
++      EXT2_I(inode)->i_file_acl = 0;
++
++cleanup:
++      brelse(bh);
++      up(&ext2_xattr_sem);
++}
++
++/*
++ * ext2_xattr_put_super()
++ *
++ * This is called when a file system is unmounted.
++ */
++void
++ext2_xattr_put_super(struct super_block *sb)
++{
++#ifdef CONFIG_EXT2_FS_XATTR_SHARING
++      mb_cache_shrink(ext2_xattr_cache, sb->s_dev);
++#endif
++}
++
++#ifdef CONFIG_EXT2_FS_XATTR_SHARING
++
++/*
++ * ext2_xattr_cache_insert()
++ *
++ * Create a new entry in the extended attribute cache, and insert
++ * it unless such an entry is already in the cache.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++static int
++ext2_xattr_cache_insert(struct buffer_head *bh)
++{
++      __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
++      struct mb_cache_entry *ce;
++      int error;
++
++      ce = mb_cache_entry_alloc(ext2_xattr_cache);
++      if (!ce)
++              return -ENOMEM;
++      error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash);
++      if (error) {
++              mb_cache_entry_free(ce);
++              if (error == -EBUSY) {
++                      ea_bdebug(bh, "already in cache (%d cache entries)",
++                              atomic_read(&ext2_xattr_cache->c_entry_count));
++                      error = 0;
++              }
++      } else {
++              ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
++                        atomic_read(&ext2_xattr_cache->c_entry_count));
++              mb_cache_entry_release(ce);
++      }
++      return error;
++}
++
++/*
++ * ext2_xattr_cmp()
++ *
++ * Compare two extended attribute blocks for equality.
++ *
++ * Returns 0 if the blocks are equal, 1 if they differ, and
++ * a negative error number on errors.
++ */
++static int
++ext2_xattr_cmp(struct ext2_xattr_header *header1,
++             struct ext2_xattr_header *header2)
++{
++      struct ext2_xattr_entry *entry1, *entry2;
++
++      entry1 = ENTRY(header1+1);
++      entry2 = ENTRY(header2+1);
++      while (!IS_LAST_ENTRY(entry1)) {
++              if (IS_LAST_ENTRY(entry2))
++                      return 1;
++              if (entry1->e_hash != entry2->e_hash ||
++                  entry1->e_name_len != entry2->e_name_len ||
++                  entry1->e_value_size != entry2->e_value_size ||
++                  memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
++                      return 1;
++              if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
++                      return -EIO;
++              if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
++                         (char *)header2 + le16_to_cpu(entry2->e_value_offs),
++                         le32_to_cpu(entry1->e_value_size)))
++                      return 1;
++
++              entry1 = EXT2_XATTR_NEXT(entry1);
++              entry2 = EXT2_XATTR_NEXT(entry2);
++      }
++      if (!IS_LAST_ENTRY(entry2))
++              return 1;
++      return 0;
++}
++
++/*
++ * ext2_xattr_cache_find()
++ *
++ * Find an identical extended attribute block.
++ *
++ * Returns a pointer to the block found, or NULL if such a block was
++ * not found or an error occurred.
++ */
++static struct buffer_head *
++ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
++{
++      __u32 hash = le32_to_cpu(header->h_hash);
++      struct mb_cache_entry *ce;
++
++      if (!header->h_hash)
++              return NULL;  /* never share */
++      ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
++      ce = mb_cache_entry_find_first(ext2_xattr_cache, 0, inode->i_dev, hash);
++      while (ce) {
++              struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
++
++              if (!bh) {
++                      ext2_error(inode->i_sb, "ext2_xattr_cache_find",
++                              "inode %ld: block %ld read error",
++                              inode->i_ino, ce->e_block);
++              } else if (le32_to_cpu(HDR(bh)->h_refcount) >
++                         EXT2_XATTR_REFCOUNT_MAX) {
++                      ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block,
++                              le32_to_cpu(HDR(bh)->h_refcount),
++                              EXT2_XATTR_REFCOUNT_MAX);
++              } else if (!ext2_xattr_cmp(header, HDR(bh))) {
++                      ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
++                      mb_cache_entry_release(ce);
++                      return bh;
++              }
++              brelse(bh);
++              ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash);
++      }
++      return NULL;
++}
++
++/*
++ * ext2_xattr_cache_remove()
++ *
++ * Remove the cache entry of a block from the cache. Called when a
++ * block becomes invalid.
++ */
++static void
++ext2_xattr_cache_remove(struct buffer_head *bh)
++{
++      struct mb_cache_entry *ce;
++
++      ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_dev, bh->b_blocknr);
++      if (ce) {
++              ea_bdebug(bh, "removing (%d cache entries remaining)",
++                        atomic_read(&ext2_xattr_cache->c_entry_count)-1);
++              mb_cache_entry_free(ce);
++      } else 
++              ea_bdebug(bh, "no cache entry");
++}
++
++#define NAME_HASH_SHIFT 5
++#define VALUE_HASH_SHIFT 16
++
++/*
++ * ext2_xattr_hash_entry()
++ *
++ * Compute the hash of an extended attribute.
++ */
++static inline void ext2_xattr_hash_entry(struct ext2_xattr_header *header,
++                                       struct ext2_xattr_entry *entry)
++{
++      __u32 hash = 0;
++      char *name = entry->e_name;
++      int n;
++
++      for (n=0; n < entry->e_name_len; n++) {
++              hash = (hash << NAME_HASH_SHIFT) ^
++                     (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
++                     *name++;
++      }
++
++      if (entry->e_value_block == 0 && entry->e_value_size != 0) {
++              __u32 *value = (__u32 *)((char *)header +
++                      le16_to_cpu(entry->e_value_offs));
++              for (n = (le32_to_cpu(entry->e_value_size) +
++                   EXT2_XATTR_ROUND) >> EXT2_XATTR_PAD_BITS; n; n--) {
++                      hash = (hash << VALUE_HASH_SHIFT) ^
++                             (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
++                             le32_to_cpu(*value++);
++              }
++      }
++      entry->e_hash = cpu_to_le32(hash);
++}
++
++#undef NAME_HASH_SHIFT
++#undef VALUE_HASH_SHIFT
++
++#define BLOCK_HASH_SHIFT 16
++
++/*
++ * ext2_xattr_rehash()
++ *
++ * Re-compute the extended attribute hash value after an entry has changed.
++ */
++static void ext2_xattr_rehash(struct ext2_xattr_header *header,
++                            struct ext2_xattr_entry *entry)
++{
++      struct ext2_xattr_entry *here;
++      __u32 hash = 0;
++      
++      ext2_xattr_hash_entry(header, entry);
++      here = ENTRY(header+1);
++      while (!IS_LAST_ENTRY(here)) {
++              if (!here->e_hash) {
++                      /* Block is not shared if an entry's hash value == 0 */
++                      hash = 0;
++                      break;
++              }
++              hash = (hash << BLOCK_HASH_SHIFT) ^
++                     (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
++                     le32_to_cpu(here->e_hash);
++              here = EXT2_XATTR_NEXT(here);
++      }
++      header->h_hash = cpu_to_le32(hash);
++}
++
++#undef BLOCK_HASH_SHIFT
++
++int __init
++init_ext2_xattr(void)
++{
++      ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL,
++              sizeof(struct mb_cache_entry) +
++              sizeof(struct mb_cache_entry_index), 1, 61);
++      if (!ext2_xattr_cache)
++              return -ENOMEM;
++
++      return 0;
++}
++
++void
++exit_ext2_xattr(void)
++{
++      mb_cache_destroy(ext2_xattr_cache);
++}
++
++#else  /* CONFIG_EXT2_FS_XATTR_SHARING */
++
++int __init
++init_ext2_xattr(void)
++{
++      return 0;
++}
++
++void
++exit_ext2_xattr(void)
++{
++}
++
++#endif  /* CONFIG_EXT2_FS_XATTR_SHARING */
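The free-space arithmetic in ext2_xattr_set() above follows the on-disk layout this patch uses: entry descriptors are packed downward from the block header, attribute values are packed upward from the end of the block, and four null bytes terminate the entry list. The standalone sketch below reproduces that bookkeeping with simplified, hypothetical structures and macros (xattr_header, xattr_entry, XATTR_LEN, XATTR_SIZE are illustrative stand-ins, not the kernel's definitions); it prints the space consumed by adding one "user.comment" attribute to an otherwise empty 4 KiB block.

#include <stdio.h>
#include <stdint.h>

/* Simplified, hypothetical stand-ins for the on-disk structures. */
struct xattr_header {
        uint32_t h_magic, h_refcount, h_blocks, h_hash;
        uint32_t reserved[4];
};
struct xattr_entry {
        uint8_t  e_name_len;
        uint8_t  e_name_index;
        uint16_t e_value_offs;
        uint32_t e_value_block;
        uint32_t e_value_size;
        uint32_t e_hash;
        /* name bytes follow, padded to a 4-byte boundary */
};

#define XATTR_PAD   4
#define XATTR_ROUND (XATTR_PAD - 1)
/* Entry descriptor size, name included, rounded up to the pad boundary. */
#define XATTR_LEN(name_len) \
        (((name_len) + sizeof(struct xattr_entry) + XATTR_ROUND) & ~(size_t)XATTR_ROUND)
/* Space reserved for a value, rounded up to the pad boundary. */
#define XATTR_SIZE(size) (((size) + XATTR_ROUND) & ~(size_t)XATTR_ROUND)

/*
 * Bytes still unused in a block whose entry area (starting right after the
 * header) occupies used_entry_bytes and whose lowest value starts at offset
 * min_offs; sizeof(uint32_t) is the four null bytes ending the entry list.
 * This mirrors "free = min_offs - ((char*)last - (char*)header) - sizeof(__u32)".
 */
static long xattr_free_space(long used_entry_bytes, long min_offs)
{
        return min_offs - (long)sizeof(struct xattr_header)
                        - used_entry_bytes - (long)sizeof(uint32_t);
}

int main(void)
{
        long blocksize = 4096;
        long free_before = xattr_free_space(0, blocksize);   /* empty block */
        long needed = (long)(XATTR_LEN(sizeof("user.comment") - 1)
                             + XATTR_SIZE(100));              /* 100-byte value */

        printf("free before: %ld, needed: %ld, free after: %ld\n",
               free_before, needed, free_before - needed);
        return 0;
}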
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-2.4.20-root/fs/ext2/xattr_user.c     2003-05-07 18:08:03.000000000 +0800
+@@ -0,0 +1,103 @@
++/*
++ * linux/fs/ext2/xattr_user.c
++ * Handler for extended user attributes.
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/fs.h>
++#include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
++
++#ifdef CONFIG_EXT2_FS_POSIX_ACL
++# include <linux/ext2_acl.h>
++#endif
++
++#define XATTR_USER_PREFIX "user."
++
++static size_t
++ext2_xattr_user_list(char *list, struct inode *inode,
++                   const char *name, int name_len)
++{
++      const int prefix_len = sizeof(XATTR_USER_PREFIX)-1;
++
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return 0;
++
++      if (list) {
++              memcpy(list, XATTR_USER_PREFIX, prefix_len);
++              memcpy(list+prefix_len, name, name_len);
++              list[prefix_len + name_len] = '\0';
++      }
++      return prefix_len + name_len + 1;
++}
++
++static int
++ext2_xattr_user_get(struct inode *inode, const char *name,
++                  void *buffer, size_t size)
++{
++      int error;
++
++      if (strcmp(name, "") == 0)
++              return -EINVAL;
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return -ENOTSUP;
++#ifdef CONFIG_EXT2_FS_POSIX_ACL
++      error = ext2_permission_locked(inode, MAY_READ);
++#else
++      error = permission(inode, MAY_READ);
++#endif
++      if (error)
++              return error;
++
++      return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name,
++                            buffer, size);
++}
++
++static int
++ext2_xattr_user_set(struct inode *inode, const char *name,
++                  const void *value, size_t size, int flags)
++{
++      int error;
++
++      if (strcmp(name, "") == 0)
++              return -EINVAL;
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return -ENOTSUP;
++      if ( !S_ISREG(inode->i_mode) &&
++          (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
++              return -EPERM;
++#ifdef CONFIG_EXT2_FS_POSIX_ACL
++      error = ext2_permission_locked(inode, MAY_WRITE);
++#else
++      error = permission(inode, MAY_WRITE);
++#endif
++      if (error)
++              return error;
++  
++      return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name,
++                            value, size, flags);
++}
++
++struct ext2_xattr_handler ext2_xattr_user_handler = {
++      prefix: XATTR_USER_PREFIX,
++      list:   ext2_xattr_user_list,
++      get:    ext2_xattr_user_get,
++      set:    ext2_xattr_user_set,
++};
++
++int __init
++init_ext2_xattr_user(void)
++{
++      return ext2_xattr_register(EXT2_XATTR_INDEX_USER,
++                                 &ext2_xattr_user_handler);
++}
++
++void
++exit_ext2_xattr_user(void)
++{
++      ext2_xattr_unregister(EXT2_XATTR_INDEX_USER,
++                            &ext2_xattr_user_handler);
++}
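The user.* handler above is what the new user_xattr mount option exposes to applications (for example, mount -o user_xattr /dev/hda2 /mnt). A minimal userspace check is sketched below, assuming libc wrappers for the setxattr(2)/getxattr(2) system calls are available (modern glibc ships them in <sys/xattr.h>; systems contemporary with this patch used the attr package's <attr/xattr.h>); the file path argument is illustrative.

#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>          /* setxattr()/getxattr() wrappers */

int main(int argc, char *argv[])
{
        const char *path = argc > 1 ? argv[1] : "testfile";
        char value[64];
        ssize_t n;

        /* Create or replace a user.* attribute; this fails with
         * EOPNOTSUPP/ENOTSUP when user_xattr is not enabled on the mount. */
        if (setxattr(path, "user.comment", "hello", 5, 0) != 0) {
                perror("setxattr");
                return 1;
        }
        n = getxattr(path, "user.comment", value, sizeof(value));
        if (n < 0) {
                perror("getxattr");
                return 1;
        }
        printf("user.comment = %.*s\n", (int)n, value);
        return 0;
}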
+--- linux-2.4.20/fs/ext3/Makefile~linux-2.4.20-xattr-0.8.54    2003-05-05 19:01:02.000000000 +0800
++++ linux-2.4.20-root/fs/ext3/Makefile 2003-05-07 18:10:33.000000000 +0800
+@@ -1,5 +1,5 @@
+ #
+-# Makefile for the linux ext2-filesystem routines.
++# Makefile for the linux ext3-filesystem routines.
+ #
+ # Note! Dependencies are done automagically by 'make dep', which also
+ # removes any old dependencies. DON'T put your own dependencies here
+@@ -9,10 +9,14 @@
+ O_TARGET := ext3.o
+-export-objs :=        super.o inode.o
++export-objs := ext3-exports.o
+ obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+-              ioctl.o namei.o super.o symlink.o hash.o
++              ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o
+ obj-m    := $(O_TARGET)
++export-objs += xattr.o
++obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o
++obj-$(CONFIG_EXT3_FS_XATTR_USER) += xattr_user.o
++
+ include $(TOPDIR)/Rules.make
+--- linux-2.4.20/fs/ext3/file.c~linux-2.4.20-xattr-0.8.54      2003-05-05 19:01:02.000000000 +0800
++++ linux-2.4.20-root/fs/ext3/file.c   2003-05-07 18:08:03.000000000 +0800
+@@ -23,6 +23,7 @@
+ #include <linux/locks.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/smp_lock.h>
+@@ -126,5 +127,9 @@ struct file_operations ext3_file_operati
+ struct inode_operations ext3_file_inode_operations = {
+       truncate:       ext3_truncate,          /* BKL held */
+       setattr:        ext3_setattr,           /* BKL held */
++      setxattr:       ext3_setxattr,          /* BKL held */
++      getxattr:       ext3_getxattr,          /* BKL held */
++      listxattr:      ext3_listxattr,         /* BKL held */
++      removexattr:    ext3_removexattr,       /* BKL held */
+ };
+--- linux-2.4.20/fs/ext3/ialloc.c~linux-2.4.20-xattr-0.8.54    2002-11-29 07:53:15.000000000 +0800
++++ linux-2.4.20-root/fs/ext3/ialloc.c 2003-05-07 18:08:03.000000000 +0800
+@@ -17,6 +17,7 @@
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+ #include <linux/locks.h>
+@@ -216,6 +217,7 @@ void ext3_free_inode (handle_t *handle, 
+        * as writing the quota to disk may need the lock as well.
+        */
+       DQUOT_INIT(inode);
++      ext3_xattr_delete_inode(handle, inode);
+       DQUOT_FREE_INODE(inode);
+       DQUOT_DROP(inode);
+--- linux-2.4.20/fs/ext3/inode.c~linux-2.4.20-xattr-0.8.54     2002-11-29 07:53:15.000000000 +0800
++++ linux-2.4.20-root/fs/ext3/inode.c  2003-05-07 18:08:03.000000000 +0800
+@@ -39,6 +39,18 @@
+  */
+ #undef SEARCH_FROM_ZERO
++/*
++ * Test whether an inode is a fast symlink.
++ */
++static inline int ext3_inode_is_fast_symlink(struct inode *inode)
++{
++      int ea_blocks = inode->u.ext3_i.i_file_acl ?
++              (inode->i_sb->s_blocksize >> 9) : 0;
++
++      return (S_ISLNK(inode->i_mode) &&
++              inode->i_blocks - ea_blocks == 0);
++}
++
+ /* The ext3 forget function must perform a revoke if we are freeing data
+  * which has been journaled.  Metadata (eg. indirect blocks) must be
+  * revoked in all cases. 
+@@ -48,7 +60,7 @@
+  * still needs to be revoked.
+  */
+-static int ext3_forget(handle_t *handle, int is_metadata,
++int ext3_forget(handle_t *handle, int is_metadata,
+                      struct inode *inode, struct buffer_head *bh,
+                      int blocknr)
+ {
+@@ -164,9 +176,7 @@ void ext3_delete_inode (struct inode * i
+ {
+       handle_t *handle;
+       
+-      if (is_bad_inode(inode) ||
+-          inode->i_ino == EXT3_ACL_IDX_INO ||
+-          inode->i_ino == EXT3_ACL_DATA_INO)
++      if (is_bad_inode(inode))
+               goto no_delete;
+       lock_kernel();
+@@ -1855,6 +1865,8 @@ void ext3_truncate(struct inode * inode)
+       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+           S_ISLNK(inode->i_mode)))
+               return;
++      if (ext3_inode_is_fast_symlink(inode))
++              return;
+       if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+               return;
+@@ -2002,8 +2014,6 @@ int ext3_get_inode_loc (struct inode *in
+       struct ext3_group_desc * gdp;
+               
+       if ((inode->i_ino != EXT3_ROOT_INO &&
+-              inode->i_ino != EXT3_ACL_IDX_INO &&
+-              inode->i_ino != EXT3_ACL_DATA_INO &&
+               inode->i_ino != EXT3_JOURNAL_INO &&
+               inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
+               inode->i_ino > le32_to_cpu(
+@@ -2130,10 +2140,7 @@ void ext3_read_inode(struct inode * inod
+       brelse (iloc.bh);
+-      if (inode->i_ino == EXT3_ACL_IDX_INO ||
+-          inode->i_ino == EXT3_ACL_DATA_INO)
+-              /* Nothing to do */ ;
+-      else if (S_ISREG(inode->i_mode)) {
++      if (S_ISREG(inode->i_mode)) {
+               inode->i_op = &ext3_file_inode_operations;
+               inode->i_fop = &ext3_file_operations;
+               inode->i_mapping->a_ops = &ext3_aops;
+@@ -2141,15 +2148,17 @@ void ext3_read_inode(struct inode * inod
+               inode->i_op = &ext3_dir_inode_operations;
+               inode->i_fop = &ext3_dir_operations;
+       } else if (S_ISLNK(inode->i_mode)) {
+-              if (!inode->i_blocks)
++              if (ext3_inode_is_fast_symlink(inode))
+                       inode->i_op = &ext3_fast_symlink_inode_operations;
+               else {
+-                      inode->i_op = &page_symlink_inode_operations;
++                      inode->i_op = &ext3_symlink_inode_operations;
+                       inode->i_mapping->a_ops = &ext3_aops;
+               }
+-      } else 
++      } else {
++              inode->i_op = &ext3_special_inode_operations;
+               init_special_inode(inode, inode->i_mode,
+                                  le32_to_cpu(iloc.raw_inode->i_block[0]));
++      }
+       /* inode->i_attr_flags = 0;                             unused */
+       if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) {
+               /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */
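The fast-symlink test introduced above has to discount the external attribute block because i_blocks counts 512-byte sectors, so an inode whose only on-disk block is its xattr block still reports a non-zero i_blocks. A small worked example of just that arithmetic follows; the S_ISLNK check is omitted and the plain integer parameters are illustrative stand-ins for the kernel's inode fields.

#include <stdio.h>

/* i_blocks is in 512-byte sectors; one external xattr block of blocksize
 * bytes therefore contributes blocksize >> 9 sectors that must be ignored. */
static int target_stored_in_inode(unsigned long i_blocks,
                                  unsigned long i_file_acl,
                                  unsigned int blocksize)
{
        unsigned long ea_blocks = i_file_acl ? (blocksize >> 9) : 0;

        return i_blocks - ea_blocks == 0;
}

int main(void)
{
        /* Fast symlink that also carries a 4 KiB xattr block: i_blocks == 8. */
        printf("%d\n", target_stored_in_inode(8, 1234, 4096));   /* prints 1 */
        /* Ordinary symlink with its target in a data block, no xattr block. */
        printf("%d\n", target_stored_in_inode(8, 0, 4096));      /* prints 0 */
        return 0;
}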
+--- linux-2.4.20/fs/ext3/namei.c~linux-2.4.20-xattr-0.8.54     2003-05-05 19:01:05.000000000 +0800
++++ linux-2.4.20-root/fs/ext3/namei.c  2003-05-07 18:08:03.000000000 +0800
+@@ -29,6 +29,7 @@
+ #include <linux/sched.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/fcntl.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+@@ -1611,7 +1612,7 @@ static int ext3_mkdir(struct inode * dir
+       if (IS_SYNC(dir))
+               handle->h_sync = 1;
+-      inode = ext3_new_inode (handle, dir, S_IFDIR);
++      inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_stop;
+@@ -1619,7 +1620,6 @@ static int ext3_mkdir(struct inode * dir
+       inode->i_op = &ext3_dir_inode_operations;
+       inode->i_fop = &ext3_dir_operations;
+       inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+-      inode->i_blocks = 0;    
+       dir_block = ext3_bread (handle, inode, 0, 1, &err);
+       if (!dir_block) {
+               inode->i_nlink--; /* is this nlink == 0? */
+@@ -1646,9 +1646,6 @@ static int ext3_mkdir(struct inode * dir
+       BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
+       ext3_journal_dirty_metadata(handle, dir_block);
+       brelse (dir_block);
+-      inode->i_mode = S_IFDIR | mode;
+-      if (dir->i_mode & S_ISGID)
+-              inode->i_mode |= S_ISGID;
+       ext3_mark_inode_dirty(handle, inode);
+       err = ext3_add_entry (handle, dentry, inode);
+       if (err) {
+@@ -2017,7 +2014,7 @@ static int ext3_symlink (struct inode * 
+               goto out_stop;
+       if (l > sizeof (EXT3_I(inode)->i_data)) {
+-              inode->i_op = &page_symlink_inode_operations;
++              inode->i_op = &ext3_symlink_inode_operations;
+               inode->i_mapping->a_ops = &ext3_aops;
+               /*
+                * block_symlink() calls back into ext3_prepare/commit_write.
+@@ -2244,4 +2241,16 @@ struct inode_operations ext3_dir_inode_o
+       rmdir:          ext3_rmdir,             /* BKL held */
+       mknod:          ext3_mknod,             /* BKL held */
+       rename:         ext3_rename,            /* BKL held */
++      setxattr:       ext3_setxattr,          /* BKL held */
++      getxattr:       ext3_getxattr,          /* BKL held */
++      listxattr:      ext3_listxattr,         /* BKL held */
++      removexattr:    ext3_removexattr,       /* BKL held */
+ };
++
++struct inode_operations ext3_special_inode_operations = {
++      setxattr:       ext3_setxattr,          /* BKL held */
++      getxattr:       ext3_getxattr,          /* BKL held */
++      listxattr:      ext3_listxattr,         /* BKL held */
++      removexattr:    ext3_removexattr,       /* BKL held */
++};
++
+--- linux-2.4.20/fs/ext3/super.c~linux-2.4.20-xattr-0.8.54     2003-05-05 19:01:02.000000000 +0800
++++ linux-2.4.20-root/fs/ext3/super.c  2003-05-07 18:08:39.000000000 +0800
+@@ -24,6 +24,7 @@
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/slab.h>
+ #include <linux/init.h>
+ #include <linux/locks.h>
+@@ -404,6 +405,7 @@ void ext3_put_super (struct super_block 
+       kdev_t j_dev = sbi->s_journal->j_dev;
+       int i;
++      ext3_xattr_put_super(sb);
+       journal_destroy(sbi->s_journal);
+       if (!(sb->s_flags & MS_RDONLY)) {
+               EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+@@ -499,6 +501,7 @@ static int parse_options (char * options
+                         int is_remount)
+ {
+       unsigned long *mount_options = &sbi->s_mount_opt;
++      
+       uid_t *resuid = &sbi->s_resuid;
+       gid_t *resgid = &sbi->s_resgid;
+       char * this_char;
+@@ -511,6 +514,13 @@ static int parse_options (char * options
+            this_char = strtok (NULL, ",")) {
+               if ((value = strchr (this_char, '=')) != NULL)
+                       *value++ = 0;
++#ifdef CONFIG_EXT3_FS_XATTR_USER
++              if (!strcmp (this_char, "user_xattr"))
++                      set_opt (*mount_options, XATTR_USER);
++              else if (!strcmp (this_char, "nouser_xattr"))
++                      clear_opt (*mount_options, XATTR_USER);
++              else
++#endif
+               if (!strcmp (this_char, "bsddf"))
+                       clear_opt (*mount_options, MINIX_DF);
+               else if (!strcmp (this_char, "nouid32")) {
+@@ -928,6 +938,12 @@ struct super_block * ext3_read_super (st
+       sbi->s_mount_opt = 0;
+       sbi->s_resuid = EXT3_DEF_RESUID;
+       sbi->s_resgid = EXT3_DEF_RESGID;
++
++      /* Default extended attribute flags */
++#ifdef CONFIG_EXT3_FS_XATTR_USER
++      /* set_opt(sbi->s_mount_opt, XATTR_USER); */
++#endif
++
+       if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) {
+               sb->s_dev = 0;
+               goto out_fail;
+@@ -1767,17 +1783,29 @@ static DECLARE_FSTYPE_DEV(ext3_fs_type, 
+ static int __init init_ext3_fs(void)
+ {
+-        return register_filesystem(&ext3_fs_type);
++      int error = init_ext3_xattr();
++      if (error)
++              return error;
++      error = init_ext3_xattr_user();
++      if (error)
++              goto fail;
++      error = register_filesystem(&ext3_fs_type);
++      if (!error)
++              return 0;
++      
++      exit_ext3_xattr_user();
++fail:
++      exit_ext3_xattr();
++      return error;
+ }
+ static void __exit exit_ext3_fs(void)
+ {
+       unregister_filesystem(&ext3_fs_type);
++      exit_ext3_xattr_user();
++      exit_ext3_xattr();
+ }
+-EXPORT_SYMBOL(ext3_force_commit);
+-EXPORT_SYMBOL(ext3_bread);
+-
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
+ MODULE_LICENSE("GPL");
+--- linux-2.4.20/fs/ext3/symlink.c~linux-2.4.20-xattr-0.8.54   2001-11-10 06:25:04.000000000 +0800
++++ linux-2.4.20-root/fs/ext3/symlink.c        2003-05-07 18:08:03.000000000 +0800
+@@ -20,6 +20,7 @@
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
+ static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen)
+ {
+@@ -33,7 +34,20 @@ static int ext3_follow_link(struct dentr
+       return vfs_follow_link(nd, s);
+ }
++struct inode_operations ext3_symlink_inode_operations = {
++      readlink:       page_readlink,          /* BKL not held.  Don't need */
++      follow_link:    page_follow_link,       /* BKL not held.  Don't need */
++      setxattr:       ext3_setxattr,          /* BKL held */
++      getxattr:       ext3_getxattr,          /* BKL held */
++      listxattr:      ext3_listxattr,         /* BKL held */
++      removexattr:    ext3_removexattr,       /* BKL held */
++};
++
+ struct inode_operations ext3_fast_symlink_inode_operations = {
+       readlink:       ext3_readlink,          /* BKL not held.  Don't need */
+       follow_link:    ext3_follow_link,       /* BKL not held.  Don't need */
++      setxattr:       ext3_setxattr,          /* BKL held */
++      getxattr:       ext3_getxattr,          /* BKL held */
++      listxattr:      ext3_listxattr,         /* BKL held */
++      removexattr:    ext3_removexattr,       /* BKL held */
+ };
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-2.4.20-root/fs/ext3/xattr.c  2003-05-07 18:09:23.000000000 +0800
+@@ -0,0 +1,1225 @@
++/*
++ * linux/fs/ext3/xattr.c
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ *
++ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
++ * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>.
++ * Extended attributes for symlinks and special files added per
++ *  suggestion of Luka Renko <luka.renko@hermes.si>.
++ */
++
++/*
++ * Extended attributes are stored on disk blocks allocated outside of
++ * any inode. The i_file_acl field is then made to point to this allocated
++ * block. If all extended attributes of an inode are identical, these
++ * inodes may share the same extended attribute block. Such situations
++ * are automatically detected by keeping a cache of recent attribute block
++ * numbers and hashes over the block's contents in memory.
++ *
++ *
++ * Extended attribute block layout:
++ *
++ *   +------------------+
++ *   | header           |
++ *   | entry 1          | |
++ *   | entry 2          | | growing downwards
++ *   | entry 3          | v
++ *   | four null bytes  |
++ *   | . . .            |
++ *   | value 1          | ^
++ *   | value 3          | | growing upwards
++ *   | value 2          | |
++ *   +------------------+
++ *
++ * The block header is followed by multiple entry descriptors. These entry
++ * descriptors are variable in size, and aligned to EXT3_XATTR_PAD
++ * byte boundaries. The entry descriptors are sorted by attribute name,
++ * so that two extended attribute blocks can be compared efficiently.
++ *
++ * Attribute values are aligned to the end of the block, stored in
++ * no specific order. They are also padded to EXT3_XATTR_PAD byte
++ * boundaries. No additional gaps are left between them.
++ *
++ * Locking strategy
++ * ----------------
++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of
++ * the xattr inode operations are called, so we are guaranteed that only one
++ * processes accesses extended attributes of an inode at any time.
++ *
++ * For writing we also grab the ext3_xattr_sem semaphore. This ensures that
++ * only a single process is modifying an extended attribute block, even
++ * if the block is shared among inodes.
++ *
++ * Note for porting to 2.5
++ * -----------------------
++ * The BKL will no longer be held in the xattr inode operations.
++ */
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/locks.h>
++#include <linux/slab.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
++#include <linux/mbcache.h>
++#include <linux/quotaops.h>
++#include <asm/semaphore.h>
++#include <linux/compatmac.h>
++
++#define EXT3_EA_USER "user."
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1)
++#endif
++
++#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data))
++#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr))
++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
++
++#ifdef EXT3_XATTR_DEBUG
++# define ea_idebug(inode, f...) do { \
++              printk(KERN_DEBUG "inode %s:%ld: ", \
++                      kdevname(inode->i_dev), inode->i_ino); \
++              printk(f); \
++              printk("\n"); \
++      } while (0)
++# define ea_bdebug(bh, f...) do { \
++              printk(KERN_DEBUG "block %s:%ld: ", \
++                      kdevname(bh->b_dev), bh->b_blocknr); \
++              printk(f); \
++              printk("\n"); \
++      } while (0)
++#else
++# define ea_idebug(f...)
++# define ea_bdebug(f...)
++#endif
++
++static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *,
++                         struct ext3_xattr_header *);
++
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++
++static int ext3_xattr_cache_insert(struct buffer_head *);
++static struct buffer_head *ext3_xattr_cache_find(struct inode *,
++                                               struct ext3_xattr_header *);
++static void ext3_xattr_cache_remove(struct buffer_head *);
++static void ext3_xattr_rehash(struct ext3_xattr_header *,
++                            struct ext3_xattr_entry *);
++
++static struct mb_cache *ext3_xattr_cache;
++
++#else
++# define ext3_xattr_cache_insert(bh) 0
++# define ext3_xattr_cache_find(inode, header) NULL
++# define ext3_xattr_cache_remove(bh) while(0) {}
++# define ext3_xattr_rehash(header, entry) while(0) {}
++#endif
++
++/*
++ * If a file system does not share extended attributes among inodes,
++ * we should not need the ext3_xattr_sem semaphore. However, the
++ * filesystem may still contain shared blocks, so we always take
++ * the lock.
++ */
++
++DECLARE_MUTEX(ext3_xattr_sem);
++
++static inline int
++ext3_xattr_new_block(handle_t *handle, struct inode *inode,
++                   int * errp, int force)
++{
++      struct super_block *sb = inode->i_sb;
++      int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
++              EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb);
++
++      /* How can we enforce the allocation? */
++      int block = ext3_new_block(handle, inode, goal, 0, 0, errp);
++#ifdef OLD_QUOTAS
++      if (!*errp)
++              inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#endif
++      return block;
++}
++
++static inline int
++ext3_xattr_quota_alloc(struct inode *inode, int force)
++{
++      /* How can we enforce the allocation? */
++#ifdef OLD_QUOTAS
++      int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
++      if (!error)
++              inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#else
++      int error = DQUOT_ALLOC_BLOCK(inode, 1);
++#endif
++      return error;
++}
++
++#ifdef OLD_QUOTAS
++
++static inline void
++ext3_xattr_quota_free(struct inode *inode)
++{
++      DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
++      inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++static inline void
++ext3_xattr_free_block(handle_t *handle, struct inode * inode,
++                    unsigned long block)
++{
++      ext3_free_blocks(handle, inode, block, 1);
++      inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++#else
++# define ext3_xattr_quota_free(inode) \
++      DQUOT_FREE_BLOCK(inode, 1)
++# define ext3_xattr_free_block(handle, inode, block) \
++      ext3_free_blocks(handle, inode, block, 1)
++#endif
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
++
++static inline struct buffer_head *
++sb_bread(struct super_block *sb, int block)
++{
++      return bread(sb->s_dev, block, sb->s_blocksize);
++}
++
++static inline struct buffer_head *
++sb_getblk(struct super_block *sb, int block)
++{
++      return getblk(sb->s_dev, block, sb->s_blocksize);
++}
++
++#endif
++
++struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX];
++rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED;
++
++int
++ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler)
++{
++      int error = -EINVAL;
++
++      if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++              write_lock(&ext3_handler_lock);
++              if (!ext3_xattr_handlers[name_index-1]) {
++                      ext3_xattr_handlers[name_index-1] = handler;
++                      error = 0;
++              }
++              write_unlock(&ext3_handler_lock);
++      }
++      return error;
++}
++
++void
++ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler)
++{
++      if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++              write_lock(&ext3_handler_lock);
++              ext3_xattr_handlers[name_index-1] = NULL;
++              write_unlock(&ext3_handler_lock);
++      }
++}
++
++static inline const char *
++strcmp_prefix(const char *a, const char *a_prefix)
++{
++      while (*a_prefix && *a == *a_prefix) {
++              a++;
++              a_prefix++;
++      }
++      return *a_prefix ? NULL : a;
++}
++
++/*
++ * Decode the extended attribute name, and translate it into
++ * the name_index and name suffix.
++ */
++static inline struct ext3_xattr_handler *
++ext3_xattr_resolve_name(const char **name)
++{
++      struct ext3_xattr_handler *handler = NULL;
++      int i;
++
++      if (!*name)
++              return NULL;
++      read_lock(&ext3_handler_lock);
++      for (i=0; i<EXT3_XATTR_INDEX_MAX; i++) {
++              if (ext3_xattr_handlers[i]) {
++                      const char *n = strcmp_prefix(*name,
++                              ext3_xattr_handlers[i]->prefix);
++                      if (n) {
++                              handler = ext3_xattr_handlers[i];
++                              *name = n;
++                              break;
++                      }
++              }
++      }
++      read_unlock(&ext3_handler_lock);
++      return handler;
++}
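
The resolver above splits a full attribute name at the first registered prefix and hands the remaining suffix to that handler. A standalone sketch, not part of the patch, restating strcmp_prefix() to show how a name such as "user.foo" is split into the "user." prefix and the on-disk suffix "foo":

#include <stdio.h>

/* Same helper as in the patch: returns the suffix after a_prefix,
 * or NULL if a does not start with a_prefix. */
static const char *
strcmp_prefix(const char *a, const char *a_prefix)
{
	while (*a_prefix && *a == *a_prefix) {
		a++;
		a_prefix++;
	}
	return *a_prefix ? NULL : a;
}

int main(void)
{
	const char *name = "user.foo";
	const char *suffix = strcmp_prefix(name, "user.");

	if (suffix)
		printf("handler prefix \"user.\", on-disk name \"%s\"\n", suffix);
	else
		printf("no handler registered for \"%s\"\n", name);
	return 0;
}
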
++
++static inline struct ext3_xattr_handler *
++ext3_xattr_handler(int name_index)
++{
++      struct ext3_xattr_handler *handler = NULL;
++      if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++              read_lock(&ext3_handler_lock);
++              handler = ext3_xattr_handlers[name_index-1];
++              read_unlock(&ext3_handler_lock);
++      }
++      return handler;
++}
++
++/*
++ * Inode operation getxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_getxattr(struct dentry *dentry, const char *name,
++            void *buffer, size_t size)
++{
++      struct ext3_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      handler = ext3_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->get(inode, name, buffer, size);
++}
++
++/*
++ * Inode operation listxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++      return ext3_xattr_list(dentry->d_inode, buffer, size);
++}
++
++/*
++ * Inode operation setxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext3_setxattr(struct dentry *dentry, const char *name,
++            const void *value, size_t size, int flags)
++{
++      struct ext3_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      if (size == 0)
++              value = "";  /* empty EA, do not remove */
++      handler = ext3_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->set(inode, name, value, size, flags);
++}
++
++/*
++ * Inode operation removexattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext3_removexattr(struct dentry *dentry, const char *name)
++{
++      struct ext3_xattr_handler *handler;
++      struct inode *inode = dentry->d_inode;
++
++      handler = ext3_xattr_resolve_name(&name);
++      if (!handler)
++              return -ENOTSUP;
++      return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
++}
++
++/*
++ * ext3_xattr_get()
++ *
++ * Copy an extended attribute into the buffer provided, or compute the
++ * buffer size required. If buffer is NULL, only the size of the buffer
++ * required is computed.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext3_xattr_get(struct inode *inode, int name_index, const char *name,
++             void *buffer, size_t buffer_size)
++{
++      struct buffer_head *bh = NULL;
++      struct ext3_xattr_entry *entry;
++      unsigned int block, size;
++      char *end;
++      int name_len, error;
++
++      ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
++                name_index, name, buffer, (long)buffer_size);
++
++      if (name == NULL)
++              return -EINVAL;
++      if (!EXT3_I(inode)->i_file_acl)
++              return -ENOATTR;
++      block = EXT3_I(inode)->i_file_acl;
++      ea_idebug(inode, "reading block %d", block);
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh)
++              return -EIO;
++      ea_bdebug(bh, "b_count=%d, refcount=%d",
++              atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++      end = bh->b_data + bh->b_size;
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block:    ext3_error(inode->i_sb, "ext3_xattr_get",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              error = -EIO;
++              goto cleanup;
++      }
++      /* find named attribute */
++      name_len = strlen(name);
++
++      error = -ERANGE;
++      if (name_len > 255)
++              goto cleanup;
++      entry = FIRST_ENTRY(bh);
++      while (!IS_LAST_ENTRY(entry)) {
++              struct ext3_xattr_entry *next =
++                      EXT3_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++              if (name_index == entry->e_name_index &&
++                  name_len == entry->e_name_len &&
++                  memcmp(name, entry->e_name, name_len) == 0)
++                      goto found;
++              entry = next;
++      }
++      /* Check the remaining name entries */
++      while (!IS_LAST_ENTRY(entry)) {
++              struct ext3_xattr_entry *next =
++                      EXT3_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++              entry = next;
++      }
++      if (ext3_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      error = -ENOATTR;
++      goto cleanup;
++found:
++      /* check the buffer size */
++      if (entry->e_value_block != 0)
++              goto bad_block;
++      size = le32_to_cpu(entry->e_value_size);
++      if (size > inode->i_sb->s_blocksize ||
++          le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
++              goto bad_block;
++
++      if (ext3_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      if (buffer) {
++              error = -ERANGE;
++              if (size > buffer_size)
++                      goto cleanup;
++              /* return value of attribute */
++              memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
++                      size);
++      }
++      error = size;
++
++cleanup:
++      brelse(bh);
++
++      return error;
++}
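
The NULL-buffer size query implemented by ext3_xattr_get() is the same convention user space sees through getxattr(2). A minimal userspace sketch of the usual two-call pattern, not part of the patch; the header name is an assumption (<sys/xattr.h> on current glibc, <attr/xattr.h> with the attr package of the 2.4 era), and "user.comment" is just an example attribute:

#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";
	ssize_t size = getxattr(path, "user.comment", NULL, 0);	/* size query */
	char *buf;

	if (size < 0) {
		perror("getxattr (size query)");	/* ENODATA if the attribute is absent */
		return 1;
	}
	buf = malloc(size);
	if (!buf)
		return 1;
	size = getxattr(path, "user.comment", buf, size);	/* fetch the value */
	if (size < 0) {
		perror("getxattr");
		free(buf);
		return 1;
	}
	printf("user.comment is %zd bytes\n", size);
	free(buf);
	return 0;
}
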
++
++/*
++ * ext3_xattr_list()
++ *
++ * Copy a list of attribute names into the buffer provided, or compute
++ * the buffer size required. If buffer is NULL, only the size of the
++ * buffer required is computed.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
++{
++      struct buffer_head *bh = NULL;
++      struct ext3_xattr_entry *entry;
++      unsigned int block, size = 0;
++      char *buf, *end;
++      int error;
++
++      ea_idebug(inode, "buffer=%p, buffer_size=%ld",
++                buffer, (long)buffer_size);
++
++      if (!EXT3_I(inode)->i_file_acl)
++              return 0;
++      block = EXT3_I(inode)->i_file_acl;
++      ea_idebug(inode, "reading block %d", block);
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh)
++              return -EIO;
++      ea_bdebug(bh, "b_count=%d, refcount=%d",
++              atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++      end = bh->b_data + bh->b_size;
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block:    ext3_error(inode->i_sb, "ext3_xattr_list",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              error = -EIO;
++              goto cleanup;
++      }
++      /* compute the size required for the list of attribute names */
++      for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++           entry = EXT3_XATTR_NEXT(entry)) {
++              struct ext3_xattr_handler *handler;
++              struct ext3_xattr_entry *next =
++                      EXT3_XATTR_NEXT(entry);
++              if ((char *)next >= end)
++                      goto bad_block;
++
++              handler = ext3_xattr_handler(entry->e_name_index);
++              if (handler)
++                      size += handler->list(NULL, inode, entry->e_name,
++                                            entry->e_name_len);
++      }
++
++      if (ext3_xattr_cache_insert(bh))
++              ea_idebug(inode, "cache insert failed");
++      if (!buffer) {
++              error = size;
++              goto cleanup;
++      } else {
++              error = -ERANGE;
++              if (size > buffer_size)
++                      goto cleanup;
++      }
++
++      /* list the attribute names */
++      buf = buffer;
++      for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++           entry = EXT3_XATTR_NEXT(entry)) {
++              struct ext3_xattr_handler *handler;
++
++              handler = ext3_xattr_handler(entry->e_name_index);
++              if (handler)
++                      buf += handler->list(buf, inode, entry->e_name,
++                                           entry->e_name_len);
++      }
++      error = size;
++
++cleanup:
++      brelse(bh);
++
++      return error;
++}
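
Each handler's list() callback appends one NUL-terminated name (see ext3_xattr_user_list() later in this patch), so the buffer assembled by ext3_xattr_list() is a packed sequence of NUL-terminated strings. The userspace listxattr(2) call returns the same format; a minimal sketch, not part of the patch, under the same header assumption as above:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";
	ssize_t size = listxattr(path, NULL, 0);	/* size query, as above */
	char *list, *p;

	if (size <= 0)
		return size < 0;			/* error, or no attributes */
	list = malloc(size);
	if (!list)
		return 1;
	size = listxattr(path, list, size);
	if (size < 0) {
		perror("listxattr");
		free(list);
		return 1;
	}
	for (p = list; p < list + size; p += strlen(p) + 1)
		printf("%s\n", p);			/* e.g. "user.comment" */
	free(list);
	return 0;
}
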
++
++/*
++ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is
++ * not set, set it.
++ */
++static void ext3_xattr_update_super_block(handle_t *handle,
++                                        struct super_block *sb)
++{
++      if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
++              return;
++
++      lock_super(sb);
++      ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++      EXT3_SB(sb)->s_feature_compat |= EXT3_FEATURE_COMPAT_EXT_ATTR;
++#endif
++      EXT3_SB(sb)->s_es->s_feature_compat |=
++              cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR);
++      sb->s_dirt = 1;
++      ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
++      unlock_super(sb);
++}
++
++/*
++ * ext3_xattr_set()
++ *
++ * Create, replace or remove an extended attribute for this inode. Value
++ * is NULL to remove an existing extended attribute, and non-NULL to
++ * either replace an existing extended attribute, or create a new extended
++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE specify that an
++ * extended attribute must exist and must not exist prior to the call,
++ * respectively.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++int
++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
++             const char *name, const void *value, size_t value_len, int flags)
++{
++      struct super_block *sb = inode->i_sb;
++      struct buffer_head *bh = NULL;
++      struct ext3_xattr_header *header = NULL;
++      struct ext3_xattr_entry *here, *last;
++      unsigned int name_len;
++      int block = EXT3_I(inode)->i_file_acl;
++      int min_offs = sb->s_blocksize, not_found = 1, free, error;
++      char *end;
++      
++      /*
++       * header -- Points either into bh, or to a temporarily
++       *           allocated buffer.
++       * here -- The named entry found, or the place for inserting, within
++       *         the block pointed to by header.
++       * last -- Points right after the last named entry within the block
++       *         pointed to by header.
++       * min_offs -- The offset of the first value (values are aligned
++       *             towards the end of the block).
++       * end -- Points right after the block pointed to by header.
++       */
++      
++      ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
++                name_index, name, value, (long)value_len);
++
++      if (IS_RDONLY(inode))
++              return -EROFS;
++      if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
++              return -EPERM;
++      if (value == NULL)
++              value_len = 0;
++      if (name == NULL)
++              return -EINVAL;
++      name_len = strlen(name);
++      if (name_len > 255 || value_len > sb->s_blocksize)
++              return -ERANGE;
++      down(&ext3_xattr_sem);
++
++      if (block) {
++              /* The inode already has an extended attribute block. */
++              bh = sb_bread(sb, block);
++              error = -EIO;
++              if (!bh)
++                      goto cleanup;
++              ea_bdebug(bh, "b_count=%d, refcount=%d",
++                      atomic_read(&(bh->b_count)),
++                      le32_to_cpu(HDR(bh)->h_refcount));
++              header = HDR(bh);
++              end = bh->b_data + bh->b_size;
++              if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++                  header->h_blocks != cpu_to_le32(1)) {
++bad_block:            ext3_error(sb, "ext3_xattr_set",
++                              "inode %ld: bad block %d", inode->i_ino, block);
++                      error = -EIO;
++                      goto cleanup;
++              }
++              /* Find the named attribute. */
++              here = FIRST_ENTRY(bh);
++              while (!IS_LAST_ENTRY(here)) {
++                      struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here);
++                      if ((char *)next >= end)
++                              goto bad_block;
++                      if (!here->e_value_block && here->e_value_size) {
++                              int offs = le16_to_cpu(here->e_value_offs);
++                              if (offs < min_offs)
++                                      min_offs = offs;
++                      }
++                      not_found = name_index - here->e_name_index;
++                      if (!not_found)
++                              not_found = name_len - here->e_name_len;
++                      if (!not_found)
++                              not_found = memcmp(name, here->e_name,name_len);
++                      if (not_found <= 0)
++                              break;
++                      here = next;
++              }
++              last = here;
++              /* We still need to compute min_offs and last. */
++              while (!IS_LAST_ENTRY(last)) {
++                      struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last);
++                      if ((char *)next >= end)
++                              goto bad_block;
++                      if (!last->e_value_block && last->e_value_size) {
++                              int offs = le16_to_cpu(last->e_value_offs);
++                              if (offs < min_offs)
++                                      min_offs = offs;
++                      }
++                      last = next;
++              }
++
++              /* Check whether we have enough space left. */
++              free = min_offs - ((char*)last - (char*)header) - sizeof(__u32);
++      } else {
++              /* We will use a new extended attribute block. */
++              free = sb->s_blocksize -
++                      sizeof(struct ext3_xattr_header) - sizeof(__u32);
++              here = last = NULL;  /* avoid gcc uninitialized warning. */
++      }
++
++      if (not_found) {
++              /* Request to remove a nonexistent attribute? */
++              error = -ENOATTR;
++              if (flags & XATTR_REPLACE)
++                      goto cleanup;
++              error = 0;
++              if (value == NULL)
++                      goto cleanup;
++              else
++                      free -= EXT3_XATTR_LEN(name_len);
++      } else {
++              /* Request to create an existing attribute? */
++              error = -EEXIST;
++              if (flags & XATTR_CREATE)
++                      goto cleanup;
++              if (!here->e_value_block && here->e_value_size) {
++                      unsigned int size = le32_to_cpu(here->e_value_size);
++
++                      if (le16_to_cpu(here->e_value_offs) + size > 
++                          sb->s_blocksize || size > sb->s_blocksize)
++                              goto bad_block;
++                      free += EXT3_XATTR_SIZE(size);
++              }
++      }
++      free -= EXT3_XATTR_SIZE(value_len);
++      error = -ENOSPC;
++      if (free < 0)
++              goto cleanup;
++
++      /* Here we know that we can set the new attribute. */
++
++      if (header) {
++              if (header->h_refcount == cpu_to_le32(1)) {
++                      ea_bdebug(bh, "modifying in-place");
++                      ext3_xattr_cache_remove(bh);
++                      error = ext3_journal_get_write_access(handle, bh);
++                      if (error)
++                              goto cleanup;
++              } else {
++                      int offset;
++
++                      ea_bdebug(bh, "cloning");
++                      header = kmalloc(bh->b_size, GFP_KERNEL);
++                      error = -ENOMEM;
++                      if (header == NULL)
++                              goto cleanup;
++                      memcpy(header, HDR(bh), bh->b_size);
++                      header->h_refcount = cpu_to_le32(1);
++                      offset = (char *)header - bh->b_data;
++                      here = ENTRY((char *)here + offset);
++                      last = ENTRY((char *)last + offset);
++              }
++      } else {
++              /* Allocate a buffer where we construct the new block. */
++              header = kmalloc(sb->s_blocksize, GFP_KERNEL);
++              error = -ENOMEM;
++              if (header == NULL)
++                      goto cleanup;
++              memset(header, 0, sb->s_blocksize);
++              end = (char *)header + sb->s_blocksize;
++              header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
++              header->h_blocks = header->h_refcount = cpu_to_le32(1);
++              last = here = ENTRY(header+1);
++      }
++
++      if (not_found) {
++              /* Insert the new name. */
++              int size = EXT3_XATTR_LEN(name_len);
++              int rest = (char *)last - (char *)here;
++              memmove((char *)here + size, here, rest);
++              memset(here, 0, size);
++              here->e_name_index = name_index;
++              here->e_name_len = name_len;
++              memcpy(here->e_name, name, name_len);
++      } else {
++              /* Remove the old value. */
++              if (!here->e_value_block && here->e_value_size) {
++                      char *first_val = (char *)header + min_offs;
++                      int offs = le16_to_cpu(here->e_value_offs);
++                      char *val = (char *)header + offs;
++                      size_t size = EXT3_XATTR_SIZE(
++                              le32_to_cpu(here->e_value_size));
++                      memmove(first_val + size, first_val, val - first_val);
++                      memset(first_val, 0, size);
++                      here->e_value_offs = 0;
++                      min_offs += size;
++
++                      /* Adjust all value offsets. */
++                      last = ENTRY(header+1);
++                      while (!IS_LAST_ENTRY(last)) {
++                              int o = le16_to_cpu(last->e_value_offs);
++                              if (!last->e_value_block && o < offs)
++                                      last->e_value_offs =
++                                              cpu_to_le16(o + size);
++                              last = EXT3_XATTR_NEXT(last);
++                      }
++              }
++              if (value == NULL) {
++                      /* Remove this attribute. */
++                      if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) {
++                              /* This block is now empty. */
++                              error = ext3_xattr_set2(handle, inode, bh,NULL);
++                              goto cleanup;
++                      } else {
++                              /* Remove the old name. */
++                              int size = EXT3_XATTR_LEN(name_len);
++                              last = ENTRY((char *)last - size);
++                              memmove(here, (char*)here + size,
++                                      (char*)last - (char*)here);
++                              memset(last, 0, size);
++                      }
++              }
++      }
++
++      if (value != NULL) {
++              /* Insert the new value. */
++              here->e_value_size = cpu_to_le32(value_len);
++              if (value_len) {
++                      size_t size = EXT3_XATTR_SIZE(value_len);
++                      char *val = (char *)header + min_offs - size;
++                      here->e_value_offs =
++                              cpu_to_le16((char *)val - (char *)header);
++                      memset(val + size - EXT3_XATTR_PAD, 0,
++                             EXT3_XATTR_PAD); /* Clear the pad bytes. */
++                      memcpy(val, value, value_len);
++              }
++      }
++      ext3_xattr_rehash(header, here);
++
++      error = ext3_xattr_set2(handle, inode, bh, header);
++
++cleanup:
++      brelse(bh);
++      if (!(bh && header == HDR(bh)))
++              kfree(header);
++      up(&ext3_xattr_sem);
++
++      return error;
++}
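
The XATTR_CREATE and XATTR_REPLACE semantics implemented by ext3_xattr_set() are visible from user space through setxattr(2). A minimal sketch, not part of the patch; the path "testfile" and the attribute "user.demo" are hypothetical, and the header location is the same assumption as above:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "testfile";
	const char *val = "hello";

	/* XATTR_CREATE: the attribute must not exist yet (else EEXIST). */
	if (setxattr(path, "user.demo", val, strlen(val), XATTR_CREATE))
		printf("XATTR_CREATE: %s\n", strerror(errno));

	/* XATTR_REPLACE: the attribute must already exist (else ENODATA/ENOATTR). */
	if (setxattr(path, "user.demo", val, strlen(val), XATTR_REPLACE))
		printf("XATTR_REPLACE: %s\n", strerror(errno));

	/* flags == 0: create or replace, whichever applies. */
	if (setxattr(path, "user.demo", val, strlen(val), 0))
		printf("setxattr: %s\n", strerror(errno));
	return 0;
}
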
++
++/*
++ * Second half of ext3_xattr_set(): Update the file system.
++ */
++static int
++ext3_xattr_set2(handle_t *handle, struct inode *inode,
++              struct buffer_head *old_bh, struct ext3_xattr_header *header)
++{
++      struct super_block *sb = inode->i_sb;
++      struct buffer_head *new_bh = NULL;
++      int error;
++
++      if (header) {
++              new_bh = ext3_xattr_cache_find(inode, header);
++              if (new_bh) {
++                      /*
++                       * We found an identical block in the cache.
++                       * The old block will be released after updating
++                       * the inode.
++                       */
++                      ea_bdebug(old_bh, "reusing block %ld",
++                              new_bh->b_blocknr);
++                      
++                      error = -EDQUOT;
++                      if (ext3_xattr_quota_alloc(inode, 1))
++                              goto cleanup;
++                      
++                      error = ext3_journal_get_write_access(handle, new_bh);
++                      if (error)
++                              goto cleanup;
++                      HDR(new_bh)->h_refcount = cpu_to_le32(
++                              le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
++                      ea_bdebug(new_bh, "refcount now=%d",
++                              le32_to_cpu(HDR(new_bh)->h_refcount));
++              } else if (old_bh && header == HDR(old_bh)) {
++                      /* Keep this block. */
++                      new_bh = old_bh;
++                      ext3_xattr_cache_insert(new_bh);
++              } else {
++                      /* We need to allocate a new block */
++                      int force = EXT3_I(inode)->i_file_acl != 0;
++                      int block = ext3_xattr_new_block(handle, inode,
++                                                       &error, force);
++                      if (error)
++                              goto cleanup;
++                      ea_idebug(inode, "creating block %d", block);
++
++                      new_bh = sb_getblk(sb, block);
++                      if (!new_bh) {
++getblk_failed:                        ext3_xattr_free_block(handle, inode, block);
++                              error = -EIO;
++                              goto cleanup;
++                      }
++                      lock_buffer(new_bh);
++                      error = ext3_journal_get_create_access(handle, new_bh);
++                      if (error) {
++                              unlock_buffer(new_bh);
++                              goto getblk_failed;
++                      }
++                      memcpy(new_bh->b_data, header, new_bh->b_size);
++                      mark_buffer_uptodate(new_bh, 1);
++                      unlock_buffer(new_bh);
++                      ext3_xattr_cache_insert(new_bh);
++                      
++                      ext3_xattr_update_super_block(handle, sb);
++              }
++              error = ext3_journal_dirty_metadata(handle, new_bh);
++              if (error)
++                      goto cleanup;
++      }
++
++      /* Update the inode. */
++      EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
++      inode->i_ctime = CURRENT_TIME;
++      ext3_mark_inode_dirty(handle, inode);
++      if (IS_SYNC(inode))
++              handle->h_sync = 1;
++
++      error = 0;
++      if (old_bh && old_bh != new_bh) {
++              /*
++               * If there was an old block, and we are not still using it,
++               * we now release the old block.
++              */
++              unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
++
++              error = ext3_journal_get_write_access(handle, old_bh);
++              if (error)
++                      goto cleanup;
++              if (refcount == 1) {
++                      /* Free the old block. */
++                      ea_bdebug(old_bh, "freeing");
++                      ext3_xattr_free_block(handle, inode, old_bh->b_blocknr);
++
++                      /* ext3_forget() calls bforget() for us, but we
++                         let our caller release old_bh, so we take an
++                         extra buffer reference (get_bh) first. */
++                      get_bh(old_bh);
++                      ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr);
++              } else {
++                      /* Decrement the refcount only. */
++                      refcount--;
++                      HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
++                      ext3_xattr_quota_free(inode);
++                      ext3_journal_dirty_metadata(handle, old_bh);
++                      ea_bdebug(old_bh, "refcount now=%d", refcount);
++              }
++      }
++
++cleanup:
++      if (old_bh != new_bh)
++              brelse(new_bh);
++
++      return error;
++}
++
++/*
++ * ext3_xattr_delete_inode()
++ *
++ * Free extended attribute resources associated with this inode. This
++ * is called immediately before an inode is freed.
++ */
++void
++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
++{
++      struct buffer_head *bh;
++      unsigned int block = EXT3_I(inode)->i_file_acl;
++
++      if (!block)
++              return;
++      down(&ext3_xattr_sem);
++
++      bh = sb_bread(inode->i_sb, block);
++      if (!bh) {
++              ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
++                      "inode %ld: block %d read error", inode->i_ino, block);
++              goto cleanup;
++      }
++      ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
++      if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++          HDR(bh)->h_blocks != cpu_to_le32(1)) {
++              ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
++                      "inode %ld: bad block %d", inode->i_ino, block);
++              goto cleanup;
++      }
++      ext3_journal_get_write_access(handle, bh);
++      ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
++      if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
++              ext3_xattr_cache_remove(bh);
++              ext3_xattr_free_block(handle, inode, block);
++              ext3_forget(handle, 1, inode, bh, block);
++              bh = NULL;
++      } else {
++              HDR(bh)->h_refcount = cpu_to_le32(
++                      le32_to_cpu(HDR(bh)->h_refcount) - 1);
++              ext3_journal_dirty_metadata(handle, bh);
++              if (IS_SYNC(inode))
++                      handle->h_sync = 1;
++              ext3_xattr_quota_free(inode);
++      }
++      EXT3_I(inode)->i_file_acl = 0;
++
++cleanup:
++      brelse(bh);
++      up(&ext3_xattr_sem);
++}
++
++/*
++ * ext3_xattr_put_super()
++ *
++ * This is called when a file system is unmounted.
++ */
++void
++ext3_xattr_put_super(struct super_block *sb)
++{
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++      mb_cache_shrink(ext3_xattr_cache, sb->s_dev);
++#endif
++}
++
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++
++/*
++ * ext3_xattr_cache_insert()
++ *
++ * Create a new entry in the extended attribute cache, and insert
++ * it unless such an entry is already in the cache.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++static int
++ext3_xattr_cache_insert(struct buffer_head *bh)
++{
++      __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
++      struct mb_cache_entry *ce;
++      int error;
++
++      ce = mb_cache_entry_alloc(ext3_xattr_cache);
++      if (!ce)
++              return -ENOMEM;
++      error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash);
++      if (error) {
++              mb_cache_entry_free(ce);
++              if (error == -EBUSY) {
++                      ea_bdebug(bh, "already in cache (%d cache entries)",
++                              atomic_read(&ext3_xattr_cache->c_entry_count));
++                      error = 0;
++              }
++      } else {
++              ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
++                        atomic_read(&ext3_xattr_cache->c_entry_count));
++              mb_cache_entry_release(ce);
++      }
++      return error;
++}
++
++/*
++ * ext3_xattr_cmp()
++ *
++ * Compare two extended attribute blocks for equality.
++ *
++ * Returns 0 if the blocks are equal, 1 if they differ, and
++ * a negative error number on errors.
++ */
++static int
++ext3_xattr_cmp(struct ext3_xattr_header *header1,
++             struct ext3_xattr_header *header2)
++{
++      struct ext3_xattr_entry *entry1, *entry2;
++
++      entry1 = ENTRY(header1+1);
++      entry2 = ENTRY(header2+1);
++      while (!IS_LAST_ENTRY(entry1)) {
++              if (IS_LAST_ENTRY(entry2))
++                      return 1;
++              if (entry1->e_hash != entry2->e_hash ||
++                  entry1->e_name_len != entry2->e_name_len ||
++                  entry1->e_value_size != entry2->e_value_size ||
++                  memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
++                      return 1;
++              if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
++                      return -EIO;
++              if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
++                         (char *)header2 + le16_to_cpu(entry2->e_value_offs),
++                         le32_to_cpu(entry1->e_value_size)))
++                      return 1;
++
++              entry1 = EXT3_XATTR_NEXT(entry1);
++              entry2 = EXT3_XATTR_NEXT(entry2);
++      }
++      if (!IS_LAST_ENTRY(entry2))
++              return 1;
++      return 0;
++}
++
++/*
++ * ext3_xattr_cache_find()
++ *
++ * Find an identical extended attribute block.
++ *
++ * Returns a pointer to the block found, or NULL if such a block was
++ * not found or an error occurred.
++ */
++static struct buffer_head *
++ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header)
++{
++      __u32 hash = le32_to_cpu(header->h_hash);
++      struct mb_cache_entry *ce;
++
++      if (!header->h_hash)
++              return NULL;  /* never share */
++      ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
++      ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_dev, hash);
++      while (ce) {
++              struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
++
++              if (!bh) {
++                      ext3_error(inode->i_sb, "ext3_xattr_cache_find",
++                              "inode %ld: block %ld read error",
++                              inode->i_ino, ce->e_block);
++              } else if (le32_to_cpu(HDR(bh)->h_refcount) >
++                         EXT3_XATTR_REFCOUNT_MAX) {
++                      ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block,
++                              le32_to_cpu(HDR(bh)->h_refcount),
++                              EXT3_XATTR_REFCOUNT_MAX);
++              } else if (!ext3_xattr_cmp(header, HDR(bh))) {
++                      ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
++                      mb_cache_entry_release(ce);
++                      return bh;
++              }
++              brelse(bh);
++              ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash);
++      }
++      return NULL;
++}
++
++/*
++ * ext3_xattr_cache_remove()
++ *
++ * Remove the cache entry of a block from the cache. Called when a
++ * block becomes invalid.
++ */
++static void
++ext3_xattr_cache_remove(struct buffer_head *bh)
++{
++      struct mb_cache_entry *ce;
++
++      ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_dev, bh->b_blocknr);
++      if (ce) {
++              ea_bdebug(bh, "removing (%d cache entries remaining)",
++                        atomic_read(&ext3_xattr_cache->c_entry_count)-1);
++              mb_cache_entry_free(ce);
++      } else 
++              ea_bdebug(bh, "no cache entry");
++}
++
++#define NAME_HASH_SHIFT 5
++#define VALUE_HASH_SHIFT 16
++
++/*
++ * ext3_xattr_hash_entry()
++ *
++ * Compute the hash of an extended attribute.
++ */
++static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header,
++                                       struct ext3_xattr_entry *entry)
++{
++      __u32 hash = 0;
++      char *name = entry->e_name;
++      int n;
++
++      for (n=0; n < entry->e_name_len; n++) {
++              hash = (hash << NAME_HASH_SHIFT) ^
++                     (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
++                     *name++;
++      }
++
++      if (entry->e_value_block == 0 && entry->e_value_size != 0) {
++              __u32 *value = (__u32 *)((char *)header +
++                      le16_to_cpu(entry->e_value_offs));
++              for (n = (le32_to_cpu(entry->e_value_size) +
++                   EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) {
++                      hash = (hash << VALUE_HASH_SHIFT) ^
++                             (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
++                             le32_to_cpu(*value++);
++              }
++      }
++      entry->e_hash = cpu_to_le32(hash);
++}
++
++#undef NAME_HASH_SHIFT
++#undef VALUE_HASH_SHIFT
++
++#define BLOCK_HASH_SHIFT 16
++
++/*
++ * ext3_xattr_rehash()
++ *
++ * Re-compute the extended attribute hash value after an entry has changed.
++ */
++static void ext3_xattr_rehash(struct ext3_xattr_header *header,
++                            struct ext3_xattr_entry *entry)
++{
++      struct ext3_xattr_entry *here;
++      __u32 hash = 0;
++      
++      ext3_xattr_hash_entry(header, entry);
++      here = ENTRY(header+1);
++      while (!IS_LAST_ENTRY(here)) {
++              if (!here->e_hash) {
++                      /* Block is not shared if an entry's hash value == 0 */
++                      hash = 0;
++                      break;
++              }
++              hash = (hash << BLOCK_HASH_SHIFT) ^
++                     (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
++                     le32_to_cpu(here->e_hash);
++              here = EXT3_XATTR_NEXT(here);
++      }
++      header->h_hash = cpu_to_le32(hash);
++}
++
++#undef BLOCK_HASH_SHIFT
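
Both hashes above are rotate-and-XOR folds: ext3_xattr_hash_entry() rotates by 5 bits over the name bytes and by 16 bits over the padded value words, and ext3_xattr_rehash() folds the per-entry hashes into the block hash with the same 16-bit rotation. A standalone restatement of just the name portion, not part of the patch, so the rotation is easy to see:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define NAME_HASH_SHIFT 5

/* Rotate the running hash left by NAME_HASH_SHIFT bits and XOR in the
 * next name byte -- the same expression used in ext3_xattr_hash_entry(). */
static uint32_t xattr_name_hash(const char *name, int name_len)
{
	uint32_t hash = 0;
	int n;

	for (n = 0; n < name_len; n++) {
		hash = (hash << NAME_HASH_SHIFT) ^
		       (hash >> (8 * sizeof(hash) - NAME_HASH_SHIFT)) ^
		       *name++;
	}
	return hash;
}

int main(void)
{
	const char *name = "data";

	printf("hash(\"%s\") = 0x%08x\n", name,
	       (unsigned)xattr_name_hash(name, (int)strlen(name)));
	return 0;
}
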
++
++int __init
++init_ext3_xattr(void)
++{
++      ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
++              sizeof(struct mb_cache_entry) +
++              sizeof(struct mb_cache_entry_index), 1, 61);
++      if (!ext3_xattr_cache)
++              return -ENOMEM;
++
++      return 0;
++}
++
++void
++exit_ext3_xattr(void)
++{
++      if (ext3_xattr_cache)
++              mb_cache_destroy(ext3_xattr_cache);
++      ext3_xattr_cache = NULL;
++}
++
++#else  /* CONFIG_EXT3_FS_XATTR_SHARING */
++
++int __init
++init_ext3_xattr(void)
++{
++      return 0;
++}
++
++void
++exit_ext3_xattr(void)
++{
++}
++
++#endif  /* CONFIG_EXT3_FS_XATTR_SHARING */
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-2.4.20-root/fs/ext3/xattr_user.c     2003-05-07 18:08:03.000000000 +0800
+@@ -0,0 +1,111 @@
++/*
++ * linux/fs/ext3/xattr_user.c
++ * Handler for extended user attributes.
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
++
++#ifdef CONFIG_EXT3_FS_POSIX_ACL
++# include <linux/ext3_acl.h>
++#endif
++
++#define XATTR_USER_PREFIX "user."
++
++static size_t
++ext3_xattr_user_list(char *list, struct inode *inode,
++                   const char *name, int name_len)
++{
++      const int prefix_len = sizeof(XATTR_USER_PREFIX)-1;
++
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return 0;
++
++      if (list) {
++              memcpy(list, XATTR_USER_PREFIX, prefix_len);
++              memcpy(list+prefix_len, name, name_len);
++              list[prefix_len + name_len] = '\0';
++      }
++      return prefix_len + name_len + 1;
++}
++
++static int
++ext3_xattr_user_get(struct inode *inode, const char *name,
++                  void *buffer, size_t size)
++{
++      int error;
++
++      if (strcmp(name, "") == 0)
++              return -EINVAL;
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return -ENOTSUP;
++#ifdef CONFIG_EXT3_FS_POSIX_ACL
++      error = ext3_permission_locked(inode, MAY_READ);
++#else
++      error = permission(inode, MAY_READ);
++#endif
++      if (error)
++              return error;
++
++      return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name,
++                            buffer, size);
++}
++
++static int
++ext3_xattr_user_set(struct inode *inode, const char *name,
++                  const void *value, size_t size, int flags)
++{
++      handle_t *handle;
++      int error;
++
++      if (strcmp(name, "") == 0)
++              return -EINVAL;
++      if (!test_opt(inode->i_sb, XATTR_USER))
++              return -ENOTSUP;
++      if ( !S_ISREG(inode->i_mode) &&
++          (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
++              return -EPERM;
++#ifdef CONFIG_EXT3_FS_POSIX_ACL
++      error = ext3_permission_locked(inode, MAY_WRITE);
++#else
++      error = permission(inode, MAY_WRITE);
++#endif
++      if (error)
++              return error;
++  
++      handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS);
++      if (IS_ERR(handle))
++              return PTR_ERR(handle);
++      error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER, name,
++                             value, size, flags);
++      ext3_journal_stop(handle, inode);
++
++      return error;
++}
++
++struct ext3_xattr_handler ext3_xattr_user_handler = {
++      prefix: XATTR_USER_PREFIX,
++      list:   ext3_xattr_user_list,
++      get:    ext3_xattr_user_get,
++      set:    ext3_xattr_user_set,
++};
++
++int __init
++init_ext3_xattr_user(void)
++{
++      return ext3_xattr_register(EXT3_XATTR_INDEX_USER,
++                                 &ext3_xattr_user_handler);
++}
++
++void
++exit_ext3_xattr_user(void)
++{
++      ext3_xattr_unregister(EXT3_XATTR_INDEX_USER,
++                            &ext3_xattr_user_handler);
++}
+--- linux-2.4.20/fs/jfs/jfs_xattr.h~linux-2.4.20-xattr-0.8.54  2002-11-29 07:53:15.000000000 +0800
++++ linux-2.4.20-root/fs/jfs/jfs_xattr.h       2003-05-07 18:08:03.000000000 +0800
+@@ -52,8 +52,10 @@ struct jfs_ea_list {
+ #define       END_EALIST(ealist) \
+       ((struct jfs_ea *) (((char *) (ealist)) + EALIST_SIZE(ealist)))
+-extern int __jfs_setxattr(struct inode *, const char *, void *, size_t, int);
+-extern int jfs_setxattr(struct dentry *, const char *, void *, size_t, int);
++extern int __jfs_setxattr(struct inode *, const char *, const void *, size_t,
++                        int);
++extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t,
++                      int);
+ extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t);
+ extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t);
+ extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
+--- linux-2.4.20/fs/jfs/xattr.c~linux-2.4.20-xattr-0.8.54      2002-11-29 07:53:15.000000000 +0800
++++ linux-2.4.20-root/fs/jfs/xattr.c   2003-05-07 18:08:03.000000000 +0800
+@@ -641,7 +641,7 @@ static int ea_put(struct inode *inode, s
+ }
+ static int can_set_xattr(struct inode *inode, const char *name,
+-                       void *value, size_t value_len)
++                       const void *value, size_t value_len)
+ {
+       if (IS_RDONLY(inode))
+               return -EROFS;
+@@ -660,7 +660,7 @@ static int can_set_xattr(struct inode *i
+       return permission(inode, MAY_WRITE);
+ }
+-int __jfs_setxattr(struct inode *inode, const char *name, void *value,
++int __jfs_setxattr(struct inode *inode, const char *name, const void *value,
+                  size_t value_len, int flags)
+ {
+       struct jfs_ea_list *ealist;
+@@ -799,7 +799,7 @@ int __jfs_setxattr(struct inode *inode, 
+       return rc;
+ }
+-int jfs_setxattr(struct dentry *dentry, const char *name, void *value,
++int jfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+                size_t value_len, int flags)
+ {
+       if (value == NULL) {    /* empty EA, do not remove */
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-2.4.20-root/fs/mbcache.c     2003-05-07 18:08:03.000000000 +0800
+@@ -0,0 +1,648 @@
++/*
++ * linux/fs/mbcache.c
++ * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++/*
++ * Filesystem Meta Information Block Cache (mbcache)
++ *
++ * The mbcache caches blocks of block devices that need to be located
++ * by their device/block number, as well as by other criteria (such
++ * as the block's contents).
++ *
++ * There can only be one cache entry in a cache per device and block number.
++ * Additional indexes need not be unique in this sense. The number of
++ * additional indexes (=other criteria) can be hardwired at compile time
++ * or specified at cache create time.
++ *
++ * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
++ * in the cache. A valid entry is in the main hash tables of the cache,
++ * and may also be in the lru list. An invalid entry is not in any hashes
++ * or lists.
++ *
++ * A valid cache entry is only in the lru list if no handles refer to it.
++ * Invalid cache entries will be freed when the last handle to the cache
++ * entry is released. Entries that cannot be freed immediately are put
++ * back on the lru list.
++ */
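
The entry life cycle described above is easiest to follow as a call sequence. The sketch below is not part of the patch and is kernel-context only (it will not build on its own); it strings together exactly the calls that the ext3_xattr_cache_* helpers earlier in this diff make:

/* Kernel-context sketch only: assumes the <linux/mbcache.h> declarations. */
static void mbcache_life_cycle_sketch(struct mb_cache *cache, kdev_t dev,
				      unsigned long block, unsigned int key)
{
	struct mb_cache_entry *ce;

	ce = mb_cache_entry_alloc(cache);		/* new entry: invalid, one handle held */
	if (!ce)
		return;
	if (mb_cache_entry_insert(ce, dev, block, &key))
		mb_cache_entry_free(ce);		/* already cached (-EBUSY): drop it */
	else
		mb_cache_entry_release(ce);		/* valid and unused: moves to the lru list */

	ce = mb_cache_entry_find_first(cache, 0, dev, key);
	if (ce) {					/* found by index key; handle held again */
		/* ... inspect the block at ce->e_block ... */
		mb_cache_entry_release(ce);
	}

	ce = mb_cache_entry_get(cache, dev, block);	/* lookup by device/block number */
	if (ce)
		mb_cache_entry_free(ce);		/* unhash + release: entry is gone */
}
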
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++#include <linux/fs.h>
++#include <linux/slab.h>
++#include <linux/sched.h>
++#include <linux/cache_def.h>
++#include <linux/version.h>
++#include <linux/init.h>
++#include <linux/mbcache.h>
++
++
++#ifdef MB_CACHE_DEBUG
++# define mb_debug(f...) do { \
++              printk(KERN_DEBUG f); \
++              printk("\n"); \
++      } while (0)
++#define mb_assert(c) do { if (!(c)) \
++              printk(KERN_ERR "assertion " #c " failed\n"); \
++      } while(0)
++#else
++# define mb_debug(f...) do { } while(0)
++# define mb_assert(c) do { } while(0)
++#endif
++#define mb_error(f...) do { \
++              printk(KERN_ERR f); \
++              printk("\n"); \
++      } while(0)
++              
++MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
++MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
++MODULE_LICENSE("GPL");
++#endif
++
++EXPORT_SYMBOL(mb_cache_create);
++EXPORT_SYMBOL(mb_cache_shrink);
++EXPORT_SYMBOL(mb_cache_destroy);
++EXPORT_SYMBOL(mb_cache_entry_alloc);
++EXPORT_SYMBOL(mb_cache_entry_insert);
++EXPORT_SYMBOL(mb_cache_entry_release);
++EXPORT_SYMBOL(mb_cache_entry_takeout);
++EXPORT_SYMBOL(mb_cache_entry_free);
++EXPORT_SYMBOL(mb_cache_entry_dup);
++EXPORT_SYMBOL(mb_cache_entry_get);
++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
++EXPORT_SYMBOL(mb_cache_entry_find_first);
++EXPORT_SYMBOL(mb_cache_entry_find_next);
++#endif
++
++
++/*
++ * Global data: list of all mbcache's, lru list, and a spinlock for
++ * accessing cache data structures on SMP machines. The lru list is
++ * global across all mbcaches.
++ */
++
++static LIST_HEAD(mb_cache_list);
++static LIST_HEAD(mb_cache_lru_list);
++static spinlock_t mb_cache_spinlock = SPIN_LOCK_UNLOCKED;
++
++static inline int
++mb_cache_indexes(struct mb_cache *cache)
++{
++#ifdef MB_CACHE_INDEXES_COUNT
++      return MB_CACHE_INDEXES_COUNT;
++#else
++      return cache->c_indexes_count;
++#endif
++}
++
++/*
++ * What the mbcache registers with the kernel memory management so that
++ * it can be shrunk dynamically under memory pressure.
++ */
++
++static void
++mb_cache_memory_pressure(int priority, unsigned int gfp_mask);
++
++static struct cache_definition mb_cache_definition = {
++      "mb_cache",
++      mb_cache_memory_pressure
++};
++
++
++static inline int
++__mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
++{
++      return !list_empty(&ce->e_block_list);
++}
++
++
++static inline void
++__mb_cache_entry_unhash(struct mb_cache_entry *ce)
++{
++      int n;
++
++      if (__mb_cache_entry_is_hashed(ce)) {
++              list_del_init(&ce->e_block_list);
++              for (n=0; n<mb_cache_indexes(ce->e_cache); n++)
++                      list_del(&ce->e_indexes[n].o_list);
++      }
++}
++
++
++static inline void
++__mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask)
++{
++      struct mb_cache *cache = ce->e_cache;
++
++      mb_assert(atomic_read(&ce->e_used) == 0);
++      if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) {
++              /* free failed -- put back on the lru list
++                 for freeing later. */
++              spin_lock(&mb_cache_spinlock);
++              list_add(&ce->e_lru_list, &mb_cache_lru_list);
++              spin_unlock(&mb_cache_spinlock);
++      } else {
++              kmem_cache_free(cache->c_entry_cache, ce);
++              atomic_dec(&cache->c_entry_count);
++      }
++}
++
++
++static inline void
++__mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
++{
++      if (atomic_dec_and_test(&ce->e_used)) {
++              if (__mb_cache_entry_is_hashed(ce))
++                      list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
++              else {
++                      spin_unlock(&mb_cache_spinlock);
++                      __mb_cache_entry_forget(ce, GFP_KERNEL);
++                      return;
++              }
++      }
++      spin_unlock(&mb_cache_spinlock);
++}
++
++
++/*
++ * mb_cache_memory_pressure()  memory pressure callback
++ *
++ * This function is called by the kernel memory management when memory
++ * gets low.
++ *
++ * @priority: Amount by which to shrink the cache (0 = highest priority)
++ * @gfp_mask: (ignored)
++ */
++static void
++mb_cache_memory_pressure(int priority, unsigned int gfp_mask)
++{
++      LIST_HEAD(free_list);
++      struct list_head *l, *ltmp;
++      int count = 0;
++
++      spin_lock(&mb_cache_spinlock);
++      list_for_each(l, &mb_cache_list) {
++              struct mb_cache *cache =
++                      list_entry(l, struct mb_cache, c_cache_list);
++              mb_debug("cache %s (%d)", cache->c_name,
++                        atomic_read(&cache->c_entry_count));
++              count += atomic_read(&cache->c_entry_count);
++      }
++      mb_debug("trying to free %d of %d entries",
++                count / (priority ? priority : 1), count);
++      if (priority)
++              count /= priority;
++      while (count-- && !list_empty(&mb_cache_lru_list)) {
++              struct mb_cache_entry *ce =
++                      list_entry(mb_cache_lru_list.next,
++                                 struct mb_cache_entry, e_lru_list);
++              list_del(&ce->e_lru_list);
++              __mb_cache_entry_unhash(ce);
++              list_add_tail(&ce->e_lru_list, &free_list);
++      }
++      spin_unlock(&mb_cache_spinlock);
++      list_for_each_safe(l, ltmp, &free_list) {
++              __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++                                                 e_lru_list), gfp_mask);
++      }
++}
++
++
++/*
++ * mb_cache_create()  create a new cache
++ *
++ * All entries in one cache are of equal size. Cache entries may be from
++ * multiple devices. If this is the first mbcache created, registers
++ * the cache with kernel memory management. Returns NULL if no more
++ * memory was available.
++ *
++ * @name: name of the cache (informal)
++ * @cache_op: contains the callback called when freeing a cache entry
++ * @entry_size: The size of a cache entry, including
++ *              struct mb_cache_entry
++ * @indexes_count: number of additional indexes in the cache. Must equal
++ *                 MB_CACHE_INDEXES_COUNT if the number of indexes is
++ *                 hardwired.
++ * @bucket_count: number of hash buckets
++ */
++struct mb_cache *
++mb_cache_create(const char *name, struct mb_cache_op *cache_op,
++              size_t entry_size, int indexes_count, int bucket_count)
++{
++      int m=0, n;
++      struct mb_cache *cache = NULL;
++
++      if(entry_size < sizeof(struct mb_cache_entry) +
++         indexes_count * sizeof(struct mb_cache_entry_index))
++              return NULL;
++
++      MOD_INC_USE_COUNT;
++      cache = kmalloc(sizeof(struct mb_cache) +
++                      indexes_count * sizeof(struct list_head), GFP_KERNEL);
++      if (!cache)
++              goto fail;
++      cache->c_name = name;
++      cache->c_op.free = NULL;
++      if (cache_op)
++              cache->c_op.free = cache_op->free;
++      atomic_set(&cache->c_entry_count, 0);
++      cache->c_bucket_count = bucket_count;
++#ifdef MB_CACHE_INDEXES_COUNT
++      mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT);
++#else
++      cache->c_indexes_count = indexes_count;
++#endif
++      cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
++                                    GFP_KERNEL);
++      if (!cache->c_block_hash)
++              goto fail;
++      for (n=0; n<bucket_count; n++)
++              INIT_LIST_HEAD(&cache->c_block_hash[n]);
++      for (m=0; m<indexes_count; m++) {
++              cache->c_indexes_hash[m] = kmalloc(bucket_count *
++                                               sizeof(struct list_head),
++                                               GFP_KERNEL);
++              if (!cache->c_indexes_hash[m])
++                      goto fail;
++              for (n=0; n<bucket_count; n++)
++                      INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]);
++      }
++      cache->c_entry_cache = kmem_cache_create(name, entry_size, 0,
++              0 /*SLAB_POISON | SLAB_RED_ZONE*/, NULL, NULL);
++      if (!cache->c_entry_cache)
++              goto fail;
++
++      spin_lock(&mb_cache_spinlock);
++      list_add(&cache->c_cache_list, &mb_cache_list);
++      spin_unlock(&mb_cache_spinlock);
++      return cache;
++
++fail:
++      if (cache) {
++              while (--m >= 0)
++                      kfree(cache->c_indexes_hash[m]);
++              if (cache->c_block_hash)
++                      kfree(cache->c_block_hash);
++              kfree(cache);
++      }
++      MOD_DEC_USE_COUNT;
++      return NULL;
++}
++
++
++/*
++ * mb_cache_shrink()
++ *
++ * Removes all cache entries of a device from the cache. All cache entries
++ * currently in use cannot be freed, and thus remain in the cache.
++ *
++ * @cache: which cache to shrink
++ * @dev: which device's cache entries to shrink
++ */
++void
++mb_cache_shrink(struct mb_cache *cache, kdev_t dev)
++{
++      LIST_HEAD(free_list);
++      struct list_head *l, *ltmp;
++
++      spin_lock(&mb_cache_spinlock);
++      list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
++              struct mb_cache_entry *ce =
++                      list_entry(l, struct mb_cache_entry, e_lru_list);
++              if (ce->e_dev == dev) {
++                      list_del(&ce->e_lru_list);
++                      list_add_tail(&ce->e_lru_list, &free_list);
++                      __mb_cache_entry_unhash(ce);
++              }
++      }
++      spin_unlock(&mb_cache_spinlock);
++      list_for_each_safe(l, ltmp, &free_list) {
++              __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++                                                 e_lru_list), GFP_KERNEL);
++      }
++}
++
++
++/*
++ * mb_cache_destroy()
++ *
++ * Shrinks the cache to its minimum possible size (hopefully 0 entries),
++ * and then destroys it. If this was the last mbcache, un-registers the
++ * mbcache from kernel memory management.
++ */
++void
++mb_cache_destroy(struct mb_cache *cache)
++{
++      LIST_HEAD(free_list);
++      struct list_head *l, *ltmp;
++      int n;
++
++      spin_lock(&mb_cache_spinlock);
++      list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
++              struct mb_cache_entry *ce =
++                      list_entry(l, struct mb_cache_entry, e_lru_list);
++              if (ce->e_cache == cache) {
++                      list_del(&ce->e_lru_list);
++                      list_add_tail(&ce->e_lru_list, &free_list);
++                      __mb_cache_entry_unhash(ce);
++              }
++      }
++      list_del(&cache->c_cache_list);
++      spin_unlock(&mb_cache_spinlock);
++      list_for_each_safe(l, ltmp, &free_list) {
++              __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++                                                 e_lru_list), GFP_KERNEL);
++      }
++
++      if (atomic_read(&cache->c_entry_count) > 0) {
++              mb_error("cache %s: %d orphaned entries",
++                        cache->c_name,
++                        atomic_read(&cache->c_entry_count));
++      }
++
++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0))
++      /* We don't have kmem_cache_destroy() in 2.2.x */
++      kmem_cache_shrink(cache->c_entry_cache);
++#else
++      kmem_cache_destroy(cache->c_entry_cache);
++#endif
++      for (n=0; n < mb_cache_indexes(cache); n++)
++              kfree(cache->c_indexes_hash[n]);
++      kfree(cache->c_block_hash);
++      kfree(cache);
++
++      MOD_DEC_USE_COUNT;
++}
++
++
++/*
++ * mb_cache_entry_alloc()
++ *
++ * Allocates a new cache entry. The new entry will not be valid initially,
++ * and thus cannot be looked up yet. It should be filled with data, and
++ * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
++ * if no more memory was available.
++ */
++struct mb_cache_entry *
++mb_cache_entry_alloc(struct mb_cache *cache)
++{
++      struct mb_cache_entry *ce;
++
++      atomic_inc(&cache->c_entry_count);
++      ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL);
++      if (ce) {
++              INIT_LIST_HEAD(&ce->e_lru_list);
++              INIT_LIST_HEAD(&ce->e_block_list);
++              ce->e_cache = cache;
++              atomic_set(&ce->e_used, 1);
++      }
++      return ce;
++}
++
++
++/*
++ * mb_cache_entry_insert()
++ *
++ * Inserts an entry that was allocated using mb_cache_entry_alloc() into
++ * the cache. After this, the cache entry can be looked up, but is not yet
++ * in the lru list as the caller still holds a handle to it. Returns 0 on
++ * success, or -EBUSY if a cache entry for that device + inode exists
++ * already (this may happen after a failed lookup, if another process has
++ * inserted the same cache entry in the meantime).
++ *
++ * @dev: device the cache entry belongs to
++ * @block: block number
++ * @keys: array of additional keys. There must be indexes_count entries
++ *        in the array (as specified when creating the cache).
++ */
++int
++mb_cache_entry_insert(struct mb_cache_entry *ce, kdev_t dev,
++                    unsigned long block, unsigned int keys[])
++{
++      struct mb_cache *cache = ce->e_cache;
++      unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count;
++      struct list_head *l;
++      int error = -EBUSY, n;
++
++      spin_lock(&mb_cache_spinlock);
++      list_for_each(l, &cache->c_block_hash[bucket]) {
++              struct mb_cache_entry *ce =
++                      list_entry(l, struct mb_cache_entry, e_block_list);
++              if (ce->e_dev == dev && ce->e_block == block)
++                      goto out;
++      }
++      __mb_cache_entry_unhash(ce);
++      ce->e_dev = dev;
++      ce->e_block = block;
++      list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
++      for (n=0; n<mb_cache_indexes(cache); n++) {
++              ce->e_indexes[n].o_key = keys[n];
++              bucket = keys[n] % cache->c_bucket_count;
++              list_add(&ce->e_indexes[n].o_list,
++                       &cache->c_indexes_hash[n][bucket]);
++      }
++out:
++      spin_unlock(&mb_cache_spinlock);
++      return error;
++}
++
++
++/*
++ * mb_cache_entry_release()
++ *
++ * Release a handle to a cache entry. When the last handle to a cache entry
++ * is released, it is either freed (if it is invalid) or otherwise inserted
++ * into the lru list.
++ */
++void
++mb_cache_entry_release(struct mb_cache_entry *ce)
++{
++      spin_lock(&mb_cache_spinlock);
++      __mb_cache_entry_release_unlock(ce);
++}
++
++
++/*
++ * mb_cache_entry_takeout()
++ *
++ * Take a cache entry out of the cache, making it invalid. The entry can later
++ * be re-inserted using mb_cache_entry_insert(), or released using
++ * mb_cache_entry_release().
++ */
++void
++mb_cache_entry_takeout(struct mb_cache_entry *ce)
++{
++      spin_lock(&mb_cache_spinlock);
++      mb_assert(list_empty(&ce->e_lru_list));
++      __mb_cache_entry_unhash(ce);
++      spin_unlock(&mb_cache_spinlock);
++}
++
++
++/*
++ * mb_cache_entry_free()
++ *
++ * This is equivalent to the sequence mb_cache_entry_takeout() --
++ * mb_cache_entry_release().
++ */
++void
++mb_cache_entry_free(struct mb_cache_entry *ce)
++{
++      spin_lock(&mb_cache_spinlock);
++      mb_assert(list_empty(&ce->e_lru_list));
++      __mb_cache_entry_unhash(ce);
++      __mb_cache_entry_release_unlock(ce);
++}
++
++
++/*
++ * mb_cache_entry_dup()
++ *
++ * Duplicate a handle to a cache entry (does not duplicate the cache entry
++ * itself). After the call, both the old and the new handle must be released.
++ */
++struct mb_cache_entry *
++mb_cache_entry_dup(struct mb_cache_entry *ce)
++{
++      atomic_inc(&ce->e_used);
++      return ce;
++}
++
++
++/*
++ * mb_cache_entry_get()
++ *
++ * Get a cache entry by device / block number. (There can only be one entry
++ * in the cache per device and block.) Returns NULL if no such cache entry
++ * exists.
++ */
++struct mb_cache_entry *
++mb_cache_entry_get(struct mb_cache *cache, kdev_t dev, unsigned long block)
++{
++      unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count;
++      struct list_head *l;
++      struct mb_cache_entry *ce;
++
++      spin_lock(&mb_cache_spinlock);
++      list_for_each(l, &cache->c_block_hash[bucket]) {
++              ce = list_entry(l, struct mb_cache_entry, e_block_list);
++              if (ce->e_dev == dev && ce->e_block == block) {
++                      if (!list_empty(&ce->e_lru_list))
++                              list_del_init(&ce->e_lru_list);
++                      atomic_inc(&ce->e_used);
++                      goto cleanup;
++              }
++      }
++      ce = NULL;
++
++cleanup:
++      spin_unlock(&mb_cache_spinlock);
++      return ce;
++}
++
++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
++
++static struct mb_cache_entry *
++__mb_cache_entry_find(struct list_head *l, struct list_head *head,
++                    int index, kdev_t dev, unsigned int key)
++{
++      while (l != head) {
++              struct mb_cache_entry *ce =
++                      list_entry(l, struct mb_cache_entry,
++                                 e_indexes[index].o_list);
++              if (ce->e_dev == dev && ce->e_indexes[index].o_key == key) {
++                      if (!list_empty(&ce->e_lru_list))
++                              list_del_init(&ce->e_lru_list);
++                      atomic_inc(&ce->e_used);
++                      return ce;
++              }
++              l = l->next;
++      }
++      return NULL;
++}
++
++
++/*
++ * mb_cache_entry_find_first()
++ *
++ * Find the first cache entry on a given device with a certain key in
++ * an additional index. Additional matches can be found with
++ * mb_cache_entry_find_next(). Returns NULL if no match was found.
++ *
++ * @cache: the cache to search
++ * @index: the number of the additional index to search (0<=index<indexes_count)
++ * @dev: the device the cache entry should belong to
++ * @key: the key in the index
++ */
++struct mb_cache_entry *
++mb_cache_entry_find_first(struct mb_cache *cache, int index, kdev_t dev,
++                        unsigned int key)
++{
++      unsigned int bucket = key % cache->c_bucket_count;
++      struct list_head *l;
++      struct mb_cache_entry *ce;
++
++      mb_assert(index < mb_cache_indexes(cache));
++      spin_lock(&mb_cache_spinlock);
++      l = cache->c_indexes_hash[index][bucket].next;
++      ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
++                                 index, dev, key);
++      spin_unlock(&mb_cache_spinlock);
++      return ce;
++}
++
++
++/*
++ * mb_cache_entry_find_next()
++ *
++ * Find the next cache entry on a given device with a certain key in an
++ * additional index. Returns NULL if no match could be found. The previous
++ * entry is automatically released, so that mb_cache_entry_find_next() can
++ * be called like this:
++ *
++ * entry = mb_cache_entry_find_first();
++ * while (entry) {
++ *    ...
++ *    entry = mb_cache_entry_find_next(entry, ...);
++ * }
++ *
++ * @prev: The previous match
++ * @index: the number of the additional index to search (0<=index<indexes_count)
++ * @dev: the device the cache entry should belong to
++ * @key: the key in the index
++ */
++struct mb_cache_entry *
++mb_cache_entry_find_next(struct mb_cache_entry *prev, int index, kdev_t dev,
++                       unsigned int key)
++{
++      struct mb_cache *cache = prev->e_cache;
++      unsigned int bucket = key % cache->c_bucket_count;
++      struct list_head *l;
++      struct mb_cache_entry *ce;
++
++      mb_assert(index < mb_cache_indexes(cache));
++      spin_lock(&mb_cache_spinlock);
++      l = prev->e_indexes[index].o_list.next;
++      ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
++                                 index, dev, key);
++      __mb_cache_entry_release_unlock(prev);
++      return ce;
++}
++
++#endif  /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */
++
++static int __init init_mbcache(void)
++{
++      register_cache(&mb_cache_definition);
++      return 0;
++}
++
++static void __exit exit_mbcache(void)
++{
++      unregister_cache(&mb_cache_definition);
++}
++
++module_init(init_mbcache)
++module_exit(exit_mbcache)
++
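The functions above make up the consumer-facing mbcache API (the matching prototypes appear in the new include/linux/mbcache.h further down in this patch). As a rough illustration of how a filesystem would drive it -- a sketch only, not part of the patch: the demo_* names are invented, and passing NULL for the mb_cache_op argument and the meaning of the size/count parameters to mb_cache_create() are assumptions based on the header declarations.

#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/mbcache.h>

static struct mb_cache *demo_cache;

static int demo_cache_init(void)
{
	/* one additional index, 64 hash buckets; NULL ops assumed acceptable */
	demo_cache = mb_cache_create("demo_cache", NULL,
				     sizeof(struct mb_cache_entry) +
				     sizeof(struct mb_cache_entry_index),
				     1, 64);
	return demo_cache ? 0 : -ENOMEM;
}

static int demo_cache_add(kdev_t dev, unsigned long block, unsigned int hash)
{
	struct mb_cache_entry *ce;
	unsigned int keys[1] = { hash };
	int error;

	ce = mb_cache_entry_alloc(demo_cache);
	if (!ce)
		return -ENOMEM;
	error = mb_cache_entry_insert(ce, dev, block, keys);
	/* drop our handle; the entry stays cached only if the insert hashed it */
	mb_cache_entry_release(ce);
	return error;
}

static struct mb_cache_entry *
demo_cache_lookup(kdev_t dev, unsigned int hash, unsigned long wanted_block)
{
	struct mb_cache_entry *ce;

	/* walk every entry on this device whose index-0 key collides */
	ce = mb_cache_entry_find_first(demo_cache, 0, dev, hash);
	while (ce) {
		if (ce->e_block == wanted_block)
			return ce;	/* caller must release this handle */
		ce = mb_cache_entry_find_next(ce, 0, dev, hash);
	}
	return NULL;
}

mb_cache_shrink() and mb_cache_destroy() from the code above would pair with unmount and module unload respectively.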
+--- linux-2.4.20/include/asm-arm/unistd.h~linux-2.4.20-xattr-0.8.54    2002-08-03 08:39:45.000000000 +0800
++++ linux-2.4.20-root/include/asm-arm/unistd.h 2003-05-07 18:08:03.000000000 +0800
+@@ -244,7 +244,6 @@
+ #define __NR_security                 (__NR_SYSCALL_BASE+223)
+ #define __NR_gettid                   (__NR_SYSCALL_BASE+224)
+ #define __NR_readahead                        (__NR_SYSCALL_BASE+225)
+-#if 0 /* allocated in 2.5 */
+ #define __NR_setxattr                 (__NR_SYSCALL_BASE+226)
+ #define __NR_lsetxattr                        (__NR_SYSCALL_BASE+227)
+ #define __NR_fsetxattr                        (__NR_SYSCALL_BASE+228)
+@@ -257,7 +256,6 @@
+ #define __NR_removexattr              (__NR_SYSCALL_BASE+235)
+ #define __NR_lremovexattr             (__NR_SYSCALL_BASE+236)
+ #define __NR_fremovexattr             (__NR_SYSCALL_BASE+237)
+-#endif
+ #define __NR_tkill                    (__NR_SYSCALL_BASE+238)
+ /*
+  * Please check 2.5 _before_ adding calls here,
+--- linux-2.4.20/include/asm-ia64/unistd.h~linux-2.4.20-xattr-0.8.54   2002-11-29 07:53:15.000000000 +0800
++++ linux-2.4.20-root/include/asm-ia64/unistd.h        2003-05-07 18:08:03.000000000 +0800
+@@ -206,8 +206,19 @@
+ #define __NR_getdents64                       1214
+ #define __NR_getunwind                        1215
+ #define __NR_readahead                        1216
++#define __NR_setxattr                 1217
++#define __NR_lsetxattr                        1218
++#define __NR_fsetxattr                        1219
++#define __NR_getxattr                 1220
++#define __NR_lgetxattr                        1221
++#define __NR_fgetxattr                        1222
++#define __NR_listxattr                        1223
++#define __NR_llistxattr                       1224
++#define __NR_flistxattr                       1225
++#define __NR_removexattr              1226
++#define __NR_lremovexattr             1227
++#define __NR_fremovexattr             1228
+ /*
+- * 1217-1228: reserved for xattr
+  * 1230-1232: reserved for futex and sched_[sg]etaffinity.
+  */
+ #define __NR_tkill                    1229
+--- linux-2.4.20/include/asm-ppc64/unistd.h~linux-2.4.20-xattr-0.8.54  2002-08-03 08:39:45.000000000 +0800
++++ linux-2.4.20-root/include/asm-ppc64/unistd.h       2003-05-07 18:08:03.000000000 +0800
+@@ -218,6 +218,7 @@
+ #define __NR_gettid           207
+ #if 0 /* Reserved syscalls */
+ #define __NR_tkill            208
++#endif
+ #define __NR_setxattr         209
+ #define __NR_lsetxattr                210
+ #define __NR_fsetxattr                211
+@@ -230,6 +231,7 @@
+ #define __NR_removexattr      218
+ #define __NR_lremovexattr     219
+ #define __NR_fremovexattr     220
++#if 0 /* Reserved syscalls */
+ #define __NR_futex            221
+ #endif
+--- linux-2.4.20/include/asm-s390/unistd.h~linux-2.4.20-xattr-0.8.54   2002-08-03 08:39:45.000000000 +0800
++++ linux-2.4.20-root/include/asm-s390/unistd.h        2003-05-07 18:08:03.000000000 +0800
+@@ -212,9 +212,18 @@
+ #define __NR_madvise            219
+ #define __NR_getdents64               220
+ #define __NR_fcntl64          221
+-/*
+- * Numbers 224-235 are reserved for posix acl
+- */
++#define __NR_setxattr         224
++#define __NR_lsetxattr                225
++#define __NR_fsetxattr                226
++#define __NR_getxattr         227
++#define __NR_lgetxattr                228
++#define __NR_fgetxattr                229
++#define __NR_listxattr                230
++#define __NR_llistxattr               231
++#define __NR_flistxattr               232
++#define __NR_removexattr      233
++#define __NR_lremovexattr     234
++#define __NR_fremovexattr     235
+ #define __NR_gettid           236
+ #define __NR_tkill            237
+--- linux-2.4.20/include/asm-s390x/unistd.h~linux-2.4.20-xattr-0.8.54  2002-08-03 08:39:45.000000000 +0800
++++ linux-2.4.20-root/include/asm-s390x/unistd.h       2003-05-07 18:08:03.000000000 +0800
+@@ -180,9 +180,18 @@
+ #define __NR_pivot_root         217
+ #define __NR_mincore            218
+ #define __NR_madvise            219
+-/*
+- * Numbers 224-235 are reserved for posix acl
+- */
++#define __NR_setxattr         224
++#define __NR_lsetxattr                225
++#define __NR_fsetxattr                226
++#define __NR_getxattr         227
++#define __NR_lgetxattr                228
++#define __NR_fgetxattr                229
++#define __NR_listxattr                230
++#define __NR_llistxattr               231
++#define __NR_flistxattr               232
++#define __NR_removexattr      233
++#define __NR_lremovexattr     234
++#define __NR_fremovexattr     235
+ #define __NR_gettid           236
+ #define __NR_tkill            237
+--- linux-2.4.20/include/asm-sparc/unistd.h~linux-2.4.20-xattr-0.8.54  2002-08-03 08:39:45.000000000 +0800
++++ linux-2.4.20-root/include/asm-sparc/unistd.h       2003-05-07 18:08:03.000000000 +0800
+@@ -184,24 +184,24 @@
+ /* #define __NR_exportfs        166    SunOS Specific                              */
+ #define __NR_mount              167 /* Common                                      */
+ #define __NR_ustat              168 /* Common                                      */
+-/* #define __NR_semsys          169    SunOS Specific                              */
+-/* #define __NR_msgsys          170    SunOS Specific                              */
+-/* #define __NR_shmsys          171    SunOS Specific                              */
+-/* #define __NR_auditsys        172    SunOS Specific                              */
+-/* #define __NR_rfssys          173    SunOS Specific                              */
++#define __NR_setxattr           169 /* SunOS: semsys                               */
++#define __NR_lsetxattr          170 /* SunOS: msgsys                               */
++#define __NR_fsetxattr          171 /* SunOS: shmsys                               */
++#define __NR_getxattr           172 /* SunOS: auditsys                             */
++#define __NR_lgetxattr          173 /* SunOS: rfssys                               */
+ #define __NR_getdents           174 /* Common                                      */
+ #define __NR_setsid             175 /* Common                                      */
+ #define __NR_fchdir             176 /* Common                                      */
+-/* #define __NR_fchroot         177    SunOS Specific                              */
+-/* #define __NR_vpixsys         178    SunOS Specific                              */
+-/* #define __NR_aioread         179    SunOS Specific                              */
+-/* #define __NR_aiowrite        180    SunOS Specific                              */
+-/* #define __NR_aiowait         181    SunOS Specific                              */
+-/* #define __NR_aiocancel       182    SunOS Specific                              */
++#define __NR_fgetxattr          177 /* SunOS: fchroot                              */
++#define __NR_listxattr          178 /* SunOS: vpixsys                              */
++#define __NR_llistxattr         179 /* SunOS: aioread                              */
++#define __NR_flistxattr         180 /* SunOS: aiowrite                             */
++#define __NR_removexattr        181 /* SunOS: aiowait                              */
++#define __NR_lremovexattr       182 /* SunOS: aiocancel                            */
+ #define __NR_sigpending         183 /* Common                                      */
+ #define __NR_query_module     184 /* Linux Specific                              */
+ #define __NR_setpgid            185 /* Common                                      */
+-/* #define __NR_pathconf        186    SunOS Specific                              */
++#define __NR_fremovexattr       186 /* SunOS: pathconf                             */
+ #define __NR_tkill              187 /* SunOS: fpathconf                            */
+ /* #define __NR_sysconf         188    SunOS Specific                              */
+ #define __NR_uname              189 /* Linux Specific                              */
+--- linux-2.4.20/include/asm-sparc64/unistd.h~linux-2.4.20-xattr-0.8.54        2002-08-03 08:39:45.000000000 +0800
++++ linux-2.4.20-root/include/asm-sparc64/unistd.h     2003-05-07 18:08:03.000000000 +0800
+@@ -184,24 +184,24 @@
+ /* #define __NR_exportfs        166    SunOS Specific                              */
+ #define __NR_mount              167 /* Common                                      */
+ #define __NR_ustat              168 /* Common                                      */
+-/* #define __NR_semsys          169    SunOS Specific                              */
+-/* #define __NR_msgsys          170    SunOS Specific                              */
+-/* #define __NR_shmsys          171    SunOS Specific                              */
+-/* #define __NR_auditsys        172    SunOS Specific                              */
+-/* #define __NR_rfssys          173    SunOS Specific                              */
++#define __NR_setxattr           169 /* SunOS: semsys                               */
++#define __NR_lsetxattr          170 /* SunOS: msgsys                               */
++#define __NR_fsetxattr          171 /* SunOS: shmsys                               */
++#define __NR_getxattr           172 /* SunOS: auditsys                             */
++#define __NR_lgetxattr          173 /* SunOS: rfssys                               */
+ #define __NR_getdents           174 /* Common                                      */
+ #define __NR_setsid             175 /* Common                                      */
+ #define __NR_fchdir             176 /* Common                                      */
+-/* #define __NR_fchroot         177    SunOS Specific                              */
+-/* #define __NR_vpixsys         178    SunOS Specific                              */
+-/* #define __NR_aioread         179    SunOS Specific                              */
+-/* #define __NR_aiowrite        180    SunOS Specific                              */
+-/* #define __NR_aiowait         181    SunOS Specific                              */
+-/* #define __NR_aiocancel       182    SunOS Specific                              */
++#define __NR_fgetxattr          177 /* SunOS: fchroot                              */
++#define __NR_listxattr          178 /* SunOS: vpixsys                              */
++#define __NR_llistxattr         179 /* SunOS: aioread                              */
++#define __NR_flistxattr         180 /* SunOS: aiowrite                             */
++#define __NR_removexattr        181 /* SunOS: aiowait                              */
++#define __NR_lremovexattr       182 /* SunOS: aiocancel                            */
+ #define __NR_sigpending         183 /* Common                                      */
+ #define __NR_query_module     184 /* Linux Specific                              */
+ #define __NR_setpgid            185 /* Common                                      */
+-/* #define __NR_pathconf        186    SunOS Specific                              */
++#define __NR_fremovexattr       186 /* SunOS: pathconf                             */
+ #define __NR_tkill              187 /* SunOS: fpathconf                            */
+ /* #define __NR_sysconf         188    SunOS Specific                              */
+ #define __NR_uname              189 /* Linux Specific                              */
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-2.4.20-root/include/linux/cache_def.h        2003-05-07 18:08:03.000000000 +0800
+@@ -0,0 +1,15 @@
++/*
++ * linux/cache_def.h
++ * Handling of caches defined in drivers, filesystems, ...
++ *
++ * Copyright (C) 2002 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++struct cache_definition {
++      const char *name;
++      void (*shrink)(int, unsigned int);
++      struct list_head link;
++};
++
++extern void register_cache(struct cache_definition *);
++extern void unregister_cache(struct cache_definition *);
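register_cache()/unregister_cache() (implemented in the mm/vmscan.c hunk further down) let such a cache be shrunk under memory pressure, exactly as init_mbcache()/exit_mbcache() above already do. A minimal hypothetical registration might look like the sketch below -- not part of the patch; the demo names are invented, and the (priority, gfp_mask) argument order is taken from the shrink_other_caches() caller added in vmscan.c.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/cache_def.h>

/* invoked from shrink_caches() under memory pressure */
static void demo_shrink(int priority, unsigned int gfp_mask)
{
	/* release some clean, unused objects; be more aggressive
	 * as priority drops toward zero */
}

static struct cache_definition demo_cache_def = {
	name:	"demo",
	shrink:	demo_shrink,
};

static int __init demo_init(void)
{
	register_cache(&demo_cache_def);
	return 0;
}

static void __exit demo_exit(void)
{
	unregister_cache(&demo_cache_def);
}

module_init(demo_init);
module_exit(demo_exit);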
+--- linux-2.4.20/include/linux/errno.h~linux-2.4.20-xattr-0.8.54       2003-04-14 16:39:03.000000000 +0800
++++ linux-2.4.20-root/include/linux/errno.h    2003-05-07 18:08:03.000000000 +0800
+@@ -23,4 +23,8 @@
+ #endif
++/* Defined for extended attributes */
++#define ENOATTR ENODATA               /* No such attribute */
++#define ENOTSUP EOPNOTSUPP    /* Operation not supported */
++
+ #endif
+--- linux-2.4.20/include/linux/ext2_fs.h~linux-2.4.20-xattr-0.8.54     2003-04-14 16:39:08.000000000 +0800
++++ linux-2.4.20-root/include/linux/ext2_fs.h  2003-05-07 18:08:03.000000000 +0800
+@@ -57,8 +57,6 @@
+  */
+ #define       EXT2_BAD_INO             1      /* Bad blocks inode */
+ #define EXT2_ROOT_INO          2      /* Root inode */
+-#define EXT2_ACL_IDX_INO       3      /* ACL inode */
+-#define EXT2_ACL_DATA_INO      4      /* ACL inode */
+ #define EXT2_BOOT_LOADER_INO   5      /* Boot loader inode */
+ #define EXT2_UNDEL_DIR_INO     6      /* Undelete directory inode */
+@@ -86,7 +84,6 @@
+ #else
+ # define EXT2_BLOCK_SIZE(s)           (EXT2_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+ #endif
+-#define EXT2_ACLE_PER_BLOCK(s)                (EXT2_BLOCK_SIZE(s) / sizeof (struct ext2_acl_entry))
+ #define       EXT2_ADDR_PER_BLOCK(s)          (EXT2_BLOCK_SIZE(s) / sizeof (__u32))
+ #ifdef __KERNEL__
+ # define EXT2_BLOCK_SIZE_BITS(s)      ((s)->s_blocksize_bits)
+@@ -121,28 +118,6 @@
+ #endif
+ /*
+- * ACL structures
+- */
+-struct ext2_acl_header        /* Header of Access Control Lists */
+-{
+-      __u32   aclh_size;
+-      __u32   aclh_file_count;
+-      __u32   aclh_acle_count;
+-      __u32   aclh_first_acle;
+-};
+-
+-struct ext2_acl_entry /* Access Control List Entry */
+-{
+-      __u32   acle_size;
+-      __u16   acle_perms;     /* Access permissions */
+-      __u16   acle_type;      /* Type of entry */
+-      __u16   acle_tag;       /* User or group identity */
+-      __u16   acle_pad1;
+-      __u32   acle_next;      /* Pointer on next entry for the */
+-                                      /* same inode or on next free entry */
+-};
+-
+-/*
+  * Structure of a blocks group descriptor
+  */
+ struct ext2_group_desc
+@@ -314,6 +289,7 @@ struct ext2_inode {
+ #define EXT2_MOUNT_ERRORS_PANIC               0x0040  /* Panic on errors */
+ #define EXT2_MOUNT_MINIX_DF           0x0080  /* Mimics the Minix statfs */
+ #define EXT2_MOUNT_NO_UID32           0x0200  /* Disable 32-bit UIDs */
++#define EXT2_MOUNT_XATTR_USER         0x4000  /* Extended user attributes */
+ #define clear_opt(o, opt)             o &= ~EXT2_MOUNT_##opt
+ #define set_opt(o, opt)                       o |= EXT2_MOUNT_##opt
+@@ -397,6 +373,7 @@ struct ext2_super_block {
+ #ifdef __KERNEL__
+ #define EXT2_SB(sb)   (&((sb)->u.ext2_sb))
++#define EXT2_I(inode) (&((inode)->u.ext2_i))
+ #else
+ /* Assume that user mode programs are passing in an ext2fs superblock, not
+  * a kernel struct super_block.  This will allow us to call the feature-test
+@@ -466,7 +443,7 @@ struct ext2_super_block {
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV     0x0008
+ #define EXT2_FEATURE_INCOMPAT_ANY             0xffffffff
+-#define EXT2_FEATURE_COMPAT_SUPP      0
++#define EXT2_FEATURE_COMPAT_SUPP      EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT2_FEATURE_INCOMPAT_SUPP    EXT2_FEATURE_INCOMPAT_FILETYPE
+ #define EXT2_FEATURE_RO_COMPAT_SUPP   (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+                                        EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \
+@@ -623,8 +600,10 @@ extern struct address_space_operations e
+ /* namei.c */
+ extern struct inode_operations ext2_dir_inode_operations;
++extern struct inode_operations ext2_special_inode_operations;
+ /* symlink.c */
++extern struct inode_operations ext2_symlink_inode_operations;
+ extern struct inode_operations ext2_fast_symlink_inode_operations;
+ #endif        /* __KERNEL__ */
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-2.4.20-root/include/linux/ext2_xattr.h       2003-05-07 18:08:03.000000000 +0800
+@@ -0,0 +1,157 @@
++/*
++  File: linux/ext2_xattr.h
++
++  On-disk format of extended attributes for the ext2 filesystem.
++
++  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++#include <linux/config.h>
++#include <linux/init.h>
++#include <linux/xattr.h>
++
++/* Magic value in attribute blocks */
++#define EXT2_XATTR_MAGIC              0xEA020000
++
++/* Maximum number of references to one attribute block */
++#define EXT2_XATTR_REFCOUNT_MAX               1024
++
++/* Name indexes */
++#define EXT2_XATTR_INDEX_MAX                  10
++#define EXT2_XATTR_INDEX_USER                 1
++#define EXT2_XATTR_INDEX_POSIX_ACL_ACCESS     2
++#define EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT    3
++
++struct ext2_xattr_header {
++      __u32   h_magic;        /* magic number for identification */
++      __u32   h_refcount;     /* reference count */
++      __u32   h_blocks;       /* number of disk blocks used */
++      __u32   h_hash;         /* hash value of all attributes */
++      __u32   h_reserved[4];  /* zero right now */
++};
++
++struct ext2_xattr_entry {
++      __u8    e_name_len;     /* length of name */
++      __u8    e_name_index;   /* attribute name index */
++      __u16   e_value_offs;   /* offset in disk block of value */
++      __u32   e_value_block;  /* disk block attribute is stored on (n/i) */
++      __u32   e_value_size;   /* size of attribute value */
++      __u32   e_hash;         /* hash value of name and value */
++      char    e_name[0];      /* attribute name */
++};
++
++#define EXT2_XATTR_PAD_BITS           2
++#define EXT2_XATTR_PAD                (1<<EXT2_XATTR_PAD_BITS)
++#define EXT2_XATTR_ROUND              (EXT2_XATTR_PAD-1)
++#define EXT2_XATTR_LEN(name_len) \
++      (((name_len) + EXT2_XATTR_ROUND + \
++      sizeof(struct ext2_xattr_entry)) & ~EXT2_XATTR_ROUND)
++#define EXT2_XATTR_NEXT(entry) \
++      ( (struct ext2_xattr_entry *)( \
++        (char *)(entry) + EXT2_XATTR_LEN((entry)->e_name_len)) )
++#define EXT2_XATTR_SIZE(size) \
++      (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)
++
++#ifdef __KERNEL__
++
++# ifdef CONFIG_EXT2_FS_XATTR
++
++struct ext2_xattr_handler {
++      char *prefix;
++      size_t (*list)(char *list, struct inode *inode, const char *name,
++                     int name_len);
++      int (*get)(struct inode *inode, const char *name, void *buffer,
++                 size_t size);
++      int (*set)(struct inode *inode, const char *name, const void *buffer,
++                 size_t size, int flags);
++};
++
++extern int ext2_xattr_register(int, struct ext2_xattr_handler *);
++extern void ext2_xattr_unregister(int, struct ext2_xattr_handler *);
++
++extern int ext2_setxattr(struct dentry *, const char *, const void *, size_t, int);
++extern ssize_t ext2_getxattr(struct dentry *, const char *, void *, size_t);
++extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
++extern int ext2_removexattr(struct dentry *, const char *);
++
++extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t);
++extern int ext2_xattr_list(struct inode *, char *, size_t);
++extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
++
++extern void ext2_xattr_delete_inode(struct inode *);
++extern void ext2_xattr_put_super(struct super_block *);
++
++extern int init_ext2_xattr(void) __init;
++extern void exit_ext2_xattr(void);
++
++# else  /* CONFIG_EXT2_FS_XATTR */
++#  define ext2_setxattr               NULL
++#  define ext2_getxattr               NULL
++#  define ext2_listxattr      NULL
++#  define ext2_removexattr    NULL
++
++static inline int
++ext2_xattr_get(struct inode *inode, int name_index,
++             const char *name, void *buffer, size_t size)
++{
++      return -ENOTSUP;
++}
++
++static inline int
++ext2_xattr_list(struct inode *inode, char *buffer, size_t size)
++{
++      return -ENOTSUP;
++}
++
++static inline int
++ext2_xattr_set(struct inode *inode, int name_index, const char *name,
++             const void *value, size_t size, int flags)
++{
++      return -ENOTSUP;
++}
++
++static inline void
++ext2_xattr_delete_inode(struct inode *inode)
++{
++}
++
++static inline void
++ext2_xattr_put_super(struct super_block *sb)
++{
++}
++
++static inline int
++init_ext2_xattr(void)
++{
++      return 0;
++}
++
++static inline void
++exit_ext2_xattr(void)
++{
++}
++
++# endif  /* CONFIG_EXT2_FS_XATTR */
++
++# ifdef CONFIG_EXT2_FS_XATTR_USER
++
++extern int init_ext2_xattr_user(void) __init;
++extern void exit_ext2_xattr_user(void);
++
++# else  /* CONFIG_EXT2_FS_XATTR_USER */
++
++static inline int
++init_ext2_xattr_user(void)
++{
++      return 0;
++}
++
++static inline void
++exit_ext2_xattr_user(void)
++{
++}
++
++# endif  /* CONFIG_EXT2_FS_XATTR_USER */
++
++#endif  /* __KERNEL__ */
++
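The padding macros above keep every on-disk entry and value aligned to EXT2_XATTR_PAD (4) bytes, with the handler prefix (such as "user.") presumably carried by e_name_index rather than stored in e_name. A worked illustration -- not part of the patch, and assuming the usual C layout of struct ext2_xattr_entry (16 bytes before the flexible e_name array):

/*
 * For an attribute stored with e_name "foo" (e_name_len = 3) and a
 * 5-byte value:
 *
 *   EXT2_XATTR_LEN(3)  = (3 + 3 + 16) & ~3 = 20  bytes of entry + name
 *   EXT2_XATTR_SIZE(5) = (5 + 3) & ~3      =  8  bytes reserved for the value
 *
 * EXT2_XATTR_NEXT(entry) advances by EXT2_XATTR_LEN(e_name_len), which is
 * how a block of attributes is walked entry by entry.
 */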
+--- linux-2.4.20/include/linux/ext3_fs.h~linux-2.4.20-xattr-0.8.54     2003-05-05 19:01:04.000000000 +0800
++++ linux-2.4.20-root/include/linux/ext3_fs.h  2003-05-07 18:08:03.000000000 +0800
+@@ -63,8 +63,6 @@
+  */
+ #define       EXT3_BAD_INO             1      /* Bad blocks inode */
+ #define EXT3_ROOT_INO          2      /* Root inode */
+-#define EXT3_ACL_IDX_INO       3      /* ACL inode */
+-#define EXT3_ACL_DATA_INO      4      /* ACL inode */
+ #define EXT3_BOOT_LOADER_INO   5      /* Boot loader inode */
+ #define EXT3_UNDEL_DIR_INO     6      /* Undelete directory inode */
+ #define EXT3_RESIZE_INO                7      /* Reserved group descriptors inode */
+@@ -94,7 +92,6 @@
+ #else
+ # define EXT3_BLOCK_SIZE(s)           (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+ #endif
+-#define EXT3_ACLE_PER_BLOCK(s)                (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry))
+ #define       EXT3_ADDR_PER_BLOCK(s)          (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
+ #ifdef __KERNEL__
+ # define EXT3_BLOCK_SIZE_BITS(s)      ((s)->s_blocksize_bits)
+@@ -129,28 +126,6 @@
+ #endif
+ /*
+- * ACL structures
+- */
+-struct ext3_acl_header        /* Header of Access Control Lists */
+-{
+-      __u32   aclh_size;
+-      __u32   aclh_file_count;
+-      __u32   aclh_acle_count;
+-      __u32   aclh_first_acle;
+-};
+-
+-struct ext3_acl_entry /* Access Control List Entry */
+-{
+-      __u32   acle_size;
+-      __u16   acle_perms;     /* Access permissions */
+-      __u16   acle_type;      /* Type of entry */
+-      __u16   acle_tag;       /* User or group identity */
+-      __u16   acle_pad1;
+-      __u32   acle_next;      /* Pointer on next entry for the */
+-                                      /* same inode or on next free entry */
+-};
+-
+-/*
+  * Structure of a blocks group descriptor
+  */
+ struct ext3_group_desc
+@@ -344,6 +319,7 @@ struct ext3_inode {
+   #define EXT3_MOUNT_WRITEBACK_DATA   0x0C00  /* No data ordering */
+ #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
++#define EXT3_MOUNT_XATTR_USER         0x4000  /* Extended user attributes */
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+@@ -520,7 +496,7 @@ struct ext3_super_block {
+ #define EXT3_FEATURE_INCOMPAT_RECOVER         0x0004 /* Needs recovery */
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV     0x0008 /* Journal device */
+-#define EXT3_FEATURE_COMPAT_SUPP      0
++#define EXT3_FEATURE_COMPAT_SUPP      EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT3_FEATURE_INCOMPAT_SUPP    (EXT3_FEATURE_INCOMPAT_FILETYPE| \
+                                        EXT3_FEATURE_INCOMPAT_RECOVER)
+ #define EXT3_FEATURE_RO_COMPAT_SUPP   (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+@@ -703,6 +679,7 @@ extern void ext3_check_inodes_bitmap (st
+ extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
+ /* inode.c */
++extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+@@ -771,8 +748,10 @@ extern struct address_space_operations e
+ /* namei.c */
+ extern struct inode_operations ext3_dir_inode_operations;
++extern struct inode_operations ext3_special_inode_operations;
+ /* symlink.c */
++extern struct inode_operations ext3_symlink_inode_operations;
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
+--- linux-2.4.20/include/linux/ext3_jbd.h~linux-2.4.20-xattr-0.8.54    2003-05-05 19:01:02.000000000 +0800
++++ linux-2.4.20-root/include/linux/ext3_jbd.h 2003-05-07 18:08:03.000000000 +0800
+@@ -30,13 +30,19 @@
+ #define EXT3_SINGLEDATA_TRANS_BLOCKS  8
++/* Extended attributes may touch two data buffers, two bitmap buffers,
++ * and two group and summaries. */
++
++#define EXT3_XATTR_TRANS_BLOCKS               8
++
+ /* Define the minimum size for a transaction which modifies data.  This
+  * needs to take into account the fact that we may end up modifying two
+  * quota files too (one for the group, one for the user quota).  The
+  * superblock only gets updated once, of course, so don't bother
+  * counting that again for the quota updates. */
+-#define EXT3_DATA_TRANS_BLOCKS                (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2)
++#define EXT3_DATA_TRANS_BLOCKS                (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \
++                                       EXT3_XATTR_TRANS_BLOCKS - 2)
+ extern int ext3_writepage_trans_blocks(struct inode *inode);
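With EXT3_SINGLEDATA_TRANS_BLOCKS fixed at 8, this change raises the per-transaction reservation from 3*8 - 2 = 22 journal credits to 3*8 + 8 - 2 = 30, the extra EXT3_XATTR_TRANS_BLOCKS = 8 covering the additional attribute-block, bitmap and group-descriptor buffers that the comment above enumerates.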
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-2.4.20-root/include/linux/ext3_xattr.h       2003-05-07 18:08:03.000000000 +0800
+@@ -0,0 +1,157 @@
++/*
++  File: linux/ext3_xattr.h
++
++  On-disk format of extended attributes for the ext3 filesystem.
++
++  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++#include <linux/config.h>
++#include <linux/init.h>
++#include <linux/xattr.h>
++
++/* Magic value in attribute blocks */
++#define EXT3_XATTR_MAGIC              0xEA020000
++
++/* Maximum number of references to one attribute block */
++#define EXT3_XATTR_REFCOUNT_MAX               1024
++
++/* Name indexes */
++#define EXT3_XATTR_INDEX_MAX                  10
++#define EXT3_XATTR_INDEX_USER                 1
++#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS     2
++#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT    3
++
++struct ext3_xattr_header {
++      __u32   h_magic;        /* magic number for identification */
++      __u32   h_refcount;     /* reference count */
++      __u32   h_blocks;       /* number of disk blocks used */
++      __u32   h_hash;         /* hash value of all attributes */
++      __u32   h_reserved[4];  /* zero right now */
++};
++
++struct ext3_xattr_entry {
++      __u8    e_name_len;     /* length of name */
++      __u8    e_name_index;   /* attribute name index */
++      __u16   e_value_offs;   /* offset in disk block of value */
++      __u32   e_value_block;  /* disk block attribute is stored on (n/i) */
++      __u32   e_value_size;   /* size of attribute value */
++      __u32   e_hash;         /* hash value of name and value */
++      char    e_name[0];      /* attribute name */
++};
++
++#define EXT3_XATTR_PAD_BITS           2
++#define EXT3_XATTR_PAD                (1<<EXT3_XATTR_PAD_BITS)
++#define EXT3_XATTR_ROUND              (EXT3_XATTR_PAD-1)
++#define EXT3_XATTR_LEN(name_len) \
++      (((name_len) + EXT3_XATTR_ROUND + \
++      sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND)
++#define EXT3_XATTR_NEXT(entry) \
++      ( (struct ext3_xattr_entry *)( \
++        (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) )
++#define EXT3_XATTR_SIZE(size) \
++      (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)
++
++#ifdef __KERNEL__
++
++# ifdef CONFIG_EXT3_FS_XATTR
++
++struct ext3_xattr_handler {
++      char *prefix;
++      size_t (*list)(char *list, struct inode *inode, const char *name,
++                     int name_len);
++      int (*get)(struct inode *inode, const char *name, void *buffer,
++                 size_t size);
++      int (*set)(struct inode *inode, const char *name, const void *buffer,
++                 size_t size, int flags);
++};
++
++extern int ext3_xattr_register(int, struct ext3_xattr_handler *);
++extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *);
++
++extern int ext3_setxattr(struct dentry *, const char *, const void *, size_t, int);
++extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t);
++extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
++extern int ext3_removexattr(struct dentry *, const char *);
++
++extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
++extern int ext3_xattr_list(struct inode *, char *, size_t);
++extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, const void *, size_t, int);
++
++extern void ext3_xattr_delete_inode(handle_t *, struct inode *);
++extern void ext3_xattr_put_super(struct super_block *);
++
++extern int init_ext3_xattr(void) __init;
++extern void exit_ext3_xattr(void);
++
++# else  /* CONFIG_EXT3_FS_XATTR */
++#  define ext3_setxattr               NULL
++#  define ext3_getxattr               NULL
++#  define ext3_listxattr      NULL
++#  define ext3_removexattr    NULL
++
++static inline int
++ext3_xattr_get(struct inode *inode, int name_index, const char *name,
++             void *buffer, size_t size)
++{
++      return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_list(struct inode *inode, void *buffer, size_t size)
++{
++      return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
++             const char *name, const void *value, size_t size, int flags)
++{
++      return -ENOTSUP;
++}
++
++static inline void
++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
++{
++}
++
++static inline void
++ext3_xattr_put_super(struct super_block *sb)
++{
++}
++
++static inline int
++init_ext3_xattr(void)
++{
++      return 0;
++}
++
++static inline void
++exit_ext3_xattr(void)
++{
++}
++
++# endif  /* CONFIG_EXT3_FS_XATTR */
++
++# ifdef CONFIG_EXT3_FS_XATTR_USER
++
++extern int init_ext3_xattr_user(void) __init;
++extern void exit_ext3_xattr_user(void);
++
++# else  /* CONFIG_EXT3_FS_XATTR_USER */
++
++static inline int
++init_ext3_xattr_user(void)
++{
++      return 0;
++}
++
++static inline void
++exit_ext3_xattr_user(void)
++{
++}
++
++#endif  /* CONFIG_EXT3_FS_XATTR_USER */
++
++#endif  /* __KERNEL__ */
++
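The ext3_xattr_handler / ext3_xattr_register() pair above dispatches attribute operations by name prefix. The sketch below shows the shape of such a handler -- it is not part of the patch; the demo names and DEMO_XATTR_INDEX are invented, and registering at an unused slot below EXT3_XATTR_INDEX_MAX is an assumption.

#include <linux/fs.h>
#include <linux/ext3_xattr.h>

#define DEMO_XATTR_INDEX 4	/* hypothetical, otherwise unused name index */

static size_t
demo_xattr_list(char *list, struct inode *inode, const char *name,
		int name_len)
{
	return 0;	/* contribute no names to listxattr() */
}

static int
demo_xattr_get(struct inode *inode, const char *name, void *buffer,
	       size_t size)
{
	return ext3_xattr_get(inode, DEMO_XATTR_INDEX, name, buffer, size);
}

static int
demo_xattr_set(struct inode *inode, const char *name, const void *value,
	       size_t size, int flags)
{
	return -EPERM;	/* read-only in this sketch */
}

static struct ext3_xattr_handler demo_xattr_handler = {
	prefix:	"demo.",
	list:	demo_xattr_list,
	get:	demo_xattr_get,
	set:	demo_xattr_set,
};

/* paired with ext3_xattr_register(DEMO_XATTR_INDEX, &demo_xattr_handler)
 * at module init and ext3_xattr_unregister() on exit */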
+--- linux-2.4.20/include/linux/fs.h~linux-2.4.20-xattr-0.8.54  2003-05-05 19:00:55.000000000 +0800
++++ linux-2.4.20-root/include/linux/fs.h       2003-05-07 18:08:03.000000000 +0800
+@@ -888,7 +888,7 @@ struct inode_operations {
+       int (*setattr) (struct dentry *, struct iattr *);
+       int (*setattr_raw) (struct inode *, struct iattr *);
+       int (*getattr) (struct dentry *, struct iattr *);
+-      int (*setxattr) (struct dentry *, const char *, void *, size_t, int);
++      int (*setxattr) (struct dentry *, const char *, const void *, size_t, int);
+       ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+       ssize_t (*listxattr) (struct dentry *, char *, size_t);
+       int (*removexattr) (struct dentry *, const char *);
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-2.4.20-root/include/linux/mbcache.h  2003-05-07 18:08:03.000000000 +0800
+@@ -0,0 +1,69 @@
++/*
++  File: linux/mbcache.h
++
++  (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++/* Hardwire the number of additional indexes */
++#define MB_CACHE_INDEXES_COUNT 1
++
++struct mb_cache_entry;
++
++struct mb_cache_op {
++      int (*free)(struct mb_cache_entry *, int);
++};
++
++struct mb_cache {
++      struct list_head                c_cache_list;
++      const char                      *c_name;
++      struct mb_cache_op              c_op;
++      atomic_t                        c_entry_count;
++      int                             c_bucket_count;
++#ifndef MB_CACHE_INDEXES_COUNT
++      int                             c_indexes_count;
++#endif
++      kmem_cache_t                    *c_entry_cache;
++      struct list_head                *c_block_hash;
++      struct list_head                *c_indexes_hash[0];
++};
++
++struct mb_cache_entry_index {
++      struct list_head                o_list;
++      unsigned int                    o_key;
++};
++
++struct mb_cache_entry {
++      struct list_head                e_lru_list;
++      struct mb_cache                 *e_cache;
++      atomic_t                        e_used;
++      kdev_t                          e_dev;
++      unsigned long                   e_block;
++      struct list_head                e_block_list;
++      struct mb_cache_entry_index     e_indexes[0];
++};
++
++/* Functions on caches */
++
++struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t,
++                                int, int);
++void mb_cache_shrink(struct mb_cache *, kdev_t);
++void mb_cache_destroy(struct mb_cache *);
++
++/* Functions on cache entries */
++
++struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *);
++int mb_cache_entry_insert(struct mb_cache_entry *, kdev_t, unsigned long,
++                        unsigned int[]);
++void mb_cache_entry_rehash(struct mb_cache_entry *, unsigned int[]);
++void mb_cache_entry_release(struct mb_cache_entry *);
++void mb_cache_entry_takeout(struct mb_cache_entry *);
++void mb_cache_entry_free(struct mb_cache_entry *);
++struct mb_cache_entry *mb_cache_entry_dup(struct mb_cache_entry *);
++struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, kdev_t,
++                                        unsigned long);
++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
++struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, int,
++                                               kdev_t, unsigned int);
++struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, int,
++                                              kdev_t, unsigned int);
++#endif
+--- linux-2.4.20/kernel/ksyms.c~linux-2.4.20-xattr-0.8.54      2003-05-05 17:43:15.000000000 +0800
++++ linux-2.4.20-root/kernel/ksyms.c   2003-05-07 18:08:03.000000000 +0800
+@@ -11,6 +11,7 @@
+ #include <linux/config.h>
+ #include <linux/slab.h>
++#include <linux/cache_def.h>
+ #include <linux/module.h>
+ #include <linux/blkdev.h>
+ #include <linux/cdrom.h>
+@@ -89,6 +90,7 @@ EXPORT_SYMBOL(exit_mm);
+ EXPORT_SYMBOL(exit_files);
+ EXPORT_SYMBOL(exit_fs);
+ EXPORT_SYMBOL(exit_sighand);
++EXPORT_SYMBOL(copy_fs_struct);
+ /* internal kernel memory management */
+ EXPORT_SYMBOL(_alloc_pages);
+@@ -107,6 +109,8 @@ EXPORT_SYMBOL(kmem_cache_validate);
+ EXPORT_SYMBOL(kmem_cache_alloc);
+ EXPORT_SYMBOL(kmem_cache_free);
+ EXPORT_SYMBOL(kmem_cache_size);
++EXPORT_SYMBOL(register_cache);
++EXPORT_SYMBOL(unregister_cache);
+ EXPORT_SYMBOL(kmalloc);
+ EXPORT_SYMBOL(kfree);
+ EXPORT_SYMBOL(vfree);
+--- linux-2.4.20/mm/vmscan.c~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:15.000000000 +0800
++++ linux-2.4.20-root/mm/vmscan.c      2003-05-07 18:08:03.000000000 +0800
+@@ -18,6 +18,7 @@
+ #include <linux/kernel_stat.h>
+ #include <linux/swap.h>
+ #include <linux/swapctl.h>
++#include <linux/cache_def.h>
+ #include <linux/smp_lock.h>
+ #include <linux/pagemap.h>
+ #include <linux/init.h>
+@@ -34,6 +35,39 @@
+  */
+ #define DEF_PRIORITY (6)
++static DECLARE_MUTEX(other_caches_sem);
++static LIST_HEAD(cache_definitions);
++
++void register_cache(struct cache_definition *cache)
++{
++      down(&other_caches_sem);
++      list_add(&cache->link, &cache_definitions);
++      up(&other_caches_sem);
++}
++
++void unregister_cache(struct cache_definition *cache)
++{
++      down(&other_caches_sem);
++      list_del(&cache->link);
++      up(&other_caches_sem);
++}
++
++static void shrink_other_caches(unsigned int priority, int gfp_mask)
++{
++      struct list_head *p;
++
++      if (down_trylock(&other_caches_sem))
++              return;
++
++      list_for_each_prev(p, &cache_definitions) {
++              struct cache_definition *cache =
++                      list_entry(p, struct cache_definition, link);
++
++              cache->shrink(priority, gfp_mask);
++      }
++      up(&other_caches_sem);
++}
++
+ /*
+  * The swap-out function returns 1 if it successfully
+  * scanned all the pages it was asked to (`count').
+@@ -577,6 +611,7 @@ static int shrink_caches(zone_t * classz
+       shrink_dcache_memory(priority, gfp_mask);
+       shrink_icache_memory(priority, gfp_mask);
++      shrink_other_caches(priority, gfp_mask);
+ #ifdef CONFIG_QUOTA
+       shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
+ #endif
+--- /dev/null  2003-01-30 18:24:37.000000000 +0800
++++ linux-root/fs/ext3/ext3-exports.c  2003-05-05 18:19:11.000000000 +0800
+@@ -0,0 +1,13 @@
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
++
++EXPORT_SYMBOL(ext3_force_commit);
++EXPORT_SYMBOL(ext3_bread);
++EXPORT_SYMBOL(ext3_xattr_register);
++EXPORT_SYMBOL(ext3_xattr_unregister);
++EXPORT_SYMBOL(ext3_xattr_get);
++EXPORT_SYMBOL(ext3_xattr_list);
++EXPORT_SYMBOL(ext3_xattr_set);
+
+_
diff --git a/lustre/kernel_patches/patches/lustre-2.5.63.patch b/lustre/kernel_patches/patches/lustre-2.5.63.patch
new file mode 100644 (file)
index 0000000..40e6a90
--- /dev/null
@@ -0,0 +1,862 @@
+ arch/um/kernel/mem.c   |   18 ++++++
+ fs/dcache.c            |   12 +++-
+ fs/namei.c             |  132 ++++++++++++++++++++++++++++++++++++++-----------
+ fs/namespace.c         |    1 
+ fs/nfsd/vfs.c          |    2 
+ fs/open.c              |   39 ++++++++++++--
+ fs/stat.c              |    2 
+ fs/sysfs/inode.c       |    2 
+ include/linux/dcache.h |   28 ++++++++++
+ include/linux/fs.h     |   20 +++++++
+ include/linux/namei.h  |    3 -
+ include/linux/slab.h   |    1 
+ kernel/ksyms.c         |    7 ++
+ mm/slab.c              |    5 +
+ net/unix/af_unix.c     |    2 
+ 15 files changed, 231 insertions(+), 43 deletions(-)
+
+--- linux-2.5.63-nointent/arch/um/kernel/mem.c~lustre-2.5.63   Tue Mar 18 15:02:10 2003
++++ linux-2.5.63-nointent-root/arch/um/kernel/mem.c    Tue Mar 18 15:02:10 2003
+@@ -660,6 +660,22 @@ struct page *pte_mem_map(pte_t pte)
+       return(phys_mem_map(pte_val(pte)));
+ }
++struct page *check_get_page(unsigned long kaddr)
++{
++        struct page *page;
++        struct mem_region *mr;
++        unsigned long phys = __pa(kaddr);
++      unsigned int n = phys_region_index(phys);
++
++      if(regions[n] == NULL) 
++                return NULL; 
++
++        mr = regions[n];
++        page = (struct page *) mr->mem_map;
++      return page + ((phys_addr(phys)) >> PAGE_SHIFT);
++}
++
++
+ struct mem_region *page_region(struct page *page, int *index_out)
+ {
+       int i;
+@@ -747,7 +763,7 @@ extern unsigned long region_pa(void *vir
+                  (addr <= region->start + region->len))
+                       return(mk_phys(addr - region->start, i));
+       }
+-      panic("region_pa : no region for virtual address");
++      //panic("region_pa : no region for virtual address");
+       return(0);
+ }
+--- linux-2.5.63-nointent/fs/namei.c~lustre-2.5.63     Tue Mar 18 15:02:10 2003
++++ linux-2.5.63-nointent-root/fs/namei.c      Mon Mar 24 17:08:18 2003
+@@ -101,6 +101,14 @@
+  * any extra contention...
+  */
++void intent_release(struct dentry *de, struct lookup_intent *it)
++{
++      if (it && de->d_op && de->d_op->d_intent_release)
++              de->d_op->d_intent_release(de, it);
++
++}
++
++
+ /* In order to reduce some races, while at the same time doing additional
+  * checking and hopefully speeding things up, we copy filenames to the
+  * kernel data space before using them..
+@@ -273,10 +281,18 @@ void path_release(struct nameidata *nd)
+  * Internal lookup() using the new generic dcache.
+  * SMP-safe
+  */
+-static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags)
++static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags, struct lookup_intent *it)
+ {
+       struct dentry * dentry = d_lookup(parent, name);
+       
++      if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) {
++              if (!dentry->d_op->d_revalidate2(dentry, flags, it) &&
++                  !d_invalidate(dentry)) {
++                      dput(dentry);
++                      dentry = NULL;
++              }
++              return dentry;
++      } else
+       if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
+               if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) {
+                       dput(dentry);
+@@ -330,7 +346,7 @@ ok:
+  * make sure that nobody added the entry to the dcache in the meantime..
+  * SMP-safe
+  */
+-static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags)
++static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags, struct lookup_intent *it)
+ {
+       struct dentry * result;
+       struct inode *dir = parent->d_inode;
+@@ -348,7 +364,10 @@ static struct dentry * real_lookup(struc
+               struct dentry * dentry = d_alloc(parent, name);
+               result = ERR_PTR(-ENOMEM);
+               if (dentry) {
+-                      result = dir->i_op->lookup(dir, dentry);
++                      if (dir->i_op->lookup2)
++                              result = dir->i_op->lookup2(dir, dentry, it);
++                      else
++                                result = dir->i_op->lookup(dir, dentry);
+                       if (result)
+                               dput(dentry);
+                       else {
+@@ -370,6 +389,12 @@ static struct dentry * real_lookup(struc
+                       dput(result);
+                       result = ERR_PTR(-ENOENT);
+               }
++      } else if (result->d_op && result->d_op->d_revalidate2) {
++              if (!result->d_op->d_revalidate2(result, flags, it) &&
++                  !d_invalidate(result)) {
++                      dput(result);
++                      result = ERR_PTR(-ENOENT);
++              }
+       }
+       return result;
+ }
+@@ -402,6 +427,7 @@ static inline int do_follow_link(struct 
+       current->link_count--;
+       return err;
+ loop:
++      intent_release(dentry, &nd->it);
+       path_release(nd);
+       return err;
+ }
+@@ -447,15 +473,26 @@ static int follow_mount(struct vfsmount 
+       return res;
+ }
+-static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry)
++static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry,
++                              struct lookup_intent *it)
+ {
+       struct vfsmount *mounted;
+       spin_lock(&dcache_lock);
+       mounted = lookup_mnt(*mnt, *dentry);
+       if (mounted) {
++              int opc = 0, mode = 0;
+               *mnt = mntget(mounted);
+               spin_unlock(&dcache_lock);
++              if (it) {
++                      opc = it->it_op;
++                      mode = it->it_mode;
++              }
++              intent_release(*dentry, it);
++              if (it) {
++                      it->it_op = opc;
++                      it->it_mode = mode;
++              }
+               dput(*dentry);
+               mntput(mounted->mnt_parent);
+               *dentry = dget(mounted->mnt_root);
+@@ -467,7 +504,7 @@ static inline int __follow_down(struct v
+ int follow_down(struct vfsmount **mnt, struct dentry **dentry)
+ {
+-      return __follow_down(mnt,dentry);
++      return __follow_down(mnt,dentry,NULL);
+ }
+  
+ static inline void follow_dotdot(struct vfsmount **mnt, struct dentry **dentry)
+@@ -531,7 +568,7 @@ done:
+       return 0;
+ need_lookup:
+-      dentry = real_lookup(nd->dentry, name, LOOKUP_CONTINUE);
++      dentry = real_lookup(nd->dentry, name, LOOKUP_CONTINUE, &nd->it);
+       if (IS_ERR(dentry))
+               goto fail;
+       goto done;
+@@ -665,7 +702,7 @@ int link_path_walk(const char * name, st
+                       nd->dentry = next.dentry;
+               }
+               err = -ENOTDIR; 
+-              if (!inode->i_op->lookup)
++              if (!inode->i_op->lookup && !inode->i_op->lookup2)
+                       break;
+               continue;
+               /* here ends the main loop */
+@@ -716,7 +753,8 @@ last_component:
+                       break;
+               if (lookup_flags & LOOKUP_DIRECTORY) {
+                       err = -ENOTDIR; 
+-                      if (!inode->i_op || !inode->i_op->lookup)
++                      if (!inode->i_op || 
++                            (!inode->i_op->lookup && !inode->i_op->lookup2))
+                               break;
+               }
+               goto return_base;
+@@ -735,6 +773,7 @@ out_dput:
+               dput(next.dentry);
+               break;
+       }
++      intent_release(nd->dentry, &nd->it);
+       path_release(nd);
+ return_err:
+       return err;
+@@ -857,7 +896,8 @@ int path_lookup(const char *name, unsign
+  * needs parent already locked. Doesn't follow mounts.
+  * SMP-safe.
+  */
+-struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
++struct dentry * lookup_hash(struct qstr *name, struct dentry * base, 
++                            struct lookup_intent *it)
+ {
+       struct dentry * dentry;
+       struct inode *inode;
+@@ -880,13 +920,16 @@ struct dentry * lookup_hash(struct qstr 
+                       goto out;
+       }
+-      dentry = cached_lookup(base, name, 0);
++      dentry = cached_lookup(base, name, 0, it);
+       if (!dentry) {
+               struct dentry *new = d_alloc(base, name);
+               dentry = ERR_PTR(-ENOMEM);
+               if (!new)
+                       goto out;
+-              dentry = inode->i_op->lookup(inode, new);
++                if (inode->i_op->lookup2) 
++                        dentry = inode->i_op->lookup2(inode, new, it);
++                else 
++                        dentry = inode->i_op->lookup(inode, new);
+               if (!dentry) {
+                       dentry = new;
+                       security_inode_post_lookup(inode, dentry);
+@@ -898,7 +941,7 @@ out:
+ }
+ /* SMP-safe */
+-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
++struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct lookup_intent *it)
+ {
+       unsigned long hash;
+       struct qstr this;
+@@ -918,11 +961,16 @@ struct dentry * lookup_one_len(const cha
+       }
+       this.hash = end_name_hash(hash);
+-      return lookup_hash(&this, base);
++      return lookup_hash(&this, base, it);
+ access:
+       return ERR_PTR(-EACCES);
+ }
++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
++{
++        return lookup_one_len_it(name, base, len, NULL);
++}
++
+ /*
+  *    namei()
+  *
+@@ -1224,6 +1272,9 @@ int open_namei(const char * pathname, in
+       /*
+        * Create - we need to know the parent.
+        */
++      nd->it.it_mode = mode;
++      nd->it.it_op |= IT_CREAT;
++              
+       error = path_lookup(pathname, LOOKUP_PARENT, nd);
+       if (error)
+               return error;
+@@ -1239,7 +1290,7 @@ int open_namei(const char * pathname, in
+       dir = nd->dentry;
+       down(&dir->d_inode->i_sem);
+-      dentry = lookup_hash(&nd->last, nd->dentry);
++      dentry = lookup_hash(&nd->last, nd->dentry, &nd->it);
+ do_last:
+       error = PTR_ERR(dentry);
+@@ -1247,7 +1298,8 @@ do_last:
+               up(&dir->d_inode->i_sem);
+               goto exit;
+       }
+-
++        
++      nd->it.it_mode = mode;
+       /* Negative dentry, just create the file */
+       if (!dentry->d_inode) {
+               if (!IS_POSIXACL(dir->d_inode))
+@@ -1277,7 +1329,7 @@ do_last:
+               error = -ELOOP;
+               if (flag & O_NOFOLLOW)
+                       goto exit_dput;
+-              while (__follow_down(&nd->mnt,&dentry) && d_mountpoint(dentry));
++              while (__follow_down(&nd->mnt,&dentry,&nd->it) && d_mountpoint(dentry));
+       }
+       error = -ENOENT;
+       if (!dentry->d_inode)
+@@ -1297,8 +1349,10 @@ ok:
+       return 0;
+ exit_dput:
++      intent_release(dentry, &nd->it);
+       dput(dentry);
+ exit:
++      intent_release(nd->dentry, &nd->it);
+       path_release(nd);
+       return error;
+@@ -1320,7 +1374,12 @@ do_link:
+       if (error)
+               goto exit_dput;
+       UPDATE_ATIME(dentry->d_inode);
+-      error = dentry->d_inode->i_op->follow_link(dentry, nd);
++      if (dentry->d_inode->i_op->follow_link2)
++              error = dentry->d_inode->i_op->follow_link2(dentry, nd, &nd->it);
++      else
++              error = dentry->d_inode->i_op->follow_link(dentry, nd);
++        if (error)
++              intent_release(dentry, &nd->it);
+       dput(dentry);
+       if (error)
+               return error;
+@@ -1342,7 +1401,7 @@ do_link:
+       }
+       dir = nd->dentry;
+       down(&dir->d_inode->i_sem);
+-      dentry = lookup_hash(&nd->last, nd->dentry);
++      dentry = lookup_hash(&nd->last, nd->dentry, &nd->it);
+       putname(nd->last.name);
+       goto do_last;
+ }
+@@ -1356,7 +1415,7 @@ static struct dentry *lookup_create(stru
+       dentry = ERR_PTR(-EEXIST);
+       if (nd->last_type != LAST_NORM)
+               goto fail;
+-      dentry = lookup_hash(&nd->last, nd->dentry);
++      dentry = lookup_hash(&nd->last, nd->dentry, &nd->it);
+       if (IS_ERR(dentry))
+               goto fail;
+       if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
+@@ -1588,7 +1647,7 @@ asmlinkage long sys_rmdir(const char * p
+                       goto exit1;
+       }
+       down(&nd.dentry->d_inode->i_sem);
+-      dentry = lookup_hash(&nd.last, nd.dentry);
++      dentry = lookup_hash(&nd.last, nd.dentry, &nd.it);
+       error = PTR_ERR(dentry);
+       if (!IS_ERR(dentry)) {
+               error = vfs_rmdir(nd.dentry->d_inode, dentry);
+@@ -1654,8 +1713,18 @@ asmlinkage long sys_unlink(const char * 
+       error = -EISDIR;
+       if (nd.last_type != LAST_NORM)
+               goto exit1;
++      if (nd.dentry->d_inode->i_op->unlink2) {
++              struct inode_operations *op = nd.dentry->d_inode->i_op;
++              error = op->unlink2(nd.dentry->d_inode,
++                              nd.last.name,
++                              nd.last.len);
++              /* the file system wants to use normal vfs path now */
++              if (error != -EOPNOTSUPP)
++                      goto exit1;
++      }
+       down(&nd.dentry->d_inode->i_sem);
+-      dentry = lookup_hash(&nd.last, nd.dentry);
++//    dentry = lookup_hash(&nd.last, nd.dentry, &nd.it);
++      dentry = lookup_hash(&nd.last, nd.dentry, NULL);
+       error = PTR_ERR(dentry);
+       if (!IS_ERR(dentry)) {
+               /* Why not before? Because we want correct error value */
+@@ -1859,7 +1928,8 @@ exit:
+  *       locking].
+  */
+ int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
+-             struct inode *new_dir, struct dentry *new_dentry)
++             struct inode *new_dir, struct dentry *new_dentry,
++                                struct lookup_intent *it)
+ {
+       int error = 0;
+       struct inode *target;
+@@ -1887,6 +1957,7 @@ int vfs_rename_dir(struct inode *old_dir
+               error = -EBUSY;
+       else 
+               error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
++      intent_release(new_dentry, it);
+       if (target) {
+               if (!error)
+                       target->i_flags |= S_DEAD;
+@@ -1904,7 +1975,8 @@ int vfs_rename_dir(struct inode *old_dir
+ }
+ int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
+-             struct inode *new_dir, struct dentry *new_dentry)
++             struct inode *new_dir, struct dentry *new_dentry,
++               struct lookup_intent *it)
+ {
+       struct inode *target;
+       int error;
+@@ -1921,6 +1993,7 @@ int vfs_rename_other(struct inode *old_d
+               error = -EBUSY;
+       else
+               error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
++        intent_release(new_dentry, it);
+       if (!error) {
+               /* The following d_move() should become unconditional */
+               if (!(old_dir->i_sb->s_type->fs_flags & FS_ODD_RENAME))
+@@ -1934,7 +2007,8 @@ int vfs_rename_other(struct inode *old_d
+ }
+ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+-             struct inode *new_dir, struct dentry *new_dentry)
++             struct inode *new_dir, struct dentry *new_dentry, 
++               struct lookup_intent *it)
+ {
+       int error;
+       int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
+@@ -1960,9 +2034,9 @@ int vfs_rename(struct inode *old_dir, st
+       DQUOT_INIT(new_dir);
+       if (is_dir)
+-              error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
++              error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry, it);
+       else
+-              error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
++              error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry, it);
+       if (!error) {
+               if (old_dir == new_dir)
+                       inode_dir_notify(old_dir, DN_RENAME);
+@@ -2005,7 +2079,7 @@ static inline int do_rename(const char *
+       trap = lock_rename(new_dir, old_dir);
+-      old_dentry = lookup_hash(&oldnd.last, old_dir);
++      old_dentry = lookup_hash(&oldnd.last, old_dir, &oldnd.it);
+       error = PTR_ERR(old_dentry);
+       if (IS_ERR(old_dentry))
+               goto exit3;
+@@ -2025,7 +2099,7 @@ static inline int do_rename(const char *
+       error = -EINVAL;
+       if (old_dentry == trap)
+               goto exit4;
+-      new_dentry = lookup_hash(&newnd.last, new_dir);
++      new_dentry = lookup_hash(&newnd.last, new_dir, &newnd.it);
+       error = PTR_ERR(new_dentry);
+       if (IS_ERR(new_dentry))
+               goto exit4;
+@@ -2035,7 +2109,7 @@ static inline int do_rename(const char *
+               goto exit5;
+       error = vfs_rename(old_dir->d_inode, old_dentry,
+-                                 new_dir->d_inode, new_dentry);
++                                 new_dir->d_inode, new_dentry, NULL);
+ exit5:
+       dput(new_dentry);
+ exit4:
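
[Editorial note, not part of the patch: the sys_unlink hunk above introduces an unlink2() short-circuit where the filesystem may take over the whole operation, with -EOPNOTSUPP meaning "fall back to the ordinary dentry-based VFS path". The sketch below shows that dispatch convention in isolation; it is a self-contained user-space mock-up and the names do_unlink(), generic_unlink() and lustre_like_unlink2() are invented for illustration.]

#include <errno.h>
#include <stdio.h>

struct iops {
        int (*unlink2)(const char *name, int len);
};

static int generic_unlink(const char *name)
{
        printf("normal VFS unlink of %s\n", name);
        return 0;
}

static int do_unlink(struct iops *op, const char *name, int len)
{
        if (op->unlink2) {
                int err = op->unlink2(name, len);
                /* anything but -EOPNOTSUPP means the fs handled it */
                if (err != -EOPNOTSUPP)
                        return err;
        }
        return generic_unlink(name);    /* the normal VFS path */
}

static int lustre_like_unlink2(const char *name, int len)
{
        (void)name;
        (void)len;
        return -EOPNOTSUPP;             /* punt back to the VFS this time */
}

int main(void)
{
        struct iops op = { .unlink2 = lustre_like_unlink2 };

        return do_unlink(&op, "victim", 6);
}

[The same "-EOPNOTSUPP means use the generic path" convention is reused by the setattr_raw() hunk in fs/open.c further down.]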
+--- linux-2.5.63-nointent/fs/nfsd/vfs.c~lustre-2.5.63  Tue Mar 18 15:02:10 2003
++++ linux-2.5.63-nointent-root/fs/nfsd/vfs.c   Tue Mar 18 15:02:10 2003
+@@ -1337,7 +1337,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru
+                       err = nfserr_perm;
+       } else
+ #endif
+-      err = vfs_rename(fdir, odentry, tdir, ndentry);
++      err = vfs_rename(fdir, odentry, tdir, ndentry, NULL);
+       if (!err && EX_ISSYNC(tfhp->fh_export)) {
+               nfsd_sync_dir(tdentry);
+               nfsd_sync_dir(fdentry);
+--- linux-2.5.63-nointent/fs/sysfs/inode.c~lustre-2.5.63       Tue Mar 18 15:02:10 2003
++++ linux-2.5.63-nointent-root/fs/sysfs/inode.c        Tue Mar 18 15:02:10 2003
+@@ -540,7 +540,7 @@ static struct dentry * get_dentry(struct
+       qstr.name = name;
+       qstr.len = strlen(name);
+       qstr.hash = full_name_hash(name,qstr.len);
+-      return lookup_hash(&qstr,parent);
++      return lookup_hash(&qstr,parent,NULL);
+ }
+--- linux-2.5.63-nointent/include/linux/dcache.h~lustre-2.5.63 Tue Mar 18 15:02:10 2003
++++ linux-2.5.63-nointent-root/include/linux/dcache.h  Tue Mar 18 15:02:10 2003
+@@ -12,6 +12,27 @@
+ struct vfsmount;
++#define IT_OPEN     (1)
++#define IT_CREAT    (1<<1)
++#define IT_READDIR  (1<<2)
++#define IT_GETATTR  (1<<3)
++#define IT_LOOKUP   (1<<4)
++#define IT_UNLINK   (1<<5)
++
++
++struct lookup_intent {
++       int it_op;
++       int it_mode;
++       int it_flags;
++       int it_disposition;
++       int it_status;
++       struct iattr *it_iattr;
++       __u64 it_lock_handle[2];
++       int it_lock_mode;
++       void *it_data;
++};
++
++
+ /*
+  * linux/include/linux/dcache.h
+  *
+@@ -34,6 +55,8 @@ struct qstr {
+       char name_str[0];
+ };
++#include <linux/namei.h>
++
+ struct dentry_stat_t {
+       int nr_dentry;
+       int nr_unused;
+@@ -87,6 +110,7 @@ struct dentry {
+       struct list_head d_subdirs;     /* our children */
+       struct list_head d_alias;       /* inode alias list */
+       int d_mounted;
++        struct lookup_intent *d_it;
+       struct qstr d_name;
+       struct qstr * d_qstr;           /* quick str ptr used in lockless lookup and concurrent d_move */
+       unsigned long d_time;           /* used by d_revalidate */
+@@ -107,6 +131,8 @@ struct dentry_operations {
+       int (*d_delete)(struct dentry *);
+       void (*d_release)(struct dentry *);
+       void (*d_iput)(struct dentry *, struct inode *);
++      int (*d_revalidate2)(struct dentry *, int, struct lookup_intent *);
++      void (*d_intent_release)(struct  dentry *, struct lookup_intent *);
+ };
+ /* the dentry parameter passed to d_hash and d_compare is the parent
+@@ -147,6 +173,8 @@ d_iput:            no              no              yes
+ #define DCACHE_REFERENCED     0x0008  /* Recently used, don't discard. */
+ #define DCACHE_UNHASHED               0x0010  
++#define DCACHE_LUSTRE_INVALID     0x0011  /* Lustre invalidated */
++
+ extern spinlock_t dcache_lock;
+ extern rwlock_t dparent_lock;
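
[Editorial note: the dcache.h hunk above adds the lookup_intent structure and the d_revalidate2/d_intent_release dentry hooks. The following stand-alone sketch shows how a caller might fill in an intent before a lookup and release it afterwards. It uses a trimmed copy of the structure; example_intent_release() is an invented helper, not a kernel symbol.]

#include <stdio.h>

/* Trimmed-down copies of the definitions the hunk above adds. */
#define IT_OPEN     (1)
#define IT_CREAT    (1 << 1)
#define IT_GETATTR  (1 << 3)

struct lookup_intent {
        int it_op;
        int it_mode;
        int it_flags;
        int it_disposition;
        int it_status;
        void *it_data;
};

/* A d_intent_release-style hook: drop whatever the lookup pinned. */
static void example_intent_release(struct lookup_intent *it)
{
        if (it->it_op & (IT_OPEN | IT_CREAT))
                printf("releasing lock/handle held for op 0x%x\n", it->it_op);
        it->it_data = NULL;
}

int main(void)
{
        struct lookup_intent it = { .it_op = IT_OPEN | IT_CREAT,
                                    .it_mode = 0644, .it_flags = 0 };

        /* ...a path walk would pass &it down through lookup2()... */
        example_intent_release(&it);
        return 0;
}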
+--- linux-2.5.63-nointent/include/linux/fs.h~lustre-2.5.63     Tue Mar 18 15:02:10 2003
++++ linux-2.5.63-nointent-root/include/linux/fs.h      Tue Mar 18 15:02:10 2003
+@@ -234,6 +234,9 @@ typedef int (get_blocks_t)(struct inode 
+ #define ATTR_ATTR_FLAG        1024
+ #define ATTR_KILL_SUID        2048
+ #define ATTR_KILL_SGID        4096
++#define ATTR_RAW              8192    /* file system, not vfs will massage attrs */
++#define ATTR_FROM_OPEN        16384    /* called from open path, ie O_TRUNC */
++
+ /*
+  * This is the Inode Attributes structure, used for notify_change().  It
+@@ -642,7 +645,7 @@ extern int vfs_symlink(struct inode *, s
+ extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
+ extern int vfs_rmdir(struct inode *, struct dentry *);
+ extern int vfs_unlink(struct inode *, struct dentry *);
+-extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
++extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct lookup_intent *it);
+ /*
+  * File types
+@@ -728,19 +731,33 @@ struct file_operations {
+ struct inode_operations {
+       int (*create) (struct inode *,struct dentry *,int);
+       struct dentry * (*lookup) (struct inode *,struct dentry *);
++      struct dentry * (*lookup2) (struct inode *,struct dentry *, 
++                                    struct lookup_intent *);
+       int (*link) (struct dentry *,struct inode *,struct dentry *);
++      int (*link2) (struct inode *,struct inode *, const char *, int);
+       int (*unlink) (struct inode *,struct dentry *);
++      int (*unlink2) (struct inode *, const char *, int);
+       int (*symlink) (struct inode *,struct dentry *,const char *);
++      int (*symlink2) (struct inode *, const char *, int, const char *);
+       int (*mkdir) (struct inode *,struct dentry *,int);
++      int (*mkdir2) (struct inode *, const char *, int,int);
+       int (*rmdir) (struct inode *,struct dentry *);
++      int (*rmdir2) (struct inode *, const char *, int);
+       int (*mknod) (struct inode *,struct dentry *,int,dev_t);
++      int (*mknod2) (struct inode *, const char *, int,int,int);
+       int (*rename) (struct inode *, struct dentry *,
+                       struct inode *, struct dentry *);
++      int (*rename2) (struct inode *, struct inode *,
++                      const char *oldname, int oldlen,
++                      const char *newname, int newlen);
+       int (*readlink) (struct dentry *, char *,int);
+       int (*follow_link) (struct dentry *, struct nameidata *);
++      int (*follow_link2) (struct dentry *, struct nameidata *,
++                              struct lookup_intent *it);
+       void (*truncate) (struct inode *);
+       int (*permission) (struct inode *, int);
+       int (*setattr) (struct dentry *, struct iattr *);
++      int (*setattr_raw) (struct inode *, struct iattr *);
+       int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
+       int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
+       ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t,int);
+@@ -953,6 +970,7 @@ extern int register_filesystem(struct fi
+ extern int unregister_filesystem(struct file_system_type *);
+ extern struct vfsmount *kern_mount(struct file_system_type *);
+ extern int may_umount(struct vfsmount *);
++struct vfsmount *do_kern_mount(const char *type, int flags, char *name, void *data);
+ extern long do_mount(char *, char *, char *, unsigned long, void *);
+ extern int vfs_statfs(struct super_block *, struct statfs *);
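
[Editorial note: the inode_operations hunk above adds intent-aware *2 methods (lookup2, unlink2, follow_link2, ...) alongside the classic ones. The callers patched earlier in fs/namei.c all follow the same rule: use the intent-aware method when the filesystem provides it, otherwise fall back to the classic one. A minimal sketch of that rule, with simplified stand-in types and names:]

#include <stdio.h>

struct intent { int op; };

struct iops {
        const char *(*lookup)(const char *name);                      /* classic      */
        const char *(*lookup2)(const char *name, struct intent *it);  /* intent-aware */
};

static const char *do_lookup(struct iops *op, const char *name, struct intent *it)
{
        if (op->lookup2)
                return op->lookup2(name, it);   /* e.g. an intent-aware fs */
        return op->lookup(name);                /* e.g. ext3               */
}

static const char *plain_lookup(const char *name)
{
        return name;            /* pretend the dcache had it */
}

int main(void)
{
        struct iops classic = { .lookup = plain_lookup };
        struct intent it = { .op = 1 << 3 };    /* IT_GETATTR */

        printf("found %s\n", do_lookup(&classic, "etc", &it));
        return 0;
}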
+--- linux-2.5.63-nointent/include/linux/namei.h~lustre-2.5.63  Tue Mar 18 15:02:10 2003
++++ linux-2.5.63-nointent-root/include/linux/namei.h   Tue Mar 18 15:02:10 2003
+@@ -11,6 +11,7 @@ struct nameidata {
+       struct qstr     last;
+       unsigned int    flags;
+       int             last_type;
++   struct lookup_intent it;
+ };
+ /*
+@@ -44,7 +45,7 @@ extern int FASTCALL(link_path_walk(const
+ extern void path_release(struct nameidata *);
+ extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
+-extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
++extern struct dentry * lookup_hash(struct qstr *, struct dentry *, struct lookup_intent *);
+ extern int follow_down(struct vfsmount **, struct dentry **);
+ extern int follow_up(struct vfsmount **, struct dentry **);
+--- linux-2.5.63-nointent/include/linux/slab.h~lustre-2.5.63   Tue Mar 18 15:02:10 2003
++++ linux-2.5.63-nointent-root/include/linux/slab.h    Tue Mar 18 15:02:10 2003
+@@ -55,6 +55,7 @@ extern int kmem_cache_destroy(kmem_cache
+ extern int kmem_cache_shrink(kmem_cache_t *);
+ extern void *kmem_cache_alloc(kmem_cache_t *, int);
+ extern void kmem_cache_free(kmem_cache_t *, void *);
++extern int kmem_cache_validate(kmem_cache_t *cachep, void *objp);
+ extern unsigned int kmem_cache_size(kmem_cache_t *);
+ extern void *kmalloc(size_t, int);
+--- linux-2.5.63-nointent/kernel/ksyms.c~lustre-2.5.63 Tue Mar 18 15:02:10 2003
++++ linux-2.5.63-nointent-root/kernel/ksyms.c  Tue Mar 18 15:02:10 2003
+@@ -377,6 +377,7 @@ EXPORT_SYMBOL(unregister_filesystem);
+ EXPORT_SYMBOL(kern_mount);
+ EXPORT_SYMBOL(__mntput);
+ EXPORT_SYMBOL(may_umount);
++EXPORT_SYMBOL(reparent_to_init);
+ /* executable format registration */
+ EXPORT_SYMBOL(register_binfmt);
+@@ -407,6 +408,12 @@ EXPORT_SYMBOL(request_irq);
+ EXPORT_SYMBOL(free_irq);
+ EXPORT_SYMBOL(irq_stat);
++/* lustre */
++EXPORT_SYMBOL(do_kern_mount);
++EXPORT_SYMBOL(exit_files);
++EXPORT_SYMBOL(kmem_cache_validate);
++
++
+ /* waitqueue handling */
+ EXPORT_SYMBOL(add_wait_queue);
+ EXPORT_SYMBOL(add_wait_queue_exclusive);
+--- linux-2.5.63-nointent/mm/slab.c~lustre-2.5.63      Tue Mar 18 15:02:10 2003
++++ linux-2.5.63-nointent-root/mm/slab.c       Tue Mar 18 15:02:10 2003
+@@ -1792,6 +1792,11 @@ static inline void __cache_free (kmem_ca
+       }
+ }
++int kmem_cache_validate(kmem_cache_t *cachep, void *objp)
++{
++      return 1;
++}
++
+ /**
+  * kmem_cache_alloc - Allocate an object
+  * @cachep: The cache to allocate from.
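
[Editorial note: as the mm/slab.c hunk below in this patch shows, the exported kmem_cache_validate() is a stub that always returns 1 on this kernel, so callers can treat it only as an advisory check. A tiny stand-alone mock-up of that contract; kmem_cache_t is faked here.]

#include <assert.h>
#include <stdio.h>

typedef struct { const char *name; } kmem_cache_t;

static int kmem_cache_validate(kmem_cache_t *cachep, void *objp)
{
        (void)cachep;
        (void)objp;
        return 1;               /* mirrors the stub added in mm/slab.c */
}

int main(void)
{
        kmem_cache_t cache = { "inode_cache" };
        int obj = 42;

        assert(kmem_cache_validate(&cache, &obj));
        printf("object accepted by %s\n", cache.name);
        return 0;
}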
+--- linux-2.5.63-nointent/net/unix/af_unix.c~lustre-2.5.63     Tue Mar 18 15:02:10 2003
++++ linux-2.5.63-nointent-root/net/unix/af_unix.c      Tue Mar 18 15:02:10 2003
+@@ -720,7 +720,7 @@ static int unix_bind(struct socket *sock
+               /*
+                * Do the final lookup.
+                */
+-              dentry = lookup_hash(&nd.last, nd.dentry);
++              dentry = lookup_hash(&nd.last, nd.dentry, NULL);
+               err = PTR_ERR(dentry);
+               if (IS_ERR(dentry))
+                       goto out_mknod_unlock;
+--- linux-2.5.63-nointent/fs/dcache.c~lustre-2.5.63    Tue Mar 18 15:02:10 2003
++++ linux-2.5.63-nointent-root/fs/dcache.c     Tue Mar 18 15:02:10 2003
+@@ -1111,15 +1111,21 @@ void d_delete(struct dentry * dentry)
+  * Adds a dentry to the hash according to its name.
+  */
+  
+-void d_rehash(struct dentry * entry)
++void __d_rehash(struct dentry * entry, int lock)
+ {
+       struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash);
+-      spin_lock(&dcache_lock);
++      if (lock) spin_lock(&dcache_lock);
+       if (!list_empty(&entry->d_hash) && !d_unhashed(entry)) BUG();
+       entry->d_vfs_flags &= ~DCACHE_UNHASHED;
+       entry->d_bucket = list;
+       list_add_rcu(&entry->d_hash, list);
+-      spin_unlock(&dcache_lock);
++      if (lock) spin_unlock(&dcache_lock);
++}
++EXPORT_SYMBOL(__d_rehash);
++
++void d_rehash(struct dentry * entry)
++{
++      __d_rehash(entry, 1);
+ }
+ #define do_switch(x,y) do { \
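
[Editorial note: the fs/dcache.c hunk above splits d_rehash() into __d_rehash(entry, lock) plus a locking wrapper, so code that already holds dcache_lock can rehash without deadlocking. The pattern in miniature, as a user-space pthread mock-up with invented names (compile with -lpthread):]

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static int hashed;

static void __rehash(int lock)
{
        if (lock)
                pthread_mutex_lock(&cache_lock);
        hashed = 1;                     /* the real work */
        if (lock)
                pthread_mutex_unlock(&cache_lock);
}

static void rehash(void)                /* classic entry point */
{
        __rehash(1);
}

int main(void)
{
        rehash();                       /* takes the lock itself */

        pthread_mutex_lock(&cache_lock);
        __rehash(0);                    /* caller already holds it */
        pthread_mutex_unlock(&cache_lock);

        printf("hashed=%d\n", hashed);
        return 0;
}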
+--- linux-2.5.63-nointent/fs/namespace.c~lustre-2.5.63 Tue Mar 18 15:02:10 2003
++++ linux-2.5.63-nointent-root/fs/namespace.c  Tue Mar 18 15:02:10 2003
+@@ -925,6 +925,7 @@ void set_fs_pwd(struct fs_struct *fs, st
+               mntput(old_pwdmnt);
+       }
+ }
++EXPORT_SYMBOL(set_fs_pwd);
+ static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd)
+ {
+--- linux-2.5.63-nointent/fs/open.c~lustre-2.5.63      Thu Mar 20 12:43:39 2003
++++ linux-2.5.63-nointent-root/fs/open.c       Mon Mar 24 16:25:47 2003
+@@ -97,7 +97,8 @@ static inline long do_sys_truncate(const
+       struct nameidata nd;
+       struct inode * inode;
+       int error;
+-
++        struct lookup_intent it = { .it_op = IT_GETATTR };
++      nd.it=it;
+       error = -EINVAL;
+       if (length < 0) /* sorry, but loff_t says... */
+               goto out;
+@@ -142,11 +143,13 @@ static inline long do_sys_truncate(const
+       error = locks_verify_truncate(inode, NULL, length);
+       if (!error) {
+               DQUOT_INIT(inode);
++              intent_release(nd.dentry, &nd.it);
+               error = do_truncate(nd.dentry, length);
+       }
+       put_write_access(inode);
+ dput_and_out:
++      intent_release(nd.dentry, &nd.it);
+       path_release(&nd);
+ out:
+       return error;
+@@ -340,6 +343,8 @@ asmlinkage long sys_access(const char * 
+       int old_fsuid, old_fsgid;
+       kernel_cap_t old_cap;
+       int res;
++      struct lookup_intent it = { .it_op = IT_GETATTR };
++      nd.it=it;       
+       if (mode & ~S_IRWXO)    /* where's F_OK, X_OK, W_OK, R_OK? */
+               return -EINVAL;
+@@ -371,6 +376,8 @@ asmlinkage long sys_access(const char * 
+               if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode)
+                  && !special_file(nd.dentry->d_inode->i_mode))
+                       res = -EROFS;
++                              
++              intent_release(nd.dentry, &nd.it);
+               path_release(&nd);
+       }
+@@ -385,6 +392,8 @@ asmlinkage long sys_chdir(const char * f
+ {
+       struct nameidata nd;
+       int error;
++      struct lookup_intent it = { .it_op = IT_GETATTR };
++      nd.it=it;       
+       error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
+       if (error)
+@@ -397,6 +406,7 @@ asmlinkage long sys_chdir(const char * f
+       set_fs_pwd(current->fs, nd.mnt, nd.dentry);
+ dput_and_out:
++      intent_release(nd.dentry, &nd.it);
+       path_release(&nd);
+ out:
+       return error;
+@@ -436,6 +446,8 @@ asmlinkage long sys_chroot(const char * 
+ {
+       struct nameidata nd;
+       int error;
++        struct lookup_intent it = { .it_op = IT_GETATTR };
++      nd.it=it;
+       error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
+       if (error)
+@@ -508,6 +520,18 @@ asmlinkage long sys_chmod(const char * f
+       error = -EROFS;
+       if (IS_RDONLY(inode))
+               goto dput_and_out;
++      
++      if (inode->i_op->setattr_raw) {
++              struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++              newattrs.ia_mode = mode;
++              newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
++              newattrs.ia_valid |= ATTR_RAW;
++              error = op->setattr_raw(inode, &newattrs);
++              /* the file system wants to use normal vfs path now */
++              if (error != -EOPNOTSUPP)
++                      goto dput_and_out;
++      }
+       error = -EPERM;
+       if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+@@ -619,7 +643,10 @@ asmlinkage long sys_fchown(unsigned int 
+ struct file *filp_open(const char * filename, int flags, int mode)
+ {
+       int namei_flags, error;
++      struct file * temp_filp;
+       struct nameidata nd;
++      struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = flags };
++      nd.it=it;       
+       namei_flags = flags;
+       if ((namei_flags+1) & O_ACCMODE)
+@@ -628,9 +655,11 @@ struct file *filp_open(const char * file
+               namei_flags |= 2;
+       error = open_namei(filename, namei_flags, mode, &nd);
+-      if (!error)
+-              return dentry_open(nd.dentry, nd.mnt, flags);
+-
++      if (!error) {
++              temp_filp = dentry_open(nd.dentry, nd.mnt, flags);
++              intent_release(nd.dentry,&nd.it);
++              return temp_filp;
++      }       
+       return ERR_PTR(error);
+ }
+@@ -675,7 +704,7 @@ struct file *dentry_open(struct dentry *
+                               goto cleanup_all;
+               }
+       }
+-
++        
+       return f;
+ cleanup_all:
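
[Editorial note: the fs/open.c hunks above follow one convention: seed nd.it before the path walk (IT_OPEN for filp_open, IT_GETATTR for truncate/access/chdir/chroot) and pair every exit, success or error, with an intent_release() before path_release(). A compressed sketch of that shape; everything here is a local stand-in, and intent_release() below is a stub, not the kernel symbol.]

#include <stdio.h>

struct intent { int op; int released; };
struct nameidata { struct intent it; };

static void intent_release(struct intent *it) { it->released = 1; }

static int open_with_intent(struct nameidata *nd, int fail)
{
        nd->it.op = 1;                  /* IT_OPEN in the real patch */

        if (fail) {                     /* error path */
                intent_release(&nd->it);
                return -1;
        }

        /* success path: open the file, then drop the intent */
        intent_release(&nd->it);
        return 0;
}

int main(void)
{
        struct nameidata nd = { { 0, 0 } };

        open_with_intent(&nd, 0);
        printf("intent released: %d\n", nd.it.released);
        return 0;
}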
+--- linux-2.5.63-nointent/fs/stat.c~lustre-2.5.63      Fri Mar 21 21:15:40 2003
++++ linux-2.5.63-nointent-root/fs/stat.c       Fri Mar 21 21:16:53 2003
+@@ -65,6 +65,7 @@ int vfs_stat(char *name, struct kstat *s
+       error = user_path_walk(name, &nd);
+       if (!error) {
+               error = vfs_getattr(nd.mnt, nd.dentry, stat);
++              intent_release(nd.dentry, &nd.it);
+               path_release(&nd);
+       }
+       return error;
+@@ -80,6 +81,7 @@ int vfs_lstat(char *name, struct kstat *
+       error = user_path_walk_link(name, &nd);
+       if (!error) {
+               error = vfs_getattr(nd.mnt, nd.dentry, stat);
++              intent_release(nd.dentry, &nd.it);
+               path_release(&nd);
+       }
+       return error;
+
+_
diff --git a/lustre/kernel_patches/patches/lustre-2.5.patch b/lustre/kernel_patches/patches/lustre-2.5.patch
deleted file mode 100644 (file)
index 71d372f..0000000
+++ /dev/null
@@ -1,507 +0,0 @@
- arch/um/kernel/mem.c   |   18 +++++++++++-
- fs/namei.c             |   71 +++++++++++++++++++++++++++++++++++--------------
- fs/nfsd/vfs.c          |    2 -
- fs/sysfs/inode.c       |    2 -
- include/linux/dcache.h |   27 ++++++++++++++++++
- include/linux/fs.h     |   20 +++++++++++++
- include/linux/namei.h  |    3 +-
- include/linux/slab.h   |    1 
- kernel/ksyms.c         |    7 ++++
- mm/slab.c              |    5 +++
- net/unix/af_unix.c     |    2 -
- 11 files changed, 132 insertions(+), 26 deletions(-)
-
---- linux-2.5.59/arch/um/kernel/mem.c~lustre-2.5       2003-02-22 21:56:58.000000000 +0800
-+++ linux-2.5.59-root/arch/um/kernel/mem.c     2003-02-22 21:56:58.000000000 +0800
-@@ -639,6 +639,22 @@ struct page *pte_mem_map(pte_t pte)
-       return(phys_mem_map(pte_val(pte)));
- }
-+struct page *check_get_page(unsigned long kaddr)
-+{
-+        struct page *page;
-+        struct mem_region *mr;
-+        unsigned long phys = __pa(kaddr);
-+      unsigned int n = phys_region_index(phys);
-+
-+      if(regions[n] == NULL) 
-+                return NULL; 
-+
-+        mr = regions[n];
-+        page = (struct page *) mr->mem_map;
-+      return page + ((phys_addr(phys)) >> PAGE_SHIFT);
-+}
-+
-+
- struct mem_region *page_region(struct page *page, int *index_out)
- {
-       int i;
-@@ -726,7 +742,7 @@ extern unsigned long region_pa(void *vir
-                  (addr <= region->start + region->len))
-                       return(mk_phys(addr - region->start, i));
-       }
--      panic("region_pa : no region for virtual address");
-+      //panic("region_pa : no region for virtual address");
-       return(0);
- }
---- linux-2.5.59/fs/namei.c~lustre-2.5 2003-02-22 21:56:58.000000000 +0800
-+++ linux-2.5.59-root/fs/namei.c       2003-02-22 21:56:58.000000000 +0800
-@@ -265,6 +265,9 @@ int deny_write_access(struct file * file
- void path_release(struct nameidata *nd)
- {
-+        if (nd->dentry && nd->dentry->d_op && 
-+            nd->dentry->d_op->d_intent_release)
-+                nd->dentry->d_op->d_intent_release(nd->dentry, &nd->it);
-       dput(nd->dentry);
-       mntput(nd->mnt);
- }
-@@ -273,10 +276,18 @@ void path_release(struct nameidata *nd)
-  * Internal lookup() using the new generic dcache.
-  * SMP-safe
-  */
--static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags)
-+static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags, struct lookup_intent *it)
- {
-       struct dentry * dentry = d_lookup(parent, name);
-       
-+      if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) {
-+              if (!dentry->d_op->d_revalidate2(dentry, flags, it) &&
-+                  !d_invalidate(dentry)) {
-+                      dput(dentry);
-+                      dentry = NULL;
-+              }
-+              return dentry;
-+      } else
-       if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
-               if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) {
-                       dput(dentry);
-@@ -351,7 +362,7 @@ ok:
-  * make sure that nobody added the entry to the dcache in the meantime..
-  * SMP-safe
-  */
--static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags)
-+static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags, struct lookup_intent *it)
- {
-       struct dentry * result;
-       struct inode *dir = parent->d_inode;
-@@ -369,7 +380,10 @@ static struct dentry * real_lookup(struc
-               struct dentry * dentry = d_alloc(parent, name);
-               result = ERR_PTR(-ENOMEM);
-               if (dentry) {
--                      result = dir->i_op->lookup(dir, dentry);
-+                      if (dir->i_op->lookup2)
-+                              result = dir->i_op->lookup2(dir, dentry, it);
-+                      else
-+                                result = dir->i_op->lookup(dir, dentry);
-                       if (result)
-                               dput(dentry);
-                       else {
-@@ -391,6 +405,12 @@ static struct dentry * real_lookup(struc
-                       dput(result);
-                       result = ERR_PTR(-ENOENT);
-               }
-+      } else if (result->d_op && result->d_op->d_revalidate2) {
-+              if (!result->d_op->d_revalidate2(result, flags, it) &&
-+                  !d_invalidate(result)) {
-+                      dput(result);
-+                      result = ERR_PTR(-ENOENT);
-+              }
-       }
-       return result;
- }
-@@ -534,7 +554,7 @@ dcache_miss:
-       unlock_nd(nd);
- need_lookup:
--      dentry = real_lookup(nd->dentry, name, LOOKUP_CONTINUE);
-+      dentry = real_lookup(nd->dentry, name, LOOKUP_CONTINUE, &nd->it);
-       if (IS_ERR(dentry))
-               goto fail;
-       mntget(mnt);
-@@ -684,7 +704,7 @@ int link_path_walk(const char * name, st
-                       nd->dentry = next.dentry;
-               }
-               err = -ENOTDIR; 
--              if (!inode->i_op->lookup)
-+              if (!inode->i_op->lookup && !inode->i_op->lookup2)
-                       break;
-               continue;
-               /* here ends the main loop */
-@@ -737,7 +757,8 @@ last_component:
-                       break;
-               if (lookup_flags & LOOKUP_DIRECTORY) {
-                       err = -ENOTDIR; 
--                      if (!inode->i_op || !inode->i_op->lookup)
-+                      if (!inode->i_op || 
-+                            (!inode->i_op->lookup && !inode->i_op->lookup2))
-                               break;
-               }
-               goto return_base;
-@@ -886,7 +907,8 @@ int path_lookup(const char *name, unsign
-  * needs parent already locked. Doesn't follow mounts.
-  * SMP-safe.
-  */
--struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
-+struct dentry * lookup_hash(struct qstr *name, struct dentry * base, 
-+                            struct lookup_intent *it)
- {
-       struct dentry * dentry;
-       struct inode *inode;
-@@ -909,13 +931,16 @@ struct dentry * lookup_hash(struct qstr 
-                       goto out;
-       }
--      dentry = cached_lookup(base, name, 0);
-+      dentry = cached_lookup(base, name, 0, it);
-       if (!dentry) {
-               struct dentry *new = d_alloc(base, name);
-               dentry = ERR_PTR(-ENOMEM);
-               if (!new)
-                       goto out;
--              dentry = inode->i_op->lookup(inode, new);
-+                if (inode->i_op->lookup2) 
-+                        dentry = inode->i_op->lookup2(inode, new, it);
-+                else 
-+                        dentry = inode->i_op->lookup(inode, new);
-               if (!dentry) {
-                       dentry = new;
-                       security_inode_post_lookup(inode, dentry);
-@@ -927,7 +952,7 @@ out:
- }
- /* SMP-safe */
--struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
-+struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct lookup_intent *it)
- {
-       unsigned long hash;
-       struct qstr this;
-@@ -947,11 +972,16 @@ struct dentry * lookup_one_len(const cha
-       }
-       this.hash = end_name_hash(hash);
--      return lookup_hash(&this, base);
-+      return lookup_hash(&this, base, it);
- access:
-       return ERR_PTR(-EACCES);
- }
-+struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
-+{
-+        return lookup_one_len_it(name, base, len, NULL);
-+}
-+
- /*
-  *    namei()
-  *
-@@ -1268,7 +1298,7 @@ int open_namei(const char * pathname, in
-       dir = nd->dentry;
-       down(&dir->d_inode->i_sem);
--      dentry = lookup_hash(&nd->last, nd->dentry);
-+      dentry = lookup_hash(&nd->last, nd->dentry, &nd->it);
- do_last:
-       error = PTR_ERR(dentry);
-@@ -1371,7 +1401,7 @@ do_link:
-       }
-       dir = nd->dentry;
-       down(&dir->d_inode->i_sem);
--      dentry = lookup_hash(&nd->last, nd->dentry);
-+      dentry = lookup_hash(&nd->last, nd->dentry, &nd->it);
-       putname(nd->last.name);
-       goto do_last;
- }
-@@ -1385,7 +1415,7 @@ static struct dentry *lookup_create(stru
-       dentry = ERR_PTR(-EEXIST);
-       if (nd->last_type != LAST_NORM)
-               goto fail;
--      dentry = lookup_hash(&nd->last, nd->dentry);
-+      dentry = lookup_hash(&nd->last, nd->dentry, &nd->it);
-       if (IS_ERR(dentry))
-               goto fail;
-       if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
-@@ -1617,7 +1647,7 @@ asmlinkage long sys_rmdir(const char * p
-                       goto exit1;
-       }
-       down(&nd.dentry->d_inode->i_sem);
--      dentry = lookup_hash(&nd.last, nd.dentry);
-+      dentry = lookup_hash(&nd.last, nd.dentry, &nd.it);
-       error = PTR_ERR(dentry);
-       if (!IS_ERR(dentry)) {
-               error = vfs_rmdir(nd.dentry->d_inode, dentry);
-@@ -1677,7 +1707,7 @@ asmlinkage long sys_unlink(const char * 
-       if (nd.last_type != LAST_NORM)
-               goto exit1;
-       down(&nd.dentry->d_inode->i_sem);
--      dentry = lookup_hash(&nd.last, nd.dentry);
-+      dentry = lookup_hash(&nd.last, nd.dentry, &nd.it);
-       error = PTR_ERR(dentry);
-       if (!IS_ERR(dentry)) {
-               /* Why not before? Because we want correct error value */
-@@ -1951,7 +1981,8 @@ int vfs_rename_other(struct inode *old_d
- }
- int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
--             struct inode *new_dir, struct dentry *new_dentry)
-+             struct inode *new_dir, struct dentry *new_dentry, 
-+               struct lookup_intent *it)
- {
-       int error;
-       int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
-@@ -2022,7 +2053,7 @@ static inline int do_rename(const char *
-       trap = lock_rename(new_dir, old_dir);
--      old_dentry = lookup_hash(&oldnd.last, old_dir);
-+      old_dentry = lookup_hash(&oldnd.last, old_dir, &oldnd.it);
-       error = PTR_ERR(old_dentry);
-       if (IS_ERR(old_dentry))
-               goto exit3;
-@@ -2042,7 +2073,7 @@ static inline int do_rename(const char *
-       error = -EINVAL;
-       if (old_dentry == trap)
-               goto exit4;
--      new_dentry = lookup_hash(&newnd.last, new_dir);
-+      new_dentry = lookup_hash(&newnd.last, new_dir, &newnd.it);
-       error = PTR_ERR(new_dentry);
-       if (IS_ERR(new_dentry))
-               goto exit4;
-@@ -2052,7 +2083,7 @@ static inline int do_rename(const char *
-               goto exit5;
-       error = vfs_rename(old_dir->d_inode, old_dentry,
--                                 new_dir->d_inode, new_dentry);
-+                                 new_dir->d_inode, new_dentry, NULL);
- exit5:
-       dput(new_dentry);
- exit4:
---- linux-2.5.59/fs/nfsd/vfs.c~lustre-2.5      2003-02-22 21:56:58.000000000 +0800
-+++ linux-2.5.59-root/fs/nfsd/vfs.c    2003-02-22 21:56:58.000000000 +0800
-@@ -1337,7 +1337,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru
-                       err = nfserr_perm;
-       } else
- #endif
--      err = vfs_rename(fdir, odentry, tdir, ndentry);
-+      err = vfs_rename(fdir, odentry, tdir, ndentry, NULL);
-       if (!err && EX_ISSYNC(tfhp->fh_export)) {
-               nfsd_sync_dir(tdentry);
-               nfsd_sync_dir(fdentry);
---- linux-2.5.59/fs/sysfs/inode.c~lustre-2.5   2003-02-22 21:56:58.000000000 +0800
-+++ linux-2.5.59-root/fs/sysfs/inode.c 2003-02-22 21:56:58.000000000 +0800
-@@ -539,7 +539,7 @@ static struct dentry * get_dentry(struct
-       qstr.name = name;
-       qstr.len = strlen(name);
-       qstr.hash = full_name_hash(name,qstr.len);
--      return lookup_hash(&qstr,parent);
-+      return lookup_hash(&qstr,parent,NULL);
- }
---- linux-2.5.59/include/linux/dcache.h~lustre-2.5     2003-02-22 21:56:58.000000000 +0800
-+++ linux-2.5.59-root/include/linux/dcache.h   2003-02-22 22:02:55.000000000 +0800
-@@ -11,6 +11,27 @@
- struct vfsmount;
-+#define IT_OPEN     (1)
-+#define IT_CREAT    (1<<1)
-+#define IT_READDIR  (1<<2)
-+#define IT_GETATTR  (1<<3)
-+#define IT_LOOKUP   (1<<4)
-+#define IT_UNLINK   (1<<5)
-+
-+
-+struct lookup_intent {
-+       int it_op;
-+       int it_mode;
-+       int it_flags;
-+       int it_disposition;
-+       int it_status;
-+       struct iattr *it_iattr;
-+       __u64 it_lock_handle[2];
-+       int it_lock_mode;
-+       void *it_data;
-+};
-+
-+
- /*
-  * linux/include/linux/dcache.h
-  *
-@@ -32,6 +53,8 @@ struct qstr {
-       unsigned int hash;
- };
-+#include <linux/namei.h>
-+
- struct dentry_stat_t {
-       int nr_dentry;
-       int nr_unused;
-@@ -81,6 +104,7 @@ struct dentry {
-       struct list_head d_subdirs;     /* our children */
-       struct list_head d_alias;       /* inode alias list */
-       int d_mounted;
-+        struct lookup_intent *d_it;
-       struct qstr d_name;
-       unsigned long d_time;           /* used by d_revalidate */
-       struct dentry_operations  *d_op;
-@@ -100,6 +124,8 @@ struct dentry_operations {
-       int (*d_delete)(struct dentry *);
-       void (*d_release)(struct dentry *);
-       void (*d_iput)(struct dentry *, struct inode *);
-+      int (*d_revalidate2)(struct dentry *, int, struct lookup_intent *);
-+      void (*d_intent_release)(struct  dentry *, struct lookup_intent *);
- };
- /* the dentry parameter passed to d_hash and d_compare is the parent
-@@ -139,6 +165,7 @@ d_iput:            no              no              yes
-       */
- #define DCACHE_REFERENCED     0x0008  /* Recently used, don't discard. */
-+#define DCACHE_LUSTRE_INVALID         0x0010  /* Lustre invalidated */
- extern spinlock_t dcache_lock;
- extern rwlock_t dparent_lock;
---- linux-2.5.59/include/linux/fs.h~lustre-2.5 2003-02-22 21:56:58.000000000 +0800
-+++ linux-2.5.59-root/include/linux/fs.h       2003-02-22 22:52:58.000000000 +0800
-@@ -234,6 +234,9 @@ typedef int (get_blocks_t)(struct inode 
- #define ATTR_ATTR_FLAG        1024
- #define ATTR_KILL_SUID        2048
- #define ATTR_KILL_SGID        4096
-+#define ATTR_RAW              8192    /* file system, not vfs will massage attrs */
-+#define ATTR_FROM_OPEN        16384    /* called from open path, ie O_TRUNC */
-+
- /*
-  * This is the Inode Attributes structure, used for notify_change().  It
-@@ -676,7 +679,7 @@ extern int vfs_symlink(struct inode *, s
- extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
- extern int vfs_rmdir(struct inode *, struct dentry *);
- extern int vfs_unlink(struct inode *, struct dentry *);
--extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
-+extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct lookup_intent *it);
- /*
-  * File types
-@@ -762,19 +765,33 @@ struct file_operations {
- struct inode_operations {
-       int (*create) (struct inode *,struct dentry *,int);
-       struct dentry * (*lookup) (struct inode *,struct dentry *);
-+      struct dentry * (*lookup2) (struct inode *,struct dentry *, 
-+                                    struct lookup_intent *);
-       int (*link) (struct dentry *,struct inode *,struct dentry *);
-+      int (*link2) (struct inode *,struct inode *, const char *, int);
-       int (*unlink) (struct inode *,struct dentry *);
-+      int (*unlink2) (struct inode *, const char *, int);
-       int (*symlink) (struct inode *,struct dentry *,const char *);
-+      int (*symlink2) (struct inode *, const char *, int, const char *);
-       int (*mkdir) (struct inode *,struct dentry *,int);
-+      int (*mkdir2) (struct inode *, const char *, int,int);
-       int (*rmdir) (struct inode *,struct dentry *);
-+      int (*rmdir2) (struct inode *, const char *, int);
-       int (*mknod) (struct inode *,struct dentry *,int,dev_t);
-+      int (*mknod2) (struct inode *, const char *, int,int,int);
-       int (*rename) (struct inode *, struct dentry *,
-                       struct inode *, struct dentry *);
-+      int (*rename2) (struct inode *, struct inode *,
-+                      const char *oldname, int oldlen,
-+                      const char *newname, int newlen);
-       int (*readlink) (struct dentry *, char *,int);
-       int (*follow_link) (struct dentry *, struct nameidata *);
-+      int (*follow_link2) (struct dentry *, struct nameidata *,
-+                              struct lookup_intent *it);
-       void (*truncate) (struct inode *);
-       int (*permission) (struct inode *, int);
-       int (*setattr) (struct dentry *, struct iattr *);
-+      int (*setattr_raw) (struct inode *, struct iattr *);
-       int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
-       int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
-       ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
-@@ -987,6 +1004,7 @@ extern int register_filesystem(struct fi
- extern int unregister_filesystem(struct file_system_type *);
- extern struct vfsmount *kern_mount(struct file_system_type *);
- extern int may_umount(struct vfsmount *);
-+struct vfsmount *do_kern_mount(const char *type, int flags, char *name, void *data);
- extern long do_mount(char *, char *, char *, unsigned long, void *);
- extern int vfs_statfs(struct super_block *, struct statfs *);
---- linux-2.5.59/include/linux/namei.h~lustre-2.5      2003-02-22 21:56:58.000000000 +0800
-+++ linux-2.5.59-root/include/linux/namei.h    2003-02-22 21:56:58.000000000 +0800
-@@ -13,6 +13,7 @@ struct nameidata {
-       int             last_type;
-       struct dentry   *old_dentry;
-       struct vfsmount *old_mnt;
-+        struct lookup_intent it;
- };
- /*
-@@ -46,7 +47,7 @@ extern int FASTCALL(link_path_walk(const
- extern void path_release(struct nameidata *);
- extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
--extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
-+extern struct dentry * lookup_hash(struct qstr *, struct dentry *, struct lookup_intent *);
- extern int follow_down(struct vfsmount **, struct dentry **);
- extern int follow_up(struct vfsmount **, struct dentry **);
---- linux-2.5.59/include/linux/slab.h~lustre-2.5       2003-02-22 21:56:58.000000000 +0800
-+++ linux-2.5.59-root/include/linux/slab.h     2003-02-22 21:56:58.000000000 +0800
-@@ -56,6 +56,7 @@ extern int kmem_cache_destroy(kmem_cache
- extern int kmem_cache_shrink(kmem_cache_t *);
- extern void *kmem_cache_alloc(kmem_cache_t *, int);
- extern void kmem_cache_free(kmem_cache_t *, void *);
-+extern int kmem_cache_validate(kmem_cache_t *cachep, void *objp);
- extern unsigned int kmem_cache_size(kmem_cache_t *);
- extern void *kmalloc(size_t, int);
---- linux-2.5.59/kernel/ksyms.c~lustre-2.5     2003-02-22 21:56:58.000000000 +0800
-+++ linux-2.5.59-root/kernel/ksyms.c   2003-02-22 21:56:58.000000000 +0800
-@@ -376,6 +376,7 @@ EXPORT_SYMBOL(unregister_filesystem);
- EXPORT_SYMBOL(kern_mount);
- EXPORT_SYMBOL(__mntput);
- EXPORT_SYMBOL(may_umount);
-+EXPORT_SYMBOL(reparent_to_init);
- /* executable format registration */
- EXPORT_SYMBOL(register_binfmt);
-@@ -406,6 +407,12 @@ EXPORT_SYMBOL(request_irq);
- EXPORT_SYMBOL(free_irq);
- EXPORT_SYMBOL(irq_stat);
-+/* lustre */
-+EXPORT_SYMBOL(do_kern_mount);
-+EXPORT_SYMBOL(exit_files);
-+EXPORT_SYMBOL(kmem_cache_validate);
-+
-+
- /* waitqueue handling */
- EXPORT_SYMBOL(add_wait_queue);
- EXPORT_SYMBOL(add_wait_queue_exclusive);
---- linux-2.5.59/mm/slab.c~lustre-2.5  2003-02-22 21:56:58.000000000 +0800
-+++ linux-2.5.59-root/mm/slab.c        2003-02-22 21:56:58.000000000 +0800
-@@ -1793,6 +1793,11 @@ static inline void __cache_free (kmem_ca
-       }
- }
-+int kmem_cache_validate(kmem_cache_t *cachep, void *objp)
-+{
-+      return 1;
-+}
-+
- /**
-  * kmem_cache_alloc - Allocate an object
-  * @cachep: The cache to allocate from.
---- linux-2.5.59/net/unix/af_unix.c~lustre-2.5 2003-02-22 21:56:58.000000000 +0800
-+++ linux-2.5.59-root/net/unix/af_unix.c       2003-02-22 21:56:58.000000000 +0800
-@@ -719,7 +719,7 @@ static int unix_bind(struct socket *sock
-               /*
-                * Do the final lookup.
-                */
--              dentry = lookup_hash(&nd.last, nd.dentry);
-+              dentry = lookup_hash(&nd.last, nd.dentry, NULL);
-               err = PTR_ERR(dentry);
-               if (IS_ERR(dentry))
-                       goto out_mknod_unlock;
-
-_
index d7b6dce..78855ac 100644 (file)
@@ -7,6 +7,6 @@
 --- /dev/null  Fri Aug 30 17:31:37 2002
 +++ linux-2.4.18-18.8.0-l12-braam/include/linux/lustre_version.h       Thu Feb 13 07:58:33 2003
 @@ -0,0 +1 @@
-+#define LUSTRE_KERNEL_VERSION 13
++#define LUSTRE_KERNEL_VERSION 19
 
 _
diff --git a/lustre/kernel_patches/patches/mcore-2.4.20-8.patch b/lustre/kernel_patches/patches/mcore-2.4.20-8.patch
new file mode 100644 (file)
index 0000000..c8b80eb
--- /dev/null
@@ -0,0 +1,2738 @@
+? linux/.config
+? linux/include/linux/autoconf.h
+? linux/include/linux/modules
+Index: linux/Makefile
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/Makefile,v
+retrieving revision 1.3.2.1
+retrieving revision 1.3.2.1.2.1
+diff -u -r1.3.2.1 -r1.3.2.1.2.1
+--- linux/Makefile     12 Mar 2003 19:48:52 -0000      1.3.2.1
++++ linux/Makefile     1 Apr 2003 12:17:40 -0000       1.3.2.1.2.1
+@@ -99,6 +99,10 @@
+ CFLAGS += -fomit-frame-pointer
+ endif
+ AFLAGS := -D__ASSEMBLY__ $(CPPFLAGS)
++ifeq ($(CONFIG_MCL_COREDUMP),y)
++      CFLAGS += -g
++endif
++
+ #
+ # ROOT_DEV specifies the default root-device when making the image.
+Index: linux/Documentation/Configure.help
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/Documentation/Configure.help,v
+retrieving revision 1.3.2.1
+retrieving revision 1.3.2.1.2.1
+diff -u -r1.3.2.1 -r1.3.2.1.2.1
+--- linux/Documentation/Configure.help 12 Mar 2003 19:48:52 -0000      1.3.2.1
++++ linux/Documentation/Configure.help 1 Apr 2003 12:17:40 -0000       1.3.2.1.2.1
+@@ -21660,6 +21660,35 @@
+   This option allows you to run the kernel with data cache disabled.
+   Say Y if you experience CPM lock-ups.
++Boot kernel image support
++CONFIG_BOOTIMG
++  Add support for booting a new Linux kernel from a running Linux
++  system. You need to download the bootimg(8) utility from
++  ftp://icaftp.epfl.ch/pub/people/almesber/misc/bootimg-current.tar.gz
++  in order to use this functionality.
++
++Protect SMP configuration tables
++CONFIG_BOOTIMG_SMP
++  On SMP systems, the BIOS stores tables with configuration data in
++  memory and an SMP-enabled kernel reads these tables. However, a
++  kernel without SMP support will overwrite such tables. If a kernel
++  without SMP support used bootimg to boot an SMP-enabled kernel, the
++  latter will probably crash when trying to read the SMP tables. The
++  CONFIG_BOOTIMG_SMP option enables minimal support for scanning and
++  protecting of SMP configuration tables also for kernels without SMP
++  support.
++
++In-memory kernel core dump facility
++CONFIG_MCL_COREDUMP
++  In conjunction with bootimg, this allows you to get kernel core dumps
++  of your system at panic() time.  The panic call is modified so that it
++  calls the core dump facility and reboots the system.  On the way back 
++  up, the kernel dump image is written out to disk by the accompanying 
++  init script.  You can use the crash analysis tool to analyze the core 
++  dump.  This tool can be found at :
++
++       http://www.missioncriticallinux.com/download
++
+ #
+ # m68k-specific kernel options
+ # Documented by Chris Lawrence <mailto:quango@themall.net> et al.
+Index: linux/arch/i386/config.in
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/config.in,v
+retrieving revision 1.3.2.1
+retrieving revision 1.3.2.1.2.2
+diff -u -r1.3.2.1 -r1.3.2.1.2.2
+--- linux/arch/i386/config.in  12 Mar 2003 19:49:05 -0000      1.3.2.1
++++ linux/arch/i386/config.in  1 Apr 2003 19:35:12 -0000       1.3.2.1.2.2
+@@ -502,6 +502,12 @@
+    bool '  Magic SysRq key' CONFIG_MAGIC_SYSRQ
+    bool '  Spinlock debugging' CONFIG_DEBUG_SPINLOCK
+    bool '  Compile the kernel with frame pointers' CONFIG_FRAME_POINTER
++   if [ "$CONFIG_FRAME_POINTER " != "n" ]; then
++      bool '  Kernel Core Dump Facility' CONFIG_MCL_COREDUMP
++      if [ "$CONFIG_MCL_COREDUMP" = "y" ]; then
++         bool '  Reboot using bootimg' CONFIG_BOOTIMG
++      fi
++   fi
+ fi
+ endmenu
+Index: linux/arch/i386/vmlinux.lds
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/vmlinux.lds,v
+retrieving revision 1.1.1.1.4.1
+retrieving revision 1.1.1.1.4.1.2.1
+diff -u -r1.1.1.1.4.1 -r1.1.1.1.4.1.2.1
+--- linux/arch/i386/vmlinux.lds        12 Mar 2003 19:49:05 -0000      1.1.1.1.4.1
++++ linux/arch/i386/vmlinux.lds        1 Apr 2003 12:17:40 -0000       1.1.1.1.4.1.2.1
+@@ -19,6 +19,13 @@
+   .rodata : { *(.rodata) *(.rodata.*) }
+   .kstrtab : { *(.kstrtab) }
++  . = ALIGN(16);              /* Relocatable bootimage code */
++  __bootimg_start = .;
++  .bootimg : {
++      *(.bootimg)
++      }
++  __bootimg_end = .;
++
+   . = ALIGN(16);              /* Exception table */
+   __start___ex_table = .;
+   __ex_table : { *(__ex_table) }
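
[Editorial note: the vmlinux.lds hunk above brackets the relocatable bootimg code with __bootimg_start/__bootimg_end so C code can find and copy that region. The idiom, shown stand-alone with GNU ld's automatic __start_/__stop_ symbols; "mysect" and the blob_* objects are invented for the example.]

#include <stdio.h>

/* Place two objects in a private section so we can measure its extent. */
__attribute__((section("mysect"), used)) static const char blob_a[16] = "first";
__attribute__((section("mysect"), used)) static const char blob_b[16] = "second";

/* GNU ld provides these automatically for identifier-named sections. */
extern const char __start_mysect[], __stop_mysect[];

int main(void)
{
        printf("section spans %ld bytes\n",
               (long)(__stop_mysect - __start_mysect));
        return 0;
}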
+Index: linux/arch/i386/boot/setup.S
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/boot/setup.S,v
+retrieving revision 1.2.2.1
+retrieving revision 1.2.2.1.2.1
+diff -u -r1.2.2.1 -r1.2.2.1.2.1
+--- linux/arch/i386/boot/setup.S       12 Mar 2003 19:49:05 -0000      1.2.2.1
++++ linux/arch/i386/boot/setup.S       1 Apr 2003 12:17:40 -0000       1.2.2.1.2.1
+@@ -105,16 +105,22 @@
+ # flags, unused bits must be zero (RFU) bit within loadflags
+ loadflags:
+ LOADED_HIGH   = 1                     # If set, the kernel is loaded high
++RELOADS_GDT   = 2                     # if set, kernel reloads GDT, such that
++                                      # boot loader does not have to provide
++                                      # GDT in a "safe" memory location
+ CAN_USE_HEAP  = 0x80                  # If set, the loader also has set
+                                       # heap_end_ptr to tell how much
+                                       # space behind setup.S can be used for
+                                       # heap purposes.
+                                       # Only the loader knows what is free
+-#ifndef __BIG_KERNEL__
+-              .byte   0
+-#else
+-              .byte   LOADED_HIGH
++_FLAGS = 0
++#ifdef __BIG_KERNEL__
++              _FLAGS = _FLAGS | LOADED_HIGH
+ #endif
++#ifdef CONFIG_BOOTIMG
++              _FLAGS = _FLAGS | RELOADS_GDT
++#endif
++              .byte _FLAGS
+ setup_move_size: .word  0x8000                # size to move, when setup is not
+                                       # loaded at 0x90000. We will move setup 
+Index: linux/arch/i386/kernel/Makefile
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/Makefile,v
+retrieving revision 1.2.2.1
+retrieving revision 1.2.2.1.2.1
+diff -u -r1.2.2.1 -r1.2.2.1.2.1
+--- linux/arch/i386/kernel/Makefile    12 Mar 2003 19:49:05 -0000      1.2.2.1
++++ linux/arch/i386/kernel/Makefile    1 Apr 2003 12:17:40 -0000       1.2.2.1.2.1
+@@ -49,6 +49,7 @@
+ obj-$(CONFIG_X86_LONGRUN)     += longrun.o
+ obj-$(CONFIG_ELAN_CPUFREQ)    += elanfreq.o
+ obj-$(CONFIG_PROFILING)               += profile.o
++obj-$(CONFIG_MCL_COREDUMP)    += crash.o
+ include $(TOPDIR)/Rules.make
+Index: linux/arch/i386/kernel/crash.c
+===================================================================
+RCS file: linux/arch/i386/kernel/crash.c
+diff -N linux/arch/i386/kernel/crash.c
+--- /dev/null  1 Jan 1970 00:00:00 -0000
++++ linux/arch/i386/kernel/crash.c     1 Apr 2003 12:17:40 -0000       1.1.6.1
+@@ -0,0 +1,82 @@
++/*
++ *  linux/arch/i386/crash.c
++ *
++ *  Architecture dependant code for MCL in-memory core dump.
++ */
++#include <linux/sched.h>
++#include <linux/types.h>
++#include <linux/smp.h>
++#include <linux/crash.h>
++#include <linux/reboot.h>
++#include <linux/bootimg.h>
++
++inline void crash_save_regs(void) {
++      static unsigned long regs[8];
++
++      __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs[0]));
++      __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs[1]));
++      __asm__ __volatile__("movl %%edx,%0" : "=m"(regs[2]));
++      __asm__ __volatile__("movl %%esi,%0" : "=m"(regs[3]));
++      __asm__ __volatile__("movl %%edi,%0" : "=m"(regs[4]));
++      __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs[5]));
++      __asm__ __volatile__("movl %%eax,%0" : "=m"(regs[6]));
++      __asm__ __volatile__("movl %%esp,%0" : "=m"(regs[7]));
++
++      panic_regs = regs;
++}
++
++/*
++ *  Save the current stack pointer and EIP.
++ */
++void crash_save_current_state(struct task_struct *tp)
++{
++      /*
++       *  Here we save ebp instead of esp just in case the compiler
++       *  decides to put an extra push in before we execute this
++       *  instruction (thus invalidating our frame pointer).
++       */
++      asm volatile("movl %%ebp,%0":"=m" (*(u_long *)&tp->thread.esp));
++      tp->thread.eip = (u_long)crash_save_current_state;
++      panic_ksp[smp_processor_id()] = tp->thread.esp;
++      mb();
++
++      save_core();
++
++      crash_halt_or_reboot(1);
++}
++
++/*
++ *  If we are not the panicking thread, we simply halt.  Otherwise,
++ *  we take care of calling the reboot code.
++ */
++void crash_halt_or_reboot(int boot_cpu)
++{
++#ifdef CONFIG_SMP
++      if (!boot_cpu) {
++              stop_this_cpu(NULL);
++              /* NOTREACHED */
++      }
++#endif
++      machine_restart(NULL);
++}
++
++void crash_cleanup_smp_state(void)
++{
++      /*
++       *  Here we duplicate smp_send_stop.  Crash_halt_or_reboot() calls
++       *  stop_this_cpu.  We now know that we are the only one running, 
++       *  so we finish off the smp_send_stop function.
++       */
++      __cli();
++#ifdef CONFIG_SMP
++      disable_local_APIC();
++#endif
++}
++
++/*
++ *  Core dump IPI
++ */
++void smp_crash_funnel_cpu(void)
++{
++      crash_save_current_state(current);
++}
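
[Editorial note: crash_save_regs() above snapshots the panicking CPU's registers with GCC inline asm so the dump records the panic context. The same capture idiom, as a small user-space program (x86 only; other architectures fall through to zero):]

#include <stdio.h>

int main(void)
{
        unsigned long sp, bp;

#if defined(__i386__)
        __asm__ __volatile__("movl %%esp, %0" : "=r"(sp));
        __asm__ __volatile__("movl %%ebp, %0" : "=r"(bp));
#elif defined(__x86_64__)
        __asm__ __volatile__("movq %%rsp, %0" : "=r"(sp));
        __asm__ __volatile__("movq %%rbp, %0" : "=r"(bp));
#else
        sp = bp = 0;    /* not an x86 build */
#endif
        printf("sp=%#lx bp=%#lx\n", sp, bp);
        return 0;
}

[Note the patch's crash_save_current_state() deliberately records ebp rather than esp, so a later compiler-inserted push cannot invalidate the saved frame pointer.]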
+Index: linux/arch/i386/kernel/nmi.c
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/nmi.c,v
+retrieving revision 1.2.2.1
+retrieving revision 1.2.2.1.2.1
+diff -u -r1.2.2.1 -r1.2.2.1.2.1
+--- linux/arch/i386/kernel/nmi.c       12 Mar 2003 19:49:06 -0000      1.2.2.1
++++ linux/arch/i386/kernel/nmi.c       1 Apr 2003 12:17:40 -0000       1.2.2.1.2.1
+@@ -374,11 +374,18 @@
+                       bust_spinlocks(1);
+                       printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip);
+                       show_registers(regs);
++#ifdef CONFIG_MCL_COREDUMP
++                      spin_unlock(&nmi_print_lock);
++                      bust_spinlocks(0);
++                      panic("die");
++                      /* NOTREACHED */
++#else
+                       printk("console shuts up ...\n");
+                       console_silent();
+                       spin_unlock(&nmi_print_lock);
+                       bust_spinlocks(0);
+                       do_exit(SIGSEGV);
++#endif
+               }
+       } else {
+               last_irq_sums[cpu] = sum;
+Index: linux/arch/i386/kernel/process.c
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/process.c,v
+retrieving revision 1.2.2.2
+retrieving revision 1.2.2.2.2.1
+diff -u -r1.2.2.2 -r1.2.2.2.2.1
+--- linux/arch/i386/kernel/process.c   1 Apr 2003 02:11:17 -0000       1.2.2.2
++++ linux/arch/i386/kernel/process.c   1 Apr 2003 12:17:40 -0000       1.2.2.2.2.1
+@@ -50,6 +50,9 @@
+ #ifdef CONFIG_MATH_EMULATION
+ #include <asm/math_emu.h>
+ #endif
++#ifdef CONFIG_BOOTIMG
++#include <linux/bootimg.h>
++#endif
+ #include <linux/irq.h>
+@@ -377,7 +380,21 @@
+ void machine_restart(char * __unused)
+ {
++#ifdef CONFIG_MCL_COREDUMP
++      extern char *panicmsg;
++      /*
++       *  Only call bootimg if we have a valid descriptor and
++       *  we are in a panic() context.
++       */
++      if (panicmsg)
++#endif
++#ifdef CONFIG_BOOTIMG
++              if (bootimg_dsc.page_dir)
++                      boot_image();
++#endif
++
+ #if CONFIG_SMP
++{
+       int cpuid;
+       
+       cpuid = GET_APIC_ID(apic_read(APIC_ID));
+@@ -413,6 +430,7 @@
+       if (!netdump_func)
+               smp_send_stop();
+       disable_IO_APIC();
++}
+ #endif
+       if(!reboot_thru_bios) {
+Index: linux/arch/i386/kernel/setup.c
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/setup.c,v
+retrieving revision 1.3.2.1
+retrieving revision 1.3.2.1.2.2
+diff -u -r1.3.2.1 -r1.3.2.1.2.2
+--- linux/arch/i386/kernel/setup.c     12 Mar 2003 19:49:06 -0000      1.3.2.1
++++ linux/arch/i386/kernel/setup.c     1 Apr 2003 17:55:35 -0000       1.3.2.1.2.2
+@@ -116,6 +116,9 @@
+ #include <asm/mpspec.h>
+ #include <asm/mmu_context.h>
+ #include <asm/edd.h>
++#ifdef CONFIG_MCL_COREDUMP
++#include <linux/crash.h>
++#endif
+ /*
+  * Machine setup..
+  */
+@@ -973,6 +976,7 @@
+ static unsigned long __init setup_memory(void)
+ {
+       unsigned long bootmap_size, start_pfn, max_low_pfn;
++      unsigned long bootmap_pages = 0UL, crash_pages = 0UL;
+       /*
+        * partially used pages are not usable - thus
+@@ -992,6 +996,21 @@
+       printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+               pages_to_mb(highend_pfn - highstart_pfn));
+ #endif
++
++#ifdef CONFIG_MCL_COREDUMP
++      bootmap_pages = bootmem_bootmap_pages(max_low_pfn);
++      crash_pages = crash_pages_needed();
++
++      printk("start_pfn: %d, bootmap_pages: %d\n", start_pfn, bootmap_pages);
++
++      crash_init((u_long)phys_to_virt(PFN_PHYS(start_pfn)),
++                 (u_long)phys_to_virt(PFN_PHYS(LOW_OFFSET + start_pfn)),
++                 (u_long)phys_to_virt(PFN_PHYS(LOW_OFFSET + start_pfn +
++                                               crash_pages)));
++
++      printk("new start_pfn: %08lx\n", PFN_PHYS(start_pfn));
++      printk("crash map starts at %lx\n",(start_pfn+bootmap_pages)*PAGE_SIZE);
++#endif
+       printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+                       pages_to_mb(max_low_pfn));
+       /*
+@@ -1007,8 +1026,8 @@
+        * the (very unlikely) case of us accidentally initializing the
+        * bootmem allocator with an invalid RAM area.
+        */
+-      reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) +
+-                       bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));
++      reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) + bootmap_size + 
++                    ((1+crash_pages)*PAGE_SIZE) + PAGE_SIZE-1) - (HIGH_MEMORY));
+       /*
+        * reserve physical page 0 - it's a special BIOS page on many boxes,
+@@ -1016,6 +1035,16 @@
+        */
+       reserve_bootmem(0, PAGE_SIZE);
++#ifdef CONFIG_BOOTIMG
++      /*
++       * bootimg(8) reads the old parameter block. Note that the copy in
++       * empty_zero_page will vanish when mem_init runs. (Should we
++       * memcpy(phys_to_virt(0x90000), PARAM, PAGE_SIZE);
++       * now ?)
++       */
++      reserve_bootmem(0x90000, PAGE_SIZE);
++#endif
++
+ #ifdef CONFIG_SMP
+       /*
+        * But first pinch a few for the stack/trampoline stuff
+@@ -1032,6 +1061,7 @@
+       find_smp_config();
+ #endif
+ #ifdef CONFIG_BLK_DEV_INITRD
++      printk("caution: initrd may overwrite dump\n"); /* phro */
+       if (LOADER_TYPE && INITRD_START) {
+               if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
+                       reserve_bootmem(INITRD_START, INITRD_SIZE);
+@@ -1172,6 +1202,12 @@
+       smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
+ #endif
+       paging_init();
++#ifdef CONFIG_MCL_COREDUMP
++      /*
++       * Reserve crash pages
++       */
++      crash_mark_dump_reserved();
++#endif
+ #ifdef CONFIG_X86_LOCAL_APIC
+       /*
+        * get boot-time SMP configuration:
+Index: linux/arch/i386/kernel/smp.c
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/smp.c,v
+retrieving revision 1.3.2.1
+retrieving revision 1.3.2.1.2.1
+diff -u -r1.3.2.1 -r1.3.2.1.2.1
+--- linux/arch/i386/kernel/smp.c       12 Mar 2003 19:49:06 -0000      1.3.2.1
++++ linux/arch/i386/kernel/smp.c       1 Apr 2003 12:17:40 -0000       1.3.2.1.2.1
+@@ -23,6 +23,9 @@
+ #include <asm/pgalloc.h>
+ #include <asm/smpboot.h>
++#ifdef CONFIG_MCL_COREDUMP
++#include <asm/crash.h>
++#endif
+ /*
+  *    Some notes on x86 processor bugs affecting SMP operation:
+  *
+@@ -579,7 +582,7 @@
+       return 0;
+ }
+-static void stop_this_cpu (void * dummy)
++void stop_this_cpu (void * dummy)
+ {
+       /*
+        * Remove this CPU:
+Index: linux/arch/i386/kernel/traps.c
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/traps.c,v
+retrieving revision 1.3.2.1
+retrieving revision 1.3.2.1.2.1
+diff -u -r1.3.2.1 -r1.3.2.1.2.1
+--- linux/arch/i386/kernel/traps.c     12 Mar 2003 19:49:06 -0000      1.3.2.1
++++ linux/arch/i386/kernel/traps.c     1 Apr 2003 12:17:40 -0000       1.3.2.1.2.1
+@@ -52,6 +52,10 @@
+ #include <linux/irq.h>
+ #include <linux/module.h>
++#ifdef CONFIG_MCL_COREDUMP
++#include <linux/crash.h>
++#endif
++
+ asmlinkage int system_call(void);
+ asmlinkage void lcall7(void);
+ asmlinkage void lcall27(void);
+@@ -309,7 +313,11 @@
+               netdump_func(regs);
+       bust_spinlocks(0);
+       spin_unlock_irq(&die_lock);
+-      do_exit(SIGSEGV);
++#ifdef CONFIG_MCL_COREDUMP 
++      if(panic_on_oops)
++              panic("die");
++#endif
++      do_exit(SIGSEGV);/* NOTREACHED */
+ }
+ static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
+Index: linux/drivers/char/misc.c
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/drivers/char/misc.c,v
+retrieving revision 1.2
+retrieving revision 1.2.4.1
+diff -u -r1.2 -r1.2.4.1
+--- linux/drivers/char/misc.c  25 Sep 2002 17:11:05 -0000      1.2
++++ linux/drivers/char/misc.c  1 Apr 2003 12:17:41 -0000       1.2.4.1
+@@ -78,6 +78,8 @@
+ extern int i8k_init(void);
+ extern int lcd_init(void);
++extern int crash_init_chrdev(void);
++
+ static int misc_read_proc(char *buf, char **start, off_t offset,
+                         int len, int *eof, void *private)
+ {
+@@ -255,6 +257,9 @@
+ int __init misc_init(void)
+ {
+       create_proc_read_entry("misc", 0, 0, misc_read_proc, NULL);
++#ifdef CONFIG_MCL_COREDUMP
++      crash_init_chrdev();
++#endif
+ #ifdef CONFIG_MVME16x
+       rtc_MK48T08_init();
+ #endif
+Index: linux/drivers/char/sysrq.c
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/drivers/char/sysrq.c,v
+retrieving revision 1.2.2.1
+retrieving revision 1.2.2.1.2.2
+diff -u -r1.2.2.1 -r1.2.2.1.2.2
+--- linux/drivers/char/sysrq.c 12 Mar 2003 19:49:47 -0000      1.2.2.1
++++ linux/drivers/char/sysrq.c 1 Apr 2003 17:55:35 -0000       1.2.2.1.2.2
+@@ -97,7 +97,18 @@
+       action_msg:     "Resetting",
+ };
+-
++#ifdef CONFIG_MCL_COREDUMP
++/* kernel core dump sysrq */
++static void sysrq_handle_coredump(int key, struct pt_regs *pt_regs,
++              struct kbd_struct *kbd, struct tty_struct *ttty) {
++      panic("sysrq");
++}
++static struct sysrq_key_op sysrq_coredump_op = {
++      handler:        sysrq_handle_coredump,
++      help_msg:       "Crash",
++      action_msg:     "Dumping core",
++};
++#endif
+ /* SYNC SYSRQ HANDLERS BLOCK */
+@@ -334,7 +345,11 @@
+                it is handled specially on the sparc
+                and will never arrive */
+ /* b */       &sysrq_reboot_op,
++#ifdef CONFIG_MCL_COREDUMP
++/* c */       &sysrq_coredump_op,
++#else
+ /* c */       NULL,
++#endif
+ /* d */       NULL,
+ /* e */       &sysrq_term_op,
+ /* f */       NULL,
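The hunk above binds SysRq-'c' to panic("sysrq"), so the in-memory dump described in kernel/crash.c later in this patch ("save_core, called at panic time") is reached through the ordinary panic path. A minimal sketch of forcing a test dump from user space follows; it assumes the stock /proc/sysrq-trigger interface and a kernel built with CONFIG_MAGIC_SYSRQ and CONFIG_MCL_COREDUMP, none of which is shown in this hunk.

/* Illustrative only -- not part of the patch. */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/sysrq-trigger", O_WRONLY); /* assumed stock 2.4 interface */

        if (fd < 0)
                return 1;
        write(fd, "c", 1);      /* dispatches to sysrq_handle_coredump(), i.e. panic("sysrq") */
        close(fd);
        return 0;
}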
+Index: linux/include/asm-i386/bootimg.h
+===================================================================
+RCS file: linux/include/asm-i386/bootimg.h
+diff -N linux/include/asm-i386/bootimg.h
+--- /dev/null  1 Jan 1970 00:00:00 -0000
++++ linux/include/asm-i386/bootimg.h   1 Apr 2003 12:17:41 -0000       1.1.6.1
+@@ -0,0 +1,141 @@
++/* asm-i386/bootimg.h - Boot image, i386-specific code */
++
++/* Written 2000 by Werner Almesberger */
++
++/*
++ * When porting bootimg(2) to a new architecture, you need to adapt the
++ * functions and definitions in this file.
++ */
++
++
++#ifndef _ASM_I386_BOOTIMG_H
++#define _ASM_I386_BOOTIMG_H
++
++#include <linux/config.h>
++#include <asm/system.h>
++
++#ifdef CONFIG_SMP
++#include <linux/smp.h>
++#include <linux/irq.h>
++#endif
++
++
++/*
++ * The memory page with the code currently executing has been copied from
++ * old_page to new_page. Jump there.
++ *
++ * Note: flush_icache_range has already been called on the new page.
++ */
++
++static inline void jump_relocated(unsigned long old_page,unsigned long new_page)
++{
++      int tmp;
++
++      __asm__ __volatile__(
++      "stc\n\t"
++      "call 1f\n"
++      "1:\tjnc 2f\n\t"
++      "popl %0\n\t"
++      "addl %1,%0\n\t"
++      "addl %1,%%esp\n\t"
++      "clc\n\t"
++      "jmp *%0\n"
++      "2:"
++      : "=&r" (tmp) : "r" (new_page-old_page));
++}
++
++
++/*
++ * Stop paging, such that
++ *  - page tables can be overwritten
++ *  - all physical memory can be accessed
++ *  - all physical memory is identity-mapped
++ *
++ * (Other rules are possible, but need to be encoded in bootimg(8).)
++ */
++
++static inline void stop_paging(void)
++{
++      unsigned long msw;
++
++      __asm__ __volatile__(
++      "movl %%cr0,%0\n\t"
++      "andl $0x7fffffff,%0\n\t"
++      "movl %0,%%cr0\n\t"
++      "jmp 1f\n\t"    /* i486 and such */
++      "1:"
++
++/* Clear the PAE bit in register %cr4 if we were in PAE mode.  The initial
++ * page table set up by the new kernel's bootstrap code is non-PAE regardless
++ * of whether the new kernel is a PAE kernel.  By clearing the PAE bit here,
++ * we make sure the bootstrap code doesn't accidentally enable PAE mode when
++ * it turns on address translation.
++ */
++#ifdef CONFIG_X86_PAE
++      "movl %%cr4,%0\n\t"
++      "andl $0xffffffdf,%0\n\t"
++      "movl %0,%%cr4\n\t"
++#endif
++
++      : "=&r" (msw) : : "memory");
++}
++
++
++/*
++ * Stop any remaining concurrency in the system. If become_only_thread fails
++ * but the system is still usable, become_only_thread should return an error
++ * code. If no recovery is possible, it may as well panic.
++ */
++
++static inline int become_only_thread(void)
++{
++#ifdef CONFIG_SMP
++      smp_send_stop();
++      disable_IO_APIC();
++#endif
++      cli();
++      return 0;
++}
++
++
++/*
++ * A conservative estimate of the number of bytes relocate_and_jump allocated
++ * on the stack. This is only used for sanity checking before running code,
++ * because we can't recover from failure in relocate_and_jump.
++ */
++
++#define RESERVE_MIN_RELOC_STACK       256
++
++
++/*
++ * Change the stack pointer such that stack is at the end of the specified
++ * page. No data on the old stack will be accessed anymore, so no copying is
++ * required.
++ */
++
++static inline void stack_on_page(void *page)
++{
++      __asm__ __volatile__(
++      "push %%ds\n\t"
++      "pop %%ss\n\t"
++      "movl %0,%%esp\n\t"
++      "addl $0x1000,%%esp\n\t"
++      : : "r" (page));
++}
++
++/*
++ * Set up things such that the kernel will be comfortable (e.g. some
++ * architectures expect the boot loader to set registers in certain ways),
++ * and then jump to the kernel's entry address.
++ */
++
++static inline void jump_to_kernel(void (*kernel_entry)(void))
++{
++      __asm__ __volatile__(
++      "mov $0x90000,%%esi\n\t"
++      : : );
++
++      kernel_entry();
++}
++
++#endif
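Taken on their own, the helpers in this header are hard to follow; they only make sense in the order the later files in this patch use them. The comment block below is a reading aid reconstructed from boot_image() in kernel/bootimg.c and relocate_and_jump() in kernel/bootimg_pic.c (both added further down); it is not itself part of the patch.

/*
 * Reading aid (illustrative): order of use on the reboot-into-new-kernel path.
 *
 *   get_identity_mapped_page();    obtain a page whose VA equals its PA
 *   memcpy(page, __bootimg code);  copy relocate_and_jump() onto that page
 *   become_only_thread();          stop the other CPUs, mask interrupts
 *   stack_on_page(page);           move %esp onto the same page
 *   relocate_and_jump():
 *       stop_paging();             disable paging (and PAE) so all physical
 *                                  memory is identity-addressable
 *       jump_relocated(old, new);  only if the PIC page itself was a copy
 *                                  destination and had to be moved
 *       jump_to_kernel(entry);     set %esi = 0x90000 and call the new
 *                                  kernel's entry point
 */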
+Index: linux/include/asm-i386/crash.h
+===================================================================
+RCS file: linux/include/asm-i386/crash.h
+diff -N linux/include/asm-i386/crash.h
+--- /dev/null  1 Jan 1970 00:00:00 -0000
++++ linux/include/asm-i386/crash.h     1 Apr 2003 12:17:41 -0000       1.1.6.1
+@@ -0,0 +1,15 @@
++#ifndef __ASM_CRASH_H
++#define __ASM_CRASH_H
++
++#define UPPER_MEM_BACKUP 0
++#define LOWER_MEM_FORWARD 0
++#define LOW_OFFSET 100
++
++/*
++ *  These two functions are inlined on alpha.  That's why they appear
++ *  in the arch dependent include file.
++ */
++void crash_save_current_state(struct task_struct *);
++void crash_halt_or_reboot(int);
++
++#endif
+Index: linux/include/linux/bootimg.h
+===================================================================
+RCS file: linux/include/linux/bootimg.h
+diff -N linux/include/linux/bootimg.h
+--- /dev/null  1 Jan 1970 00:00:00 -0000
++++ linux/include/linux/bootimg.h      1 Apr 2003 12:17:41 -0000       1.1.6.1
+@@ -0,0 +1,84 @@
++/* linux/bootimg.h - Boot image, general definitions */
++
++/* Written 2000 by Werner Almesberger */
++
++
++#ifndef _LINUX_BOOTIMG_H
++#define _LINUX_BOOTIMG_H
++
++
++/*
++ * Constraints on image_map:
++ *  - each image_map[n] is the virtual address of a page-sized memory region
++ *    readable by the user
++ *  - currently, image_map[n] is not required to be page-aligned, but this may
++ *    change in the future if we want to map pages directly to lower memory
++ *    pressure (NB: mapping works for ELF and plain binary images, but usually
++ *    not for (b)zImages, because the prepended boot and setup sectors
++ *    mis-align them)
++ *
++ * Constraints on load_map:
++ *  - each load_map[] is the physical address of a page in RAM
++ */
++
++struct boot_image {
++      void **image_map;       /* pointers to image pages in user memory */
++      int pages;              /* length in pages */
++      unsigned long *load_map;/* list of destination pages (physical addr) */
++      unsigned long start;    /* jump to this physical address */
++      int flags;              /* for future use, must be zero for now */
++};
++
++
++#ifdef __KERNEL__
++
++#define __bootimg __attribute__ ((__section__ (".bootimg")))
++
++
++struct bootimg_dsc {
++      unsigned long self;             /* code page            ALL ADDRESSES */
++      unsigned long scratch;          /* scratch page         ARE PHYSICAL !*/
++      unsigned long **page_dir;       /* src & dst page tables              */
++      void (*jump_to)(void);          /* start address                      */
++      int pages;                      /* number of pages */
++    unsigned long csum; /* Kernel Image checksum */
++};
++
++/*
++ * page_dir contains pointers to pages containing pointers to pages. We call
++ * page_dir a "directory" and the page page_dir[n] points to a "table". The
++ * first PAGES_PER_TABLE/2 entries of page_dir are for source pages, and other
++ * half are for destination pages.
++ */
++
++/*
++ * Note that the definitions used here do not necessarily correspond to the
++ * architecture-specific PTRS_PER_PTE, __pte_offset, etc.
++ */
++ 
++#define PAGES_PER_TABLE       (PAGE_SIZE/sizeof(void *))
++#define FROM_TABLE(i) ((i)/PAGES_PER_TABLE)
++#define TO_TABLE(i)   ((i)/PAGES_PER_TABLE+PAGES_PER_TABLE/2)
++#define PAGE_NR(i)    ((i) % PAGES_PER_TABLE)
++
++
++extern char __bootimg_start,__bootimg_end;    /* linker segment boundaries */
++extern unsigned long *unity_page; /* unity-mapped page for i386 */
++
++/*
++ * relocate_and_jump runs in its own page with its own stack. This makes it
++ * difficult to pass parameters. The solution chosen here is to use the global
++ * variable bootimg_dsc, which is copied into an "auto" variable by
++ * relocate_and_jump before any copying or relocation takes place.
++ */
++
++extern struct bootimg_dsc bootimg_dsc;
++
++typedef void (*relocate_and_jump_t)(void);
++
++void relocate_and_jump(void);
++int  boot_image(void);
++
++#endif /* __KERNEL__ */
++
++#endif
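The FROM_TABLE/TO_TABLE/PAGE_NR macros encode the split described above: the first half of page_dir points at tables of source page addresses, the second half at tables of destination addresses. The sketch below shows the resulting lookup for page i of an image; the helper names are invented for illustration, but the two index expressions appear verbatim in relocate_and_jump() later in the patch.

/* Illustrative only -- not part of the patch (kernel context assumed). */
#include <linux/bootimg.h>

static inline unsigned long bootimg_src_phys(unsigned long **page_dir, int i)
{
        /* physical address the i-th image page was staged at */
        return page_dir[FROM_TABLE(i)][PAGE_NR(i)];
}

static inline unsigned long bootimg_dest_phys(unsigned long **page_dir, int i)
{
        /* physical address the i-th image page must finally occupy */
        return page_dir[TO_TABLE(i)][PAGE_NR(i)];
}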
+Index: linux/include/linux/crash.h
+===================================================================
+RCS file: linux/include/linux/crash.h
+diff -N linux/include/linux/crash.h
+--- /dev/null  1 Jan 1970 00:00:00 -0000
++++ linux/include/linux/crash.h        1 Apr 2003 12:17:41 -0000       1.1.6.1
+@@ -0,0 +1,119 @@
++#ifndef __LINUX_CRASH_H
++#define __LINUX_CRASH_H
++
++/* defines for interfacing with user-space (ioctls, etc) */
++struct ioctl_getdump {
++      unsigned long kva;
++      unsigned long buf;
++};
++
++#define CRASH_IOC_MAGIC 'C'
++
++#define CRASH_IOCFREEDUMP _IO(CRASH_IOC_MAGIC, 0)
++#define CRASH_IOCGETDUMP _IOWR(CRASH_IOC_MAGIC, 1, struct ioctl_getdump)
++#define CRASH_IOCBOOTIMG _IOWR(CRASH_IOC_MAGIC, 2, struct boot_image)
++#define CRASH_IOCVERSION _IO(CRASH_IOC_MAGIC, 3)
++
++/* kernel-only part of crash.h */
++#ifdef __KERNEL__
++#include <asm/crash.h>
++
++#define CRASH_K_MINOR (1)
++#define CRASH_K_MAJOR (0)
++
++/*
++ * Crash prototypes.
++ */
++void save_core(void);
++void crash_mark_dump_reserved(void);
++void crash_init(u_long bootmap_va, u_long crash_va, u_long end_alloc_va);
++u_long crash_pages_needed(void);
++void smp_crash_funnel_cpu(void);
++void crash_cleanup_smp_state(void);
++
++/*
++ *  Arch dependent crash.c funcs
++ */
++void crash_save_current_state(struct task_struct *);
++void crash_halt_or_reboot(int);
++inline void crash_save_regs(void);
++
++/*
++ * Crash globals
++ */
++extern u_long crash_dump_header;
++extern volatile u_long panic_ksp[];
++extern volatile int crash_release;
++extern int panic_on_oops;
++extern char *panicmsg;
++extern int panic_processor;
++extern int crash_perform_sync;
++extern unsigned long *panic_regs;
++
++/*
++ * symbols not exported by linux header files
++ */
++extern void stop_this_cpu(void *);
++
++/*  struct crash_map_hdr located at byte offset 0 */
++/* on-disk formats */
++
++#define trunc_page(x)   ((void *)(((unsigned long)(x)) & ~((unsigned long)(PAGE_SIZE - 1))))
++#define round_page(x)   trunc_page(((unsigned long)(x)) + ((unsigned long)(PAGE_SIZE - 1)))
++
++#define CRASH_MAGIC 0x9a8bccdd
++#define CRASH_SOURCE_PAGES 128
++#define CRASH_SUB_MAP_BYTES ((u_long)round_page((CRASH_SOURCE_PAGES+1)*sizeof(u_long)))
++#define CRASH_SUB_MAP_PAGES (CRASH_SUB_MAP_BYTES / PAGE_SIZE)
++#define CRASH_UNCOMPR_BUF_PAGES (CRASH_SOURCE_PAGES + CRASH_SUB_MAP_PAGES)
++#define CRASH_COMPR_BUF_PAGES (CRASH_UNCOMPR_BUF_PAGES + (CRASH_UNCOMPR_BUF_PAGES/4))
++#define CRASH_COMPESS_PRIME_PAGES (2*CRASH_COMPR_BUF_PAGES)
++#define CRASH_ZALLOC_PAGES 16*5*2     /* 2 to handle crash in crash */
++#define CRASH_LOW_WATER_PAGES 100
++
++#define CRASH_CPU_TIMEOUT 5000        /* 5 sec wait for other cpus to stop */
++
++#define CRASH_MARK_RESERVED(addr) (set_bit(PG_reserved,&mem_map[MAP_NR(addr)].flags))
++#define CRASH_CLEAR_RESERVED(addr) (clear_bit(PG_reserved,&mem_map[MAP_NR(addr)].flags))
++#define CRASH_MARK_BOOT_RESERVED(addr) reserve_bootmem(virt_to_phys((void *)addr), PAGE_SIZE);
++
++typedef int boolean_t;
++
++#define TRUE 1
++#define FALSE 0
++
++/* mem structure */
++struct mem_crash_map_hdr {
++      long magic[4];          /* identify crash dump */
++      u_long map;             /* location of map */
++      u_long map_pages;
++      u_long data_pages;
++      u_long compr_units;
++      u_long boot_reserved_start;
++      u_long boot_reserved_end;
++};
++struct mem_crash_map_entry {
++      u_long src_va;          /* source start of larger non-contig 
++                               * block.  a src_va of -1 means that 
++                               * the dest_page_va is the location of 
++                               * the next map page */
++      u_long dest_page_va;    /* dest of this sub block */
++      u_long check_sum;       /* check_sum for dest data */
++};
++
++/* file structure */
++struct crash_map_hdr {
++      long magic[4];          /* identify crash dump */
++      int blk_size;           /* block size for this device */
++      int map_block;          /* location of map */
++      int map_blocks;         /* number of blocks for map */
++};
++struct crash_map_entry {
++      u_long start_va;        /* virtual address */
++      char *exp_data;         /* expanded data in memory */
++      int start_blk;          /* device location */
++      int num_blks;
++};
++
++#endif /* __KERNEL__ */
++#endif /* __LINUX_CRASH_H */
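The ioctl numbers above are the whole user-visible surface of the dump facility; a comment in kernel/bootimg.c further down says they are issued against /dev/crash. The sketch below shows how a retrieval tool might drive them. The device path is taken from that comment, and the assumption that CRASH_IOCGETDUMP copies the saved page at kva into buf is inferred from struct ioctl_getdump, not documented in the patch.

/* Illustrative only -- not part of the patch. */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>
#include <linux/crash.h>                /* struct ioctl_getdump, CRASH_IOC* */

/* Presumed semantics: copy the saved page at kernel VA 'kva' into 'buf'. */
static int get_dump_page(int fd, unsigned long kva, void *buf)
{
        struct ioctl_getdump gd;

        gd.kva = kva;
        gd.buf = (unsigned long)buf;
        return ioctl(fd, CRASH_IOCGETDUMP, &gd);
}

int main(void)
{
        int fd = open("/dev/crash", O_RDONLY);  /* path taken from the bootimg.c comment */

        if (fd < 0)
                return 1;
        /* ... walk the mem_crash_map_entry chain, calling get_dump_page() per page ... */
        ioctl(fd, CRASH_IOCFREEDUMP);           /* hand the reserved dump pages back */
        close(fd);
        return 0;
}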
+Index: linux/include/linux/mm.h
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/include/linux/mm.h,v
+retrieving revision 1.2.2.1
+retrieving revision 1.2.2.1.2.2
+diff -u -r1.2.2.1 -r1.2.2.1.2.2
+--- linux/include/linux/mm.h   12 Mar 2003 19:51:27 -0000      1.2.2.1
++++ linux/include/linux/mm.h   1 Apr 2003 17:55:35 -0000       1.2.2.1.2.2
+@@ -331,6 +331,11 @@
+ #define PG_lru                        18
+ #define PG_active_cache               19
+ #define PG_fs_1                       20      /* Filesystem specific */
++#ifdef CONFIG_MCL_COREDUMP
++#define PG_free                       21
++#define PG_shm                        22
++#define PG_anon                       23
++#endif
+ /* Make it prettier to test the above... */
+ #define UnlockPage(page)      unlock_page(page)
+@@ -452,6 +457,11 @@
+ #define PageSetSlab(page)     set_bit(PG_slab, &(page)->flags)
+ #define PageClearSlab(page)   clear_bit(PG_slab, &(page)->flags)
+ #define PageReserved(page)    test_bit(PG_reserved, &(page)->flags)
++#ifdef CONFIG_MCL_COREDUMP
++#define PageFree(page)          (test_bit(PG_free, &(page)->flags))
++#define PageAnon(page)          (test_bit(PG_anon, &(page)->flags))
++#define PageShm(page)           (test_bit(PG_shm, &(page)->flags))
++#endif
+ #define PageActiveAnon(page)          test_bit(PG_active_anon, &(page)->flags)
+ #define SetPageActiveAnon(page)       set_bit(PG_active_anon, &(page)->flags)
+Index: linux/include/linux/reboot.h
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/include/linux/reboot.h,v
+retrieving revision 1.1.1.1
+retrieving revision 1.1.1.1.10.2
+diff -u -r1.1.1.1 -r1.1.1.1.10.2
+--- linux/include/linux/reboot.h       7 May 2002 21:53:47 -0000       1.1.1.1
++++ linux/include/linux/reboot.h       1 Apr 2003 17:55:35 -0000       1.1.1.1.10.2
+@@ -20,6 +20,7 @@
+  * CAD_OFF     Ctrl-Alt-Del sequence sends SIGINT to init task.
+  * POWER_OFF   Stop OS and remove all power from system, if possible.
+  * RESTART2    Restart system using given command string.
++ * COREDUMP    We're taking a core dump, secondary cpus already stopped.
+  */
+ #define       LINUX_REBOOT_CMD_RESTART        0x01234567
+@@ -28,7 +29,9 @@
+ #define       LINUX_REBOOT_CMD_CAD_OFF        0x00000000
+ #define       LINUX_REBOOT_CMD_POWER_OFF      0x4321FEDC
+ #define       LINUX_REBOOT_CMD_RESTART2       0xA1B2C3D4
+-
++#ifdef CONFIG_MCL_COREDUMP
++#define LINUX_REBOOT_CMD_COREDUMP     0x9A8BCCDD
++#endif
+ #ifdef __KERNEL__
+Index: linux/include/linux/sysctl.h
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/include/linux/sysctl.h,v
+retrieving revision 1.3.2.1
+retrieving revision 1.3.2.1.2.1
+diff -u -r1.3.2.1 -r1.3.2.1.2.1
+--- linux/include/linux/sysctl.h       12 Mar 2003 19:51:30 -0000      1.3.2.1
++++ linux/include/linux/sysctl.h       1 Apr 2003 12:17:41 -0000       1.3.2.1.2.1
+@@ -126,6 +126,7 @@
+       KERN_CADPID=54,         /* int: PID of the process to notify on CAD */
+       KERN_CORE_PATTERN=56,   /* string: pattern for core-files */
+       KERN_PID_MAX=55,        /* int: max PID value of processes */
++      KERN_PANIC_ON_OOPS      /* int: panic on oops enabled */
+ };
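KERN_PANIC_ON_OOPS pairs with the panic_on_oops flag that the traps.c hunk above consults before calling panic("die"). The comment in kernel/crash.c labels that flag "for /proc/sys/kernel/panic_on_oops"; assuming the matching kernel/sysctl.c table entry is wired up (that hunk is not shown in this excerpt), the knob can be flipped from user space as sketched below.

/* Illustrative only -- not part of the patch. */
#include <stdio.h>

int main(void)
{
        /* path inferred from the kernel/crash.c comment */
        FILE *f = fopen("/proc/sys/kernel/panic_on_oops", "w");

        if (!f)
                return 1;
        fputs("1\n", f);        /* an oops now panics, which in turn takes a dump */
        return fclose(f) ? 1 : 0;
}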
+Index: linux/init/main.c
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/init/main.c,v
+retrieving revision 1.2.2.1
+retrieving revision 1.2.2.1.2.1
+diff -u -r1.2.2.1 -r1.2.2.1.2.1
+--- linux/init/main.c  12 Mar 2003 19:51:35 -0000      1.2.2.1
++++ linux/init/main.c  1 Apr 2003 12:17:41 -0000       1.2.2.1.2.1
+@@ -70,6 +70,10 @@
+ #include <asm/smp.h>
+ #endif
++#ifdef CONFIG_BOOTIMG
++#include <linux/bootimg.h>
++#endif
++
+ /*
+  * Versions of gcc older than that listed below may actually compile
+  * and link okay, but the end product can have subtle run time bugs.
+@@ -352,10 +356,14 @@
+ {
+       char * command_line;
+       extern char saved_command_line[];
++#if defined(CONFIG_BOOTIMG) && defined(CONFIG_X86_LOCAL_APIC)
++      unsigned long value;
++#endif
+ /*
+  * Interrupts are still disabled. Do necessary setups, then
+  * enable them
+  */
++      printk("start_kernel\n");
+       lock_kernel();
+       printk(linux_banner);
+       setup_arch(&command_line);
+@@ -373,12 +381,26 @@
+        * this. But we do want output early, in case something goes wrong.
+        */
+       console_init();
++
++#ifdef CONFIG_BOOTIMG
++      unity_page = alloc_bootmem_pages(PAGE_SIZE);
++      printk("unity_page addr: %p\n",unity_page);
++#endif
+ #ifdef CONFIG_MODULES
+       init_modules();
+ #endif
+       profile_init();
+       kmem_cache_init();
+       sti();
++#if defined(CONFIG_BOOTIMG) && defined(CONFIG_X86_LOCAL_APIC)
++      /* If we don't make sure the APIC is enabled, AND the LVT0
++       * register is programmed properly, we won't get timer interrupts
++       */
++      setup_local_APIC();
++      
++      value = apic_read(APIC_LVT0);
++      apic_write_around(APIC_LVT0, value & ~APIC_LVT_MASKED);
++#endif
+       calibrate_delay();
+ #ifdef CONFIG_BLK_DEV_INITRD
+       if (initrd_start && !initrd_below_start_ok &&
+Index: linux/kernel/Makefile
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/kernel/Makefile,v
+retrieving revision 1.1.1.1.4.1
+retrieving revision 1.1.1.1.4.1.2.1
+diff -u -r1.1.1.1.4.1 -r1.1.1.1.4.1.2.1
+--- linux/kernel/Makefile      12 Mar 2003 19:51:36 -0000      1.1.1.1.4.1
++++ linux/kernel/Makefile      1 Apr 2003 12:17:41 -0000       1.1.1.1.4.1.2.1
+@@ -22,7 +22,8 @@
+ obj-$(CONFIG_PM) += pm.o
+ obj-$(CONFIG_KALLSYMS) += kallsyms.o
+ obj-$(CONFIG_CPU_FREQ) += cpufreq.o
+-
++obj-$(CONFIG_BOOTIMG) += bootimg.o bootimg_pic.o
++obj-$(CONFIG_MCL_COREDUMP) += crash.o
+ ifneq ($(CONFIG_IA64),y)
+ # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
+Index: linux/kernel/bootimg.c
+===================================================================
+RCS file: linux/kernel/bootimg.c
+diff -N linux/kernel/bootimg.c
+--- /dev/null  1 Jan 1970 00:00:00 -0000
++++ linux/kernel/bootimg.c     1 Apr 2003 12:17:41 -0000       1.1.6.1
+@@ -0,0 +1,301 @@
++/* bootimg.c - Boot another (kernel) image */
++
++/* Written 2000 by Werner Almesberger */
++
++
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/errno.h>
++#include <linux/mm.h>
++#include <linux/capability.h>
++#include <linux/bootimg.h>
++#include <asm/bootimg.h>
++#include <asm/uaccess.h>
++#include <asm/io.h>
++#include <asm/pgtable.h>
++#include <linux/delay.h>
++
++#if 0
++#define DPRINTK_CONT(format,args...) printk(format,##args)
++#else
++#define DPRINTK_CONT(format,args...)
++#endif
++#define DPRINTK(format,args...) DPRINTK_CONT(KERN_DEBUG format,##args)
++
++unsigned long **bootimg_page_dir;
++
++struct bootimg_dsc bootimg_dsc; /* communication with PIC */
++unsigned long *unity_page; /* unity-mapped page for i386 */
++
++static unsigned long bootimg_checksum(unsigned long **page_dir, int num_pages)
++{
++      unsigned long checksum, *page;
++      int i, j;
++
++      checksum = 0;
++
++      for (i = 0; i < num_pages; i++) {
++              page = __va((unsigned long *)
++                          page_dir[FROM_TABLE(i)][PAGE_NR(i)]);
++
++              for (j = 0; j < PAGES_PER_TABLE; j++)
++                      checksum ^= page[j];
++
++              checksum ^= page_dir[TO_TABLE(i)][PAGE_NR(i)];
++      }
++
++        return checksum;
++}
++
++#ifdef CONFIG_X86_PAE
++
++static unsigned long get_identity_mapped_page(void)
++{
++      pgd_t *pgd;
++      pmd_t *pmd;
++      unsigned long phys_addr, page_base;
++
++      /* Set up a 2 Mb identity-mapped page. */
++
++      phys_addr = virt_to_phys(unity_page);
++      pgd = pgd_offset(current->active_mm, phys_addr);
++      pmd = pmd_offset(pgd, phys_addr);
++
++      /* We hardcode this rather than using PMD_MASK just in case the PAE
++       * mode setup ever changes so that 2 Mb pages are no longer used.
++       */
++      page_base = phys_addr & ~((1 << 21) - 1);
++
++      set_pmd(pmd, __pmd(page_base | _PAGE_PSE | _KERNPG_TABLE));
++      __flush_tlb_one(phys_addr);
++
++      return (unsigned long) unity_page;
++}
++
++#else
++
++static unsigned long get_identity_mapped_page(void)
++{
++      set_pgd(pgd_offset(current->active_mm,virt_to_phys(unity_page)),
++              __pgd((_KERNPG_TABLE + _PAGE_PSE + (virt_to_phys(unity_page)&PGDIR_MASK))));
++      __flush_tlb_one(virt_to_phys(unity_page));
++      return (unsigned long)unity_page;
++}
++
++#endif
++
++#if 0 /* Perhaps we'll need this in the future? */
++static void unmap_identity_mapped_page(void)
++{
++      set_pgd(pgd_offset(current->active_mm,virt_to_phys(unity_page)),__pgd(0));
++      __flush_tlb();
++}
++#endif
++
++static int fill_page_dir(unsigned long **page_dir,struct boot_image *image)
++{
++      int i, count=0;
++
++      memset(page_dir,0,PAGE_SIZE);
++      for (i = 0; i < image->pages; i += PAGES_PER_TABLE) {
++              unsigned long **table;
++              int bytes_left;
++
++              table = page_dir+FROM_TABLE(i);
++              *table = (unsigned long *) get_free_page(GFP_KERNEL);
++              if (!*table) return -ENOMEM;
++
++              memset(*table,0,PAGE_SIZE);
++              DPRINTK("page %d: from table %p @ %p\n",i,*table,table);
++              table = page_dir+TO_TABLE(i);
++              *table = (unsigned long *) get_free_page(GFP_KERNEL);
++              if (!*table) return -ENOMEM;
++
++              bytes_left = (image->pages-i)*sizeof(unsigned long);
++              if (copy_from_user(*table,image->load_map+i,
++                  bytes_left > PAGE_SIZE ? PAGE_SIZE : bytes_left))
++                      return -EFAULT;
++              DPRINTK("page %d: to table %p @ %p\n",i,*table,table);
++              count+=2; /* 2 pages per loop */
++      }
++
++      for (i = 0; i < image->pages; i++) {
++              unsigned long page = get_free_page(GFP_KERNEL);
++              void *src;
++
++              if (!page) return -ENOMEM;
++              count++;
++
++              page_dir[FROM_TABLE(i)][PAGE_NR(i)] =
++                  virt_to_phys((void *) page);
++              if (get_user(src,image->image_map+i) ||
++                  copy_from_user((void *) page,src,PAGE_SIZE))
++                      return -EFAULT;
++
++              DPRINTK("page %d: %p->%p->%p @ %p\n",i,src,(void *) page,
++                  (void *) page_dir[FROM_TABLE(i)][PAGE_NR(i)],
++                  &page_dir[FROM_TABLE(i)][PAGE_NR(i)]);
++      }
++
++      DPRINTK("fill_page_dir: %d pages allocated\n", count);
++
++      return 0;
++}
++
++
++static void free_page_dir(unsigned long **page_dir)
++{
++      int i,j,count=0;
++
++      for (i = 0; i < PAGES_PER_TABLE/2; i++)
++              if (page_dir[i])
++                      for (j = 0; j < PAGES_PER_TABLE; j++)
++                              if (page_dir[i][j]) {
++                                      free_page((unsigned long)
++                                          phys_to_virt(page_dir[i][j]));
++                                      count++;
++                              }
++      for (i = 0; i < PAGES_PER_TABLE; i++)
++              if (page_dir[i]) {
++                      free_page((unsigned long) *page_dir[i]);
++                      count++;
++              }
++      DPRINTK("free_page_dir: %d pages freed\n", count);
++}
++
++
++static void convert_table_refs_to_phys(unsigned long **page_dir)
++{
++      int i;
++
++      DPRINTK("PAGES_PER_TABLE: %d\n",PAGES_PER_TABLE);
++      for (i = 0; i < PAGES_PER_TABLE; i++)
++              if (page_dir[i]) {
++                      DPRINTK("table %i: mapped %p -> ",i,page_dir[i]);
++                      page_dir[i] = (unsigned long *)
++                          virt_to_phys(page_dir[i]);
++                      DPRINTK_CONT("%p\n",page_dir[i]);
++              }
++}
++
++
++
++static int fill_bootimg_dsc(struct boot_image *image)
++{
++      unsigned long scratch;
++      int error = -ENOMEM;
++
++      if(bootimg_page_dir) {
++              /* free previously allocated memory */
++              free_page_dir(bootimg_page_dir);
++              free_page((unsigned long) bootimg_page_dir);
++              DPRINTK("free_page (bootimg_page_dir)\n");
++      }
++
++      bootimg_page_dir = (unsigned long **) get_free_page(GFP_KERNEL);
++      if (!bootimg_page_dir) goto out0;
++      DPRINTK("get_free_page (bootimg_page_dir)\n");
++
++      error = fill_page_dir(bootimg_page_dir,image);
++      if (error) goto out1;
++
++      if(!bootimg_dsc.scratch) {
++              scratch = get_free_page(GFP_KERNEL);
++              DPRINTK("get_free_page (scratch)\n");
++      } else
++              scratch = 1; /* already allocated */
++
++      if (!scratch) goto out1;
++      /*
++       * Not all architectures need the code to be identity-mapped, but it
++       * can't hurt ...
++       */
++      DPRINTK("bootimg_page_dir: mapped %p -> ",bootimg_page_dir);
++      bootimg_dsc.page_dir = (unsigned long **) virt_to_phys(bootimg_page_dir);
++      DPRINTK_CONT("%p\n",bootimg_dsc.page_dir);
++      if(!bootimg_dsc.scratch)
++              bootimg_dsc.scratch = virt_to_phys((void *) scratch);
++      bootimg_dsc.jump_to = (void (*)(void)) image->start;
++      bootimg_dsc.pages = image->pages;
++      bootimg_dsc.csum = bootimg_checksum(bootimg_page_dir, image->pages);
++
++      return 0;
++
++out1:
++      free_page_dir(bootimg_page_dir);
++      free_page((unsigned long) bootimg_page_dir);
++      DPRINTK("free_page (bootimg_page_dir)\n");
++      bootimg_page_dir = 0;
++out0:
++      return error;
++}
++
++extern char *panicmsg;
++int boot_image()
++{
++      relocate_and_jump_t code;
++      unsigned long code_page;
++      int error = -ENOMEM;
++
++      if (bootimg_checksum(__va(bootimg_dsc.page_dir),bootimg_dsc.pages) 
++              != bootimg_dsc.csum)
++              printk("Checksum of kernel image failed.  Rebooting via BIOS\n");
++
++      code_page = get_identity_mapped_page();
++      if (!code_page) goto out3;
++      code = (relocate_and_jump_t) virt_to_phys((void *) code_page);
++      memcpy(code,&__bootimg_start,&__bootimg_end-&__bootimg_start);
++      flush_icache_range(&__bootimg_start, &__bootimg_end-&__bootimg_start);
++
++      bootimg_dsc.self = (unsigned long) code;
++      printk(KERN_INFO "Running boot code at 0x%p\n",code);
++      
++      /*
++       * The point of no return. Not even printk may work after a successful
++       * return from become_only_thread.
++       */
++
++      if (!panicmsg) {
++                      error = become_only_thread();
++                      if (error) goto out3;
++      } else {
++#ifdef CONFIG_SMP
++                      disable_IO_APIC();
++#endif
++                      __cli();
++      }
++
++      convert_table_refs_to_phys((unsigned long **)__va(bootimg_dsc.page_dir));
++      stack_on_page(code);
++
++      code();
++
++      panic("PIC code exec failed");
++out3:
++      printk("boot_image() failed!\n");
++      for(;;); 
++}
++
++/* changed from asmlinkage because we're called via an IOCTL on /dev/crash now */
++int sys_bootimg(struct boot_image *user_dsc)
++{
++      struct boot_image dsc;
++
++      if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_MODULE)) return -EPERM;
++      if (&__bootimg_end-&__bootimg_start > PAGE_SIZE-RESERVE_MIN_RELOC_STACK)
++         {
++              printk(KERN_ERR "boot_image: PIC too large (%d bytes)\n",
++                  &__bootimg_end-&__bootimg_start);
++              return -EIO;
++      }
++      if ((void *) relocate_and_jump != (void *) &__bootimg_start) {
++              printk(KERN_ERR "boot_image: relocate_and_jump is mis-placed"
++                  "(0x%p != 0x%p)\n",relocate_and_jump,&__bootimg_start);
++              return -EIO;
++      }
++      
++      if (copy_from_user(&dsc,user_dsc,sizeof(dsc))) return -EFAULT;
++      if (dsc.pages >= PAGES_PER_TABLE*PAGES_PER_TABLE/2) return -EFBIG;
++      if (dsc.flags) return -EINVAL; /* for future use */
++      return fill_bootimg_dsc(&dsc);
++}
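As the comment above notes, sys_bootimg() is no longer reached as a system call but through CRASH_IOCBOOTIMG on /dev/crash. The sketch below shows the user-space side of that handoff using only the structures defined earlier in this patch; the loader that actually slices a kernel image into image_map/load_map pages is not part of this diff, so the wrapper and its arguments are hypothetical.

/* Illustrative only -- not part of the patch. */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>
#include <linux/bootimg.h>              /* struct boot_image */
#include <linux/crash.h>                /* CRASH_IOCBOOTIMG */

/*
 * image_map: user pointers to 'pages' page-sized chunks of the new kernel.
 * load_map:  physical destination address of each of those pages.
 * entry:     physical address to jump to (becomes bootimg_dsc.jump_to).
 */
static int stage_boot_image(void **image_map, unsigned long *load_map,
                            int pages, unsigned long entry)
{
        struct boot_image img;
        int fd, err;

        fd = open("/dev/crash", O_RDWR);
        if (fd < 0)
                return -1;

        img.image_map = image_map;
        img.load_map  = load_map;
        img.pages     = pages;
        img.start     = entry;
        img.flags     = 0;              /* "must be zero for now" */

        err = ioctl(fd, CRASH_IOCBOOTIMG, &img);        /* lands in sys_bootimg() */
        close(fd);
        return err;
}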
+Index: linux/kernel/bootimg_pic.c
+===================================================================
+RCS file: linux/kernel/bootimg_pic.c
+diff -N linux/kernel/bootimg_pic.c
+--- /dev/null  1 Jan 1970 00:00:00 -0000
++++ linux/kernel/bootimg_pic.c 1 Apr 2003 12:17:41 -0000       1.1.6.1
+@@ -0,0 +1,91 @@
++/* bootimg_pic.c - Boot image, position-independent code */
++
++/* Written 2000 by Werner Almesberger */
++
++/*
++ * Strongly inspired by FiPaBoL designed mainly by Otfried Cheong and Roger
++ * Gammans, and written by the latter.
++ */
++
++/*
++ * This code is position-independent and must fit in a single page !
++ * Furthermore, everything (text+data+stack) has to go into the
++ * .bootimg segment.
++ */
++
++
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/errno.h>
++#include <linux/mm.h>
++#include <linux/bootimg.h>
++#include <asm/bootimg.h>
++
++#include <asm/io.h>
++
++#define copy_and_swap(from,to) \
++    ( { my_copy_page(from,to); \
++    tmp = from; \
++    from = to; \
++    to = tmp; } )
++
++
++static inline void my_copy_page(unsigned long from,unsigned long to)
++{
++      unsigned long end = from+PAGE_SIZE;
++
++      do *((unsigned long *) to)++ = *((unsigned long *) from)++;
++      while (from != end);
++}
++
++
++void __bootimg relocate_and_jump(void)
++{
++      struct bootimg_dsc dsc = bootimg_dsc;
++      int i;
++
++      stop_paging();
++      for (i = 0; i < dsc.pages; i++) {
++              unsigned long from,to,tmp;
++
++              from = dsc.page_dir[FROM_TABLE(i)][PAGE_NR(i)];
++              to = dsc.page_dir[TO_TABLE(i)][PAGE_NR(i)];
++              if (from == to) continue;
++              if (to == dsc.self) {
++                      copy_and_swap(dsc.self,dsc.scratch);
++                      /* WARNING: flush_icache_range MUST BE INLINED !!! */
++                      flush_icache_range(dsc.self,dsc.self+PAGE_SIZE-1);
++                      jump_relocated(dsc.scratch,dsc.self);
++              }
++              else if (to == (unsigned long) dsc.page_dir)
++                      copy_and_swap((unsigned long) dsc.page_dir,dsc.scratch);
++              else {
++                      /*
++                       * O((n^2-n)/2), sigh ...
++                       */
++                      unsigned long **table;
++                      int j;
++
++                      for (j = i+1; j < dsc.pages; j++) {
++                              table = dsc.page_dir+FROM_TABLE(j);
++                              if (((unsigned long) *table) == to) {
++                                      copy_and_swap(*table,dsc.scratch);
++                                      break;
++                              }
++                              if ((*table)[PAGE_NR(j)] == to) {
++                                      copy_and_swap((*table)[PAGE_NR(j)],
++                                          dsc.scratch);
++                                      break;
++                              }
++                              table = dsc.page_dir+TO_TABLE(j);
++                              if (((unsigned long) *table) == to) {
++                                      copy_and_swap(*table,dsc.scratch);
++                                      break;
++                              }
++                      }
++              }
++              my_copy_page(from,to);
++              dsc.scratch = from;
++      }
++      jump_to_kernel(dsc.jump_to);
++}
+Index: linux/kernel/crash.c
+===================================================================
+RCS file: linux/kernel/crash.c
+diff -N linux/kernel/crash.c
+--- /dev/null  1 Jan 1970 00:00:00 -0000
++++ linux/kernel/crash.c       1 Apr 2003 12:17:41 -0000       1.1.6.1
+@@ -0,0 +1,886 @@
++#include <linux/locks.h>
++#include <linux/slab.h>
++#include <linux/crash.h>
++#include <linux/vmalloc.h>
++#include <linux/mm.h>
++#include <linux/fs.h>
++#include <linux/ext2_fs.h>
++#include <asm/param.h>
++#include <asm/uaccess.h>
++#include <linux/zlib.h>
++#include <linux/reboot.h>
++#include <linux/delay.h>
++#include <asm/io.h>
++#include <linux/miscdevice.h>
++#include <linux/bootmem.h>
++
++#ifdef CONFIG_BOOTIMG
++#include <linux/bootimg.h>
++#endif
++
++static void crash_print_data_around(u_long p);
++static void crash_free_page(u_long addr);
++static int crash_chksum_page(u_long pg_addr, u_long * sum_addr);
++static void *czalloc(void *arg, unsigned int items, unsigned int size);
++static void czfree(void *arg, void *ptr);
++static u_long crash_alloc_dest_page(void);
++static void crash_free_dest_page(u_long dest);
++static void init_dest_page_alloc(void);
++static int crash_audit_maps(void);
++static u_long crash_get_source_page(void);
++static u_long crash_update_map(u_long map, u_long src_base, u_long dest, u_long * pages);
++static int crash_reset_stream(z_stream * stream);
++static boolean_t crash_is_kseg(u_long addr);
++static u_long *crash_link(u_long p);
++static int crash_chksum(u_long limit, u_long * sum_addr);
++static int crash_audit_map_page(u_long map);
++static void crash_wait_cpus(void);
++static int crash_is_dir_page(struct page *page);
++
++/* for the /dev/crash interface */
++int crash_init_chrdev(void);
++static int crashdev_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
++
++#define CRASH_DEBUG 1
++
++#ifdef CONFIG_BOOTIMG
++extern int sys_bootimg(struct boot_image *);
++#endif
++
++static u_long crash_compr_buf;
++static u_long crash_uncompr_buf;
++static u_long crash_dump_header = 0;
++static u_long crash_dest_free_list = 0;
++static u_long crash_debug = 0;
++
++static u_long crash_cur_pfn;
++
++static u_long src_pages_skipped = 0;
++static u_long src_pages_saved = 0;
++static u_long dest_pages_free = 0;
++
++/* this information is saved from within panic() */
++char *panicmsg = (char *)0;
++int panic_processor = 0;
++int crash_perform_sync = 0;
++
++u_int console_crash = 0;      /* should be moved to alpha branch */
++
++// typedef struct task_struct *task_t;
++
++/*
++ *  Threads active at time of panic:
++ */
++volatile task_t *panic_threads[NR_CPUS];
++volatile unsigned long panic_ksp[NR_CPUS];
++unsigned long *panic_regs = NULL;
++
++int panic_on_oops;            /* for /proc/sys/kernel/panic_on_oops */
++
++extern unsigned long max_low_pfn;
++
++u_long crash_zalloc_start; // , crash_zalloc_end, crash_zalloc_cur;
++
++/* 
++ * Crash Kernel API functions below
++ * crash_pages_needed, computes pages needed for header and compression temp
++ * crash_init, partitions out the allocated pages, sets defaults and 
++ *             initializes the character device.
++ * crash_mark_dump_reserved, marks pages reserved from a previous dump.
++ * save_core, called at panic time to save a dump to memory.
++ */
++u_long crash_pages_needed(void)
++{
++      /* one for the header */
++      return (1 + CRASH_ZALLOC_PAGES + CRASH_UNCOMPR_BUF_PAGES + CRASH_COMPR_BUF_PAGES);
++}
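/*
 * Worked example (illustrative, not part of the patch): with 4 KB pages and a
 * 32-bit u_long, the constants from linux/crash.h above give
 *
 *   CRASH_SUB_MAP_BYTES     = round_page((128 + 1) * 4)  = 4096  ->   1 page
 *   CRASH_UNCOMPR_BUF_PAGES = 128 + 1                    =         129 pages
 *   CRASH_COMPR_BUF_PAGES   = 129 + 129/4                =         161 pages
 *   CRASH_ZALLOC_PAGES      = 16 * 5 * 2                 =         160 pages
 *
 *   crash_pages_needed()    = 1 + 160 + 129 + 161        =         451 pages
 *
 * i.e. roughly 1.8 MB set aside at boot for the dump machinery.
 */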
++
++void crash_init(u_long bootmap_va, u_long crash_va, u_long end_alloc_va)
++{
++      struct mem_crash_map_hdr *header;
++      int i;
++
++      /* the default behavior is to NOT panic on a kernel OOPS */
++      panic_on_oops = 0;
++
++      printk("crash_init (crash_va: %08lx)\n", crash_va);
++      for (i = 0; i < NR_CPUS; i++)
++              panic_threads[i] = 0;
++      crash_dump_header = crash_va;
++      crash_va += PAGE_SIZE;
++      crash_zalloc_start = crash_va;
++      crash_va += CRASH_ZALLOC_PAGES * PAGE_SIZE;
++      crash_uncompr_buf = crash_va;
++      crash_va += CRASH_UNCOMPR_BUF_PAGES * PAGE_SIZE;
++      crash_compr_buf = crash_va;
++      crash_va += CRASH_COMPR_BUF_PAGES * PAGE_SIZE;
++#if 0
++      if (crash_va != end_alloc_va)
++              panic("crash_init inconsistency-1\n");
++#endif
++
++      header = (struct mem_crash_map_hdr *)crash_dump_header;
++#ifdef CRASH_DEBUG
++      printk("crash_dump_header %p {\n", header);
++      printk("    magic[0]            = %lx\n", header->magic[0]);
++      printk("    map                 = %lx\n", header->map);
++      printk("    map_pages           = %lx\n", header->map_pages);
++      printk("    data_pages          = %lx\n", header->data_pages);
++      printk("    compr_units         = %lx\n", header->compr_units);
++      printk("    boot_reserved_start = %lx\n", header->boot_reserved_start);
++      printk("    boot_reserved_end   = %lx\n", header->boot_reserved_end);
++#endif
++
++      if (header->magic[0] == CRASH_MAGIC) {
++              printk("crash found\n");
++              if ((header->boot_reserved_start != bootmap_va) ||
++                  (header->boot_reserved_end != end_alloc_va)) {
++                      /* crash audit will catch the corruption */
++                      printk("crash_init inconsistency, dump may be corrupted\n");
++              }
++      } else {
++              printk("memset...");
++              memset(header, 0, sizeof(*header));
++              printk("done\n");
++      }
++
++      header->boot_reserved_start = bootmap_va;
++      header->boot_reserved_end = end_alloc_va;
++
++}
++
++void crash_mark_dump_reserved(void)
++{
++      struct mem_crash_map_hdr *header;
++      struct mem_crash_map_entry *m;
++
++      header = (struct mem_crash_map_hdr *)crash_dump_header;
++      if (header->magic[0] != CRASH_MAGIC)
++              return;
++      m = (struct mem_crash_map_entry *)header->map;
++#ifdef CRASH_DEBUG
++      printk("\n\n\ncrash_mark_dump_reserved\n\n");
++      printk("crash_dump_header %p {\n", header);
++      printk("    magic[0]            = %lx\n", header->magic[0]);
++      printk("    map                 = %lx\n", header->map);
++      printk("    map_pages           = %lx\n", header->map_pages);
++      printk("    data_pages          = %lx\n", header->data_pages);
++      printk("    compr_units         = %lx\n", header->compr_units);
++      printk("    boot_reserved_start = %lx\n", header->boot_reserved_start);
++      printk("    boot_reserved_end   = %lx\n", header->boot_reserved_end);
++      printk("mem_crash_map_entry %p {\n", m);
++      printk("    src_va              = %lx\n", m->src_va);
++      printk("    dest_page_va        = %lx\n", m->dest_page_va);
++      printk("    check_sum           = %lx\n", m->check_sum);
++#endif
++
++      if (crash_audit_maps()) {
++              header->magic[0] = 0;
++              return;
++      }
++
++      m = (struct mem_crash_map_entry *)header->map;
++ again:
++      CRASH_MARK_BOOT_RESERVED(m);
++      for (; m->src_va; m++) {
++              if (m->src_va == -1) {
++                      m = (struct mem_crash_map_entry *)m->dest_page_va;
++                      goto again;
++              }
++              CRASH_MARK_BOOT_RESERVED(m->dest_page_va);
++      }
++      return;
++}
++
++void save_core(void)
++{
++      int i, j, k;
++      z_stream stream;
++      int err;
++      struct task_struct *tp;
++      struct mem_crash_map_hdr *header;
++      u_long *sub_map;
++      u_long map;
++      u_long src, dest, unc, cp, src_base, comp_pages;
++
++      k = 0;
++      dest = 0;
++      __cli();
++      tp = current;
++      mb();
++      if (smp_processor_id() != 0) {  /* boot_cpu_id is always 0, i think */
++              panic_threads[smp_processor_id()] = tp;
++              crash_halt_or_reboot(0);
++      } else {
++              if (console_crash)
++                      panic_threads[smp_processor_id()] = &init_task_union.task;
++              else
++                      panic_threads[smp_processor_id()] = tp;
++
++              crash_wait_cpus();
++      }
++
++      printk("save_core: started on CPU%d\n", smp_processor_id());
++      if (!crash_dump_header) {
++              printk("save_core: not initialized\n");
++              return;
++      }
++
++      header = (struct mem_crash_map_hdr *)crash_dump_header;
++      header->magic[0] = 0;
++      header->map_pages = 0;
++      header->data_pages = 0;
++      header->compr_units = 0;
++      header->map = 0;
++
++      stream.workspace=(void*)crash_zalloc_start;
++      //      stream.zalloc = czalloc;
++      //      stream.zfree = czfree;
++      //      stream.opaque = (voidpf) 0;
++      stream.next_out = (Bytef *) crash_compr_buf;
++      stream.avail_out = (uInt) (CRASH_COMPR_BUF_PAGES * PAGE_SIZE);
++      stream.next_in = (Bytef *) crash_uncompr_buf;
++      stream.avail_in = (uInt) (CRASH_UNCOMPR_BUF_PAGES * PAGE_SIZE);
++      err = zlib_deflateInit(&stream, Z_BEST_SPEED);
++      if (err != Z_OK) {
++              printk("save_core: bad return %d from deflateInit\n", err);
++              return;
++      }
++
++      init_dest_page_alloc();
++      header->map = map = crash_update_map(0, 0, 0, &header->map_pages);
++      if (!map) {
++              printk("save_core: no dest pages\n");
++              return;
++      }
++      crash_cur_pfn = 0;
++      src_base = 0;
++      src = 0;
++      for (;;) {
++              sub_map = (u_long *) crash_uncompr_buf;
++              unc = crash_uncompr_buf + CRASH_SUB_MAP_PAGES * PAGE_SIZE;
++              for (i = 0; i < CRASH_SOURCE_PAGES; i++) {
++                      src = crash_get_source_page();
++                      if (!src)
++                              break;
++                      if (!i)
++                              src_base = src;
++                      if (!crash_is_kseg(unc) || !crash_is_kseg(src)) {
++                              printk("unc = 0x%lx, src = 0x%lx, i = %d\n", unc, src, i);
++                              i = src = 0;
++                              break;
++                      }
++                      memcpy((void *)unc, (void *)src, PAGE_SIZE);
++                      unc += PAGE_SIZE;
++                      *sub_map++ = src;
++              }
++              *sub_map = 0;
++              if (!i && !src)
++                      break;
++              err = zlib_deflate(&stream, Z_FINISH);
++              if (!(err == Z_STREAM_END)) {
++                      zlib_deflateEnd(&stream);
++                      printk("save_core: bad return %d from deflate, src_base = 0x%lx\n", err,
++                             src_base);
++                      return;
++              }
++              comp_pages = (u_long) round_page(stream.total_out) / PAGE_SIZE;
++              if (crash_debug)
++                      printk("src_base = 0x%lx compressed data in 0x%lx pages\n", src_base,
++                             comp_pages);
++
++              cp = crash_compr_buf;
++              j = 0;
++              if (crash_debug)
++                      printk("\nsrc = %lx\n", src_base);
++              else {
++                      printk(".");
++                      if (!(k++ % 64))
++                              printk("\n");
++              }
++              for (i = 0; i < comp_pages; i++) {
++                      dest = crash_alloc_dest_page();
++                      if (crash_debug) {
++                              printk("%lx ", dest);
++                              if (!(j++ % 8))
++                                      printk("\n");
++                      }
++                      header->data_pages++;
++                      if (!dest) {
++                              printk("save_core: no dest pages\n");
++                              return;
++                      }
++                      if (!crash_is_kseg(dest) || !crash_is_kseg(cp)) {
++                              printk("dest = 0x%lx, cp = 0x%lx, i = %d, comp_pages = 0x%lx\n",
++                                     dest, cp, i, comp_pages);
++                              src = 0;
++                              break;
++                      }
++                      memcpy((void *)dest, (void *)cp, PAGE_SIZE);
++                      cp += PAGE_SIZE;
++                      map = crash_update_map(map, src_base, dest, &header->map_pages); /* links a new map page, if necessary */
++                      if (!map) {
++                              printk("save_core: no map\n");
++                              return;
++                      }
++              }
++              header->compr_units++;
++              if (!src)
++                      break;
++              if (crash_reset_stream(&stream))
++                      return;
++      }
++
++      map = crash_update_map(map, 0, 0, &header->map_pages);
++      header->magic[0] = CRASH_MAGIC;
++
++      if (crash_audit_maps()) {
++              header->magic[0] = 0;
++              return;
++      }
++
++      printk("\nsave_core: src pages skipped = 0x%lx src pages saved = 0x%lx\n",
++             src_pages_skipped, src_pages_saved);
++      printk("save_core: data_pages = 0x%lx map_pages = 0x%lx\n", header->data_pages,
++             header->map_pages);
++      printk("save_core: completed, crash_dump_header = 0x%lx\n", crash_dump_header);
++}
++
++/* helper functions private to this file */
++static int crash_reset_stream(z_stream * stream)
++{
++      int err;
++
++      stream->workspace=(void*)crash_zalloc_start;
++      // stream->zalloc = czalloc;
++      // stream->zfree = czfree;
++      // stream->opaque = (voidpf) 0;
++      stream->next_out = (Bytef *) crash_compr_buf;
++      stream->avail_out = (uInt) (CRASH_COMPR_BUF_PAGES * PAGE_SIZE);
++      stream->next_in = (Bytef *) crash_uncompr_buf;
++      stream->avail_in = (uInt) (CRASH_UNCOMPR_BUF_PAGES * PAGE_SIZE);
++      err = zlib_deflateReset(stream);
++      if (err != Z_OK) {
++              printk("crash_reset_stream: bad return %d from deflateReset\n", err);
++              return 1;
++      }
++      return 0;
++}
++
++static u_long crash_alloc_dest_page(void)
++{
++      u_long addr;
++
++      addr = crash_dest_free_list;
++      if (addr) {
++              crash_dest_free_list = *(u_long *) addr;
++              dest_pages_free--;
++      } else
++              printk("crash_alloc_dest_page: free list empty\n");
++      return addr;
++}
++
++static void crash_free_dest_page(u_long dest)
++{
++      if (!dest) {
++              printk("crash_free_dest_page: freeing addr 0\n");
++              return;
++      }
++      dest_pages_free++;
++      dest = (u_long) trunc_page(dest);
++      *(u_long *) dest = crash_dest_free_list;
++      crash_dest_free_list = dest;
++}
++
++/*
++ *  Stolen from setup.c
++ */
++#define PFN_PHYS(x)   ((x) << PAGE_SHIFT)
++
++static void init_dest_page_alloc(void)
++{
++      u_long va;
++      long i;
++      struct page *page;
++      struct mem_crash_map_hdr *header;
++
++      header = (struct mem_crash_map_hdr *)crash_dump_header;
++      for (i = ((1 << 24) >> PAGE_SHIFT) + LOWER_MEM_FORWARD;
++           i < (max_low_pfn - UPPER_MEM_BACKUP); i++) {
++              va = (u_long) phys_to_virt(PFN_PHYS(i));
++              if ((va >= header->boot_reserved_start) && (va < header->boot_reserved_end))
++                      continue;
++              page = mem_map + i;
++              if (PageLocked(page) || PageReserved(page))
++                      continue;
++              if (PageFree(page) || PageAnon(page) || PageShm(page) || page->buffers)
++                      crash_free_dest_page(va);
++      }
++      if (crash_debug)
++              printk("init_dest_page_alloc: dest_pages_free = 0x%lx\n", dest_pages_free);
++}
++
++static int crash_is_dir_page(struct page *page) {
++      struct inode *tmp_inode;
++
++      if(page->mapping && page->mapping->host) {
++              tmp_inode = (struct inode *)page->mapping->host;
++              if((tmp_inode->i_sb->s_magic == EXT2_SUPER_MAGIC) &&
++                 (S_ISDIR(tmp_inode->i_mode)))
++                      return 1;
++      }
++
++      return 0;
++}
++
++static u_long crash_get_source_page(void)
++{
++      struct page *page;
++      u_long va;
++
++      while (crash_cur_pfn < max_low_pfn) {
++              page = mem_map + crash_cur_pfn;
++              if (!(PageFree(page) || PageAnon(page) || PageShm(page) || page->buffers))
++                      break;
++              src_pages_skipped++;
++              crash_cur_pfn++;
++      }
++      if (crash_cur_pfn == max_low_pfn)
++              return 0;
++
++      va = (u_long) phys_to_virt(PFN_PHYS(crash_cur_pfn));
++      src_pages_saved++;
++      crash_cur_pfn++;
++      return va;
++}
++
++static u_long crash_update_map(u_long map, u_long src_base, u_long dest, u_long * pages)
++{
++      struct mem_crash_map_entry *m;
++
++
++      if (!map) {
++              (*pages)++;
++              return crash_alloc_dest_page();
++      }
++      m = (struct mem_crash_map_entry *)map;
++      m->src_va = src_base;
++      m->dest_page_va = dest;
++      if (dest)
++              if (crash_chksum_page(dest, &m->check_sum))
++                      return 0;
++
++      map += sizeof(struct mem_crash_map_entry);
++
++      m = (struct mem_crash_map_entry *)map;
++      if (!src_base) {        /* end of list */
++              if (crash_chksum((u_long) m, &m->src_va))
++                      return 0;
++      } else if ((map + 3 * sizeof(struct mem_crash_map_entry)) > (u_long) round_page(map)) {
++              m->src_va = -1;
++              map = m->dest_page_va = crash_alloc_dest_page();
++              if (crash_debug)
++                      printk("\nm = 0x%lx m->src_va = 0x%lx m->dest_page_va = 0x%lx\n",
++                             (u_long) trunc_page(m), m->src_va, m->dest_page_va);
++              m++;
++              if (crash_chksum((u_long) m, &m->src_va))
++                      return 0;
++              if (crash_debug)
++                      printk("m = 0x%lx chksum =  m->src_va = 0x%lx\n", (u_long) trunc_page(m),
++                             m->src_va);
++              if (crash_audit_map_page((u_long) m))
++                      return 0;
++              (*pages)++;
++      }
++      return map;
++}
++
++static int crash_chksum(u_long limit, u_long * sum_addr)
++{
++      u_long sum;
++      u_long *addr;
++
++      if (!crash_is_kseg(limit)) {
++              printk("bad addr = 0x%lx to crash_chksum\n", limit);
++              return 1;
++      }
++      sum = 0;
++      addr = (u_long *) trunc_page(limit);
++      for (; (u_long) addr < limit; addr++)
++              sum += *addr;
++      *sum_addr = sum;
++      return 0;
++}
++
++static int crash_chksum_page(u_long pg_addr, u_long * sum_addr)
++{
++      u_long sum, limit;
++      u_long *addr;
++
++      if (!crash_is_kseg(pg_addr)) {
++              printk("bad addr = 0x%lx to crash_chksum_page\n", pg_addr);
++              return 1;
++      }
++
++      sum = 0;
++      addr = (u_long *) trunc_page(pg_addr);
++      limit = (u_long) addr + PAGE_SIZE;
++      for (; (u_long) addr < limit; addr++)
++              sum += *addr;
++      *sum_addr = sum;
++      return 0;
++}
++
++static int crash_audit_maps(void)
++{
++      u_long m, count;
++      u_long *link_addr;
++      struct mem_crash_map_hdr *header;
++
++      header = (struct mem_crash_map_hdr *)crash_dump_header;
++      if (header->magic[0] != CRASH_MAGIC)
++              return 1;
++
++      link_addr = &header->map;
++      m = header->map;
++
++      count = 0;
++      for (;;) {
++              if (!crash_is_kseg(m)) {
++                      printk("crash_audit_maps: bad link 0x%lx at 0x%lx\n", m,
++                             (u_long) link_addr);
++                      return 1;
++              }
++              if (crash_audit_map_page(m)) {
++                      printk("audit failed while on map page %ld\n", count);
++                      return 1;
++              }
++              if (!crash_link(m))
++                      break;
++              link_addr = crash_link(m);
++              m = *link_addr;
++
++              count++;
++      }
++      return 0;
++}
++
++static int crash_audit_map_page(u_long map)
++{
++      struct mem_crash_map_entry *m;
++      u_long sum;
++
++      if (!map || !crash_is_kseg(map)) {
++              printk("crash_audit_map_page: bad map = 0x%lx\n", map);
++              return 1;
++      }
++      map = (u_long) trunc_page((u_long) map);
++      m = (struct mem_crash_map_entry *)map;
++      for (;;) {
++              if ((m->src_va == -1) || (m->src_va == 0)) {
++                      m++;
++                      if (crash_chksum((u_long) m, &sum))
++                              return 1;
++                      if (m->src_va != sum) {
++                              printk("crash_audit_map_page: checksum failure1\n");
++                              printk("m = 0x%lx, sum = 0x%lx, m->src_va = 0x%lx\n",
++                                     (u_long) m, (u_long) sum, (u_long) m->src_va);
++                              crash_print_data_around((u_long) & m->src_va);
++                              return 1;
++                      } else {
++                              return 0;
++                      }
++              } else {
++                      if (crash_chksum_page((u_long) m->dest_page_va, &sum)
++                          || (m->check_sum != sum)) {
++                              printk("crash_audit_map_page: checksum failure2\n");
++                              printk("dest_page_va = 0x%lx, &check_sum = 0x%lx, sum = 0x%lx, m->check_sum = 0x%lx\n",
++                                     (u_long) m->dest_page_va, (u_long) (&m->check_sum),
++                                     (u_long) sum, (u_long) m->check_sum);
++                              crash_print_data_around((u_long) & m->check_sum);
++                              return 1;
++                      }
++              }
++              m++;
++      }
++}
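
The map that crash_update_map(), crash_audit_map_page() and crash_link() walk is a chain of page-sized arrays of fixed-size entries: an entry whose src_va is -1 links to the next map page through dest_page_va, an entry whose src_va is 0 terminates the list, and the word after the terminator holds a checksum of the map page itself. The real structure definitions live in include/linux/crash.h, which is not part of this hunk; the sketch below is only a reconstruction inferred from how the fields are used above.

/* Hypothetical reconstruction, not the patch's actual header.  Types and
 * field names are inferred from the code above; the size of 'magic' is a
 * guess (only magic[0] is ever checked). */
struct mem_crash_map_entry {
        u_long src_va;          /* source kernel VA; -1 = link entry, 0 = end of list */
        u_long dest_page_va;    /* saved copy of the page, or VA of the next map page */
        u_long check_sum;       /* sum of the u_longs in the saved page */
};

struct mem_crash_map_hdr {
        u_long magic[1];        /* CRASH_MAGIC while the dump is valid */
        u_long map;             /* VA of the first map page */
        u_long map_pages;       /* pages holding map entries */
        u_long data_pages;      /* pages holding saved data */
};
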
++
++static void crash_print_data_around(u_long p)
++{
++      u_long *a;
++      int i;
++
++      if (!crash_is_kseg(p)) {
++              printk("crash_print_data_around: p = 0x%lx not kseg\n", p);
++              return;
++      }
++      a = (u_long *) p;
++      a -= 20;
++      for (i = 0; i < 40; i++)
++              printk("%lx\n", *a++);
++}
++
++#ifdef CRASH_DEBUG
++static void crash_print_map_page(u_long map)
++{
++      struct mem_crash_map_entry *m;
++      int j = 0;
++      u_long sum;
++
++      map = (u_long) trunc_page((u_long) map);
++      m = (struct mem_crash_map_entry *)map;
++      for (;;) {
++              printk("%lx %lx %lx ", m->src_va, m->dest_page_va, m->check_sum);
++              if (!(j++ % 4))
++                      printk("\n");
++              if ((m->src_va == -1) || (m->src_va == 0)) {
++                      m++;
++                      printk("%lx %lx ", m->src_va, m->dest_page_va);
++              if (!crash_chksum((u_long) m, &sum))
++                      printk("\nchksum = 0x%lx\n", sum);
++                      return;
++              }
++              m++;
++      }
++}
++#endif /* CRASH_DEBUG */
++
++static void crash_wait_cpus(void)
++{
++      int i;
++      int msecs = 0;
++
++      for (i = 0; i < smp_num_cpus; i++) {
++              if (i != smp_processor_id()) {
++                      while (!panic_threads[i]) {
++                              msecs++;
++                              mdelay(1);
++                              if (msecs > CRASH_CPU_TIMEOUT) {
++                                      /* If other CPUs are still running
++                                       * we have to halt; otherwise we risk
++                                       * using buffer cache pages that could
++                                       * subsequently be flushed to disk.
++                                       */
++                                      printk("Unable to halt other CPUs, halting system.\n");
++                                      crash_halt_or_reboot(0);
++                              }
++                      }
++              }
++      }
++
++      crash_cleanup_smp_state();
++}
++
++
++#if 0
++static void *czalloc(void *arg, unsigned int items, unsigned int size)
++{
++      u_long nbytes;
++      u_long addr;
++
++      nbytes = (u_long) (items * size);
++      nbytes = (u_long) round_page(nbytes);
++      if ((crash_zalloc_cur + nbytes) > crash_zalloc_end)
++              return 0;
++      addr = crash_zalloc_cur;
++      crash_zalloc_cur += nbytes;
++      return ((void *)addr);
++}
++
++static void czfree(void *arg, void *ptr)
++{
++      printk("zfree: ptr = 0x%lx\n", (u_long) ptr);
++}
++#endif
++
++static boolean_t crash_is_kseg(u_long addr)
++{
++      u_long phys;
++
++      phys = virt_to_phys((void *)addr);
++      if (phys < PFN_PHYS(max_low_pfn))
++              return TRUE;
++      else
++              return FALSE;
++}
++
++static u_long *crash_link(u_long p)
++{
++      struct mem_crash_map_entry *m;
++
++      p = (u_long) trunc_page(p);
++      m = (struct mem_crash_map_entry *)p;
++      for (; m->src_va; m++)
++              if (m->src_va == -1)
++                      return &m->dest_page_va;
++
++      return 0;
++}
++
++/* Call this after the dump data has been written to disk. */
++static int crash_free_crashmem(void)
++{
++      struct mem_crash_map_hdr *header;
++      struct mem_crash_map_entry *m, *last_m;
++
++      if (crash_debug)
++              printk("crash_free_crashmem: \n");
++
++      header = (struct mem_crash_map_hdr *)crash_dump_header;
++      if (crash_audit_maps()) {
++              header->magic[0] = 0;
++              return 1;
++      }
++      m = (struct mem_crash_map_entry *)header->map;
++ again:
++      for (; m->src_va; m++) {
++              if (m->src_va == -1) {
++                      last_m = m;
++                      m = (struct mem_crash_map_entry *)m->dest_page_va;
++                      crash_free_page((unsigned long)last_m);
++                      goto again;
++              }
++              crash_free_page(m->dest_page_va);
++      }
++      if (crash_debug)
++              printk("crash_free_crashmem: 0x%lx freed\n",
++                     (header->data_pages + header->map_pages) * PAGE_SIZE);
++      header->magic[0] = 0;
++      return 0;
++}
++
++static void crash_free_page(u_long addr)
++{
++      struct page *page;
++
++      page = virt_to_page(addr);
++      ClearPageReserved(page);
++      set_page_count(page, 1);
++      __free_page(page);
++}
++
++static int get_dump_helper(u_long kva, u_long buf)
++{
++      struct page *page;
++      struct mem_crash_map_hdr *header;
++
++      header = (struct mem_crash_map_hdr *)crash_dump_header;
++      if (header->magic[0] != CRASH_MAGIC)
++              return 1;
++
++      if (!kva) {
++              if (crash_audit_maps()) {
++                      printk("get_dump_helper: audit failure\n");
++                      header->magic[0] = 0;
++                      return 1;
++              }
++              page = virt_to_page((u_long) crash_dump_header);
++              if (!PageReserved(page)) {
++                      printk("not reserved: crash_dump_header = 0x%lx\n", crash_dump_header);
++                      return 1;
++              }
++              if (copy_to_user((char *)buf, (char *)crash_dump_header,
++                               sizeof(struct mem_crash_map_hdr))) {
++                      printk("get_dump_helper: copy_to_user failed1\n");
++                      return 1;
++              }
++      } else {
++              page = virt_to_page(kva);
++              if (!PageReserved(page)) {
++                      printk("not reserved: kva = 0x%lx\n", kva);
++                      return 1;
++              }
++              if (copy_to_user((char *)buf, (char *)trunc_page(kva), PAGE_SIZE)) {
++                      printk("get_dump_helper: copy_to_user failed2\n");
++                      return 1;
++              }
++      }
++      return 0;
++}
++
++static void free_dump_helper(void)
++{
++      struct mem_crash_map_hdr *header;
++
++      header = (struct mem_crash_map_hdr *)crash_dump_header;
++      if (header->magic[0] != CRASH_MAGIC)
++              return;
++      if (crash_debug)
++              printk("free_dump_helper\n");
++      crash_free_crashmem();
++}
++
++static int crashdev_open(struct inode *inode, struct file *file)
++{
++      /* always return success -- nothing to do here */
++      return 0;
++}
++
++/* character device implementation */
++static struct file_operations crashdev_fops = {
++      ioctl:crashdev_ioctl,
++      open:crashdev_open,
++};
++
++static struct miscdevice crash_miscdev = {
++      190, "crash", &crashdev_fops
++};
++
++int crash_init_chrdev(void)
++{
++      int result;
++
++      result = misc_register(&crash_miscdev);
++
++      if (result < 0)
++              printk(KERN_WARNING "crash: can't register crash device (c 10 190)\n");
++
++      return result;
++}
++
++/* call the original syscalls, just to get things going */
++static int crashdev_ioctl(struct inode *inode, struct file *file,
++                        unsigned int cmd, unsigned long arg)
++{
++      int retval = 0;
++
++      switch (cmd) {
++      case CRASH_IOCFREEDUMP:
++              free_dump_helper();
++              break;
++
++      case CRASH_IOCGETDUMP:
++              if (crash_debug) {
++                      printk("crashdev_ioctl: get dump\n");
++                      printk("vals: %08lx %08lx\n",
++                             ((struct ioctl_getdump *)arg)->kva,
++                             ((struct ioctl_getdump *)arg)->buf);
++              }
++
++              retval = get_dump_helper((u_long) ((struct ioctl_getdump *)arg)->kva,
++                                       (u_long) ((struct ioctl_getdump *)arg)->buf);
++              break;
++
++#ifdef CONFIG_BOOTIMG
++      case CRASH_IOCBOOTIMG:
++              if (crash_debug)
++                      printk("crashdev_ioctl: bootimg\n");
++
++              retval = sys_bootimg((struct boot_image *)arg);
++              break;
++#endif
++
++      case CRASH_IOCVERSION:
++              if (crash_debug)
++                      printk("crashdev_ioctl: version\n");
++              retval = CRASH_K_MINOR | (CRASH_K_MAJOR << 16);
++              break;
++
++      default:
++              return -EINVAL;
++      }
++
++      return retval;
++}
+Index: linux/kernel/module.c
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/kernel/module.c,v
+retrieving revision 1.1.1.1.4.1
+retrieving revision 1.1.1.1.4.1.2.1
+diff -u -r1.1.1.1.4.1 -r1.1.1.1.4.1.2.1
+--- linux/kernel/module.c      12 Mar 2003 19:51:36 -0000      1.1.1.1.4.1
++++ linux/kernel/module.c      1 Apr 2003 12:17:41 -0000       1.1.1.1.4.1.2.1
+@@ -311,7 +311,14 @@
+               error = -EEXIST;
+               goto err1;
+       }
++#if defined(CONFIG_MCL_COREDUMP)
++      /* Call vmalloc_32 instead of module_map (vmalloc for i386)
++       * to avoid being mapped in highmem where mcore can't see us.
++       */
++      if ((mod = (struct module *)vmalloc_32(size)) == NULL) {
++#else
+       if ((mod = (struct module *)module_map(size)) == NULL) {
++#endif
+               error = -ENOMEM;
+               goto err1;
+       }
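
The reason vmalloc_32() keeps the module image visible to the dump reader is that, in 2.4, plain vmalloc() (which module_map() expands to on i386) may back the mapping with highmem pages, while vmalloc_32() may not. Paraphrasing the stock 2.4 include/linux/vmalloc.h definitions for reference (not part of this patch):

/* Paraphrased from the 2.4 <linux/vmalloc.h>; shown only to illustrate why
 * the patch swaps the allocator. */
static inline void *vmalloc(unsigned long size)
{
        return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
}

static inline void *vmalloc_32(unsigned long size)
{
        return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
}
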
+Index: linux/kernel/panic.c
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/kernel/panic.c,v
+retrieving revision 1.3.2.1
+retrieving revision 1.3.2.1.2.1
+diff -u -r1.3.2.1 -r1.3.2.1.2.1
+--- linux/kernel/panic.c       12 Mar 2003 19:51:36 -0000      1.3.2.1
++++ linux/kernel/panic.c       1 Apr 2003 12:17:41 -0000       1.3.2.1.2.1
+@@ -19,6 +19,10 @@
+ #include <linux/vt_kern.h>
+ #include <linux/pc_keyb.h>
++#ifdef CONFIG_MCL_COREDUMP
++#include <linux/crash.h>
++#endif
++
+ asmlinkage void sys_sync(void);       /* it's really int */
+ int panic_timeout;
+@@ -197,20 +201,43 @@
+         unsigned long caller = (unsigned long) __builtin_return_address(0);
+ #endif
++#ifdef CONFIG_MCL_COREDUMP
++      crash_save_regs();
++#endif
++
+       bust_spinlocks(1);
+       va_start(args, fmt);
+       vsprintf(buf, fmt, args);
+       va_end(args);
+       printk(KERN_EMERG "Kernel panic: %s\n",buf);
++
++#ifdef CONFIG_MCL_COREDUMP
++      if (!panicmsg) {
++              panicmsg = buf;
++              panic_processor = smp_processor_id();
++              mb();
++      }
++#endif
++
+       if (netdump_func)
+               BUG();
+       if (in_interrupt())
+               printk(KERN_EMERG "In interrupt handler - not syncing\n");
+       else if (!current->pid)
+               printk(KERN_EMERG "In idle task - not syncing\n");
++#ifdef CONFIG_MCL_COREDUMP
++      else if (crash_perform_sync)
++#else
+       else
++#endif
+               sys_sync();
++
+       bust_spinlocks(0);
++
++#ifdef CONFIG_MCL_COREDUMP
++      smp_call_function((void *)smp_crash_funnel_cpu,0,0,0);
++      crash_save_current_state(current);
++#endif
+ #ifdef CONFIG_SMP
+       smp_send_stop();
+Index: linux/kernel/sysctl.c
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/kernel/sysctl.c,v
+retrieving revision 1.2.2.1
+retrieving revision 1.2.2.1.2.1
+diff -u -r1.2.2.1 -r1.2.2.1.2.1
+--- linux/kernel/sysctl.c      12 Mar 2003 19:51:36 -0000      1.2.2.1
++++ linux/kernel/sysctl.c      1 Apr 2003 12:17:41 -0000       1.2.2.1.2.1
+@@ -37,6 +37,10 @@
+ #include <linux/nfs_fs.h>
+ #endif
++#ifdef CONFIG_MCL_COREDUMP
++#include <linux/crash.h>
++#endif
++
+ #if defined(CONFIG_SYSCTL)
+ /* External variables not in a header file. */
+@@ -247,6 +251,10 @@
+       {KERN_SYSRQ, "sysrq", &sysrq_enabled, sizeof (int),
+        0644, NULL, &proc_dointvec},
+ #endif         
++#ifdef CONFIG_MCL_COREDUMP
++      {KERN_PANIC_ON_OOPS, "panic_on_oops", &panic_on_oops, sizeof(int),
++       0644, NULL, &proc_dointvec},
++#endif
+       {KERN_CADPID, "cad_pid", &cad_pid, sizeof (int),
+        0600, NULL, &proc_dointvec},
+       {KERN_MAX_THREADS, "threads-max", &max_threads, sizeof(int),
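
With CONFIG_MCL_COREDUMP enabled, the entry added above shows up as /proc/sys/kernel/panic_on_oops. A minimal user-space sketch of turning it on, presumably so that an oops escalates to panic() and a dump is taken:

/* Illustrative only: write "1" to the sysctl added above. */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/panic_on_oops", "w");

        if (f == NULL) {
                perror("panic_on_oops");
                return 1;
        }
        fputs("1\n", f);
        fclose(f);
        return 0;
}
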
+Index: linux/lib/Config.in
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/lib/Config.in,v
+retrieving revision 1.2
+retrieving revision 1.2.4.1
+diff -u -r1.2 -r1.2.4.1
+--- linux/lib/Config.in        14 Feb 2003 22:59:23 -0000      1.2
++++ linux/lib/Config.in        1 Apr 2003 12:17:41 -0000       1.2.4.1
+@@ -23,12 +23,14 @@
+   fi
+ fi
+-if [ "$CONFIG_PPP_DEFLATE" = "y" -o \
++if [ "$CONFIG_MCL_COREDUMP" = "y" -o \
++     "$CONFIG_PPP_DEFLATE" = "y" -o \
+      "$CONFIG_JFFS2_FS" = "y" ]; then
+    define_tristate CONFIG_ZLIB_DEFLATE y
+ else
+   if [ "$CONFIG_PPP_DEFLATE" = "m" -o \
+-       "$CONFIG_JFFS2_FS" = "m" ]; then
++       "$CONFIG_JFFS2_FS" = "m" -o \
++       "$CONFIG_MCL_COREDUMP" = "m" ]; then
+      define_tristate CONFIG_ZLIB_DEFLATE m
+   else
+      tristate 'zlib compression support' CONFIG_ZLIB_DEFLATE
+Index: linux/mm/memory.c
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/mm/memory.c,v
+retrieving revision 1.3.2.1
+retrieving revision 1.3.2.1.2.1
+diff -u -r1.3.2.1 -r1.3.2.1.2.1
+--- linux/mm/memory.c  12 Mar 2003 19:51:37 -0000      1.3.2.1
++++ linux/mm/memory.c  1 Apr 2003 12:17:41 -0000       1.3.2.1.2.1
+@@ -1381,6 +1381,10 @@
+       }
+       lock_page(page);
++#ifdef CONFIG_MCL_COREDUMP
++      set_bit(PG_anon, &page->flags);
++#endif
++
+       /*
+        * Back out if somebody else faulted in this pte while we
+        * released the page table lock.
+@@ -1470,6 +1474,9 @@
+               mm->rss++;
+               flush_page_to_ram(page);
+               entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
++#ifdef CONFIG_MCL_COREDUMP
++              set_bit(PG_anon, &page->flags);
++#endif
+               lru_cache_add(page);
+       }
+Index: linux/mm/page_alloc.c
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/mm/page_alloc.c,v
+retrieving revision 1.3.2.1
+retrieving revision 1.3.2.1.2.1
+diff -u -r1.3.2.1 -r1.3.2.1.2.1
+--- linux/mm/page_alloc.c      12 Mar 2003 19:51:37 -0000      1.3.2.1
++++ linux/mm/page_alloc.c      1 Apr 2003 12:17:41 -0000       1.3.2.1.2.1
+@@ -95,6 +95,10 @@
+       struct page *base;
+       per_cpu_t *per_cpu;
+       zone_t *zone;
++#ifdef CONFIG_MCL_COREDUMP
++      struct page *pagemap;
++      int count = 1<<order;
++#endif
+       /*
+        * Yes, think what happens when other parts of the kernel take 
+@@ -163,6 +167,15 @@
+       spin_lock(&zone->lock);
++#ifdef CONFIG_MCL_COREDUMP
++      pagemap = page;
++      do {
++              pagemap->flags |= (1<<PG_free);
++              pagemap->flags &= ~((1<<PG_anon)|(1<<PG_shm));
++              pagemap++;
++      } while(--count);
++#endif
++
+       zone->free_pages -= mask;
+       while (mask + (1 << (MAX_ORDER-1))) {
+@@ -268,6 +281,16 @@
+                       zone->free_pages -= 1UL << order;
+                       page = expand(zone, page, index, order, curr_order, area);
++#ifdef CONFIG_MCL_COREDUMP
++                      {
++                              struct page *pagemap = page;
++                              int             count = 1<<order;
++                              do {
++                                      pagemap->flags &= ~(1<<PG_free);
++                                      pagemap++;
++                              } while (--count);
++                      }
++#endif
+                       spin_unlock_irqrestore(&zone->lock, flags);
+                       set_page_count(page, 1);
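
The PG_free/PG_anon/PG_shm bits manipulated above are what the PageFree()/PageAnon()/PageShm() tests in the crash driver rely on to skip free, anonymous and shared-memory pages. Their definitions are added to include/linux/mm.h elsewhere in this patch set and are not shown in this hunk; a sketch of what they presumably look like, modeled on the stock 2.4 Page* macros (bit numbers are placeholders):

/* Sketch only -- real bit values and macros come from the mm.h part of the
 * patch set, not from this hunk. */
#define PG_free         20      /* placeholder bit numbers */
#define PG_anon         21
#define PG_shm          22

#define PageFree(page)  test_bit(PG_free, &(page)->flags)
#define PageAnon(page)  test_bit(PG_anon, &(page)->flags)
#define PageShm(page)   test_bit(PG_shm,  &(page)->flags)
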
+Index: linux/arch/i386//boot/compressed/head.S
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/boot/compressed/head.S,v
+retrieving revision 1.1.1.1
+retrieving revision 1.1.1.1.12.6
+diff -u -r1.1.1.1 -r1.1.1.1.12.6
+--- linux/arch/i386//boot/compressed/head.S    7 May 2002 21:53:54 -0000       1.1.1.1
++++ linux/arch/i386//boot/compressed/head.S    5 Apr 2003 05:51:27 -0000       1.1.1.1.12.6
+@@ -23,6 +23,7 @@
+  */
+ .text
++#include <linux/config.h>
+ #include <linux/linkage.h>
+ #include <asm/segment.h>
+@@ -31,6 +32,55 @@
+ startup_32:
+       cld
+       cli
++
++#ifdef CONFIG_BOOTIMG
++/*
++ * GDT is invalid if we're booted by bootimg, so reload it now
++ */
++      lgdt    %cs:gdt_descr
++      ljmp    $(__KERNEL_CS),$1f
++
++gdt_table_limit = gdt_table_end - gdt_table - 1
++gdt_descr:
++      .word   gdt_table_limit
++      .long   gdt_table
++
++gdt_table: /* stolen from arch/i386/kernel/head.S */
++      .quad 0x0000000000000000        /* NULL descriptor */
++      .quad 0x0000000000000000        /* 0x0b reserved */
++      .quad 0x0000000000000000        /* 0x13 reserved */
++      .quad 0x0000000000000000        /* 0x1b reserved */
++      .quad 0x00cffa000000ffff        /* 0x23 user 4GB code at 0x00000000 */
++      .quad 0x00cff2000000ffff        /* 0x2b user 4GB data at 0x00000000 */
++      .quad 0x0000000000000000        /* 0x33 TLS entry 1 */
++      .quad 0x0000000000000000        /* 0x3b TLS entry 2 */
++      .quad 0x0000000000000000        /* 0x43 TLS entry 3 */
++      .quad 0x0000000000000000        /* 0x4b reserved */
++      .quad 0x0000000000000000        /* 0x53 reserved */
++      .quad 0x0000000000000000        /* 0x5b reserved */
++
++      .quad 0x00cf9a000000ffff        /* 0x60 kernel 4GB code at 0x00000000 */
++      .quad 0x00cf92000000ffff        /* 0x68 kernel 4GB data at 0x00000000 */
++      .quad 0x0000000000000000        /* 0x70 TSS descriptor */
++      .quad 0x0000000000000000        /* 0x78 LDT descriptor */
++
++      /* Segments used for calling PnP BIOS */
++      .quad 0x00c09a0000000000        /* 0x80 32-bit code */
++      .quad 0x00809a0000000000        /* 0x88 16-bit code */
++      .quad 0x0080920000000000        /* 0x90 16-bit data */
++      .quad 0x0080920000000000        /* 0x98 16-bit data */
++      .quad 0x0080920000000000        /* 0xa0 16-bit data */
++      /*
++       * The APM segments have byte granularity and their bases
++       * and limits are set at run time.
++       */
++      .quad 0x00409a0000000000        /* 0xa8 APM CS    code */
++      .quad 0x00009a0000000000        /* 0xb0 APM CS 16 code (16 bit) */
++      .quad 0x0040920000000000        /* 0xb8 APM DS    data */
++gdt_table_end:
++
++1:
++#endif
+       movl $(__KERNEL_DS),%eax
+       movl %eax,%ds
+       movl %eax,%es
+@@ -92,7 +142,6 @@
+       cld
+       rep
+       movsl
+-
+       popl %esi       # discard the address
+       popl %ebx       # real mode pointer
+       popl %esi       # low_buffer_start
+@@ -124,5 +173,10 @@
+       movsl
+       movl %ebx,%esi  # Restore setup pointer
+       xorl %ebx,%ebx
++#ifdef CONFIG_BOOTIMG
++        movl $0x100000,%eax
++        jmpl *%eax
++#else
+       ljmp $(__KERNEL_CS), $0x100000
++#endif
+ move_routine_end:
+Index: linux/arch/i386//kernel/head.S
+===================================================================
+RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/head.S,v
+retrieving revision 1.2.2.1
+retrieving revision 1.2.2.1.2.5
+diff -u -r1.2.2.1 -r1.2.2.1.2.5
+--- linux/arch/i386//kernel/head.S     12 Mar 2003 19:49:06 -0000      1.2.2.1
++++ linux/arch/i386//kernel/head.S     5 Apr 2003 05:51:27 -0000       1.2.2.1.2.5
+@@ -42,6 +42,21 @@
+  * On entry, %esi points to the real-mode code as a 32-bit pointer.
+  */
+ startup_32:
++#ifdef CONFIG_BOOTIMG
++/*
++ * GDT is invalid if we're booted by bootimg, so reload it now
++ */
++      lgdt %cs:_gdt_descr-__PAGE_OFFSET
++      ljmp $(__KERNEL_CS),$1f-__PAGE_OFFSET
++
++gdt_limit = SYMBOL_NAME(cpu_gdt_table_end) - SYMBOL_NAME(cpu_gdt_table) - 1
++
++_gdt_descr:
++      .word gdt_limit
++      .long SYMBOL_NAME(cpu_gdt_table)-__PAGE_OFFSET
++
++1:
++#endif
+ /*
+  * Set segments to known values
+  */
+@@ -452,6 +467,7 @@
+       .quad 0x00409a0000000000        /* 0xa8 APM CS    code */
+       .quad 0x00009a0000000000        /* 0xb0 APM CS 16 code (16 bit) */
+       .quad 0x0040920000000000        /* 0xb8 APM DS    data */
++ENTRY(cpu_gdt_table_end)
+ #if CONFIG_SMP
+       .fill (NR_CPUS-1)*GDT_ENTRIES,8,0 /* other CPU's GDT */
diff --git a/lustre/kernel_patches/patches/patch-2.4.18-hp1_pnnl18.2.8qsnet.patch b/lustre/kernel_patches/patches/patch-2.4.18-hp1_pnnl18.2.8qsnet.patch
deleted file mode 100644 (file)
index f25baa4..0000000
+++ /dev/null
@@ -1,1673 +0,0 @@
---- linux-pristine/./include/linux/lustre_version.h    Wed Dec 31 19:00:00 1969
-+++ linux/./include/linux/lustre_version.h     Tue Nov 26 07:02:14 2002
-@@ -0,0 +1 @@
-+#define LUSTRE_KERNEL_VERSION 5
---- linux-pristine/./arch/ia64/mm/init.c       Thu Dec  5 10:47:25 2002
-+++ linux/./arch/ia64/mm/init.c        Fri Nov 29 18:06:20 2002
-@@ -44,6 +44,12 @@
- static struct page *vmem_map;
-+struct page *check_get_page(unsigned long kaddr)
-+{
-+#warning FIXME: Lustre team, is this solid?
-+      return virt_to_page(kaddr);
-+}
-+
- int
- do_check_pgt_cache (int low, int high)
- {
---- linux-pristine/./arch/i386/mm/init.c       Thu Dec  5 10:47:24 2002
-+++ linux/./arch/i386/mm/init.c        Fri Nov 29 18:06:20 2002
-@@ -43,6 +43,12 @@
- static unsigned long totalram_pages;
- static unsigned long totalhigh_pages;
-+struct page *check_get_page(unsigned long kaddr)
-+{
-+#warning FIXME: Lustre team, is this solid?
-+      return virt_to_page(kaddr);
-+}
-+
- int do_check_pgt_cache(int low, int high)
- {
-       int freed = 0;
---- linux-pristine/./drivers/block/blkpg.c     Thu Dec  5 10:47:36 2002
-+++ linux/./drivers/block/blkpg.c      Fri Nov 29 18:08:05 2002
-@@ -308,6 +308,41 @@
- EXPORT_SYMBOL(blk_ioctl);
-+#define NUM_DEV_NO_WRITE 16
-+static int dev_no_write[NUM_DEV_NO_WRITE];
-+
-+/*
-+ * Debug code for turning block devices "read-only" (will discard writes
-+ * silently).  This is for filesystem crash/recovery testing.
-+ */
-+void dev_set_rdonly(kdev_t dev, int no_write)
-+{
-+      if (dev) {
-+              printk(KERN_WARNING "Turning device %s read-only\n",
-+                     bdevname(dev));
-+              dev_no_write[no_write] = 0xdead0000 + dev;
-+      }
-+}
-+
-+int dev_check_rdonly(kdev_t dev) {
-+      int i;
-+
-+      for (i = 0; i < NUM_DEV_NO_WRITE; i++) {
-+              if ((dev_no_write[i] & 0xffff0000) == 0xdead0000 &&
-+                  dev == (dev_no_write[i] & 0xffff))
-+                      return 1;
-+      }
-+      return 0;
-+}
-+
-+void dev_clear_rdonly(int no_write) {
-+      dev_no_write[no_write] = 0;
-+}
-+
-+EXPORT_SYMBOL(dev_set_rdonly);
-+EXPORT_SYMBOL(dev_check_rdonly);
-+EXPORT_SYMBOL(dev_clear_rdonly);
-+
-  /*********************
-   * get_last_sector()
-   *  
---- linux-pristine/./drivers/block/loop.c      Thu Dec  5 10:47:37 2002
-+++ linux/./drivers/block/loop.c       Fri Nov 29 18:06:20 2002
-@@ -471,6 +471,11 @@
-       spin_unlock_irq(&lo->lo_lock);
-       if (rw == WRITE) {
-+#ifdef CONFIG_DEV_RDONLY
-+              if (dev_check_rdonly(rbh->b_rdev))
-+                      goto err;
-+#endif
-+
-               if (lo->lo_flags & LO_FLAGS_READ_ONLY)
-                       goto err;
-       } else if (rw == READA) {
---- linux-pristine/./drivers/ide/ide-disk.c    Thu Dec  5 10:47:59 2002
-+++ linux/./drivers/ide/ide-disk.c     Fri Nov 29 18:06:20 2002
-@@ -367,6 +367,12 @@
-  */
- static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block)
- {
-+#ifdef CONFIG_DEV_RDONLY
-+      if (rq->cmd == WRITE && dev_check_rdonly(rq->rq_dev)) {
-+              ide_end_request(1, HWGROUP(drive));
-+              return ide_stopped;
-+      }
-+#endif
-       if (IDE_CONTROL_REG)
-               OUT_BYTE(drive->ctl,IDE_CONTROL_REG);
-       OUT_BYTE(0x00, IDE_FEATURE_REG);
---- linux-pristine/./fs/ext3/Makefile  Thu Dec  5 10:49:13 2002
-+++ linux/./fs/ext3/Makefile   Fri Nov 29 18:06:20 2002
-@@ -9,6 +9,8 @@
- O_TARGET := ext3.o
-+export-objs :=        super.o
-+
- obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-               ioctl.o namei.o super.o symlink.o
- obj-m    := $(O_TARGET)
---- linux-pristine/./fs/ext3/super.c   Thu Dec  5 10:49:13 2002
-+++ linux/./fs/ext3/super.c    Fri Nov 29 18:06:20 2002
-@@ -1744,7 +1744,7 @@
-       unregister_filesystem(&ext3_fs_type);
- }
--EXPORT_NO_SYMBOLS;
-+EXPORT_SYMBOL(ext3_bread);
- MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
- MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
---- linux-pristine/./fs/jbd/commit.c   Thu Dec  5 10:49:15 2002
-+++ linux/./fs/jbd/commit.c    Fri Nov 29 18:06:20 2002
-@@ -475,7 +475,7 @@
-            transaction's t_log_list queue, and metadata buffers are on
-            the t_iobuf_list queue.
--         Wait for the transactions in reverse order.  That way we are
-+         Wait for the buffers in reverse order.  That way we are
-          less likely to be woken up until all IOs have completed, and
-          so we incur less scheduling load.
-       */
-@@ -566,8 +566,10 @@
-       jbd_debug(3, "JBD: commit phase 6\n");
--      if (is_journal_aborted(journal))
-+      if (is_journal_aborted(journal)) {
-+              unlock_journal(journal);
-               goto skip_commit;
-+      }
-       /* Done it all: now write the commit record.  We should have
-        * cleaned up our previous buffers by now, so if we are in abort
-@@ -577,6 +579,7 @@
-       descriptor = journal_get_descriptor_buffer(journal);
-       if (!descriptor) {
-               __journal_abort_hard(journal);
-+              unlock_journal(journal);
-               goto skip_commit;
-       }
-       
-@@ -600,7 +603,6 @@
-               put_bh(bh);             /* One for getblk() */
-               journal_unlock_journal_head(descriptor);
-       }
--      lock_journal(journal);
-       /* End of a transaction!  Finally, we can do checkpoint
-            processing: any buffers committed as a result of this
-@@ -609,6 +611,25 @@
- skip_commit:
-+      /* Call any callbacks that had been registered for handles in this
-+       * transaction.  It is up to the callback to free any allocated
-+       * memory.
-+       */
-+      if (!list_empty(&commit_transaction->t_jcb)) {
-+              struct list_head *p, *n;
-+              int error = is_journal_aborted(journal);
-+
-+              list_for_each_safe(p, n, &commit_transaction->t_jcb) {
-+                      struct journal_callback *jcb;
-+
-+                      jcb = list_entry(p, struct journal_callback, jcb_list);
-+                      list_del(p);
-+                      jcb->jcb_func(jcb, error);
-+              }
-+      }
-+
-+      lock_journal(journal);
-+
-       jbd_debug(3, "JBD: commit phase 7\n");
-       J_ASSERT(commit_transaction->t_sync_datalist == NULL);
---- linux-pristine/./fs/jbd/journal.c  Thu Dec  5 10:49:15 2002
-+++ linux/./fs/jbd/journal.c   Fri Nov 29 18:06:20 2002
-@@ -58,6 +58,7 @@
- #endif
- EXPORT_SYMBOL(journal_flush);
- EXPORT_SYMBOL(journal_revoke);
-+EXPORT_SYMBOL(journal_callback_set);
- EXPORT_SYMBOL(journal_init_dev);
- EXPORT_SYMBOL(journal_init_inode);
---- linux-pristine/./fs/jbd/transaction.c      Thu Dec  5 10:49:15 2002
-+++ linux/./fs/jbd/transaction.c       Fri Nov 29 18:06:20 2002
-@@ -57,6 +57,7 @@
-       transaction->t_state = T_RUNNING;
-       transaction->t_tid = journal->j_transaction_sequence++;
-       transaction->t_expires = jiffies + journal->j_commit_interval;
-+      INIT_LIST_HEAD(&transaction->t_jcb);
-       /* Set up the commit timer for the new transaction. */
-       J_ASSERT (!journal->j_commit_timer_active);
-@@ -201,6 +202,20 @@
-       return 0;
- }
-+/* Allocate a new handle.  This should probably be in a slab... */
-+static handle_t *new_handle(int nblocks)
-+{
-+      handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
-+      if (!handle)
-+              return NULL;
-+      memset(handle, 0, sizeof (handle_t));
-+      handle->h_buffer_credits = nblocks;
-+      handle->h_ref = 1;
-+      INIT_LIST_HEAD(&handle->h_jcb);
-+
-+      return handle;
-+}
-+
- /*
-  * Obtain a new handle.  
-  *
-@@ -227,14 +242,11 @@
-               handle->h_ref++;
-               return handle;
-       }
--      
--      handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
-+
-+      handle = new_handle(nblocks);
-       if (!handle)
-               return ERR_PTR(-ENOMEM);
--      memset (handle, 0, sizeof (handle_t));
--      handle->h_buffer_credits = nblocks;
--      handle->h_ref = 1;
-       current->journal_info = handle;
-       err = start_this_handle(journal, handle);
-@@ -333,14 +345,11 @@
-       
-       if (is_journal_aborted(journal))
-               return ERR_PTR(-EIO);
--      
--      handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
-+
-+      handle = new_handle(nblocks);
-       if (!handle)
-               return ERR_PTR(-ENOMEM);
--      memset (handle, 0, sizeof (handle_t));
--      handle->h_buffer_credits = nblocks;
--      handle->h_ref = 1;
-       current->journal_info = handle;
-       err = try_start_this_handle(journal, handle);
-@@ -1328,6 +1337,28 @@
- #endif
- /*
-+ * Register a callback function for this handle.  The function will be
-+ * called, with the original callback data struct and the error status
-+ * of the journal as parameters, when the transaction that this handle
-+ * is part of has been committed to disk.  There is no guarantee of
-+ * ordering between handles within a single transaction, nor between
-+ * callbacks registered on the same handle.
-+ *
-+ * The caller is responsible for allocating the journal_callback struct.
-+ * This is to allow the caller to add as much extra data to the callback
-+ * as needed while avoiding the overhead of multiple allocations.  The
-+ * caller-allocated struct must start with a struct journal_callback at
-+ * offset 0, with the caller-specific data following it.
-+ */
-+void journal_callback_set(handle_t *handle,
-+                        void (*func)(struct journal_callback *jcb, int error),
-+                        struct journal_callback *jcb)
-+{
-+      list_add(&jcb->jcb_list, &handle->h_jcb);
-+      jcb->jcb_func = func;
-+}
-+
-+/*
-  * All done for a particular handle.
-  *
-  * There is not much action needed here.  We just return any remaining
-@@ -1393,7 +1424,10 @@
-                       wake_up(&journal->j_wait_transaction_locked);
-       }
--      /* 
-+      /* Move callbacks from the handle to the transaction. */
-+      list_splice(&handle->h_jcb, &transaction->t_jcb);
-+
-+      /*
-        * If the handle is marked SYNC, we need to set another commit
-        * going!  We also want to force a commit if the current
-        * transaction is occupying too much of the log, or if the
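
A usage sketch for the callback hook added to transaction.c above (not part of the patch): the caller embeds struct journal_callback at offset 0 of its own allocation, registers it on an open handle, and frees the allocation when the callback runs at commit time. All my_* names are invented for illustration.

/* Sketch assuming the jbd additions in this patch (journal_callback_set(),
 * struct journal_callback, the h_jcb/t_jcb lists).  my_* is hypothetical. */
#include <linux/jbd.h>
#include <linux/slab.h>

struct my_commit_cb {
        struct journal_callback cb;     /* must sit at offset 0 */
        unsigned long cookie;           /* caller-specific data follows */
};

static void my_commit_done(struct journal_callback *jcb, int error)
{
        struct my_commit_cb *mcb = (struct my_commit_cb *)jcb;

        /* Runs after the transaction containing the handle has committed;
         * 'error' is non-zero if the journal aborted.  The callback owns
         * the memory it registered. */
        printk("commit done, cookie %lu, error %d\n", mcb->cookie, error);
        kfree(mcb);
}

static int my_register_commit_cb(handle_t *handle, unsigned long cookie)
{
        struct my_commit_cb *mcb = kmalloc(sizeof(*mcb), GFP_NOFS);

        if (mcb == NULL)
                return -ENOMEM;
        mcb->cookie = cookie;
        journal_callback_set(handle, my_commit_done, &mcb->cb);
        return 0;
}
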
---- linux-pristine/./include/linux/blkdev.h    Thu Dec  5 10:49:41 2002
-+++ linux/./include/linux/blkdev.h     Fri Nov 29 18:30:34 2002
-@@ -228,4 +228,8 @@
-       return retval;
- }
-+#define CONFIG_DEV_RDONLY
-+void dev_set_rdonly(kdev_t, int);
-+int dev_check_rdonly(kdev_t);
-+void dev_clear_rdonly(int);
- #endif
---- linux-pristine/./include/linux/slab.h      Thu Dec  5 10:49:53 2002
-+++ linux/./include/linux/slab.h       Fri Nov 29 18:30:15 2002
-@@ -58,6 +58,7 @@
- extern void *kmem_cache_alloc(kmem_cache_t *, int);
- extern void *kmem_cache_zalloc(kmem_cache_t *, int);
- extern void kmem_cache_free(kmem_cache_t *, void *);
-+extern int kmem_cache_validate(kmem_cache_t *cachep, void *objp);
- extern void *kmalloc(size_t, int);
- extern void kfree(const void *);
---- linux-pristine/./include/linux/jbd.h       Thu Dec  5 10:49:43 2002
-+++ linux/./include/linux/jbd.h        Fri Nov 29 18:50:01 2002
-@@ -249,6 +249,13 @@
-       return bh->b_private;
- }
-+#define HAVE_JOURNAL_CALLBACK_STATUS
-+struct journal_callback {
-+      struct list_head jcb_list;
-+      void (*jcb_func)(struct journal_callback *jcb, int error);
-+      /* user data goes here */
-+};
-+
- struct jbd_revoke_table_s;
- /* The handle_t type represents a single atomic update being performed
-@@ -279,6 +286,12 @@
-          operations */
-       int                     h_err;
-+      /* List of application registered callbacks for this handle.
-+       * The function(s) will be called after the transaction that
-+       * this handle is part of has been committed to disk.
-+       */
-+      struct list_head        h_jcb;
-+
-       /* Flags */
-       unsigned int    h_sync:         1;      /* sync-on-close */
-       unsigned int    h_jdata:        1;      /* force data journaling */
-@@ -398,6 +411,10 @@
-       /* How many handles used this transaction? */
-       int t_handle_count;
-+
-+      /* List of registered callback functions for this transaction.
-+       * Called when the transaction is committed. */
-+      struct list_head        t_jcb;
- };
-@@ -646,6 +663,9 @@
- extern int     journal_try_to_free_buffers(journal_t *, struct page *, int);
- extern int     journal_stop(handle_t *);
- extern int     journal_flush (journal_t *);
-+extern void    journal_callback_set(handle_t *handle,
-+                                    void (*fn)(struct journal_callback *,int),
-+                                    struct journal_callback *jcb);
- extern void    journal_lock_updates (journal_t *);
- extern void    journal_unlock_updates (journal_t *);
---- linux-pristine/./kernel/ksyms.c    Thu Dec  5 10:50:01 2002
-+++ linux/./kernel/ksyms.c     Fri Nov 29 18:37:23 2002
-@@ -271,6 +271,7 @@
- EXPORT_SYMBOL(set_page_dirty);
- EXPORT_SYMBOL(vfs_readlink);
- EXPORT_SYMBOL(vfs_follow_link);
-+EXPORT_SYMBOL(vfs_follow_link_it);
- EXPORT_SYMBOL(page_readlink);
- EXPORT_SYMBOL(page_follow_link);
- EXPORT_SYMBOL(page_symlink_inode_operations);
-@@ -285,6 +286,11 @@
- EXPORT_SYMBOL(nr_free_pages);
- EXPORT_SYMBOL(page_cache_size);
-+/* lustre */
-+EXPORT_SYMBOL(pagecache_lock);
-+EXPORT_SYMBOL(do_kern_mount);
-+EXPORT_SYMBOL(kmem_cache_validate);
-+
- /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) */
- EXPORT_SYMBOL(default_llseek);
- EXPORT_SYMBOL(dentry_open);
---- linux-pristine/./include/linux/dcache.h    Thu Dec  5 10:49:42 2002
-+++ linux/./include/linux/dcache.h     Fri Nov 29 18:30:11 2002
-@@ -6,6 +6,34 @@
- #include <asm/atomic.h>
- #include <linux/mount.h>
-+#define IT_OPEN  (1)
-+#define IT_CREAT  (1<<1)
-+#define IT_MKDIR  (1<<2)
-+#define IT_LINK  (1<<3)
-+#define IT_LINK2  (1<<4)
-+#define IT_SYMLINK  (1<<5)
-+#define IT_UNLINK  (1<<6)
-+#define IT_RMDIR  (1<<7)
-+#define IT_RENAME  (1<<8)
-+#define IT_RENAME2  (1<<9)
-+#define IT_READDIR  (1<<10)
-+#define IT_GETATTR  (1<<11)
-+#define IT_SETATTR  (1<<12)
-+#define IT_READLINK  (1<<13)
-+#define IT_MKNOD  (1<<14)
-+#define IT_LOOKUP  (1<<15)
-+
-+struct lookup_intent {
-+      int it_op;
-+      int it_mode;
-+      int it_disposition;
-+      int it_status;
-+      struct iattr *it_iattr;
-+      __u64 it_lock_handle[2];
-+      int it_lock_mode;
-+      void *it_data;
-+};
-+
- /*
-  * linux/include/linux/dcache.h
-  *
-@@ -78,6 +106,7 @@
-       unsigned long d_time;           /* used by d_revalidate */
-       struct dentry_operations  *d_op;
-       struct super_block * d_sb;      /* The root of the dentry tree */
-+      struct lookup_intent *d_it;
-       unsigned long d_vfs_flags;
-       void * d_fsdata;                /* fs-specific data */
-       unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
-@@ -90,6 +119,8 @@
-       int (*d_delete)(struct dentry *);
-       void (*d_release)(struct dentry *);
-       void (*d_iput)(struct dentry *, struct inode *);
-+      int (*d_revalidate2)(struct dentry *, int, struct lookup_intent *);
-+      void (*d_intent_release)(struct dentry *, struct lookup_intent *);
- };
- /* the dentry parameter passed to d_hash and d_compare is the parent
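
The lookup_intent plumbing added above lets a filesystem find out at lookup time which operation the VFS intends (the IT_* flags), stash per-lookup state such as a lock handle in the intent, and be told via d_intent_release when the VFS is finished with it; d_revalidate2 is the intent-aware counterpart of d_revalidate. A sketch of how a filesystem might wire these hooks follows; everything prefixed myfs_ is invented for illustration and is not part of the patch.

/* Sketch only -- "myfs" is a hypothetical filesystem built against the
 * intent API added by this patch. */
#include <linux/dcache.h>

static void myfs_intent_release(struct dentry *de, struct lookup_intent *it)
{
        /* Called from intent_release() in fs/namei.c once the VFS is done
         * with the lookup; drop whatever per-lookup state was stashed. */
        if (it == NULL)
                return;
        it->it_lock_mode = 0;
        it->it_data = NULL;
}

static int myfs_revalidate2(struct dentry *de, int flags,
                            struct lookup_intent *it)
{
        /* A real filesystem would check whether the cached dentry still
         * satisfies it->it_op (IT_OPEN, IT_GETATTR, ...); returning 1 keeps
         * the dentry, returning 0 invalidates it and forces ->lookup2(). */
        return 1;
}

static struct dentry_operations myfs_dentry_ops = {
        d_revalidate2:    myfs_revalidate2,
        d_intent_release: myfs_intent_release,
};
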
---- linux-pristine/./include/linux/fs.h        Thu Dec  5 10:49:42 2002
-+++ linux/./include/linux/fs.h Fri Nov 29 18:30:15 2002
-@@ -588,6 +588,7 @@
-       /* needed for tty driver, and maybe others */
-       void                    *private_data;
-+      struct lookup_intent    *f_intent;
-       /* preallocated helper kiobuf to speedup O_DIRECT */
-       struct kiobuf           *f_iobuf;
-@@ -849,7 +850,9 @@
- extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
- extern int vfs_rmdir(struct inode *, struct dentry *);
- extern int vfs_unlink(struct inode *, struct dentry *);
--extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
-+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-+              struct inode *new_dir, struct dentry *new_dentry,
-+              struct lookup_intent *it);
- /*
-  * File types
-@@ -911,6 +914,7 @@
- struct inode_operations {
-       int (*create) (struct inode *,struct dentry *,int);
-       struct dentry * (*lookup) (struct inode *,struct dentry *);
-+      struct dentry * (*lookup2) (struct inode *,struct dentry *, struct lookup_intent *);
-       int (*link) (struct dentry *,struct inode *,struct dentry *);
-       int (*unlink) (struct inode *,struct dentry *);
-       int (*symlink) (struct inode *,struct dentry *,const char *);
-@@ -921,6 +925,8 @@
-                       struct inode *, struct dentry *);
-       int (*readlink) (struct dentry *, char *,int);
-       int (*follow_link) (struct dentry *, struct nameidata *);
-+      int (*follow_link2) (struct dentry *, struct nameidata *,
-+                             struct lookup_intent *it);
-       void (*truncate) (struct inode *);
-       int (*permission) (struct inode *, int);
-       int (*revalidate) (struct dentry *);
-@@ -1063,7 +1069,7 @@
- extern struct vfsmount *kern_mount(struct file_system_type *);
- extern int may_umount(struct vfsmount *);
- extern long do_mount(char *, char *, char *, unsigned long, void *);
--
-+struct vfsmount *do_kern_mount(char *type, int flags, char *name, void *data);
- #define kern_umount mntput
- extern int vfs_statfs(struct super_block *, struct statfs *);
-@@ -1387,6 +1393,7 @@
- extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
- extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *));
-+extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it));
- extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *));
- extern int FASTCALL(path_walk(const char *, struct nameidata *));
- extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
-@@ -1397,6 +1404,8 @@
- extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
- #define user_path_walk(name,nd)        __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd)
- #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd)
-+#define user_path_walk_it(name,nd,it)  __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it)
-+#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it)
- extern void iput(struct inode *);
- extern void force_delete(struct inode *);
-@@ -1508,6 +1517,8 @@
- extern int vfs_readlink(struct dentry *, char *, int, const char *);
- extern int vfs_follow_link(struct nameidata *, const char *);
-+extern int vfs_follow_link_it(struct nameidata *, const char *,
-+                            struct lookup_intent *it);
- extern int page_readlink(struct dentry *, char *, int);
- extern int page_follow_link(struct dentry *, struct nameidata *);
- extern struct inode_operations page_symlink_inode_operations;
---- linux-pristine/./fs/dcache.c       Thu Dec  5 10:49:13 2002
-+++ linux/./fs/dcache.c        Fri Nov 29 18:06:20 2002
-@@ -617,6 +617,7 @@
-       dentry->d_op = NULL;
-       dentry->d_fsdata = NULL;
-       dentry->d_mounted = 0;
-+      dentry->d_it = NULL;
-       INIT_LIST_HEAD(&dentry->d_hash);
-       INIT_LIST_HEAD(&dentry->d_lru);
-       INIT_LIST_HEAD(&dentry->d_subdirs);
---- linux-pristine/./fs/nfsd/vfs.c     Thu Dec  5 10:49:18 2002
-+++ linux/./fs/nfsd/vfs.c      Fri Nov 29 18:06:20 2002
-@@ -1285,7 +1285,7 @@
-                       err = nfserr_perm;
-       } else
- #endif
--      err = vfs_rename(fdir, odentry, tdir, ndentry);
-+      err = vfs_rename(fdir, odentry, tdir, ndentry, NULL);
-       if (!err && EX_ISSYNC(tfhp->fh_export)) {
-               nfsd_sync_dir(tdentry);
-               nfsd_sync_dir(fdentry);
---- linux-pristine/./fs/namei.c        Thu Dec  5 10:49:16 2002
-+++ linux/./fs/namei.c Fri Nov 29 18:11:18 2002
-@@ -94,6 +94,12 @@
-  * XEmacs seems to be relying on it...
-  */
-+void intent_release(struct dentry *de, struct lookup_intent *it)
-+{
-+      if (it && de->d_op && de->d_op->d_intent_release)
-+              de->d_op->d_intent_release(de, it);
-+}
-+
- /* In order to reduce some races, while at the same time doing additional
-  * checking and hopefully speeding things up, we copy filenames to the
-  * kernel data space before using them..
-@@ -260,10 +266,19 @@
-  * Internal lookup() using the new generic dcache.
-  * SMP-safe
-  */
--static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags)
-+static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name,
-+                                  int flags, struct lookup_intent *it)
- {
-       struct dentry * dentry = d_lookup(parent, name);
-+      if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) {
-+              if (!dentry->d_op->d_revalidate2(dentry, flags, it) &&
-+                  !d_invalidate(dentry)) {
-+                      dput(dentry);
-+                      dentry = NULL;
-+              }
-+              return dentry;
-+      } else
-       if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
-               if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) {
-                       dput(dentry);
-@@ -281,7 +296,8 @@
-  * make sure that nobody added the entry to the dcache in the meantime..
-  * SMP-safe
-  */
--static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags)
-+static struct dentry *real_lookup(struct dentry *parent, struct qstr *name,
-+                                int flags, struct lookup_intent *it)
- {
-       struct dentry * result;
-       struct inode *dir = parent->d_inode;
-@@ -300,6 +316,9 @@
-               result = ERR_PTR(-ENOMEM);
-               if (dentry) {
-                       lock_kernel();
-+                      if (dir->i_op->lookup2)
-+                              result = dir->i_op->lookup2(dir, dentry, it);
-+                      else
-                       result = dir->i_op->lookup(dir, dentry);
-                       unlock_kernel();
-                       if (result)
-@@ -321,6 +340,12 @@
-                       dput(result);
-                       result = ERR_PTR(-ENOENT);
-               }
-+      } else if (result->d_op && result->d_op->d_revalidate2) {
-+              if (!result->d_op->d_revalidate2(result, flags, it) &&
-+                  !d_invalidate(result)) {
-+                      dput(result);
-+                      result = ERR_PTR(-ENOENT);
-+              }
-       }
-       return result;
- }
-@@ -332,7 +357,8 @@
-  * Without that kind of total limit, nasty chains of consecutive
-  * symlinks can cause almost arbitrarily long lookups. 
-  */
--static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd)
-+static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd,
-+                               struct lookup_intent *it)
- {
-       int err;
-       if (current->link_count >= 5)
-@@ -346,10 +372,14 @@
-       current->link_count++;
-       current->total_link_count++;
-       UPDATE_ATIME(dentry->d_inode);
--      err = dentry->d_inode->i_op->follow_link(dentry, nd);
-+      if (dentry->d_inode->i_op->follow_link2)
-+              err = dentry->d_inode->i_op->follow_link2(dentry, nd, it);
-+      else
-+              err = dentry->d_inode->i_op->follow_link(dentry, nd);
-       current->link_count--;
-       return err;
- loop:
-+      intent_release(dentry, it);
-       path_release(nd);
-       return -ELOOP;
- }
-@@ -445,7 +475,8 @@
-  *
-  * We expect 'base' to be positive and a directory.
-  */
--int link_path_walk(const char * name, struct nameidata *nd)
-+int link_path_walk_it(const char *name, struct nameidata *nd,
-+                    struct lookup_intent *it)
- {
-       struct dentry *dentry;
-       struct inode *inode;
-@@ -518,9 +549,9 @@
-                               break;
-               }
-               /* This does the actual lookups.. */
--              dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE);
-+              dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL);
-               if (!dentry) {
--                      dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE);
-+                      dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL);
-                       err = PTR_ERR(dentry);
-                       if (IS_ERR(dentry))
-                               break;
-@@ -537,8 +568,8 @@
-               if (!inode->i_op)
-                       goto out_dput;
--              if (inode->i_op->follow_link) {
--                      err = do_follow_link(dentry, nd);
-+              if (inode->i_op->follow_link || inode->i_op->follow_link2) {
-+                      err = do_follow_link(dentry, nd, NULL);
-                       dput(dentry);
-                       if (err)
-                               goto return_err;
-@@ -554,7 +585,7 @@
-                       nd->dentry = dentry;
-               }
-               err = -ENOTDIR; 
--              if (!inode->i_op->lookup)
-+              if (!inode->i_op->lookup && !inode->i_op->lookup2)
-                       break;
-               continue;
-               /* here ends the main loop */
-@@ -581,9 +612,9 @@
-                       if (err < 0)
-                               break;
-               }
--              dentry = cached_lookup(nd->dentry, &this, 0);
-+              dentry = cached_lookup(nd->dentry, &this, 0, it);
-               if (!dentry) {
--                      dentry = real_lookup(nd->dentry, &this, 0);
-+                      dentry = real_lookup(nd->dentry, &this, 0, it);
-                       err = PTR_ERR(dentry);
-                       if (IS_ERR(dentry))
-                               break;
-@@ -591,9 +622,9 @@
-               while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry))
-                       ;
-               inode = dentry->d_inode;
--              if ((lookup_flags & LOOKUP_FOLLOW)
--                  && inode && inode->i_op && inode->i_op->follow_link) {
--                      err = do_follow_link(dentry, nd);
-+              if ((lookup_flags & LOOKUP_FOLLOW) && inode && inode->i_op &&
-+                  (inode->i_op->follow_link || inode->i_op->follow_link2)) {
-+                      err = do_follow_link(dentry, nd, it);
-                       dput(dentry);
-                       if (err)
-                               goto return_err;
-@@ -607,7 +638,8 @@
-                       goto no_inode;
-               if (lookup_flags & LOOKUP_DIRECTORY) {
-                       err = -ENOTDIR; 
--                      if (!inode->i_op || !inode->i_op->lookup)
-+                      if (!inode->i_op ||
-+                          (!inode->i_op->lookup && !inode->i_op->lookup2))
-                               break;
-               }
-               goto return_base;
-@@ -636,10 +668,21 @@
-       return err;
- }
-+int link_path_walk(const char * name, struct nameidata *nd)
-+{
-+      return link_path_walk_it(name, nd, NULL);
-+}
-+
-+int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it)
-+{
-+      current->total_link_count = 0;
-+      return link_path_walk_it(name, nd, it);
-+}
-+
- int path_walk(const char * name, struct nameidata *nd)
- {
-       current->total_link_count = 0;
--      return link_path_walk(name, nd);
-+      return link_path_walk_it(name, nd, NULL);
- }
- /* SMP-safe */
-@@ -742,7 +785,8 @@
-  * needs parent already locked. Doesn't follow mounts.
-  * SMP-safe.
-  */
--struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
-+struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base,
-+                             struct lookup_intent *it)
- {
-       struct dentry * dentry;
-       struct inode *inode;
-@@ -765,13 +809,16 @@
-                       goto out;
-       }
--      dentry = cached_lookup(base, name, 0);
-+      dentry = cached_lookup(base, name, 0, it);
-       if (!dentry) {
-               struct dentry *new = d_alloc(base, name);
-               dentry = ERR_PTR(-ENOMEM);
-               if (!new)
-                       goto out;
-               lock_kernel();
-+              if (inode->i_op->lookup2)
-+                      dentry = inode->i_op->lookup2(inode, new, it);
-+              else
-               dentry = inode->i_op->lookup(inode, new);
-               unlock_kernel();
-               if (!dentry)
-@@ -783,6 +830,12 @@
-       return dentry;
- }
-+struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
-+{
-+      return lookup_hash_it(name, base, NULL);
-+}
-+
-+
- /* SMP-safe */
- struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
- {
-@@ -804,7 +857,7 @@
-       }
-       this.hash = end_name_hash(hash);
--      return lookup_hash(&this, base);
-+      return lookup_hash_it(&this, base, NULL);
- access:
-       return ERR_PTR(-EACCES);
- }
-@@ -836,6 +889,23 @@
-       return err;
- }
-+int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd,
-+                 struct lookup_intent *it)
-+{
-+      char *tmp;
-+      int err;
-+
-+      tmp = getname(name);
-+      err = PTR_ERR(tmp);
-+      if (!IS_ERR(tmp)) {
-+              err = 0;
-+              if (path_init(tmp, flags, nd))
-+                      err = path_walk_it(tmp, nd, it);
-+              putname(tmp);
-+      }
-+      return err;
-+}
-+
- /*
-  * It's inline, so penalty for filesystems that don't use sticky bit is
-  * minimal.
-@@ -970,7 +1040,8 @@
-  * for symlinks (where the permissions are checked later).
-  * SMP-safe
-  */
--int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
-+int open_namei_it(const char *pathname, int flag, int mode,
-+                struct nameidata *nd, struct lookup_intent *it)
- {
-       int acc_mode, error = 0;
-       struct inode *inode;
-@@ -985,7 +1056,7 @@
-        */
-       if (!(flag & O_CREAT)) {
-               if (path_init(pathname, lookup_flags(flag), nd))
--                      error = path_walk(pathname, nd);
-+                      error = path_walk_it(pathname, nd, it);
-               if (error)
-                       return error;
-               dentry = nd->dentry;
-@@ -995,6 +1066,10 @@
-       /*
-        * Create - we need to know the parent.
-        */
-+      if (it) {
-+              it->it_mode = mode;
-+              it->it_op |= IT_CREAT;
-+      }
-       if (path_init(pathname, LOOKUP_PARENT, nd))
-               error = path_walk(pathname, nd);
-       if (error)
-@@ -1011,7 +1086,7 @@
-       dir = nd->dentry;
-       down(&dir->d_inode->i_sem);
--      dentry = lookup_hash(&nd->last, nd->dentry);
-+      dentry = lookup_hash_it(&nd->last, nd->dentry, it);
- do_last:
-       error = PTR_ERR(dentry);
-@@ -1020,6 +1095,7 @@
-               goto exit;
-       }
-+      it->it_mode = mode;
-       /* Negative dentry, just create the file */
-       if (!dentry->d_inode) {
-               if (!IS_POSIX_ACL(dir->d_inode))
-@@ -1054,7 +1130,8 @@
-       error = -ENOENT;
-       if (!dentry->d_inode)
-               goto exit_dput;
--      if (dentry->d_inode->i_op && dentry->d_inode->i_op->follow_link)
-+      if (dentry->d_inode->i_op && (dentry->d_inode->i_op->follow_link ||
-+                                    dentry->d_inode->i_op->follow_link2))
-               goto do_link;
-       dput(nd->dentry);
-@@ -1140,8 +1217,10 @@
-       return 0;
- exit_dput:
-+      intent_release(dentry, it);
-       dput(dentry);
- exit:
-+      intent_release(nd->dentry, it);
-       path_release(nd);
-       return error;
-@@ -1160,7 +1239,12 @@
-        * are done. Procfs-like symlinks just set LAST_BIND.
-        */
-       UPDATE_ATIME(dentry->d_inode);
--      error = dentry->d_inode->i_op->follow_link(dentry, nd);
-+      if (dentry->d_inode->i_op->follow_link2)
-+              error = dentry->d_inode->i_op->follow_link2(dentry, nd, it);
-+      else
-+              error = dentry->d_inode->i_op->follow_link(dentry, nd);
-+      if (error)
-+              intent_release(dentry, it);
-       dput(dentry);
-       if (error)
-               return error;
-@@ -1182,13 +1266,20 @@
-       }
-       dir = nd->dentry;
-       down(&dir->d_inode->i_sem);
--      dentry = lookup_hash(&nd->last, nd->dentry);
-+      dentry = lookup_hash_it(&nd->last, nd->dentry, it);
-       putname(nd->last.name);
-       goto do_last;
- }
-+int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd)
-+{
-+      return open_namei_it(pathname, flag, mode, nd, NULL);
-+}
-+
-+
- /* SMP-safe */
--static struct dentry *lookup_create(struct nameidata *nd, int is_dir)
-+static struct dentry *lookup_create(struct nameidata *nd, int is_dir,
-+                                  struct lookup_intent *it)
- {
-       struct dentry *dentry;
-@@ -1196,7 +1287,7 @@
-       dentry = ERR_PTR(-EEXIST);
-       if (nd->last_type != LAST_NORM)
-               goto fail;
--      dentry = lookup_hash(&nd->last, nd->dentry);
-+      dentry = lookup_hash_it(&nd->last, nd->dentry, it);
-       if (IS_ERR(dentry))
-               goto fail;
-       if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
-@@ -1242,6 +1333,7 @@
-       char * tmp;
-       struct dentry * dentry;
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_MKNOD, .it_mode = mode };
-       if (S_ISDIR(mode))
-               return -EPERM;
-@@ -1253,7 +1345,7 @@
-               error = path_walk(tmp, &nd);
-       if (error)
-               goto out;
--      dentry = lookup_create(&nd, 0);
-+      dentry = lookup_create(&nd, 0, &it);
-       error = PTR_ERR(dentry);
-       if (!IS_POSIX_ACL(nd.dentry->d_inode))
-@@ -1272,6 +1364,7 @@
-               default:
-                       error = -EINVAL;
-               }
-+              intent_release(dentry, &it);
-               dput(dentry);
-       }
-       up(&nd.dentry->d_inode->i_sem);
-@@ -1312,6 +1405,7 @@
- {
-       int error = 0;
-       char * tmp;
-+      struct lookup_intent it = { .it_op = IT_MKDIR, .it_mode = mode };
-       tmp = getname(pathname);
-       error = PTR_ERR(tmp);
-@@ -1323,12 +1417,13 @@
-                       error = path_walk(tmp, &nd);
-               if (error)
-                       goto out;
--              dentry = lookup_create(&nd, 1);
-+              dentry = lookup_create(&nd, 1, &it);
-               error = PTR_ERR(dentry);
-               if (!IS_ERR(dentry)) {
-                       if (!IS_POSIX_ACL(nd.dentry->d_inode))
-                               mode &= ~current->fs->umask;
-                       error = vfs_mkdir(nd.dentry->d_inode, dentry, mode);
-+                      intent_release(dentry, &it);
-                       dput(dentry);
-               }
-               up(&nd.dentry->d_inode->i_sem);
-@@ -1410,6 +1505,7 @@
-       char * name;
-       struct dentry *dentry;
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_RMDIR };
-       name = getname(pathname);
-       if(IS_ERR(name))
-@@ -1432,10 +1528,11 @@
-                       goto exit1;
-       }
-       down(&nd.dentry->d_inode->i_sem);
--      dentry = lookup_hash(&nd.last, nd.dentry);
-+      dentry = lookup_hash_it(&nd.last, nd.dentry, &it);
-       error = PTR_ERR(dentry);
-       if (!IS_ERR(dentry)) {
-               error = vfs_rmdir(nd.dentry->d_inode, dentry);
-+              intent_release(dentry, &it);
-               dput(dentry);
-       }
-       up(&nd.dentry->d_inode->i_sem);
-@@ -1479,6 +1576,7 @@
-       char * name;
-       struct dentry *dentry;
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_UNLINK };
-       name = getname(pathname);
-       if(IS_ERR(name))
-@@ -1492,7 +1590,7 @@
-       if (nd.last_type != LAST_NORM)
-               goto exit1;
-       down(&nd.dentry->d_inode->i_sem);
--      dentry = lookup_hash(&nd.last, nd.dentry);
-+      dentry = lookup_hash_it(&nd.last, nd.dentry, &it);
-       error = PTR_ERR(dentry);
-       if (!IS_ERR(dentry)) {
-               /* Why not before? Because we want correct error value */
-@@ -1500,6 +1598,7 @@
-                       goto slashes;
-               error = vfs_unlink(nd.dentry->d_inode, dentry);
-       exit2:
-+              intent_release(dentry, &it);
-               dput(dentry);
-       }
-       up(&nd.dentry->d_inode->i_sem);
-@@ -1546,6 +1645,7 @@
-       int error = 0;
-       char * from;
-       char * to;
-+      struct lookup_intent it = { .it_op = IT_SYMLINK };
-       from = getname(oldname);
-       if(IS_ERR(from))
-@@ -1560,10 +1660,12 @@
-                       error = path_walk(to, &nd);
-               if (error)
-                       goto out;
--              dentry = lookup_create(&nd, 0);
-+              it.it_data = from;
-+              dentry = lookup_create(&nd, 0, &it);
-               error = PTR_ERR(dentry);
-               if (!IS_ERR(dentry)) {
-                       error = vfs_symlink(nd.dentry->d_inode, dentry, from);
-+                      intent_release(dentry, &it);
-                       dput(dentry);
-               }
-               up(&nd.dentry->d_inode->i_sem);
-@@ -1629,6 +1731,7 @@
-       int error;
-       char * from;
-       char * to;
-+      struct lookup_intent it = { .it_op = IT_LINK };
-       from = getname(oldname);
-       if(IS_ERR(from))
-@@ -1641,7 +1744,7 @@
-               error = 0;
-               if (path_init(from, LOOKUP_POSITIVE, &old_nd))
--                      error = path_walk(from, &old_nd);
-+                      error = path_walk_it(from, &old_nd, &it);
-               if (error)
-                       goto exit;
-               if (path_init(to, LOOKUP_PARENT, &nd))
-@@ -1651,10 +1754,12 @@
-               error = -EXDEV;
-               if (old_nd.mnt != nd.mnt)
-                       goto out_release;
--              new_dentry = lookup_create(&nd, 0);
-+              it.it_op = IT_LINK2;
-+              new_dentry = lookup_create(&nd, 0, &it);
-               error = PTR_ERR(new_dentry);
-               if (!IS_ERR(new_dentry)) {
-                       error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
-+                      intent_release(new_dentry, &it);
-                       dput(new_dentry);
-               }
-               up(&nd.dentry->d_inode->i_sem);
-@@ -1697,7 +1802,8 @@
-  *       locking].
-  */
- int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
--             struct inode *new_dir, struct dentry *new_dentry)
-+                 struct inode *new_dir, struct dentry *new_dentry,
-+                 struct lookup_intent *it)
- {
-       int error;
-       struct inode *target;
-@@ -1757,6 +1863,7 @@
-               error = -EBUSY;
-       else 
-               error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-+      intent_release(new_dentry, it);
-       if (target) {
-               if (!error)
-                       target->i_flags |= S_DEAD;
-@@ -1778,7 +1885,8 @@
- }
- int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
--             struct inode *new_dir, struct dentry *new_dentry)
-+                   struct inode *new_dir, struct dentry *new_dentry,
-+                   struct lookup_intent *it)
- {
-       int error;
-@@ -1809,6 +1917,7 @@
-               error = -EBUSY;
-       else
-               error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-+      intent_release(new_dentry, it);
-       double_up(&old_dir->i_zombie, &new_dir->i_zombie);
-       if (error)
-               return error;
-@@ -1820,13 +1929,14 @@
- }
- int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
--             struct inode *new_dir, struct dentry *new_dentry)
-+             struct inode *new_dir, struct dentry *new_dentry,
-+             struct lookup_intent *it)
- {
-       int error;
-       if (S_ISDIR(old_dentry->d_inode->i_mode))
--              error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
-+              error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry,it);
-       else
--              error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
-+              error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,it);
-       if (!error) {
-               if (old_dir == new_dir)
-                       inode_dir_notify(old_dir, DN_RENAME);
-@@ -1843,6 +1953,7 @@
-       int error = 0;
-       struct dentry * old_dir, * new_dir;
-       struct dentry * old_dentry, *new_dentry;
-+      struct lookup_intent it = { .it_op = IT_RENAME };
-       struct nameidata oldnd, newnd;
-       if (path_init(oldname, LOOKUP_PARENT, &oldnd))
-@@ -1871,7 +1982,7 @@
-       double_lock(new_dir, old_dir);
--      old_dentry = lookup_hash(&oldnd.last, old_dir);
-+      old_dentry = lookup_hash_it(&oldnd.last, old_dir, &it);
-       error = PTR_ERR(old_dentry);
-       if (IS_ERR(old_dentry))
-               goto exit3;
-@@ -1887,18 +1998,21 @@
-               if (newnd.last.name[newnd.last.len])
-                       goto exit4;
-       }
--      new_dentry = lookup_hash(&newnd.last, new_dir);
-+      it.it_op = IT_RENAME2;
-+      new_dentry = lookup_hash_it(&newnd.last, new_dir, &it);
-       error = PTR_ERR(new_dentry);
-       if (IS_ERR(new_dentry))
-               goto exit4;
-       lock_kernel();
-       error = vfs_rename(old_dir->d_inode, old_dentry,
--                                 new_dir->d_inode, new_dentry);
-+                                 new_dir->d_inode, new_dentry, &it);
-       unlock_kernel();
-+      intent_release(new_dentry, &it);
-       dput(new_dentry);
- exit4:
-+      intent_release(old_dentry, &it);
-       dput(old_dentry);
- exit3:
-       double_up(&new_dir->d_inode->i_sem, &old_dir->d_inode->i_sem);
-@@ -1947,7 +2061,8 @@
- }
- static inline int
--__vfs_follow_link(struct nameidata *nd, const char *link)
-+__vfs_follow_link(struct nameidata *nd, const char *link,
-+                struct lookup_intent *it)
- {
-       int res = 0;
-       char *name;
-@@ -1960,7 +2075,7 @@
-                       /* weird __emul_prefix() stuff did it */
-                       goto out;
-       }
--      res = link_path_walk(link, nd);
-+      res = link_path_walk_it(link, nd, it);
- out:
-       if (current->link_count || res || nd->last_type!=LAST_NORM)
-               return res;
-@@ -1982,7 +2097,13 @@
- int vfs_follow_link(struct nameidata *nd, const char *link)
- {
--      return __vfs_follow_link(nd, link);
-+      return __vfs_follow_link(nd, link, NULL);
-+}
-+
-+int vfs_follow_link_it(struct nameidata *nd, const char *link,
-+                     struct lookup_intent *it)
-+{
-+      return __vfs_follow_link(nd, link, it);
- }
- /* get the link contents into pagecache */
-@@ -2024,7 +2145,7 @@
- {
-       struct page *page = NULL;
-       char *s = page_getlink(dentry, &page);
--      int res = __vfs_follow_link(nd, s);
-+      int res = __vfs_follow_link(nd, s, NULL);
-       if (page) {
-               kunmap(page);
-               page_cache_release(page);
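
The fs/namei.c hunks above thread a struct lookup_intent through path_walk_it(), lookup_hash_it() and open_namei_it(), and balance every successful lookup with an intent_release() once the VFS operation has finished, so an intent-aware filesystem (Lustre's client, here) can learn during the lookup what the caller is about to do and can then drop whatever lock or handle it acquired. The snippet below is only a user-space sketch of that lifecycle: the field names it_op, it_mode and it_data follow the patch, but the IT_* values, lookup_with_intent() and the it_lock member are illustrative stand-ins, not the kernel definitions.

/* Minimal user-space sketch of the intent lifecycle used above.
 * The IT_* values and the release bookkeeping are illustrative only. */
#include <stdio.h>

#define IT_OPEN    0x0001
#define IT_CREAT   0x0002
#define IT_GETATTR 0x0004

struct lookup_intent {
        int   it_op;    /* what the caller intends to do with the name */
        int   it_mode;  /* creation mode, when IT_CREAT is set */
        void *it_data;  /* op-specific payload (e.g. symlink target) */
        int   it_lock;  /* stand-in for state acquired during lookup */
};

/* An intent-aware filesystem can take locks (or perform the whole
 * operation) during the lookup itself. */
static int lookup_with_intent(const char *name, struct lookup_intent *it)
{
        printf("lookup %s, it_op=%#x, it_mode=%#o\n",
               name, it->it_op, it->it_mode);
        it->it_lock = 1;                /* pretend we took a lock/handle */
        return 0;
}

/* Mirrors intent_release(): every successful intent lookup is balanced
 * by exactly one release once the operation has completed. */
static void intent_release(struct lookup_intent *it)
{
        if (it && it->it_lock) {
                it->it_lock = 0;
                printf("intent released, it_op=%#x\n", it->it_op);
        }
}

int main(void)
{
        struct lookup_intent it = { .it_op = IT_OPEN | IT_CREAT,
                                    .it_mode = 0644 };

        if (lookup_with_intent("/mnt/lustre/file", &it) == 0) {
                /* ... the open/create itself would happen here ... */
                intent_release(&it);
        }
        return 0;
}
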
---- linux-pristine/./fs/open.c Thu Dec  5 10:49:20 2002
-+++ linux/./fs/open.c  Fri Nov 29 18:06:21 2002
-@@ -19,6 +19,9 @@
- #include <asm/uaccess.h>
- #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
-+extern int path_walk_it(const char *name, struct nameidata *nd,
-+                      struct lookup_intent *it);
-+extern void intent_release(struct dentry *de, struct lookup_intent *it);
- int vfs_statfs(struct super_block *sb, struct statfs *buf)
- {
-@@ -94,12 +97,13 @@
-       struct nameidata nd;
-       struct inode * inode;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
-       error = -EINVAL;
-       if (length < 0) /* sorry, but loff_t says... */
-               goto out;
--      error = user_path_walk(path, &nd);
-+      error = user_path_walk_it(path, &nd, &it);
-       if (error)
-               goto out;
-       inode = nd.dentry->d_inode;
-@@ -144,6 +148,7 @@
-       put_write_access(inode);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -235,8 +240,9 @@
-       struct nameidata nd;
-       struct inode * inode;
-       struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (error)
-               goto out;
-       inode = nd.dentry->d_inode;
-@@ -262,6 +268,7 @@
-       }
-       error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -279,8 +286,9 @@
-       struct nameidata nd;
-       struct inode * inode;
-       struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (error)
-               goto out;
-@@ -307,6 +315,7 @@
-       }
-       error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -323,6 +332,7 @@
-       int old_fsuid, old_fsgid;
-       kernel_cap_t old_cap;
-       int res;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       if (mode & ~S_IRWXO)    /* where's F_OK, X_OK, W_OK, R_OK? */
-               return -EINVAL;
-@@ -340,13 +350,14 @@
-       else
-               current->cap_effective = current->cap_permitted;
--      res = user_path_walk(filename, &nd);
-+      res = user_path_walk_it(filename, &nd, &it);
-       if (!res) {
-               res = permission(nd.dentry->d_inode, mode);
-               /* SuS v2 requires we report a read only fs too */
-               if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode)
-                  && !special_file(nd.dentry->d_inode->i_mode))
-                       res = -EROFS;
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-@@ -362,6 +373,7 @@
-       int error;
-       struct nameidata nd;
-       char *name;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       name = getname(filename);
-       error = PTR_ERR(name);
-@@ -370,7 +382,7 @@
-       error = 0;
-       if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd))
--              error = path_walk(name, &nd);
-+              error = path_walk_it(name, &nd, &it);
-       putname(name);
-       if (error)
-               goto out;
-@@ -382,6 +394,7 @@
-       set_fs_pwd(current->fs, nd.mnt, nd.dentry);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -422,6 +435,7 @@
-       int error;
-       struct nameidata nd;
-       char *name;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       name = getname(filename);
-       error = PTR_ERR(name);
-@@ -430,7 +444,7 @@
-       path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW |
-                     LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
--      error = path_walk(name, &nd);   
-+      error = path_walk_it(name, &nd, &it);
-       putname(name);
-       if (error)
-               goto out;
-@@ -447,6 +461,7 @@
-       set_fs_altroot();
-       error = 0;
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -491,8 +506,9 @@
-       struct inode * inode;
-       int error;
-       struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (error)
-               goto out;
-       inode = nd.dentry->d_inode;
-@@ -512,6 +528,7 @@
-       error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -581,10 +598,12 @@
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (!error) {
-               error = chown_common(nd.dentry, user, group);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -594,10 +613,12 @@
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk_link(filename, &nd);
-+      error = user_path_walk_link_it(filename, &nd, &it);
-       if (!error) {
-               error = chown_common(nd.dentry, user, group);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -631,10 +652,16 @@
-  * for the internal routines (ie open_namei()/follow_link() etc). 00 is
-  * used by symlinks.
-  */
-+extern int open_namei_it(const char *filename, int namei_flags, int mode,
-+                       struct nameidata *nd, struct lookup_intent *it);
-+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
-+                          int flags, struct lookup_intent *it);
-+
- struct file *filp_open(const char * filename, int flags, int mode)
- {
-       int namei_flags, error;
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_OPEN };
-       namei_flags = flags;
-       if ((namei_flags+1) & O_ACCMODE)
-@@ -642,14 +669,15 @@
-       if (namei_flags & O_TRUNC)
-               namei_flags |= 2;
--      error = open_namei(filename, namei_flags, mode, &nd);
--      if (!error)
--              return dentry_open(nd.dentry, nd.mnt, flags);
-+      error = open_namei_it(filename, namei_flags, mode, &nd, &it);
-+      if (error)
-+              return ERR_PTR(error);
--      return ERR_PTR(error);
-+      return dentry_open_it(nd.dentry, nd.mnt, flags, &it);
- }
--struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
-+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
-+                          int flags, struct lookup_intent *it)
- {
-       struct file * f;
-       struct inode *inode;
-@@ -692,6 +720,7 @@
-       }
-       f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
-+      intent_release(dentry, it);
-       return f;
- cleanup_all:
-@@ -706,11 +735,17 @@
- cleanup_file:
-       put_filp(f);
- cleanup_dentry:
-+      intent_release(dentry, it);
-       dput(dentry);
-       mntput(mnt);
-       return ERR_PTR(error);
- }
-+struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
-+{
-+      return dentry_open_it(dentry, mnt, flags, NULL);
-+}
-+
- /*
-  * Find an empty file descriptor entry, and mark it busy.
-  */
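
In fs/open.c the patch splits filp_open() and dentry_open() so the intent flows from open_namei_it() into dentry_open_it(), while the original entry points survive as thin wrappers that pass a NULL intent, so unmodified callers keep working. A minimal sketch of that NULL-intent wrapper pattern follows; do_open_it() and do_open() are invented names, not kernel functions.

/* Sketch of the "extended function + NULL-intent wrapper" pattern used
 * for open_namei()/dentry_open() above; names are illustrative. */
#include <stddef.h>
#include <stdio.h>

struct lookup_intent { int it_op; };

static int do_open_it(const char *name, int flags, struct lookup_intent *it)
{
        (void)flags;
        if (it)
                printf("open %s with intent %#x\n", name, it->it_op);
        else
                printf("open %s without intent (legacy caller)\n", name);
        return 0;
}

/* The old entry point keeps its signature and simply passes a NULL
 * intent, exactly like open_namei() and dentry_open() in the patch. */
static int do_open(const char *name, int flags)
{
        return do_open_it(name, flags, NULL);
}

int main(void)
{
        struct lookup_intent it = { .it_op = 0x0001 /* IT_OPEN */ };

        do_open_it("/tmp/a", 0, &it);
        do_open("/tmp/b", 0);
        return 0;
}
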
---- linux-pristine/./fs/stat.c Thu Dec  5 10:49:22 2002
-+++ linux/./fs/stat.c  Fri Nov 29 18:06:21 2002
-@@ -13,6 +13,7 @@
- #include <asm/uaccess.h>
-+extern void intent_release(struct dentry *de, struct lookup_intent *it);
- /*
-  * Revalidate the inode. This is required for proper NFS attribute caching.
-  */
-@@ -135,13 +136,15 @@
- asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf)
- {
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       int error;
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_old_stat(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -151,13 +154,15 @@
- asmlinkage long sys_newstat(char * filename, struct stat * statbuf)
- {
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       int error;
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_new_stat(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -172,13 +177,15 @@
- asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf)
- {
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       int error;
--      error = user_path_walk_link(filename, &nd);
-+      error = user_path_walk_link_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_old_stat(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -189,13 +196,15 @@
- asmlinkage long sys_newlstat(char * filename, struct stat * statbuf)
- {
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       int error;
--      error = user_path_walk_link(filename, &nd);
-+      error = user_path_walk_link_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_new_stat(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -247,20 +256,21 @@
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_READLINK };
-       if (bufsiz <= 0)
-               return -EINVAL;
--      error = user_path_walk_link(path, &nd);
-+      error = user_path_walk_link_it(path, &nd, &it);
-       if (!error) {
-               struct inode * inode = nd.dentry->d_inode;
--
-               error = -EINVAL;
-               if (inode->i_op && inode->i_op->readlink &&
-                   !(error = do_revalidate(nd.dentry))) {
-                       UPDATE_ATIME(inode);
-                       error = inode->i_op->readlink(nd.dentry, buf, bufsiz);
-               }
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -333,12 +343,14 @@
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_new_stat64(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -348,12 +360,14 @@
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
--      error = user_path_walk_link(filename, &nd);
-+      error = user_path_walk_link_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_new_stat64(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
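
Every syscall touched in fs/stat.c repeats the same sequence: declare an IT_GETATTR or IT_READLINK intent on the stack, walk the path with user_path_walk_it() or user_path_walk_link_it(), do the work, then call intent_release() before path_release(). The sketch below factors that sequence into a single helper purely to make the pattern visible; the patch itself deliberately open-codes it in every syscall, and with_intent(), the toy nameidata and the stub release functions here are not kernel code.

/* Illustrative helper capturing the walk/work/release sequence repeated
 * in the sys_stat()/sys_lstat()/sys_readlink() hunks above. */
#include <stdio.h>

struct lookup_intent { int it_op; };
struct nameidata     { const char *path; };

static int user_path_walk_it(const char *path, struct nameidata *nd,
                             struct lookup_intent *it)
{
        nd->path = path;
        printf("walk %s, it_op=%#x\n", path, it->it_op);
        return 0;
}

static void intent_release(struct lookup_intent *it) { (void)it; }
static void path_release(struct nameidata *nd)       { (void)nd; }

/* One place for the boilerplate: walk, run the work callback, release. */
static int with_intent(const char *path, int op,
                       int (*work)(struct nameidata *))
{
        struct lookup_intent it = { .it_op = op };
        struct nameidata nd;
        int err = user_path_walk_it(path, &nd, &it);

        if (!err) {
                err = work(&nd);
                intent_release(&it);
                path_release(&nd);
        }
        return err;
}

static int do_stat(struct nameidata *nd)
{
        printf("stat %s\n", nd->path);
        return 0;
}

int main(void)
{
        return with_intent("/etc/hosts", 0x0004 /* IT_GETATTR */, do_stat);
}
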
---- linux-pristine/./mm/slab.c Thu Dec  5 10:50:02 2002
-+++ linux/./mm/slab.c  Fri Nov 29 18:06:21 2002
-@@ -1187,6 +1187,59 @@
-  * Called with the cache-lock held.
-  */
-+extern struct page *check_get_page(unsigned long kaddr);
-+struct page *page_mem_map(struct page *page);
-+static int kmem_check_cache_obj (kmem_cache_t * cachep,
-+                               slab_t *slabp, void * objp)
-+{
-+      int i;
-+      unsigned int objnr;
-+
-+#if DEBUG
-+      if (cachep->flags & SLAB_RED_ZONE) {
-+              objp -= BYTES_PER_WORD;
-+              if ( *(unsigned long *)objp != RED_MAGIC2)
-+                      /* Either write before start, or a double free. */
-+                      return 0;
-+              if (*(unsigned long *)(objp+cachep->objsize -
-+                              BYTES_PER_WORD) != RED_MAGIC2)
-+                      /* Either write past end, or a double free. */
-+                      return 0;
-+      }
-+#endif
-+
-+      objnr = (objp-slabp->s_mem)/cachep->objsize;
-+      if (objnr >= cachep->num)
-+              return 0;
-+      if (objp != slabp->s_mem + objnr*cachep->objsize)
-+              return 0;
-+
-+      /* Check slab's freelist to see if this obj is there. */
-+      for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
-+              if (i == objnr)
-+                      return 0;
-+      }
-+      return 1;
-+}
-+
-+
-+int kmem_cache_validate(kmem_cache_t *cachep, void *objp)
-+{
-+      struct page *page = check_get_page((unsigned long)objp);
-+
-+      if (!VALID_PAGE(page))
-+              return 0;
-+
-+      if (!PageSlab(page))
-+              return 0;
-+
-+      /* XXX check for freed slab objects ? */
-+      if (!kmem_check_cache_obj(cachep, GET_PAGE_SLAB(page), objp))
-+              return 0;
-+
-+      return (cachep == GET_PAGE_CACHE(page));
-+}
-+
- #if DEBUG
- static int kmem_extra_free_checks (kmem_cache_t * cachep,
-                       slab_t *slabp, void * objp)
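
kmem_cache_validate(), added to mm/slab.c above, decides whether a pointer really is a live object of the given cache: the backing page must be a slab page, the pointer must land exactly on an object boundary inside that slab, and the object must not appear on the slab's freelist. The user-space sketch below reproduces just the index and freelist checks from kmem_check_cache_obj() for a flat array pool; there is no struct page, red-zoning or GET_PAGE_CACHE() comparison here, and pool_validate() is an invented name.

/* User-space analogue of the kmem_check_cache_obj() logic above:
 * verify alignment within the pool and absence from the freelist. */
#include <stddef.h>
#include <stdio.h>

#define NOBJ     8
#define OBJSIZE  32
#define FREE_END (-1)

static char mem[NOBJ * OBJSIZE];
static int  freelist[NOBJ];            /* index chain, like slab_bufctl() */
static int  free_head = FREE_END;

static int pool_validate(const void *objp)
{
        ptrdiff_t off = (const char *)objp - mem;
        int objnr, i;

        if (off < 0 || off >= (ptrdiff_t)sizeof(mem))
                return 0;                       /* not from this pool */
        objnr = off / OBJSIZE;
        if ((const char *)objp != mem + objnr * OBJSIZE)
                return 0;                       /* misaligned pointer */
        for (i = free_head; i != FREE_END; i = freelist[i])
                if (i == objnr)
                        return 0;               /* object is on the freelist */
        return 1;
}

int main(void)
{
        free_head = 3;                          /* pretend object 3 was freed */
        freelist[3] = FREE_END;

        printf("obj 2 valid:   %d\n", pool_validate(mem + 2 * OBJSIZE)); /* 1 */
        printf("obj 3 valid:   %d\n", pool_validate(mem + 3 * OBJSIZE)); /* 0 */
        printf("odd ptr valid: %d\n", pool_validate(mem + 5));           /* 0 */
        return 0;
}
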
diff --git a/lustre/kernel_patches/patches/tcp-zero-copy.patch b/lustre/kernel_patches/patches/tcp-zero-copy.patch
new file mode 100644 (file)
index 0000000..7176eca
--- /dev/null
@@ -0,0 +1,455 @@
+diff -u -r1.1.1.1 linux/include/linux/skbuff.h
+--- linux/include/linux/skbuff.h       2 Aug 2002 10:59:25 -0000       1.1.1.1
++++ linux/include/linux/skbuff.h       2 Aug 2002 14:20:00 -0000
+@@ -116,6 +116,30 @@
+       __u16 size;
+ };
+
++/* Support for callback when skb data has been released */
++typedef struct zccd                           /* Zero Copy Callback Descriptor */
++{                                             /* (embed as first member of custom struct) */
++      atomic_t        zccd_count;             /* reference count */
++      void           (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
++} zccd_t;
++
++static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
++{
++      atomic_set (&d->zccd_count, 1);
++      d->zccd_destructor = callback;
++}
++
++static inline void zccd_get (zccd_t *d)               /* take a reference */
++{
++      atomic_inc (&d->zccd_count);
++}
++
++static inline void zccd_put (zccd_t *d)               /* release a reference */
++{
++      if (atomic_dec_and_test (&d->zccd_count))
++              (d->zccd_destructor)(d);
++}
++
+ /* This data is invariant across clones and lives at
+  * the end of the header data, ie. at skb->end.
+  */
+@@ -123,6 +147,12 @@
+       atomic_t        dataref;
+       unsigned int    nr_frags;
+       struct sk_buff  *frag_list;
++      zccd_t          *zccd;                  /* zero copy descriptor */
++      zccd_t          *zccd2;                 /* 2nd zero copy descriptor */
++      /* NB we expect zero-copy data to be at least 1 packet, so
++       * having 2 zccds means we don't unnecessarily split the packet
++       * where consecutive zero-copy sends abut.
++       */
+       skb_frag_t      frags[MAX_SKB_FRAGS];
+ };
+
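
The zccd_t introduced above is a small reference-counted completion descriptor: the sender embeds it as the first member of its own structure, initialises it with a destructor, and every sk_buff whose data references the zero-copy pages holds a reference; when the last one is dropped, zccd_put() runs the destructor and the sender knows the pages may be reused. The program below renders that lifecycle in user space, substituting C11 atomics for the kernel's atomic_t; struct my_tx and tx_done() are invented for the example.

/* User-space rendering of the zccd reference-count lifecycle from the
 * skbuff.h hunk above; atomic_t is replaced by C11 atomics. */
#include <stdatomic.h>
#include <stdio.h>

typedef struct zccd {
        atomic_int zccd_count;
        void (*zccd_destructor)(struct zccd *);
} zccd_t;

static void zccd_init(zccd_t *d, void (*cb)(zccd_t *))
{
        atomic_init(&d->zccd_count, 1);         /* the caller's reference */
        d->zccd_destructor = cb;
}

static void zccd_get(zccd_t *d) { atomic_fetch_add(&d->zccd_count, 1); }

static void zccd_put(zccd_t *d)
{
        if (atomic_fetch_sub(&d->zccd_count, 1) == 1)
                d->zccd_destructor(d);          /* last reference gone */
}

/* Embed the zccd as the first member, as the patch comment says, so the
 * destructor can recover the enclosing structure with a simple cast. */
struct my_tx {
        zccd_t      zccd;
        const char *buffer;                     /* pages being sent zero-copy */
};

static void tx_done(zccd_t *d)
{
        struct my_tx *tx = (struct my_tx *)d;   /* first-member cast */
        printf("all skbs released %s, buffer reusable\n", tx->buffer);
}

int main(void)
{
        struct my_tx tx = { .buffer = "payload" };

        zccd_init(&tx.zccd, tx_done);
        zccd_get(&tx.zccd);                     /* skb 1 references the pages */
        zccd_get(&tx.zccd);                     /* skb 2 references the pages */

        zccd_put(&tx.zccd);                     /* sender drops its reference */
        zccd_put(&tx.zccd);                     /* skb 1 data released */
        zccd_put(&tx.zccd);                     /* skb 2 released -> tx_done() */
        return 0;
}
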
+diff -u -r1.1.1.1 linux/include/net/tcp.h
+--- linux/include/net/tcp.h    2 Aug 2002 10:59:29 -0000       1.1.1.1
++++ linux/include/net/tcp.h    2 Aug 2002 14:03:49 -0000
+@@ -639,6 +639,8 @@
+
+ extern int                    tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
+ extern ssize_t                        tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
++extern ssize_t                        tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
++                                                int flags, zccd_t *zccd);
+
+ extern int                    tcp_ioctl(struct sock *sk,
+                                         int cmd,
+@@ -732,6 +734,9 @@
+                                           struct msghdr *msg,
+                                           int len, int nonblock,
+                                           int flags, int *addr_len);
++extern int                    tcp_recvpackets(struct sock *sk,
++                                              struct sk_buff_head *packets,
++                                              int len, int nonblock);
+
+ extern int                    tcp_listen_start(struct sock *sk);
+
+diff -u -r1.1.1.1 linux/net/netsyms.c
+--- linux/net/netsyms.c        2 Aug 2002 10:59:31 -0000       1.1.1.1
++++ linux/net/netsyms.c        2 Aug 2002 14:21:31 -0000
+@@ -395,6 +395,8 @@
+ EXPORT_SYMBOL(sysctl_tcp_ecn);
+ EXPORT_SYMBOL(tcp_cwnd_application_limited);
+ EXPORT_SYMBOL(tcp_sendpage);
++EXPORT_SYMBOL(tcp_sendpage_zccd);
++EXPORT_SYMBOL(tcp_recvpackets);
+
+ EXPORT_SYMBOL(tcp_write_xmit);
+
+diff -u -r1.1.1.1 linux/net/core/skbuff.c
+--- linux/net/core/skbuff.c    2 Aug 2002 10:59:32 -0000       1.1.1.1
++++ linux/net/core/skbuff.c    2 Aug 2002 14:07:13 -0000
+@@ -208,6 +208,8 @@
+       atomic_set(&(skb_shinfo(skb)->dataref), 1);
+       skb_shinfo(skb)->nr_frags = 0;
+       skb_shinfo(skb)->frag_list = NULL;
++      skb_shinfo(skb)->zccd = NULL;           /* skbuffs kick off with NO user zero copy descriptors */
++      skb_shinfo(skb)->zccd2 = NULL;
+       return skb;
+
+ nodata:
+@@ -276,6 +278,10 @@
+ {
+       if (!skb->cloned ||
+           atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
++              if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
++                      zccd_put (skb_shinfo(skb)->zccd); /* release hold */
++              if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
++                      zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
+               if (skb_shinfo(skb)->nr_frags) {
+                       int i;
+                       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+@@ -532,6 +538,8 @@
+       atomic_set(&(skb_shinfo(skb)->dataref), 1);
+       skb_shinfo(skb)->nr_frags = 0;
+       skb_shinfo(skb)->frag_list = NULL;
++      skb_shinfo(skb)->zccd = NULL;           /* copied data => no user zero copy descriptor */
++      skb_shinfo(skb)->zccd2 = NULL;
+
+       /* We are no longer a clone, even if we were. */
+       skb->cloned = 0;
+@@ -577,6 +585,14 @@
+
+       n->data_len = skb->data_len;
+       n->len = skb->len;
++
++      if (skb_shinfo(skb)->zccd != NULL)      /* user zero copy descriptor? */
++              zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
++      skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
++
++      if (skb_shinfo(skb)->zccd2 != NULL)     /* 2nd user zero copy descriptor? */
++              zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
++      skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
+
+       if (skb_shinfo(skb)->nr_frags) {
+               int i;
+@@ -620,6 +636,8 @@
+       u8 *data;
+       int size = nhead + (skb->end - skb->head) + ntail;
+       long off;
++      zccd_t *zccd = skb_shinfo(skb)->zccd;   /* stash user zero copy descriptor */
++      zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
+
+       if (skb_shared(skb))
+               BUG();
+@@ -641,6 +659,11 @@
+       if (skb_shinfo(skb)->frag_list)
+               skb_clone_fraglist(skb);
+
++      if (zccd != NULL)                       /* user zero copy descriptor? */
++              zccd_get (zccd);                /* extra ref (pages are shared) */
++      if (zccd2 != NULL)                      /* 2nd user zero copy descriptor? */
++              zccd_get (zccd2);               /* extra ref (pages are shared) */
++
+       skb_release_data(skb);
+
+       off = (data+nhead) - skb->head;
+@@ -655,6 +678,8 @@
+       skb->nh.raw += off;
+       skb->cloned = 0;
+       atomic_set(&skb_shinfo(skb)->dataref, 1);
++      skb_shinfo(skb)->zccd = zccd;
++      skb_shinfo(skb)->zccd2 = zccd2;
+       return 0;
+
+ nodata:
+diff -u -r1.1.1.1 linux/net/ipv4/tcp.c
+--- linux/net/ipv4/tcp.c       2 Aug 2002 10:59:34 -0000       1.1.1.1
++++ linux/net/ipv4/tcp.c       2 Aug 2002 14:36:30 -0000
+@@ -745,7 +745,7 @@
+       goto out;
+ }
+
+-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd);
+
+ static inline int
+ can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
+@@ -824,7 +824,8 @@
+       return err;
+ }
+
+-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
++/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd)
+ {
+       struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+       int mss_now;
+@@ -872,6 +873,17 @@
+                       copy = size;
+
+               i = skb_shinfo(skb)->nr_frags;
++
++              if (zccd != NULL &&             /* this is a zcc I/O */
++                  skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
++                  skb_shinfo(skb)->zccd2 != NULL &&
++                  skb_shinfo(skb)->zccd != zccd && /* not the same one */
++                  skb_shinfo(skb)->zccd2 != zccd)
++              {
++                      tcp_mark_push (tp, skb);
++                      goto new_segment;
++              }
++
+               if (can_coalesce(skb, i, page, offset)) {
+                       skb_shinfo(skb)->frags[i-1].size += copy;
+               } else if (i < MAX_SKB_FRAGS) {
+@@ -881,6 +893,20 @@
+                       tcp_mark_push(tp, skb);
+                       goto new_segment;
+               }
++
++              if (zccd != NULL &&     /* this is a zcc I/O */
++                  skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
++                  skb_shinfo(skb)->zccd2 != zccd)
++              {
++                      zccd_get (zccd);        /* bump ref count */
++
++                      BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
++
++                      if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
++                              skb_shinfo(skb)->zccd = zccd;
++                      else
++                              skb_shinfo(skb)->zccd2 = zccd;
++              }
+
+               skb->len += copy;
+               skb->data_len += copy;
+@@ -945,7 +971,31 @@
+
+       lock_sock(sk);
+       TCP_CHECK_TIMER(sk);
+-      res = do_tcp_sendpages(sk, &page, offset, size, flags);
++      res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL);
++      TCP_CHECK_TIMER(sk);
++      release_sock(sk);
++      return res;
++}
++
++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
++                        int flags, zccd_t *zccd)
++{
++      ssize_t res;
++      struct sock *sk = sock->sk;
++
++#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
++
++      if (!(sk->route_caps & NETIF_F_SG) ||   /* caller shouldn't waste her time */
++          !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
++              BUG ();
++
++#undef TCP_ZC_CSUM_FLAGS
++
++      lock_sock(sk);
++      TCP_CHECK_TIMER(sk);
++
++      res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
++
+       TCP_CHECK_TIMER(sk);
+       release_sock(sk);
+       return res;
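
Inside do_tcp_sendpages() the patch must decide, per sk_buff, whether the caller's zccd can be attached to the segment being built: an skb can reference at most two distinct descriptors (zccd and zccd2), so if both slots are already occupied by other descriptors the data is pushed into a new segment, otherwise the descriptor takes a reference and fills a free slot. The toy function below condenses that decision; toy_skb and skb_attach_zccd() are illustrative only, and the real code also calls zccd_get() at the point marked in the comment.

/* Condensed sketch of the "attach this zccd to the skb, or start a new
 * segment" decision from the do_tcp_sendpages() hunk above. */
#include <stddef.h>
#include <stdio.h>

typedef struct zccd { int id; } zccd_t;         /* refcounting elided here */

struct toy_skb {
        zccd_t *zccd;   /* first zero-copy descriptor */
        zccd_t *zccd2;  /* second, so abutting sends can share one skb */
};

/* Returns 1 if the send must go into a new segment, 0 if the zccd is
 * (or already was) attached to this skb. */
static int skb_attach_zccd(struct toy_skb *skb, zccd_t *zccd)
{
        if (skb->zccd == zccd || skb->zccd2 == zccd)
                return 0;                       /* already referenced */
        if (skb->zccd != NULL && skb->zccd2 != NULL)
                return 1;                       /* both slots busy: split */
        /* zccd_get(zccd) would bump the reference count here */
        if (skb->zccd == NULL)
                skb->zccd = zccd;
        else
                skb->zccd2 = zccd;
        return 0;
}

int main(void)
{
        struct toy_skb skb = { NULL, NULL };
        zccd_t a = { 1 }, b = { 2 }, c = { 3 };

        printf("attach a: new segment? %d\n", skb_attach_zccd(&skb, &a)); /* 0 */
        printf("attach b: new segment? %d\n", skb_attach_zccd(&skb, &b)); /* 0 */
        printf("attach a again:        %d\n", skb_attach_zccd(&skb, &a)); /* 0 */
        printf("attach c: new segment? %d\n", skb_attach_zccd(&skb, &c)); /* 1 */
        return 0;
}
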
+@@ -1767,6 +1817,202 @@
+ recv_urg:
+       err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
+       goto out;
++}
++
++int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
++                   int len, int nonblock)
++{
++      struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
++      int copied;
++      long timeo;
++
++      BUG_TRAP (len > 0);
++      /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
++
++      lock_sock(sk);
++
++      TCP_CHECK_TIMER(sk);
++
++      copied = -ENOTCONN;
++      if (sk->state == TCP_LISTEN)
++              goto out;
++
++      copied = 0;
++      timeo = sock_rcvtimeo(sk, nonblock);
++
++      do {
++              struct sk_buff * skb;
++              u32 offset;
++              unsigned long used;
++              int exhausted;
++              int eaten;
++
++              /* Are we at urgent data? Stop if we have read anything. */
++              if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
++                      break;
++
++              /* We need to check signals first, to get correct SIGURG
++               * handling. FIXME: Need to check this doesn't impact 1003.1g
++               * and move it down to the bottom of the loop
++               */
++              if (signal_pending(current)) {
++                      if (copied)
++                              break;
++                      copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
++                      break;
++              }
++
++              /* Next get a buffer. */
++
++              skb = skb_peek(&sk->receive_queue);
++
++              if (skb == NULL)                /* nothing ready */
++              {
++                      if (copied) {
++                              if (sk->err ||
++                                  sk->state == TCP_CLOSE ||
++                                  (sk->shutdown & RCV_SHUTDOWN) ||
++                                  !timeo ||
++                                  (0))
++                                      break;
++                      } else {
++                              if (sk->done)
++                                      break;
++
++                              if (sk->err) {
++                                      copied = sock_error(sk);
++                                      break;
++                              }
++
++                              if (sk->shutdown & RCV_SHUTDOWN)
++                                      break;
++
++                              if (sk->state == TCP_CLOSE) {
++                                      if (!sk->done) {
++                                              /* This occurs when user tries to read
++                                               * from never connected socket.
++                                               */
++                                              copied = -ENOTCONN;
++                                              break;
++                                      }
++                                      break;
++                              }
++
++                              if (!timeo) {
++                                      copied = -EAGAIN;
++                                      break;
++                              }
++                      }
++
++                      cleanup_rbuf(sk, copied);
++                      timeo = tcp_data_wait(sk, timeo);
++                      continue;
++              }
++
++              BUG_TRAP (atomic_read (&skb->users) == 1);
++
++              exhausted = eaten = 0;
++
++              offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
++              if (skb->h.th->syn)
++                      offset--;
++
++              used = skb->len - offset;
++
++              if (tp->urg_data) {
++                      u32 urg_offset = tp->urg_seq - tp->copied_seq;
++                      if (urg_offset < used) {
++                              if (!urg_offset) { /* at urgent data */
++                                      if (!sk->urginline) {
++                                              tp->copied_seq++; /* discard the single byte of urgent data */
++                                              offset++;
++                                              used--;
++                                      }
++                              } else          /* truncate read */
++                                      used = urg_offset;
++                      }
++              }
++
++              BUG_TRAP (used >= 0);
++              if (len < used)
++                      used = len;
++
++              if (used == 0)
++                      exhausted = 1;
++              else
++              {
++                      if (skb_is_nonlinear (skb))
++                      {
++                              int   rc = skb_linearize (skb, GFP_KERNEL);
++
++                              printk ("tcp_recvpackets(): linearising: %d\n", rc);
++
++                              if (rc)
++                              {
++                                      if (!copied)
++                                              copied = rc;
++                                      break;
++                              }
++                      }
++
++                      if ((offset + used) == skb->len) /* consuming the whole packet */
++                      {
++                              __skb_unlink (skb, &sk->receive_queue);
++                              dst_release (skb->dst);
++                              skb_orphan (skb);
++                              __skb_pull (skb, offset);
++                              __skb_queue_tail (packets, skb);
++                              exhausted = eaten = 1;
++                      }
++                      else                    /* consuming only part of the packet */
++                      {
++                              struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
++
++                              if (skb2 == NULL)
++                              {
++                                      if (!copied)
++                                              copied = -ENOMEM;
++                                      break;
++                              }
++
++                              dst_release (skb2->dst);
++                              __skb_pull (skb2, offset);
++                              __skb_trim (skb2, used);
++                              __skb_queue_tail (packets, skb2);
++                      }
++
++                      tp->copied_seq += used;
++                      copied += used;
++                      len -= used;
++              }
++
++              if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
++                      tp->urg_data = 0;
++                      tcp_fast_path_check(sk, tp);
++              }
++
++              if (!exhausted)
++                      continue;
++
++              if (skb->h.th->fin)
++              {
++                      tp->copied_seq++;
++                      if (!eaten)
++                              tcp_eat_skb (sk, skb);
++                      break;
++              }
++
++              if (!eaten)
++                      tcp_eat_skb (sk, skb);
++
++      } while (len > 0);
++
++ out:
++      /* Clean up data we have read: This will do ACK frames. */
++      cleanup_rbuf(sk, copied);
++      TCP_CHECK_TIMER(sk);
++      release_sock(sk);
++      return copied;
+ }
+
+ /*
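
tcp_recvpackets() above avoids copying payload into a user buffer by queueing sk_buffs on the caller's sk_buff_head: when the remaining length budget covers the rest of a packet, the skb is unlinked from the socket, pulled past the already-consumed offset and queued whole; otherwise a clone is pulled and trimmed to the consumed range and queued, while the original stays on the receive queue. The sketch below replays that consume-whole versus clone-and-trim decision with a toy buffer type; recv_one(), clone_pull_trim() and toy_skb are invented for the illustration, and there is no socket locking or urgent-data handling.

/* Toy replay of the consume-whole vs clone-and-trim decision inside
 * tcp_recvpackets() above. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_skb {
        size_t len;
        char   data[64];
};

static struct toy_skb *clone_pull_trim(const struct toy_skb *skb,
                                       size_t offset, size_t used)
{
        struct toy_skb *clone = calloc(1, sizeof(*clone));

        if (!clone)
                return NULL;
        memcpy(clone->data, skb->data + offset, used);  /* __skb_pull + __skb_trim */
        clone->len = used;
        return clone;
}

/* Returns the buffer to hand to the caller; *whole says whether the
 * original was consumed entirely (and so would be unlinked and queued). */
static struct toy_skb *recv_one(struct toy_skb *skb, size_t offset,
                                size_t budget, int *whole)
{
        size_t used = skb->len - offset;

        if (used > budget)
                used = budget;
        *whole = (offset + used == skb->len);
        if (*whole) {
                memmove(skb->data, skb->data + offset, used); /* __skb_pull */
                skb->len = used;
                return skb;
        }
        return clone_pull_trim(skb, offset, used);
}

int main(void)
{
        struct toy_skb pkt = { .len = 10 };
        struct toy_skb *out;
        int whole;

        memcpy(pkt.data, "0123456789", 10);

        out = recv_one(&pkt, 0, 4, &whole);     /* partial: trimmed clone */
        printf("%zu bytes \"%.*s\", whole=%d\n",
               out->len, (int)out->len, out->data, whole);
        if (!whole)
                free(out);

        out = recv_one(&pkt, 4, 16, &whole);    /* rest: original, pulled */
        printf("%zu bytes \"%.*s\", whole=%d\n",
               out->len, (int)out->len, out->data, whole);
        return 0;
}
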
diff --git a/lustre/kernel_patches/patches/uml-patch-2.4.20-4.patch b/lustre/kernel_patches/patches/uml-patch-2.4.20-4.patch
new file mode 100644 (file)
index 0000000..b35fee0
--- /dev/null
@@ -0,0 +1,39358 @@
+diff -Naur -X ../exclude-files orig/CREDITS um/CREDITS
+--- orig/CREDITS       Thu Feb 27 13:04:11 2003
++++ um/CREDITS Thu Feb 27 13:05:17 2003
+@@ -432,6 +432,7 @@
+ E: lars@nocrew.org
+ W: http://lars.nocrew.org/
+ D: dsp56k device driver
++D: ptrace proxy in user mode kernel port
+ S: Kopmansg 2
+ S: 411 13  Goteborg
+ S: Sweden
+@@ -721,7 +722,7 @@
+ E: jdike@karaya.com
+ W: http://user-mode-linux.sourceforge.net
+ D: User mode kernel port
+-S: RR1 Box 67C
++S: 375 Tubbs Hill Rd
+ S: Deering NH 03244
+ S: USA
+diff -Naur -X ../exclude-files orig/Documentation/Configure.help um/Documentation/Configure.help
+--- orig/Documentation/Configure.help  Thu Feb 27 13:04:11 2003
++++ um/Documentation/Configure.help    Thu Feb 27 13:05:17 2003
+@@ -14690,19 +14690,23 @@
+   The module will be called dsbr100.o. If you want to compile it as a
+   module, say M here and read <file:Documentation/modules.txt>.
+-Always do synchronous disk IO for UBD
+-CONFIG_BLK_DEV_UBD_SYNC
++CONFIG_BLK_DEV_UBD
+   The User-Mode Linux port includes a driver called UBD which will let
+   you access arbitrary files on the host computer as block devices.
+-  Writes to such a block device are not immediately written to the
+-  host's disk; this may cause problems if, for example, the User-Mode
+-  Linux 'Virtual Machine' uses a journalling file system and the host
+-  computer crashes.
++  Unless you know that you do not need such virtual block devices say
++  Y here.
++
++Always do synchronous disk IO for UBD
++CONFIG_BLK_DEV_UBD_SYNC
++  Writes to the virtual block device are not immediately written to the host's
++  disk; this may cause problems if, for example, the User-Mode Linux
++  'Virtual Machine' uses a journalling filesystem and the host computer
++  crashes.
+   Synchronous operation (i.e. always writing data to the host's disk
+   immediately) is configurable on a per-UBD basis by using a special
+   kernel command line option.  Alternatively, you can say Y here to
+-  turn on synchronous operation by default for all block.
++  turn on synchronous operation by default for all block devices.
+   If you're running a journalling file system (like reiserfs, for
+   example) in your virtual machine, you will want to say Y here.  If
+@@ -14714,6 +14718,7 @@
+ CONFIG_PT_PROXY
+   This option enables a debugging interface which allows gdb to debug
+   the kernel without needing to actually attach to kernel threads.
++  CONFIG_XTERM_CHAN must be enabled in order to enable CONFIG_PT_PROXY.
+   If you want to do kernel debugging, say Y here; otherwise say N.
+ Management console
+@@ -14908,25 +14913,173 @@
+ SLIP transport
+ CONFIG_UML_NET_SLIP
+-  The Slip User-Mode Linux network transport allows a running UML to
++  The slip User-Mode Linux network transport allows a running UML to
+   network with its host over a point-to-point link.  Unlike Ethertap,
+   which can carry any Ethernet frame (and hence even non-IP packets),
+-  the Slip transport can only carry IP packets.
++  the slip transport can only carry IP packets.
+-  To use this, your host must support Slip devices.
++  To use this, your host must support slip devices.
+   For more information, see
+   <http://user-mode-linux.sourceforge.net/networking.html>.  That site
+-  has examples of the UML command line to use to enable Slip
++  has examples of the UML command line to use to enable slip
+   networking, and details of a few quirks with it.
+-  The Ethertap Transport is preferred over Slip because of its
+-  limitation.  If you prefer Slip, however, say Y here.  Otherwise
++  The Ethertap Transport is preferred over slip because of its
++  The Ethertap Transport is preferred over slip because of slip's
+   choose the Multicast transport (to network multiple UMLs on 
+   multiple hosts), Ethertap (to network with the host and the
+   outside world), and/or the Daemon transport (to network multiple
+   UMLs on a single host).  You may choose more than one without
+   conflict.  If you don't need UML networking, say N.
++
++SLiRP transport
++CONFIG_UML_NET_SLIRP
++  The SLiRP User-Mode Linux network transport allows a running UML
++  to network by invoking a program that can handle SLIP encapsulated
++  packets.  This is commonly (but not limited to) the application
++  known as SLiRP, a program that can re-socket IP packets back onto
++  the host on which it is run.  Only IP packets are supported,
++  unlike other network transports that can handle all Ethernet
++  frames.  In general, slirp allows the UML the same IP connectivity
++  to the outside world that the host user is permitted, and unlike
++  other transports, SLiRP works without the need of root level
++  privleges, setuid binaries, or SLIP devices on the host.  This
++  privileges, setuid binaries, or SLIP devices on the host.  This
++  situations can be accomodated with carefully crafted slirp
++  situations can be accommodated with carefully crafted slirp
++  setup string.  The effect of this transport on the UML is similar
++  that of a host behind a firewall that masquerades all network
++  to that of a host behind a firewall that masquerades all network
++
++  To use this you should first have slirp compiled somewhere
++  accessible on the host, and have read its documentation.  If you
++  don't need UML networking, say N.
++
++  Startup example: "eth0=slirp,FE:FD:01:02:03:04,/usr/local/bin/slirp"
++
++Default main console channel initialization
++CONFIG_CON_ZERO_CHAN
++  This is the string describing the channel to which the main console
++  will be attached by default.  This value can be overridden from the
++  command line.  The default value is "fd:0,fd:1", which attaches the
++  main console to stdin and stdout.
++  It is safe to leave this unchanged.
++
++Default console channel initialization
++CONFIG_CON_CHAN
++  This is the string describing the channel to which all consoles
++  except the main console will be attached by default.  This value can
++  be overridden from the command line.  The default value is "xterm",
++  which brings them up in xterms.
++  It is safe to leave this unchanged, although you may wish to change
++  this if you expect the UML that you build to be run in environments
++  which don't have X or xterm available.
++
++Default serial line channel initialization
++CONFIG_SSL_CHAN
++  This is the string describing the channel to which the serial lines
++  will be attached by default.  This value can be overridden from the
++  command line.  The default value is "pty", which attaches them to
++  traditional pseudo-terminals.
++  It is safe to leave this unchanged, although you may wish to change
++  this if you expect the UML that you build to be run in environments
++  which don't have a set of /dev/pty* devices.
++
++Nesting level
++CONFIG_NEST_LEVEL
++  This is set to the number of layers of UMLs that this UML will be run
++  in.  Normally, this is zero, meaning that it will run directly on the
++  host.  Setting it to one will build a UML that can run inside a UML
++  that is running on the host.  Generally, if you intend this UML to run
++  inside another UML, set CONFIG_NEST_LEVEL to one more than the host UML.
++  Note that if the hosting UML has its CONFIG_KERNEL_HALF_GIGS set to 
++  greater than one, then the guest UML should have its CONFIG_NEST_LEVEL 
++  set to the host's CONFIG_NEST_LEVEL + CONFIG_KERNEL_HALF_GIGS.
++  Only change this if you are running nested UMLs.
++
++Kernel address space size (in .5G units)
++CONFIG_KERNEL_HALF_GIGS
++  This determines the amount of address space that UML will allocate for
++  its own, measured in half Gigabyte units.  The default is 1.
++  Change this only if you need to boot UML with an unusually large amount
++  of physical memory.
++
++UML sound support
++CONFIG_UML_SOUND
++  This option enables UML sound support.  If enabled, it will pull in
++  soundcore and the UML hostaudio relay, which acts as an intermediary
++  between the host's dsp and mixer devices and the UML sound system.
++  It is safe to say 'Y' here.
++
++UML SMP support
++CONFIG_UML_SMP
++  This option enables UML SMP support.  UML implements virtual SMP by
++  allowing as many processes to run simultaneously on the host as
++  there are virtual processors configured.  Obviously, if the host is
++  a uniprocessor, those processes will timeshare, but, inside UML,
++  will appear to be running simultaneously.  If the host is a
++  multiprocessor, then UML processes may run simultaneously, depending
++  on the host scheduler.
++  CONFIG_SMP will be set to whatever this option is set to.
++  It is safe to leave this unchanged.
++
++file descriptor channel support
++CONFIG_FD_CHAN
++  This option enables support for attaching UML consoles and serial
++  lines to already set up file descriptors.  Generally, the main
++  console is attached to file descriptors 0 and 1 (stdin and stdout),
++  so it would be wise to leave this enabled unless you intend to
++  attach it to some other host device.
++
++null device channel support
++CONFIG_NULL_CHAN
++  This option enables support for attaching UML consoles and serial
++  lines to a device similar to /dev/null.  Data written to it disappears
++  and there is never any data to be read.
++
++port channel support
++CONFIG_PORT_CHAN
++  This option enables support for attaching UML consoles and serial
++  lines to host portals.  They may be accessed with 'telnet <host>
++  <port number>'.  Any number of consoles and serial lines may be
++  attached to a single portal, although what UML device you get when
++  you telnet to that portal will be unpredictable.
++  It is safe to say 'Y' here.
++
++pty channel support
++CONFIG_PTY_CHAN
++  This option enables support for attaching UML consoles and serial
++  lines to host pseudo-terminals.  Access to both traditional
++  pseudo-terminals (/dev/pty*) and pts pseudo-terminals are controlled
++  with this option.  The assignment of UML devices to host devices
++  will be announced in the kernel message log.
++  It is safe to say 'Y' here.
++
++tty channel support
++CONFIG_TTY_CHAN
++  This option enables support for attaching UML consoles and serial
++  lines to host terminals.  Access to both virtual consoles
++  (/dev/tty*) and the slave side of pseudo-terminals (/dev/ttyp* and
++  /dev/pts/*) are controlled by this option.
++  It is safe to say 'Y' here.
++
++xterm channel support
++CONFIG_XTERM_CHAN
++  This option enables support for attaching UML consoles and serial
++  lines to xterms.  Each UML device so assigned will be brought up in
++  its own xterm.
++  If you disable this option, then CONFIG_PT_PROXY will be disabled as
++  well, since UML's gdb currently requires an xterm.
++  It is safe to say 'Y' here.
++
++tty logging
++CONFIG_TTY_LOG
++  This option enables logging of all data going through pseudo-terminals
++  to the host.  This is primarily useful for honeypots, where you want
++  secure keystroke logging that can't be detected or disabled by root.
++  Say 'N' unless you are setting up a UML honeypot or otherwise know that
++  you want this option.
+ Microtek USB scanner support
+ CONFIG_USB_MICROTEK
+diff -Naur -X ../exclude-files orig/MAINTAINERS um/MAINTAINERS
+--- orig/MAINTAINERS   Thu Feb 27 13:04:12 2003
++++ um/MAINTAINERS     Thu Feb 27 13:05:17 2003
+@@ -1841,6 +1841,14 @@
+ L:    linux-usb-devel@lists.sourceforge.net
+ W:    http://usb.in.tum.de
+ S:    Maintained
++
++USER-MODE PORT
++P:    Jeff Dike
++M:    jdike@karaya.com
++L:    user-mode-linux-devel@lists.sourceforge.net
++L:    user-mode-linux-user@lists.sourceforge.net
++W:    http://user-mode-linux.sourceforge.net
++S:    Maintained
+       
+ USB "USBNET" DRIVER
+ P:    David Brownell
+diff -Naur -X ../exclude-files orig/Makefile um/Makefile
+--- orig/Makefile      Thu Feb 27 13:04:12 2003
++++ um/Makefile        Thu Feb 27 13:05:17 2003
+@@ -5,7 +5,15 @@
+ KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
+-ARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ -e s/arm.*/arm/ -e s/sa110/arm/)
++# SUBARCH tells the usermode build what the underlying arch is.  That is set
++# first, and if a usermode build is happening, the "ARCH=um" on the command
++# line overrides the setting of ARCH below.  If a native build is happening,
++# then ARCH is assigned, getting whatever value it gets normally, and 
++# SUBARCH is subsequently ignored.
++
++SUBARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ -e s/arm.*/arm/ -e s/sa110/arm/)
++ARCH := $(SUBARCH)
++
+ KERNELPATH=kernel-$(shell echo $(KERNELRELEASE) | sed -e "s/-//g")
+ CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \
+diff -Naur -X ../exclude-files orig/arch/um/Makefile um/arch/um/Makefile
+--- orig/arch/um/Makefile      Wed Dec 31 19:00:00 1969
++++ um/arch/um/Makefile        Fri Mar 28 21:46:54 2003
+@@ -0,0 +1,168 @@
++# 
++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++OS := $(shell uname -s)
++
++ARCH_DIR = arch/um
++
++core-y := kernel sys-$(SUBARCH) os-$(OS)
++drivers-y := fs drivers
++subdir-y := $(core-y) $(drivers-y)
++SUBDIRS += $(foreach dir,$(subdir-y),$(ARCH_DIR)/$(dir))
++
++CORE_FILES += $(foreach dir,$(core-y),$(ARCH_DIR)/$(dir)/built-in.o)
++DRIVERS += $(foreach dir,$(drivers-y),$(ARCH_DIR)/$(dir)/built-in.o)
++
++include $(ARCH_DIR)/Makefile-$(SUBARCH)
++include $(ARCH_DIR)/Makefile-os-$(OS)
++
++MAKEFILE-$(CONFIG_MODE_TT) += Makefile-tt
++MAKEFILE-$(CONFIG_MODE_SKAS) += Makefile-skas
++
++ifneq ($(MAKEFILE-y),)
++  include $(addprefix $(ARCH_DIR)/,$(MAKEFILE-y))
++endif
++
++EXTRAVERSION := $(EXTRAVERSION)-4um
++
++include/linux/version.h: arch/$(ARCH)/Makefile
++
++# Recalculate MODLIB to reflect the EXTRAVERSION changes (via KERNELRELEASE)
++# The way the toplevel Makefile is written EXTRAVERSION is not supposed
++# to be changed outside the toplevel Makefile, but recalculating MODLIB is
++# a sufficient workaround until we no longer need architecture dependent
++# EXTRAVERSION...
++MODLIB := $(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE)
++
++ifeq ($(CONFIG_DEBUGSYM),y)
++CFLAGS := $(subst -fomit-frame-pointer,,$(CFLAGS))
++endif
++
++CFLAGS-$(CONFIG_DEBUGSYM) += -g
++
++ARCH_INCLUDE = -I$(TOPDIR)/$(ARCH_DIR)/include
++
++# -Derrno=kernel_errno - This turns all kernel references to errno into
++# kernel_errno to separate them from the libc errno.  This allows -fno-common
++# in CFLAGS.  Otherwise, it would cause ld to complain about the two different
++# errnos.
++
++CFLAGS += $(ARCH_CFLAGS) $(CFLAGS-y) -D__arch_um__ -DSUBARCH=\"$(SUBARCH)\" \
++      -D_LARGEFILE64_SOURCE $(ARCH_INCLUDE) -Derrno=kernel_errno \
++      $(MODE_INCLUDE)
++
++LINKFLAGS += -r
++
++LINK_WRAPS = -Wl,--wrap,malloc -Wl,--wrap,free -Wl,--wrap,calloc
++
++SIZE = (($(CONFIG_NEST_LEVEL) + $(CONFIG_KERNEL_HALF_GIGS)) * 0x20000000)
++
++# These aren't in Makefile-tt because they are needed in the !CONFIG_MODE_TT +
++# CONFIG_MODE_SKAS + CONFIG_STATIC_LINK case.
++
++LINK_TT = -static
++LD_SCRIPT_TT := link.ld
++
++ifeq ($(CONFIG_STATIC_LINK),y)
++  LINK-y += $(LINK_TT)
++  LD_SCRIPT-y := $(LD_SCRIPT_TT)
++else
++ifeq ($(CONFIG_MODE_TT),y)
++  LINK-y += $(LINK_TT)
++  LD_SCRIPT-y := $(LD_SCRIPT_TT)
++else
++ifeq ($(CONFIG_MODE_SKAS),y)
++  LINK-y += $(LINK_SKAS)
++  LD_SCRIPT-y := $(LD_SCRIPT_SKAS)
++endif
++endif
++endif
++
++LD_SCRIPT-y := $(ARCH_DIR)/$(LD_SCRIPT-y)
++M4_MODE_TT := $(shell [ "$(CONFIG_MODE_TT)" = "y" ] && echo -DMODE_TT)
++
++$(LD_SCRIPT-y): $(LD_SCRIPT-y).in
++      pages=$$(( 1 << $(CONFIG_KERNEL_STACK_ORDER) )) ; \
++      m4 -DSTART=$$(($(TOP_ADDR) - $(SIZE))) -DELF_ARCH=$(ELF_ARCH) \
++              -DELF_FORMAT=$(ELF_FORMAT) $(M4_MODE_TT) \
++              -DKERNEL_STACK_SIZE=$$(( 4096 * $$pages )) $< > $@
++
++SYMLINK_HEADERS = include/asm-um/archparam.h include/asm-um/system.h \
++      include/asm-um/sigcontext.h include/asm-um/processor.h \
++      include/asm-um/ptrace.h include/asm-um/arch-signal.h
++
++ARCH_SYMLINKS = include/asm-um/arch arch/um/include/sysdep arch/um/os \
++      $(SYMLINK_HEADERS) $(ARCH_DIR)/include/uml-config.h
++
++ifeq ($(CONFIG_MODE_SKAS), y)
++$(SYS_HEADERS) : $(ARCH_DIR)/kernel/skas/include/skas_ptregs.h
++endif
++
++GEN_HEADERS += $(ARCH_DIR)/include/task.h $(ARCH_DIR)/include/kern_constants.h 
++
++setup: $(ARCH_SYMLINKS) $(SYS_HEADERS) $(GEN_HEADERS) 
++
++linux: setup $(ARCH_DIR)/main.o vmlinux $(LD_SCRIPT-y)
++      mv vmlinux vmlinux.o
++      $(CC) -Wl,-T,$(LD_SCRIPT-y) $(LINK-y) $(LINK_WRAPS) \
++              -o linux $(ARCH_DIR)/main.o vmlinux.o -L/usr/lib -lutil
++
++USER_CFLAGS := $(patsubst -I%,,$(CFLAGS))
++USER_CFLAGS := $(patsubst -Derrno=kernel_errno,,$(USER_CFLAGS))
++USER_CFLAGS := $(patsubst -D__KERNEL__,,$(USER_CFLAGS)) $(ARCH_INCLUDE) \
++      $(MODE_INCLUDE)
++
++# To get a definition of F_SETSIG
++USER_CFLAGS += -D_GNU_SOURCE
++
++CLEAN_FILES += linux x.i gmon.out $(ARCH_DIR)/link.ld $(ARCH_DIR)/dyn_link.ld \
++      $(GEN_HEADERS) $(ARCH_DIR)/include/uml-config.h
++
++$(ARCH_DIR)/main.o: $(ARCH_DIR)/main.c
++      $(CC) $(USER_CFLAGS) $(EXTRA_CFLAGS) -c -o $@ $<
++
++archmrproper:
++      rm -f $(SYMLINK_HEADERS) $(ARCH_SYMLINKS) include/asm \
++              $(LD_SCRIPT) $(addprefix $(ARCH_DIR)/kernel/,$(KERN_SYMLINKS))
++
++archclean: sysclean
++      find . \( -name '*.bb' -o -name '*.bbg' -o -name '*.da' \
++              -o -name '*.gcov' \) -type f -print | xargs rm -f
++      cd $(ARCH_DIR) ; \
++      for dir in $(subdir-y) util ; do $(MAKE) -C $$dir clean; done
++
++archdep: 
++
++$(SYMLINK_HEADERS):
++      cd $(TOPDIR)/$(dir $@) ; \
++      ln -sf $(basename $(notdir $@))-$(SUBARCH)$(suffix $@) $(notdir $@)
++
++include/asm-um/arch:
++      cd $(TOPDIR)/include/asm-um && ln -sf ../asm-$(SUBARCH) arch
++
++arch/um/include/sysdep:
++      cd $(TOPDIR)/arch/um/include && ln -sf sysdep-$(SUBARCH) sysdep
++
++arch/um/os:
++      cd $(ARCH_DIR) && ln -sf os-$(OS) os
++
++$(ARCH_DIR)/include/task.h : $(ARCH_DIR)/util/mk_task
++      $< > $@
++
++$(ARCH_DIR)/include/kern_constants.h : $(ARCH_DIR)/util/mk_constants
++      $< > $@
++
++$(ARCH_DIR)/include/uml-config.h : $(TOPDIR)/include/linux/autoconf.h
++      sed 's/ CONFIG/ UML_CONFIG/' $(TOPDIR)/include/linux/autoconf.h > $@
++
++$(ARCH_DIR)/util/mk_task : $(ARCH_DIR)/util/mk_task_user.c \
++      $(ARCH_DIR)/util/mk_task_kern.c $(SYS_HEADERS)
++      $(MAKE) $(MFLAGS) -C $(ARCH_DIR)/util mk_task
++
++$(ARCH_DIR)/util/mk_constants : $(ARCH_DIR)/util/mk_constants_user.c \
++      $(ARCH_DIR)/util/mk_constants_kern.c 
++      $(MAKE) $(MFLAGS) -C $(ARCH_DIR)/util mk_constants
++
++export SUBARCH USER_CFLAGS OS
+diff -Naur -X ../exclude-files orig/arch/um/Makefile-i386 um/arch/um/Makefile-i386
+--- orig/arch/um/Makefile-i386 Wed Dec 31 19:00:00 1969
++++ um/arch/um/Makefile-i386   Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,35 @@
++ifeq ($(CONFIG_HOST_2G_2G), y)
++TOP_ADDR = 0x80000000
++else
++TOP_ADDR = 0xc0000000
++endif
++
++ARCH_CFLAGS = -U__$(SUBARCH)__ -U$(SUBARCH) -DUM_FASTCALL
++ELF_ARCH = $(SUBARCH)
++ELF_FORMAT = elf32-$(SUBARCH)
++
++I386_H = $(ARCH_DIR)/include/sysdep-i386
++SYS = $(ARCH_DIR)/sys-i386
++UTIL = $(SYS)/util
++SUBDIRS += $(UTIL)
++
++SYS_HEADERS = $(I386_H)/sc.h $(I386_H)/thread.h
++
++$(I386_H)/sc.h : $(UTIL)/mk_sc
++      $(UTIL)/mk_sc > $@
++
++$(I386_H)/thread.h : $(UTIL)/mk_thread
++      $(UTIL)/mk_thread > $@
++
++$(UTIL)/mk_sc : $(UTIL)/mk_sc.c
++      $(MAKE) -C $(UTIL) mk_sc
++
++$(UTIL)/mk_thread : $(UTIL)/mk_thread_user.c $(UTIL)/mk_thread_kern.c \
++      $(I386_H)/sc.h
++      $(MAKE) -C $(UTIL) mk_thread
++
++sysclean :
++      rm -f $(SYS_HEADERS)
++      $(MAKE) -C $(UTIL) clean
++      $(MAKE) -C $(SYS) clean
++
+diff -Naur -X ../exclude-files orig/arch/um/Makefile-ia64 um/arch/um/Makefile-ia64
+--- orig/arch/um/Makefile-ia64 Wed Dec 31 19:00:00 1969
++++ um/arch/um/Makefile-ia64   Wed Oct 23 21:08:04 2002
+@@ -0,0 +1 @@
++START_ADDR = 0x1000000000000000
+diff -Naur -X ../exclude-files orig/arch/um/Makefile-os-Linux um/arch/um/Makefile-os-Linux
+--- orig/arch/um/Makefile-os-Linux     Wed Dec 31 19:00:00 1969
++++ um/arch/um/Makefile-os-Linux       Mon Dec  9 14:21:51 2002
+@@ -0,0 +1,7 @@
++# 
++# Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++SUBDIRS += $(ARCH_DIR)/os-$(OS)/drivers
++DRIVERS += $(ARCH_DIR)/os-$(OS)/drivers/drivers.o
+diff -Naur -X ../exclude-files orig/arch/um/Makefile-ppc um/arch/um/Makefile-ppc
+--- orig/arch/um/Makefile-ppc  Wed Dec 31 19:00:00 1969
++++ um/arch/um/Makefile-ppc    Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,9 @@
++ifeq ($(CONFIG_HOST_2G_2G), y)
++START_ADDR = 0x80000000
++else
++START_ADDR = 0xc0000000
++endif
++ARCH_CFLAGS = -U__powerpc__ -D__UM_PPC__
++
++# The arch is ppc, but the elf32 name is powerpc
++ELF_SUBARCH = powerpc
+diff -Naur -X ../exclude-files orig/arch/um/Makefile-skas um/arch/um/Makefile-skas
+--- orig/arch/um/Makefile-skas Wed Dec 31 19:00:00 1969
++++ um/arch/um/Makefile-skas   Sun Dec 15 22:02:57 2002
+@@ -0,0 +1,20 @@
++# 
++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++PROFILE += -pg
++
++CFLAGS-$(CONFIG_GCOV) += -fprofile-arcs -ftest-coverage
++CFLAGS-$(CONFIG_GPROF) += $(PROFILE)
++LINK-$(CONFIG_GPROF) += $(PROFILE)
++
++MODE_INCLUDE += -I$(TOPDIR)/$(ARCH_DIR)/kernel/skas/include
++
++LINK_SKAS = -Wl,-rpath,/lib 
++LD_SCRIPT_SKAS = dyn_link.ld
++
++GEN_HEADERS += $(ARCH_DIR)/kernel/skas/include/skas_ptregs.h
++
++$(ARCH_DIR)/kernel/skas/include/skas_ptregs.h :
++      $(MAKE) -C $(ARCH_DIR)/kernel/skas include/skas_ptregs.h
+diff -Naur -X ../exclude-files orig/arch/um/Makefile-tt um/arch/um/Makefile-tt
+--- orig/arch/um/Makefile-tt   Wed Dec 31 19:00:00 1969
++++ um/arch/um/Makefile-tt     Mon Dec 16 20:22:23 2002
+@@ -0,0 +1,7 @@
++# 
++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++MODE_INCLUDE += -I$(TOPDIR)/$(ARCH_DIR)/kernel/tt/include
++
+diff -Naur -X ../exclude-files orig/arch/um/common.ld.in um/arch/um/common.ld.in
+--- orig/arch/um/common.ld.in  Wed Dec 31 19:00:00 1969
++++ um/arch/um/common.ld.in    Tue Feb  4 19:35:13 2003
+@@ -0,0 +1,53 @@
++  .kstrtab : { *(.kstrtab) }
++
++  . = ALIGN(16);              /* Exception table */
++  __start___ex_table = .;
++  __ex_table : { *(__ex_table) }
++  __stop___ex_table = .;
++
++  __start___ksymtab = .;      /* Kernel symbol table */
++  __ksymtab : { *(__ksymtab) }
++  __stop___ksymtab = .;
++
++  .unprotected : { *(.unprotected) }
++  . = ALIGN(4096);
++  PROVIDE (_unprotected_end = .);
++
++  . = ALIGN(4096);
++  __uml_setup_start = .;
++  .uml.setup.init : { *(.uml.setup.init) }
++  __uml_setup_end = .;
++  __uml_help_start = .;
++  .uml.help.init : { *(.uml.help.init) }
++  __uml_help_end = .;
++  __uml_postsetup_start = .;
++  .uml.postsetup.init : { *(.uml.postsetup.init) }
++  __uml_postsetup_end = .;
++  __setup_start = .;
++  .setup.init : { *(.setup.init) }
++  __setup_end = .;
++  __initcall_start = .;
++  .initcall.init : { *(.initcall.init) }
++  __initcall_end = .;
++  __uml_initcall_start = .;
++  .uml.initcall.init : { *(.uml.initcall.init) }
++  __uml_initcall_end = .;
++  __init_end = .;
++  __exitcall_begin = .;
++  .exitcall : { *(.exitcall.exit) }
++  __exitcall_end = .;
++  __uml_exitcall_begin = .;
++  .uml.exitcall : { *(.uml.exitcall.exit) }
++  __uml_exitcall_end = .;
++
++  __preinit_array_start = .;
++  .preinit_array : { *(.preinit_array) }
++  __preinit_array_end = .;
++  __init_array_start = .;
++  .init_array : { *(.init_array) }
++  __init_array_end = .;
++  __fini_array_start = .;
++  .fini_array : { *(.fini_array) }
++  __fini_array_end = .;
++
++  .data.init : { *(.data.init) }
+diff -Naur -X ../exclude-files orig/arch/um/config.in um/arch/um/config.in
+--- orig/arch/um/config.in     Wed Dec 31 19:00:00 1969
++++ um/arch/um/config.in       Thu Feb 27 13:12:39 2003
+@@ -0,0 +1,104 @@
++define_bool CONFIG_USERMODE y
++
++mainmenu_name "Linux/Usermode Kernel Configuration"
++
++define_bool CONFIG_ISA n
++define_bool CONFIG_SBUS n
++define_bool CONFIG_PCI n
++
++define_bool CONFIG_UID16 y
++
++define_bool CONFIG_RWSEM_XCHGADD_ALGORITHM y
++
++mainmenu_option next_comment
++comment 'Code maturity level options'
++bool 'Prompt for development and/or incomplete code/drivers' CONFIG_EXPERIMENTAL
++endmenu
++
++mainmenu_option next_comment
++comment 'General Setup'
++
++bool 'Separate kernel address space support' CONFIG_MODE_SKAS
++
++# This is to ensure that at least one of the modes is enabled.  When neither
++# is present in defconfig, they default to N, which is bad.
++if [ "$CONFIG_MODE_SKAS" != "y" ]; then
++   define_bool CONFIG_MODE_TT y
++fi
++
++bool 'Tracing thread support' CONFIG_MODE_TT
++if [ "$CONFIG_MODE_TT" != "y" ]; then
++   bool 'Statically linked binary when CONFIG_MODE_TT is disabled' CONFIG_STATIC_LINK
++fi
++bool 'Networking support' CONFIG_NET
++bool 'System V IPC' CONFIG_SYSVIPC
++bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT
++bool 'Sysctl support' CONFIG_SYSCTL
++tristate 'Kernel support for a.out binaries' CONFIG_BINFMT_AOUT
++tristate 'Kernel support for ELF binaries' CONFIG_BINFMT_ELF
++tristate 'Kernel support for MISC binaries' CONFIG_BINFMT_MISC
++tristate 'Host filesystem' CONFIG_HOSTFS
++tristate 'Honeypot proc filesystem' CONFIG_HPPFS
++bool 'Management console' CONFIG_MCONSOLE
++dep_bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ $CONFIG_MCONSOLE
++bool '2G/2G host address space split' CONFIG_HOST_2G_2G
++bool 'Symmetric multi-processing support' CONFIG_UML_SMP
++define_bool CONFIG_SMP $CONFIG_UML_SMP
++int 'Nesting level' CONFIG_NEST_LEVEL 0
++int 'Kernel address space size (in .5G units)' CONFIG_KERNEL_HALF_GIGS 1
++bool 'Highmem support' CONFIG_HIGHMEM
++bool '/proc/mm' CONFIG_PROC_MM
++int 'Kernel stack size order' CONFIG_KERNEL_STACK_ORDER 2
++endmenu
++
++mainmenu_option next_comment
++comment 'Loadable module support'
++bool 'Enable loadable module support' CONFIG_MODULES
++if [ "$CONFIG_MODULES" = "y" ]; then
++# MODVERSIONS does not yet work in this architecture
++#   bool '  Set version information on all module symbols' CONFIG_MODVERSIONS
++    bool '  Kernel module loader' CONFIG_KMOD
++fi
++endmenu
++
++source arch/um/config_char.in
++
++source arch/um/config_block.in
++
++define_bool CONFIG_NETDEVICES $CONFIG_NET
++
++if [ "$CONFIG_NET" = "y" ]; then
++   source arch/um/config_net.in
++   source net/Config.in
++fi
++
++source fs/Config.in
++
++mainmenu_option next_comment
++comment 'SCSI support'
++
++tristate 'SCSI support' CONFIG_SCSI
++
++if [ "$CONFIG_SCSI" != "n" ]; then
++   source arch/um/config_scsi.in
++fi
++endmenu
++
++source drivers/md/Config.in
++
++source drivers/mtd/Config.in
++
++source lib/Config.in
++
++mainmenu_option next_comment
++comment 'Kernel hacking'
++bool 'Debug memory allocations' CONFIG_DEBUG_SLAB
++bool 'Enable kernel debugging symbols' CONFIG_DEBUGSYM
++if [ "$CONFIG_XTERM_CHAN" = "y" ]; then
++   dep_bool 'Enable ptrace proxy' CONFIG_PT_PROXY $CONFIG_DEBUGSYM
++else 
++   define_bool CONFIG_PT_PROXY n
++fi
++dep_bool 'Enable gprof support' CONFIG_GPROF $CONFIG_DEBUGSYM
++dep_bool 'Enable gcov support' CONFIG_GCOV $CONFIG_DEBUGSYM
++endmenu
+diff -Naur -X ../exclude-files orig/arch/um/config.release um/arch/um/config.release
+--- orig/arch/um/config.release        Wed Dec 31 19:00:00 1969
++++ um/arch/um/config.release  Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,302 @@
++#
++# Automatically generated make config: don't edit
++#
++CONFIG_USERMODE=y
++# CONFIG_ISA is not set
++# CONFIG_SBUS is not set
++# CONFIG_PCI is not set
++CONFIG_UID16=y
++CONFIG_RWSEM_XCHGADD_ALGORITHM=y
++
++#
++# Code maturity level options
++#
++CONFIG_EXPERIMENTAL=y
++
++#
++# General Setup
++#
++CONFIG_NET=y
++CONFIG_SYSVIPC=y
++CONFIG_BSD_PROCESS_ACCT=y
++CONFIG_SYSCTL=y
++CONFIG_BINFMT_AOUT=y
++CONFIG_BINFMT_ELF=y
++CONFIG_BINFMT_MISC=y
++CONFIG_HOSTFS=y
++# CONFIG_HPPFS is not set
++CONFIG_MCONSOLE=y
++CONFIG_MAGIC_SYSRQ=y
++# CONFIG_HOST_2G_2G is not set
++# CONFIG_UML_SMP is not set
++# CONFIG_SMP is not set
++CONFIG_NEST_LEVEL=0
++CONFIG_KERNEL_HALF_GIGS=1
++
++#
++# Loadable module support
++#
++CONFIG_MODULES=y
++CONFIG_KMOD=y
++
++#
++# Character Devices
++#
++CONFIG_STDIO_CONSOLE=y
++CONFIG_SSL=y
++CONFIG_FD_CHAN=y
++# CONFIG_NULL_CHAN is not set
++CONFIG_PORT_CHAN=y
++CONFIG_PTY_CHAN=y
++CONFIG_TTY_CHAN=y
++CONFIG_XTERM_CHAN=y
++CONFIG_CON_ZERO_CHAN="fd:0,fd:1"
++CONFIG_CON_CHAN="xterm"
++CONFIG_SSL_CHAN="pty"
++CONFIG_UNIX98_PTYS=y
++CONFIG_UNIX98_PTY_COUNT=256
++# CONFIG_WATCHDOG is not set
++CONFIG_UML_SOUND=y
++CONFIG_SOUND=y
++CONFIG_HOSTAUDIO=y
++# CONFIG_TTY_LOG is not set
++
++#
++# Block Devices
++#
++CONFIG_BLK_DEV_UBD=y
++# CONFIG_BLK_DEV_UBD_SYNC is not set
++CONFIG_BLK_DEV_LOOP=y
++CONFIG_BLK_DEV_NBD=y
++CONFIG_BLK_DEV_RAM=y
++CONFIG_BLK_DEV_RAM_SIZE=4096
++CONFIG_BLK_DEV_INITRD=y
++# CONFIG_MMAPPER is not set
++CONFIG_NETDEVICES=y
++
++#
++# Network Devices
++#
++CONFIG_UML_NET=y
++CONFIG_UML_NET_ETHERTAP=y
++CONFIG_UML_NET_TUNTAP=y
++CONFIG_UML_NET_SLIP=y
++CONFIG_UML_NET_DAEMON=y
++CONFIG_UML_NET_MCAST=y
++CONFIG_DUMMY=y
++CONFIG_BONDING=m
++CONFIG_EQUALIZER=m
++CONFIG_TUN=y
++CONFIG_PPP=m
++CONFIG_PPP_MULTILINK=y
++# CONFIG_PPP_ASYNC is not set
++CONFIG_PPP_SYNC_TTY=m
++CONFIG_PPP_DEFLATE=m
++CONFIG_PPP_BSDCOMP=m
++CONFIG_PPPOE=m
++CONFIG_SLIP=m
++
++#
++# Networking options
++#
++CONFIG_PACKET=y
++CONFIG_PACKET_MMAP=y
++# CONFIG_NETLINK_DEV is not set
++# CONFIG_NETFILTER is not set
++# CONFIG_FILTER is not set
++CONFIG_UNIX=y
++CONFIG_INET=y
++# CONFIG_IP_MULTICAST is not set
++# CONFIG_IP_ADVANCED_ROUTER is not set
++# CONFIG_IP_PNP is not set
++# CONFIG_NET_IPIP is not set
++# CONFIG_NET_IPGRE is not set
++# CONFIG_ARPD is not set
++# CONFIG_INET_ECN is not set
++# CONFIG_SYN_COOKIES is not set
++# CONFIG_IPV6 is not set
++# CONFIG_KHTTPD is not set
++# CONFIG_ATM is not set
++# CONFIG_VLAN_8021Q is not set
++
++#
++#  
++#
++# CONFIG_IPX is not set
++# CONFIG_ATALK is not set
++
++#
++# Appletalk devices
++#
++# CONFIG_DECNET is not set
++# CONFIG_BRIDGE is not set
++# CONFIG_X25 is not set
++# CONFIG_LAPB is not set
++# CONFIG_LLC is not set
++# CONFIG_NET_DIVERT is not set
++# CONFIG_ECONET is not set
++# CONFIG_WAN_ROUTER is not set
++# CONFIG_NET_FASTROUTE is not set
++# CONFIG_NET_HW_FLOWCONTROL is not set
++
++#
++# QoS and/or fair queueing
++#
++# CONFIG_NET_SCHED is not set
++
++#
++# Network testing
++#
++# CONFIG_NET_PKTGEN is not set
++
++#
++# File systems
++#
++CONFIG_QUOTA=y
++CONFIG_AUTOFS_FS=m
++CONFIG_AUTOFS4_FS=m
++CONFIG_REISERFS_FS=m
++# CONFIG_REISERFS_CHECK is not set
++# CONFIG_REISERFS_PROC_INFO is not set
++CONFIG_ADFS_FS=m
++# CONFIG_ADFS_FS_RW is not set
++CONFIG_AFFS_FS=m
++CONFIG_HFS_FS=m
++CONFIG_BFS_FS=m
++CONFIG_EXT3_FS=y
++CONFIG_JBD=y
++# CONFIG_JBD_DEBUG is not set
++CONFIG_FAT_FS=y
++CONFIG_MSDOS_FS=y
++CONFIG_UMSDOS_FS=y
++CONFIG_VFAT_FS=y
++CONFIG_EFS_FS=m
++CONFIG_CRAMFS=m
++CONFIG_TMPFS=y
++CONFIG_RAMFS=y
++CONFIG_ISO9660_FS=y
++# CONFIG_JOLIET is not set
++# CONFIG_ZISOFS is not set
++CONFIG_MINIX_FS=m
++CONFIG_VXFS_FS=m
++# CONFIG_NTFS_FS is not set
++CONFIG_HPFS_FS=m
++CONFIG_PROC_FS=y
++CONFIG_DEVFS_FS=y
++CONFIG_DEVFS_MOUNT=y
++# CONFIG_DEVFS_DEBUG is not set
++CONFIG_DEVPTS_FS=y
++CONFIG_QNX4FS_FS=m
++# CONFIG_QNX4FS_RW is not set
++CONFIG_ROMFS_FS=m
++CONFIG_EXT2_FS=y
++CONFIG_SYSV_FS=m
++CONFIG_UDF_FS=m
++# CONFIG_UDF_RW is not set
++CONFIG_UFS_FS=m
++# CONFIG_UFS_FS_WRITE is not set
++
++#
++# Network File Systems
++#
++# CONFIG_CODA_FS is not set
++# CONFIG_INTERMEZZO_FS is not set
++CONFIG_NFS_FS=y
++CONFIG_NFS_V3=y
++CONFIG_NFSD=y
++CONFIG_NFSD_V3=y
++CONFIG_SUNRPC=y
++CONFIG_LOCKD=y
++CONFIG_LOCKD_V4=y
++# CONFIG_SMB_FS is not set
++# CONFIG_NCP_FS is not set
++# CONFIG_ZISOFS_FS is not set
++CONFIG_ZLIB_FS_INFLATE=m
++
++#
++# Partition Types
++#
++# CONFIG_PARTITION_ADVANCED is not set
++CONFIG_MSDOS_PARTITION=y
++# CONFIG_SMB_NLS is not set
++CONFIG_NLS=y
++
++#
++# Native Language Support
++#
++CONFIG_NLS_DEFAULT="iso8859-1"
++# CONFIG_NLS_CODEPAGE_437 is not set
++# CONFIG_NLS_CODEPAGE_737 is not set
++# CONFIG_NLS_CODEPAGE_775 is not set
++# CONFIG_NLS_CODEPAGE_850 is not set
++# CONFIG_NLS_CODEPAGE_852 is not set
++# CONFIG_NLS_CODEPAGE_855 is not set
++# CONFIG_NLS_CODEPAGE_857 is not set
++# CONFIG_NLS_CODEPAGE_860 is not set
++# CONFIG_NLS_CODEPAGE_861 is not set
++# CONFIG_NLS_CODEPAGE_862 is not set
++# CONFIG_NLS_CODEPAGE_863 is not set
++# CONFIG_NLS_CODEPAGE_864 is not set
++# CONFIG_NLS_CODEPAGE_865 is not set
++# CONFIG_NLS_CODEPAGE_866 is not set
++# CONFIG_NLS_CODEPAGE_869 is not set
++# CONFIG_NLS_CODEPAGE_936 is not set
++# CONFIG_NLS_CODEPAGE_950 is not set
++# CONFIG_NLS_CODEPAGE_932 is not set
++# CONFIG_NLS_CODEPAGE_949 is not set
++# CONFIG_NLS_CODEPAGE_874 is not set
++# CONFIG_NLS_ISO8859_8 is not set
++# CONFIG_NLS_CODEPAGE_1250 is not set
++# CONFIG_NLS_CODEPAGE_1251 is not set
++# CONFIG_NLS_ISO8859_1 is not set
++# CONFIG_NLS_ISO8859_2 is not set
++# CONFIG_NLS_ISO8859_3 is not set
++# CONFIG_NLS_ISO8859_4 is not set
++# CONFIG_NLS_ISO8859_5 is not set
++# CONFIG_NLS_ISO8859_6 is not set
++# CONFIG_NLS_ISO8859_7 is not set
++# CONFIG_NLS_ISO8859_9 is not set
++# CONFIG_NLS_ISO8859_13 is not set
++# CONFIG_NLS_ISO8859_14 is not set
++# CONFIG_NLS_ISO8859_15 is not set
++# CONFIG_NLS_KOI8_R is not set
++# CONFIG_NLS_KOI8_U is not set
++# CONFIG_NLS_UTF8 is not set
++
++#
++# SCSI support
++#
++CONFIG_SCSI=y
++
++#
++# SCSI support type (disk, tape, CD-ROM)
++#
++# CONFIG_BLK_DEV_SD is not set
++# CONFIG_CHR_DEV_ST is not set
++# CONFIG_BLK_DEV_SR is not set
++# CONFIG_CHR_DEV_SG is not set
++
++#
++# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
++#
++# CONFIG_SCSI_DEBUG_QUEUES is not set
++# CONFIG_SCSI_MULTI_LUN is not set
++# CONFIG_SCSI_CONSTANTS is not set
++# CONFIG_SCSI_LOGGING is not set
++CONFIG_SCSI_DEBUG=m
++
++#
++# Multi-device support (RAID and LVM)
++#
++# CONFIG_MD is not set
++
++#
++# Memory Technology Devices (MTD)
++#
++# CONFIG_MTD is not set
++
++#
++# Kernel hacking
++#
++# CONFIG_DEBUG_SLAB is not set
++# CONFIG_DEBUGSYM is not set
+diff -Naur -X ../exclude-files orig/arch/um/config_block.in um/arch/um/config_block.in
+--- orig/arch/um/config_block.in       Wed Dec 31 19:00:00 1969
++++ um/arch/um/config_block.in Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,16 @@
++mainmenu_option next_comment
++comment 'Block Devices'
++
++bool 'Virtual block device' CONFIG_BLK_DEV_UBD
++dep_bool '  Always do synchronous disk IO for UBD' CONFIG_BLK_DEV_UBD_SYNC $CONFIG_BLK_DEV_UBD
++tristate 'Loopback device support' CONFIG_BLK_DEV_LOOP
++dep_tristate 'Network block device support' CONFIG_BLK_DEV_NBD $CONFIG_NET
++tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
++if [ "$CONFIG_BLK_DEV_RAM" = "y" -o "$CONFIG_BLK_DEV_RAM" = "m" ]; then
++      int '   Default RAM disk size' CONFIG_BLK_DEV_RAM_SIZE 4096
++fi
++dep_bool '  Initial RAM disk (initrd) support' CONFIG_BLK_DEV_INITRD $CONFIG_BLK_DEV_RAM
++
++tristate 'Example IO memory driver' CONFIG_MMAPPER
++
++endmenu
+diff -Naur -X ../exclude-files orig/arch/um/config_char.in um/arch/um/config_char.in
+--- orig/arch/um/config_char.in        Wed Dec 31 19:00:00 1969
++++ um/arch/um/config_char.in  Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,37 @@
++mainmenu_option next_comment
++comment 'Character Devices'
++
++define_bool CONFIG_STDIO_CONSOLE y
++
++bool 'Virtual serial line' CONFIG_SSL
++
++bool 'file descriptor channel support' CONFIG_FD_CHAN
++bool 'null channel support' CONFIG_NULL_CHAN
++bool 'port channel support' CONFIG_PORT_CHAN
++bool 'pty channel support' CONFIG_PTY_CHAN
++bool 'tty channel support' CONFIG_TTY_CHAN
++bool 'xterm channel support' CONFIG_XTERM_CHAN
++string 'Default main console channel initialization' CONFIG_CON_ZERO_CHAN \
++            "fd:0,fd:1"
++string 'Default console channel initialization' CONFIG_CON_CHAN "xterm"
++string 'Default serial line channel initialization' CONFIG_SSL_CHAN "pty"
++
++
++bool 'Unix98 PTY support' CONFIG_UNIX98_PTYS
++if [ "$CONFIG_UNIX98_PTYS" = "y" ]; then
++   int 'Maximum number of Unix98 PTYs in use (0-2048)' CONFIG_UNIX98_PTY_COUNT 256
++fi
++
++bool 'Watchdog Timer Support' CONFIG_WATCHDOG
++dep_bool '  Disable watchdog shutdown on close' CONFIG_WATCHDOG_NOWAYOUT \
++      $CONFIG_WATCHDOG
++dep_tristate '  Software Watchdog' CONFIG_SOFT_WATCHDOG $CONFIG_WATCHDOG
++dep_tristate '  UML watchdog' CONFIG_UML_WATCHDOG $CONFIG_WATCHDOG
++
++tristate 'Sound support' CONFIG_UML_SOUND
++define_tristate CONFIG_SOUND $CONFIG_UML_SOUND
++define_tristate CONFIG_HOSTAUDIO $CONFIG_UML_SOUND
++
++bool 'Enable tty logging' CONFIG_TTY_LOG
++
++endmenu
+diff -Naur -X ../exclude-files orig/arch/um/config_net.in um/arch/um/config_net.in
+--- orig/arch/um/config_net.in Wed Dec 31 19:00:00 1969
++++ um/arch/um/config_net.in   Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,47 @@
++mainmenu_option next_comment
++comment 'Network Devices'
++
++# UML virtual driver
++bool 'Virtual network device' CONFIG_UML_NET
++
++dep_bool '  Ethertap transport' CONFIG_UML_NET_ETHERTAP $CONFIG_UML_NET
++dep_bool '  TUN/TAP transport' CONFIG_UML_NET_TUNTAP $CONFIG_UML_NET
++dep_bool '  SLIP transport' CONFIG_UML_NET_SLIP $CONFIG_UML_NET
++dep_bool '  SLiRP transport' CONFIG_UML_NET_SLIRP $CONFIG_UML_NET
++dep_bool '  Daemon transport' CONFIG_UML_NET_DAEMON $CONFIG_UML_NET
++dep_bool '  Multicast transport' CONFIG_UML_NET_MCAST $CONFIG_UML_NET
++dep_bool '  pcap transport' CONFIG_UML_NET_PCAP $CONFIG_UML_NET
++
++# Below are hardware-independent drivers mirrored from
++# drivers/net/Config.in. It would be nice if Linux
++# had HW independent drivers separated from the others
++# but it does not. Until then each non-ISA/PCI arch
++# needs to provide its own menu of network drivers
++
++tristate 'Dummy net driver support' CONFIG_DUMMY
++tristate 'Bonding driver support' CONFIG_BONDING
++tristate 'EQL (serial line load balancing) support' CONFIG_EQUALIZER
++tristate 'Universal TUN/TAP device driver support' CONFIG_TUN
++if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
++   if [ "$CONFIG_NETLINK" = "y" ]; then
++      tristate 'Ethertap network tap (OBSOLETE)' CONFIG_ETHERTAP
++   fi
++fi
++
++tristate 'PPP (point-to-point protocol) support' CONFIG_PPP
++if [ ! "$CONFIG_PPP" = "n" ]; then
++   dep_bool '  PPP multilink support (EXPERIMENTAL)' CONFIG_PPP_MULTILINK $CONFIG_EXPERIMENTAL
++   dep_bool '  PPP filtering' CONFIG_PPP_FILTER $CONFIG_FILTER
++   dep_tristate '  PPP support for async serial ports' CONFIG_PPP_ASYNC $CONFIG_PPP
++   dep_tristate '  PPP support for sync tty ports' CONFIG_PPP_SYNC_TTY $CONFIG_PPP
++   dep_tristate '  PPP Deflate compression' CONFIG_PPP_DEFLATE $CONFIG_PPP
++   dep_tristate '  PPP BSD-Compress compression' CONFIG_PPP_BSDCOMP $CONFIG_PPP
++   dep_tristate '  PPP over Ethernet (EXPERIMENTAL)' CONFIG_PPPOE $CONFIG_PPP $CONFIG_EXPERIMENTAL
++fi
++
++tristate 'SLIP (serial line) support' CONFIG_SLIP
++dep_bool '  CSLIP compressed headers' CONFIG_SLIP_COMPRESSED $CONFIG_SLIP
++dep_bool '  Keepalive and linefill' CONFIG_SLIP_SMART $CONFIG_SLIP
++dep_bool '  Six bit SLIP encapsulation' CONFIG_SLIP_MODE_SLIP6 $CONFIG_SLIP
++
++endmenu
+diff -Naur -X ../exclude-files orig/arch/um/config_scsi.in um/arch/um/config_scsi.in
+--- orig/arch/um/config_scsi.in        Wed Dec 31 19:00:00 1969
++++ um/arch/um/config_scsi.in  Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,30 @@
++comment 'SCSI support type (disk, tape, CD-ROM)'
++
++dep_tristate '  SCSI disk support' CONFIG_BLK_DEV_SD $CONFIG_SCSI
++
++if [ "$CONFIG_BLK_DEV_SD" != "n" ]; then
++   int  'Maximum number of SCSI disks that can be loaded as modules' CONFIG_SD_EXTRA_DEVS 40
++fi
++
++dep_tristate '  SCSI tape support' CONFIG_CHR_DEV_ST $CONFIG_SCSI
++
++dep_tristate '  SCSI CD-ROM support' CONFIG_BLK_DEV_SR $CONFIG_SCSI
++
++if [ "$CONFIG_BLK_DEV_SR" != "n" ]; then
++   bool '    Enable vendor-specific extensions (for SCSI CDROM)' CONFIG_BLK_DEV_SR_VENDOR
++   int  'Maximum number of CDROM devices that can be loaded as modules' CONFIG_SR_EXTRA_DEVS 2
++fi
++dep_tristate '  SCSI generic support' CONFIG_CHR_DEV_SG $CONFIG_SCSI
++
++comment 'Some SCSI devices (e.g. CD jukebox) support multiple LUNs'
++
++#if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
++   bool '  Enable extra checks in new queueing code' CONFIG_SCSI_DEBUG_QUEUES
++#fi
++
++bool '  Probe all LUNs on each SCSI device' CONFIG_SCSI_MULTI_LUN
++  
++bool '  Verbose SCSI error reporting (kernel size +=12K)' CONFIG_SCSI_CONSTANTS
++bool '  SCSI logging facility' CONFIG_SCSI_LOGGING
++
++dep_tristate 'SCSI debugging host simulator (EXPERIMENTAL)' CONFIG_SCSI_DEBUG $CONFIG_SCSI
+diff -Naur -X ../exclude-files orig/arch/um/defconfig um/arch/um/defconfig
+--- orig/arch/um/defconfig     Wed Dec 31 19:00:00 1969
++++ um/arch/um/defconfig       Mon Jan 20 11:26:54 2003
+@@ -0,0 +1,396 @@
++#
++# Automatically generated make config: don't edit
++#
++CONFIG_USERMODE=y
++# CONFIG_ISA is not set
++# CONFIG_SBUS is not set
++# CONFIG_PCI is not set
++CONFIG_UID16=y
++CONFIG_RWSEM_XCHGADD_ALGORITHM=y
++
++#
++# Code maturity level options
++#
++CONFIG_EXPERIMENTAL=y
++
++#
++# General Setup
++#
++CONFIG_MODE_TT=y
++CONFIG_MODE_SKAS=y
++CONFIG_NET=y
++CONFIG_SYSVIPC=y
++CONFIG_BSD_PROCESS_ACCT=y
++CONFIG_SYSCTL=y
++CONFIG_BINFMT_AOUT=y
++CONFIG_BINFMT_ELF=y
++CONFIG_BINFMT_MISC=y
++CONFIG_HOSTFS=y
++CONFIG_HPPFS=y
++CONFIG_MCONSOLE=y
++CONFIG_MAGIC_SYSRQ=y
++# CONFIG_HOST_2G_2G is not set
++# CONFIG_UML_SMP is not set
++# CONFIG_SMP is not set
++CONFIG_NEST_LEVEL=0
++CONFIG_KERNEL_HALF_GIGS=1
++# CONFIG_HIGHMEM is not set
++CONFIG_PROC_MM=y
++CONFIG_KERNEL_STACK_ORDER=2
++
++#
++# Loadable module support
++#
++CONFIG_MODULES=y
++# CONFIG_KMOD is not set
++
++#
++# Character Devices
++#
++CONFIG_STDIO_CONSOLE=y
++CONFIG_SSL=y
++CONFIG_FD_CHAN=y
++CONFIG_NULL_CHAN=y
++CONFIG_PORT_CHAN=y
++CONFIG_PTY_CHAN=y
++CONFIG_TTY_CHAN=y
++CONFIG_XTERM_CHAN=y
++CONFIG_CON_ZERO_CHAN="fd:0,fd:1"
++CONFIG_CON_CHAN="xterm"
++CONFIG_SSL_CHAN="pty"
++CONFIG_UNIX98_PTYS=y
++CONFIG_UNIX98_PTY_COUNT=256
++# CONFIG_WATCHDOG is not set
++# CONFIG_WATCHDOG_NOWAYOUT is not set
++# CONFIG_SOFT_WATCHDOG is not set
++# CONFIG_UML_WATCHDOG is not set
++CONFIG_UML_SOUND=y
++CONFIG_SOUND=y
++CONFIG_HOSTAUDIO=y
++# CONFIG_TTY_LOG is not set
++
++#
++# Block Devices
++#
++CONFIG_BLK_DEV_UBD=y
++# CONFIG_BLK_DEV_UBD_SYNC is not set
++CONFIG_BLK_DEV_LOOP=y
++CONFIG_BLK_DEV_NBD=y
++CONFIG_BLK_DEV_RAM=y
++CONFIG_BLK_DEV_RAM_SIZE=4096
++CONFIG_BLK_DEV_INITRD=y
++# CONFIG_MMAPPER is not set
++CONFIG_NETDEVICES=y
++
++#
++# Network Devices
++#
++CONFIG_UML_NET=y
++CONFIG_UML_NET_ETHERTAP=y
++CONFIG_UML_NET_TUNTAP=y
++CONFIG_UML_NET_SLIP=y
++CONFIG_UML_NET_SLIRP=y
++CONFIG_UML_NET_DAEMON=y
++CONFIG_UML_NET_MCAST=y
++# CONFIG_UML_NET_PCAP is not set
++CONFIG_DUMMY=y
++# CONFIG_BONDING is not set
++# CONFIG_EQUALIZER is not set
++CONFIG_TUN=y
++CONFIG_PPP=y
++# CONFIG_PPP_MULTILINK is not set
++# CONFIG_PPP_FILTER is not set
++# CONFIG_PPP_ASYNC is not set
++# CONFIG_PPP_SYNC_TTY is not set
++# CONFIG_PPP_DEFLATE is not set
++# CONFIG_PPP_BSDCOMP is not set
++# CONFIG_PPPOE is not set
++CONFIG_SLIP=y
++# CONFIG_SLIP_COMPRESSED is not set
++# CONFIG_SLIP_SMART is not set
++# CONFIG_SLIP_MODE_SLIP6 is not set
++
++#
++# Networking options
++#
++CONFIG_PACKET=y
++CONFIG_PACKET_MMAP=y
++# CONFIG_NETLINK_DEV is not set
++# CONFIG_NETFILTER is not set
++# CONFIG_FILTER is not set
++CONFIG_UNIX=y
++CONFIG_INET=y
++# CONFIG_IP_MULTICAST is not set
++# CONFIG_IP_ADVANCED_ROUTER is not set
++# CONFIG_IP_PNP is not set
++# CONFIG_NET_IPIP is not set
++# CONFIG_NET_IPGRE is not set
++# CONFIG_ARPD is not set
++# CONFIG_INET_ECN is not set
++# CONFIG_SYN_COOKIES is not set
++# CONFIG_IPV6 is not set
++# CONFIG_KHTTPD is not set
++# CONFIG_ATM is not set
++# CONFIG_VLAN_8021Q is not set
++
++#
++#  
++#
++# CONFIG_IPX is not set
++# CONFIG_ATALK is not set
++
++#
++# Appletalk devices
++#
++# CONFIG_DEV_APPLETALK is not set
++# CONFIG_DECNET is not set
++# CONFIG_BRIDGE is not set
++# CONFIG_X25 is not set
++# CONFIG_LAPB is not set
++# CONFIG_LLC is not set
++# CONFIG_NET_DIVERT is not set
++# CONFIG_ECONET is not set
++# CONFIG_WAN_ROUTER is not set
++# CONFIG_NET_FASTROUTE is not set
++# CONFIG_NET_HW_FLOWCONTROL is not set
++
++#
++# QoS and/or fair queueing
++#
++# CONFIG_NET_SCHED is not set
++
++#
++# Network testing
++#
++# CONFIG_NET_PKTGEN is not set
++
++#
++# File systems
++#
++CONFIG_QUOTA=y
++CONFIG_AUTOFS_FS=y
++CONFIG_AUTOFS4_FS=y
++CONFIG_REISERFS_FS=y
++# CONFIG_REISERFS_CHECK is not set
++# CONFIG_REISERFS_PROC_INFO is not set
++# CONFIG_ADFS_FS is not set
++# CONFIG_ADFS_FS_RW is not set
++# CONFIG_AFFS_FS is not set
++# CONFIG_HFS_FS is not set
++# CONFIG_BFS_FS is not set
++# CONFIG_EXT3_FS is not set
++# CONFIG_JBD is not set
++# CONFIG_JBD_DEBUG is not set
++CONFIG_FAT_FS=y
++CONFIG_MSDOS_FS=y
++CONFIG_UMSDOS_FS=y
++CONFIG_VFAT_FS=y
++# CONFIG_EFS_FS is not set
++CONFIG_JFFS_FS=y
++CONFIG_JFFS_FS_VERBOSE=0
++CONFIG_JFFS_PROC_FS=y
++CONFIG_JFFS2_FS=y
++CONFIG_JFFS2_FS_DEBUG=0
++# CONFIG_CRAMFS is not set
++# CONFIG_TMPFS is not set
++CONFIG_RAMFS=y
++CONFIG_ISO9660_FS=y
++# CONFIG_JOLIET is not set
++# CONFIG_ZISOFS is not set
++CONFIG_MINIX_FS=y
++# CONFIG_VXFS_FS is not set
++# CONFIG_NTFS_FS is not set
++# CONFIG_NTFS_RW is not set
++# CONFIG_HPFS_FS is not set
++CONFIG_PROC_FS=y
++CONFIG_DEVFS_FS=y
++CONFIG_DEVFS_MOUNT=y
++# CONFIG_DEVFS_DEBUG is not set
++CONFIG_DEVPTS_FS=y
++# CONFIG_QNX4FS_FS is not set
++# CONFIG_QNX4FS_RW is not set
++# CONFIG_ROMFS_FS is not set
++CONFIG_EXT2_FS=y
++# CONFIG_SYSV_FS is not set
++# CONFIG_UDF_FS is not set
++# CONFIG_UDF_RW is not set
++# CONFIG_UFS_FS is not set
++# CONFIG_UFS_FS_WRITE is not set
++
++#
++# Network File Systems
++#
++# CONFIG_CODA_FS is not set
++# CONFIG_INTERMEZZO_FS is not set
++# CONFIG_NFS_FS is not set
++# CONFIG_NFS_V3 is not set
++# CONFIG_ROOT_NFS is not set
++# CONFIG_NFSD is not set
++# CONFIG_NFSD_V3 is not set
++# CONFIG_SUNRPC is not set
++# CONFIG_LOCKD is not set
++# CONFIG_SMB_FS is not set
++# CONFIG_NCP_FS is not set
++# CONFIG_NCPFS_PACKET_SIGNING is not set
++# CONFIG_NCPFS_IOCTL_LOCKING is not set
++# CONFIG_NCPFS_STRONG is not set
++# CONFIG_NCPFS_NFS_NS is not set
++# CONFIG_NCPFS_OS2_NS is not set
++# CONFIG_NCPFS_SMALLDOS is not set
++# CONFIG_NCPFS_NLS is not set
++# CONFIG_NCPFS_EXTRAS is not set
++# CONFIG_ZISOFS_FS is not set
++# CONFIG_ZLIB_FS_INFLATE is not set
++
++#
++# Partition Types
++#
++# CONFIG_PARTITION_ADVANCED is not set
++CONFIG_MSDOS_PARTITION=y
++# CONFIG_SMB_NLS is not set
++CONFIG_NLS=y
++
++#
++# Native Language Support
++#
++CONFIG_NLS_DEFAULT="iso8859-1"
++# CONFIG_NLS_CODEPAGE_437 is not set
++# CONFIG_NLS_CODEPAGE_737 is not set
++# CONFIG_NLS_CODEPAGE_775 is not set
++# CONFIG_NLS_CODEPAGE_850 is not set
++# CONFIG_NLS_CODEPAGE_852 is not set
++# CONFIG_NLS_CODEPAGE_855 is not set
++# CONFIG_NLS_CODEPAGE_857 is not set
++# CONFIG_NLS_CODEPAGE_860 is not set
++# CONFIG_NLS_CODEPAGE_861 is not set
++# CONFIG_NLS_CODEPAGE_862 is not set
++# CONFIG_NLS_CODEPAGE_863 is not set
++# CONFIG_NLS_CODEPAGE_864 is not set
++# CONFIG_NLS_CODEPAGE_865 is not set
++# CONFIG_NLS_CODEPAGE_866 is not set
++# CONFIG_NLS_CODEPAGE_869 is not set
++# CONFIG_NLS_CODEPAGE_936 is not set
++# CONFIG_NLS_CODEPAGE_950 is not set
++# CONFIG_NLS_CODEPAGE_932 is not set
++# CONFIG_NLS_CODEPAGE_949 is not set
++# CONFIG_NLS_CODEPAGE_874 is not set
++# CONFIG_NLS_ISO8859_8 is not set
++# CONFIG_NLS_CODEPAGE_1250 is not set
++# CONFIG_NLS_CODEPAGE_1251 is not set
++# CONFIG_NLS_ISO8859_1 is not set
++# CONFIG_NLS_ISO8859_2 is not set
++# CONFIG_NLS_ISO8859_3 is not set
++# CONFIG_NLS_ISO8859_4 is not set
++# CONFIG_NLS_ISO8859_5 is not set
++# CONFIG_NLS_ISO8859_6 is not set
++# CONFIG_NLS_ISO8859_7 is not set
++# CONFIG_NLS_ISO8859_9 is not set
++# CONFIG_NLS_ISO8859_13 is not set
++# CONFIG_NLS_ISO8859_14 is not set
++# CONFIG_NLS_ISO8859_15 is not set
++# CONFIG_NLS_KOI8_R is not set
++# CONFIG_NLS_KOI8_U is not set
++# CONFIG_NLS_UTF8 is not set
++
++#
++# SCSI support
++#
++CONFIG_SCSI=y
++
++#
++# SCSI support type (disk, tape, CD-ROM)
++#
++# CONFIG_BLK_DEV_SD is not set
++# CONFIG_CHR_DEV_ST is not set
++# CONFIG_BLK_DEV_SR is not set
++# CONFIG_CHR_DEV_SG is not set
++
++#
++# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
++#
++# CONFIG_SCSI_DEBUG_QUEUES is not set
++# CONFIG_SCSI_MULTI_LUN is not set
++# CONFIG_SCSI_CONSTANTS is not set
++# CONFIG_SCSI_LOGGING is not set
++CONFIG_SCSI_DEBUG=y
++
++#
++# Multi-device support (RAID and LVM)
++#
++# CONFIG_MD is not set
++# CONFIG_BLK_DEV_MD is not set
++# CONFIG_MD_LINEAR is not set
++# CONFIG_MD_RAID0 is not set
++# CONFIG_MD_RAID1 is not set
++# CONFIG_MD_RAID5 is not set
++# CONFIG_MD_MULTIPATH is not set
++# CONFIG_BLK_DEV_LVM is not set
++
++#
++# Memory Technology Devices (MTD)
++#
++CONFIG_MTD=y
++# CONFIG_MTD_DEBUG is not set
++# CONFIG_MTD_PARTITIONS is not set
++# CONFIG_MTD_CONCAT is not set
++# CONFIG_MTD_REDBOOT_PARTS is not set
++
++#
++# User Modules And Translation Layers
++#
++CONFIG_MTD_CHAR=y
++CONFIG_MTD_BLOCK=y
++# CONFIG_FTL is not set
++# CONFIG_NFTL is not set
++
++#
++# RAM/ROM/Flash chip drivers
++#
++# CONFIG_MTD_CFI is not set
++# CONFIG_MTD_JEDECPROBE is not set
++# CONFIG_MTD_GEN_PROBE is not set
++# CONFIG_MTD_CFI_INTELEXT is not set
++# CONFIG_MTD_CFI_AMDSTD is not set
++# CONFIG_MTD_RAM is not set
++# CONFIG_MTD_ROM is not set
++# CONFIG_MTD_ABSENT is not set
++# CONFIG_MTD_OBSOLETE_CHIPS is not set
++# CONFIG_MTD_AMDSTD is not set
++# CONFIG_MTD_SHARP is not set
++# CONFIG_MTD_JEDEC is not set
++
++#
++# Mapping drivers for chip access
++#
++# CONFIG_MTD_PHYSMAP is not set
++# CONFIG_MTD_PCI is not set
++
++#
++# Self-contained MTD device drivers
++#
++# CONFIG_MTD_PMC551 is not set
++# CONFIG_MTD_SLRAM is not set
++# CONFIG_MTD_MTDRAM is not set
++CONFIG_MTD_BLKMTD=y
++
++#
++# Disk-On-Chip Device Drivers
++#
++# CONFIG_MTD_DOC1000 is not set
++# CONFIG_MTD_DOC2000 is not set
++# CONFIG_MTD_DOC2001 is not set
++# CONFIG_MTD_DOCPROBE is not set
++
++#
++# NAND Flash Device Drivers
++#
++# CONFIG_MTD_NAND is not set
++
++#
++# Kernel hacking
++#
++# CONFIG_DEBUG_SLAB is not set
++CONFIG_DEBUGSYM=y
++CONFIG_PT_PROXY=y
++# CONFIG_GPROF is not set
++# CONFIG_GCOV is not set
+diff -Naur -X ../exclude-files orig/arch/um/drivers/Makefile um/arch/um/drivers/Makefile
+--- orig/arch/um/drivers/Makefile      Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/Makefile        Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,94 @@
++# 
++# Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++O_TARGET := built-in.o 
++
++CHAN_OBJS := chan_kern.o chan_user.o line.o 
++
++list-multi := slip.o slirp.o daemon.o mcast.o mconsole.o net.o ubd.o \
++      hostaudio.o pcap.o port.o harddog.o
++
++slip-objs := slip_kern.o slip_user.o
++slirp-objs := slirp_kern.o slirp_user.o
++daemon-objs := daemon_kern.o daemon_user.o
++mcast-objs := mcast_kern.o mcast_user.o
++pcap-objs := pcap_kern.o pcap_user.o -lpcap -L/usr/lib
++net-objs := net_kern.o net_user.o
++mconsole-objs := mconsole_kern.o mconsole_user.o
++hostaudio-objs := hostaudio_kern.o hostaudio_user.o
++ubd-objs := ubd_kern.o ubd_user.o
++port-objs := port_kern.o port_user.o
++harddog-objs := harddog_kern.o harddog_user.o
++
++export-objs := mconsole_kern.o
++
++obj-y = 
++obj-$(CONFIG_SSL) += ssl.o 
++obj-$(CONFIG_UML_NET_SLIP) += slip.o
++obj-$(CONFIG_UML_NET_SLIRP) += slirp.o
++obj-$(CONFIG_UML_NET_DAEMON) += daemon.o 
++obj-$(CONFIG_UML_NET_MCAST) += mcast.o 
++obj-$(CONFIG_UML_NET_PCAP) += pcap.o 
++obj-$(CONFIG_UML_NET) += net.o 
++obj-$(CONFIG_MCONSOLE) += mconsole.o
++obj-$(CONFIG_MMAPPER) += mmapper_kern.o 
++obj-$(CONFIG_BLK_DEV_UBD) += ubd.o 
++obj-$(CONFIG_HOSTAUDIO) += hostaudio.o
++obj-$(CONFIG_FD_CHAN) += fd.o 
++obj-$(CONFIG_NULL_CHAN) += null.o 
++obj-$(CONFIG_PORT_CHAN) += port.o
++obj-$(CONFIG_PTY_CHAN) += pty.o
++obj-$(CONFIG_TTY_CHAN) += tty.o 
++obj-$(CONFIG_XTERM_CHAN) += xterm.o xterm_kern.o
++obj-$(CONFIG_UML_WATCHDOG) += harddog.o
++
++CFLAGS_pcap_user.o = -I/usr/include/pcap
++
++obj-y += stdio_console.o $(CHAN_OBJS)
++
++USER_SINGLE_OBJS = $(foreach f,$(patsubst %.o,%,$(obj-y) $(obj-m)),$($(f)-objs))
++
++USER_OBJS = $(filter %_user.o,$(obj-y) $(obj-m) $(USER_SINGLE_OBJS)) fd.o \
++      null.o pty.o tty.o xterm.o
++
++include $(TOPDIR)/Rules.make
++
++$(USER_OBJS) : %.o: %.c
++      $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $<
++
++clean:
++
++modules:
++
++fastdep:
++
++dep:
++
++archmrproper:
++
++daemon.o : $(daemon-objs)
++
++slip.o : $(slip-objs)
++
++slirp.o : $(slirp-objs)
++
++mcast.o : $(mcast-objs)
++
++pcap.o : $(pcap-objs)
++
++mconsole.o : $(mconsole-objs)
++
++net.o : $(net-objs)
++
++hostaudio.o : $(hostaudio-objs)
++
++ubd.o : $(ubd-objs)
++
++port.o : $(port-objs)
++
++harddog.o : $(harddog-objs)
++
++$(list-multi) : # This doesn't work, but should : '%.o : $(%-objs)'
++      $(LD) $(LD_RFLAG) -r -o $@ $($(patsubst %.o,%,$@)-objs)
+diff -Naur -X ../exclude-files orig/arch/um/drivers/chan_kern.c um/arch/um/drivers/chan_kern.c
+--- orig/arch/um/drivers/chan_kern.c   Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/chan_kern.c     Thu Mar  6 19:25:16 2003
+@@ -0,0 +1,510 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <linux/stddef.h>
++#include <linux/kernel.h>
++#include <linux/list.h>
++#include <linux/slab.h>
++#include <linux/tty.h>
++#include <linux/string.h>
++#include <linux/tty_flip.h>
++#include <asm/irq.h>
++#include "chan_kern.h"
++#include "user_util.h"
++#include "kern.h"
++#include "irq_user.h"
++#include "sigio.h"
++#include "line.h"
++
++static void *not_configged_init(char *str, int device, struct chan_opts *opts)
++{
++      printk(KERN_ERR "Using a channel type which is configured out of "
++             "UML\n");
++      return(NULL);
++}
++
++static int not_configged_open(int input, int output, int primary, void *data,
++                            char **dev_out)
++{
++      printk(KERN_ERR "Using a channel type which is configured out of "
++             "UML\n");
++      return(-ENODEV);
++}
++
++static void not_configged_close(int fd, void *data)
++{
++      printk(KERN_ERR "Using a channel type which is configured out of "
++             "UML\n");
++}
++
++static int not_configged_read(int fd, char *c_out, void *data)
++{
++      printk(KERN_ERR "Using a channel type which is configured out of "
++             "UML\n");
++      return(-EIO);
++}
++
++static int not_configged_write(int fd, const char *buf, int len, void *data)
++{
++      printk(KERN_ERR "Using a channel type which is configured out of "
++             "UML\n");
++      return(-EIO);
++}
++
++static int not_configged_console_write(int fd, const char *buf, int len,
++                                     void *data)
++{
++      printk(KERN_ERR "Using a channel type which is configured out of "
++             "UML\n");
++      return(-EIO);
++}
++
++static int not_configged_window_size(int fd, void *data, unsigned short *rows,
++                                   unsigned short *cols)
++{
++      printk(KERN_ERR "Using a channel type which is configured out of "
++             "UML\n");
++      return(-ENODEV);
++}
++
++static void not_configged_free(void *data)
++{
++      printk(KERN_ERR "Using a channel type which is configured out of "
++             "UML\n");
++}
++
++static struct chan_ops not_configged_ops = {
++      .init           = not_configged_init,
++      .open           = not_configged_open,
++      .close          = not_configged_close,
++      .read           = not_configged_read,
++      .write          = not_configged_write,
++      .console_write  = not_configged_console_write,
++      .window_size    = not_configged_window_size,
++      .free           = not_configged_free,
++      .winch          = 0,
++};
++
++static void tty_receive_char(struct tty_struct *tty, char ch)
++{
++      if(tty == NULL) return;
++
++      if(I_IXON(tty) && !I_IXOFF(tty) && !tty->raw) {
++              if(ch == STOP_CHAR(tty)){
++                      stop_tty(tty);
++                      return;
++              }
++              else if(ch == START_CHAR(tty)){
++                      start_tty(tty);
++                      return;
++              }
++      }
++
++      if((tty->flip.flag_buf_ptr == NULL) || 
++         (tty->flip.char_buf_ptr == NULL))
++              return;
++      tty_insert_flip_char(tty, ch, TTY_NORMAL);
++}
++
++static int open_one_chan(struct chan *chan, int input, int output, int primary)
++{
++      int fd;
++
++      if(chan->opened) return(0);
++      if(chan->ops->open == NULL) fd = 0;
++      else fd = (*chan->ops->open)(input, output, primary, chan->data,
++                                   &chan->dev);
++      if(fd < 0) return(fd);
++      chan->fd = fd;
++
++      chan->opened = 1;
++      return(0);
++}
++
++int open_chan(struct list_head *chans)
++{
++      struct list_head *ele;
++      struct chan *chan;
++      int ret, err = 0;
++
++      list_for_each(ele, chans){
++              chan = list_entry(ele, struct chan, list);
++              ret = open_one_chan(chan, chan->input, chan->output,
++                                  chan->primary);
++              if(chan->primary) err = ret;
++      }
++      return(err);
++}
++
++void chan_enable_winch(struct list_head *chans, void *line)
++{
++      struct list_head *ele;
++      struct chan *chan;
++
++      list_for_each(ele, chans){
++              chan = list_entry(ele, struct chan, list);
++              if(chan->primary && chan->output && chan->ops->winch){
++                      register_winch(chan->fd, line);
++                      return;
++              }
++      }
++}
++
++void enable_chan(struct list_head *chans, void *data)
++{
++      struct list_head *ele;
++      struct chan *chan;
++
++      list_for_each(ele, chans){
++              chan = list_entry(ele, struct chan, list);
++              if(!chan->opened) continue;
++
++              line_setup_irq(chan->fd, chan->input, chan->output, data);
++      }
++}
++
++void close_chan(struct list_head *chans)
++{
++      struct list_head *ele;
++      struct chan *chan;
++
++	/* Close in the reverse order of opening, in case more than one of them
++       * refers to the same device and they save and restore that device's
++       * state.  Then, the first one opened will have the original state,
++       * so it must be the last closed.
++       */
++        for(ele = chans->prev; ele != chans; ele = ele->prev){
++                chan = list_entry(ele, struct chan, list);
++              if(!chan->opened) continue;
++              if(chan->ops->close != NULL)
++                      (*chan->ops->close)(chan->fd, chan->data);
++              chan->opened = 0;
++              chan->fd = -1;
++      }
++}
++
++int write_chan(struct list_head *chans, const char *buf, int len, 
++             int write_irq)
++{
++      struct list_head *ele;
++      struct chan *chan;
++      int n, ret = 0;
++
++      list_for_each(ele, chans){
++              chan = list_entry(ele, struct chan, list);
++              if(!chan->output || (chan->ops->write == NULL)) continue;
++              n = chan->ops->write(chan->fd, buf, len, chan->data);
++              if(chan->primary){
++                      ret = n;
++                      if((ret == -EAGAIN) || ((ret >= 0) && (ret < len))){
++                              reactivate_fd(chan->fd, write_irq);
++                              if(ret == -EAGAIN) ret = 0;
++                      }
++              }
++      }
++      return(ret);
++}
++
++int console_write_chan(struct list_head *chans, const char *buf, int len)
++{
++      struct list_head *ele;
++      struct chan *chan;
++      int n, ret = 0;
++
++      list_for_each(ele, chans){
++              chan = list_entry(ele, struct chan, list);
++              if(!chan->output || (chan->ops->console_write == NULL))
++                      continue;
++              n = chan->ops->console_write(chan->fd, buf, len, chan->data);
++              if(chan->primary) ret = n;
++      }
++      return(ret);
++}
++
++int chan_window_size(struct list_head *chans, unsigned short *rows_out,
++                    unsigned short *cols_out)
++{
++      struct list_head *ele;
++      struct chan *chan;
++
++      list_for_each(ele, chans){
++              chan = list_entry(ele, struct chan, list);
++              if(chan->primary){
++                      if(chan->ops->window_size == NULL) return(0);
++                      return(chan->ops->window_size(chan->fd, chan->data,
++                                                    rows_out, cols_out));
++              }
++      }
++      return(0);
++}
++
++void free_one_chan(struct chan *chan)
++{
++      list_del(&chan->list);
++      if(chan->ops->free != NULL)
++              (*chan->ops->free)(chan->data);
++      free_irq_by_fd(chan->fd);
++      if(chan->primary && chan->output) ignore_sigio_fd(chan->fd);
++      kfree(chan);
++}
++
++void free_chan(struct list_head *chans)
++{
++      struct list_head *ele, *next;
++      struct chan *chan;
++
++      list_for_each_safe(ele, next, chans){
++              chan = list_entry(ele, struct chan, list);
++              free_one_chan(chan);
++      }
++}
++
++static int one_chan_config_string(struct chan *chan, char *str, int size,
++                                char **error_out)
++{
++      int n = 0;
++
++      CONFIG_CHUNK(str, size, n, chan->ops->type, 0);
++
++      if(chan->dev == NULL){
++              CONFIG_CHUNK(str, size, n, "", 1);
++              return(n);
++      }
++
++      CONFIG_CHUNK(str, size, n, ":", 0);
++      CONFIG_CHUNK(str, size, n, chan->dev, 0);
++
++      return(n);
++}
++
++static int chan_pair_config_string(struct chan *in, struct chan *out, 
++                                 char *str, int size, char **error_out)
++{
++      int n;
++
++      n = one_chan_config_string(in, str, size, error_out);
++      str += n;
++      size -= n;
++
++      if(in == out){
++              CONFIG_CHUNK(str, size, n, "", 1);
++              return(n);
++      }
++
++      CONFIG_CHUNK(str, size, n, ",", 1);
++      n = one_chan_config_string(out, str, size, error_out);
++      str += n;
++      size -= n;
++      CONFIG_CHUNK(str, size, n, "", 1);
++
++      return(n);
++}
++
++int chan_config_string(struct list_head *chans, char *str, int size, 
++                     char **error_out)
++{
++      struct list_head *ele;
++      struct chan *chan, *in = NULL, *out = NULL;
++
++      list_for_each(ele, chans){
++              chan = list_entry(ele, struct chan, list);
++              if(!chan->primary)
++                      continue;
++              if(chan->input)
++                      in = chan;
++              if(chan->output)
++                      out = chan;
++      }
++
++      return(chan_pair_config_string(in, out, str, size, error_out));
++}
++
++struct chan_type {
++      char *key;
++      struct chan_ops *ops;
++};
++
++struct chan_type chan_table[] = {
++#ifdef CONFIG_FD_CHAN
++      { "fd", &fd_ops },
++#else
++      { "fd", &not_configged_ops },
++#endif
++
++#ifdef CONFIG_NULL_CHAN
++      { "null", &null_ops },
++#else
++      { "null", &not_configged_ops },
++#endif
++
++#ifdef CONFIG_PORT_CHAN
++      { "port", &port_ops },
++#else
++      { "port", &not_configged_ops },
++#endif
++
++#ifdef CONFIG_PTY_CHAN
++      { "pty", &pty_ops },
++      { "pts", &pts_ops },
++#else
++      { "pty", &not_configged_ops },
++      { "pts", &not_configged_ops },
++#endif
++
++#ifdef CONFIG_TTY_CHAN
++      { "tty", &tty_ops },
++#else
++      { "tty", &not_configged_ops },
++#endif
++
++#ifdef CONFIG_XTERM_CHAN
++      { "xterm", &xterm_ops },
++#else
++      { "xterm", &not_configged_ops },
++#endif
++};
++
++static struct chan *parse_chan(char *str, int pri, int device, 
++                             struct chan_opts *opts)
++{
++      struct chan_type *entry;
++      struct chan_ops *ops;
++      struct chan *chan;
++      void *data;
++      int i;
++
++      ops = NULL;
++      data = NULL;
++      for(i = 0; i < sizeof(chan_table)/sizeof(chan_table[0]); i++){
++              entry = &chan_table[i];
++              if(!strncmp(str, entry->key, strlen(entry->key))){
++                      ops = entry->ops;
++                      str += strlen(entry->key);
++                      break;
++              }
++      }
++      if(ops == NULL){
++              printk(KERN_ERR "parse_chan couldn't parse \"%s\"\n", 
++                     str);
++              return(NULL);
++      }
++      if(ops->init == NULL) return(NULL); 
++      data = (*ops->init)(str, device, opts);
++      if(data == NULL) return(NULL);
++
++      chan = kmalloc(sizeof(*chan), GFP_KERNEL);
++      if(chan == NULL) return(NULL);
++      *chan = ((struct chan) { .list          = LIST_HEAD_INIT(chan->list),
++                               .primary       = 1,
++                               .input         = 0,
++                               .output        = 0,
++                               .opened        = 0,
++                               .fd            = -1,
++                               .pri           = pri,
++                               .ops           = ops,
++                               .data          = data });
++      return(chan);
++}
++
++int parse_chan_pair(char *str, struct list_head *chans, int pri, int device,
++                  struct chan_opts *opts)
++{
++      struct chan *new, *chan;
++      char *in, *out;
++
++      if(!list_empty(chans)){
++              chan = list_entry(chans->next, struct chan, list);
++              if(chan->pri >= pri) return(0);
++              free_chan(chans);
++              INIT_LIST_HEAD(chans);
++      }
++
++      if((out = strchr(str, ',')) != NULL){
++              in = str;
++              *out = '\0';
++              out++;
++              new = parse_chan(in, pri, device, opts);
++              if(new == NULL) return(-1);
++              new->input = 1;
++              list_add(&new->list, chans);
++
++              new = parse_chan(out, pri, device, opts);
++              if(new == NULL) return(-1);
++              list_add(&new->list, chans);
++              new->output = 1;
++      }
++      else {
++              new = parse_chan(str, pri, device, opts);
++              if(new == NULL) return(-1);
++              list_add(&new->list, chans);
++              new->input = 1;
++              new->output = 1;
++      }
++      return(0);
++}
++
++int chan_out_fd(struct list_head *chans)
++{
++      struct list_head *ele;
++      struct chan *chan;
++
++      list_for_each(ele, chans){
++              chan = list_entry(ele, struct chan, list);
++              if(chan->primary && chan->output)
++                      return(chan->fd);
++      }
++      return(-1);
++}
++
++void chan_interrupt(struct list_head *chans, struct tq_struct *task,
++                  struct tty_struct *tty, int irq, void *dev)
++{
++      struct list_head *ele, *next;
++      struct chan *chan;
++      int err;
++      char c;
++
++      list_for_each_safe(ele, next, chans){
++              chan = list_entry(ele, struct chan, list);
++              if(!chan->input || (chan->ops->read == NULL)) continue;
++              do {
++                      if((tty != NULL) && 
++                         (tty->flip.count >= TTY_FLIPBUF_SIZE)){
++                              queue_task(task, &tq_timer);
++                              goto out;
++                      }
++                      err = chan->ops->read(chan->fd, &c, chan->data);
++                      if(err > 0) tty_receive_char(tty, c);
++              } while(err > 0);
++              if(err == 0) reactivate_fd(chan->fd, irq);
++              if(err == -EIO){
++                      if(chan->primary){
++                              if(tty != NULL) tty_hangup(tty);
++                              line_disable(dev, irq);
++                              close_chan(chans);
++                              free_chan(chans);
++                              return;
++                      }
++                      else {
++                              if(chan->ops->close != NULL)
++                                      chan->ops->close(chan->fd, chan->data);
++                              free_one_chan(chan);
++                      }
++              }
++      }
++ out:
++      if(tty) tty_flip_buffer_push(tty);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/chan_user.c um/arch/um/drivers/chan_user.c
+--- orig/arch/um/drivers/chan_user.c   Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/chan_user.c     Wed Mar 26 13:23:48 2003
+@@ -0,0 +1,213 @@
++/* 
++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <unistd.h>
++#include <stdlib.h>
++#include <errno.h>
++#include <termios.h>
++#include <fcntl.h>
++#include <string.h>
++#include <signal.h>
++#include <sys/stat.h>
++#include <sys/ioctl.h>
++#include <sys/socket.h>
++#include "kern_util.h"
++#include "user_util.h"
++#include "chan_user.h"
++#include "user.h"
++#include "helper.h"
++#include "os.h"
++#include "choose-mode.h"
++#include "mode.h"
++
++void generic_close(int fd, void *unused)
++{
++      close(fd);
++}
++
++int generic_read(int fd, char *c_out, void *unused)
++{
++      int n;
++
++      n = read(fd, c_out, sizeof(*c_out));
++      if(n < 0){
++              if(errno == EAGAIN) return(0);
++              return(-errno);
++      }
++      else if(n == 0) return(-EIO);
++      return(1);
++}
++
++int generic_write(int fd, const char *buf, int n, void *unused)
++{
++      int count;
++
++      count = write(fd, buf, n);
++      if(count < 0) return(-errno);
++      return(count);
++}
++
++int generic_console_write(int fd, const char *buf, int n, void *unused)
++{
++      struct termios save, new;
++      int err;
++
++      if(isatty(fd)){
++              tcgetattr(fd, &save);
++              new = save;
++              new.c_oflag |= OPOST;
++              tcsetattr(fd, TCSAFLUSH, &new);
++      }
++      err = generic_write(fd, buf, n, NULL);
++      if(isatty(fd)) tcsetattr(fd, TCSAFLUSH, &save);
++      return(err);
++}
++
++int generic_window_size(int fd, void *unused, unsigned short *rows_out,
++                      unsigned short *cols_out)
++{
++      struct winsize size;
++      int ret = 0;
++
++      if(ioctl(fd, TIOCGWINSZ, &size) == 0){
++              ret = ((*rows_out != size.ws_row) || 
++                     (*cols_out != size.ws_col));
++              *rows_out = size.ws_row;
++              *cols_out = size.ws_col;
++      }
++      return(ret);
++}
++
++void generic_free(void *data)
++{
++      kfree(data);
++}
++
++static void winch_handler(int sig)
++{
++}
++
++struct winch_data {
++      int pty_fd;
++      int pipe_fd;
++      int close_me;
++};
++
++static int winch_thread(void *arg)
++{
++      struct winch_data *data = arg;
++      sigset_t sigs;
++      int pty_fd, pipe_fd;
++      char c = 1;
++
++      close(data->close_me);
++      pty_fd = data->pty_fd;
++      pipe_fd = data->pipe_fd;
++      if(write(pipe_fd, &c, sizeof(c)) != sizeof(c))
++              printk("winch_thread : failed to write synchronization "
++                     "byte, errno = %d\n", errno);
++
++      signal(SIGWINCH, winch_handler);
++      sigfillset(&sigs);
++      sigdelset(&sigs, SIGWINCH);
++      if(sigprocmask(SIG_SETMASK, &sigs, NULL) < 0){
++              printk("winch_thread : sigprocmask failed, errno = %d\n", 
++                     errno);
++              exit(1);
++      }
++
++      if(setsid() < 0){
++              printk("winch_thread : setsid failed, errno = %d\n", errno);
++              exit(1);
++      }
++
++      if(ioctl(pty_fd, TIOCSCTTY, 0) < 0){
++              printk("winch_thread : TIOCSCTTY failed, errno = %d\n", errno);
++              exit(1);
++      }
++      if(tcsetpgrp(pty_fd, os_getpid()) < 0){
++              printk("winch_thread : tcsetpgrp failed, errno = %d\n", errno);
++              exit(1);
++      }
++
++      if(read(pipe_fd, &c, sizeof(c)) != sizeof(c))
++              printk("winch_thread : failed to read synchronization byte, "
++                     "errno = %d\n", errno);
++
++      while(1){
++              pause();
++
++              if(write(pipe_fd, &c, sizeof(c)) != sizeof(c)){
++                      printk("winch_thread : write failed, errno = %d\n",
++                             errno);
++              }
++      }
++}
++
++static int winch_tramp(int fd, void *device_data, int *fd_out)
++{
++      struct winch_data data;
++      unsigned long stack;
++      int fds[2], pid, n, err;
++      char c;
++
++      err = os_pipe(fds, 1, 1);
++      if(err){
++              printk("winch_tramp : os_pipe failed, errno = %d\n", -err);
++              return(err);
++      }
++
++      data = ((struct winch_data) { .pty_fd           = fd,
++                                    .pipe_fd          = fds[1],
++                                    .close_me         = fds[0] } );
++      pid = run_helper_thread(winch_thread, &data, 0, &stack, 0);
++      if(pid < 0){
++              printk("fork of winch_thread failed - errno = %d\n", errno);
++              return(pid);
++      }
++
++      close(fds[1]);
++      *fd_out = fds[0];
++      n = read(fds[0], &c, sizeof(c));
++      if(n != sizeof(c)){
++              printk("winch_tramp : failed to read synchronization byte\n");
++              printk("read returned %d, errno = %d\n", n, errno);
++              printk("fd %d will not support SIGWINCH\n", fd);
++              *fd_out = -1;
++      }
++      return(pid);
++}
++
++void register_winch(int fd, void *device_data)
++{
++      int pid, thread, thread_fd;
++      char c = 1;
++
++      if(!isatty(fd)) return;
++
++      pid = tcgetpgrp(fd);
++      if(!CHOOSE_MODE_PROC(is_tracer_winch, is_skas_winch, pid, fd, 
++                           device_data) && (pid == -1)){
++              thread = winch_tramp(fd, device_data, &thread_fd);
++              if(fd != -1){
++                      register_winch_irq(thread_fd, fd, thread, device_data);
++
++                      if(write(thread_fd, &c, sizeof(c)) != sizeof(c))
++                              printk("register_winch : failed to write "
++                                     "synchronization byte\n");
++              }
++      }
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
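
The generic_* helpers above fix the contract every channel backend's read op follows: return 1 and fill *c_out when a byte arrived, return 0 when the descriptor would only block (so the caller reactivates the IRQ), and return -EIO once the far end has gone away so the tty can be hung up. A minimal host-side sketch of that convention, using hypothetical demo_read/main names and plain POSIX calls instead of the UML headers:

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Same return convention as generic_read(): 1 = got a byte,
     * 0 = nothing there yet (EAGAIN), -EIO = peer closed, -errno otherwise. */
    static int demo_read(int fd, char *c_out)
    {
            int n = read(fd, c_out, sizeof(*c_out));

            if(n < 0){
                    if(errno == EAGAIN) return 0;
                    return -errno;
            }
            if(n == 0) return -EIO;
            return 1;
    }

    int main(void)
    {
            int fds[2];
            char c;

            if(pipe(fds)) return 1;
            fcntl(fds[0], F_SETFL, O_NONBLOCK);

            printf("empty pipe  -> %d\n", demo_read(fds[0], &c));        /* 0 */
            write(fds[1], "x", 1);
            printf("one byte    -> %d (%c)\n", demo_read(fds[0], &c), c); /* 1, x */
            close(fds[1]);
            printf("writer gone -> %d\n", demo_read(fds[0], &c));        /* -EIO */
            return 0;
    }
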
+diff -Naur -X ../exclude-files orig/arch/um/drivers/daemon.h um/arch/um/drivers/daemon.h
+--- orig/arch/um/drivers/daemon.h      Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/daemon.h        Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,35 @@
++/* 
++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "net_user.h"
++
++#define SWITCH_VERSION 3
++
++struct daemon_data {
++      char *sock_type;
++      char *ctl_sock;
++      void *ctl_addr;
++      void *data_addr;
++      void *local_addr;
++      int fd;
++      int control;
++      void *dev;
++};
++
++extern struct net_user_info daemon_user_info;
++
++extern int daemon_user_write(int fd, void *buf, int len, 
++                           struct daemon_data *pri);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/daemon_kern.c um/arch/um/drivers/daemon_kern.c
+--- orig/arch/um/drivers/daemon_kern.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/daemon_kern.c   Sun Dec 15 21:19:17 2002
+@@ -0,0 +1,113 @@
++/*
++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and 
++ * James Leu (jleu@mindspring.net).
++ * Copyright (C) 2001 by various other people who didn't put their name here.
++ * Licensed under the GPL.
++ */
++
++#include "linux/kernel.h"
++#include "linux/init.h"
++#include "linux/netdevice.h"
++#include "linux/etherdevice.h"
++#include "net_kern.h"
++#include "net_user.h"
++#include "daemon.h"
++
++struct daemon_init {
++      char *sock_type;
++      char *ctl_sock;
++};
++
++void daemon_init(struct net_device *dev, void *data)
++{
++      struct uml_net_private *pri;
++      struct daemon_data *dpri;
++      struct daemon_init *init = data;
++
++      init_etherdev(dev, 0);
++      pri = dev->priv;
++      dpri = (struct daemon_data *) pri->user;
++      *dpri = ((struct daemon_data)
++              { .sock_type            = init->sock_type,
++                .ctl_sock             = init->ctl_sock,
++                .ctl_addr             = NULL,
++                .data_addr            = NULL,
++                .local_addr           = NULL,
++                .fd                   = -1,
++                .control              = -1,
++                .dev                  = dev });
++
++      printk("daemon backend (uml_switch version %d) - %s:%s", 
++             SWITCH_VERSION, dpri->sock_type, dpri->ctl_sock);
++      printk("\n");
++}
++
++static int daemon_read(int fd, struct sk_buff **skb, 
++                     struct uml_net_private *lp)
++{
++      *skb = ether_adjust_skb(*skb, ETH_HEADER_OTHER);
++      if(*skb == NULL) return(-ENOMEM);
++      return(net_recvfrom(fd, (*skb)->mac.raw, 
++                          (*skb)->dev->mtu + ETH_HEADER_OTHER));
++}
++
++static int daemon_write(int fd, struct sk_buff **skb,
++                      struct uml_net_private *lp)
++{
++      return(daemon_user_write(fd, (*skb)->data, (*skb)->len, 
++                               (struct daemon_data *) &lp->user));
++}
++
++static struct net_kern_info daemon_kern_info = {
++      .init                   = daemon_init,
++      .protocol               = eth_protocol,
++      .read                   = daemon_read,
++      .write                  = daemon_write,
++};
++
++int daemon_setup(char *str, char **mac_out, void *data)
++{
++      struct daemon_init *init = data;
++      char *remain;
++
++      *init = ((struct daemon_init)
++              { .sock_type            = "unix",
++                .ctl_sock             = "/tmp/uml.ctl" });
++      
++      remain = split_if_spec(str, mac_out, &init->sock_type, &init->ctl_sock,
++                             NULL);
++      if(remain != NULL)
++              printk(KERN_WARNING "daemon_setup : Ignoring data socket "
++                     "specification\n");
++      
++      return(1);
++}
++
++static struct transport daemon_transport = {
++      .list           = LIST_HEAD_INIT(daemon_transport.list),
++      .name           = "daemon",
++      .setup          = daemon_setup,
++      .user           = &daemon_user_info,
++      .kern           = &daemon_kern_info,
++      .private_size   = sizeof(struct daemon_data),
++      .setup_size     = sizeof(struct daemon_init),
++};
++
++static int register_daemon(void)
++{
++      register_transport(&daemon_transport);
++      return(1);
++}
++
++__initcall(register_daemon);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/daemon_user.c um/arch/um/drivers/daemon_user.c
+--- orig/arch/um/drivers/daemon_user.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/daemon_user.c   Fri Jan 17 13:48:59 2003
+@@ -0,0 +1,195 @@
++/*
++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and 
++ * James Leu (jleu@mindspring.net).
++ * Copyright (C) 2001 by various other people who didn't put their name here.
++ * Licensed under the GPL.
++ */
++
++#include <errno.h>
++#include <unistd.h>
++#include <stdint.h>
++#include <sys/socket.h>
++#include <sys/un.h>
++#include <sys/time.h>
++#include "net_user.h"
++#include "daemon.h"
++#include "kern_util.h"
++#include "user_util.h"
++#include "user.h"
++#include "os.h"
++
++#define MAX_PACKET (ETH_MAX_PACKET + ETH_HEADER_OTHER)
++
++enum request_type { REQ_NEW_CONTROL };
++
++#define SWITCH_MAGIC 0xfeedface
++
++struct request_v3 {
++      uint32_t magic;
++      uint32_t version;
++      enum request_type type;
++      struct sockaddr_un sock;
++};
++
++static struct sockaddr_un *new_addr(void *name, int len)
++{
++      struct sockaddr_un *sun;
++
++      sun = um_kmalloc(sizeof(struct sockaddr_un));
++      if(sun == NULL){
++              printk("new_addr: allocation of sockaddr_un failed\n");
++              return(NULL);
++      }
++      sun->sun_family = AF_UNIX;
++      memcpy(sun->sun_path, name, len);
++      return(sun);
++}
++
++static int connect_to_switch(struct daemon_data *pri)
++{
++      struct sockaddr_un *ctl_addr = pri->ctl_addr;
++      struct sockaddr_un *local_addr = pri->local_addr;
++      struct sockaddr_un *sun;
++      struct request_v3 req;
++      int fd, n, err;
++
++      if((pri->control = socket(AF_UNIX, SOCK_STREAM, 0)) < 0){
++              printk("daemon_open : control socket failed, errno = %d\n", 
++                     errno);          
++              return(-errno);
++      }
++
++      if(connect(pri->control, (struct sockaddr *) ctl_addr, 
++                 sizeof(*ctl_addr)) < 0){
++              printk("daemon_open : control connect failed, errno = %d\n",
++                     errno);
++              err = -errno;
++              goto out;
++      }
++
++      if((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0){
++              printk("daemon_open : data socket failed, errno = %d\n", 
++                     errno);
++              err = -errno;
++              goto out;
++      }
++      if(bind(fd, (struct sockaddr *) local_addr, sizeof(*local_addr)) < 0){
++              printk("daemon_open : data bind failed, errno = %d\n", 
++                     errno);
++              err = -errno;
++              goto out_close;
++      }
++
++      sun = um_kmalloc(sizeof(struct sockaddr_un));
++      if(sun == NULL){
++              printk("new_addr: allocation of sockaddr_un failed\n");
++              err = -ENOMEM;
++              goto out_close;
++      }
++
++      req.magic = SWITCH_MAGIC;
++      req.version = SWITCH_VERSION;
++      req.type = REQ_NEW_CONTROL;
++      req.sock = *local_addr;
++      n = write(pri->control, &req, sizeof(req));
++      if(n != sizeof(req)){
++              printk("daemon_open : control setup request returned %d, "
++                     "errno = %d\n", n, errno);
++              err = -ENOTCONN;
++              goto out;               
++      }
++
++      n = read(pri->control, sun, sizeof(*sun));
++      if(n != sizeof(*sun)){
++              printk("daemon_open : read of data socket returned %d, "
++                     "errno = %d\n", n, errno);
++              err = -ENOTCONN;
++              goto out_close;         
++      }
++
++      pri->data_addr = sun;
++      return(fd);
++
++ out_close:
++      close(fd);
++ out:
++      close(pri->control);
++      return(err);
++}
++
++static void daemon_user_init(void *data, void *dev)
++{
++      struct daemon_data *pri = data;
++      struct timeval tv;
++      struct {
++              char zero;
++              int pid;
++              int usecs;
++      } name;
++
++      if(!strcmp(pri->sock_type, "unix"))
++              pri->ctl_addr = new_addr(pri->ctl_sock, 
++                                       strlen(pri->ctl_sock) + 1);
++      name.zero = 0;
++      name.pid = os_getpid();
++      gettimeofday(&tv, NULL);
++      name.usecs = tv.tv_usec;
++      pri->local_addr = new_addr(&name, sizeof(name));
++      pri->dev = dev;
++      pri->fd = connect_to_switch(pri);
++      if(pri->fd < 0){
++              kfree(pri->local_addr);
++              pri->local_addr = NULL;
++      }
++}
++
++static int daemon_open(void *data)
++{
++      struct daemon_data *pri = data;
++      return(pri->fd);
++}
++
++static void daemon_remove(void *data)
++{
++      struct daemon_data *pri = data;
++
++      close(pri->fd);
++      close(pri->control);
++      if(pri->data_addr != NULL) kfree(pri->data_addr);
++      if(pri->ctl_addr != NULL) kfree(pri->ctl_addr);
++      if(pri->local_addr != NULL) kfree(pri->local_addr);
++}
++
++int daemon_user_write(int fd, void *buf, int len, struct daemon_data *pri)
++{
++      struct sockaddr_un *data_addr = pri->data_addr;
++
++      return(net_sendto(fd, buf, len, data_addr, sizeof(*data_addr)));
++}
++
++static int daemon_set_mtu(int mtu, void *data)
++{
++      return(mtu);
++}
++
++struct net_user_info daemon_user_info = {
++      .init           = daemon_user_init,
++      .open           = daemon_open,
++      .close          = NULL,
++      .remove         = daemon_remove,
++      .set_mtu        = daemon_set_mtu,
++      .add_address    = NULL,
++      .delete_address = NULL,
++      .max_packet     = MAX_PACKET - ETH_HEADER_OTHER
++};
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
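
connect_to_switch() above is the whole uml_switch handshake: open a SOCK_STREAM control connection, send a struct request_v3 carrying SWITCH_MAGIC, SWITCH_VERSION and the address of the local SOCK_DGRAM data socket, then read back the sockaddr the switch will use for data frames. A standalone sketch of the same exchange, assuming a switch is listening on /tmp/uml.ctl and that enum request_type is int-sized; attach_to_switch and the demo main are illustrative names, not part of the patch:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/un.h>
    #include <unistd.h>

    #define SWITCH_MAGIC   0xfeedface
    #define SWITCH_VERSION 3

    /* Mirrors struct request_v3 above; REQ_NEW_CONTROL is the first
     * (and only) enumerator, i.e. 0. */
    struct request_v3 {
            uint32_t magic;
            uint32_t version;
            int type;
            struct sockaddr_un sock;        /* where data frames should go */
    };

    /* Returns the connected control fd (keep it open: closing it drops the
     * registration) and fills *data_peer with the switch's data address. */
    static int attach_to_switch(const char *ctl_path,
                                struct sockaddr_un *local,
                                struct sockaddr_un *data_peer)
    {
            struct sockaddr_un ctl = { .sun_family = AF_UNIX };
            struct request_v3 req = { .magic   = SWITCH_MAGIC,
                                      .version = SWITCH_VERSION,
                                      .type    = 0,
                                      .sock    = *local };
            int fd = socket(AF_UNIX, SOCK_STREAM, 0);

            if(fd < 0) return -1;
            strncpy(ctl.sun_path, ctl_path, sizeof(ctl.sun_path) - 1);
            if(connect(fd, (struct sockaddr *) &ctl, sizeof(ctl)) < 0 ||
               write(fd, &req, sizeof(req)) != sizeof(req) ||
               read(fd, data_peer, sizeof(*data_peer)) != sizeof(*data_peer)){
                    close(fd);
                    return -1;
            }
            return fd;
    }

    int main(void)
    {
            struct sockaddr_un local = { .sun_family = AF_UNIX }, peer;
            int fd;

            /* daemon_user_init() builds this name from {0, pid, usecs};
             * any unused abstract-namespace name will do for the demo.
             * A real client also binds a SOCK_DGRAM socket to it first. */
            snprintf(local.sun_path + 1, sizeof(local.sun_path) - 1,
                     "demo-%d", getpid());
            fd = attach_to_switch("/tmp/uml.ctl", &local, &peer);
            if(fd < 0) printf("no uml_switch at /tmp/uml.ctl\n");
            else printf("registered with switch, control fd %d\n", fd);
            return 0;
    }
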
+diff -Naur -X ../exclude-files orig/arch/um/drivers/fd.c um/arch/um/drivers/fd.c
+--- orig/arch/um/drivers/fd.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/fd.c    Sun Dec 15 20:57:25 2002
+@@ -0,0 +1,96 @@
++/* 
++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <termios.h>
++#include "user.h"
++#include "user_util.h"
++#include "chan_user.h"
++
++struct fd_chan {
++      int fd;
++      int raw;
++      struct termios tt;
++      char str[sizeof("1234567890\0")];
++};
++
++void *fd_init(char *str, int device, struct chan_opts *opts)
++{
++      struct fd_chan *data;
++      char *end;
++      int n;
++
++      if(*str != ':'){
++              printk("fd_init : channel type 'fd' must specify a file "
++                     "descriptor\n");
++              return(NULL);
++      }
++      str++;
++      n = strtoul(str, &end, 0);
++      if((*end != '\0') || (end == str)){
++              printk("fd_init : couldn't parse file descriptor '%s'\n", str);
++              return(NULL);
++      }
++      if((data = um_kmalloc(sizeof(*data))) == NULL) return(NULL);
++      *data = ((struct fd_chan) { .fd         = n,
++                                  .raw        = opts->raw });
++      return(data);
++}
++
++int fd_open(int input, int output, int primary, void *d, char **dev_out)
++{
++      struct fd_chan *data = d;
++
++      if(data->raw && isatty(data->fd)){
++              tcgetattr(data->fd, &data->tt);
++              raw(data->fd, 0);
++      }
++      sprintf(data->str, "%d", data->fd);
++      *dev_out = data->str;
++      return(data->fd);
++}
++
++void fd_close(int fd, void *d)
++{
++      struct fd_chan *data = d;
++
++      if(data->raw && isatty(fd)){
++              tcsetattr(fd, TCSAFLUSH, &data->tt);
++              data->raw = 0;
++      }
++}
++
++int fd_console_write(int fd, const char *buf, int n, void *d)
++{
++      struct fd_chan *data = d;
++
++      return(generic_console_write(fd, buf, n, &data->tt));
++}
++
++struct chan_ops fd_ops = {
++      .type           = "fd",
++      .init           = fd_init,
++      .open           = fd_open,
++      .close          = fd_close,
++      .read           = generic_read,
++      .write          = generic_write,
++      .console_write  = fd_console_write,
++      .window_size    = generic_window_size,
++      .free           = generic_free,
++      .winch          = 1,
++};
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
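
fd_init() only accepts a spec of the form ':<number>' (the part after the 'fd' channel type) and rejects anything with trailing characters. The same validation as a tiny standalone sketch; parse_fd_spec is an illustrative name:

    #include <stdio.h>
    #include <stdlib.h>

    /* Same checks as fd_init(): leading ':', then a number parsed with
     * base 0 (so hex works), and no trailing junk.  Returns -1 on error. */
    static int parse_fd_spec(const char *str)
    {
            char *end;
            unsigned long n;

            if(*str != ':') return -1;
            n = strtoul(str + 1, &end, 0);
            if((*end != '\0') || (end == str + 1)) return -1;
            return (int) n;
    }

    int main(void)
    {
            printf("%d %d %d\n", parse_fd_spec(":2"),
                   parse_fd_spec(":0x1f"), parse_fd_spec(":2x"));
            /* prints: 2 31 -1 */
            return 0;
    }
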
+diff -Naur -X ../exclude-files orig/arch/um/drivers/harddog_kern.c um/arch/um/drivers/harddog_kern.c
+--- orig/arch/um/drivers/harddog_kern.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/harddog_kern.c  Sun Dec 15 20:57:42 2002
+@@ -0,0 +1,194 @@
++/* UML hardware watchdog, shamelessly stolen from:
++ *
++ *    SoftDog 0.05:   A Software Watchdog Device
++ *
++ *    (c) Copyright 1996 Alan Cox <alan@redhat.com>, All Rights Reserved.
++ *                            http://www.redhat.com
++ *
++ *    This program is free software; you can redistribute it and/or
++ *    modify it under the terms of the GNU General Public License
++ *    as published by the Free Software Foundation; either version
++ *    2 of the License, or (at your option) any later version.
++ *    
++ *    Neither Alan Cox nor CymruNet Ltd. admit liability nor provide 
++ *    warranty for any of this software. This material is provided 
++ *    "AS-IS" and at no charge.       
++ *
++ *    (c) Copyright 1995    Alan Cox <alan@lxorguk.ukuu.org.uk>
++ *
++ *    Software only watchdog driver. Unlike its big brother the WDT501P
++ *    driver this won't always recover a failed machine.
++ *
++ *  03/96: Angelo Haritsis <ah@doc.ic.ac.uk> :
++ *    Modularised.
++ *    Added soft_margin; use upon insmod to change the timer delay.
++ *    NB: uses same minor as wdt (WATCHDOG_MINOR); we could use separate
++ *        minors.
++ *
++ *  19980911 Alan Cox
++ *    Made SMP safe for 2.3.x
++ *
++ *  20011127 Joel Becker <jlbec@evilplan.org>
++ *    Added soft_noboot; allows testing the softdog trigger without
++ *    requiring a recompile.
++ *    Added WDIOC_GETTIMEOUT and WDIOC_SETTIMEOUT.
++ */
++ 
++#include <linux/module.h>
++#include <linux/config.h>
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <linux/fs.h>
++#include <linux/mm.h>
++#include <linux/miscdevice.h>
++#include <linux/watchdog.h>
++#include <linux/reboot.h>
++#include <linux/smp_lock.h>
++#include <linux/init.h>
++#include <asm/uaccess.h>
++#include "helper.h"
++#include "mconsole.h"
++
++MODULE_LICENSE("GPL");
++
++/* Locked by the BKL in harddog_open and harddog_release */
++static int timer_alive;
++static int harddog_in_fd = -1;
++static int harddog_out_fd = -1;
++
++/*
++ *    Allow only one person to hold it open
++ */
++ 
++extern int start_watchdog(int *in_fd_ret, int *out_fd_ret, char *sock);
++
++static int harddog_open(struct inode *inode, struct file *file)
++{
++      int err;
++      char *sock = NULL;
++
++      lock_kernel();
++      if(timer_alive)
++              return -EBUSY;
++#ifdef CONFIG_HARDDOG_NOWAYOUT         
++      MOD_INC_USE_COUNT;
++#endif
++
++#ifdef CONFIG_MCONSOLE
++      sock = mconsole_notify_socket();
++#endif
++      err = start_watchdog(&harddog_in_fd, &harddog_out_fd, sock);
++      if(err) return(err);
++
++      timer_alive = 1;
++      unlock_kernel();
++      return 0;
++}
++
++extern void stop_watchdog(int in_fd, int out_fd);
++
++static int harddog_release(struct inode *inode, struct file *file)
++{
++      /*
++       *      Shut off the timer.
++       */
++      lock_kernel();
++
++      stop_watchdog(harddog_in_fd, harddog_out_fd);
++      harddog_in_fd = -1;
++      harddog_out_fd = -1;
++
++      timer_alive=0;
++      unlock_kernel();
++      return 0;
++}
++
++extern int ping_watchdog(int fd);
++
++static ssize_t harddog_write(struct file *file, const char *data, size_t len,
++                           loff_t *ppos)
++{
++      /*  Can't seek (pwrite) on this device  */
++      if (ppos != &file->f_pos)
++              return -ESPIPE;
++
++      /*
++       *      Refresh the timer.
++       */
++      if(len)
++              return(ping_watchdog(harddog_out_fd));
++      return 0;
++}
++
++static int harddog_ioctl(struct inode *inode, struct file *file,
++                       unsigned int cmd, unsigned long arg)
++{
++      static struct watchdog_info ident = {
++              WDIOF_SETTIMEOUT,
++              0,
++              "UML Hardware Watchdog"
++      };
++      switch (cmd) {
++              default:
++                      return -ENOTTY;
++              case WDIOC_GETSUPPORT:
++                      if(copy_to_user((struct harddog_info *)arg, &ident,
++                                      sizeof(ident)))
++                              return -EFAULT;
++                      return 0;
++              case WDIOC_GETSTATUS:
++              case WDIOC_GETBOOTSTATUS:
++                      return put_user(0,(int *)arg);
++              case WDIOC_KEEPALIVE:
++                      return(ping_watchdog(harddog_out_fd));
++      }
++}
++
++static struct file_operations harddog_fops = {
++      .owner          = THIS_MODULE,
++      .write          = harddog_write,
++      .ioctl          = harddog_ioctl,
++      .open           = harddog_open,
++      .release        = harddog_release,
++};
++
++static struct miscdevice harddog_miscdev = {
++      .minor          = WATCHDOG_MINOR,
++      .name           = "watchdog",
++      .fops           = &harddog_fops,
++};
++
++static char banner[] __initdata = KERN_INFO "UML Watchdog Timer\n";
++
++static int __init harddog_init(void)
++{
++      int ret;
++
++      ret = misc_register(&harddog_miscdev);
++
++      if (ret)
++              return ret;
++
++      printk(banner);
++
++      return(0);
++}
++
++static void __exit harddog_exit(void)
++{
++      misc_deregister(&harddog_miscdev);
++}
++
++module_init(harddog_init);
++module_exit(harddog_exit);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
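
From inside the UML instance this behaves like any other watchdog character device: only one opener is allowed, any non-empty write (or WDIOC_KEEPALIVE) pings the uml_watchdog helper, and releasing the device stops it. A hypothetical client loop, meant to run inside UML rather than on the host:

    #include <fcntl.h>
    #include <linux/watchdog.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/dev/watchdog", O_WRONLY);

            if(fd < 0) return 1;
            for(;;){
                    write(fd, "\n", 1);              /* harddog_write() ping */
                    ioctl(fd, WDIOC_KEEPALIVE, 0);   /* same effect via ioctl */
                    sleep(10);
            }
    }
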
+diff -Naur -X ../exclude-files orig/arch/um/drivers/harddog_user.c um/arch/um/drivers/harddog_user.c
+--- orig/arch/um/drivers/harddog_user.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/harddog_user.c  Wed Dec  4 16:38:05 2002
+@@ -0,0 +1,137 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <unistd.h>
++#include <errno.h>
++#include "user_util.h"
++#include "user.h"
++#include "helper.h"
++#include "mconsole.h"
++#include "os.h"
++#include "choose-mode.h"
++#include "mode.h"
++
++struct dog_data {
++      int stdin;
++      int stdout;
++      int close_me[2];
++};
++
++static void pre_exec(void *d)
++{
++      struct dog_data *data = d;
++
++      dup2(data->stdin, 0);
++      dup2(data->stdout, 1);
++      dup2(data->stdout, 2);
++      close(data->stdin);
++      close(data->stdout);
++      close(data->close_me[0]);
++      close(data->close_me[1]);
++}
++
++int start_watchdog(int *in_fd_ret, int *out_fd_ret, char *sock)
++{
++      struct dog_data data;
++      int in_fds[2], out_fds[2], pid, n, err;
++      char pid_buf[sizeof("nnnnn\0")], c;
++      char *pid_args[] = { "/usr/bin/uml_watchdog", "-pid", pid_buf, NULL };
++      char *mconsole_args[] = { "/usr/bin/uml_watchdog", "-mconsole", NULL, 
++                                NULL };
++      char **args = NULL;
++
++      err = os_pipe(in_fds, 1, 0);
++      if(err){
++              printk("harddog_open - os_pipe failed, errno = %d\n", -err);
++              return(err);
++      }
++
++      err = os_pipe(out_fds, 1, 0);
++      if(err){
++              printk("harddog_open - os_pipe failed, errno = %d\n", -err);
++              return(err);
++      }
++
++      data.stdin = out_fds[0];
++      data.stdout = in_fds[1];
++      data.close_me[0] = out_fds[1];
++      data.close_me[1] = in_fds[0];
++
++      if(sock != NULL){
++              mconsole_args[2] = sock;
++              args = mconsole_args;
++      }
++      else {
++              /* XXX The os_getpid() is not SMP correct */
++              sprintf(pid_buf, "%d", CHOOSE_MODE(tracing_pid, os_getpid()));
++              args = pid_args;
++      }
++
++      pid = run_helper(pre_exec, &data, args, NULL);
++
++      close(out_fds[0]);
++      close(in_fds[1]);
++
++      if(pid < 0){
++              err = -pid;
++              printk("harddog_open - run_helper failed, errno = %d\n", err);
++              goto out;
++      }
++
++      n = read(in_fds[0], &c, sizeof(c));
++      if(n == 0){
++              printk("harddog_open - EOF on watchdog pipe\n");
++              helper_wait(pid);
++              err = -EIO;
++              goto out;
++      }
++      else if(n < 0){
++              printk("harddog_open - read of watchdog pipe failed, "
++                     "errno = %d\n", errno);
++              helper_wait(pid);
++              err = -errno;
++              goto out;
++      }
++      *in_fd_ret = in_fds[0];
++      *out_fd_ret = out_fds[1];
++      return(0);
++ out:
++      close(out_fds[1]);
++      close(in_fds[0]);
++      return(err);
++}
++
++void stop_watchdog(int in_fd, int out_fd)
++{
++      close(in_fd);
++      close(out_fd);
++}
++
++int ping_watchdog(int fd)
++{
++      int n;
++      char c = '\n';
++
++      n = write(fd, &c, sizeof(c));
++      if(n < sizeof(c)){
++              printk("ping_watchdog - write failed, errno = %d\n",
++                     errno);
++              return(-errno);
++      }
++      return 1;
++
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
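
The kernel-to-helper protocol set up by start_watchdog() is deliberately tiny: one synchronization byte from the helper at startup, then one '\n' per ping written down the out_fd pipe by ping_watchdog(). A self-contained sketch of that ping convention over an ordinary pipe; ping() and main are illustrative, and no uml_watchdog is involved:

    #include <errno.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Same wire format as ping_watchdog(): a single '\n' per ping;
     * anything short of one byte written counts as failure. */
    static int ping(int fd)
    {
            char c = '\n';

            if(write(fd, &c, sizeof(c)) != sizeof(c))
                    return -errno;
            return 1;
    }

    int main(void)
    {
            int fds[2];
            char buf[8];

            if(pipe(fds)) return 1;
            ping(fds[1]);
            ping(fds[1]);
            printf("helper side saw %zd ping bytes\n",
                   read(fds[0], buf, sizeof(buf)));
            return 0;
    }
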
+diff -Naur -X ../exclude-files orig/arch/um/drivers/hostaudio_kern.c um/arch/um/drivers/hostaudio_kern.c
+--- orig/arch/um/drivers/hostaudio_kern.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/hostaudio_kern.c        Fri Mar 28 21:57:16 2003
+@@ -0,0 +1,330 @@
++/* 
++ * Copyright (C) 2002 Steve Schmidtke 
++ * Licensed under the GPL
++ */
++
++#include "linux/config.h"
++#include "linux/module.h"
++#include "linux/version.h"
++#include "linux/init.h"
++#include "linux/slab.h"
++#include "linux/fs.h"
++#include "linux/sound.h"
++#include "linux/soundcard.h"
++#include "asm/uaccess.h"
++#include "kern_util.h"
++#include "init.h"
++#include "hostaudio.h"
++
++/* Only changed from linux_main at boot time */
++char *dsp = HOSTAUDIO_DEV_DSP;
++char *mixer = HOSTAUDIO_DEV_MIXER;
++
++#ifndef MODULE
++static int set_dsp(char *name, int *add)
++{
++      dsp = name;
++      return(0);
++}
++
++__uml_setup("dsp=", set_dsp,
++"dsp=<dsp device>\n"
++"    This is used to specify the host dsp device to the hostaudio driver.\n"
++"    The default is \"" HOSTAUDIO_DEV_DSP "\".\n\n"
++);
++
++static int set_mixer(char *name, int *add)
++{
++      mixer = name;
++      return(0);
++}
++
++__uml_setup("mixer=", set_mixer,
++"mixer=<mixer device>\n"
++"    This is used to specify the host mixer device to the hostaudio driver.\n"
++"    The default is \"" HOSTAUDIO_DEV_MIXER "\".\n\n"
++);
++#endif
++
++/* /dev/dsp file operations */
++
++static ssize_t hostaudio_read(struct file *file, char *buffer, size_t count, 
++                            loff_t *ppos)
++{
++        struct hostaudio_state *state = file->private_data;
++      void *kbuf;
++      int err;
++
++#ifdef DEBUG
++        printk("hostaudio: read called, count = %d\n", count);
++#endif
++
++      kbuf = kmalloc(count, GFP_KERNEL);
++      if(kbuf == NULL)
++              return(-ENOMEM);
++
++        err = hostaudio_read_user(state, kbuf, count, ppos);
++      if(err < 0)
++              goto out;
++
++      if(copy_to_user(buffer, kbuf, err))
++              err = -EFAULT;
++
++ out:
++      kfree(kbuf);
++      return(err);
++}
++
++static ssize_t hostaudio_write(struct file *file, const char *buffer, 
++                             size_t count, loff_t *ppos)
++{
++        struct hostaudio_state *state = file->private_data;
++      void *kbuf;
++      int err;
++
++#ifdef DEBUG
++        printk("hostaudio: write called, count = %d\n", count);
++#endif
++
++      kbuf = kmalloc(count, GFP_KERNEL);
++      if(kbuf == NULL)
++              return(-ENOMEM);
++
++      err = -EFAULT;
++      if(copy_from_user(kbuf, buffer, count))
++              goto out;
++
++        err = hostaudio_write_user(state, kbuf, count, ppos);
++      if(err < 0)
++              goto out;
++
++ out:
++      kfree(kbuf);
++      return(err);
++}
++
++static unsigned int hostaudio_poll(struct file *file, 
++                                 struct poll_table_struct *wait)
++{
++        unsigned int mask = 0;
++
++#ifdef DEBUG
++        printk("hostaudio: poll called (unimplemented)\n");
++#endif
++
++        return(mask);
++}
++
++static int hostaudio_ioctl(struct inode *inode, struct file *file, 
++                         unsigned int cmd, unsigned long arg)
++{
++        struct hostaudio_state *state = file->private_data;
++      unsigned long data = 0;
++      int err;
++
++#ifdef DEBUG
++        printk("hostaudio: ioctl called, cmd = %u\n", cmd);
++#endif
++      switch(cmd){
++      case SNDCTL_DSP_SPEED:
++      case SNDCTL_DSP_STEREO:
++      case SNDCTL_DSP_GETBLKSIZE:
++      case SNDCTL_DSP_CHANNELS:
++      case SNDCTL_DSP_SUBDIVIDE:
++      case SNDCTL_DSP_SETFRAGMENT:
++              if(get_user(data, (int *) arg))
++                      return(-EFAULT);
++              break;
++      default:
++              break;
++      }
++
++        err = hostaudio_ioctl_user(state, cmd, (unsigned long) &data);
++
++      switch(cmd){
++      case SNDCTL_DSP_SPEED:
++      case SNDCTL_DSP_STEREO:
++      case SNDCTL_DSP_GETBLKSIZE:
++      case SNDCTL_DSP_CHANNELS:
++      case SNDCTL_DSP_SUBDIVIDE:
++      case SNDCTL_DSP_SETFRAGMENT:
++              if(put_user(data, (int *) arg))
++                      return(-EFAULT);
++              break;
++      default:
++              break;
++      }
++
++      return(err);
++}
++
++static int hostaudio_open(struct inode *inode, struct file *file)
++{
++        struct hostaudio_state *state;
++        int r = 0, w = 0;
++        int ret;
++
++#ifdef DEBUG
++        printk("hostaudio: open called (host: %s)\n", dsp);
++#endif
++
++        state = kmalloc(sizeof(struct hostaudio_state), GFP_KERNEL);
++        if(state == NULL) return(-ENOMEM);
++
++        if(file->f_mode & FMODE_READ) r = 1;
++        if(file->f_mode & FMODE_WRITE) w = 1;
++
++        ret = hostaudio_open_user(state, r, w, dsp);
++        if(ret < 0){
++              kfree(state);
++              return(ret);
++        }
++
++        file->private_data = state;
++        return(0);
++}
++
++static int hostaudio_release(struct inode *inode, struct file *file)
++{
++        struct hostaudio_state *state = file->private_data;
++        int ret;
++
++#ifdef DEBUG
++        printk("hostaudio: release called\n");
++#endif
++
++        ret = hostaudio_release_user(state);
++        kfree(state);
++
++        return(ret);
++}
++
++/* /dev/mixer file operations */
++
++static int hostmixer_ioctl_mixdev(struct inode *inode, struct file *file, 
++                                unsigned int cmd, unsigned long arg)
++{
++        struct hostmixer_state *state = file->private_data;
++
++#ifdef DEBUG
++        printk("hostmixer: ioctl called\n");
++#endif
++
++        return(hostmixer_ioctl_mixdev_user(state, cmd, arg));
++}
++
++static int hostmixer_open_mixdev(struct inode *inode, struct file *file)
++{
++        struct hostmixer_state *state;
++        int r = 0, w = 0;
++        int ret;
++
++#ifdef DEBUG
++        printk("hostmixer: open called (host: %s)\n", mixer);
++#endif
++
++        state = kmalloc(sizeof(struct hostmixer_state), GFP_KERNEL);
++        if(state == NULL) return(-ENOMEM);
++
++        if(file->f_mode & FMODE_READ) r = 1;
++        if(file->f_mode & FMODE_WRITE) w = 1;
++
++        ret = hostmixer_open_mixdev_user(state, r, w, mixer);
++        
++        if(ret < 0){
++              kfree(state);
++              return(ret);
++        }
++
++        file->private_data = state;
++        return(0);
++}
++
++static int hostmixer_release(struct inode *inode, struct file *file)
++{
++        struct hostmixer_state *state = file->private_data;
++      int ret;
++
++#ifdef DEBUG
++        printk("hostmixer: release called\n");
++#endif
++
++        ret = hostmixer_release_mixdev_user(state);
++        kfree(state);
++
++        return(ret);
++}
++
++
++/* kernel module operations */
++
++static struct file_operations hostaudio_fops = {
++        .owner          = THIS_MODULE,
++        .llseek         = no_llseek,
++        .read           = hostaudio_read,
++        .write          = hostaudio_write,
++        .poll           = hostaudio_poll,
++        .ioctl          = hostaudio_ioctl,
++        .mmap           = NULL,
++        .open           = hostaudio_open,
++        .release        = hostaudio_release,
++};
++
++static struct file_operations hostmixer_fops = {
++        .owner          = THIS_MODULE,
++        .llseek         = no_llseek,
++        .ioctl          = hostmixer_ioctl_mixdev,
++        .open           = hostmixer_open_mixdev,
++        .release        = hostmixer_release,
++};
++
++struct {
++      int dev_audio;
++      int dev_mixer;
++} module_data;
++
++MODULE_AUTHOR("Steve Schmidtke");
++MODULE_DESCRIPTION("UML Audio Relay");
++MODULE_LICENSE("GPL");
++
++static int __init hostaudio_init_module(void)
++{
++        printk(KERN_INFO "UML Audio Relay (host dsp = %s, host mixer = %s)\n",
++             dsp, mixer);
++
++      module_data.dev_audio = register_sound_dsp(&hostaudio_fops, -1);
++        if(module_data.dev_audio < 0){
++                printk(KERN_ERR "hostaudio: couldn't register DSP device!\n");
++                return -ENODEV;
++        }
++
++      module_data.dev_mixer = register_sound_mixer(&hostmixer_fops, -1);
++        if(module_data.dev_mixer < 0){
++                printk(KERN_ERR "hostmixer: couldn't register mixer "
++                     "device!\n");
++                unregister_sound_dsp(module_data.dev_audio);
++                return -ENODEV;
++        }
++
++        return 0;
++}
++
++static void __exit hostaudio_cleanup_module (void)
++{
++       unregister_sound_mixer(module_data.dev_mixer);
++       unregister_sound_dsp(module_data.dev_audio);
++}
++
++module_init(hostaudio_init_module);
++module_exit(hostaudio_cleanup_module);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/hostaudio_user.c um/arch/um/drivers/hostaudio_user.c
+--- orig/arch/um/drivers/hostaudio_user.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/hostaudio_user.c        Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,149 @@
++/* 
++ * Copyright (C) 2002 Steve Schmidtke 
++ * Licensed under the GPL
++ */
++
++#include <sys/types.h>
++#include <sys/stat.h>
++#include <sys/ioctl.h>
++#include <fcntl.h>
++#include <unistd.h>
++#include <errno.h>
++#include "hostaudio.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "user.h"
++#include "os.h"
++
++/* /dev/dsp file operations */
++
++ssize_t hostaudio_read_user(struct hostaudio_state *state, char *buffer, 
++                          size_t count, loff_t *ppos)
++{
++      ssize_t ret;
++
++#ifdef DEBUG
++        printk("hostaudio: read_user called, count = %d\n", count);
++#endif
++
++        ret = read(state->fd, buffer, count);
++
++        if(ret < 0) return(-errno);
++        return(ret);
++}
++
++ssize_t hostaudio_write_user(struct hostaudio_state *state, const char *buffer,
++                           size_t count, loff_t *ppos)
++{
++      ssize_t ret;
++
++#ifdef DEBUG
++        printk("hostaudio: write_user called, count = %d\n", count);
++#endif
++
++        ret = write(state->fd, buffer, count);
++
++        if(ret < 0) return(-errno);
++        return(ret);
++}
++
++int hostaudio_ioctl_user(struct hostaudio_state *state, unsigned int cmd, 
++                       unsigned long arg)
++{
++      int ret;
++#ifdef DEBUG
++        printk("hostaudio: ioctl_user called, cmd = %u\n", cmd);
++#endif
++
++        ret = ioctl(state->fd, cmd, arg);
++      
++        if(ret < 0) return(-errno);
++        return(ret);
++}
++
++int hostaudio_open_user(struct hostaudio_state *state, int r, int w, char *dsp)
++{
++#ifdef DEBUG
++        printk("hostaudio: open_user called\n");
++#endif
++
++        state->fd = os_open_file(dsp, of_set_rw(OPENFLAGS(), r, w), 0);
++
++        if(state->fd >= 0) return(0);
++
++        printk("hostaudio_open_user failed to open '%s', errno = %d\n",
++             dsp, errno);
++        
++        return(-errno); 
++}
++
++int hostaudio_release_user(struct hostaudio_state *state)
++{
++#ifdef DEBUG
++        printk("hostaudio: release called\n");
++#endif
++        if(state->fd >= 0){
++              close(state->fd);
++              state->fd=-1;
++        }
++
++        return(0);
++}
++
++/* /dev/mixer file operations */
++
++int hostmixer_ioctl_mixdev_user(struct hostmixer_state *state, 
++                              unsigned int cmd, unsigned long arg)
++{
++      int ret;
++#ifdef DEBUG
++        printk("hostmixer: ioctl_user called cmd = %u\n",cmd);
++#endif
++
++        ret = ioctl(state->fd, cmd, arg);
++      if(ret < 0) 
++              return(-errno);
++      return(ret);
++}
++
++int hostmixer_open_mixdev_user(struct hostmixer_state *state, int r, int w,
++                             char *mixer)
++{
++#ifdef DEBUG
++        printk("hostmixer: open_user called\n");
++#endif
++
++        state->fd = os_open_file(mixer, of_set_rw(OPENFLAGS(), r, w), 0);
++
++        if(state->fd >= 0) return(0);
++
++        printk("hostmixer_open_mixdev_user failed to open '%s', errno = %d\n",
++             mixer, errno);
++        
++        return(-errno); 
++}
++
++int hostmixer_release_mixdev_user(struct hostmixer_state *state)
++{
++#ifdef DEBUG
++        printk("hostmixer: release_user called\n");
++#endif
++
++        if(state->fd >= 0){
++              close(state->fd);
++              state->fd = -1;
++        }
++
++        return 0;
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/line.c um/arch/um/drivers/line.c
+--- orig/arch/um/drivers/line.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/line.c  Wed Mar 26 15:09:44 2003
+@@ -0,0 +1,589 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/sched.h"
++#include "linux/slab.h"
++#include "linux/list.h"
++#include "linux/devfs_fs_kernel.h"
++#include "asm/irq.h"
++#include "asm/uaccess.h"
++#include "chan_kern.h"
++#include "irq_user.h"
++#include "line.h"
++#include "kern.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "os.h"
++
++#define LINE_BUFSIZE 4096
++
++void line_interrupt(int irq, void *data, struct pt_regs *unused)
++{
++      struct line *dev = data;
++
++      if(dev->count > 0) 
++              chan_interrupt(&dev->chan_list, &dev->task, dev->tty, irq, 
++                             dev);
++}
++
++void line_timer_cb(void *arg)
++{
++      struct line *dev = arg;
++
++      line_interrupt(dev->driver->read_irq, dev, NULL);
++}
++
++static void buffer_data(struct line *line, const char *buf, int len)
++{
++      int end;
++
++      if(line->buffer == NULL){
++              line->buffer = kmalloc(LINE_BUFSIZE, GFP_ATOMIC);
++              if(line->buffer == NULL){
++                      printk("buffer_data - atomic allocation failed\n");
++                      return;
++              }
++              line->head = line->buffer;
++              line->tail = line->buffer;
++      }
++      end = line->buffer + LINE_BUFSIZE - line->tail;
++      if(len < end){
++              memcpy(line->tail, buf, len);
++              line->tail += len;
++      }
++      else {
++              memcpy(line->tail, buf, end);
++              buf += end;
++              len -= end;
++              memcpy(line->buffer, buf, len);
++              line->tail = line->buffer + len;
++      }
++}
++
++static int flush_buffer(struct line *line)
++{
++      int n, count;
++
++      if((line->buffer == NULL) || (line->head == line->tail)) return(1);
++
++      if(line->tail < line->head){
++              count = line->buffer + LINE_BUFSIZE - line->head;
++              n = write_chan(&line->chan_list, line->head, count,
++                             line->driver->write_irq);
++              if(n < 0) return(n);
++              if(n == count) line->head = line->buffer;
++              else {
++                      line->head += n;
++                      return(0);
++              }
++      }
++
++      count = line->tail - line->head;
++      n = write_chan(&line->chan_list, line->head, count, 
++                     line->driver->write_irq);
++      if(n < 0) return(n);
++
++      line->head += n;
++      return(line->head == line->tail);
++}
++
++int line_write(struct line *lines, struct tty_struct *tty, int from_user,
++             const char *buf, int len)
++{
++      struct line *line;
++      char *new;
++      unsigned long flags;
++      int n, err, i;
++
++      if(tty->stopped) return 0;
++
++      if(from_user){
++              new = kmalloc(len, GFP_KERNEL);
++              if(new == NULL)
++                      return(0);
++              n = copy_from_user(new, buf, len);
++              if(n == len)
++                      return(-EFAULT);
++              buf = new;
++      }
++
++      i = minor(tty->device) - tty->driver.minor_start;
++      line = &lines[i];
++
++      down(&line->sem);
++      if(line->head != line->tail){
++              local_irq_save(flags);
++              buffer_data(line, buf, len);
++              err = flush_buffer(line);
++              local_irq_restore(flags);
++              if(err <= 0)
++                      goto out;
++      }
++      else {
++              n = write_chan(&line->chan_list, buf, len, 
++                             line->driver->write_irq);
++              if(n < 0){
++                      len = n;
++                      goto out;
++              }
++              if(n < len)
++                      buffer_data(line, buf + n, len - n);
++      }
++ out:
++      up(&line->sem);
++
++      if(from_user)
++              kfree(buf);
++      return(len);
++}
++
++void line_write_interrupt(int irq, void *data, struct pt_regs *unused)
++{
++      struct line *dev = data;
++      struct tty_struct *tty = dev->tty;
++      int err;
++
++      err = flush_buffer(dev);
++      if(err == 0) return;
++      else if(err < 0){
++              dev->head = dev->buffer;
++              dev->tail = dev->buffer;
++      }
++
++      if(tty == NULL) return;
++
++      if(test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags) &&
++         (tty->ldisc.write_wakeup != NULL))
++              (tty->ldisc.write_wakeup)(tty);
++      
++      /* BLOCKING mode
++       * In blocking mode, everything sleeps on tty->write_wait.
++       * Sleeping in the console driver would break non-blocking
++       * writes.
++       */
++
++      if (waitqueue_active(&tty->write_wait))
++              wake_up_interruptible(&tty->write_wait);
++
++}
++
++int line_write_room(struct tty_struct *tty)
++{
++      struct line *dev = tty->driver_data;
++      int n;
++
++      if(dev->buffer == NULL) return(LINE_BUFSIZE - 1);
++
++      n = dev->head - dev->tail;
++      if(n <= 0) n = LINE_BUFSIZE + n;
++      return(n - 1);
++}
++
++int line_setup_irq(int fd, int input, int output, void *data)
++{
++      struct line *line = data;
++      struct line_driver *driver = line->driver;
++      int err = 0, flags = SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM;
++
++      if(input) err = um_request_irq(driver->read_irq, fd, IRQ_READ, 
++                                     line_interrupt, flags, 
++                                     driver->read_irq_name, line);
++      if(err) return(err);
++      if(output) err = um_request_irq(driver->write_irq, fd, IRQ_WRITE, 
++                                      line_write_interrupt, flags, 
++                                      driver->write_irq_name, line);
++      line->have_irq = 1;
++      return(err);
++}
++
++void line_disable(struct line *line, int current_irq)
++{
++      if(!line->have_irq) return;
++
++      if(line->driver->read_irq == current_irq)
++              free_irq_later(line->driver->read_irq, line);
++      else
++              free_irq(line->driver->read_irq, line);
++
++      if(line->driver->write_irq == current_irq)
++              free_irq_later(line->driver->write_irq, line);
++      else
++              free_irq(line->driver->write_irq, line);
++
++      line->have_irq = 0;
++}
++
++int line_open(struct line *lines, struct tty_struct *tty,
++            struct chan_opts *opts)
++{
++      struct line *line;
++      int n, err = 0;
++
++      if(tty == NULL) n = 0;
++      else n = minor(tty->device) - tty->driver.minor_start;
++      line = &lines[n];
++
++      down(&line->sem);
++      if(line->count == 0){
++              if(!line->valid){
++                      err = -ENODEV;
++                      goto out;
++              }
++              if(list_empty(&line->chan_list)){
++                      err = parse_chan_pair(line->init_str, &line->chan_list,
++                                            line->init_pri, n, opts);
++                      if(err) goto out;
++                      err = open_chan(&line->chan_list);
++                      if(err) goto out;
++              }
++              enable_chan(&line->chan_list, line);
++              INIT_TQUEUE(&line->task, line_timer_cb, line);
++      }
++
++      if(!line->sigio){
++              chan_enable_winch(&line->chan_list, line);
++              line->sigio = 1;
++      }
++
++      /* This is outside the if because the initial console is opened
++       * with tty == NULL
++       */
++      line->tty = tty;
++
++      if(tty != NULL){
++              tty->driver_data = line;
++              chan_window_size(&line->chan_list, &tty->winsize.ws_row, 
++                               &tty->winsize.ws_col);
++      }
++
++      line->count++;
++ out:
++      up(&line->sem);
++      return(err);
++}
++
++void line_close(struct line *lines, struct tty_struct *tty)
++{
++      struct line *line;
++      int n;
++
++      if(tty == NULL) n = 0;
++      else n = minor(tty->device) - tty->driver.minor_start;
++      line = &lines[n];
++
++      down(&line->sem);
++      line->count--;
++
++      /* I don't like this, but I can't think of anything better.  What's
++       * going on is that the tty is in the process of being closed for
++       * the last time.  Its count hasn't been dropped yet, so it's still
++       * at 1.  This may happen when line->count != 0 because of the initial
++       * console open (without a tty) bumping it up to 1.
++       */
++      if((line->tty != NULL) && (line->tty->count == 1))
++              line->tty = NULL;
++      if(line->count == 0)
++              line_disable(line, -1);
++      up(&line->sem);
++}
++
++void close_lines(struct line *lines, int nlines)
++{
++      int i;
++
++      for(i = 0; i < nlines; i++)
++              close_chan(&lines[i].chan_list);
++}
++
++int line_setup(struct line *lines, int num, char *init, int all_allowed)
++{
++      int i, n;
++      char *end;
++
++      if(*init == '=') n = -1;
++      else {
++              n = simple_strtoul(init, &end, 0);
++              if(*end != '='){
++                      printk(KERN_ERR "line_setup failed to parse \"%s\"\n", 
++                             init);
++                      return(1);
++              }
++              init = end;
++      }
++      init++;
++      if((n >= 0) && (n >= num)){
++              printk("line_setup - %d out of range ((0 ... %d) allowed)\n",
++                     n, num);
++              return(1);
++      }
++      else if(n >= 0){
++              if(lines[n].count > 0){
++                      printk("line_setup - device %d is open\n", n);
++                      return(1);
++              }
++              if(lines[n].init_pri <= INIT_ONE){
++                      lines[n].init_pri = INIT_ONE;
++                      if(!strcmp(init, "none")) lines[n].valid = 0;
++                      else {
++                              lines[n].init_str = init;
++                              lines[n].valid = 1;
++                      }       
++              }
++      }
++      else if(!all_allowed){
++              printk("line_setup - can't configure all devices from "
++                     "mconsole\n");
++              return(1);
++      }
++      else {
++              for(i = 0; i < num; i++){
++                      if(lines[i].init_pri <= INIT_ALL){
++                              lines[i].init_pri = INIT_ALL;
++                              if(!strcmp(init, "none")) lines[i].valid = 0;
++                              else {
++                                      lines[i].init_str = init;
++                                      lines[i].valid = 1;
++                              }
++                      }
++              }
++      }
++      return(0);
++}
++
++int line_config(struct line *lines, int num, char *str)
++{
++      char *new = uml_strdup(str);
++
++      if(new == NULL){
++              printk("line_config - uml_strdup failed\n");
++              return(-ENOMEM);
++      }
++      return(line_setup(lines, num, new, 0));
++}
++
++int line_get_config(char *name, struct line *lines, int num, char *str, 
++                  int size, char **error_out)
++{
++      struct line *line;
++      char *end;
++      int dev, n = 0;
++
++      dev = simple_strtoul(name, &end, 0);
++      if((*end != '\0') || (end == name)){
++              *error_out = "line_get_config failed to parse device number";
++              return(0);
++      }
++
++      if((dev < 0) || (dev >= num)){
++              *error_out = "device number out of range";
++              return(0);
++      }
++
++      line = &lines[dev];
++
++      down(&line->sem);
++      if(!line->valid)
++              CONFIG_CHUNK(str, size, n, "none", 1);
++      else if(line->count == 0)
++              CONFIG_CHUNK(str, size, n, line->init_str, 1);
++      else n = chan_config_string(&line->chan_list, str, size, error_out);
++      up(&line->sem);
++
++      return(n);
++}
++
++int line_remove(struct line *lines, int num, char *str)
++{
++      char config[sizeof("conxxxx=none\0")];
++
++      sprintf(config, "%s=none", str);
++      return(line_setup(lines, num, config, 0));
++}
++
++void line_register_devfs(struct lines *set, struct line_driver *line_driver, 
++                       struct tty_driver *driver, struct line *lines,
++                       int nlines)
++{
++      int err, i, n;
++      char *from, *to;
++
++      driver->driver_name = line_driver->name;
++      driver->name = line_driver->devfs_name;
++      driver->major = line_driver->major;
++      driver->minor_start = line_driver->minor_start;
++      driver->type = line_driver->type;
++      driver->subtype = line_driver->subtype;
++      driver->magic = TTY_DRIVER_MAGIC;
++      driver->flags = TTY_DRIVER_REAL_RAW;
++
++      n = set->num;
++      driver->num = n;
++      driver->table = kmalloc(n * sizeof(driver->table[0]), GFP_KERNEL);
++      driver->termios = kmalloc(n * sizeof(driver->termios[0]), GFP_KERNEL);
++      driver->termios_locked = kmalloc(n * sizeof(driver->termios_locked[0]),
++                                       GFP_KERNEL);
++      if((driver->table == NULL) || (driver->termios == NULL) ||
++         (driver->termios_locked == NULL))
++              panic("Failed to allocate driver table");
++
++      memset(driver->table, 0, n * sizeof(driver->table[0]));
++      memset(driver->termios, 0, n * sizeof(driver->termios[0]));
++      memset(driver->termios_locked, 0, 
++             n * sizeof(driver->termios_locked[0]));
++
++      driver->write_room = line_write_room;
++      driver->init_termios = tty_std_termios;
++
++      if (tty_register_driver(driver))
++              panic("line_register_devfs : Couldn't register driver\n");
++
++      from = line_driver->symlink_from;
++      to = line_driver->symlink_to;
++      err = devfs_mk_symlink(NULL, from, 0, to, NULL, NULL);
++      if(err) printk("Symlink creation from /dev/%s to /dev/%s "
++                     "returned %d\n", from, to, err);
++
++      for(i = 0; i < nlines; i++){
++              if(!lines[i].valid) 
++                      tty_unregister_devfs(driver, driver->minor_start + i);
++      }
++
++      mconsole_register_dev(&line_driver->mc);
++}
++
++void lines_init(struct line *lines, int nlines)
++{
++      struct line *line;
++      int i;
++
++      for(i = 0; i < nlines; i++){
++              line = &lines[i];
++              INIT_LIST_HEAD(&line->chan_list);
++              sema_init(&line->sem, 1);
++              if(line->init_str != NULL){
++                      line->init_str = uml_strdup(line->init_str);
++                      if(line->init_str == NULL)
++                              printk("lines_init - uml_strdup returned "
++                                     "NULL\n");
++              }
++      }
++}
++
++struct winch {
++      struct list_head list;
++      int fd;
++      int tty_fd;
++      int pid;
++      struct line *line;
++};
++
++void winch_interrupt(int irq, void *data, struct pt_regs *unused)
++{
++      struct winch *winch = data;
++      struct tty_struct *tty;
++      int err;
++      char c;
++
++      if(winch->fd != -1){
++              err = generic_read(winch->fd, &c, NULL);
++              if(err < 0){
++                      if(err != -EAGAIN){
++                              printk("winch_interrupt : read failed, "
++                                     "errno = %d\n", -err);
++                              printk("fd %d is losing SIGWINCH support\n", 
++                                     winch->tty_fd);
++                              return;
++                      }
++                      goto out;
++              }
++      }
++      tty = winch->line->tty;
++      if(tty != NULL){
++              chan_window_size(&winch->line->chan_list, 
++                               &tty->winsize.ws_row, 
++                               &tty->winsize.ws_col);
++              kill_pg(tty->pgrp, SIGWINCH, 1);
++      }
++ out:
++      if(winch->fd != -1)
++              reactivate_fd(winch->fd, WINCH_IRQ);
++}
++
++DECLARE_MUTEX(winch_handler_sem);
++LIST_HEAD(winch_handlers);
++
++void register_winch_irq(int fd, int tty_fd, int pid, void *line)
++{
++      struct winch *winch;
++
++      down(&winch_handler_sem);
++      winch = kmalloc(sizeof(*winch), GFP_KERNEL);
++      if(winch == NULL){
++              printk("register_winch_irq - kmalloc failed\n");
++              goto out;
++      }
++      *winch = ((struct winch) { .list        = LIST_HEAD_INIT(winch->list),
++                                 .fd          = fd,
++                                 .tty_fd      = tty_fd,
++                                 .pid         = pid,
++                                 .line        = line });
++      list_add(&winch->list, &winch_handlers);
++      if(um_request_irq(WINCH_IRQ, fd, IRQ_READ, winch_interrupt, 
++                        SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM, 
++                        "winch", winch) < 0)
++              printk("register_winch_irq - failed to register IRQ\n");
++ out:
++      up(&winch_handler_sem);
++}
++
++static void winch_cleanup(void)
++{
++      struct list_head *ele;
++      struct winch *winch;
++
++      list_for_each(ele, &winch_handlers){
++              winch = list_entry(ele, struct winch, list);
++              if(winch->fd != -1){
++                      deactivate_fd(winch->fd, WINCH_IRQ);
++                      close(winch->fd);
++              }
++              if(winch->pid != -1) 
++                      os_kill_process(winch->pid, 1);
++      }
++}
++
++__uml_exitcall(winch_cleanup);
++
++char *add_xterm_umid(char *base)
++{
++      char *umid, *title;
++      int len;
++
++      umid = get_umid(1);
++      if(umid == NULL) return(base);
++      
++      len = strlen(base) + strlen(" ()") + strlen(umid) + 1;
++      title = kmalloc(len, GFP_KERNEL);
++      if(title == NULL){
++              printk("Failed to allocate buffer for xterm title\n");
++              return(base);
++      }
++
++      strncpy(title, base, len);
++      len -= strlen(title);
++      snprintf(&title[strlen(title)], len, " (%s)", umid);
++      return(title);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/mcast.h um/arch/um/drivers/mcast.h
+--- orig/arch/um/drivers/mcast.h       Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/mcast.h Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,30 @@
++/* 
++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "net_user.h"
++
++struct mcast_data {
++      char *addr;
++      unsigned short port;
++      void *mcast_addr;
++      int ttl;
++      void *dev;
++};
++
++extern struct net_user_info mcast_user_info;
++
++extern int mcast_user_write(int fd, void *buf, int len, 
++                          struct mcast_data *pri);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/mcast_kern.c um/arch/um/drivers/mcast_kern.c
+--- orig/arch/um/drivers/mcast_kern.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/mcast_kern.c    Sun Dec 15 20:58:55 2002
+@@ -0,0 +1,145 @@
++/*
++ * user-mode-linux networking multicast transport
++ * Copyright (C) 2001 by Harald Welte <laforge@gnumonks.org>
++ *
++ * based on the existing uml-networking code, which is
++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and 
++ * James Leu (jleu@mindspring.net).
++ * Copyright (C) 2001 by various other people who didn't put their name here.
++ *
++ * Licensed under the GPL.
++ */
++
++#include "linux/kernel.h"
++#include "linux/init.h"
++#include "linux/netdevice.h"
++#include "linux/etherdevice.h"
++#include "linux/in.h"
++#include "linux/inet.h"
++#include "net_kern.h"
++#include "net_user.h"
++#include "mcast.h"
++
++struct mcast_init {
++      char *addr;
++      int port;
++      int ttl;
++};
++
++void mcast_init(struct net_device *dev, void *data)
++{
++      struct uml_net_private *pri;
++      struct mcast_data *dpri;
++      struct mcast_init *init = data;
++
++      init_etherdev(dev, 0);
++      pri = dev->priv;
++      dpri = (struct mcast_data *) pri->user;
++      *dpri = ((struct mcast_data)
++              { .addr         = init->addr,
++                .port         = init->port,
++                .ttl          = init->ttl,
++                .mcast_addr   = NULL,
++                .dev          = dev });
++      printk("mcast backend ");
++      printk("multicast address: %s:%u, TTL:%u ",
++             dpri->addr, dpri->port, dpri->ttl);
++
++      printk("\n");
++}
++
++static int mcast_read(int fd, struct sk_buff **skb, struct uml_net_private *lp)
++{
++      *skb = ether_adjust_skb(*skb, ETH_HEADER_OTHER);
++      if(*skb == NULL) return(-ENOMEM);
++      return(net_recvfrom(fd, (*skb)->mac.raw, 
++                          (*skb)->dev->mtu + ETH_HEADER_OTHER));
++}
++
++static int mcast_write(int fd, struct sk_buff **skb,
++                      struct uml_net_private *lp)
++{
++      return mcast_user_write(fd, (*skb)->data, (*skb)->len, 
++                               (struct mcast_data *) &lp->user);
++}
++
++static struct net_kern_info mcast_kern_info = {
++      .init                   = mcast_init,
++      .protocol               = eth_protocol,
++      .read                   = mcast_read,
++      .write                  = mcast_write,
++};
++
++int mcast_setup(char *str, char **mac_out, void *data)
++{
++      struct mcast_init *init = data;
++      char *port_str = NULL, *ttl_str = NULL, *remain;
++      char *last;
++      int n;
++
++      *init = ((struct mcast_init)
++              { .addr         = "239.192.168.1",
++                .port         = htons(1102),
++                .ttl          = 1 });
++
++      remain = split_if_spec(str, mac_out, &init->addr, &port_str, &ttl_str,
++                             NULL);
++      if(remain != NULL){
++              printk(KERN_ERR "mcast_setup - Extra garbage on "
++                     "specification : '%s'\n", remain);
++              return(0);
++      }
++      
++      if(port_str != NULL){
++              n = simple_strtoul(port_str, &last, 10);
++              if((*last != '\0') || (last == port_str)){
++                      printk(KERN_ERR "mcast_setup - Bad port : '%s'\n", 
++                             port_str);
++                      return(0);
++              }
++              init->port = htons(n);
++      }
++
++      if(ttl_str != NULL){
++              init->ttl = simple_strtoul(ttl_str, &last, 10);
++              if((*last != '\0') || (last == ttl_str)){
++                      printk(KERN_ERR "mcast_setup - Bad ttl : '%s'\n", 
++                             ttl_str);
++                      return(0);
++              }
++      }
++
++      printk(KERN_INFO "Configured mcast device: %s:%u-%u\n", init->addr,
++             ntohs(init->port), init->ttl);
++
++      return(1);
++}
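++
++/*
++ * Illustrative command-line examples (assuming split_if_spec(), which is not
++ * shown in this file, splits the option string on commas in the order used in
++ * the call above: mac, address, port, ttl):
++ *
++ *   eth0=mcast                         all defaults: 239.192.168.1:1102, TTL 1
++ *   eth0=mcast,,239.192.168.2,1103,3   empty MAC field, explicit address,
++ *                                      port and TTL
++ *
++ * Any trailing fields are rejected as "extra garbage" above.
++ */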
++
++static struct transport mcast_transport = {
++      .list           = LIST_HEAD_INIT(mcast_transport.list),
++      .name           = "mcast",
++      .setup          = mcast_setup,
++      .user           = &mcast_user_info,
++      .kern           = &mcast_kern_info,
++      .private_size   = sizeof(struct mcast_data),
++      .setup_size     = sizeof(struct mcast_init),
++};
++
++static int register_mcast(void)
++{
++      register_transport(&mcast_transport);
++      return(1);
++}
++
++__initcall(register_mcast);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/mcast_user.c um/arch/um/drivers/mcast_user.c
+--- orig/arch/um/drivers/mcast_user.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/mcast_user.c    Sun Dec 15 21:19:16 2002
+@@ -0,0 +1,175 @@
++/*
++ * user-mode-linux networking multicast transport
++ * Copyright (C) 2001 by Harald Welte <laforge@gnumonks.org>
++ *
++ * based on the existing uml-networking code, which is
++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and 
++ * James Leu (jleu@mindspring.net).
++ * Copyright (C) 2001 by various other people who didn't put their name here.
++ *
++ * Licensed under the GPL.
++ *
++ */
++
++#include <errno.h>
++#include <unistd.h>
++#include <linux/inet.h>
++#include <sys/socket.h>
++#include <sys/un.h>
++#include <sys/time.h>
++#include <netinet/in.h>
++#include "net_user.h"
++#include "mcast.h"
++#include "kern_util.h"
++#include "user_util.h"
++#include "user.h"
++
++#define MAX_PACKET (ETH_MAX_PACKET + ETH_HEADER_OTHER)
++
++static struct sockaddr_in *new_addr(char *addr, unsigned short port)
++{
++      struct sockaddr_in *sin;
++
++      sin = um_kmalloc(sizeof(struct sockaddr_in));
++      if(sin == NULL){
++              printk("new_addr: allocation of sockaddr_in failed\n");
++              return(NULL);
++      }
++      sin->sin_family = AF_INET;
++      sin->sin_addr.s_addr = in_aton(addr);
++      sin->sin_port = port;
++      return(sin);
++}
++
++static void mcast_user_init(void *data, void *dev)
++{
++      struct mcast_data *pri = data;
++
++      pri->mcast_addr = new_addr(pri->addr, pri->port);
++      pri->dev = dev;
++}
++
++static int mcast_open(void *data)
++{
++      struct mcast_data *pri = data;
++      struct sockaddr_in *sin = pri->mcast_addr;
++      struct ip_mreq mreq;
++      int fd, yes = 1;
++
++
++      if ((sin->sin_addr.s_addr == 0) || (sin->sin_port == 0)) {
++              fd = -EINVAL;
++              goto out;
++      }
++
++      if ((fd = socket(AF_INET, SOCK_DGRAM, 0)) < 0){
++              printk("mcast_open : data socket failed, errno = %d\n", 
++                     errno);
++              fd = -ENOMEM;
++              goto out;
++      }
++
++      if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) {
++              printk("mcast_open: SO_REUSEADDR failed, errno = %d\n",
++                      errno);
++              close(fd);
++              fd = -EINVAL;
++              goto out;
++      }
++
++      /* set ttl according to config */
++      if (setsockopt(fd, SOL_IP, IP_MULTICAST_TTL, &pri->ttl,
++                     sizeof(pri->ttl)) < 0) {
++              printk("mcast_open: IP_MULTICAST_TTL failed, error = %d\n",
++                      errno);
++              close(fd);
++              fd = -EINVAL;
++              goto out;
++      }
++
++      /* set LOOP, so data does get fed back to local sockets */
++      if (setsockopt(fd, SOL_IP, IP_MULTICAST_LOOP, &yes, sizeof(yes)) < 0) {
++              printk("mcast_open: IP_MULTICAST_LOOP failed, error = %d\n",
++                      errno);
++              close(fd);
++              fd = -EINVAL;
++              goto out;
++      }
++
++      /* bind socket to mcast address */
++      if (bind(fd, (struct sockaddr *) sin, sizeof(*sin)) < 0) {
++              printk("mcast_open : data bind failed, errno = %d\n", errno);
++              close(fd);
++              fd = -EINVAL;
++              goto out;
++      }               
++      
++      /* subscribe to the multicast group */
++      mreq.imr_multiaddr.s_addr = sin->sin_addr.s_addr;
++      mreq.imr_interface.s_addr = 0;
++      if (setsockopt(fd, SOL_IP, IP_ADD_MEMBERSHIP, 
++                     &mreq, sizeof(mreq)) < 0) {
++              printk("mcast_open: IP_ADD_MEMBERSHIP failed, error = %d\n",
++                      errno);
++              printk("There appears not to be a multicast-capable network "
++                     "interface on the host.\n");
++              printk("eth0 should be configured in order to use the "
++                     "multicast transport.\n");
++              close(fd);
++              fd = -EINVAL;
++      }
++
++ out:
++      return(fd);
++}
++
++static void mcast_close(int fd, void *data)
++{
++      struct ip_mreq mreq;
++      struct mcast_data *pri = data;
++      struct sockaddr_in *sin = pri->mcast_addr;
++
++      mreq.imr_multiaddr.s_addr = sin->sin_addr.s_addr;
++      mreq.imr_interface.s_addr = 0;
++      if (setsockopt(fd, SOL_IP, IP_DROP_MEMBERSHIP,
++                     &mreq, sizeof(mreq)) < 0) {
++              printk("mcast_open: IP_DROP_MEMBERSHIP failed, error = %d\n",
++                      errno);
++      }
++
++      close(fd);
++}
++
++int mcast_user_write(int fd, void *buf, int len, struct mcast_data *pri)
++{
++      struct sockaddr_in *data_addr = pri->mcast_addr;
++
++      return(net_sendto(fd, buf, len, data_addr, sizeof(*data_addr)));
++}
++
++static int mcast_set_mtu(int mtu, void *data)
++{
++      return(mtu);
++}
++
++struct net_user_info mcast_user_info = {
++      .init           = mcast_user_init,
++      .open           = mcast_open,
++      .close          = mcast_close,
++      .remove         = NULL,
++      .set_mtu        = mcast_set_mtu,
++      .add_address    = NULL,
++      .delete_address = NULL,
++      .max_packet     = MAX_PACKET - ETH_HEADER_OTHER
++};
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/mconsole_kern.c um/arch/um/drivers/mconsole_kern.c
+--- orig/arch/um/drivers/mconsole_kern.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/mconsole_kern.c Fri Mar 28 21:58:11 2003
+@@ -0,0 +1,453 @@
++/*
++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org)
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/kernel.h"
++#include "linux/slab.h"
++#include "linux/init.h"
++#include "linux/notifier.h"
++#include "linux/reboot.h"
++#include "linux/utsname.h"
++#include "linux/ctype.h"
++#include "linux/interrupt.h"
++#include "linux/sysrq.h"
++#include "linux/tqueue.h"
++#include "linux/module.h"
++#include "linux/proc_fs.h"
++#include "asm/irq.h"
++#include "asm/uaccess.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "kern.h"
++#include "mconsole.h"
++#include "mconsole_kern.h"
++#include "irq_user.h"
++#include "init.h"
++#include "os.h"
++#include "umid.h"
++
++static int do_unlink_socket(struct notifier_block *notifier, 
++                          unsigned long what, void *data)
++{
++      return(mconsole_unlink_socket());
++}
++
++
++static struct notifier_block reboot_notifier = {
++      .notifier_call          = do_unlink_socket,
++      .priority               = 0,
++};
++
++/* Safe without explicit locking for now.  Tasklets provide their own 
++ * locking, and the interrupt handler is safe because it can't interrupt
++ * itself and it can only happen on CPU 0.
++ */
++
++LIST_HEAD(mc_requests);
++
++void mc_task_proc(void *unused)
++{
++      struct mconsole_entry *req;
++      unsigned long flags;
++      int done;
++
++      do {
++              save_flags(flags);
++              req = list_entry(mc_requests.next, struct mconsole_entry, 
++                               list);
++              list_del(&req->list);
++              done = list_empty(&mc_requests);
++              restore_flags(flags);
++              req->request.cmd->handler(&req->request);
++              kfree(req);
++      } while(!done);
++}
++
++struct tq_struct mconsole_task = {
++      .routine        = mc_task_proc,
++      .data           = NULL
++};
++
++void mconsole_interrupt(int irq, void *dev_id, struct pt_regs *regs)
++{
++      int fd;
++      struct mconsole_entry *new;
++      struct mc_request req;
++
++      fd = (int) dev_id;
++      while (mconsole_get_request(fd, &req)){
++              if(req.cmd->as_interrupt) (*req.cmd->handler)(&req);
++              else {
++                      new = kmalloc(sizeof(*new), GFP_ATOMIC);
++                      if(new == NULL)
++                              mconsole_reply(&req, "Out of memory", 1, 0);
++                      else {
++                              new->request = req;
++                              list_add(&new->list, &mc_requests);
++                      }
++              }
++      }
++      if(!list_empty(&mc_requests)) schedule_task(&mconsole_task);
++      reactivate_fd(fd, MCONSOLE_IRQ);
++}
++
++void mconsole_version(struct mc_request *req)
++{
++      char version[256];
++
++      sprintf(version, "%s %s %s %s %s", system_utsname.sysname, 
++              system_utsname.nodename, system_utsname.release, 
++              system_utsname.version, system_utsname.machine);
++      mconsole_reply(req, version, 0, 0);
++}
++
++#define UML_MCONSOLE_HELPTEXT \
++"Commands: \n\
++    version - Get kernel version \n\
++    help - Print this message \n\
++    halt - Halt UML \n\
++    reboot - Reboot UML \n\
++    config <dev>=<config> - Add a new device to UML;  \n\
++      same syntax as command line \n\
++    config <dev> - Query the configuration of a device \n\
++    remove <dev> - Remove a device from UML \n\
++    sysrq <letter> - Performs the SysRq action controlled by the letter \n\
++    cad - invoke the Ctl-Alt-Del handler \n\
++    stop - pause the UML; it will do nothing until it receives a 'go' \n\
++    go - continue the UML after a 'stop' \n\
++"
++
++void mconsole_help(struct mc_request *req)
++{
++      mconsole_reply(req, UML_MCONSOLE_HELPTEXT, 0, 0);
++}
++
++void mconsole_halt(struct mc_request *req)
++{
++      mconsole_reply(req, "", 0, 0);
++      machine_halt();
++}
++
++void mconsole_reboot(struct mc_request *req)
++{
++      mconsole_reply(req, "", 0, 0);
++      machine_restart(NULL);
++}
++
++extern void ctrl_alt_del(void);
++
++void mconsole_cad(struct mc_request *req)
++{
++      mconsole_reply(req, "", 0, 0);
++      ctrl_alt_del();
++}
++
++void mconsole_go(struct mc_request *req)
++{
++      mconsole_reply(req, "Not stopped", 1, 0);
++}
++
++void mconsole_stop(struct mc_request *req)
++{
++      deactivate_fd(req->originating_fd, MCONSOLE_IRQ);
++      os_set_fd_block(req->originating_fd, 1);
++      mconsole_reply(req, "", 0, 0);
++      while(mconsole_get_request(req->originating_fd, req)){
++              if(req->cmd->handler == mconsole_go) break;
++              (*req->cmd->handler)(req);
++      }
++      os_set_fd_block(req->originating_fd, 0);
++      reactivate_fd(req->originating_fd, MCONSOLE_IRQ);
++      mconsole_reply(req, "", 0, 0);
++}
++
++/* This list is populated by __initcall routines. */
++
++LIST_HEAD(mconsole_devices);
++
++void mconsole_register_dev(struct mc_device *new)
++{
++      list_add(&new->list, &mconsole_devices);
++}
++
++static struct mc_device *mconsole_find_dev(char *name)
++{
++      struct list_head *ele;
++      struct mc_device *dev;
++
++      list_for_each(ele, &mconsole_devices){
++              dev = list_entry(ele, struct mc_device, list);
++              if(!strncmp(name, dev->name, strlen(dev->name)))
++                      return(dev);
++      }
++      return(NULL);
++}
++
++#define CONFIG_BUF_SIZE 64
++
++static void mconsole_get_config(int (*get_config)(char *, char *, int, 
++                                                char **),
++                              struct mc_request *req, char *name)
++{
++      char default_buf[CONFIG_BUF_SIZE], *error, *buf;
++      int n, size;
++
++      if(get_config == NULL){
++              mconsole_reply(req, "No get_config routine defined", 1, 0);
++              return;
++      }
++
++      error = NULL;
++      size = sizeof(default_buf)/sizeof(default_buf[0]);
++      buf = default_buf;
++
++      while(1){
++              n = (*get_config)(name, buf, size, &error);
++              if(error != NULL){
++                      mconsole_reply(req, error, 1, 0);
++                      goto out;
++              }
++
++              if(n <= size){
++                      mconsole_reply(req, buf, 0, 0);
++                      goto out;
++              }
++
++              if(buf != default_buf)
++                      kfree(buf);
++
++              size = n;
++              buf = kmalloc(size, GFP_KERNEL);
++              if(buf == NULL){
++                      mconsole_reply(req, "Failed to allocate buffer", 1, 0);
++                      return;
++              }
++      }
++ out:
++      if(buf != default_buf)
++              kfree(buf);
++      
++}
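++
++/*
++ * The contract assumed of a device's get_config hook, as used above: it
++ * writes the configuration of <name> into buf (at most size bytes) and
++ * returns the number of bytes it needed.  A return value larger than size
++ * makes the loop retry with a buffer of that size; setting *error makes the
++ * request fail with that message.  The console line driver elsewhere in this
++ * patch implements this contract with CONFIG_CHUNK()/chan_config_string().
++ */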
++
++void mconsole_config(struct mc_request *req)
++{
++      struct mc_device *dev;
++      char *ptr = req->request.data, *name;
++      int err;
++
++      ptr += strlen("config");
++      while(isspace(*ptr)) ptr++;
++      dev = mconsole_find_dev(ptr);
++      if(dev == NULL){
++              mconsole_reply(req, "Bad configuration option", 1, 0);
++              return;
++      }
++
++      name = &ptr[strlen(dev->name)];
++      ptr = name;
++      while((*ptr != '=') && (*ptr != '\0'))
++              ptr++;
++
++      if(*ptr == '='){
++              err = (*dev->config)(name);
++              mconsole_reply(req, "", err, 0);
++      }
++      else mconsole_get_config(dev->get_config, req, name);
++}
++
++void mconsole_remove(struct mc_request *req)
++{
++      struct mc_device *dev;  
++      char *ptr = req->request.data;
++      int err;
++
++      ptr += strlen("remove");
++      while(isspace(*ptr)) ptr++;
++      dev = mconsole_find_dev(ptr);
++      if(dev == NULL){
++              mconsole_reply(req, "Bad remove option", 1, 0);
++              return;
++      }
++      err = (*dev->remove)(&ptr[strlen(dev->name)]);
++      mconsole_reply(req, "", err, 0);
++}
++
++#ifdef CONFIG_MAGIC_SYSRQ
++void mconsole_sysrq(struct mc_request *req)
++{
++      char *ptr = req->request.data;
++
++      ptr += strlen("sysrq");
++      while(isspace(*ptr)) ptr++;
++
++      handle_sysrq(*ptr, &current->thread.regs, NULL, NULL);
++      mconsole_reply(req, "", 0, 0);
++}
++#else
++void mconsole_sysrq(struct mc_request *req)
++{
++      mconsole_reply(req, "Sysrq not compiled in", 1, 0);
++}
++#endif
++
++/* Changed by mconsole_setup, which is __setup, and called before SMP is
++ * active.
++ */
++static char *notify_socket = NULL; 
++
++int mconsole_init(void)
++{
++      int err, sock;
++      char file[256];
++
++      if(umid_file_name("mconsole", file, sizeof(file))) return(-1);
++      snprintf(mconsole_socket_name, sizeof(file), "%s", file);
++
++      sock = create_unix_socket(file, sizeof(file));
++      if (sock < 0){
++              printk("Failed to initialize management console\n");
++              return(1);
++      }
++
++      register_reboot_notifier(&reboot_notifier);
++
++      err = um_request_irq(MCONSOLE_IRQ, sock, IRQ_READ, mconsole_interrupt,
++                           SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM,
++                           "mconsole", (void *)sock);
++      if (err){
++              printk("Failed to get IRQ for management console\n");
++              return(1);
++      }
++
++      if(notify_socket != NULL){
++              notify_socket = uml_strdup(notify_socket);
++              if(notify_socket != NULL)
++                      mconsole_notify(notify_socket, MCONSOLE_SOCKET,
++                                      mconsole_socket_name, 
++                                      strlen(mconsole_socket_name) + 1);
++              else printk(KERN_ERR "mconsole_setup failed to strdup "
++                          "string\n");
++      }
++
++      printk("mconsole (version %d) initialized on %s\n", 
++             MCONSOLE_VERSION, mconsole_socket_name);
++      return(0);
++}
++
++__initcall(mconsole_init);
++
++static int write_proc_mconsole(struct file *file, const char *buffer,
++                             unsigned long count, void *data)
++{
++      char *buf;
++
++      buf = kmalloc(count + 1, GFP_KERNEL);
++      if(buf == NULL) 
++              return(-ENOMEM);
++
++      if(copy_from_user(buf, buffer, count)){
++              kfree(buf);
++              return(-EFAULT);
++      }
++      buf[count] = '\0';
++
++      mconsole_notify(notify_socket, MCONSOLE_USER_NOTIFY, buf, count);
++      kfree(buf);
++      return(count);
++}
++
++static int create_proc_mconsole(void)
++{
++      struct proc_dir_entry *ent;
++
++      if(notify_socket == NULL) return(0);
++
++      ent = create_proc_entry("mconsole", S_IFREG | 0200, NULL);
++      if(ent == NULL){
++              printk("create_proc_mconsole : create_proc_entry failed\n");
++              return(0);
++      }
++
++      ent->read_proc = NULL;
++      ent->write_proc = write_proc_mconsole;
++      return(0);
++}
++
++static spinlock_t notify_spinlock = SPIN_LOCK_UNLOCKED;
++
++void lock_notify(void)
++{
++      spin_lock(&notify_spinlock);
++}
++
++void unlock_notify(void)
++{
++      spin_unlock(&notify_spinlock);
++}
++
++__initcall(create_proc_mconsole);
++
++#define NOTIFY "=notify:"
++
++static int mconsole_setup(char *str)
++{
++      if(!strncmp(str, NOTIFY, strlen(NOTIFY))){
++              str += strlen(NOTIFY);
++              notify_socket = str;
++      }
++      else printk(KERN_ERR "mconsole_setup : Unknown option - '%s'\n", str);
++      return(1);
++}
++
++__setup("mconsole", mconsole_setup);
++
++__uml_help(mconsole_setup,
++"mconsole=notify:<socket>\n"
++"    Requests that the mconsole driver send a message to the named Unix\n"
++"    socket containing the name of the mconsole socket.  This also serves\n"
++"    to notify outside processes when UML has booted far enough to respond\n"
++"    to mconsole requests.\n\n"
++);
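++
++/*
++ * Example (socket path illustrative): booting with
++ *     mconsole=notify:/tmp/uml-notify
++ * makes mconsole_init() send an MCONSOLE_SOCKET notification containing the
++ * mconsole socket name to /tmp/uml-notify once the driver is initialized.
++ */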
++
++static int notify_panic(struct notifier_block *self, unsigned long unused1,
++                      void *ptr)
++{
++      char *message = ptr;
++
++      if(notify_socket == NULL) return(0);
++
++      mconsole_notify(notify_socket, MCONSOLE_PANIC, message, 
++                      strlen(message) + 1);
++      return(0);
++}
++
++static struct notifier_block panic_exit_notifier = {
++      .notifier_call          = notify_panic,
++      .next                   = NULL,
++      .priority               = 1
++};
++
++static int add_notifier(void)
++{
++      notifier_chain_register(&panic_notifier_list, &panic_exit_notifier);
++      return(0);
++}
++
++__initcall(add_notifier);
++
++char *mconsole_notify_socket(void)
++{
++      return(notify_socket);
++}
++
++EXPORT_SYMBOL(mconsole_notify_socket);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/mconsole_user.c um/arch/um/drivers/mconsole_user.c
+--- orig/arch/um/drivers/mconsole_user.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/mconsole_user.c Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,212 @@
++/*
++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org)
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <errno.h>
++#include <signal.h>
++#include <sys/socket.h>
++#include <sys/types.h>
++#include <sys/uio.h>
++#include <sys/un.h>
++#include <unistd.h>
++#include "user.h"
++#include "mconsole.h"
++#include "umid.h"
++
++static struct mconsole_command commands[] = {
++      { "version", mconsole_version, 1 },
++      { "halt", mconsole_halt, 0 },
++      { "reboot", mconsole_reboot, 0 },
++      { "config", mconsole_config, 0 },
++      { "remove", mconsole_remove, 0 },
++      { "sysrq", mconsole_sysrq, 1 },
++      { "help", mconsole_help, 1 },
++      { "cad", mconsole_cad, 1 },
++      { "stop", mconsole_stop, 0 },
++      { "go", mconsole_go, 1 },
++};
++
++/* Initialized in mconsole_init, which is an initcall */
++char mconsole_socket_name[256];
++
++int mconsole_reply_v0(struct mc_request *req, char *reply)
++{
++        struct iovec iov;
++        struct msghdr msg;
++
++        iov.iov_base = reply;
++        iov.iov_len = strlen(reply);
++
++        msg.msg_name = &(req->origin);
++        msg.msg_namelen = req->originlen;
++        msg.msg_iov = &iov;
++        msg.msg_iovlen = 1;
++        msg.msg_control = NULL;
++        msg.msg_controllen = 0;
++        msg.msg_flags = 0;
++
++        return sendmsg(req->originating_fd, &msg, 0);
++}
++
++static struct mconsole_command *mconsole_parse(struct mc_request *req)
++{
++      struct mconsole_command *cmd;
++      int i;
++
++      for(i=0;i<sizeof(commands)/sizeof(commands[0]);i++){
++              cmd = &commands[i];
++              if(!strncmp(req->request.data, cmd->command, 
++                          strlen(cmd->command))){
++                      return(cmd);
++              }
++      }
++      return(NULL);
++}
++
++#define MIN(a,b) ((a)<(b) ? (a):(b))
++
++#define STRINGX(x) #x
++#define STRING(x) STRINGX(x)
++
++int mconsole_get_request(int fd, struct mc_request *req)
++{
++      int len;
++
++      req->originlen = sizeof(req->origin);
++      req->len = recvfrom(fd, &req->request, sizeof(req->request), 0,
++                          (struct sockaddr *) req->origin, &req->originlen);
++      if (req->len < 0)
++              return 0;
++
++      req->originating_fd = fd;
++
++      if(req->request.magic != MCONSOLE_MAGIC){
++              /* Unversioned request */
++              len = MIN(sizeof(req->request.data) - 1, 
++                        strlen((char *) &req->request));
++              memmove(req->request.data, &req->request, len);
++              req->request.data[len] = '\0';
++
++              req->request.magic = MCONSOLE_MAGIC;
++              req->request.version = 0;
++              req->request.len = len;
++
++              mconsole_reply_v0(req, "ERR Version 0 mconsole clients are "
++                                "not supported by this driver");
++              return(0);
++      }
++
++      if(req->request.len >= MCONSOLE_MAX_DATA){
++              mconsole_reply(req, "Request too large", 1, 0);
++              return(0);
++      }
++      if(req->request.version != MCONSOLE_VERSION){
++              mconsole_reply(req, "This driver only supports version " 
++                               STRING(MCONSOLE_VERSION) " clients", 1, 0);
++      }
++      
++      req->request.data[req->request.len] = '\0';
++      req->cmd = mconsole_parse(req);
++      if(req->cmd == NULL){
++              mconsole_reply(req, "Unknown command", 1, 0);
++              return(0);
++      }
++
++      return(1);
++}
++
++int mconsole_reply(struct mc_request *req, char *str, int err, int more)
++{
++      struct mconsole_reply reply;
++      int total, len, n;
++
++      total = strlen(str);
++      do {
++              reply.err = err;
++
++              /* err can only be true on the first packet */
++              err = 0;
++
++              len = MIN(total, MCONSOLE_MAX_DATA - 1);
++
++              if(len == total) reply.more = more;
++              else reply.more = 1;
++
++              memcpy(reply.data, str, len);
++              reply.data[len] = '\0';
++              total -= len;
++              reply.len = len + 1;
++
++              len = sizeof(reply) + reply.len - sizeof(reply.data);
++
++              n = sendto(req->originating_fd, &reply, len, 0,
++                         (struct sockaddr *) req->origin, req->originlen);
++
++              if(n < 0) return(-errno);
++      } while(total > 0);
++      return(0);
++}
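++
++/*
++ * Client-side view of the reply format produced above (a sketch, not part of
++ * this driver): a long reply arrives as several datagrams, each carrying up
++ * to MCONSOLE_MAX_DATA - 1 bytes of the string.  A reader keeps appending
++ * reply.data and stops once a packet arrives with reply.more == 0; reply.err
++ * is meaningful only in the first packet.
++ */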
++
++int mconsole_unlink_socket(void)
++{
++      unlink(mconsole_socket_name);
++      return 0;
++}
++
++static int notify_sock = -1;
++
++int mconsole_notify(char *sock_name, int type, const void *data, int len)
++{
++      struct sockaddr_un target;
++      struct mconsole_notify packet;
++      int n, err = 0;
++
++      lock_notify();
++      if(notify_sock < 0){
++              notify_sock = socket(PF_UNIX, SOCK_DGRAM, 0);
++              if(notify_sock < 0){
++                      printk("mconsole_notify - socket failed, errno = %d\n",
++                             errno);
++                      err = -errno;
++              }
++      }
++      unlock_notify();
++      
++      if(err)
++              return(err);
++
++      target.sun_family = AF_UNIX;
++      strcpy(target.sun_path, sock_name);
++
++      packet.magic = MCONSOLE_MAGIC;
++      packet.version = MCONSOLE_VERSION;
++      packet.type = type;
++      len = (len > sizeof(packet.data)) ? sizeof(packet.data) : len;
++      packet.len = len;
++      memcpy(packet.data, data, len);
++
++      err = 0;
++      len = sizeof(packet) + packet.len - sizeof(packet.data);
++      n = sendto(notify_sock, &packet, len, 0, (struct sockaddr *) &target, 
++                 sizeof(target));
++      if(n < 0){
++              printk("mconsole_notify - sendto failed, errno = %d\n", errno);
++              err = -errno;
++      }
++      return(err);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/mmapper_kern.c um/arch/um/drivers/mmapper_kern.c
+--- orig/arch/um/drivers/mmapper_kern.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/mmapper_kern.c  Sun Dec 15 21:03:08 2002
+@@ -0,0 +1,148 @@
++/*
++ * arch/um/drivers/mmapper_kern.c
++ *
++ * BRIEF MODULE DESCRIPTION
++ *
++ * Copyright (C) 2000 RidgeRun, Inc.
++ * Author: RidgeRun, Inc.
++ *         Greg Lonnon glonnon@ridgerun.com or info@ridgerun.com
++ *
++ */
++#include <linux/kdev_t.h>
++#include <linux/time.h>
++#include <linux/devfs_fs_kernel.h>
++#include <linux/module.h>
++#include <linux/mm.h> 
++#include <linux/slab.h>
++#include <linux/init.h> 
++#include <asm/uaccess.h>
++#include <asm/irq.h>
++#include <asm/smplock.h>
++#include <asm/pgtable.h>
++#include "mem_user.h"
++#include "user_util.h"
++ 
++/* These are set in mmapper_init, which is called at boot time */
++static unsigned long mmapper_size;
++static unsigned long p_buf = 0;
++static char *v_buf = NULL;
++
++static ssize_t
++mmapper_read(struct file *file, char *buf, size_t count, loff_t *ppos)
++{
++      if(*ppos > mmapper_size)
++              return -EINVAL;
++
++      /* Clamp the request to what is left of the buffer */
++      if(count + *ppos > mmapper_size)
++              count = mmapper_size - *ppos;
++
++      if(copy_to_user(buf, &v_buf[*ppos], count))
++              return -EFAULT;
++
++      return count;
++}
++
++static ssize_t
++mmapper_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
++{
++      if(*ppos > mmapper_size)
++              return -EINVAL;
++
++      /* Clamp the request to what is left of the buffer */
++      if(count + *ppos > mmapper_size)
++              count = mmapper_size - *ppos;
++
++      if(copy_from_user(&v_buf[*ppos], buf, count))
++              return -EFAULT;
++
++      return count;
++}
++
++static int 
++mmapper_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
++       unsigned long arg)
++{
++      return(-ENOIOCTLCMD);
++}
++
++static int 
++mmapper_mmap(struct file *file, struct vm_area_struct * vma)
++{
++      int ret = -EINVAL;
++      int size;
++
++      lock_kernel();
++      if (vma->vm_pgoff != 0)
++              goto out;
++      
++      size = vma->vm_end - vma->vm_start;
++      if(size > mmapper_size) return(-EFAULT);
++
++      /* XXX A comment above remap_page_range says it should only be
++       * called when the mm semaphore is held
++       */
++      if (remap_page_range(vma->vm_start, p_buf, size, vma->vm_page_prot))
++              goto out;
++      ret = 0;
++out:
++      unlock_kernel();
++      return ret;
++}
++
++static int
++mmapper_open(struct inode *inode, struct file *file)
++{
++      return 0;
++}
++
++static int 
++mmapper_release(struct inode *inode, struct file *file)
++{
++      return 0;
++}
++
++static struct file_operations mmapper_fops = {
++      .owner          = THIS_MODULE,
++      .read           = mmapper_read,
++      .write          = mmapper_write,
++      .ioctl          = mmapper_ioctl,
++      .mmap           = mmapper_mmap,
++      .open           = mmapper_open,
++      .release        = mmapper_release,
++};
++
++static int __init mmapper_init(void)
++{
++      printk(KERN_INFO "Mapper v0.1\n");
++
++      v_buf = (char *) find_iomem("mmapper", &mmapper_size);
++      if(mmapper_size == 0) return(0);
++
++      p_buf = __pa(v_buf);
++
++      devfs_register (NULL, "mmapper", DEVFS_FL_DEFAULT, 
++                      30, 0, S_IFCHR | S_IRUGO | S_IWUGO, 
++                      &mmapper_fops, NULL); 
++      devfs_mk_symlink(NULL, "mmapper0", DEVFS_FL_DEFAULT, "mmapper",
++                       NULL, NULL);
++      return(0);
++}
++
++static void mmapper_exit(void)
++{
++}
++
++module_init(mmapper_init);
++module_exit(mmapper_exit);
++
++MODULE_AUTHOR("Greg Lonnon <glonnon@ridgerun.com>");
++MODULE_DESCRIPTION("DSPLinux simulator mmapper driver");
++/*
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/net_kern.c um/arch/um/drivers/net_kern.c
+--- orig/arch/um/drivers/net_kern.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/net_kern.c      Sun Dec 15 21:19:16 2002
+@@ -0,0 +1,870 @@
++/*
++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and 
++ * James Leu (jleu@mindspring.net).
++ * Copyright (C) 2001 by various other people who didn't put their name here.
++ * Licensed under the GPL.
++ */
++
++#include "linux/config.h"
++#include "linux/kernel.h"
++#include "linux/netdevice.h"
++#include "linux/rtnetlink.h"
++#include "linux/skbuff.h"
++#include "linux/socket.h"
++#include "linux/spinlock.h"
++#include "linux/module.h"
++#include "linux/init.h"
++#include "linux/etherdevice.h"
++#include "linux/list.h"
++#include "linux/inetdevice.h"
++#include "linux/ctype.h"
++#include "linux/bootmem.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "net_kern.h"
++#include "net_user.h"
++#include "mconsole_kern.h"
++#include "init.h"
++#include "irq_user.h"
++
++static spinlock_t opened_lock = SPIN_LOCK_UNLOCKED;
++LIST_HEAD(opened);
++
++static int uml_net_rx(struct net_device *dev)
++{
++      struct uml_net_private *lp = dev->priv;
++      int pkt_len;
++      struct sk_buff *skb;
++
++      /* If we can't allocate memory, try again next round. */
++      if ((skb = dev_alloc_skb(dev->mtu)) == NULL) {
++              lp->stats.rx_dropped++;
++              return 0;
++      }
++
++      skb->dev = dev;
++      skb_put(skb, dev->mtu);
++      skb->mac.raw = skb->data;
++      pkt_len = (*lp->read)(lp->fd, &skb, lp);
++
++      if (pkt_len > 0) {
++              skb_trim(skb, pkt_len);
++              skb->protocol = (*lp->protocol)(skb);
++              netif_rx(skb);
++
++              lp->stats.rx_bytes += skb->len;
++              lp->stats.rx_packets++;
++              return pkt_len;
++      }
++
++      kfree_skb(skb);
++      return pkt_len;
++}
++
++void uml_net_interrupt(int irq, void *dev_id, struct pt_regs *regs)
++{
++      struct net_device *dev = dev_id;
++      struct uml_net_private *lp = dev->priv;
++      int err;
++
++      if(!netif_running(dev))
++              return;
++
++      spin_lock(&lp->lock);
++      while((err = uml_net_rx(dev)) > 0) ;
++      if(err < 0) {
++              printk(KERN_ERR 
++                     "Device '%s' read returned %d, shutting it down\n", 
++                     dev->name, err);
++              dev_close(dev);
++              goto out;
++      }
++      reactivate_fd(lp->fd, UM_ETH_IRQ);
++
++ out:
++      spin_unlock(&lp->lock);
++}
++
++static int uml_net_open(struct net_device *dev)
++{
++      struct uml_net_private *lp = dev->priv;
++      char addr[sizeof("255.255.255.255\0")];
++      int err;
++
++      spin_lock(&lp->lock);
++
++      if(lp->fd >= 0){
++              err = -ENXIO;
++              goto out;
++      }
++
++      if(!lp->have_mac){
++              dev_ip_addr(dev, addr, &lp->mac[2]);
++              set_ether_mac(dev, lp->mac);
++      }
++
++      lp->fd = (*lp->open)(&lp->user);
++      if(lp->fd < 0){
++              err = lp->fd;
++              goto out;
++      }
++
++      err = um_request_irq(dev->irq, lp->fd, IRQ_READ, uml_net_interrupt,
++                           SA_INTERRUPT | SA_SHIRQ, dev->name, dev);
++      if(err != 0){
++              printk(KERN_ERR "uml_net_open: failed to get irq(%d)\n", err);
++              if(lp->close != NULL) (*lp->close)(lp->fd, &lp->user);
++              lp->fd = -1;
++              err = -ENETUNREACH;
++              goto out;
++      }
++
++      lp->tl.data = (unsigned long) &lp->user;
++      netif_start_queue(dev);
++
++      spin_lock(&opened_lock);
++      list_add(&lp->list, &opened);
++      spin_unlock(&opened_lock);
++      MOD_INC_USE_COUNT;
++ out:
++      spin_unlock(&lp->lock);
++      return(err);
++}
++
++static int uml_net_close(struct net_device *dev)
++{
++      struct uml_net_private *lp = dev->priv;
++      
++      netif_stop_queue(dev);
++      spin_lock(&lp->lock);
++
++      free_irq(dev->irq, dev);
++      if(lp->close != NULL) (*lp->close)(lp->fd, &lp->user);
++      lp->fd = -1;
++      spin_lock(&opened_lock);
++      list_del(&lp->list);
++      spin_unlock(&opened_lock);
++
++      MOD_DEC_USE_COUNT;
++      spin_unlock(&lp->lock);
++      return 0;
++}
++
++static int uml_net_start_xmit(struct sk_buff *skb, struct net_device *dev)
++{
++      struct uml_net_private *lp = dev->priv;
++      unsigned long flags;
++      int len;
++
++      netif_stop_queue(dev);
++
++      spin_lock_irqsave(&lp->lock, flags);
++
++      len = (*lp->write)(lp->fd, &skb, lp);
++
++      if(len == skb->len) {
++              lp->stats.tx_packets++;
++              lp->stats.tx_bytes += skb->len;
++              dev->trans_start = jiffies;
++              netif_start_queue(dev);
++
++              /* this is normally done in the interrupt when tx finishes */
++              netif_wake_queue(dev);
++      } 
++      else if(len == 0){
++              netif_start_queue(dev);
++              lp->stats.tx_dropped++;
++      }
++      else {
++              netif_start_queue(dev);
++              printk(KERN_ERR "uml_net_start_xmit: failed(%d)\n", len);
++      }
++
++      spin_unlock_irqrestore(&lp->lock, flags);
++
++      dev_kfree_skb(skb);
++
++      return 0;
++}
++
++static struct net_device_stats *uml_net_get_stats(struct net_device *dev)
++{
++      struct uml_net_private *lp = dev->priv;
++      return &lp->stats;
++}
++
++static void uml_net_set_multicast_list(struct net_device *dev)
++{
++      if (dev->flags & IFF_PROMISC) return;
++      else if (dev->mc_count) dev->flags |= IFF_ALLMULTI;
++      else dev->flags &= ~IFF_ALLMULTI;
++}
++
++static void uml_net_tx_timeout(struct net_device *dev)
++{
++      dev->trans_start = jiffies;
++      netif_wake_queue(dev);
++}
++
++static int uml_net_set_mac(struct net_device *dev, void *addr)
++{
++      struct uml_net_private *lp = dev->priv;
++      struct sockaddr *hwaddr = addr;
++
++      spin_lock(&lp->lock);
++      memcpy(dev->dev_addr, hwaddr->sa_data, ETH_ALEN);
++      spin_unlock(&lp->lock);
++
++      return(0);
++}
++
++static int uml_net_change_mtu(struct net_device *dev, int new_mtu)
++{
++      struct uml_net_private *lp = dev->priv;
++      int err = 0;
++
++      spin_lock(&lp->lock);
++
++      new_mtu = (*lp->set_mtu)(new_mtu, &lp->user);
++      if(new_mtu < 0){
++              err = new_mtu;
++              goto out;
++      }
++
++      dev->mtu = new_mtu;
++
++ out:
++      spin_unlock(&lp->lock);
++      return err;
++}
++
++static int uml_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
++{
++      return(-EINVAL);
++}
++
++void uml_net_user_timer_expire(unsigned long _conn)
++{
++#ifdef undef
++      struct connection *conn = (struct connection *)_conn;
++
++      dprintk(KERN_INFO "uml_net_user_timer_expire [%p]\n", conn);
++      do_connect(conn);
++#endif
++}
++
++/*
++ * Default do-nothing hard-header packet routines for struct net_device init.
++ * Real Ethernet transports override these with real routines.
++ */
++static int uml_net_hard_header(struct sk_buff *skb, struct net_device *dev,
++                 unsigned short type, void *daddr, void *saddr, unsigned len)
++{
++      return(0); /* no change */
++}
++
++static int uml_net_rebuild_header(struct sk_buff *skb)
++{
++      return(0); /* ignore */ 
++}
++
++static int uml_net_header_cache(struct neighbour *neigh, struct hh_cache *hh)
++{
++      return(-1); /* fail */
++}
++
++static void uml_net_header_cache_update(struct hh_cache *hh,
++                 struct net_device *dev, unsigned char * haddr)
++{
++      /* ignore */
++}
++
++static int uml_net_header_parse(struct sk_buff *skb, unsigned char *haddr)
++{
++      return(0); /* nothing */
++}
++
++static spinlock_t devices_lock = SPIN_LOCK_UNLOCKED;
++static struct list_head devices = LIST_HEAD_INIT(devices);
++
++static int eth_configure(int n, void *init, char *mac,
++                       struct transport *transport)
++{
++      struct uml_net *device;
++      struct net_device *dev;
++      struct uml_net_private *lp;
++      int save, err, size;
++
++      size = transport->private_size + sizeof(struct uml_net_private) + 
++              sizeof(((struct uml_net_private *) 0)->user);
++
++      device = kmalloc(sizeof(*device), GFP_KERNEL);
++      if(device == NULL){
++              printk(KERN_ERR "eth_configure failed to allocate uml_net\n");
++              return(1);
++      }
++
++      *device = ((struct uml_net) { .list     = LIST_HEAD_INIT(device->list),
++                                    .dev      = NULL,
++                                    .index    = n,
++                                    .mac      = { [ 0 ... 5 ] = 0 },
++                                    .have_mac = 0 });
++
++      spin_lock(&devices_lock);
++      list_add(&device->list, &devices);
++      spin_unlock(&devices_lock);
++
++      if(setup_etheraddr(mac, device->mac))
++              device->have_mac = 1;
++
++      printk(KERN_INFO "Netdevice %d ", n);
++      if(device->have_mac) printk("(%02x:%02x:%02x:%02x:%02x:%02x) ",
++                                  device->mac[0], device->mac[1], 
++                                  device->mac[2], device->mac[3], 
++                                  device->mac[4], device->mac[5]);
++      printk(": ");
++      dev = kmalloc(sizeof(*dev) + size, GFP_KERNEL);
++      if(dev == NULL){
++              printk(KERN_ERR "eth_configure: failed to allocate device\n");
++              return(1);
++      }
++      memset(dev, 0, sizeof(*dev) + size);
++
++      snprintf(dev->name, sizeof(dev->name), "eth%d", n);
++      dev->priv = (void *) &dev[1];
++      device->dev = dev;
++
++        dev->hard_header = uml_net_hard_header;
++        dev->rebuild_header = uml_net_rebuild_header;
++        dev->hard_header_cache = uml_net_header_cache;
++        dev->header_cache_update= uml_net_header_cache_update;
++        dev->hard_header_parse = uml_net_header_parse;
++
++      (*transport->kern->init)(dev, init);
++
++      dev->mtu = transport->user->max_packet;
++      dev->open = uml_net_open;
++      dev->hard_start_xmit = uml_net_start_xmit;
++      dev->stop = uml_net_close;
++      dev->get_stats = uml_net_get_stats;
++      dev->set_multicast_list = uml_net_set_multicast_list;
++      dev->tx_timeout = uml_net_tx_timeout;
++      dev->set_mac_address = uml_net_set_mac;
++      dev->change_mtu = uml_net_change_mtu;
++      dev->do_ioctl = uml_net_ioctl;
++      dev->watchdog_timeo = (HZ >> 1);
++      dev->irq = UM_ETH_IRQ;
++
++      rtnl_lock();
++      err = register_netdevice(dev);
++      rtnl_unlock();
++      if(err)
++              return(1);
++      lp = dev->priv;
++
++      /* lp.user is the first four bytes of the transport data, which
++       * has already been initialized.  This structure assignment will
++       * overwrite that, so we make sure that .user gets overwritten with
++       * what it already has.
++       */
++      save = lp->user[0];
++      *lp = ((struct uml_net_private) 
++              { .list                 = LIST_HEAD_INIT(lp->list),
++                .lock                 = SPIN_LOCK_UNLOCKED,
++                .dev                  = dev,
++                .fd                   = -1,
++                .mac                  = { 0xfe, 0xfd, 0x0, 0x0, 0x0, 0x0},
++                .have_mac             = device->have_mac,
++                .protocol             = transport->kern->protocol,
++                .open                 = transport->user->open,
++                .close                = transport->user->close,
++                .remove               = transport->user->remove,
++                .read                 = transport->kern->read,
++                .write                = transport->kern->write,
++                .add_address          = transport->user->add_address,
++                .delete_address       = transport->user->delete_address,
++                .set_mtu              = transport->user->set_mtu,
++                .user                 = { save } });
++      init_timer(&lp->tl);
++      lp->tl.function = uml_net_user_timer_expire;
++      memset(&lp->stats, 0, sizeof(lp->stats));
++      if(lp->have_mac) memcpy(lp->mac, device->mac, sizeof(lp->mac));
++
++      if(transport->user->init) 
++              (*transport->user->init)(&lp->user, dev);
++
++      if(device->have_mac)
++              set_ether_mac(dev, device->mac);
++      return(0);
++}
++
++static struct uml_net *find_device(int n)
++{
++      struct uml_net *device;
++      struct list_head *ele;
++
++      spin_lock(&devices_lock);
++      list_for_each(ele, &devices){
++              device = list_entry(ele, struct uml_net, list);
++              if(device->index == n)
++                      goto out;
++      }
++      device = NULL;
++ out:
++      spin_unlock(&devices_lock);
++      return(device);
++}
++
++static int eth_parse(char *str, int *index_out, char **str_out)
++{
++      char *end;
++      int n;
++
++      n = simple_strtoul(str, &end, 0);
++      if(end == str){
++              printk(KERN_ERR "eth_setup: Failed to parse '%s'\n", str);
++              return(1);
++      }
++      if(n < 0){
++              printk(KERN_ERR "eth_setup: device %d is negative\n", n);
++              return(1);
++      }
++      str = end;
++      if(*str != '='){
++              printk(KERN_ERR 
++                     "eth_setup: expected '=' after device number\n");
++              return(1);
++      }
++      str++;
++      if(find_device(n)){
++              printk(KERN_ERR "eth_setup: Device %d already configured\n",
++                     n);
++              return(1);
++      }
++      if(index_out) *index_out = n;
++      *str_out = str;
++      return(0);
++}
++
++struct eth_init {
++      struct list_head list;
++      char *init;
++      int index;
++};
++
++/* Filled in at boot time.  Will need locking if the transports become
++ * modular.
++ */
++struct list_head transports = LIST_HEAD_INIT(transports);
++
++/* Filled in during early boot */
++struct list_head eth_cmd_line = LIST_HEAD_INIT(eth_cmd_line);
++
++static int check_transport(struct transport *transport, char *eth, int n,
++                         void **init_out, char **mac_out)
++{
++      int len;
++
++      len = strlen(transport->name);
++      if(strncmp(eth, transport->name, len))
++              return(0);
++
++      eth += len;
++      if(*eth == ',')
++              eth++;
++      else if(*eth != '\0')
++              return(0);
++
++      *init_out = kmalloc(transport->setup_size, GFP_KERNEL);
++      if(*init_out == NULL)
++              return(1);
++
++      if(!transport->setup(eth, mac_out, *init_out)){
++              kfree(*init_out);
++              *init_out = NULL;
++      }
++      return(1);
++}
++
++void register_transport(struct transport *new)
++{
++      struct list_head *ele, *next;
++      struct eth_init *eth;
++      void *init;
++      char *mac = NULL;
++      int match;
++
++      list_add(&new->list, &transports);
++
++      list_for_each_safe(ele, next, &eth_cmd_line){
++              eth = list_entry(ele, struct eth_init, list);
++              match = check_transport(new, eth->init, eth->index, &init,
++                                      &mac);
++              if(!match)
++                      continue;
++              else if(init != NULL){
++                      eth_configure(eth->index, init, mac, new);
++                      kfree(init);
++              }
++              list_del(&eth->list);
++      }
++}
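++
++/*
++ * How a device such as "eth0=mcast,..." comes up at boot, as implemented in
++ * this file: eth_setup() (a __setup handler) only queues an eth_init entry on
++ * eth_cmd_line; each transport's __initcall (register_mcast() in mcast_kern.c,
++ * for example) then calls register_transport(), which walks eth_cmd_line and
++ * configures every queued entry that names it; eth_init() finally retries
++ * whatever is still left on the list.
++ */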
++
++static int eth_setup_common(char *str, int index)
++{
++      struct list_head *ele;
++      struct transport *transport;
++      void *init;
++      char *mac = NULL;
++
++      list_for_each(ele, &transports){
++              transport = list_entry(ele, struct transport, list);
++              if(!check_transport(transport, str, index, &init, &mac))
++                      continue;
++              if(init != NULL){
++                      eth_configure(index, init, mac, transport);
++                      kfree(init);
++              }
++              return(1);
++      }
++      return(0);
++}
++
++static int eth_setup(char *str)
++{
++      struct eth_init *new;
++      int n, err;
++
++      err = eth_parse(str, &n, &str);
++      if(err) return(1);
++
++      new = alloc_bootmem(sizeof(*new));
++      if(new == NULL){
++              printk("eth_init : alloc_bootmem failed\n");
++              return(1);
++      }
++      *new = ((struct eth_init) { .list       = LIST_HEAD_INIT(new->list),
++                                  .index      = n,
++                                  .init       = str });
++      list_add_tail(&new->list, &eth_cmd_line);
++      return(1);
++}
++
++__setup("eth", eth_setup);
++__uml_help(eth_setup,
++"eth[0-9]+=<transport>,<options>\n"
++"    Configure a network device.\n\n"
++);
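++
++/*
++ * Example (transport options are illustrative; see the individual transport
++ * setup routines, e.g. mcast_setup() in mcast_kern.c, for their formats):
++ *
++ *   eth0=mcast         multicast transport with its defaults
++ *
++ * The same specification also works at runtime as an mconsole request,
++ * "config eth0=mcast", which ends up in net_config() below.
++ */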
++
++static int eth_init(void)
++{
++      struct list_head *ele, *next;
++      struct eth_init *eth;
++
++      list_for_each_safe(ele, next, &eth_cmd_line){
++              eth = list_entry(ele, struct eth_init, list);
++
++              if(eth_setup_common(eth->init, eth->index))
++                      list_del(&eth->list);
++      }
++      
++      return(1);
++}
++
++__initcall(eth_init);
++
++static int net_config(char *str)
++{
++      int n, err;
++
++      err = eth_parse(str, &n, &str);
++      if(err) return(err);
++
++      str = uml_strdup(str);
++      if(str == NULL){
++              printk(KERN_ERR "net_config failed to strdup string\n");
++              return(-1);
++      }
++      err = !eth_setup_common(str, n);
++      if(err) 
++              kfree(str);
++      return(err);
++}
++
++static int net_remove(char *str)
++{
++      struct uml_net *device;
++      struct net_device *dev;
++      struct uml_net_private *lp;
++      char *end;
++      int n;
++
++      n = simple_strtoul(str, &end, 0);
++      if((*end != '\0') || (end == str))
++              return(-1);
++
++      device = find_device(n);
++      if(device == NULL)
++              return(0);
++
++      dev = device->dev;
++      lp = dev->priv;
++      if(lp->fd > 0) return(-1);
++      if(lp->remove != NULL) (*lp->remove)(&lp->user);
++      unregister_netdev(dev);
++
++      list_del(&device->list);
++      kfree(device);
++      return(0);
++}
++
++static struct mc_device net_mc = {
++      .name           = "eth",
++      .config         = net_config,
++      .get_config     = NULL,
++      .remove         = net_remove,
++};
++
++static int uml_inetaddr_event(struct notifier_block *this, unsigned long event,
++                            void *ptr)
++{
++      struct in_ifaddr *ifa = ptr;
++      u32 addr = ifa->ifa_address;
++      u32 netmask = ifa->ifa_mask;
++      struct net_device *dev = ifa->ifa_dev->dev;
++      struct uml_net_private *lp;
++      void (*proc)(unsigned char *, unsigned char *, void *);
++      unsigned char addr_buf[4], netmask_buf[4];
++
++      if(dev->open != uml_net_open) return(NOTIFY_DONE);
++
++      lp = dev->priv;
++
++      proc = NULL;
++      switch (event){
++      case NETDEV_UP:
++              proc = lp->add_address;
++              break;
++      case NETDEV_DOWN:
++              proc = lp->delete_address;
++              break;
++      }
++      if(proc != NULL){
++              addr_buf[0] = addr & 0xff;
++              addr_buf[1] = (addr >> 8) & 0xff;
++              addr_buf[2] = (addr >> 16) & 0xff;
++              addr_buf[3] = addr >> 24;
++              netmask_buf[0] = netmask & 0xff;
++              netmask_buf[1] = (netmask >> 8) & 0xff;
++              netmask_buf[2] = (netmask >> 16) & 0xff;
++              netmask_buf[3] = netmask >> 24;
++              (*proc)(addr_buf, netmask_buf, &lp->user);
++      }
++      return(NOTIFY_DONE);
++}
++
++struct notifier_block uml_inetaddr_notifier = {
++      .notifier_call          = uml_inetaddr_event,
++};
++
++static int uml_net_init(void)
++{
++      struct list_head *ele;
++      struct uml_net_private *lp;     
++      struct in_device *ip;
++      struct in_ifaddr *in;
++
++      mconsole_register_dev(&net_mc);
++      register_inetaddr_notifier(&uml_inetaddr_notifier);
++
++      /* Devices may have been opened already, so the uml_inetaddr_notifier
++       * didn't get a chance to run for them.  This fakes it so that
++       * addresses which have already been set up get handled properly.
++       */
++      list_for_each(ele, &opened){
++              lp = list_entry(ele, struct uml_net_private, list);
++              ip = lp->dev->ip_ptr;
++              if(ip == NULL) continue;
++              in = ip->ifa_list;
++              while(in != NULL){
++                      uml_inetaddr_event(NULL, NETDEV_UP, in);
++                      in = in->ifa_next;
++              }
++      }       
++
++      return(0);
++}
++
++__initcall(uml_net_init);
++
++static void close_devices(void)
++{
++      struct list_head *ele;
++      struct uml_net_private *lp;     
++
++      list_for_each(ele, &opened){
++              lp = list_entry(ele, struct uml_net_private, list);
++              if(lp->close != NULL) (*lp->close)(lp->fd, &lp->user);
++              if(lp->remove != NULL) (*lp->remove)(&lp->user);
++      }
++}
++
++__uml_exitcall(close_devices);
++
++int setup_etheraddr(char *str, unsigned char *addr)
++{
++      char *end;
++      int i;
++
++      if(str == NULL)
++              return(0);
++      for(i=0;i<6;i++){
++              addr[i] = simple_strtoul(str, &end, 16);
++              if((end == str) ||
++                 ((*end != ':') && (*end != ',') && (*end != '\0'))){
++                      printk(KERN_ERR 
++                             "setup_etheraddr: failed to parse '%s' "
++                             "as an ethernet address\n", str);
++                      return(0);
++              }
++              str = end + 1;
++      }
++      if(addr[0] & 1){
++              printk(KERN_ERR 
++                     "Attempt to assign a broadcast ethernet address to a "
++                     "device disallowed\n");
++              return(0);
++      }
++      return(1);
++}
++
++void dev_ip_addr(void *d, char *buf, char *bin_buf)
++{
++      struct net_device *dev = d;
++      struct in_device *ip = dev->ip_ptr;
++      struct in_ifaddr *in;
++      u32 addr;
++
++      if((ip == NULL) || ((in = ip->ifa_list) == NULL)){
++              printk(KERN_WARNING "dev_ip_addr - device not assigned an "
++                     "IP address\n");
++              return;
++      }
++      addr = in->ifa_address;
++      sprintf(buf, "%d.%d.%d.%d", addr & 0xff, (addr >> 8) & 0xff, 
++              (addr >> 16) & 0xff, addr >> 24);
++      if(bin_buf){
++              bin_buf[0] = addr & 0xff;
++              bin_buf[1] = (addr >> 8) & 0xff;
++              bin_buf[2] = (addr >> 16) & 0xff;
++              bin_buf[3] = addr >> 24;
++      }
++}
++
++void set_ether_mac(void *d, unsigned char *addr)
++{
++      struct net_device *dev = d;
++
++      memcpy(dev->dev_addr, addr, ETH_ALEN);  
++}
++
++struct sk_buff *ether_adjust_skb(struct sk_buff *skb, int extra)
++{
++      if((skb != NULL) && (skb_tailroom(skb) < extra)){
++              struct sk_buff *skb2;
++
++              skb2 = skb_copy_expand(skb, 0, extra, GFP_ATOMIC);
++              dev_kfree_skb(skb);
++              skb = skb2;
++      }
++      if(skb != NULL) skb_put(skb, extra);
++      return(skb);
++}
++
++void iter_addresses(void *d, void (*cb)(unsigned char *, unsigned char *, 
++                                      void *), 
++                  void *arg)
++{
++      struct net_device *dev = d;
++      struct in_device *ip = dev->ip_ptr;
++      struct in_ifaddr *in;
++      unsigned char address[4], netmask[4];
++
++      if(ip == NULL) return;
++      in = ip->ifa_list;
++      while(in != NULL){
++              address[0] = in->ifa_address & 0xff;
++              address[1] = (in->ifa_address >> 8) & 0xff;
++              address[2] = (in->ifa_address >> 16) & 0xff;
++              address[3] = in->ifa_address >> 24;
++              netmask[0] = in->ifa_mask & 0xff;
++              netmask[1] = (in->ifa_mask >> 8) & 0xff;
++              netmask[2] = (in->ifa_mask >> 16) & 0xff;
++              netmask[3] = in->ifa_mask >> 24;
++              (*cb)(address, netmask, arg);
++              in = in->ifa_next;
++      }
++}
++
++int dev_netmask(void *d, void *m)
++{
++      struct net_device *dev = d;
++      struct in_device *ip = dev->ip_ptr;
++      struct in_ifaddr *in;
++      __u32 *mask_out = m;
++
++      if(ip == NULL) 
++              return(1);
++
++      in = ip->ifa_list;
++      if(in == NULL) 
++              return(1);
++
++      *mask_out = in->ifa_mask;
++      return(0);
++}
++
++void *get_output_buffer(int *len_out)
++{
++      void *ret;
++
++      ret = (void *) __get_free_pages(GFP_KERNEL, 0);
++      if(ret) *len_out = PAGE_SIZE;
++      else *len_out = 0;
++      return(ret);
++}
++
++void free_output_buffer(void *buffer)
++{
++      free_pages((unsigned long) buffer, 0);
++}
++
++int tap_setup_common(char *str, char *type, char **dev_name, char **mac_out, 
++                   char **gate_addr)
++{
++      char *remain;
++
++      remain = split_if_spec(str, dev_name, mac_out, gate_addr, NULL);
++      if(remain != NULL){
++              printk("tap_setup_common - Extra garbage on specification : "
++                     "'%s'\n", remain);
++              return(1);
++      }
++
++      return(0);
++}
++
++unsigned short eth_protocol(struct sk_buff *skb)
++{
++      return(eth_type_trans(skb, skb->dev));
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
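
setup_etheraddr() above accepts a colon- (or comma-) separated list of hex bytes and refuses any address whose first octet has the group bit set, i.e. a multicast/broadcast MAC. A minimal stand-alone sketch of the same parsing rules; parse_mac() is a hypothetical helper written for illustration, not part of the patch:

    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative only: parse "aa:bb:cc:dd:ee:ff" into addr[6] and return 0
     * on success, mirroring the checks in setup_etheraddr(). */
    static int parse_mac(const char *str, unsigned char *addr)
    {
        char *end;
        int i;

        for (i = 0; i < 6; i++) {
            addr[i] = strtoul(str, &end, 16);
            if (end == str ||
                (*end != ':' && *end != ',' && *end != '\0'))
                return -1;
            str = end + 1;
        }
        if (addr[0] & 1)        /* group bit set: multicast/broadcast */
            return -1;
        return 0;
    }

    int main(void)
    {
        unsigned char mac[6];

        if (parse_mac("fe:fd:00:00:00:01", mac) == 0)
            printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
                   mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
        return 0;
    }
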
+diff -Naur -X ../exclude-files orig/arch/um/drivers/net_user.c um/arch/um/drivers/net_user.c
+--- orig/arch/um/drivers/net_user.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/net_user.c      Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,254 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stddef.h>
++#include <stdarg.h>
++#include <unistd.h>
++#include <stdio.h>
++#include <errno.h>
++#include <stdlib.h>
++#include <string.h>
++#include <sys/socket.h>
++#include <sys/wait.h>
++#include "user.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "net_user.h"
++#include "helper.h"
++#include "os.h"
++
++int tap_open_common(void *dev, char *gate_addr)
++{
++      int tap_addr[4];
++
++      if(gate_addr == NULL) return(0);
++      if(sscanf(gate_addr, "%d.%d.%d.%d", &tap_addr[0], 
++                &tap_addr[1], &tap_addr[2], &tap_addr[3]) != 4){
++              printk("Invalid tap IP address - '%s'\n", 
++                     gate_addr);
++              return(-EINVAL);
++      }
++      return(0);
++}
++
++void tap_check_ips(char *gate_addr, char *eth_addr)
++{
++      int tap_addr[4];
++
++      if((gate_addr != NULL) && 
++         (sscanf(gate_addr, "%d.%d.%d.%d", &tap_addr[0], 
++                 &tap_addr[1], &tap_addr[2], &tap_addr[3]) == 4) &&
++         (eth_addr[0] == tap_addr[0]) && 
++         (eth_addr[1] == tap_addr[1]) && 
++         (eth_addr[2] == tap_addr[2]) && 
++         (eth_addr[3] == tap_addr[3])){
++              printk("The tap IP address and the UML eth IP address"
++                     " must be different\n");
++      }
++}
++
++void read_output(int fd, char *output, int len)
++{
++      int remain, n, actual;
++      char c;
++
++      if(output == NULL){
++              output = &c;
++              len = sizeof(c);
++      }
++              
++      *output = '\0';
++      if(read(fd, &remain, sizeof(remain)) != sizeof(remain)){
++              printk("read_output - read of length failed, errno = %d\n",
++                     errno);
++              return;
++      }
++
++      while(remain != 0){
++              n = (remain < len) ? remain : len;
++              actual = read(fd, output, n);
++              if(actual != n){
++                      printk("read_output - read of data failed, "
++                             "errno = %d\n", errno);
++                      return;
++              }
++              remain -= actual;
++      }
++      return;
++}
++
++int net_read(int fd, void *buf, int len)
++{
++      int n;
++
++      while(((n = read(fd,  buf,  len)) < 0) && (errno == EINTR)) ;
++
++      if(n < 0){
++              if(errno == EAGAIN) return(0);
++              return(-errno);
++      }
++      else if(n == 0) return(-ENOTCONN);
++      return(n);
++}
++
++int net_recvfrom(int fd, void *buf, int len)
++{
++      int n;
++
++      while(((n = recvfrom(fd,  buf,  len, 0, NULL, NULL)) < 0) && 
++            (errno == EINTR)) ;
++
++      if(n < 0){
++              if(errno == EAGAIN) return(0);
++              return(-errno);
++      }
++      else if(n == 0) return(-ENOTCONN);
++      return(n);
++}
++
++int net_write(int fd, void *buf, int len)
++{
++      int n;
++
++      while(((n = write(fd, buf, len)) < 0) && (errno == EINTR)) ;
++      if(n < 0){
++              if(errno == EAGAIN) return(0);
++              return(-errno);
++      }
++      else if(n == 0) return(-ENOTCONN);
++      return(n);      
++}
++
++int net_send(int fd, void *buf, int len)
++{
++      int n;
++
++      while(((n = send(fd, buf, len, 0)) < 0) && (errno == EINTR)) ;
++      if(n < 0){
++              if(errno == EAGAIN) return(0);
++              return(-errno);
++      }
++      else if(n == 0) return(-ENOTCONN);
++      return(n);      
++}
++
++int net_sendto(int fd, void *buf, int len, void *to, int sock_len)
++{
++      int n;
++
++      while(((n = sendto(fd, buf, len, 0, (struct sockaddr *) to,
++                         sock_len)) < 0) && (errno == EINTR)) ;
++      if(n < 0){
++              if(errno == EAGAIN) return(0);
++              return(-errno);
++      }
++      else if(n == 0) return(-ENOTCONN);
++      return(n);      
++}
++
++struct change_pre_exec_data {
++      int close_me;
++      int stdout;
++};
++
++static void change_pre_exec(void *arg)
++{
++      struct change_pre_exec_data *data = arg;
++
++      close(data->close_me);
++      dup2(data->stdout, 1);
++}
++
++static int change_tramp(char **argv, char *output, int output_len)
++{
++      int pid, fds[2], err;
++      struct change_pre_exec_data pe_data;
++
++      err = os_pipe(fds, 1, 0);
++      if(err){
++              printk("change_tramp - pipe failed, errno = %d\n", -err);
++              return(err);
++      }
++      pe_data.close_me = fds[0];
++      pe_data.stdout = fds[1];
++      pid = run_helper(change_pre_exec, &pe_data, argv, NULL);
++
++      close(fds[1]);
++      read_output(fds[0], output, output_len);
++      waitpid(pid, NULL, 0);  
++      return(pid);
++}
++
++static void change(char *dev, char *what, unsigned char *addr,
++                 unsigned char *netmask)
++{
++      char addr_buf[sizeof("255.255.255.255\0")];
++      char netmask_buf[sizeof("255.255.255.255\0")];
++      char version[sizeof("nnnnn\0")];
++      char *argv[] = { "uml_net", version, what, dev, addr_buf, 
++                       netmask_buf, NULL };
++      char *output;
++      int output_len, pid;
++
++      sprintf(version, "%d", UML_NET_VERSION);
++      sprintf(addr_buf, "%d.%d.%d.%d", addr[0], addr[1], addr[2], addr[3]);
++      sprintf(netmask_buf, "%d.%d.%d.%d", netmask[0], netmask[1], 
++              netmask[2], netmask[3]);
++
++      output_len = page_size();
++      output = um_kmalloc(output_len);
++      if(output == NULL)
++              printk("change : failed to allocate output buffer\n");
++
++      pid = change_tramp(argv, output, output_len);
++      if(pid < 0) return;
++
++      if(output != NULL){
++              printk("%s", output);
++              kfree(output);
++      }
++}
++
++void open_addr(unsigned char *addr, unsigned char *netmask, void *arg)
++{
++      change(arg, "add", addr, netmask);
++}
++
++void close_addr(unsigned char *addr, unsigned char *netmask, void *arg)
++{
++      change(arg, "del", addr, netmask);
++}
++
++char *split_if_spec(char *str, ...)
++{
++      char **arg, *end;
++      va_list ap;
++
++      va_start(ap, str);
++      while((arg = va_arg(ap, char **)) != NULL){
++              if(*str == '\0')
++                      return(NULL);
++              end = strchr(str, ',');
++              if(end != str)
++                      *arg = str;
++              if(end == NULL)
++                      return(NULL);
++              *end++ = '\0';
++              str = end;
++      }
++      va_end(ap);
++      return(str);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
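
net_read(), net_recvfrom(), net_write(), net_send() and net_sendto() above all follow one convention: retry the syscall on EINTR, report EAGAIN as 0 bytes ("nothing to do yet"), report end-of-file as -ENOTCONN, and return -errno for any other failure. A stand-alone sketch of that pattern for the read side; robust_read() is a hypothetical name, not UML code:

    #include <errno.h>
    #include <unistd.h>

    /* Sketch of the error-handling convention used by the wrappers above:
     * retry on EINTR, EAGAIN -> 0, EOF -> -ENOTCONN, otherwise -errno. */
    static int robust_read(int fd, void *buf, int len)
    {
        int n;

        while (((n = read(fd, buf, len)) < 0) && (errno == EINTR))
            ;
        if (n < 0) {
            if (errno == EAGAIN)
                return 0;
            return -errno;
        }
        if (n == 0)
            return -ENOTCONN;
        return n;
    }
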
+diff -Naur -X ../exclude-files orig/arch/um/drivers/null.c um/arch/um/drivers/null.c
+--- orig/arch/um/drivers/null.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/null.c  Sun Dec 15 21:04:00 2002
+@@ -0,0 +1,56 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdlib.h>
++#include <errno.h>
++#include <fcntl.h>
++#include "chan_user.h"
++#include "os.h"
++
++static int null_chan;
++
++void *null_init(char *str, int device, struct chan_opts *opts)
++{
++      return(&null_chan);
++}
++
++int null_open(int input, int output, int primary, void *d, char **dev_out)
++{
++      *dev_out = NULL;
++      return(os_open_file(DEV_NULL, of_rdwr(OPENFLAGS()), 0));
++}
++
++int null_read(int fd, char *c_out, void *unused)
++{
++      return(-ENODEV);
++}
++
++void null_free(void *data)
++{
++}
++
++struct chan_ops null_ops = {
++      .type           = "null",
++      .init           = null_init,
++      .open           = null_open,
++      .close          = generic_close,
++      .read           = null_read,
++      .write          = generic_write,
++      .console_write  = generic_console_write,
++      .window_size    = generic_window_size,
++      .free           = null_free,
++      .winch          = 0,
++};
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/pcap_kern.c um/arch/um/drivers/pcap_kern.c
+--- orig/arch/um/drivers/pcap_kern.c   Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/pcap_kern.c     Sun Dec 15 21:19:15 2002
+@@ -0,0 +1,127 @@
++/*
++ * Copyright (C) 2002 Jeff Dike <jdike@karaya.com>
++ * Licensed under the GPL.
++ */
++
++#include "linux/init.h"
++#include "linux/netdevice.h"
++#include "linux/etherdevice.h"
++#include "net_kern.h"
++#include "net_user.h"
++#include "pcap_user.h"
++
++struct pcap_init {
++      char *host_if;
++      int promisc;
++      int optimize;
++      char *filter;
++};
++
++void pcap_init(struct net_device *dev, void *data)
++{
++      struct uml_net_private *pri;
++      struct pcap_data *ppri;
++      struct pcap_init *init = data;
++
++      init_etherdev(dev, 0);
++      pri = dev->priv;
++      ppri = (struct pcap_data *) pri->user;
++      *ppri = ((struct pcap_data)
++              { .host_if      = init->host_if,
++                .promisc      = init->promisc,
++                .optimize     = init->optimize,
++                .filter       = init->filter,
++                .compiled     = NULL,
++                .pcap         = NULL });
++}
++
++static int pcap_read(int fd, struct sk_buff **skb, 
++                     struct uml_net_private *lp)
++{
++      *skb = ether_adjust_skb(*skb, ETH_HEADER_OTHER);
++      if(*skb == NULL) return(-ENOMEM);
++      return(pcap_user_read(fd, (*skb)->mac.raw, 
++                            (*skb)->dev->mtu + ETH_HEADER_OTHER,
++                            (struct pcap_data *) &lp->user));
++}
++
++static int pcap_write(int fd, struct sk_buff **skb, struct uml_net_private *lp)
++{
++      return(-EPERM);
++}
++
++static struct net_kern_info pcap_kern_info = {
++      .init                   = pcap_init,
++      .protocol               = eth_protocol,
++      .read                   = pcap_read,
++      .write                  = pcap_write,
++};
++
++int pcap_setup(char *str, char **mac_out, void *data)
++{
++      struct pcap_init *init = data;
++      char *remain, *host_if = NULL, *options[2] = { NULL, NULL };
++      int i;
++
++      *init = ((struct pcap_init)
++              { .host_if      = "eth0",
++                .promisc      = 1,
++                .optimize     = 0,
++                .filter       = NULL });
++
++      remain = split_if_spec(str, &host_if, &init->filter, 
++                             &options[0], &options[1], NULL);
++      if(remain != NULL){
++              printk(KERN_ERR "pcap_setup - Extra garbage on "
++                     "specification : '%s'\n", remain);
++              return(0);
++      }
++
++      if(host_if != NULL)
++              init->host_if = host_if;
++
++      for(i = 0; i < sizeof(options)/sizeof(options[0]); i++){
++              if(options[i] == NULL)
++                      continue;
++              if(!strcmp(options[i], "promisc"))
++                      init->promisc = 1;
++              else if(!strcmp(options[i], "nopromisc"))
++                      init->promisc = 0;
++              else if(!strcmp(options[i], "optimize"))
++                      init->optimize = 1;
++              else if(!strcmp(options[i], "nooptimize"))
++                      init->optimize = 0;
++              else printk("pcap_setup : bad option - '%s'\n", options[i]);
++      }
++
++      return(1);
++}
++
++static struct transport pcap_transport = {
++      .list           = LIST_HEAD_INIT(pcap_transport.list),
++      .name           = "pcap",
++      .setup          = pcap_setup,
++      .user           = &pcap_user_info,
++      .kern           = &pcap_kern_info,
++      .private_size   = sizeof(struct pcap_data),
++      .setup_size     = sizeof(struct pcap_init),
++};
++
++static int register_pcap(void)
++{
++      register_transport(&pcap_transport);
++      return(1);
++}
++
++__initcall(register_pcap);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/pcap_user.c um/arch/um/drivers/pcap_user.c
+--- orig/arch/um/drivers/pcap_user.c   Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/pcap_user.c     Sun Dec 15 21:04:39 2002
+@@ -0,0 +1,143 @@
++/*
++ * Copyright (C) 2002 Jeff Dike <jdike@karaya.com>
++ * Licensed under the GPL.
++ */
++
++#include <unistd.h>
++#include <stdlib.h>
++#include <string.h>
++#include <errno.h>
++#include <pcap.h>
++#include <asm/types.h>
++#include "net_user.h"
++#include "pcap_user.h"
++#include "user.h"
++
++#define MAX_PACKET (ETH_MAX_PACKET + ETH_HEADER_OTHER)
++
++#define PCAP_FD(p) (*(int *)(p))
++
++static void pcap_user_init(void *data, void *dev)
++{
++      struct pcap_data *pri = data;
++      pcap_t *p;
++      char errors[PCAP_ERRBUF_SIZE];
++
++      p = pcap_open_live(pri->host_if, MAX_PACKET, pri->promisc, 0, errors);
++      if(p == NULL){
++              printk("pcap_user_init : pcap_open_live failed - '%s'\n", 
++                     errors);
++              return;
++      }
++
++      pri->dev = dev;
++      pri->pcap = p;
++}
++
++static int pcap_open(void *data)
++{
++      struct pcap_data *pri = data;
++      __u32 netmask;
++      int err;
++
++      if(pri->pcap == NULL)
++              return(-ENODEV);
++
++      if(pri->filter != NULL){
++              err = dev_netmask(pri->dev, &netmask);
++              if(err < 0){
++                      printk("pcap_open : dev_netmask failed\n");
++                      return(-EIO);
++              }
++
++              pri->compiled = um_kmalloc(sizeof(struct bpf_program));
++              if(pri->compiled == NULL){
++                      printk("pcap_open : kmalloc failed\n");
++                      return(-ENOMEM);
++              }
++              
++              err = pcap_compile(pri->pcap, 
++                                 (struct bpf_program *) pri->compiled, 
++                                 pri->filter, pri->optimize, netmask);
++              if(err < 0){
++                      printk("pcap_open : pcap_compile failed - '%s'\n", 
++                             pcap_geterr(pri->pcap));
++                      return(-EIO);
++              }
++
++              err = pcap_setfilter(pri->pcap, pri->compiled);
++              if(err < 0){
++                      printk("pcap_open : pcap_setfilter failed - '%s'\n", 
++                             pcap_geterr(pri->pcap));
++                      return(-EIO);
++              }
++      }
++      
++      return(PCAP_FD(pri->pcap));
++}
++
++static void pcap_remove(void *data)
++{
++      struct pcap_data *pri = data;
++
++      if(pri->compiled != NULL)
++              pcap_freecode(pri->compiled);
++
++      pcap_close(pri->pcap);
++}
++
++struct pcap_handler_data {
++      char *buffer;
++      int len;
++};
++
++static void handler(u_char *data, const struct pcap_pkthdr *header, 
++                  const u_char *packet)
++{
++      int len;
++
++      struct pcap_handler_data *hdata = (struct pcap_handler_data *) data;
++
++      len = hdata->len < header->caplen ? hdata->len : header->caplen;
++      memcpy(hdata->buffer, packet, len);
++      hdata->len = len;
++}
++
++int pcap_user_read(int fd, void *buffer, int len, struct pcap_data *pri)
++{
++      struct pcap_handler_data hdata = ((struct pcap_handler_data)
++                                        { .buffer     = buffer,
++                                          .len        = len });
++      int n;
++
++      n = pcap_dispatch(pri->pcap, 1, handler, (u_char *) &hdata);
++      if(n < 0){
++              printk("pcap_dispatch failed - %s\n", pcap_geterr(pri->pcap));
++              return(-EIO);
++      }
++      else if(n == 0) 
++              return(0);
++      return(hdata.len);
++}
++
++struct net_user_info pcap_user_info = {
++      .init           = pcap_user_init,
++      .open           = pcap_open,
++      .close          = NULL,
++      .remove         = pcap_remove,
++      .set_mtu        = NULL,
++      .add_address    = NULL,
++      .delete_address = NULL,
++      .max_packet     = MAX_PACKET - ETH_HEADER_OTHER
++};
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
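
pcap_user.c drives libpcap through the usual sequence: pcap_open_live(), then pcap_compile()/pcap_setfilter() when a filter string was given, then one packet per call via pcap_dispatch(). A minimal user-space sketch of that same sequence, assuming an arbitrary interface name ("eth0"), snaplen and filter ("ip") rather than the values the driver takes from the command line:

    #include <stdio.h>
    #include <pcap.h>

    /* Minimal libpcap sketch of the open/compile/setfilter/dispatch sequence
     * used by pcap_user_init(), pcap_open() and pcap_user_read() above. */
    static void got_packet(u_char *user, const struct pcap_pkthdr *hdr,
                           const u_char *bytes)
    {
        (void) user; (void) bytes;
        printf("captured %u bytes (%u on the wire)\n", hdr->caplen, hdr->len);
    }

    int main(void)
    {
        char errbuf[PCAP_ERRBUF_SIZE];
        struct bpf_program prog;
        pcap_t *p;

        p = pcap_open_live("eth0", 1522, 1 /* promisc */, 0, errbuf);
        if (p == NULL) {
            fprintf(stderr, "pcap_open_live: %s\n", errbuf);
            return 1;
        }
        /* netmask 0 here; the driver above asks the stack via dev_netmask() */
        if (pcap_compile(p, &prog, "ip", 0, 0) < 0 ||
            pcap_setfilter(p, &prog) < 0) {
            fprintf(stderr, "filter setup: %s\n", pcap_geterr(p));
            pcap_close(p);
            return 1;
        }
        if (pcap_dispatch(p, 1, got_packet, NULL) < 0)
            fprintf(stderr, "pcap_dispatch: %s\n", pcap_geterr(p));

        pcap_freecode(&prog);
        pcap_close(p);
        return 0;
    }
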
+diff -Naur -X ../exclude-files orig/arch/um/drivers/pcap_user.h um/arch/um/drivers/pcap_user.h
+--- orig/arch/um/drivers/pcap_user.h   Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/pcap_user.h     Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,31 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "net_user.h"
++
++struct pcap_data {
++      char *host_if;
++      int promisc;
++      int optimize;
++      char *filter;
++      void *compiled;
++      void *pcap;
++      void *dev;
++};
++
++extern struct net_user_info pcap_user_info;
++
++extern int pcap_user_read(int fd, void *buf, int len, struct pcap_data *pri);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/port.h um/arch/um/drivers/port.h
+--- orig/arch/um/drivers/port.h        Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/port.h  Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,30 @@
++/* 
++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __PORT_H__
++#define __PORT_H__
++
++extern void *port_data(int port);
++extern int port_wait(void *data);
++extern void port_kern_close(void *d);
++extern int port_connection(int fd, int *socket_out, int *pid_out);
++extern int port_listen_fd(int port);
++extern void port_read(int fd, void *data);
++extern void port_kern_free(void *d);
++extern int port_rcv_fd(int fd);
++extern void port_remove_dev(void *d);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/port_kern.c um/arch/um/drivers/port_kern.c
+--- orig/arch/um/drivers/port_kern.c   Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/port_kern.c     Mon Dec 30 20:57:42 2002
+@@ -0,0 +1,302 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/list.h"
++#include "linux/sched.h"
++#include "linux/slab.h"
++#include "linux/irq.h"
++#include "linux/spinlock.h"
++#include "linux/errno.h"
++#include "asm/semaphore.h"
++#include "asm/errno.h"
++#include "kern_util.h"
++#include "kern.h"
++#include "irq_user.h"
++#include "port.h"
++#include "init.h"
++#include "os.h"
++
++struct port_list {
++      struct list_head list;
++      int has_connection;
++      struct semaphore sem;
++      int port;
++      int fd;
++      spinlock_t lock;
++      struct list_head pending;
++      struct list_head connections;
++};
++
++struct port_dev {
++      struct port_list *port;
++      int helper_pid;
++      int telnetd_pid;
++};
++
++struct connection {
++      struct list_head list;
++      int fd;
++      int helper_pid;
++      int socket[2];
++      int telnetd_pid;
++      struct port_list *port;
++};
++
++static void pipe_interrupt(int irq, void *data, struct pt_regs *regs)
++{
++      struct connection *conn = data;
++      int fd;
++
++      fd = os_rcv_fd(conn->socket[0], &conn->helper_pid);
++      if(fd < 0){
++              if(fd == -EAGAIN)
++                      return;
++
++              printk(KERN_ERR "pipe_interrupt : os_rcv_fd returned %d\n",
++                     -fd);
++              os_close_file(conn->fd);
++      }
++
++      list_del(&conn->list);
++
++      conn->fd = fd;
++      list_add(&conn->list, &conn->port->connections);
++
++      up(&conn->port->sem);
++}
++
++static int port_accept(struct port_list *port)
++{
++      struct connection *conn;
++      int fd, socket[2], pid, ret = 0;
++
++      fd = port_connection(port->fd, socket, &pid);
++      if(fd < 0){
++              if(fd != -EAGAIN)
++                      printk(KERN_ERR "port_accept : port_connection "
++                             "returned %d\n", -fd);
++              goto out;
++      }
++
++      conn = kmalloc(sizeof(*conn), GFP_ATOMIC);
++      if(conn == NULL){
++              printk(KERN_ERR "port_accept : failed to allocate "
++                     "connection\n");
++              goto out_close;
++      }
++      *conn = ((struct connection) 
++              { .list         = LIST_HEAD_INIT(conn->list),
++                .fd           = fd,
++                .socket       = { socket[0], socket[1] },
++                .telnetd_pid  = pid,
++                .port         = port });
++
++      if(um_request_irq(TELNETD_IRQ, socket[0], IRQ_READ, pipe_interrupt, 
++                        SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM, 
++                        "telnetd", conn)){
++              printk(KERN_ERR "port_accept : failed to get IRQ for "
++                     "telnetd\n");
++              goto out_free;
++      }
++
++      list_add(&conn->list, &port->pending);
++      return(1);
++
++ out_free:
++      kfree(conn);
++ out_close:
++      os_close_file(fd);
++      if(pid != -1) 
++              os_kill_process(pid, 1);
++ out:
++      return(ret);
++} 
++
++DECLARE_MUTEX(ports_sem);
++struct list_head ports = LIST_HEAD_INIT(ports);
++
++void port_task_proc(void *unused)
++{
++      struct port_list *port;
++      struct list_head *ele;
++      unsigned long flags;
++
++      save_flags(flags);
++      list_for_each(ele, &ports){
++              port = list_entry(ele, struct port_list, list);
++              if(!port->has_connection)
++                      continue;
++              reactivate_fd(port->fd, ACCEPT_IRQ);
++              while(port_accept(port)) ;
++              port->has_connection = 0;
++      }
++      restore_flags(flags);
++}
++
++struct tq_struct port_task = {
++      .routine        = port_task_proc,
++      .data           = NULL
++};
++
++static void port_interrupt(int irq, void *data, struct pt_regs *regs)
++{
++      struct port_list *port = data;
++
++      port->has_connection = 1;
++      schedule_task(&port_task);
++} 
++
++void *port_data(int port_num)
++{
++      struct list_head *ele;
++      struct port_list *port;
++      struct port_dev *dev = NULL;
++      int fd;
++
++      down(&ports_sem);
++      list_for_each(ele, &ports){
++              port = list_entry(ele, struct port_list, list);
++              if(port->port == port_num) goto found;
++      }
++      port = kmalloc(sizeof(struct port_list), GFP_KERNEL);
++      if(port == NULL){
++              printk(KERN_ERR "Allocation of port list failed\n");
++              goto out;
++      }
++
++      fd = port_listen_fd(port_num);
++      if(fd < 0){
++              printk(KERN_ERR "binding to port %d failed, errno = %d\n",
++                     port_num, -fd);
++              goto out_free;
++      }
++      if(um_request_irq(ACCEPT_IRQ, fd, IRQ_READ, port_interrupt, 
++                        SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM, "port",
++                        port)){
++              printk(KERN_ERR "Failed to get IRQ for port %d\n", port_num);
++              goto out_close;
++      }
++
++      *port = ((struct port_list) 
++              { .list                 = LIST_HEAD_INIT(port->list),
++                .has_connection       = 0,
++                .sem                  = __SEMAPHORE_INITIALIZER(port->sem, 
++                                                                0),
++                .lock                 = SPIN_LOCK_UNLOCKED,
++                .port                 = port_num,
++                .fd                   = fd,
++                .pending              = LIST_HEAD_INIT(port->pending),
++                .connections          = LIST_HEAD_INIT(port->connections) });
++      list_add(&port->list, &ports);
++
++ found:
++      dev = kmalloc(sizeof(struct port_dev), GFP_KERNEL);
++      if(dev == NULL){
++              printk(KERN_ERR "Allocation of port device entry failed\n");
++              goto out;
++      }
++
++      *dev = ((struct port_dev) { .port               = port,
++                                  .helper_pid         = -1,
++                                  .telnetd_pid        = -1 });
++      goto out;
++
++ out_free:
++      kfree(port);
++ out_close:
++      os_close_file(fd);
++ out:
++      up(&ports_sem);
++      return(dev);
++}
++
++int port_wait(void *data)
++{
++      struct port_dev *dev = data;
++      struct connection *conn;
++      struct port_list *port = dev->port;
++      int fd;
++
++      while(1){
++              if(down_interruptible(&port->sem)) 
++                      return(-ERESTARTSYS);
++
++              spin_lock(&port->lock);
++
++              conn = list_entry(port->connections.next, struct connection, 
++                                list);
++              list_del(&conn->list);
++              spin_unlock(&port->lock);
++
++              os_shutdown_socket(conn->socket[0], 1, 1);
++              os_close_file(conn->socket[0]);
++              os_shutdown_socket(conn->socket[1], 1, 1);
++              os_close_file(conn->socket[1]); 
++
++              /* This is done here because freeing an IRQ can't be done
++               * within the IRQ handler.  So, pipe_interrupt always ups
++               * the semaphore regardless of whether it got a successful
++               * connection.  Then we loop here throwing out failed 
++               * connections until a good one is found.
++               */
++              free_irq(TELNETD_IRQ, conn);
++
++              if(conn->fd >= 0) break;
++              os_close_file(conn->fd);
++              kfree(conn);
++      }
++
++      fd = conn->fd;
++      dev->helper_pid = conn->helper_pid;
++      dev->telnetd_pid = conn->telnetd_pid;
++      kfree(conn);
++
++      return(fd);
++}
++
++void port_remove_dev(void *d)
++{
++      struct port_dev *dev = d;
++
++      if(dev->helper_pid != -1)
++              os_kill_process(dev->helper_pid, 0);
++      if(dev->telnetd_pid != -1)
++              os_kill_process(dev->telnetd_pid, 1);
++      dev->helper_pid = -1;
++      dev->telnetd_pid = -1;
++}
++
++void port_kern_free(void *d)
++{
++      struct port_dev *dev = d;
++
++      port_remove_dev(dev);
++      kfree(dev);
++}
++
++static void free_port(void)
++{
++      struct list_head *ele;
++      struct port_list *port;
++
++      list_for_each(ele, &ports){
++              port = list_entry(ele, struct port_list, list);
++              free_irq_by_fd(port->fd);
++              os_close_file(port->fd);
++      }
++}
++
++__uml_exitcall(free_port);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
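
port_kern.c never accepts the telnet session itself; the user-level helper does, and the resulting file descriptor travels back over a UNIX-domain socket (os_rcv_fd() in pipe_interrupt()). The generic mechanism behind that is SCM_RIGHTS descriptor passing, sketched below; recv_fd() is a hypothetical helper written for illustration, not UML's os_rcv_fd():

    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    /* Illustrative SCM_RIGHTS receive: read one byte of normal data plus one
     * passed descriptor from a UNIX-domain socket, returning the descriptor
     * or -1. */
    static int recv_fd(int sock)
    {
        char c;
        struct iovec iov = { .iov_base = &c, .iov_len = 1 };
        union {
            struct cmsghdr hdr;
            char buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr msg;
        struct cmsghdr *cmsg;
        int fd = -1;

        memset(&msg, 0, sizeof(msg));
        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
        msg.msg_control = control.buf;
        msg.msg_controllen = sizeof(control.buf);

        if (recvmsg(sock, &msg, 0) <= 0)
            return -1;

        cmsg = CMSG_FIRSTHDR(&msg);
        if (cmsg != NULL && cmsg->cmsg_level == SOL_SOCKET &&
            cmsg->cmsg_type == SCM_RIGHTS)
            memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
        return fd;
    }

The sending side is the mirror image, a sendmsg() carrying the same SCM_RIGHTS control message, which is presumably what the port-helper end of this handshake amounts to.
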
+diff -Naur -X ../exclude-files orig/arch/um/drivers/port_user.c um/arch/um/drivers/port_user.c
+--- orig/arch/um/drivers/port_user.c   Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/port_user.c     Mon Dec 16 22:46:20 2002
+@@ -0,0 +1,206 @@
++/* 
++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <stddef.h>
++#include <stdlib.h>
++#include <string.h>
++#include <errno.h>
++#include <unistd.h>
++#include <termios.h>
++#include <sys/socket.h>
++#include <sys/un.h>
++#include <netinet/in.h>
++#include "user_util.h"
++#include "kern_util.h"
++#include "user.h"
++#include "chan_user.h"
++#include "port.h"
++#include "helper.h"
++#include "os.h"
++
++struct port_chan {
++      int raw;
++      struct termios tt;
++      void *kernel_data;
++      char dev[sizeof("32768\0")];
++};
++
++void *port_init(char *str, int device, struct chan_opts *opts)
++{
++      struct port_chan *data;
++      void *kern_data;
++      char *end;
++      int port;
++
++      if(*str != ':'){
++              printk("port_init : channel type 'port' must specify a "
++                     "port number\n");
++              return(NULL);
++      }
++      str++;
++      port = strtoul(str, &end, 0);
++      if((*end != '\0') || (end == str)){
++              printk("port_init : couldn't parse port '%s'\n", str);
++              return(NULL);
++      }
++
++      if((kern_data = port_data(port)) == NULL) 
++              return(NULL);
++
++      if((data = um_kmalloc(sizeof(*data))) == NULL) 
++              goto err;
++
++      *data = ((struct port_chan) { .raw              = opts->raw,
++                                    .kernel_data      = kern_data });
++      sprintf(data->dev, "%d", port);
++
++      return(data);
++ err:
++      port_kern_free(kern_data);
++      return(NULL);
++}
++
++void port_free(void *d)
++{
++      struct port_chan *data = d;
++
++      port_kern_free(data->kernel_data);
++      kfree(data);
++}
++
++int port_open(int input, int output, int primary, void *d, char **dev_out)
++{
++      struct port_chan *data = d;
++      int fd;
++
++      fd = port_wait(data->kernel_data);
++      if((fd >= 0) && data->raw){
++              tcgetattr(fd, &data->tt);
++              raw(fd, 0);
++      }
++      *dev_out = data->dev;
++      return(fd);
++}
++
++void port_close(int fd, void *d)
++{
++      struct port_chan *data = d;
++
++      port_remove_dev(data->kernel_data);
++      close(fd);
++}
++
++int port_console_write(int fd, const char *buf, int n, void *d)
++{
++      struct port_chan *data = d;
++
++      return(generic_console_write(fd, buf, n, &data->tt));
++}
++
++struct chan_ops port_ops = {
++      .type           = "port",
++      .init           = port_init,
++      .open           = port_open,
++      .close          = port_close,
++      .read           = generic_read,
++      .write          = generic_write,
++      .console_write  = port_console_write,
++      .window_size    = generic_window_size,
++      .free           = port_free,
++      .winch          = 1,
++};
++
++int port_listen_fd(int port)
++{
++      struct sockaddr_in addr;
++      int fd, err;
++
++      fd = socket(PF_INET, SOCK_STREAM, 0);
++      if(fd == -1) 
++              return(-errno);
++
++      addr.sin_family = AF_INET;
++      addr.sin_port = htons(port);
++      addr.sin_addr.s_addr = htonl(INADDR_ANY);
++      if(bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0){
++              err = -errno;
++              goto out;
++      }
++  
++      if((listen(fd, 1) < 0) || (os_set_fd_block(fd, 0))){
++              err = -errno;
++              goto out;
++      }
++
++      return(fd);
++ out:
++      os_close_file(fd);
++      return(err);
++}
++
++struct port_pre_exec_data {
++      int sock_fd;
++      int pipe_fd;
++};
++
++void port_pre_exec(void *arg)
++{
++      struct port_pre_exec_data *data = arg;
++
++      dup2(data->sock_fd, 0);
++      dup2(data->sock_fd, 1);
++      dup2(data->sock_fd, 2);
++      close(data->sock_fd);
++      dup2(data->pipe_fd, 3);
++      os_shutdown_socket(3, 1, 0);
++      close(data->pipe_fd);
++}
++
++int port_connection(int fd, int *socket, int *pid_out)
++{
++      int new, err;
++      char *argv[] = { "/usr/sbin/in.telnetd", "-L", 
++                       "/usr/lib/uml/port-helper", NULL };
++      struct port_pre_exec_data data;
++
++      if((new = os_accept_connection(fd)) < 0)
++              return(-errno);
++
++      err = os_pipe(socket, 0, 0);
++      if(err) 
++              goto out_close;
++
++      data = ((struct port_pre_exec_data)
++              { .sock_fd              = new,
++                .pipe_fd              = socket[1] });
++
++      err = run_helper(port_pre_exec, &data, argv, NULL);
++      if(err < 0) 
++              goto out_shutdown;
++
++      *pid_out = err;
++      return(new);
++
++ out_shutdown:
++      os_shutdown_socket(socket[0], 1, 1);
++      close(socket[0]);
++      os_shutdown_socket(socket[1], 1, 1);    
++      close(socket[1]);
++ out_close:
++      close(new);
++      return(err);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
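
port_listen_fd() above is a plain socket()/bind()/listen() sequence followed by switching the descriptor to non-blocking mode so the accept path can be driven from a read interrupt. The same sequence written against the raw POSIX API, without UML's os_* wrappers; listen_on_port() is a hypothetical name, not part of the patch:

    #include <errno.h>
    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>
    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/socket.h>

    /* TCP socket bound to INADDR_ANY:port, listening, set to O_NONBLOCK. */
    static int listen_on_port(int port)
    {
        struct sockaddr_in addr;
        int fd, flags, err;

        fd = socket(PF_INET, SOCK_STREAM, 0);
        if (fd < 0)
            return -errno;

        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_port = htons(port);
        addr.sin_addr.s_addr = htonl(INADDR_ANY);

        if (bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0 ||
            listen(fd, 1) < 0 ||
            (flags = fcntl(fd, F_GETFL)) < 0 ||
            fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
            err = -errno;
            close(fd);
            return err;
        }
        return fd;
    }

The final fcntl() pair is presumably what os_set_fd_block(fd, 0) wraps in the patch.
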
+diff -Naur -X ../exclude-files orig/arch/um/drivers/pty.c um/arch/um/drivers/pty.c
+--- orig/arch/um/drivers/pty.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/pty.c   Sun Dec 15 21:06:01 2002
+@@ -0,0 +1,148 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <unistd.h>
++#include <string.h>
++#include <errno.h>
++#include <fcntl.h>
++#include <termios.h>
++#include "chan_user.h"
++#include "user.h"
++#include "user_util.h"
++#include "kern_util.h"
++
++struct pty_chan {
++      void (*announce)(char *dev_name, int dev);
++      int dev;
++      int raw;
++      struct termios tt;
++      char dev_name[sizeof("/dev/pts/0123456\0")];
++};
++
++void *pty_chan_init(char *str, int device, struct chan_opts *opts)
++{
++      struct pty_chan *data;
++
++      if((data = um_kmalloc(sizeof(*data))) == NULL) return(NULL);
++      *data = ((struct pty_chan) { .announce          = opts->announce, 
++                                   .dev               = device,
++                                   .raw               = opts->raw });
++      return(data);
++}
++
++int pts_open(int input, int output, int primary, void *d, char **dev_out)
++{
++      struct pty_chan *data = d;
++      char *dev;
++      int fd;
++
++      if((fd = get_pty()) < 0){
++              printk("open_pts : Failed to open pts\n");
++              return(-errno);
++      }
++      if(data->raw){
++              tcgetattr(fd, &data->tt);
++              raw(fd, 0);
++      }
++
++      dev = ptsname(fd);
++      sprintf(data->dev_name, "%s", dev);
++      *dev_out = data->dev_name;
++      if(data->announce) (*data->announce)(dev, data->dev);
++      return(fd);
++}
++
++int getmaster(char *line)
++{
++      struct stat stb;
++      char *pty, *bank, *cp;
++      int master;
++
++      pty = &line[strlen("/dev/ptyp")];
++      for (bank = "pqrs"; *bank; bank++) {
++              line[strlen("/dev/pty")] = *bank;
++              *pty = '0';
++              if (stat(line, &stb) < 0)
++                      break;
++              for (cp = "0123456789abcdef"; *cp; cp++) {
++                      *pty = *cp;
++                      master = open(line, O_RDWR);
++                      if (master >= 0) {
++                              char *tp = &line[strlen("/dev/")];
++                              int ok;
++
++                              /* verify slave side is usable */
++                              *tp = 't';
++                              ok = access(line, R_OK|W_OK) == 0;
++                              *tp = 'p';
++                              if (ok) return(master);
++                              (void) close(master);
++                      }
++              }
++      }
++      return(-1);
++}
++
++int pty_open(int input, int output, int primary, void *d, char **dev_out)
++{
++      struct pty_chan *data = d;
++      int fd;
++      char dev[sizeof("/dev/ptyxx\0")] = "/dev/ptyxx";
++
++      fd = getmaster(dev);
++      if(fd < 0) return(-errno);
++      
++      if(data->raw) raw(fd, 0);
++      if(data->announce) (*data->announce)(dev, data->dev);
++
++      sprintf(data->dev_name, "%s", dev);
++      *dev_out = data->dev_name;
++      return(fd);
++}
++
++int pty_console_write(int fd, const char *buf, int n, void *d)
++{
++      struct pty_chan *data = d;
++
++      return(generic_console_write(fd, buf, n, &data->tt));
++}
++
++struct chan_ops pty_ops = {
++      .type           = "pty",
++      .init           = pty_chan_init,
++      .open           = pty_open,
++      .close          = generic_close,
++      .read           = generic_read,
++      .write          = generic_write,
++      .console_write  = pty_console_write,
++      .window_size    = generic_window_size,
++      .free           = generic_free,
++      .winch          = 0,
++};
++
++struct chan_ops pts_ops = {
++      .type           = "pts",
++      .init           = pty_chan_init,
++      .open           = pts_open,
++      .close          = generic_close,
++      .read           = generic_read,
++      .write          = generic_write,
++      .console_write  = pty_console_write,
++      .window_size    = generic_window_size,
++      .free           = generic_free,
++      .winch          = 0,
++};
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
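
getmaster() above probes the legacy BSD /dev/ptyXX names, while pts_open() goes through UML's get_pty()/ptsname() pair. For comparison, the Unix98 interface allocates a master/slave pair with posix_openpt()/grantpt()/unlockpt(); open_pty_pair() below is a hypothetical helper written for illustration, not part of the patch:

    #define _XOPEN_SOURCE 600
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    /* Unix98 pty allocation - the modern counterpart of getmaster() above. */
    static int open_pty_pair(char *slave_name, size_t len)
    {
        const char *name;
        int master;

        master = posix_openpt(O_RDWR | O_NOCTTY);
        if (master < 0)
            return -1;
        if (grantpt(master) < 0 || unlockpt(master) < 0 ||
            (name = ptsname(master)) == NULL) {
            close(master);
            return -1;
        }
        snprintf(slave_name, len, "%s", name);
        return master;
    }
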
+diff -Naur -X ../exclude-files orig/arch/um/drivers/slip.h um/arch/um/drivers/slip.h
+--- orig/arch/um/drivers/slip.h        Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/slip.h  Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,39 @@
++#ifndef __UM_SLIP_H
++#define __UM_SLIP_H
++
++#define BUF_SIZE 1500
++ /* Two bytes each for a (pathological) max packet of escaped chars, plus
++  * the terminating END char, plus the initial END char.               */
++#define ENC_BUF_SIZE (2 * BUF_SIZE + 2)
++
++struct slip_data {
++      void *dev;
++      char name[sizeof("slnnnnn\0")];
++      char *addr;
++      char *gate_addr;
++      int slave;
++      char ibuf[ENC_BUF_SIZE];
++      char obuf[ENC_BUF_SIZE];
++      int more; /* more data: do not read fd until ibuf has been drained */
++      int pos;
++      int esc;
++};
++
++extern struct net_user_info slip_user_info;
++
++extern int set_umn_addr(int fd, char *addr, char *ptp_addr);
++extern int slip_user_read(int fd, void *buf, int len, struct slip_data *pri);
++extern int slip_user_write(int fd, void *buf, int len, struct slip_data *pri);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
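
Worked check of the ENC_BUF_SIZE bound above: in the worst case every one of the BUF_SIZE (1500) payload bytes is an END or ESC and encodes as two bytes, and the encoder adds one END before the frame and one after it, so 2 * 1500 + 2 = 3002 bytes, which is exactly (2 * BUF_SIZE + 2).
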
+diff -Naur -X ../exclude-files orig/arch/um/drivers/slip_kern.c um/arch/um/drivers/slip_kern.c
+--- orig/arch/um/drivers/slip_kern.c   Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/slip_kern.c     Sun Dec 15 21:06:24 2002
+@@ -0,0 +1,109 @@
++#include "linux/config.h"
++#include "linux/kernel.h"
++#include "linux/stddef.h"
++#include "linux/init.h"
++#include "linux/netdevice.h"
++#include "linux/if_arp.h"
++#include "net_kern.h"
++#include "net_user.h"
++#include "kern.h"
++#include "slip.h"
++
++struct slip_init {
++      char *gate_addr;
++};
++
++void slip_init(struct net_device *dev, void *data)
++{
++      struct uml_net_private *private;
++      struct slip_data *spri;
++      struct slip_init *init = data;
++
++      private = dev->priv;
++      spri = (struct slip_data *) private->user;
++      *spri = ((struct slip_data)
++              { .name         = { '\0' },
++                .addr         = NULL,
++                .gate_addr    = init->gate_addr,
++                .slave        = -1,
++                .ibuf         = { '\0' },
++                .obuf         = { '\0' },
++                .pos          = 0,
++                .esc          = 0,
++                .dev          = dev });
++
++      dev->init = NULL;
++      dev->hard_header_len = 0;
++      dev->addr_len = 4;
++      dev->type = ARPHRD_ETHER;
++      dev->tx_queue_len = 256;
++      dev->flags = IFF_NOARP;
++      printk("SLIP backend - SLIP IP = %s\n", spri->gate_addr);
++}
++
++static unsigned short slip_protocol(struct sk_buff *skbuff)
++{
++      return(htons(ETH_P_IP));
++}
++
++static int slip_read(int fd, struct sk_buff **skb, 
++                     struct uml_net_private *lp)
++{
++      return(slip_user_read(fd, (*skb)->mac.raw, (*skb)->dev->mtu, 
++                            (struct slip_data *) &lp->user));
++}
++
++static int slip_write(int fd, struct sk_buff **skb,
++                    struct uml_net_private *lp)
++{
++      return(slip_user_write(fd, (*skb)->data, (*skb)->len, 
++                             (struct slip_data *) &lp->user));
++}
++
++struct net_kern_info slip_kern_info = {
++      .init                   = slip_init,
++      .protocol               = slip_protocol,
++      .read                   = slip_read,
++      .write                  = slip_write,
++};
++
++static int slip_setup(char *str, char **mac_out, void *data)
++{
++      struct slip_init *init = data;
++
++      *init = ((struct slip_init)
++              { .gate_addr            = NULL });
++
++      if(str[0] != '\0') 
++              init->gate_addr = str;
++      return(1);
++}
++
++static struct transport slip_transport = {
++      .list           = LIST_HEAD_INIT(slip_transport.list),
++      .name           = "slip",
++      .setup          = slip_setup,
++      .user           = &slip_user_info,
++      .kern           = &slip_kern_info,
++      .private_size   = sizeof(struct slip_data),
++      .setup_size     = sizeof(struct slip_init),
++};
++
++static int register_slip(void)
++{
++      register_transport(&slip_transport);
++      return(1);
++}
++
++__initcall(register_slip);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/slip_proto.h um/arch/um/drivers/slip_proto.h
+--- orig/arch/um/drivers/slip_proto.h  Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/slip_proto.h    Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,93 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_SLIP_PROTO_H__
++#define __UM_SLIP_PROTO_H__
++
++/* SLIP protocol characters. */
++#define SLIP_END             0300     /* indicates end of frame       */
++#define SLIP_ESC             0333     /* indicates byte stuffing      */
++#define SLIP_ESC_END         0334     /* ESC ESC_END means END 'data' */
++#define SLIP_ESC_ESC         0335     /* ESC ESC_ESC means ESC 'data' */
++
++static inline int slip_unesc(unsigned char c,char *buf,int *pos, int *esc)
++{
++      int ret;
++
++      switch(c){
++      case SLIP_END:
++              *esc = 0;
++              ret=*pos;
++              *pos=0;
++              return(ret);
++      case SLIP_ESC:
++              *esc = 1;
++              return(0);
++      case SLIP_ESC_ESC:
++              if(*esc){
++                      *esc = 0;
++                      c = SLIP_ESC;
++              }
++              break;
++      case SLIP_ESC_END:
++              if(*esc){
++                      *esc = 0;
++                      c = SLIP_END;
++              }
++              break;
++      }
++      buf[(*pos)++] = c;
++      return(0);
++}
++
++static inline int slip_esc(unsigned char *s, unsigned char *d, int len)
++{
++      unsigned char *ptr = d;
++      unsigned char c;
++
++      /*
++       * Send an initial END character to flush out any
++       * data that may have accumulated in the receiver
++       * due to line noise.
++       */
++
++      *ptr++ = SLIP_END;
++
++      /*
++       * For each byte in the packet, send the appropriate
++       * character sequence, according to the SLIP protocol.
++       */
++
++      while (len-- > 0) {
++              switch(c = *s++) {
++              case SLIP_END:
++                      *ptr++ = SLIP_ESC;
++                      *ptr++ = SLIP_ESC_END;
++                      break;
++              case SLIP_ESC:
++                      *ptr++ = SLIP_ESC;
++                      *ptr++ = SLIP_ESC_ESC;
++                      break;
++              default:
++                      *ptr++ = c;
++                      break;
++              }
++      }
++      *ptr++ = SLIP_END;
++      return (ptr - d);
++}
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
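
slip_esc()/slip_unesc() above implement RFC 1055 framing: END (0300) delimits frames, and a literal END or ESC inside the payload becomes ESC ESC_END or ESC ESC_ESC. A small stand-alone restatement of the encoder with a worked example of the expansion (slip_encode() is an illustrative name, not the header's function):

    #include <stdio.h>

    #define SLIP_END     0300
    #define SLIP_ESC     0333
    #define SLIP_ESC_END 0334
    #define SLIP_ESC_ESC 0335

    /* Encode a tiny frame and print it: {0x01, END, ESC, 0x02} becomes
     * END 01 ESC ESC_END ESC ESC_ESC 02 END. */
    static int slip_encode(const unsigned char *s, unsigned char *d, int len)
    {
        unsigned char *p = d;

        *p++ = SLIP_END;
        while (len-- > 0) {
            switch (*s) {
            case SLIP_END:
                *p++ = SLIP_ESC;
                *p++ = SLIP_ESC_END;
                break;
            case SLIP_ESC:
                *p++ = SLIP_ESC;
                *p++ = SLIP_ESC_ESC;
                break;
            default:
                *p++ = *s;
            }
            s++;
        }
        *p++ = SLIP_END;
        return p - d;
    }

    int main(void)
    {
        unsigned char in[] = { 0x01, SLIP_END, SLIP_ESC, 0x02 };
        unsigned char out[2 * sizeof(in) + 2];
        int i, n = slip_encode(in, out, sizeof(in));

        for (i = 0; i < n; i++)
            printf("%03o ", out[i]);
        printf("\n");
        return 0;
    }
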
+diff -Naur -X ../exclude-files orig/arch/um/drivers/slip_user.c um/arch/um/drivers/slip_user.c
+--- orig/arch/um/drivers/slip_user.c   Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/slip_user.c     Sun Dec 15 21:06:35 2002
+@@ -0,0 +1,279 @@
++#include <stdio.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <stddef.h>
++#include <sched.h>
++#include <string.h>
++#include <sys/fcntl.h>
++#include <sys/errno.h>
++#include <sys/termios.h>
++#include <sys/wait.h>
++#include <sys/ioctl.h>
++#include <sys/signal.h>
++#include "user_util.h"
++#include "kern_util.h"
++#include "user.h"
++#include "net_user.h"
++#include "slip.h"
++#include "slip_proto.h"
++#include "helper.h"
++#include "os.h"
++
++void slip_user_init(void *data, void *dev)
++{
++      struct slip_data *pri = data;
++
++      pri->dev = dev;
++}
++
++static int set_up_tty(int fd)
++{
++      int i;
++      struct termios tios;
++
++      if (tcgetattr(fd, &tios) < 0) {
++              printk("could not get initial terminal attributes\n");
++              return(-1);
++      }
++
++      tios.c_cflag = CS8 | CREAD | HUPCL | CLOCAL;
++      tios.c_iflag = IGNBRK | IGNPAR;
++      tios.c_oflag = 0;
++      tios.c_lflag = 0;
++      for (i = 0; i < NCCS; i++)
++              tios.c_cc[i] = 0;
++      tios.c_cc[VMIN] = 1;
++      tios.c_cc[VTIME] = 0;
++
++      cfsetospeed(&tios, B38400);
++      cfsetispeed(&tios, B38400);
++
++      if (tcsetattr(fd, TCSAFLUSH, &tios) < 0) {
++              printk("failed to set terminal attributes\n");
++              return(-1);
++      }
++      return(0);
++}
++
++struct slip_pre_exec_data {
++      int stdin;
++      int stdout;
++      int close_me;
++};
++
++static void slip_pre_exec(void *arg)
++{
++      struct slip_pre_exec_data *data = arg;
++
++      if(data->stdin != -1) dup2(data->stdin, 0);
++      dup2(data->stdout, 1);
++      if(data->close_me != -1) close(data->close_me);
++}
++
++static int slip_tramp(char **argv, int fd)
++{
++      struct slip_pre_exec_data pe_data;
++      char *output;
++      int status, pid, fds[2], err, output_len;
++
++      err = os_pipe(fds, 1, 0);
++      if(err){
++              printk("slip_tramp : pipe failed, errno = %d\n", -err);
++              return(err);
++      }
++
++      err = 0;
++      pe_data.stdin = fd;
++      pe_data.stdout = fds[1];
++      pe_data.close_me = fds[0];
++      pid = run_helper(slip_pre_exec, &pe_data, argv, NULL);
++
++      if(pid < 0) err = pid;
++      else {
++              output_len = page_size();
++              output = um_kmalloc(output_len);
++              if(output == NULL)
++                      printk("slip_tramp : failed to allocate output "
++                             "buffer\n");
++
++              close(fds[1]);
++              read_output(fds[0], output, output_len);
++              if(output != NULL){
++                      printk("%s", output);
++                      kfree(output);
++              }
++              if(waitpid(pid, &status, 0) < 0) err = errno;
++              else if(!WIFEXITED(status) || (WEXITSTATUS(status) != 0)){
++                      printk("'%s' didn't exit with status 0\n", argv[0]);
++                      err = EINVAL;
++              }
++      }
++      return(err);
++}
++
++static int slip_open(void *data)
++{
++      struct slip_data *pri = data;
++      char version_buf[sizeof("nnnnn\0")];
++      char gate_buf[sizeof("nnn.nnn.nnn.nnn\0")];
++      char *argv[] = { "uml_net", version_buf, "slip", "up", gate_buf, 
++                       NULL };
++      int sfd, mfd, disc, sencap, err;
++
++      if((mfd = get_pty()) < 0){
++              printk("umn : Failed to open pty\n");
++              return(-1);
++      }
++      if((sfd = os_open_file(ptsname(mfd), of_rdwr(OPENFLAGS()), 0)) < 0){
++              printk("Couldn't open tty for slip line\n");
++              return(-1);
++      }
++      if(set_up_tty(sfd)) return(-1);
++      pri->slave = sfd;
++      pri->pos = 0;
++      pri->esc = 0;
++      if(pri->gate_addr != NULL){
++              sprintf(version_buf, "%d", UML_NET_VERSION);
++              strcpy(gate_buf, pri->gate_addr);
++
++              err = slip_tramp(argv, sfd);
++
++              if(err != 0){
++                      printk("slip_tramp failed - errno = %d\n", err);
++                      return(-err);
++              }
++              if(ioctl(pri->slave, SIOCGIFNAME, pri->name) < 0){
++                      printk("SIOCGIFNAME failed, errno = %d\n", errno);
++                      return(-errno);
++              }
++              iter_addresses(pri->dev, open_addr, pri->name);
++      }
++      else {
++              disc = N_SLIP;
++              if(ioctl(sfd, TIOCSETD, &disc) < 0){
++                      printk("Failed to set slip line discipline - "
++                             "errno = %d\n", errno);
++                      return(-errno);
++              }
++              sencap = 0;
++              if(ioctl(sfd, SIOCSIFENCAP, &sencap) < 0){
++                      printk("Failed to set slip encapsulation - "
++                             "errno = %d\n", errno);
++                      return(-errno);
++              }
++      }
++      return(mfd);
++}
++
++static void slip_close(int fd, void *data)
++{
++      struct slip_data *pri = data;
++      char version_buf[sizeof("nnnnn\0")];
++      char *argv[] = { "uml_net", version_buf, "slip", "down", pri->name, 
++                       NULL };
++      int err;
++
++      if(pri->gate_addr != NULL)
++              iter_addresses(pri->dev, close_addr, pri->name);
++
++      sprintf(version_buf, "%d", UML_NET_VERSION);
++
++      err = slip_tramp(argv, -1);
++
++      if(err != 0)
++              printk("slip_tramp failed - errno = %d\n", err);
++      close(fd);
++      close(pri->slave);
++      pri->slave = -1;
++}
++
++int slip_user_read(int fd, void *buf, int len, struct slip_data *pri)
++{
++      int i, n, size, start;
++
++      if(pri->more>0) {
++              i = 0;
++              while(i < pri->more) {
++                      size = slip_unesc(pri->ibuf[i++],
++                                      pri->ibuf, &pri->pos, &pri->esc);
++                      if(size){
++                              memcpy(buf, pri->ibuf, size);
++                              memmove(pri->ibuf, &pri->ibuf[i], pri->more-i);
++                              pri->more=pri->more-i; 
++                              return(size);
++                      }
++              }
++              pri->more=0;
++      }
++
++      n = net_read(fd, &pri->ibuf[pri->pos], sizeof(pri->ibuf) - pri->pos);
++      if(n <= 0) return(n);
++
++      start = pri->pos;
++      for(i = 0; i < n; i++){
++              size = slip_unesc(pri->ibuf[start + i],
++                              pri->ibuf, &pri->pos, &pri->esc);
++              if(size){
++                      memcpy(buf, pri->ibuf, size);
++                      memmove(pri->ibuf, &pri->ibuf[start+i+1], n-(i+1));
++                      pri->more=n-(i+1); 
++                      return(size);
++              }
++      }
++      return(0);
++}
++
++int slip_user_write(int fd, void *buf, int len, struct slip_data *pri)
++{
++      int actual, n;
++
++      actual = slip_esc(buf, pri->obuf, len);
++      n = net_write(fd, pri->obuf, actual);
++      if(n < 0) return(n);
++      else return(len);
++}
++
++static int slip_set_mtu(int mtu, void *data)
++{
++      return(mtu);
++}
++
++static void slip_add_addr(unsigned char *addr, unsigned char *netmask,
++                        void *data)
++{
++      struct slip_data *pri = data;
++
++      if(pri->slave == -1) return;
++      open_addr(addr, netmask, pri->name);
++}
++
++static void slip_del_addr(unsigned char *addr, unsigned char *netmask,
++                          void *data)
++{
++      struct slip_data *pri = data;
++
++      if(pri->slave == -1) return;
++      close_addr(addr, netmask, pri->name);
++}
++
++struct net_user_info slip_user_info = {
++      .init           = slip_user_init,
++      .open           = slip_open,
++      .close          = slip_close,
++      .remove         = NULL,
++      .set_mtu        = slip_set_mtu,
++      .add_address    = slip_add_addr,
++      .delete_address = slip_del_addr,
++      .max_packet     = BUF_SIZE
++};
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
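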
+diff -Naur -X ../exclude-files orig/arch/um/drivers/slirp.h um/arch/um/drivers/slirp.h
+--- orig/arch/um/drivers/slirp.h       Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/slirp.h Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,51 @@
++#ifndef __UM_SLIRP_H
++#define __UM_SLIRP_H
++
++#define BUF_SIZE 1500
++ /* two bytes each for a (pathological) max packet of escaped chars +
++  * terminating END char + initial END char */
++#define ENC_BUF_SIZE (2 * BUF_SIZE + 2)
++
++#define SLIRP_MAX_ARGS 100
++/*
++ * XXX this next definition is here because I don't understand why this
++ * initializer doesn't work in slirp_kern.c:
++ *
++ *   argv :  { init->argv[ 0 ... SLIRP_MAX_ARGS-1 ] },
++ *
++ * or why I can't typecast like this:
++ *
++ *   argv :  (char* [SLIRP_MAX_ARGS])(init->argv), 
++ */
++struct arg_list_dummy_wrapper { char *argv[SLIRP_MAX_ARGS]; };
++
++struct slirp_data {
++      void *dev;
++      struct arg_list_dummy_wrapper argw;
++      int pid;
++      int slave;
++      char ibuf[ENC_BUF_SIZE];
++      char obuf[ENC_BUF_SIZE];
++      int more; /* more data: do not read fd until ibuf has been drained */
++      int pos;
++      int esc;
++};
++
++extern struct net_user_info slirp_user_info;
++
++extern int set_umn_addr(int fd, char *addr, char *ptp_addr);
++extern int slirp_user_read(int fd, void *buf, int len, struct slirp_data *pri);
++extern int slirp_user_write(int fd, void *buf, int len, struct slirp_data *pri);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
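
The ENC_BUF_SIZE sizing above follows directly from the escaping rules in slip_proto.h: each payload byte can expand to two bytes on the wire, plus one leading and one trailing END delimiter, so a 1500-byte packet needs at most 2 * 1500 + 2 = 3002 bytes. The sketch below exercises exactly that worst case (slip_proto.h is assumed to be on the include path; the constants are restated locally so the example builds standalone).

    /* Worst case for slip_esc(): a packet made entirely of END bytes. */
    #include <stdio.h>
    #include <string.h>
    #include "slip_proto.h"

    #define BUF_SIZE     1500                  /* one MTU-sized packet       */
    #define ENC_BUF_SIZE (2 * BUF_SIZE + 2)    /* all bytes escaped + 2 ENDs */

    int main(void)
    {
            static unsigned char pkt[BUF_SIZE];
            static unsigned char wire[ENC_BUF_SIZE];
            int n;

            memset(pkt, SLIP_END, sizeof(pkt));   /* every byte needs escaping */
            n = slip_esc(pkt, wire, sizeof(pkt));
            printf("escaped %d -> %d bytes (ENC_BUF_SIZE = %d)\n",
                   (int) sizeof(pkt), n, ENC_BUF_SIZE);  /* 1500 -> 3002 (3002) */
            return 0;
    }
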
+diff -Naur -X ../exclude-files orig/arch/um/drivers/slirp_kern.c um/arch/um/drivers/slirp_kern.c
+--- orig/arch/um/drivers/slirp_kern.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/slirp_kern.c    Sun Dec 15 21:06:54 2002
+@@ -0,0 +1,132 @@
++#include "linux/kernel.h"
++#include "linux/stddef.h"
++#include "linux/init.h"
++#include "linux/netdevice.h"
++#include "linux/if_arp.h"
++#include "net_kern.h"
++#include "net_user.h"
++#include "kern.h"
++#include "slirp.h"
++
++struct slirp_init {
++      struct arg_list_dummy_wrapper argw;  /* XXX should be simpler... */
++};
++
++void slirp_init(struct net_device *dev, void *data)
++{
++      struct uml_net_private *private;
++      struct slirp_data *spri;
++      struct slirp_init *init = data;
++      int i;
++
++      private = dev->priv;
++      spri = (struct slirp_data *) private->user;
++      *spri = ((struct slirp_data)
++              { .argw         = init->argw,
++                .pid          = -1,
++                .slave        = -1,
++                .ibuf         = { '\0' },
++                .obuf         = { '\0' },
++                .pos          = 0,
++                .esc          = 0,
++                .dev          = dev });
++
++      dev->init = NULL;
++      dev->hard_header_len = 0;
++      dev->addr_len = 4;
++      dev->type = ARPHRD_ETHER;
++      dev->tx_queue_len = 256;
++      dev->flags = IFF_NOARP;
++      printk("SLIRP backend - command line:");
++      for(i=0;spri->argw.argv[i]!=NULL;i++) {
++              printk(" '%s'",spri->argw.argv[i]);
++      }
++      printk("\n");
++}
++
++static unsigned short slirp_protocol(struct sk_buff *skbuff)
++{
++      return(htons(ETH_P_IP));
++}
++
++static int slirp_read(int fd, struct sk_buff **skb, 
++                     struct uml_net_private *lp)
++{
++      return(slirp_user_read(fd, (*skb)->mac.raw, (*skb)->dev->mtu, 
++                            (struct slirp_data *) &lp->user));
++}
++
++static int slirp_write(int fd, struct sk_buff **skb,
++                    struct uml_net_private *lp)
++{
++      return(slirp_user_write(fd, (*skb)->data, (*skb)->len, 
++                             (struct slirp_data *) &lp->user));
++}
++
++struct net_kern_info slirp_kern_info = {
++      .init                   = slirp_init,
++      .protocol               = slirp_protocol,
++      .read                   = slirp_read,
++      .write                  = slirp_write,
++};
++
++static int slirp_setup(char *str, char **mac_out, void *data)
++{
++      struct slirp_init *init = data;
++      int i=0;
++
++      *init = ((struct slirp_init)
++              { argw :                { { "slirp", NULL  } } });
++
++      str = split_if_spec(str, mac_out, NULL);
++
++      if(str == NULL) { /* no command line given after MAC addr */
++              return(1);
++      }
++
++      do {
++              if(i>=SLIRP_MAX_ARGS-1) {
++                      printk("slirp_setup: truncating slirp arguments\n");
++                      break;
++              }
++              init->argw.argv[i++] = str;
++              while(*str && *str!=',') {
++                      if(*str=='_') *str=' ';
++                      str++;
++              }
++              if(*str!=',')
++                      break;
++              *str++='\0';
++      } while(1);
++      init->argw.argv[i]=NULL;
++      return(1);
++}
++
++static struct transport slirp_transport = {
++      .list           = LIST_HEAD_INIT(slirp_transport.list),
++      .name           = "slirp",
++      .setup          = slirp_setup,
++      .user           = &slirp_user_info,
++      .kern           = &slirp_kern_info,
++      .private_size   = sizeof(struct slirp_data),
++      .setup_size     = sizeof(struct slirp_init),
++};
++
++static int register_slirp(void)
++{
++      register_transport(&slirp_transport);
++      return(1);
++}
++
++__initcall(register_slirp);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
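
slirp_setup() above turns whatever follows the MAC address (peeled off by split_if_spec()) into the helper's argument vector: commas separate arguments and underscores are mapped to spaces, so multi-word arguments survive the kernel command line. Below is a standalone sketch of just that tokenizing loop, run on a hypothetical spec string.

    /* User-space copy of the argument tokenizing in slirp_setup() above.
     * The input string is hypothetical; MAC handling is omitted. */
    #include <stdio.h>

    #define SLIRP_MAX_ARGS 100

    int main(void)
    {
            char spec[] = "/usr/bin/slirp,redir_tcp_2222:22";
            char *argv[SLIRP_MAX_ARGS];
            char *str = spec;
            int i = 0;

            do {
                    if (i >= SLIRP_MAX_ARGS - 1)
                            break;
                    argv[i++] = str;
                    while (*str && *str != ',') {
                            if (*str == '_')
                                    *str = ' ';   /* underscores become spaces */
                            str++;
                    }
                    if (*str != ',')
                            break;
                    *str++ = '\0';                /* terminate this argument */
            } while (1);
            argv[i] = NULL;

            for (i = 0; argv[i] != NULL; i++)
                    printf("argv[%d] = \"%s\"\n", i, argv[i]);
            return 0;
    }

With this input the loop yields argv[0] = "/usr/bin/slirp" and argv[1] = "redir tcp 2222:22"; the real setup stores these in slirp_init.argw, which slirp_tramp() later hands to run_helper().
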
+diff -Naur -X ../exclude-files orig/arch/um/drivers/slirp_user.c um/arch/um/drivers/slirp_user.c
+--- orig/arch/um/drivers/slirp_user.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/slirp_user.c    Sun Dec 15 21:07:08 2002
+@@ -0,0 +1,202 @@
++#include <stdio.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <stddef.h>
++#include <sched.h>
++#include <string.h>
++#include <sys/fcntl.h>
++#include <sys/errno.h>
++#include <sys/wait.h>
++#include <sys/signal.h>
++#include "user_util.h"
++#include "kern_util.h"
++#include "user.h"
++#include "net_user.h"
++#include "slirp.h"
++#include "slip_proto.h"
++#include "helper.h"
++#include "os.h"
++
++void slirp_user_init(void *data, void *dev)
++{
++      struct slirp_data *pri = data;
++
++      pri->dev = dev;
++}
++
++struct slirp_pre_exec_data {
++      int stdin;
++      int stdout;
++};
++
++static void slirp_pre_exec(void *arg)
++{
++      struct slirp_pre_exec_data *data = arg;
++
++      if(data->stdin != -1) dup2(data->stdin, 0);
++      if(data->stdout != -1) dup2(data->stdout, 1);
++}
++
++static int slirp_tramp(char **argv, int fd)
++{
++      struct slirp_pre_exec_data pe_data;
++      int pid;
++
++      pe_data.stdin = fd;
++      pe_data.stdout = fd;
++      pid = run_helper(slirp_pre_exec, &pe_data, argv, NULL);
++
++      return(pid);
++}
++ 
++static int slirp_datachan(int *mfd, int *sfd)
++{
++      int fds[2], err;
++
++      err = os_pipe(fds, 1, 1);
++      if(err){
++              printk("slirp_datachan: Failed to open pipe, errno = %d\n",
++                     -err);
++              return(err);
++      }
++
++      *mfd = fds[0];
++      *sfd = fds[1];
++      return(0);
++}
++
++static int slirp_open(void *data)
++{
++      struct slirp_data *pri = data;
++      int sfd, mfd, pid, err;
++
++      err = slirp_datachan(&mfd, &sfd);
++      if(err)
++              return(err);
++
++      pid = slirp_tramp(pri->argw.argv, sfd);
++
++      if(pid < 0){
++              printk("slirp_tramp failed - errno = %d\n", pid);
++              os_close_file(sfd);     
++              os_close_file(mfd);     
++              return(pid);
++      }
++
++      pri->slave = sfd;
++      pri->pos = 0;
++      pri->esc = 0;
++
++      pri->pid = pid;
++
++      return(mfd);
++}
++
++static void slirp_close(int fd, void *data)
++{
++      struct slirp_data *pri = data;
++      int status,err;
++
++      close(fd);
++      close(pri->slave);
++
++      pri->slave = -1;
++
++      if(pri->pid<1) {
++              printk("slirp_close: no child process to shut down\n");
++              return;
++      }
++
++#if 0
++      if(kill(pri->pid, SIGHUP)<0) {
++              printk("slirp_close: sending hangup to %d failed (%d)\n",
++                      pri->pid, errno);
++      }
++#endif
++
++      err = waitpid(pri->pid, &status, WNOHANG);
++      if(err<0) {
++              printk("slirp_close: waitpid returned %d\n", errno);
++              return;
++      }
++
++      if(err==0) {
++		printk("slirp_close: process %d has not exited\n", pri->pid);
++              return;
++      }
++
++      pri->pid = -1;
++}
++
++int slirp_user_read(int fd, void *buf, int len, struct slirp_data *pri)
++{
++      int i, n, size, start;
++
++      if(pri->more>0) {
++              i = 0;
++              while(i < pri->more) {
++                      size = slip_unesc(pri->ibuf[i++],
++                                      pri->ibuf,&pri->pos,&pri->esc);
++                      if(size){
++                              memcpy(buf, pri->ibuf, size);
++                              memmove(pri->ibuf, &pri->ibuf[i], pri->more-i);
++                              pri->more=pri->more-i; 
++                              return(size);
++                      }
++              }
++              pri->more=0;
++      }
++
++      n = net_read(fd, &pri->ibuf[pri->pos], sizeof(pri->ibuf) - pri->pos);
++      if(n <= 0) return(n);
++
++      start = pri->pos;
++      for(i = 0; i < n; i++){
++              size = slip_unesc(pri->ibuf[start + i],
++                              pri->ibuf,&pri->pos,&pri->esc);
++              if(size){
++                      memcpy(buf, pri->ibuf, size);
++                      memmove(pri->ibuf, &pri->ibuf[start+i+1], n-(i+1));
++                      pri->more=n-(i+1); 
++                      return(size);
++              }
++      }
++      return(0);
++}
++
++int slirp_user_write(int fd, void *buf, int len, struct slirp_data *pri)
++{
++      int actual, n;
++
++      actual = slip_esc(buf, pri->obuf, len);
++      n = net_write(fd, pri->obuf, actual);
++      if(n < 0) return(n);
++      else return(len);
++}
++
++static int slirp_set_mtu(int mtu, void *data)
++{
++      return(mtu);
++}
++
++struct net_user_info slirp_user_info = {
++      .init           = slirp_user_init,
++      .open           = slirp_open,
++      .close          = slirp_close,
++      .remove         = NULL,
++      .set_mtu        = slirp_set_mtu,
++      .add_address    = NULL,
++      .delete_address = NULL,
++      .max_packet     = BUF_SIZE
++};
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
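
slirp_open() above builds its data channel from os_pipe(fds, 1, 1) and run_helper(): the slave end becomes the helper's stdin and stdout, and the master end is returned to the network layer as the device fd. The sketch below mimics that wiring in plain user space; it assumes os_pipe() behaves roughly like a stream socketpair (an assumption, not something this patch states) and uses "cat" as a stand-in for the slirp binary.

    /* Hypothetical user-space analogue of slirp_datachan()/slirp_tramp(). */
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <sys/wait.h>

    int main(void)
    {
            int fds[2], pid, n;
            char buf[64];

            if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) < 0)
                    return 1;

            pid = fork();
            if (pid < 0)
                    return 1;
            if (pid == 0) {                    /* child: the "helper" */
                    dup2(fds[1], 0);           /* slave end -> stdin  */
                    dup2(fds[1], 1);           /* slave end -> stdout */
                    close(fds[0]);
                    execlp("cat", "cat", (char *) NULL);
                    _exit(127);
            }

            close(fds[1]);                     /* parent keeps the master end */
            if (write(fds[0], "ping\n", 5) != 5)
                    return 1;
            n = read(fds[0], buf, sizeof(buf) - 1);
            if (n > 0) {
                    buf[n] = '\0';
                    printf("helper echoed: %s", buf);
            }
            close(fds[0]);                     /* EOF lets the helper exit */
            waitpid(pid, NULL, 0);
            return 0;
    }
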
+diff -Naur -X ../exclude-files orig/arch/um/drivers/ssl.c um/arch/um/drivers/ssl.c
+--- orig/arch/um/drivers/ssl.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/ssl.c   Thu Mar  6 18:55:01 2003
+@@ -0,0 +1,265 @@
++/* 
++ * Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/config.h"
++#include "linux/fs.h"
++#include "linux/tty.h"
++#include "linux/tty_driver.h"
++#include "linux/major.h"
++#include "linux/mm.h"
++#include "linux/init.h"
++#include "asm/termbits.h"
++#include "asm/irq.h"
++#include "line.h"
++#include "ssl.h"
++#include "chan_kern.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "kern.h"
++#include "init.h"
++#include "irq_user.h"
++#include "mconsole_kern.h"
++#include "2_5compat.h"
++
++static int ssl_version = 1;
++
++/* Referenced only by tty_driver below - presumably it's locked correctly
++ * by the tty driver.
++ */
++static int ssl_refcount = 0;
++
++static struct tty_driver ssl_driver;
++
++#define NR_PORTS 64
++
++void ssl_announce(char *dev_name, int dev)
++{
++      printk(KERN_INFO "Serial line %d assigned device '%s'\n", dev,
++             dev_name);
++}
++
++static struct chan_opts opts = {
++      .announce       = ssl_announce,
++      .xterm_title    = "Serial Line #%d",
++      .raw            = 1,
++      .tramp_stack    = 0,
++      .in_kernel      = 1,
++};
++
++static int ssl_config(char *str);
++static int ssl_get_config(char *dev, char *str, int size, char **error_out);
++static int ssl_remove(char *str);
++
++static struct line_driver driver = {
++      .name                   = "UML serial line",
++      .devfs_name             = "tts/%d",
++      .major                  = TTY_MAJOR,
++      .minor_start            = 64,
++      .type                   = TTY_DRIVER_TYPE_SERIAL,
++      .subtype                = 0,
++      .read_irq               = SSL_IRQ,
++      .read_irq_name          = "ssl",
++      .write_irq              = SSL_WRITE_IRQ,
++      .write_irq_name         = "ssl-write",
++      .symlink_from           = "serial",
++      .symlink_to             = "tts",
++      .mc  = {
++              .name           = "ssl",
++              .config         = ssl_config,
++              .get_config     = ssl_get_config,
++              .remove         = ssl_remove,
++      },
++};
++
++/* The array is initialized by line_init, which is an initcall.  The 
++ * individual elements are protected by individual semaphores.
++ */
++static struct line serial_lines[NR_PORTS] =
++      { [0 ... NR_PORTS - 1] = LINE_INIT(CONFIG_SSL_CHAN, &driver) };
++
++static struct lines lines = LINES_INIT(NR_PORTS);
++
++static int ssl_config(char *str)
++{
++      return(line_config(serial_lines, 
++                         sizeof(serial_lines)/sizeof(serial_lines[0]), str));
++}
++
++static int ssl_get_config(char *dev, char *str, int size, char **error_out)
++{
++      return(line_get_config(dev, serial_lines, 
++                             sizeof(serial_lines)/sizeof(serial_lines[0]), 
++                             str, size, error_out));
++}
++
++static int ssl_remove(char *str)
++{
++      return(line_remove(serial_lines, 
++                         sizeof(serial_lines)/sizeof(serial_lines[0]), str));
++}
++
++int ssl_open(struct tty_struct *tty, struct file *filp)
++{
++      return(line_open(serial_lines, tty, &opts));
++}
++
++static void ssl_close(struct tty_struct *tty, struct file * filp)
++{
++      line_close(serial_lines, tty);
++}
++
++static int ssl_write(struct tty_struct * tty, int from_user,
++                   const unsigned char *buf, int count)
++{
++      return(line_write(serial_lines, tty, from_user, buf, count));
++}
++
++static void ssl_put_char(struct tty_struct *tty, unsigned char ch)
++{
++      line_write(serial_lines, tty, 0, &ch, sizeof(ch));
++}
++
++static void ssl_flush_chars(struct tty_struct *tty)
++{
++      return;
++}
++
++static int ssl_chars_in_buffer(struct tty_struct *tty)
++{
++      return(0);
++}
++
++static void ssl_flush_buffer(struct tty_struct *tty)
++{
++      return;
++}
++
++static int ssl_ioctl(struct tty_struct *tty, struct file * file,
++                   unsigned int cmd, unsigned long arg)
++{
++      int ret;
++
++      ret = 0;
++      switch(cmd){
++      case TCGETS:
++      case TCSETS:
++      case TCFLSH:
++      case TCSETSF:
++      case TCSETSW:
++      case TCGETA:
++      case TIOCMGET:
++              ret = -ENOIOCTLCMD;
++              break;
++      default:
++              printk(KERN_ERR 
++                     "Unimplemented ioctl in ssl_ioctl : 0x%x\n", cmd);
++              ret = -ENOIOCTLCMD;
++              break;
++      }
++      return(ret);
++}
++
++static void ssl_throttle(struct tty_struct * tty)
++{
++      printk(KERN_ERR "Someone should implement ssl_throttle\n");
++}
++
++static void ssl_unthrottle(struct tty_struct * tty)
++{
++      printk(KERN_ERR "Someone should implement ssl_unthrottle\n");
++}
++
++static void ssl_set_termios(struct tty_struct *tty, 
++                          struct termios *old_termios)
++{
++}
++
++static void ssl_stop(struct tty_struct *tty)
++{
++      printk(KERN_ERR "Someone should implement ssl_stop\n");
++}
++
++static void ssl_start(struct tty_struct *tty)
++{
++      printk(KERN_ERR "Someone should implement ssl_start\n");
++}
++
++void ssl_hangup(struct tty_struct *tty)
++{
++}
++
++static struct tty_driver ssl_driver = {
++      .refcount               = &ssl_refcount,
++      .open                   = ssl_open,
++      .close                  = ssl_close,
++      .write                  = ssl_write,
++      .put_char               = ssl_put_char,
++      .flush_chars            = ssl_flush_chars,
++      .chars_in_buffer        = ssl_chars_in_buffer,
++      .flush_buffer           = ssl_flush_buffer,
++      .ioctl                  = ssl_ioctl,
++      .throttle               = ssl_throttle,
++      .unthrottle             = ssl_unthrottle,
++      .set_termios            = ssl_set_termios,
++      .stop                   = ssl_stop,
++      .start                  = ssl_start,
++      .hangup                 = ssl_hangup
++};
++
++/* Changed by ssl_init and referenced by ssl_exit, which are both serialized
++ * by being an initcall and exitcall, respectively.
++ */
++static int ssl_init_done = 0;
++
++int ssl_init(void)
++{
++      char *new_title;
++
++      printk(KERN_INFO "Initializing software serial port version %d\n", 
++             ssl_version);
++
++      line_register_devfs(&lines, &driver, &ssl_driver, serial_lines, 
++                          sizeof(serial_lines)/sizeof(serial_lines[0]));
++
++      lines_init(serial_lines, sizeof(serial_lines)/sizeof(serial_lines[0]));
++
++      new_title = add_xterm_umid(opts.xterm_title);
++      if(new_title != NULL) opts.xterm_title = new_title;
++
++      ssl_init_done = 1;
++      return(0);
++}
++
++__initcall(ssl_init);
++
++static int ssl_chan_setup(char *str)
++{
++      line_setup(serial_lines, sizeof(serial_lines)/sizeof(serial_lines[0]),
++                 str, 1);
++      return(1);
++}
++
++__setup("ssl", ssl_chan_setup);
++__channel_help(ssl_chan_setup, "ssl");
++
++static void ssl_exit(void)
++{
++      if(!ssl_init_done) return;
++      close_lines(serial_lines, 
++                  sizeof(serial_lines)/sizeof(serial_lines[0]));
++}
++
++__uml_exitcall(ssl_exit);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/ssl.h um/arch/um/drivers/ssl.h
+--- orig/arch/um/drivers/ssl.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/ssl.h   Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,23 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SSL_H__
++#define __SSL_H__
++
++extern int ssl_read(int fd, int line);
++extern void ssl_receive_char(int line, char ch);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/stdio_console.c um/arch/um/drivers/stdio_console.c
+--- orig/arch/um/drivers/stdio_console.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/stdio_console.c Sun Dec 15 21:08:20 2002
+@@ -0,0 +1,250 @@
++/* 
++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/config.h"
++#include "linux/posix_types.h"
++#include "linux/tty.h"
++#include "linux/tty_flip.h"
++#include "linux/types.h"
++#include "linux/major.h"
++#include "linux/kdev_t.h"
++#include "linux/console.h"
++#include "linux/string.h"
++#include "linux/sched.h"
++#include "linux/list.h"
++#include "linux/init.h"
++#include "linux/interrupt.h"
++#include "linux/slab.h"
++#include "asm/current.h"
++#include "asm/softirq.h"
++#include "asm/hardirq.h"
++#include "asm/irq.h"
++#include "stdio_console.h"
++#include "line.h"
++#include "chan_kern.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "irq_user.h"
++#include "mconsole_kern.h"
++#include "init.h"
++#include "2_5compat.h"
++
++#define MAX_TTYS (8)
++
++/* Referenced only by tty_driver below - presumably it's locked correctly
++ * by the tty driver.
++ */
++
++static struct tty_driver console_driver;
++
++static int console_refcount = 0;
++
++static struct chan_ops init_console_ops = {
++      .type           = "you shouldn't see this",
++      .init           = NULL,
++      .open           = NULL,
++      .close          = NULL,
++      .read           = NULL,
++      .write          = NULL,
++      .console_write  = generic_write,
++      .window_size    = NULL,
++      .free           = NULL,
++      .winch          = 0,
++};
++
++static struct chan init_console_chan = {
++      .list           = { },
++      .primary        = 1,
++      .input          = 0,
++      .output         = 1,
++      .opened         = 1,
++      .fd             = 1,
++      .pri            = INIT_STATIC,
++      .ops            = &init_console_ops,
++      .data           = NULL
++};
++
++void stdio_announce(char *dev_name, int dev)
++{
++      printk(KERN_INFO "Virtual console %d assigned device '%s'\n", dev,
++             dev_name);
++}
++
++static struct chan_opts opts = {
++      .announce       = stdio_announce,
++      .xterm_title    = "Virtual Console #%d",
++      .raw            = 1,
++      .tramp_stack    = 0,
++      .in_kernel      = 1,
++};
++
++static int con_config(char *str);
++static int con_get_config(char *dev, char *str, int size, char **error_out);
++static int con_remove(char *str);
++
++static struct line_driver driver = {
++      .name                   = "UML console",
++      .devfs_name             = "vc/%d",
++      .major                  = TTY_MAJOR,
++      .minor_start            = 0,
++      .type                   = TTY_DRIVER_TYPE_CONSOLE,
++      .subtype                = SYSTEM_TYPE_CONSOLE,
++      .read_irq               = CONSOLE_IRQ,
++      .read_irq_name          = "console",
++      .write_irq              = CONSOLE_WRITE_IRQ,
++      .write_irq_name         = "console-write",
++      .symlink_from           = "ttys",
++      .symlink_to             = "vc",
++      .mc  = {
++              .name           = "con",
++              .config         = con_config,
++              .get_config     = con_get_config,
++              .remove         = con_remove,
++      },
++};
++
++static struct lines console_lines = LINES_INIT(MAX_TTYS);
++
++/* The array is initialized by line_init, which is an initcall.  The 
++ * individual elements are protected by individual semaphores.
++ */
++struct line vts[MAX_TTYS] = { LINE_INIT(CONFIG_CON_ZERO_CHAN, &driver),
++                            [ 1 ... MAX_TTYS - 1 ] = 
++                            LINE_INIT(CONFIG_CON_CHAN, &driver) };
++
++static int con_config(char *str)
++{
++      return(line_config(vts, sizeof(vts)/sizeof(vts[0]), str));
++}
++
++static int con_get_config(char *dev, char *str, int size, char **error_out)
++{
++      return(line_get_config(dev, vts, sizeof(vts)/sizeof(vts[0]), str, 
++                             size, error_out));
++}
++
++static int con_remove(char *str)
++{
++      return(line_remove(vts, sizeof(vts)/sizeof(vts[0]), str));
++}
++
++static int open_console(struct tty_struct *tty)
++{
++      return(line_open(vts, tty, &opts));
++}
++
++static int con_open(struct tty_struct *tty, struct file *filp)
++{
++      return(open_console(tty));
++}
++
++static void con_close(struct tty_struct *tty, struct file *filp)
++{
++      line_close(vts, tty);
++}
++
++static int con_write(struct tty_struct *tty, int from_user, 
++                   const unsigned char *buf, int count)
++{
++       return(line_write(vts, tty, from_user, buf, count));
++}
++
++static void set_termios(struct tty_struct *tty, struct termios * old)
++{
++}
++
++static int chars_in_buffer(struct tty_struct *tty)
++{
++      return(0);
++}
++
++static int con_init_done = 0;
++
++int stdio_init(void)
++{
++      char *new_title;
++
++      printk(KERN_INFO "Initializing stdio console driver\n");
++
++      line_register_devfs(&console_lines, &driver, &console_driver, vts, 
++                          sizeof(vts)/sizeof(vts[0]));
++
++      lines_init(vts, sizeof(vts)/sizeof(vts[0]));
++
++      new_title = add_xterm_umid(opts.xterm_title);
++      if(new_title != NULL) opts.xterm_title = new_title;
++
++      open_console(NULL);
++      con_init_done = 1;
++      return(0);
++}
++
++__initcall(stdio_init);
++
++static void console_write(struct console *console, const char *string, 
++                        unsigned len)
++{
++      if(con_init_done) down(&vts[console->index].sem);
++      console_write_chan(&vts[console->index].chan_list, string, len);
++      if(con_init_done) up(&vts[console->index].sem);
++}
++
++static struct tty_driver console_driver = {
++      .refcount               = &console_refcount,
++      .open                   = con_open,
++      .close                  = con_close,
++      .write                  = con_write,
++      .chars_in_buffer        = chars_in_buffer,
++      .set_termios            = set_termios
++};
++
++static kdev_t console_device(struct console *c)
++{
++      return mk_kdev(TTY_MAJOR, c->index);
++}
++
++static int console_setup(struct console *co, char *options)
++{
++      return(0);
++}
++
++static struct console stdiocons = INIT_CONSOLE("tty", console_write, 
++                                             console_device, console_setup,
++                                             CON_PRINTBUFFER);
++
++void stdio_console_init(void)
++{
++      INIT_LIST_HEAD(&vts[0].chan_list);
++      list_add(&init_console_chan.list, &vts[0].chan_list);
++      register_console(&stdiocons);
++}
++
++static int console_chan_setup(char *str)
++{
++      line_setup(vts, sizeof(vts)/sizeof(vts[0]), str, 1);
++      return(1);
++}
++
++__setup("con", console_chan_setup);
++__channel_help(console_chan_setup, "con");
++
++static void console_exit(void)
++{
++      if(!con_init_done) return;
++      close_lines(vts, sizeof(vts)/sizeof(vts[0]));
++}
++
++__uml_exitcall(console_exit);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/stdio_console.h um/arch/um/drivers/stdio_console.h
+--- orig/arch/um/drivers/stdio_console.h       Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/stdio_console.h Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,21 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __STDIO_CONSOLE_H
++#define __STDIO_CONSOLE_H
++
++extern void save_console_flags(void);
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/tty.c um/arch/um/drivers/tty.c
+--- orig/arch/um/drivers/tty.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/tty.c   Sun Dec 15 21:08:41 2002
+@@ -0,0 +1,86 @@
++/* 
++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <termios.h>
++#include <fcntl.h>
++#include <errno.h>
++#include <unistd.h>
++#include "chan_user.h"
++#include "user_util.h"
++#include "user.h"
++#include "os.h"
++
++struct tty_chan {
++      char *dev;
++      int raw;
++      struct termios tt;
++};
++
++void *tty_chan_init(char *str, int device, struct chan_opts *opts)
++{
++      struct tty_chan *data;
++
++      if(*str != ':'){
++              printk("tty_init : channel type 'tty' must specify "
++                     "a device\n");
++              return(NULL);
++      }
++      str++;
++
++      if((data = um_kmalloc(sizeof(*data))) == NULL) 
++              return(NULL);
++      *data = ((struct tty_chan) { .dev       = str,
++                                   .raw       = opts->raw });
++                                   
++      return(data);
++}
++
++int tty_open(int input, int output, int primary, void *d, char **dev_out)
++{
++      struct tty_chan *data = d;
++      int fd;
++
++      fd = os_open_file(data->dev, of_set_rw(OPENFLAGS(), input, output), 0);
++      if(fd < 0) return(fd);
++      if(data->raw){
++              tcgetattr(fd, &data->tt);
++              raw(fd, 0);
++      }
++
++      *dev_out = data->dev;
++      return(fd);
++}
++
++int tty_console_write(int fd, const char *buf, int n, void *d)
++{
++      struct tty_chan *data = d;
++
++      return(generic_console_write(fd, buf, n, &data->tt));
++}
++
++struct chan_ops tty_ops = {
++      .type           = "tty",
++      .init           = tty_chan_init,
++      .open           = tty_open,
++      .close          = generic_close,
++      .read           = generic_read,
++      .write          = generic_write,
++      .console_write  = tty_console_write,
++      .window_size    = generic_window_size,
++      .free           = generic_free,
++      .winch          = 0,
++};
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
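
tty_chan_init() above only accepts a specification of the form "tty:<device>", attaching a console or serial-line channel to an existing host tty; tty_open() then opens that device with os_open_file() and, when opts->raw is set, saves the old termios and switches the fd to raw mode. For illustration only (the device paths here are assumptions, not taken from this patch), such channels are normally selected through the "con" and "ssl" command-line options handled elsewhere in this series, e.g.:

    con1=tty:/dev/tty5
    ssl0=tty:/dev/ttyS0
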
+diff -Naur -X ../exclude-files orig/arch/um/drivers/ubd_kern.c um/arch/um/drivers/ubd_kern.c
+--- orig/arch/um/drivers/ubd_kern.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/ubd_kern.c      Tue Mar 11 15:46:36 2003
+@@ -0,0 +1,1067 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++/* 2001-09-28...2002-04-17
++ * Partition stuff by James_McMechan@hotmail.com
++ * old style ubd by setting UBD_SHIFT to 0
++ */
++
++#define MAJOR_NR UBD_MAJOR
++#define UBD_SHIFT 4
++
++#include "linux/config.h"
++#include "linux/blk.h"
++#include "linux/blkdev.h"
++#include "linux/hdreg.h"
++#include "linux/init.h"
++#include "linux/devfs_fs_kernel.h"
++#include "linux/cdrom.h"
++#include "linux/proc_fs.h"
++#include "linux/ctype.h"
++#include "linux/capability.h"
++#include "linux/mm.h"
++#include "linux/vmalloc.h"
++#include "linux/blkpg.h"
++#include "linux/genhd.h"
++#include "linux/spinlock.h"
++#include "asm/segment.h"
++#include "asm/uaccess.h"
++#include "asm/irq.h"
++#include "asm/types.h"
++#include "user_util.h"
++#include "mem_user.h"
++#include "kern_util.h"
++#include "kern.h"
++#include "mconsole_kern.h"
++#include "init.h"
++#include "irq_user.h"
++#include "ubd_user.h"
++#include "2_5compat.h"
++#include "os.h"
++
++static int ubd_open(struct inode * inode, struct file * filp);
++static int ubd_release(struct inode * inode, struct file * file);
++static int ubd_ioctl(struct inode * inode, struct file * file,
++                   unsigned int cmd, unsigned long arg);
++static int ubd_revalidate(kdev_t rdev);
++static int ubd_revalidate1(kdev_t rdev);
++
++#define MAX_DEV (8)
++#define MAX_MINOR (MAX_DEV << UBD_SHIFT)
++
++/* Not modified by this driver */
++static int blk_sizes[MAX_MINOR] = { [ 0 ... MAX_MINOR - 1 ] = BLOCK_SIZE };
++static int hardsect_sizes[MAX_MINOR] = { [ 0 ... MAX_MINOR - 1 ] = 512 };
++
++/* Protected by ubd_lock */
++static int sizes[MAX_MINOR] = { [ 0 ... MAX_MINOR - 1 ] = 0 };
++
++static struct block_device_operations ubd_blops = {
++        .open         = ubd_open,
++        .release      = ubd_release,
++        .ioctl                = ubd_ioctl,
++        .revalidate   = ubd_revalidate,
++};
++
++/* Protected by ubd_lock, except in prepare_request and ubd_ioctl because 
++ * the block layer should ensure that the device is idle before closing it.
++ */
++static struct hd_struct       ubd_part[MAX_MINOR] =
++      { [ 0 ... MAX_MINOR - 1 ] = { 0, 0, 0 } };
++
++/* Protected by io_request_lock */
++static request_queue_t *ubd_queue;
++
++/* Protected by ubd_lock */
++static int fake_major = MAJOR_NR;
++
++static spinlock_t ubd_lock = SPIN_LOCK_UNLOCKED;
++
++#define INIT_GENDISK(maj, name, parts, shift, bsizes, max, blops) \
++{ \
++      .major          = maj, \
++      .major_name     = name, \
++      .minor_shift    = shift, \
++      .max_p          = 1 << shift, \
++      .part           = parts, \
++      .sizes          = bsizes, \
++      .nr_real        = max, \
++      .real_devices   = NULL, \
++      .next           = NULL, \
++      .fops           = blops, \
++      .de_arr         = NULL, \
++      .flags          = 0 \
++}
++
++static struct gendisk ubd_gendisk = INIT_GENDISK(MAJOR_NR, "ubd", ubd_part,
++                                               UBD_SHIFT, sizes, MAX_DEV, 
++                                               &ubd_blops);
++static struct gendisk fake_gendisk = INIT_GENDISK(0, "ubd", ubd_part, 
++                                                UBD_SHIFT, sizes, MAX_DEV, 
++                                                &ubd_blops);
++
++#ifdef CONFIG_BLK_DEV_UBD_SYNC
++#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \
++                                       .cl = 1 })
++#else
++#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 0, .c = 0, \
++                                       .cl = 1 })
++#endif
++
++/* Not protected - changed only in ubd_setup_common and then only to
++ * to enable O_SYNC.
++ */
++static struct openflags global_openflags = OPEN_FLAGS;
++
++struct cow {
++      char *file;
++      int fd;
++      unsigned long *bitmap;
++      unsigned long bitmap_len;
++      int bitmap_offset;
++        int data_offset;
++};
++
++struct ubd {
++      char *file;
++      int count;
++      int fd;
++      __u64 size;
++      struct openflags boot_openflags;
++      struct openflags openflags;
++      devfs_handle_t devfs;
++      struct cow cow;
++};
++
++#define DEFAULT_COW { \
++      .file                   = NULL, \
++        .fd                   = -1, \
++        .bitmap                       = NULL, \
++      .bitmap_offset          = 0, \
++        .data_offset          = 0, \
++}
++
++#define DEFAULT_UBD { \
++      .file                   = NULL, \
++      .count                  = 0, \
++      .fd                     = -1, \
++      .size                   = -1, \
++      .boot_openflags         = OPEN_FLAGS, \
++      .openflags              = OPEN_FLAGS, \
++      .devfs                  = NULL, \
++        .cow                  = DEFAULT_COW, \
++}
++
++struct ubd ubd_dev[MAX_DEV] = { [ 0 ... MAX_DEV - 1 ] = DEFAULT_UBD };
++
++static int ubd0_init(void)
++{
++      struct ubd *dev = &ubd_dev[0];
++
++      if(dev->file == NULL)
++              dev->file = "root_fs";
++      return(0);
++}
++
++__initcall(ubd0_init);
++
++/* Only changed by fake_ide_setup which is a setup */
++static int fake_ide = 0;
++static struct proc_dir_entry *proc_ide_root = NULL;
++static struct proc_dir_entry *proc_ide = NULL;
++
++static void make_proc_ide(void)
++{
++      proc_ide_root = proc_mkdir("ide", 0);
++      proc_ide = proc_mkdir("ide0", proc_ide_root);
++}
++
++static int proc_ide_read_media(char *page, char **start, off_t off, int count,
++                             int *eof, void *data)
++{
++      int len;
++
++      strcpy(page, "disk\n");
++      len = strlen("disk\n");
++      len -= off;
++      if (len < count){
++              *eof = 1;
++              if (len <= 0) return 0;
++      }
++      else len = count;
++      *start = page + off;
++      return len;
++}
++
++static void make_ide_entries(char *dev_name)
++{
++      struct proc_dir_entry *dir, *ent;
++      char name[64];
++
++      if(!fake_ide) return;
++
++      /* Without locking this could race if a UML was booted with no 
++       * disks and then two mconsole requests which add disks came in 
++       * at the same time.
++       */
++      spin_lock(&ubd_lock);
++      if(proc_ide_root == NULL) make_proc_ide();
++      spin_unlock(&ubd_lock);
++
++      dir = proc_mkdir(dev_name, proc_ide);
++      if(!dir) return;
++
++      ent = create_proc_entry("media", S_IFREG|S_IRUGO, dir);
++      if(!ent) return;
++      ent->nlink = 1;
++      ent->data = NULL;
++      ent->read_proc = proc_ide_read_media;
++      ent->write_proc = NULL;
++      sprintf(name,"ide0/%s", dev_name);
++      proc_symlink(dev_name, proc_ide_root, name);
++}
++
++static int fake_ide_setup(char *str)
++{
++      fake_ide = 1;
++      return(1);
++}
++
++__setup("fake_ide", fake_ide_setup);
++
++__uml_help(fake_ide_setup,
++"fake_ide\n"
++"    Create ide0 entries that map onto ubd devices.\n\n"
++);
++
++static int parse_unit(char **ptr)
++{
++      char *str = *ptr, *end;
++      int n = -1;
++
++      if(isdigit(*str)) {
++              n = simple_strtoul(str, &end, 0);
++              if(end == str)
++                      return(-1);
++              *ptr = end;
++      }
++      else if (('a' <= *str) && (*str <= 'h')) {
++              n = *str - 'a';
++              str++;
++              *ptr = str;
++      }
++      return(n);
++}
++
++static int ubd_setup_common(char *str, int *index_out)
++{
++      struct openflags flags = global_openflags;
++      struct ubd *dev;
++      char *backing_file;
++      int n, err;
++
++      if(index_out) *index_out = -1;
++      n = *str;
++      if(n == '='){
++              char *end;
++              int major;
++
++              str++;
++              if(!strcmp(str, "sync")){
++                      global_openflags.s = 1;
++                      return(0);
++              }
++              major = simple_strtoul(str, &end, 0);
++              if((*end != '\0') || (end == str)){
++                      printk(KERN_ERR 
++                             "ubd_setup : didn't parse major number\n");
++                      return(1);
++              }
++
++              err = 1;
++              spin_lock(&ubd_lock);
++              if(fake_major != MAJOR_NR){
++                      printk(KERN_ERR "Can't assign a fake major twice\n");
++                      goto out1;
++              }
++
++              fake_gendisk.major = major;
++              fake_major = major;
++      
++              printk(KERN_INFO "Setting extra ubd major number to %d\n",
++                     major);
++              err = 0;
++      out1:
++              spin_unlock(&ubd_lock);
++              return(err);
++      }
++
++      n = parse_unit(&str);
++      if(n < 0){
++              printk(KERN_ERR "ubd_setup : couldn't parse unit number "
++                     "'%s'\n", str);
++              return(1);
++      }
++
++      if(n >= MAX_DEV){
++              printk(KERN_ERR "ubd_setup : index %d out of range "
++                     "(%d devices)\n", n, MAX_DEV);   
++              return(1);
++      }
++
++      err = 1;
++      spin_lock(&ubd_lock);
++
++      dev = &ubd_dev[n];
++      if(dev->file != NULL){
++              printk(KERN_ERR "ubd_setup : device already configured\n");
++              goto out2;
++      }
++
++      if(index_out) *index_out = n;
++
++      if (*str == 'r'){
++              flags.w = 0;
++              str++;
++      }
++      if (*str == 's'){
++              flags.s = 1;
++              str++;
++      }
++      if(*str++ != '='){
++              printk(KERN_ERR "ubd_setup : Expected '='\n");
++              goto out2;
++      }
++
++      err = 0;
++      backing_file = strchr(str, ',');
++      if(backing_file){
++              *backing_file = '\0';
++              backing_file++;
++      }
++      dev->file = str;
++      dev->cow.file = backing_file;
++      dev->boot_openflags = flags;
++ out2:
++      spin_unlock(&ubd_lock);
++      return(err);
++}
++
++static int ubd_setup(char *str)
++{
++      ubd_setup_common(str, NULL);
++      return(1);
++}
++
++__setup("ubd", ubd_setup);
++__uml_help(ubd_setup,
++"ubd<n>=<filename>\n"
++"    This is used to associate a device with a file in the underlying\n"
++"    filesystem. Usually, there is a filesystem in the file, but \n"
++"    that's not required. Swap devices containing swap files can be\n"
++"    specified like this. Also, a file which doesn't contain a\n"
++"    filesystem can have its contents read in the virtual \n"
++"    machine by running dd on the device. n must be in the range\n"
++"    0 to 7. Appending an 'r' to the number will cause that device\n"
++"    to be mounted read-only. For example ubd1r=./ext_fs. Appending\n"
++"    an 's' (has to be _after_ 'r', if there is one) will cause data\n"
++"    to be written to disk on the host immediately.\n\n"
++);
++
++static int fakehd(char *str)
++{
++      printk(KERN_INFO 
++             "fakehd : Changing ubd_gendisk.major_name to \"hd\".\n");
++      ubd_gendisk.major_name = "hd";
++      return(1);
++}
++
++__setup("fakehd", fakehd);
++__uml_help(fakehd,
++"fakehd\n"
++"    Change the ubd device name to \"hd\".\n\n"
++);
++
++static void do_ubd_request(request_queue_t * q);
++
++/* Only changed by ubd_init, which is an initcall. */
++int thread_fd = -1;
++
++/* Changed by ubd_handler, which is serialized because interrupts only
++ * happen on CPU 0.
++ */
++int intr_count = 0;
++
++static void ubd_finish(int error)
++{
++      int nsect;
++
++      if(error){
++              end_request(0);
++              return;
++      }
++      nsect = CURRENT->current_nr_sectors;
++      CURRENT->sector += nsect;
++      CURRENT->buffer += nsect << 9;
++      CURRENT->errors = 0;
++      CURRENT->nr_sectors -= nsect;
++      CURRENT->current_nr_sectors = 0;
++      end_request(1);
++}
++
++static void ubd_handler(void)
++{
++      struct io_thread_req req;
++      int n;
++
++      DEVICE_INTR = NULL;
++      intr_count++;
++      n = read_ubd_fs(thread_fd, &req, sizeof(req));
++      if(n != sizeof(req)){
++              printk(KERN_ERR "Pid %d - spurious interrupt in ubd_handler, "
++                     "errno = %d\n", os_getpid(), -n);
++              spin_lock(&io_request_lock);
++              end_request(0);
++              spin_unlock(&io_request_lock);
++              return;
++      }
++        
++        if((req.offset != ((__u64) (CURRENT->sector)) << 9) ||
++         (req.length != (CURRENT->current_nr_sectors) << 9))
++              panic("I/O op mismatch");
++      
++      spin_lock(&io_request_lock);
++      ubd_finish(req.error);
++      reactivate_fd(thread_fd, UBD_IRQ);      
++      do_ubd_request(ubd_queue);
++      spin_unlock(&io_request_lock);
++}
++
++static void ubd_intr(int irq, void *dev, struct pt_regs *unused)
++{
++      ubd_handler();
++}
++
++/* Only changed by ubd_init, which is an initcall. */
++static int io_pid = -1;
++
++void kill_io_thread(void)
++{
++      if(io_pid != -1)
++              os_kill_process(io_pid, 1);
++}
++
++__uml_exitcall(kill_io_thread);
++
++/* Initialized in an initcall, and unchanged thereafter */
++devfs_handle_t ubd_dir_handle;
++
++static int ubd_add(int n)
++{
++      struct ubd *dev = &ubd_dev[n];
++      char name[sizeof("nnnnnn\0")], dev_name[sizeof("ubd0x")];
++      int err = -EISDIR;
++
++      if(dev->file == NULL)
++              goto out;
++
++      err = ubd_revalidate1(MKDEV(MAJOR_NR, n << UBD_SHIFT));
++      if(err)
++              goto out;
++
++      sprintf(name, "%d", n);
++      dev->devfs = devfs_register(ubd_dir_handle, name, DEVFS_FL_REMOVABLE,
++                                  MAJOR_NR, n << UBD_SHIFT, S_IFBLK | 
++                                  S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP,
++                                  &ubd_blops, NULL);
++
++      if(!strcmp(ubd_gendisk.major_name, "ubd"))
++              sprintf(dev_name, "%s%d", ubd_gendisk.major_name, n);
++      else sprintf(dev_name, "%s%c", ubd_gendisk.major_name, 
++                   n + 'a');
++
++      make_ide_entries(dev_name);
++      return(0);
++
++ out:
++      return(err);
++}
++
++static int ubd_config(char *str)
++{
++      int n, err;
++
++      str = uml_strdup(str);
++      if(str == NULL){
++              printk(KERN_ERR "ubd_config failed to strdup string\n");
++              return(1);
++      }
++      err = ubd_setup_common(str, &n);
++      if(err){
++              kfree(str);
++              return(-1);
++      }
++      if(n == -1) return(0);
++
++      spin_lock(&ubd_lock);
++      err = ubd_add(n);
++      if(err)
++              ubd_dev[n].file = NULL;
++      spin_unlock(&ubd_lock);
++
++      return(err);
++}
++
++static int ubd_get_config(char *name, char *str, int size, char **error_out)
++{
++      struct ubd *dev;
++      char *end;
++      int n, len = 0;
++
++      n = simple_strtoul(name, &end, 0);
++      if((*end != '\0') || (end == name)){
++              *error_out = "ubd_get_config : didn't parse device number";
++              return(-1);
++      }
++
++      if((n >= MAX_DEV) || (n < 0)){
++              *error_out = "ubd_get_config : device number out of range";
++              return(-1);
++      }
++
++      dev = &ubd_dev[n];
++      spin_lock(&ubd_lock);
++
++      if(dev->file == NULL){
++              CONFIG_CHUNK(str, size, len, "", 1);
++              goto out;
++      }
++
++      CONFIG_CHUNK(str, size, len, dev->file, 0);
++
++      if(dev->cow.file != NULL){
++              CONFIG_CHUNK(str, size, len, ",", 0);
++              CONFIG_CHUNK(str, size, len, dev->cow.file, 1);
++      }
++      else CONFIG_CHUNK(str, size, len, "", 1);
++
++ out:
++      spin_unlock(&ubd_lock);
++      return(len);
++}
++
++static int ubd_remove(char *str)
++{
++      struct ubd *dev;
++      int n, err = -ENODEV;
++
++      if(isdigit(*str)){
++              char *end;
++              n = simple_strtoul(str, &end, 0);
++              if ((*end != '\0') || (end == str)) 
++                      return(err);
++      }
++      else if (('a' <= *str) && (*str <= 'h'))
++              n = *str - 'a';
++      else
++              return(err);    /* it should be a number 0-7/a-h */
++
++      if((n < 0) || (n >= MAX_DEV))
++              return(err);
++
++      dev = &ubd_dev[n];
++
++      spin_lock(&ubd_lock);
++      err = 0;
++      if(dev->file == NULL)
++              goto out;
++      err = -1;
++      if(dev->count > 0)
++              goto out;
++      if(dev->devfs != NULL) 
++              devfs_unregister(dev->devfs);
++
++      *dev = ((struct ubd) DEFAULT_UBD);
++      err = 0;
++ out:
++      spin_unlock(&ubd_lock);
++      return(err);
++}
++
++static struct mc_device ubd_mc = {
++      .name           = "ubd",
++      .config         = ubd_config,
++      .get_config     = ubd_get_config,
++      .remove         = ubd_remove,
++};
++
++static int ubd_mc_init(void)
++{
++      mconsole_register_dev(&ubd_mc);
++      return(0);
++}
++
++__initcall(ubd_mc_init);
++
++static request_queue_t *ubd_get_queue(kdev_t device)
++{
++      return(ubd_queue);
++}
++
++int ubd_init(void)
++{
++      unsigned long stack;
++        int i, err;
++
++      ubd_dir_handle = devfs_mk_dir (NULL, "ubd", NULL);
++      if (devfs_register_blkdev(MAJOR_NR, "ubd", &ubd_blops)) {
++              printk(KERN_ERR "ubd: unable to get major %d\n", MAJOR_NR);
++              return -1;
++      }
++      read_ahead[MAJOR_NR] = 8;               /* 8 sector (4kB) read-ahead */
++      blksize_size[MAJOR_NR] = blk_sizes;
++      blk_size[MAJOR_NR] = sizes;
++      INIT_HARDSECT(hardsect_size, MAJOR_NR, hardsect_sizes);
++
++      ubd_queue = BLK_DEFAULT_QUEUE(MAJOR_NR);
++      blk_init_queue(ubd_queue, DEVICE_REQUEST);
++      INIT_ELV(ubd_queue, &ubd_queue->elevator);
++
++        add_gendisk(&ubd_gendisk);
++      if (fake_major != MAJOR_NR){
++              /* major number 0 is used to auto select */
++              err = devfs_register_blkdev(fake_major, "fake", &ubd_blops);
++              if(fake_major == 0){
++              /* auto device number case */
++                      fake_major = err;
++                      if(err == 0)
++                              return(-ENODEV);
++              } 
++              else if (err){
++                      /* not auto so normal error */
++                      printk(KERN_ERR "ubd: error %d getting major %d\n", 
++                             err, fake_major);
++                      return(-ENODEV);
++              }
++
++              blk_dev[fake_major].queue = ubd_get_queue;
++              read_ahead[fake_major] = 8;     /* 8 sector (4kB) read-ahead */
++              blksize_size[fake_major] = blk_sizes;
++              blk_size[fake_major] = sizes;
++              INIT_HARDSECT(hardsect_size, fake_major, hardsect_sizes);
++                add_gendisk(&fake_gendisk);
++      }
++
++      for(i=0;i<MAX_DEV;i++) 
++              ubd_add(i);
++
++      if(global_openflags.s){
++              printk(KERN_INFO "ubd : Synchronous mode\n");
++              return(0);
++      }
++      stack = alloc_stack(0, 0);
++      io_pid = start_io_thread(stack + PAGE_SIZE - sizeof(void *), 
++                               &thread_fd);
++      if(io_pid < 0){
++              printk(KERN_ERR 
++                     "ubd : Failed to start I/O thread (errno = %d) - "
++                     "falling back to synchronous I/O\n", -io_pid);
++              return(0);
++      }
++      err = um_request_irq(UBD_IRQ, thread_fd, IRQ_READ, ubd_intr, 
++                           SA_INTERRUPT, "ubd", ubd_dev);
++      if(err != 0) printk(KERN_ERR 
++                          "um_request_irq failed - errno = %d\n", -err);
++      return(err);
++}
++
++__initcall(ubd_init);
++
++static void ubd_close(struct ubd *dev)
++{
++      os_close_file(dev->fd);
++      if(dev->cow.file != NULL) {
++              os_close_file(dev->cow.fd);
++              vfree(dev->cow.bitmap);
++              dev->cow.bitmap = NULL;
++      }
++}
++
++static int ubd_open_dev(struct ubd *dev)
++{
++      struct openflags flags;
++      int err, create_cow, *create_ptr;
++
++      dev->openflags = dev->boot_openflags;
++      create_cow = 0;
++      create_ptr = (dev->cow.file != NULL) ? &create_cow : NULL;
++      dev->fd = open_ubd_file(dev->file, &dev->openflags, &dev->cow.file,
++                              &dev->cow.bitmap_offset, &dev->cow.bitmap_len, 
++                              &dev->cow.data_offset, create_ptr);
++
++      if((dev->fd == -ENOENT) && create_cow){
++              dev->fd = create_cow_file(dev->file, dev->cow.file, 
++                                        dev->openflags, 1 << 9,
++                                        &dev->cow.bitmap_offset, 
++                                        &dev->cow.bitmap_len,
++                                        &dev->cow.data_offset);
++              if(dev->fd >= 0){
++                      printk(KERN_INFO "Creating \"%s\" as COW file for "
++                             "\"%s\"\n", dev->file, dev->cow.file);
++              }
++      }
++
++      if(dev->fd < 0) return(dev->fd);
++
++      if(dev->cow.file != NULL){
++              err = -ENOMEM;
++              dev->cow.bitmap = (void *) vmalloc(dev->cow.bitmap_len);
++              if(dev->cow.bitmap == NULL) goto error;
++              flush_tlb_kernel_vm();
++
++              err = read_cow_bitmap(dev->fd, dev->cow.bitmap, 
++                                    dev->cow.bitmap_offset, 
++                                    dev->cow.bitmap_len);
++              if(err) goto error;
++
++              flags = dev->openflags;
++              flags.w = 0;
++              err = open_ubd_file(dev->cow.file, &flags, NULL, NULL, NULL, 
++                                  NULL, NULL);
++              if(err < 0) goto error;
++              dev->cow.fd = err;
++      }
++      return(0);
++ error:
++      os_close_file(dev->fd);
++      return(err);
++}
++
++static int ubd_file_size(struct ubd *dev, __u64 *size_out)
++{
++      char *file;
++
++      file = dev->cow.file ? dev->cow.file : dev->file;
++      return(os_file_size(file, size_out));
++}
++
++static int ubd_open(struct inode *inode, struct file *filp)
++{
++      struct ubd *dev;
++      int n, offset, err = 0;
++
++      n = DEVICE_NR(inode->i_rdev);
++      if(n >= MAX_DEV)
++              return -ENODEV;
++      dev = &ubd_dev[n];
++
++      spin_lock(&ubd_lock);
++      offset = n << UBD_SHIFT;
++
++      if(dev->count == 0){
++              err = ubd_open_dev(dev);
++              if(err){
++                      printk(KERN_ERR "ubd%d: Can't open \"%s\": "
++                             "errno = %d\n", n, dev->file, -err);
++                      goto out;
++              }
++              err = ubd_file_size(dev, &dev->size);
++              if(err)
++                      goto out;
++              sizes[offset] = dev->size / BLOCK_SIZE;
++              ubd_part[offset].nr_sects = dev->size / hardsect_sizes[offset];
++      }
++      dev->count++;
++      if((filp->f_mode & FMODE_WRITE) && !dev->openflags.w){
++              if(--dev->count == 0) ubd_close(dev);
++              err = -EROFS;
++      }
++ out:
++      spin_unlock(&ubd_lock);
++      return(err);
++}
++
++static int ubd_release(struct inode * inode, struct file * file)
++{
++        int n, offset;
++
++      n =  DEVICE_NR(inode->i_rdev);
++      offset = n << UBD_SHIFT;
++      if(n >= MAX_DEV)
++              return -ENODEV;
++
++      spin_lock(&ubd_lock);
++      if(--ubd_dev[n].count == 0)
++              ubd_close(&ubd_dev[n]);
++      spin_unlock(&ubd_lock);
++
++      return(0);
++}
++
++void cowify_req(struct io_thread_req *req, struct ubd *dev)
++{
++        int i, update_bitmap, sector = req->offset >> 9;
++
++      if(req->length > (sizeof(req->sector_mask) * 8) << 9)
++              panic("Operation too long");
++      if(req->op == UBD_READ) {
++              for(i = 0; i < req->length >> 9; i++){
++                      if(ubd_test_bit(sector + i, (unsigned char *) 
++                                      dev->cow.bitmap)){
++                              ubd_set_bit(i, (unsigned char *) 
++                                          &req->sector_mask);
++                      }
++                }
++        } 
++        else {
++              update_bitmap = 0;
++              for(i = 0; i < req->length >> 9; i++){
++                      ubd_set_bit(i, (unsigned char *) 
++                                  &req->sector_mask);
++                      if(!ubd_test_bit(sector + i, (unsigned char *) 
++                                       dev->cow.bitmap))
++                              update_bitmap = 1;
++                      ubd_set_bit(sector + i, (unsigned char *) 
++                                  dev->cow.bitmap);
++              }
++              if(update_bitmap){
++                      req->cow_offset = sector / (sizeof(unsigned long) * 8);
++                      req->bitmap_words[0] = 
++                              dev->cow.bitmap[req->cow_offset];
++                      req->bitmap_words[1] = 
++                              dev->cow.bitmap[req->cow_offset + 1];
++                      req->cow_offset *= sizeof(unsigned long);
++                      req->cow_offset += dev->cow.bitmap_offset;
++              }
++      }
++}
++
++static int prepare_request(struct request *req, struct io_thread_req *io_req)
++{
++      struct ubd *dev;
++      __u64 block;
++      int nsect, minor, n;
++
++      if(req->rq_status == RQ_INACTIVE) return(1);
++
++      minor = MINOR(req->rq_dev);
++      n = minor >> UBD_SHIFT;
++      dev = &ubd_dev[n];
++      if(IS_WRITE(req) && !dev->openflags.w){
++              printk("Write attempted on readonly ubd device %d\n", n);
++              end_request(0);
++              return(1);
++      }
++
++        req->sector += ubd_part[minor].start_sect;
++        block = req->sector;
++        nsect = req->current_nr_sectors;
++
++      io_req->op = (req->cmd == READ) ? UBD_READ : UBD_WRITE;
++      io_req->fds[0] = (dev->cow.file != NULL) ? dev->cow.fd : dev->fd;
++      io_req->fds[1] = dev->fd;
++      io_req->offsets[0] = 0;
++      io_req->offsets[1] = dev->cow.data_offset;
++      io_req->offset = ((__u64) block) << 9;
++      io_req->length = nsect << 9;
++      io_req->buffer = req->buffer;
++      io_req->sectorsize = 1 << 9;
++      io_req->sector_mask = 0;
++      io_req->cow_offset = -1;
++      io_req->error = 0;
++
++        if(dev->cow.file != NULL) cowify_req(io_req, dev);
++      return(0);
++}
++
++static void do_ubd_request(request_queue_t *q)
++{
++      struct io_thread_req io_req;
++      struct request *req;
++      int err, n;
++
++      if(thread_fd == -1){
++              while(!list_empty(&q->queue_head)){
++                      req = blkdev_entry_next_request(&q->queue_head);
++                      err = prepare_request(req, &io_req);
++                      if(!err){
++                              do_io(&io_req);
++                              ubd_finish(io_req.error);
++                      }
++              }
++      }
++      else {
++              if(DEVICE_INTR || list_empty(&q->queue_head)) return;
++              req = blkdev_entry_next_request(&q->queue_head);
++              err = prepare_request(req, &io_req);
++              if(!err){
++                      SET_INTR(ubd_handler);
++                      n = write_ubd_fs(thread_fd, (char *) &io_req, 
++                                       sizeof(io_req));
++                      if(n != sizeof(io_req))
++                              printk("write to io thread failed, "
++                                     "errno = %d\n", -n);
++              }
++      }
++}
++
++static int ubd_ioctl(struct inode * inode, struct file * file,
++                   unsigned int cmd, unsigned long arg)
++{
++      struct hd_geometry *loc = (struct hd_geometry *) arg;
++      struct ubd *dev;
++      int n, minor, err;
++      struct hd_driveid ubd_id = {
++              .cyls           = 0,
++              .heads          = 128,
++              .sectors        = 32,
++      };
++      
++        if(!inode) return(-EINVAL);
++      minor = MINOR(inode->i_rdev);
++      n = minor >> UBD_SHIFT;
++      if(n >= MAX_DEV)
++              return(-EINVAL);
++      dev = &ubd_dev[n];
++      switch (cmd) {
++              struct hd_geometry g;
++              struct cdrom_volctrl volume;
++      case HDIO_GETGEO:
++              if(!loc) return(-EINVAL);
++              g.heads = 128;
++              g.sectors = 32;
++              g.cylinders = dev->size / (128 * 32 * hardsect_sizes[minor]);
++              g.start = 2;
++              return(copy_to_user(loc, &g, sizeof(g)) ? -EFAULT : 0);
++      case BLKGETSIZE:   /* Return device size */
++              if(!arg) return(-EINVAL);
++              err = verify_area(VERIFY_WRITE, (long *) arg, sizeof(long));
++              if(err)
++                      return(err);
++              put_user(ubd_part[minor].nr_sects, (long *) arg);
++              return(0);
++      case BLKRRPART: /* Re-read partition tables */
++              return(ubd_revalidate(inode->i_rdev));
++
++      case HDIO_SET_UNMASKINTR:
++              if(!capable(CAP_SYS_ADMIN)) return(-EACCES);
++              if((arg > 1) || (minor & 0x3F)) return(-EINVAL);
++              return(0);
++
++      case HDIO_GET_UNMASKINTR:
++              if(!arg)  return(-EINVAL);
++              err = verify_area(VERIFY_WRITE, (long *) arg, sizeof(long));
++              if(err)
++                      return(err);
++              return(0);
++
++      case HDIO_GET_MULTCOUNT:
++              if(!arg)  return(-EINVAL);
++              err = verify_area(VERIFY_WRITE, (long *) arg, sizeof(long));
++              if(err)
++                      return(err);
++              return(0);
++
++      case HDIO_SET_MULTCOUNT:
++              if(!capable(CAP_SYS_ADMIN)) return(-EACCES);
++              if(MINOR(inode->i_rdev) & 0x3F) return(-EINVAL);
++              return(0);
++
++      case HDIO_GET_IDENTITY:
++              ubd_id.cyls = dev->size / (128 * 32 * hardsect_sizes[minor]);
++              if(copy_to_user((char *) arg, (char *) &ubd_id, 
++                               sizeof(ubd_id)))
++                      return(-EFAULT);
++              return(0);
++              
++      case CDROMVOLREAD:
++              if(copy_from_user(&volume, (char *) arg, sizeof(volume)))
++                      return(-EFAULT);
++              volume.channel0 = 255;
++              volume.channel1 = 255;
++              volume.channel2 = 255;
++              volume.channel3 = 255;
++              if(copy_to_user((char *) arg, &volume, sizeof(volume)))
++                      return(-EFAULT);
++              return(0);
++
++      default:
++              return blk_ioctl(inode->i_rdev, cmd, arg);
++      }
++}
++
++static int ubd_revalidate1(kdev_t rdev)
++{
++      int i, n, offset, err = 0, pcount = 1 << UBD_SHIFT;
++      struct ubd *dev;
++      struct hd_struct *part;
++
++      n = DEVICE_NR(rdev);
++      offset = n << UBD_SHIFT;
++      dev = &ubd_dev[n];
++
++      part = &ubd_part[offset];
++
++      /* clear all old partition counts */
++      for(i = 1; i < pcount; i++) {
++              part[i].start_sect = 0;
++              part[i].nr_sects = 0;
++      }
++
++      /* If it has already been opened, we can check the partitions
++       * directly.
++       */
++      if(dev->count){
++              part->start_sect = 0;
++              register_disk(&ubd_gendisk, MKDEV(MAJOR_NR, offset), pcount, 
++                            &ubd_blops, part->nr_sects);
++      } 
++      else if(dev->file){
++              err = ubd_open_dev(dev);
++              if(err){
++                      printk(KERN_ERR "unable to open %s for validation\n",
++                             dev->file);
++                      goto out;
++              }
++
++              /* have to recompute sizes since we opened it */
++              err = ubd_file_size(dev, &dev->size);
++              if(err) {
++                      ubd_close(dev);
++                      goto out;
++              }
++              part->start_sect = 0;
++              part->nr_sects = dev->size / hardsect_sizes[offset];
++              register_disk(&ubd_gendisk, MKDEV(MAJOR_NR, offset), pcount, 
++                            &ubd_blops, part->nr_sects);
++
++              /* we are done so close it */
++              ubd_close(dev);
++      } 
++      else err = -ENODEV;
++ out:
++      return(err);
++}
++
++static int ubd_revalidate(kdev_t rdev)
++{
++      int err;
++
++      spin_lock(&ubd_lock);
++      err = ubd_revalidate1(rdev);
++      spin_unlock(&ubd_lock);
++      return(err);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
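
For orientation before the user-space half of the driver appears below: cowify_req() above keeps copy-on-write state as one bit per 512-byte sector in dev->cow.bitmap, and each request carries a sector_mask telling the I/O path which of its sectors should be read from the COW file instead of the backing file. The following standalone user-space sketch is illustrative only (it is not part of the patch; the bit helpers are stand-ins for ubd_test_bit()/ubd_set_bit()):

    /* Illustration of the per-sector COW bitmap arithmetic used by
     * cowify_req(): for a read, every sector already present in the COW
     * file gets the corresponding bit set in the request's sector_mask.
     */
    #include <stdio.h>
    #include <string.h>

    static int test_bit(const unsigned char *map, int bit)
    {
            return (map[bit / 8] >> (bit % 8)) & 1;
    }

    static void set_bit_(unsigned char *map, int bit)
    {
            map[bit / 8] |= 1 << (bit % 8);
    }

    int main(void)
    {
            unsigned char cow_bitmap[8];          /* covers 64 sectors */
            unsigned long sector_mask = 0;        /* per-request mask, as in io_thread_req */
            int first_sector = 3, nsectors = 4, i;

            memset(cow_bitmap, 0, sizeof(cow_bitmap));
            set_bit_(cow_bitmap, 4);              /* pretend sector 4 was already COWed */

            /* READ case: which sectors of this request live in the COW file? */
            for (i = 0; i < nsectors; i++)
                    if (test_bit(cow_bitmap, first_sector + i))
                            sector_mask |= 1UL << i;

            printf("sector_mask = %#lx\n", sector_mask);  /* 0x2: only sector 4 */
            return 0;
    }
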
+diff -Naur -X ../exclude-files orig/arch/um/drivers/ubd_user.c um/arch/um/drivers/ubd_user.c
+--- orig/arch/um/drivers/ubd_user.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/ubd_user.c      Thu Mar  6 18:08:55 2003
+@@ -0,0 +1,626 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Copyright (C) 2001 Ridgerun,Inc (glonnon@ridgerun.com)
++ * Licensed under the GPL
++ */
++
++#include <stddef.h>
++#include <unistd.h>
++#include <errno.h>
++#include <sched.h>
++#include <signal.h>
++#include <string.h>
++#include <netinet/in.h>
++#include <sys/stat.h>
++#include <sys/time.h>
++#include <sys/fcntl.h>
++#include <sys/socket.h>
++#include <string.h>
++#include <sys/mman.h>
++#include <sys/param.h>
++#include "asm/types.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "user.h"
++#include "ubd_user.h"
++#include "os.h"
++
++#include <endian.h>
++#include <byteswap.h>
++#if __BYTE_ORDER == __BIG_ENDIAN
++# define ntohll(x) (x)
++# define htonll(x) (x)
++#elif __BYTE_ORDER == __LITTLE_ENDIAN
++# define ntohll(x)  bswap_64(x)
++# define htonll(x)  bswap_64(x)
++#else
++#error "__BYTE_ORDER not defined"
++#endif
++
++#define PATH_LEN_V1 256
++
++struct cow_header_v1 {
++      int magic;
++      int version;
++      char backing_file[PATH_LEN_V1];
++      time_t mtime;
++      __u64 size;
++      int sectorsize;
++};
++
++#define PATH_LEN_V2 MAXPATHLEN
++
++struct cow_header_v2 {
++      unsigned long magic;
++      unsigned long version;
++      char backing_file[PATH_LEN_V2];
++      time_t mtime;
++      __u64 size;
++      int sectorsize;
++};
++
++union cow_header {
++      struct cow_header_v1 v1;
++      struct cow_header_v2 v2;
++};
++
++#define COW_MAGIC 0x4f4f4f4d  /* MOOO */
++#define COW_VERSION 2
++
++static void sizes(__u64 size, int sectorsize, int bitmap_offset, 
++                unsigned long *bitmap_len_out, int *data_offset_out)
++{
++      *bitmap_len_out = (size + sectorsize - 1) / (8 * sectorsize);
++
++      *data_offset_out = bitmap_offset + *bitmap_len_out;
++      *data_offset_out = (*data_offset_out + sectorsize - 1) / sectorsize;
++      *data_offset_out *= sectorsize;
++}
++
++static int read_cow_header(int fd, int *magic_out, char **backing_file_out, 
++                         time_t *mtime_out, __u64 *size_out, 
++                         int *sectorsize_out, int *bitmap_offset_out)
++{
++      union cow_header *header;
++      char *file;
++      int err, n;
++      unsigned long version, magic;
++
++      header = um_kmalloc(sizeof(*header));
++      if(header == NULL){
++              printk("read_cow_header - Failed to allocate header\n");
++              return(-ENOMEM);
++      }
++      err = -EINVAL;
++      n = read(fd, header, sizeof(*header));
++      if(n < offsetof(typeof(header->v1), backing_file)){
++              printk("read_cow_header - short header\n");
++              goto out;
++      }
++
++      magic = header->v1.magic;
++      if(magic == COW_MAGIC) {
++              version = header->v1.version;
++      }
++      else if(magic == ntohl(COW_MAGIC)){
++              version = ntohl(header->v1.version);
++      }
++      else goto out;
++
++      *magic_out = COW_MAGIC;
++
++      if(version == 1){
++              if(n < sizeof(header->v1)){
++                      printk("read_cow_header - failed to read V1 header\n");
++                      goto out;
++              }
++              *mtime_out = header->v1.mtime;
++              *size_out = header->v1.size;
++              *sectorsize_out = header->v1.sectorsize;
++              *bitmap_offset_out = sizeof(header->v1);
++              file = header->v1.backing_file;
++      }
++      else if(version == 2){
++              if(n < sizeof(header->v2)){
++                      printk("read_cow_header - failed to read V2 header\n");
++                      goto out;
++              }
++              *mtime_out = ntohl(header->v2.mtime);
++              *size_out = ntohll(header->v2.size);
++              *sectorsize_out = ntohl(header->v2.sectorsize);
++              *bitmap_offset_out = sizeof(header->v2);
++              file = header->v2.backing_file;
++      }
++      else {
++              printk("read_cow_header - invalid COW version\n");
++              goto out;
++      }
++      err = -ENOMEM;
++      *backing_file_out = uml_strdup(file);
++      if(*backing_file_out == NULL){
++              printk("read_cow_header - failed to allocate backing file\n");
++              goto out;
++      }
++      err = 0;
++ out:
++      kfree(header);
++      return(err);
++}
++
++static int same_backing_files(char *from_cmdline, char *from_cow, char *cow)
++{
++      struct stat64 buf1, buf2;
++
++      if(from_cmdline == NULL) return(1);
++      if(!strcmp(from_cmdline, from_cow)) return(1);
++
++      if(stat64(from_cmdline, &buf1) < 0){
++              printk("Couldn't stat '%s', errno = %d\n", from_cmdline, 
++                     errno);
++              return(1);
++      }
++      if(stat64(from_cow, &buf2) < 0){
++              printk("Couldn't stat '%s', errno = %d\n", from_cow, errno);
++              return(1);
++      }
++      if((buf1.st_dev == buf2.st_dev) && (buf1.st_ino == buf2.st_ino))
++              return(1);
++
++      printk("Backing file mismatch - \"%s\" requested,\n"
++             "\"%s\" specified in COW header of \"%s\"\n",
++             from_cmdline, from_cow, cow);
++      return(0);
++}
++
++static int backing_file_mismatch(char *file, __u64 size, time_t mtime)
++{
++      struct stat64 buf;
++      long long actual;
++      int err;
++
++      if(stat64(file, &buf) < 0){
++              printk("Failed to stat backing file \"%s\", errno = %d\n",
++                     file, errno);
++              return(-errno);
++      }
++
++      err = os_file_size(file, &actual);
++      if(err){
++              printk("Failed to get size of backing file \"%s\", "
++                     "errno = %d\n", file, -err);
++              return(err);
++      }
++
++      if(actual != size){
++              printk("Size mismatch (%ld vs %ld) of COW header vs backing "
++                     "file\n", size, actual);
++              return(-EINVAL);
++      }
++      if(buf.st_mtime != mtime){
++              printk("mtime mismatch (%ld vs %ld) of COW header vs backing "
++                     "file\n", mtime, buf.st_mtime);
++              return(-EINVAL);
++      }
++      return(0);
++}
++
++int read_cow_bitmap(int fd, void *buf, int offset, int len)
++{
++      int err;
++
++      err = os_seek_file(fd, offset);
++      if(err != 0) return(-errno);
++      err = read(fd, buf, len);
++      if(err < 0) return(-errno);
++      return(0);
++}
++
++static int absolutize(char *to, int size, char *from)
++{
++      char save_cwd[256], *slash;
++      int remaining;
++
++      if(getcwd(save_cwd, sizeof(save_cwd)) == NULL) {
++              printk("absolutize : unable to get cwd - errno = %d\n", errno);
++              return(-1);
++      }
++      slash = strrchr(from, '/');
++      if(slash != NULL){
++              *slash = '\0';
++              if(chdir(from)){
++                      *slash = '/';
++                      printk("absolutize : Can't cd to '%s' - errno = %d\n",
++                             from, errno);
++                      return(-1);
++              }
++              *slash = '/';
++              if(getcwd(to, size) == NULL){
++                      printk("absolutize : unable to get cwd of '%s' - "
++                             "errno = %d\n", from, errno);
++                      return(-1);
++              }
++              remaining = size - strlen(to);
++              if(strlen(slash) + 1 > remaining){
++                      printk("absolutize : unable to fit '%s' into %d "
++                             "chars\n", from, size);
++                      return(-1);
++              }
++              strcat(to, slash);
++      }
++      else {
++              if(strlen(save_cwd) + 1 + strlen(from) + 1 > size){
++                      printk("absolutize : unable to fit '%s' into %d "
++                             "chars\n", from, size);
++                      return(-1);
++              }
++              strcpy(to, save_cwd);
++              strcat(to, "/");
++              strcat(to, from);
++      }
++      chdir(save_cwd);
++      return(0);
++}
++
++static int write_cow_header(char *cow_file, int fd, char *backing_file, 
++                          int sectorsize, long long *size)
++{
++        struct cow_header_v2 *header;
++      struct stat64 buf;
++      int err;
++
++      err = os_seek_file(fd, 0);
++      if(err != 0){
++              printk("write_cow_header - lseek failed, errno = %d\n", errno);
++              return(-errno);
++      }
++
++      err = -ENOMEM;
++      header = um_kmalloc(sizeof(*header));
++      if(header == NULL){
++              printk("Failed to allocate COW V2 header\n");
++              goto out;
++      }
++      header->magic = htonl(COW_MAGIC);
++      header->version = htonl(COW_VERSION);
++
++      err = -EINVAL;
++      if(strlen(backing_file) > sizeof(header->backing_file) - 1){
++              printk("Backing file name \"%s\" is too long - names are "
++                     "limited to %d characters\n", backing_file, 
++                     sizeof(header->backing_file) - 1);
++              goto out_free;
++      }
++
++      if(absolutize(header->backing_file, sizeof(header->backing_file), 
++                    backing_file))
++              goto out_free;
++
++      err = stat64(header->backing_file, &buf);
++      if(err < 0){
++              printk("Stat of backing file '%s' failed, errno = %d\n",
++                     header->backing_file, errno);
++              err = -errno;
++              goto out_free;
++      }
++
++      err = os_file_size(header->backing_file, size);
++      if(err){
++              printk("Couldn't get size of backing file '%s', errno = %d\n",
++                     header->backing_file, -*size);
++              goto out_free;
++      }
++
++      header->mtime = htonl(buf.st_mtime);
++      header->size = htonll(*size);
++      header->sectorsize = htonl(sectorsize);
++
++      err = write(fd, header, sizeof(*header));
++      if(err != sizeof(*header)){
++              printk("Write of header to new COW file '%s' failed, "
++                     "errno = %d\n", cow_file, errno);
++              goto out_free;
++      }
++      err = 0;
++ out_free:
++      kfree(header);
++ out:
++      return(err);
++}
++
++int open_ubd_file(char *file, struct openflags *openflags, 
++                char **backing_file_out, int *bitmap_offset_out, 
++                unsigned long *bitmap_len_out, int *data_offset_out, 
++                int *create_cow_out)
++{
++      time_t mtime;
++      __u64 size;
++      char *backing_file;
++        int fd, err, sectorsize, magic, same, mode = 0644;
++
++        if((fd = os_open_file(file, *openflags, mode)) < 0){
++              if((fd == -ENOENT) && (create_cow_out != NULL))
++                      *create_cow_out = 1;
++                if(!openflags->w ||
++                   ((errno != EROFS) && (errno != EACCES))) return(-errno);
++              openflags->w = 0;
++                if((fd = os_open_file(file, *openflags, mode)) < 0) 
++                      return(fd);
++        }
++
++      err = os_lock_file(fd, openflags->w);
++      if(err){
++              printk("Failed to lock '%s', errno = %d\n", file, -err);
++              goto error;
++      }
++      
++      if(backing_file_out == NULL) return(fd);
++
++      err = read_cow_header(fd, &magic, &backing_file, &mtime, &size, 
++                            &sectorsize, bitmap_offset_out);
++      if(err && (*backing_file_out != NULL)){
++              printk("Failed to read COW header from COW file \"%s\", "
++                     "errno = %d\n", file, err);
++              goto error;
++      }
++      if(err) return(fd);
++
++      if(backing_file_out == NULL) return(fd);
++      
++      same = same_backing_files(*backing_file_out, backing_file, file);
++
++      if(!same && !backing_file_mismatch(*backing_file_out, size, mtime)){
++              printk("Switching backing file to '%s'\n", *backing_file_out);
++              err = write_cow_header(file, fd, *backing_file_out, 
++                                     sectorsize, &size);
++              if(err){
++                      printk("Switch failed, errno = %d\n", err);
++                      return(err);
++              }
++      }
++      else {
++              *backing_file_out = backing_file;
++              err = backing_file_mismatch(*backing_file_out, size, mtime);
++              if(err) goto error;
++      }
++
++      sizes(size, sectorsize, *bitmap_offset_out, bitmap_len_out, 
++            data_offset_out);
++
++        return(fd);
++ error:
++      os_close_file(fd);
++      return(err);
++}
++
++int create_cow_file(char *cow_file, char *backing_file, struct openflags flags,
++                  int sectorsize, int *bitmap_offset_out, 
++                  unsigned long *bitmap_len_out, int *data_offset_out)
++{
++      __u64 offset;
++      int err, fd;
++      long long size;
++      char zero = 0;
++
++      flags.c = 1;
++      fd = open_ubd_file(cow_file, &flags, NULL, NULL, NULL, NULL, NULL);
++      if(fd < 0){
++              err = fd;
++              printk("Open of COW file '%s' failed, errno = %d\n", cow_file,
++                     -err);
++              goto out;
++      }
++
++      err = write_cow_header(cow_file, fd, backing_file, sectorsize, &size);
++      if(err) goto out_close;
++
++      sizes(size, sectorsize, sizeof(struct cow_header_v2), 
++            bitmap_len_out, data_offset_out);
++      *bitmap_offset_out = sizeof(struct cow_header_v2);
++
++      offset = *data_offset_out + size - sizeof(zero);
++      err = os_seek_file(fd, offset);
++      if(err != 0){
++              printk("cow bitmap lseek failed : errno = %d\n", errno);
++              goto out_close;
++      }
++
++      /* It does not really matter how much we write here; the point is
++       * just to set EOF.  This also sets the entire COW bitmap to zero
++       * without having to allocate it.
++       */
++      err = os_write_file(fd, &zero, sizeof(zero));
++      if(err != sizeof(zero)){
++              printk("Write of bitmap to new COW file '%s' failed, "
++                     "errno = %d\n", cow_file, errno);
++              goto out_close;
++      }
++
++      return(fd);
++
++ out_close:
++      close(fd);
++ out:
++      return(err);
++}
++
++int read_ubd_fs(int fd, void *buffer, int len)
++{
++      int n;
++
++      n = read(fd, buffer, len);
++      if(n < 0) return(-errno);
++      else return(n);
++}
++
++int write_ubd_fs(int fd, char *buffer, int len)
++{
++      int n;
++
++      n = write(fd, buffer, len);
++      if(n < 0) return(-errno);
++      else return(n);
++}
++
++void do_io(struct io_thread_req *req)
++{
++      char *buf;
++      unsigned long len;
++      int n, nsectors, start, end, bit;
++      __u64 off;
++
++      nsectors = req->length / req->sectorsize;
++      start = 0;
++      do {
++              bit = ubd_test_bit(start, (unsigned char *) &req->sector_mask);
++              end = start;
++              while((end < nsectors) && 
++                    (ubd_test_bit(end, (unsigned char *) 
++                                  &req->sector_mask) == bit))
++                      end++;
++
++              if(end != nsectors)
++                      printk("end != nsectors\n");
++              off = req->offset + req->offsets[bit] + 
++                      start * req->sectorsize;
++              len = (end - start) * req->sectorsize;
++              buf = &req->buffer[start * req->sectorsize];
++
++              if(os_seek_file(req->fds[bit], off) != 0){
++                      printk("do_io - lseek failed : errno = %d\n", errno);
++                      req->error = 1;
++                      return;
++              }
++              if(req->op == UBD_READ){
++                      n = 0;
++                      do {
++                              buf = &buf[n];
++                              len -= n;
++                              n = read(req->fds[bit], buf, len);
++                              if (n < 0) {
++                                      printk("do_io - read returned %d : "
++                                             "errno = %d fd = %d\n", n,
++                                             errno, req->fds[bit]);
++                                      req->error = 1;
++                                      return;
++                              }
++                      } while((n < len) && (n != 0));
++                      if (n < len) memset(&buf[n], 0, len - n);
++              }
++              else {
++                      n = write(req->fds[bit], buf, len);
++                      if(n != len){
++                              printk("do_io - write returned %d : "
++                                     "errno = %d fd = %d\n", n, 
++                                     errno, req->fds[bit]);
++                              req->error = 1;
++                              return;
++                      }
++              }
++
++              start = end;
++      } while(start < nsectors);
++
++      if(req->cow_offset != -1){
++              if(os_seek_file(req->fds[1], req->cow_offset) != 0){
++                      printk("do_io - bitmap lseek failed : errno = %d\n",
++                             errno);
++                      req->error = 1;
++                      return;
++              }
++              n = write(req->fds[1], &req->bitmap_words, 
++                        sizeof(req->bitmap_words));
++              if(n != sizeof(req->bitmap_words)){
++                      printk("do_io - bitmap update returned %d : "
++                             "errno = %d fd = %d\n", n, errno, req->fds[1]);
++                      req->error = 1;
++                      return;
++              }
++      }
++      req->error = 0;
++      return;
++}
++
++/* Changed in start_io_thread, which is serialized by being called only
++ * from ubd_init, which is an initcall.
++ */
++int kernel_fd = -1;
++
++/* Only changed by the io thread */
++int io_count = 0;
++
++int io_thread(void *arg)
++{
++      struct io_thread_req req;
++      int n;
++
++      signal(SIGWINCH, SIG_IGN);
++      while(1){
++              n = read(kernel_fd, &req, sizeof(req));
++              if(n < 0){
++                      printk("io_thread - read returned %d, errno = %d\n",
++                             n, errno);
++                      continue;
++              }
++              else if(n < sizeof(req)){
++                      printk("io_thread - short read : length = %d\n", n);
++                      continue;
++              }
++              io_count++;
++              do_io(&req);
++              n = write(kernel_fd, &req, sizeof(req));
++              if(n != sizeof(req))
++                      printk("io_thread - write failed, errno = %d\n",
++                             errno);
++      }
++}
++
++int start_io_thread(unsigned long sp, int *fd_out)
++{
++      int pid, fds[2], err;
++
++      err = os_pipe(fds, 1, 1);
++      if(err){
++              printk("start_io_thread - os_pipe failed, errno = %d\n", -err);
++              return(-1);
++      }
++      kernel_fd = fds[0];
++      *fd_out = fds[1];
++
++      pid = clone(io_thread, (void *) sp, CLONE_FILES | CLONE_VM | SIGCHLD,
++                  NULL);
++      if(pid < 0){
++              printk("start_io_thread - clone failed : errno = %d\n", errno);
++              return(-errno);
++      }
++      return(pid);
++}
++
++#ifdef notdef
++int start_io_thread(unsigned long sp, int *fd_out)
++{
++      int pid;
++
++      if((kernel_fd = get_pty()) < 0) return(-1);
++      raw(kernel_fd, 0);
++      if((*fd_out = open(ptsname(kernel_fd), O_RDWR)) < 0){
++              printk("Couldn't open tty for IO\n");
++              return(-1);
++      }
++
++      pid = clone(io_thread, (void *) sp, CLONE_FILES | CLONE_VM | SIGCHLD,
++                  NULL);
++      if(pid < 0){
++              printk("start_io_thread - clone failed : errno = %d\n", errno);
++              return(-errno);
++      }
++      return(pid);
++}
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
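
A note on the on-disk COW layout handled by the code above: a version 2 COW file starts with struct cow_header_v2, followed by a bitmap holding roughly one bit per sector of the backing file, and the data area begins at the next sector-aligned offset; that is exactly the arithmetic in sizes(). A standalone sketch with illustrative numbers (not part of the patch):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long size = 256ULL * 1024 * 1024; /* 256 MB backing file */
            int sectorsize = 512;
            int bitmap_offset = 4096;  /* illustrative; really sizeof(struct cow_header_v2) */
            unsigned long bitmap_len;
            int data_offset;

            /* same arithmetic as sizes() in ubd_user.c above */
            bitmap_len = (size + sectorsize - 1) / (8 * sectorsize);
            data_offset = bitmap_offset + bitmap_len;
            data_offset = (data_offset + sectorsize - 1) / sectorsize;
            data_offset *= sectorsize;

            printf("bitmap: %lu bytes at offset %d\n", bitmap_len, bitmap_offset);
            printf("data area starts at offset %d\n", data_offset);
            return 0;
    }
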
+diff -Naur -X ../exclude-files orig/arch/um/drivers/xterm.c um/arch/um/drivers/xterm.c
+--- orig/arch/um/drivers/xterm.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/xterm.c Mon Dec 30 20:49:22 2002
+@@ -0,0 +1,200 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <string.h>
++#include <errno.h>
++#include <fcntl.h>
++#include <termios.h>
++#include <signal.h>
++#include <sched.h>
++#include <sys/socket.h>
++#include "kern_util.h"
++#include "chan_user.h"
++#include "helper.h"
++#include "user_util.h"
++#include "user.h"
++#include "os.h"
++#include "xterm.h"
++
++struct xterm_chan {
++      int pid;
++      int helper_pid;
++      char *title;
++      int device;
++      int raw;
++      struct termios tt;
++      unsigned long stack;
++      int direct_rcv;
++};
++
++void *xterm_init(char *str, int device, struct chan_opts *opts)
++{
++      struct xterm_chan *data;
++
++      if((data = malloc(sizeof(*data))) == NULL) return(NULL);
++      *data = ((struct xterm_chan) { .pid             = -1, 
++                                     .helper_pid      = -1,
++                                     .device          = device, 
++                                     .title           = opts->xterm_title,
++                                     .raw             = opts->raw,
++                                     .stack           = opts->tramp_stack,
++                                     .direct_rcv      = !opts->in_kernel } );
++      return(data);
++}
++
++/* Only changed by xterm_setup, which is a setup routine (runs at boot) */
++static char *terminal_emulator = "xterm";
++static char *title_switch = "-T";
++static char *exec_switch = "-e";
++
++static int __init xterm_setup(char *line, int *add)
++{
++      *add = 0;
++      terminal_emulator = line;
++
++      line = strchr(line, ',');
++      if(line == NULL) return(0);
++      *line++ = '\0';
++      if(*line) title_switch = line;
++
++      line = strchr(line, ',');
++      if(line == NULL) return(0);
++      *line++ = '\0';
++      if(*line) exec_switch = line;
++
++      return(0);
++}
++
++__uml_setup("xterm=", xterm_setup,
++"xterm=<terminal emulator>,<title switch>,<exec switch>\n"
++"    Specifies an alternate terminal emulator to use for the debugger,\n"
++"    consoles, and serial lines when they are attached to the xterm channel.\n"
++"    The values are the terminal emulator binary, the switch it uses to set\n"
++"    its title, and the switch it uses to execute a subprocess,\n"
++"    respectively.  The title switch must have the form '<switch> title',\n"
++"    not '<switch>=title'.  Similarly, the exec switch must have the form\n"
++"    '<switch> command arg1 arg2 ...'.\n"
++"    The default values are 'xterm=xterm,-T,-e'.  Values for gnome-terminal\n"
++"    are 'xterm=gnome-terminal,-t,-x'.\n\n"
++);
++
++int xterm_open(int input, int output, int primary, void *d, char **dev_out)
++{
++      struct xterm_chan *data = d;
++      unsigned long stack;
++      int pid, fd, new, err;
++      char title[256], file[] = "/tmp/xterm-pipeXXXXXX";
++      char *argv[] = { terminal_emulator, title_switch, title, exec_switch, 
++                       "/usr/lib/uml/port-helper", "-uml-socket",
++                       file, NULL };
++
++      if(access(argv[4], X_OK))
++              argv[4] = "port-helper";
++
++      fd = mkstemp(file);
++      if(fd < 0){
++              printk("xterm_open : mkstemp failed, errno = %d\n", errno);
++              return(-errno);
++      }
++
++      if(unlink(file)){
++              printk("xterm_open : unlink failed, errno = %d\n", errno);
++              return(-errno);
++      }
++      close(fd);
++
++      fd = create_unix_socket(file, sizeof(file));
++      if(fd < 0){
++              printk("xterm_open : create_unix_socket failed, errno = %d\n", 
++                     -fd);
++              return(-fd);
++      }
++
++      sprintf(title, data->title, data->device);
++      stack = data->stack;
++      pid = run_helper(NULL, NULL, argv, &stack);
++      if(pid < 0){
++              printk("xterm_open : run_helper failed, errno = %d\n", -pid);
++              return(pid);
++      }
++
++      if(data->stack == 0) free_stack(stack, 0);
++
++      if(data->direct_rcv)
++              new = os_rcv_fd(fd, &data->helper_pid);
++      else {
++              if((err = os_set_fd_block(fd, 0)) != 0){
++                      printk("xterm_open : failed to set descriptor "
++                             "non-blocking, errno = %d\n", err);
++                      return(err);
++              }
++              new = xterm_fd(fd, &data->helper_pid);
++      }
++      if(new < 0){
++              printk("xterm_open : os_rcv_fd failed, errno = %d\n", -new);
++              goto out;
++      }
++
++      tcgetattr(new, &data->tt);
++      if(data->raw) raw(new, 0);
++
++      data->pid = pid;
++      *dev_out = NULL;
++ out:
++      unlink(file);
++      return(new);
++}
++
++void xterm_close(int fd, void *d)
++{
++      struct xterm_chan *data = d;
++      
++      if(data->pid != -1) 
++              os_kill_process(data->pid, 1);
++      data->pid = -1;
++      if(data->helper_pid != -1) 
++              os_kill_process(data->helper_pid, 0);
++      data->helper_pid = -1;
++      close(fd);
++}
++
++void xterm_free(void *d)
++{
++      free(d);
++}
++
++int xterm_console_write(int fd, const char *buf, int n, void *d)
++{
++      struct xterm_chan *data = d;
++
++      return(generic_console_write(fd, buf, n, &data->tt));
++}
++
++struct chan_ops xterm_ops = {
++      .type           = "xterm",
++      .init           = xterm_init,
++      .open           = xterm_open,
++      .close          = xterm_close,
++      .read           = generic_read,
++      .write          = generic_write,
++      .console_write  = xterm_console_write,
++      .window_size    = generic_window_size,
++      .free           = xterm_free,
++      .winch          = 1,
++};
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
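
The "xterm=" option documented above takes up to three comma-separated fields (terminal emulator, title switch, exec switch), and an empty field leaves the corresponding default alone. A standalone sketch of that splitting, using the gnome-terminal values from the help text (illustrative only, not part of the patch):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char line[] = "gnome-terminal,-t,-x";         /* value after "xterm=" */
            char *emulator = "xterm", *title_switch = "-T", *exec_switch = "-e";
            char *p = line, *next;

            emulator = p;                                  /* first field: the binary */
            if ((next = strchr(p, ',')) != NULL) {
                    *next++ = '\0';
                    if (*next)
                            title_switch = next;           /* second field: title switch */
                    p = next;
                    if ((next = strchr(p, ',')) != NULL) {
                            *next++ = '\0';
                            if (*next)
                                    exec_switch = next;    /* third field: exec switch */
                    }
            }
            printf("emulator=%s title=%s exec=%s\n",
                   emulator, title_switch, exec_switch);
            return 0;
    }
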
+diff -Naur -X ../exclude-files orig/arch/um/drivers/xterm.h um/arch/um/drivers/xterm.h
+--- orig/arch/um/drivers/xterm.h       Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/xterm.h Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,22 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __XTERM_H__
++#define __XTERM_H__
++
++extern int xterm_fd(int socket, int *pid_out);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/xterm_kern.c um/arch/um/drivers/xterm_kern.c
+--- orig/arch/um/drivers/xterm_kern.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/xterm_kern.c    Tue Dec 17 17:31:20 2002
+@@ -0,0 +1,79 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/errno.h"
++#include "linux/slab.h"
++#include "asm/semaphore.h"
++#include "asm/irq.h"
++#include "irq_user.h"
++#include "kern_util.h"
++#include "os.h"
++#include "xterm.h"
++
++struct xterm_wait {
++      struct semaphore sem;
++      int fd;
++      int pid;
++      int new_fd;
++};
++
++static void xterm_interrupt(int irq, void *data, struct pt_regs *regs)
++{
++      struct xterm_wait *xterm = data;
++      int fd;
++
++      fd = os_rcv_fd(xterm->fd, &xterm->pid);
++      if(fd == -EAGAIN)
++              return;
++
++      xterm->new_fd = fd;
++      up(&xterm->sem);
++}
++
++int xterm_fd(int socket, int *pid_out)
++{
++      struct xterm_wait *data;
++      int err, ret;
++
++      data = kmalloc(sizeof(*data), GFP_KERNEL);
++      if(data == NULL){
++              printk(KERN_ERR "xterm_fd : failed to allocate xterm_wait\n");
++              return(-ENOMEM);
++      }
++      *data = ((struct xterm_wait) 
++              { .sem          = __SEMAPHORE_INITIALIZER(data->sem, 0),
++                .fd           = socket,
++                .pid          = -1,
++                .new_fd       = -1 });
++
++      err = um_request_irq(XTERM_IRQ, socket, IRQ_READ, xterm_interrupt, 
++                           SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM, 
++                           "xterm", data);
++      if(err){
++              printk(KERN_ERR "xterm_fd : failed to get IRQ for xterm, "
++                     "err = %d\n", err);
++              kfree(data);
++              return(err);
++      }
++      down(&data->sem);
++
++      free_irq(XTERM_IRQ, data);
++
++      ret = data->new_fd;
++      *pid_out = data->pid;
++      kfree(data);
++
++      return(ret);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
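
xterm_kern.c above uses a small completion pattern: xterm_fd() registers an interrupt handler on the socket and sleeps on a semaphore, and xterm_interrupt() wakes it once the new descriptor has been received. The same hand-off expressed in plain user-space C with POSIX semaphores, purely to illustrate the pattern (not part of the patch):

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdio.h>

    struct waiter {
            sem_t sem;
            int new_fd;                   /* filled in by the "handler" before posting */
    };

    /* Stand-in for xterm_interrupt(): runs asynchronously and delivers the fd. */
    static void *handler(void *arg)
    {
            struct waiter *w = arg;

            w->new_fd = 42;               /* pretend the descriptor arrived */
            sem_post(&w->sem);            /* like up(&xterm->sem) */
            return NULL;
    }

    int main(void)
    {
            struct waiter w;
            pthread_t tid;

            sem_init(&w.sem, 0, 0);       /* like __SEMAPHORE_INITIALIZER(sem, 0) */
            pthread_create(&tid, NULL, handler, &w);
            sem_wait(&w.sem);             /* like down(&data->sem) */
            pthread_join(&tid, NULL);
            printf("received fd %d\n", w.new_fd);
            sem_destroy(&w.sem);
            return 0;
    }

(Build with cc -pthread.)
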
+diff -Naur -X ../exclude-files orig/arch/um/dyn_link.ld.in um/arch/um/dyn_link.ld.in
+--- orig/arch/um/dyn_link.ld.in        Wed Dec 31 19:00:00 1969
++++ um/arch/um/dyn_link.ld.in  Fri Jan 17 23:37:27 2003
+@@ -0,0 +1,172 @@
++OUTPUT_FORMAT("ELF_FORMAT")
++OUTPUT_ARCH(ELF_ARCH)
++ENTRY(_start)
++SEARCH_DIR("/usr/local/i686-pc-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib");
++/* Do we need any of these for elf?
++   __DYNAMIC = 0;    */
++SECTIONS
++{
++  . = START() + SIZEOF_HEADERS;
++  .interp         : { *(.interp) }
++  . = ALIGN(4096);
++  __binary_start = .;
++  . = ALIGN(4096);            /* Init code and data */
++  _stext = .;
++  __init_begin = .;
++  .text.init : { *(.text.init) }
++
++  . = ALIGN(4096);
++
++  /* Read-only sections, merged into text segment: */
++  .hash           : { *(.hash) }
++  .dynsym         : { *(.dynsym) }
++  .dynstr         : { *(.dynstr) }
++  .gnu.version    : { *(.gnu.version) }
++  .gnu.version_d  : { *(.gnu.version_d) }
++  .gnu.version_r  : { *(.gnu.version_r) }
++  .rel.init       : { *(.rel.init) }
++  .rela.init      : { *(.rela.init) }
++  .rel.text       : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) }
++  .rela.text      : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) }
++  .rel.fini       : { *(.rel.fini) }
++  .rela.fini      : { *(.rela.fini) }
++  .rel.rodata     : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) }
++  .rela.rodata    : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) }
++  .rel.data       : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) }
++  .rela.data      : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) }
++  .rel.tdata    : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) }
++  .rela.tdata   : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) }
++  .rel.tbss     : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) }
++  .rela.tbss    : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) }
++  .rel.ctors      : { *(.rel.ctors) }
++  .rela.ctors     : { *(.rela.ctors) }
++  .rel.dtors      : { *(.rel.dtors) }
++  .rela.dtors     : { *(.rela.dtors) }
++  .rel.got        : { *(.rel.got) }
++  .rela.got       : { *(.rela.got) }
++  .rel.bss        : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) }
++  .rela.bss       : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) }
++  .rel.plt        : { *(.rel.plt) }
++  .rela.plt       : { *(.rela.plt) }
++  .init           : {
++    KEEP (*(.init))
++  } =0x90909090
++  .plt            : { *(.plt) }
++  .text           : {
++    *(.text .stub .text.* .gnu.linkonce.t.*)
++    /* .gnu.warning sections are handled specially by elf32.em.  */
++    *(.gnu.warning)
++  } =0x90909090
++  .fini           : {
++    KEEP (*(.fini))
++  } =0x90909090
++
++  PROVIDE (__etext = .);
++  PROVIDE (_etext = .);
++  PROVIDE (etext = .);
++  .rodata         : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
++  .rodata1        : { *(.rodata1) }
++  .eh_frame_hdr : { *(.eh_frame_hdr) }
++
++
++  . = ALIGN(4096);
++  PROVIDE (_sdata = .);
++
++include(`arch/um/common.ld.in')
++
++  /* Ensure the __preinit_array_start label is properly aligned.  We
++     could instead move the label definition inside the section, but
++     the linker would then create the section even if it turns out to
++     be empty, which isn't pretty.  */
++  . = ALIGN(32 / 8);
++  .preinit_array     : { *(.preinit_array) }
++  .init_array     : { *(.init_array) }
++  .fini_array     : { *(.fini_array) }
++  .data           : {
++    . = ALIGN(KERNEL_STACK_SIZE);             /* init_task */
++    *(.data.init_task)
++    *(.data .data.* .gnu.linkonce.d.*)
++    SORT(CONSTRUCTORS)
++  }
++  .data1          : { *(.data1) }
++  .tdata        : { *(.tdata .tdata.* .gnu.linkonce.td.*) }
++  .tbss                 : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) }
++  .eh_frame       : { KEEP (*(.eh_frame)) }
++  .gcc_except_table   : { *(.gcc_except_table) }
++  .dynamic        : { *(.dynamic) }
++  .ctors          : {
++    /* gcc uses crtbegin.o to find the start of
++       the constructors, so we make sure it is
++       first.  Because this is a wildcard, it
++       doesn't matter if the user does not
++       actually link against crtbegin.o; the
++       linker won't look for a file to match a
++       wildcard.  The wildcard also means that it
++       doesn't matter which directory crtbegin.o
++       is in.  */
++    KEEP (*crtbegin.o(.ctors))
++    /* We don't want to include the .ctor section from
++       the crtend.o file until after the sorted ctors.
++       The .ctor section from the crtend file contains the
++       end of ctors marker and it must be last */
++    KEEP (*(EXCLUDE_FILE (*crtend.o ) .ctors))
++    KEEP (*(SORT(.ctors.*)))
++    KEEP (*(.ctors))
++  }
++  .dtors          : {
++    KEEP (*crtbegin.o(.dtors))
++    KEEP (*(EXCLUDE_FILE (*crtend.o ) .dtors))
++    KEEP (*(SORT(.dtors.*)))
++    KEEP (*(.dtors))
++  }
++  .jcr            : { KEEP (*(.jcr)) }
++  .got            : { *(.got.plt) *(.got) }
++  _edata = .;
++  PROVIDE (edata = .);
++  __bss_start = .;
++  .bss            : {
++   *(.dynbss)
++   *(.bss .bss.* .gnu.linkonce.b.*)
++   *(COMMON)
++   /* Align here to ensure that the .bss section occupies space up to
++      _end.  Align after .bss to ensure correct alignment even if the
++      .bss section disappears because there are no input sections.  */
++   . = ALIGN(32 / 8);
++  . = ALIGN(32 / 8);
++  }
++  _end = .;
++  PROVIDE (end = .);
++   /* Stabs debugging sections.  */
++  .stab          0 : { *(.stab) }
++  .stabstr       0 : { *(.stabstr) }
++  .stab.excl     0 : { *(.stab.excl) }
++  .stab.exclstr  0 : { *(.stab.exclstr) }
++  .stab.index    0 : { *(.stab.index) }
++  .stab.indexstr 0 : { *(.stab.indexstr) }
++  .comment       0 : { *(.comment) }
++  /* DWARF debug sections.
++     Symbols in the DWARF debugging sections are relative to the beginning
++     of the section so we begin them at 0.  */
++  /* DWARF 1 */
++  .debug          0 : { *(.debug) }
++  .line           0 : { *(.line) }
++  /* GNU DWARF 1 extensions */
++  .debug_srcinfo  0 : { *(.debug_srcinfo) }
++  .debug_sfnames  0 : { *(.debug_sfnames) }
++  /* DWARF 1.1 and DWARF 2 */
++  .debug_aranges  0 : { *(.debug_aranges) }
++  .debug_pubnames 0 : { *(.debug_pubnames) }
++  /* DWARF 2 */
++  .debug_info     0 : { *(.debug_info .gnu.linkonce.wi.*) }
++  .debug_abbrev   0 : { *(.debug_abbrev) }
++  .debug_line     0 : { *(.debug_line) }
++  .debug_frame    0 : { *(.debug_frame) }
++  .debug_str      0 : { *(.debug_str) }
++  .debug_loc      0 : { *(.debug_loc) }
++  .debug_macinfo  0 : { *(.debug_macinfo) }
++  /* SGI/MIPS DWARF 2 extensions */
++  .debug_weaknames 0 : { *(.debug_weaknames) }
++  .debug_funcnames 0 : { *(.debug_funcnames) }
++  .debug_typenames 0 : { *(.debug_typenames) }
++  .debug_varnames  0 : { *(.debug_varnames) }
++}
+diff -Naur -X ../exclude-files orig/arch/um/fs/Makefile um/arch/um/fs/Makefile
+--- orig/arch/um/fs/Makefile   Wed Dec 31 19:00:00 1969
++++ um/arch/um/fs/Makefile     Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,23 @@
++# 
++# Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++O_TARGET := built-in.o
++
++subdir-y =
++subdir-m =
++
++subdir-$(CONFIG_HOSTFS) += hostfs
++subdir-$(CONFIG_HPPFS) += hppfs
++
++obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o))
++obj-m += $(join $(subdir-m),$(subdir-m:%=/%.o))
++
++include $(TOPDIR)/Rules.make
++
++dep:
++
++clean:
++
++archmrproper:
+diff -Naur -X ../exclude-files orig/arch/um/fs/hostfs/Makefile um/arch/um/fs/hostfs/Makefile
+--- orig/arch/um/fs/hostfs/Makefile    Wed Dec 31 19:00:00 1969
++++ um/arch/um/fs/hostfs/Makefile      Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,24 @@
++# 
++# Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++# struct stat64 changed the inode field name between 2.2 and 2.4 from st_ino
++# to __st_ino.  It stayed in the same place, so as long as the correct name
++# is used, hostfs compiled on 2.2 should work on 2.4 and vice versa.
++
++STAT64_INO_FIELD := $(shell grep -q __st_ino /usr/include/bits/stat.h && \
++                              echo __)st_ino
++
++USER_CFLAGS := $(USER_CFLAGS) -DSTAT64_INO_FIELD=$(STAT64_INO_FIELD)
++
++O_TARGET := hostfs.o
++obj-y = hostfs_kern.o hostfs_user.o
++obj-m = $(O_TARGET)
++
++USER_OBJS = $(filter %_user.o,$(obj-y))
++
++include $(TOPDIR)/Rules.make
++
++$(USER_OBJS) : %.o: %.c
++      $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $<
+diff -Naur -X ../exclude-files orig/arch/um/fs/hostfs/hostfs.h um/arch/um/fs/hostfs/hostfs.h
+--- orig/arch/um/fs/hostfs/hostfs.h    Wed Dec 31 19:00:00 1969
++++ um/arch/um/fs/hostfs/hostfs.h      Mon Feb 24 23:00:47 2003
+@@ -0,0 +1,69 @@
++#ifndef __UM_FS_HOSTFS
++#define __UM_FS_HOSTFS
++
++#include "os.h"
++
++/* These are exactly the same definitions as in fs.h, but the names are 
++ * changed so that this file can be included in both kernel and user files.
++ */
++
++#define HOSTFS_ATTR_MODE      1
++#define HOSTFS_ATTR_UID       2
++#define HOSTFS_ATTR_GID       4
++#define HOSTFS_ATTR_SIZE      8
++#define HOSTFS_ATTR_ATIME     16
++#define HOSTFS_ATTR_MTIME     32
++#define HOSTFS_ATTR_CTIME     64
++#define HOSTFS_ATTR_ATIME_SET 128
++#define HOSTFS_ATTR_MTIME_SET 256
++#define HOSTFS_ATTR_FORCE     512     /* Not a change, but a change it */
++#define HOSTFS_ATTR_ATTR_FLAG 1024
++
++struct hostfs_iattr {
++      unsigned int    ia_valid;
++      mode_t          ia_mode;
++      uid_t           ia_uid;
++      gid_t           ia_gid;
++      loff_t          ia_size;
++      time_t          ia_atime;
++      time_t          ia_mtime;
++      time_t          ia_ctime;
++      unsigned int    ia_attr_flags;
++};
++
++extern int stat_file(const char *path, int *dev_out, 
++                   unsigned long long *inode_out, int *mode_out, 
++                   int *nlink_out, int *uid_out, int *gid_out, 
++                   unsigned long long *size_out, unsigned long *atime_out, 
++                   unsigned long *mtime_out, unsigned long *ctime_out, 
++                   int *blksize_out, unsigned long long *blocks_out);
++extern int access_file(char *path, int r, int w, int x);
++extern int open_file(char *path, int r, int w);
++extern int file_type(const char *path, int *rdev);
++extern void *open_dir(char *path, int *err_out);
++extern char *read_dir(void *stream, unsigned long long *pos, 
++                    unsigned long long *ino_out, int *len_out);
++extern void close_file(void *stream);
++extern void close_dir(void *stream);
++extern int read_file(int fd, unsigned long long *offset, char *buf, int len);
++extern int write_file(int fd, unsigned long long *offset, const char *buf,
++                    int len);
++extern int lseek_file(int fd, long long offset, int whence);
++extern int file_create(char *name, int ur, int uw, int ux, int gr, 
++                     int gw, int gx, int or, int ow, int ox);
++extern int set_attr(const char *file, struct hostfs_iattr *attrs);
++extern int make_symlink(const char *from, const char *to);
++extern int unlink_file(const char *file);
++extern int do_mkdir(const char *file, int mode);
++extern int do_rmdir(const char *file);
++extern int do_mknod(const char *file, int mode, int dev);
++extern int link_file(const char *from, const char *to);
++extern int do_readlink(char *file, char *buf, int size);
++extern int rename_file(char *from, char *to);
++extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, 
++                   long long *bfree_out, long long *bavail_out, 
++                   long long *files_out, long long *ffree_out, 
++                   void *fsid_out, int fsid_size, long *namelen_out, 
++                   long *spare_out);
++
++#endif
+diff -Naur -X ../exclude-files orig/arch/um/fs/hostfs/hostfs_kern.c um/arch/um/fs/hostfs/hostfs_kern.c
+--- orig/arch/um/fs/hostfs/hostfs_kern.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/fs/hostfs/hostfs_kern.c Sun Apr 13 21:29:33 2003
+@@ -0,0 +1,870 @@
++/* 
++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <linux/stddef.h>
++#include <linux/fs.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/slab.h>
++#include <linux/pagemap.h>
++#include <linux/blkdev.h>
++#include <asm/uaccess.h>
++#include "hostfs.h"
++#include "kern_util.h"
++#include "kern.h"
++#include "user_util.h"
++#include "2_5compat.h"
++
++#define file_hostfs_i(file) (&(file)->f_dentry->d_inode->u.hostfs_i)
++
++int hostfs_d_delete(struct dentry *dentry)
++{
++      return(1);
++}
++
++struct dentry_operations hostfs_dentry_ops = {
++      .d_delete               = hostfs_d_delete,
++};
++
++/* Not changed */
++static char *root_ino = "/";
++
++#define HOSTFS_SUPER_MAGIC 0x00c0ffee
++
++static struct inode_operations hostfs_iops;
++static struct inode_operations hostfs_dir_iops;
++static struct address_space_operations hostfs_link_aops;
++
++static char *dentry_name(struct dentry *dentry, int extra)
++{
++      struct dentry *parent;
++      char *root, *name;
++      int len;
++
++      len = 0;
++      parent = dentry;
++      while(parent->d_parent != parent){
++              len += parent->d_name.len + 1;
++              parent = parent->d_parent;
++      }
++      
++      root = parent->d_inode->u.hostfs_i.host_filename;
++      len += strlen(root);
++      name = kmalloc(len + extra + 1, GFP_KERNEL);
++      if(name == NULL) return(NULL);
++
++      name[len] = '\0';
++      parent = dentry;
++      while(parent->d_parent != parent){
++              len -= parent->d_name.len + 1;
++              name[len] = '/';
++              strncpy(&name[len + 1], parent->d_name.name, 
++                      parent->d_name.len);
++              parent = parent->d_parent;
++      }
++      strncpy(name, root, strlen(root));
++      return(name);
++}
++
++static char *inode_name(struct inode *ino, int extra)
++{
++      struct dentry *dentry;
++
++      dentry = list_entry(ino->i_dentry.next, struct dentry, d_alias);
++      return(dentry_name(dentry, extra));
++}
++
++static int read_name(struct inode *ino, char *name)
++{
++      /* The non-int inode fields are copied into ints by stat_file and
++       * then copied into the inode because passing the actual pointers
++       * in and having them treated as int * breaks on big-endian machines
++       */
++      int err;
++      int i_dev, i_mode, i_nlink, i_blksize;
++      unsigned long long i_size;
++      unsigned long long i_ino;
++      unsigned long long i_blocks;
++      err = stat_file(name, &i_dev, &i_ino, &i_mode, &i_nlink, 
++                      &ino->i_uid, &ino->i_gid, &i_size, &ino->i_atime, 
++                      &ino->i_mtime, &ino->i_ctime, &i_blksize, &i_blocks);
++      if(err) return(err);
++      ino->i_ino = i_ino;
++      ino->i_dev = i_dev;
++      ino->i_mode = i_mode;
++      ino->i_nlink = i_nlink;
++      ino->i_size = i_size;
++      ino->i_blksize = i_blksize;
++      ino->i_blocks = i_blocks;
++      if(kdev_same(ino->i_sb->s_dev, ROOT_DEV) && (ino->i_uid == getuid()))
++              ino->i_uid = 0;
++      return(0);
++}
++
++static char *follow_link(char *link)
++{
++      int len, n;
++      char *name, *resolved, *end;
++
++      len = 64;
++      while(1){
++              n = -ENOMEM;
++              name = kmalloc(len, GFP_KERNEL);
++              if(name == NULL)
++                      goto out;
++
++              n = do_readlink(link, name, len);
++              if(n < len)
++                      break;
++              len *= 2;
++              kfree(name);
++      }
++      if(n < 0)
++              goto out_free;
++
++      if(*name == '/')
++              return(name);
++
++      end = strrchr(link, '/');
++      if(end == NULL)
++              return(name);
++
++      *(end + 1) = '\0';
++      len = strlen(link) + strlen(name) + 1;
++
++      resolved = kmalloc(len, GFP_KERNEL);
++      if(resolved == NULL){
++              n = -ENOMEM;
++              goto out_free;
++      }
++
++      sprintf(resolved, "%s%s", link, name);
++      kfree(name);
++      kfree(link);
++      return(resolved);
++
++ out_free:
++      kfree(name);
++ out:
++      return(ERR_PTR(n));
++}
++
++static int read_inode(struct inode *ino)
++{
++      char *name;
++      int err;
++
++      err = -ENOMEM;
++      name = inode_name(ino, 0);
++      if(name == NULL) 
++              goto out;
++
++      if(file_type(name, NULL) == OS_TYPE_SYMLINK){
++              name = follow_link(name);
++              if(IS_ERR(name)){
++                      err = PTR_ERR(name);
++                      goto out;
++              }
++      }
++      
++      err = read_name(ino, name);
++      kfree(name);
++ out:
++      return(err);
++}
++
++void hostfs_delete_inode(struct inode *ino)
++{
++      if(ino->u.hostfs_i.host_filename) 
++              kfree(ino->u.hostfs_i.host_filename);
++      ino->u.hostfs_i.host_filename = NULL;
++
++      if(ino->u.hostfs_i.fd != -1) 
++              close_file(&ino->u.hostfs_i.fd);
++
++      ino->u.hostfs_i.mode = 0;
++      clear_inode(ino);
++}
++
++int hostfs_statfs(struct super_block *sb, struct statfs *sf)
++{
++      /* do_statfs uses struct statfs64 internally, but the linux kernel
++       * struct statfs still has 32-bit versions for most of these fields,
++       * so we convert them here
++       */
++      int err;
++      long long f_blocks;
++      long long f_bfree;
++      long long f_bavail;
++      long long f_files;
++      long long f_ffree;
++
++      err = do_statfs(sb->s_root->d_inode->u.hostfs_i.host_filename,
++                      &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
++                      &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), 
++                      &sf->f_namelen, sf->f_spare);
++      if(err) return(err);
++      sf->f_blocks = f_blocks;
++      sf->f_bfree = f_bfree;
++      sf->f_bavail = f_bavail;
++      sf->f_files = f_files;
++      sf->f_ffree = f_ffree;
++      sf->f_type = HOSTFS_SUPER_MAGIC;
++      return(0);
++}
++
++static struct super_operations hostfs_sbops = { 
++      .put_inode      = force_delete,
++      .delete_inode   = hostfs_delete_inode,
++      .statfs         = hostfs_statfs,
++};
++
++int hostfs_readdir(struct file *file, void *ent, filldir_t filldir)
++{
++      void *dir;
++      char *name;
++      unsigned long long next, ino;
++      int error, len;
++
++      name = dentry_name(file->f_dentry, 0);
++      if(name == NULL) return(-ENOMEM);
++      dir = open_dir(name, &error);
++      kfree(name);
++      if(dir == NULL) return(-error);
++      next = file->f_pos;
++      while((name = read_dir(dir, &next, &ino, &len)) != NULL){
++              error = (*filldir)(ent, name, len, file->f_pos, 
++                                 ino, DT_UNKNOWN);
++              if(error) break;
++              file->f_pos = next;
++      }
++      close_dir(dir);
++      return(0);
++}
++
++int hostfs_file_open(struct inode *ino, struct file *file)
++{
++      char *name;
++      int mode = 0, r = 0, w = 0, fd;
++
++      mode = file->f_mode & (FMODE_READ | FMODE_WRITE);
++      if((mode & ino->u.hostfs_i.mode) == mode)
++              return(0);
++
++      /* The file may already have been opened, but with the wrong access,
++       * so this resets things and reopens the file with the new access.
++       */
++      if(ino->u.hostfs_i.fd != -1){
++              close_file(&ino->u.hostfs_i.fd);
++              ino->u.hostfs_i.fd = -1;
++      }
++
++      ino->u.hostfs_i.mode |= mode;
++      if(ino->u.hostfs_i.mode & FMODE_READ) 
++              r = 1;
++      if(ino->u.hostfs_i.mode & FMODE_WRITE) 
++              w = 1;
++      if(w) 
++              r = 1;
++
++      name = dentry_name(file->f_dentry, 0);
++      if(name == NULL) 
++              return(-ENOMEM);
++
++      fd = open_file(name, r, w);
++      kfree(name);
++      if(fd < 0) return(fd);
++      file_hostfs_i(file)->fd = fd;
++
++      return(0);
++}
++
++int hostfs_dir_open(struct inode *ino, struct file *file)
++{
++      return(0);      
++}
++
++int hostfs_dir_release(struct inode *ino, struct file *file)
++{
++      return(0);
++}
++
++int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync)
++{
++      return(0);
++}
++
++static struct file_operations hostfs_file_fops = {
++      .owner          = NULL,
++      .read           = generic_file_read,
++      .write          = generic_file_write,
++      .mmap           = generic_file_mmap,
++      .open           = hostfs_file_open,
++      .release        = NULL,
++      .fsync          = hostfs_fsync,
++};
++
++static struct file_operations hostfs_dir_fops = {
++      .owner          = NULL,
++      .readdir        = hostfs_readdir,
++      .open           = hostfs_dir_open,
++      .release        = hostfs_dir_release,
++      .fsync          = hostfs_fsync,
++};
++
++int hostfs_writepage(struct page *page)
++{
++      struct address_space *mapping = page->mapping;
++      struct inode *inode = mapping->host;
++      char *buffer;
++      unsigned long long base;
++      int count = PAGE_CACHE_SIZE;
++      int end_index = inode->i_size >> PAGE_CACHE_SHIFT;
++      int err;
++
++      if (page->index >= end_index)
++              count = inode->i_size & (PAGE_CACHE_SIZE-1);
++
++      buffer = kmap(page);
++      base = ((unsigned long long) page->index) << PAGE_CACHE_SHIFT;
++
++      err = write_file(inode->u.hostfs_i.fd, &base, buffer, count);
++      if(err != count){
++              ClearPageUptodate(page);
++              goto out;
++      }
++
++      if (base > inode->i_size)
++              inode->i_size = base;
++
++      if (PageError(page))
++              ClearPageError(page);   
++      err = 0;
++
++ out: 
++      kunmap(page);
++
++      UnlockPage(page);
++      return err; 
++}
++
++int hostfs_readpage(struct file *file, struct page *page)
++{
++      char *buffer;
++      long long start;
++      int err = 0;
++
++      start = (long long) page->index << PAGE_CACHE_SHIFT;
++      buffer = kmap(page);
++      err = read_file(file_hostfs_i(file)->fd, &start, buffer,
++                      PAGE_CACHE_SIZE);
++      if(err < 0) goto out;
++
++      flush_dcache_page(page);
++      SetPageUptodate(page);
++      if (PageError(page)) ClearPageError(page);
++      err = 0;
++ out:
++      kunmap(page);
++      UnlockPage(page);
++      return(err);
++}
++
++int hostfs_prepare_write(struct file *file, struct page *page, 
++                       unsigned int from, unsigned int to)
++{
++      char *buffer;
++      long long start, tmp;
++      int err;
++
++      start = (long long) page->index << PAGE_CACHE_SHIFT;
++      buffer = kmap(page);
++      if(from != 0){
++              tmp = start;
++              err = read_file(file_hostfs_i(file)->fd, &tmp, buffer,
++                              from);
++              if(err < 0) goto out;
++      }
++      if(to != PAGE_CACHE_SIZE){
++              start += to;
++              err = read_file(file_hostfs_i(file)->fd, &start, buffer + to,
++                              PAGE_CACHE_SIZE - to);
++              if(err < 0) goto out;           
++      }
++      err = 0;
++ out:
++      kunmap(page);
++      return(err);
++}
++
++int hostfs_commit_write(struct file *file, struct page *page, unsigned from,
++               unsigned to)
++{
++      struct address_space *mapping = page->mapping;
++      struct inode *inode = mapping->host;
++      char *buffer;
++      long long start;
++      int err = 0;
++
++      start = ((long long) page->index << PAGE_CACHE_SHIFT) + from;
++      buffer = kmap(page);
++      err = write_file(file_hostfs_i(file)->fd, &start, buffer + from, 
++                       to - from);
++      if(err > 0) err = 0;
++      if(!err && (start > inode->i_size))
++              inode->i_size = start;
++
++      kunmap(page);
++      return(err);
++}
++
++static struct address_space_operations hostfs_aops = {
++      .writepage      = hostfs_writepage,
++      .readpage       = hostfs_readpage,
++      .prepare_write  = hostfs_prepare_write,
++      .commit_write   = hostfs_commit_write
++};
++
++static struct inode *get_inode(struct super_block *sb, struct dentry *dentry,
++                             int *error)
++{
++      struct inode *inode;
++      char *name;
++      int type, err = -ENOMEM, rdev;
++
++      inode = new_inode(sb);
++      if(inode == NULL) 
++              goto out;
++
++      inode->u.hostfs_i.host_filename = NULL;
++      inode->u.hostfs_i.fd = -1;
++      inode->u.hostfs_i.mode = 0;
++      insert_inode_hash(inode);
++      if(dentry){
++              name = dentry_name(dentry, 0);
++              if(name == NULL){
++                      err = -ENOMEM;
++                      goto out_put;
++              }
++              type = file_type(name, &rdev);
++              kfree(name);
++      }
++      else type = OS_TYPE_DIR;
++      inode->i_sb = sb;
++
++      err = 0;
++      if(type == OS_TYPE_SYMLINK)
++              inode->i_op = &page_symlink_inode_operations;
++      else if(type == OS_TYPE_DIR)
++              inode->i_op = &hostfs_dir_iops;
++      else inode->i_op = &hostfs_iops;
++
++      if(type == OS_TYPE_DIR) inode->i_fop = &hostfs_dir_fops;
++      else inode->i_fop = &hostfs_file_fops;
++
++      if(type == OS_TYPE_SYMLINK) 
++              inode->i_mapping->a_ops = &hostfs_link_aops;
++      else inode->i_mapping->a_ops = &hostfs_aops;
++
++      switch (type) {
++      case OS_TYPE_CHARDEV:
++              init_special_inode(inode, S_IFCHR, rdev);
++              break;
++      case OS_TYPE_BLOCKDEV:
++              init_special_inode(inode, S_IFBLK, rdev);
++              break;
++      case OS_TYPE_FIFO:
++              init_special_inode(inode, S_IFIFO, 0);
++              break;
++      case OS_TYPE_SOCK:
++              init_special_inode(inode, S_IFSOCK, 0);
++              break;
++      }
++      
++      if(error) *error = err;
++      return(inode);
++ out_put:
++      iput(inode);
++ out:
++      if(error) *error = err;
++      return(NULL);
++}
++
++int hostfs_create(struct inode *dir, struct dentry *dentry, int mode)
++{
++      struct inode *inode;
++      char *name;
++      int error;
++
++      inode = get_inode(dir->i_sb, dentry, &error);
++      if(error) return(error);
++      name = dentry_name(dentry, 0);
++      if(name == NULL){
++              iput(inode);
++              return(-ENOMEM);
++      }
++      error = file_create(name, 
++                          mode & S_IRUSR, mode & S_IWUSR, mode & S_IXUSR, 
++                          mode & S_IRGRP, mode & S_IWGRP, mode & S_IXGRP, 
++                          mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH);
++      if(!error) error = read_name(inode, name);
++      kfree(name);
++      if(error){
++              iput(inode);
++              return(error);
++      }
++      d_instantiate(dentry, inode);
++      return(0);
++}
++ 
++struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry)
++{
++      struct inode *inode;
++      char *name;
++      int error;
++
++      inode = get_inode(ino->i_sb, dentry, &error);
++      if(error != 0) return(ERR_PTR(error));
++      name = dentry_name(dentry, 0);
++      if(name == NULL) return(ERR_PTR(-ENOMEM));
++      error = read_name(inode, name);
++      kfree(name);
++      if(error){
++              iput(inode);
++              if(error == -ENOENT) inode = NULL;
++              else return(ERR_PTR(error));
++      }
++      d_add(dentry, inode);
++      dentry->d_op = &hostfs_dentry_ops;
++      return(NULL);
++}
++
++static char *inode_dentry_name(struct inode *ino, struct dentry *dentry)
++{
++        char *file;
++      int len;
++
++      file = inode_name(ino, dentry->d_name.len + 1);
++      if(file == NULL) return(NULL);
++        strcat(file, "/");
++      len = strlen(file);
++        strncat(file, dentry->d_name.name, dentry->d_name.len);
++      file[len + dentry->d_name.len] = '\0';
++        return(file);
++}
++
++int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from)
++{
++        char *from_name, *to_name;
++        int err;
++
++        if((from_name = inode_dentry_name(ino, from)) == NULL) 
++                return(-ENOMEM);
++        to_name = dentry_name(to, 0);
++      if(to_name == NULL){
++              kfree(from_name);
++              return(-ENOMEM);
++      }
++        err = link_file(to_name, from_name);
++        kfree(from_name);
++        kfree(to_name);
++        return(err);
++}
++
++int hostfs_unlink(struct inode *ino, struct dentry *dentry)
++{
++      char *file;
++      int err;
++
++      if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM);
++      err = unlink_file(file);
++      kfree(file);
++      return(err);
++}
++
++int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to)
++{
++      char *file;
++      int err;
++
++      if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM);
++      err = make_symlink(file, to);
++      kfree(file);
++      return(err);
++}
++
++int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode)
++{
++      char *file;
++      int err;
++
++      if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM);
++      err = do_mkdir(file, mode);
++      kfree(file);
++      return(err);
++}
++
++int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
++{
++      char *file;
++      int err;
++
++      if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM);
++      err = do_rmdir(file);
++      kfree(file);
++      return(err);
++}
++
++int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, int dev)
++{
++      struct inode *inode;
++      char *name;
++      int error;
++ 
++      inode = get_inode(dir->i_sb, dentry, &error);
++      if(error) return(error);
++      name = dentry_name(dentry, 0);
++      if(name == NULL){
++              iput(inode);
++              return(-ENOMEM);
++      }
++      init_special_inode(inode, mode, dev);
++      error = do_mknod(name, mode, dev);
++      if(!error) error = read_name(inode, name);
++      kfree(name);
++      if(error){
++              iput(inode);
++              return(error);
++      }
++      d_instantiate(dentry, inode);
++      return(0);
++}
++
++int hostfs_rename(struct inode *from_ino, struct dentry *from,
++                struct inode *to_ino, struct dentry *to)
++{
++      char *from_name, *to_name;
++      int err;
++
++      if((from_name = inode_dentry_name(from_ino, from)) == NULL)
++              return(-ENOMEM);
++      if((to_name = inode_dentry_name(to_ino, to)) == NULL){
++              kfree(from_name);
++              return(-ENOMEM);
++      }
++      err = rename_file(from_name, to_name);
++      kfree(from_name);
++      kfree(to_name);
++      return(err);
++}
++
++void hostfs_truncate(struct inode *ino)
++{
++      not_implemented();
++}
++
++int hostfs_permission(struct inode *ino, int desired)
++{
++      char *name;
++      int r = 0, w = 0, x = 0, err;
++
++      if(desired & MAY_READ) r = 1;
++      if(desired & MAY_WRITE) w = 1;
++      if(desired & MAY_EXEC) x = 1;
++      name = inode_name(ino, 0);
++      if(name == NULL) return(-ENOMEM);
++      err = access_file(name, r, w, x);
++      kfree(name);
++      if(!err) err = vfs_permission(ino, desired);
++      return(err);
++}
++
++int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
++{
++      struct hostfs_iattr attrs;
++      char *name;
++      int err;
++
++      attrs.ia_valid = 0;
++      if(attr->ia_valid & ATTR_MODE){
++              attrs.ia_valid |= HOSTFS_ATTR_MODE;
++              attrs.ia_mode = attr->ia_mode;
++      }
++      if(attr->ia_valid & ATTR_UID){
++              if(kdev_same(dentry->d_inode->i_sb->s_dev, ROOT_DEV) && 
++                 (attr->ia_uid == 0))
++                      attr->ia_uid = getuid();
++              attrs.ia_valid |= HOSTFS_ATTR_UID;
++              attrs.ia_uid = attr->ia_uid;
++      }
++      if(attr->ia_valid & ATTR_GID){
++              if(kdev_same(dentry->d_inode->i_sb->s_dev, ROOT_DEV) && 
++                 (attr->ia_gid == 0))
++                      attr->ia_gid = getuid();
++              attrs.ia_valid |= HOSTFS_ATTR_GID;
++              attrs.ia_gid = attr->ia_gid;
++      }
++      if(attr->ia_valid & ATTR_SIZE){
++              attrs.ia_valid |= HOSTFS_ATTR_SIZE;
++              attrs.ia_size = attr->ia_size;
++      }
++      if(attr->ia_valid & ATTR_ATIME){
++              attrs.ia_valid |= HOSTFS_ATTR_ATIME;
++              attrs.ia_atime = attr->ia_atime;
++      }
++      if(attr->ia_valid & ATTR_MTIME){
++              attrs.ia_valid |= HOSTFS_ATTR_MTIME;
++              attrs.ia_mtime = attr->ia_mtime;
++      }
++      if(attr->ia_valid & ATTR_CTIME){
++              attrs.ia_valid |= HOSTFS_ATTR_CTIME;
++              attrs.ia_ctime = attr->ia_ctime;
++      }
++      if(attr->ia_valid & ATTR_ATIME_SET){
++              attrs.ia_valid |= HOSTFS_ATTR_ATIME_SET;
++      }
++      if(attr->ia_valid & ATTR_MTIME_SET){
++              attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET;
++      }
++      name = dentry_name(dentry, 0);
++      if(name == NULL) return(-ENOMEM);
++      err = set_attr(name, &attrs);
++      kfree(name);
++      if(err)
++              return(err);
++
++      return(inode_setattr(dentry->d_inode, attr));
++}
++
++int hostfs_getattr(struct dentry *dentry, struct iattr *attr)
++{
++      not_implemented();
++      return(-EINVAL);
++}
++
++static struct inode_operations hostfs_iops = {
++      .create         = hostfs_create,
++      .link           = hostfs_link,
++      .unlink         = hostfs_unlink,
++      .symlink        = hostfs_symlink,
++      .mkdir          = hostfs_mkdir,
++      .rmdir          = hostfs_rmdir,
++      .mknod          = hostfs_mknod,
++      .rename         = hostfs_rename,
++      .truncate       = hostfs_truncate,
++      .permission     = hostfs_permission,
++      .setattr        = hostfs_setattr,
++      .getattr        = hostfs_getattr,
++};
++
++static struct inode_operations hostfs_dir_iops = {
++      .create         = hostfs_create,
++      .lookup         = hostfs_lookup,
++      .link           = hostfs_link,
++      .unlink         = hostfs_unlink,
++      .symlink        = hostfs_symlink,
++      .mkdir          = hostfs_mkdir,
++      .rmdir          = hostfs_rmdir,
++      .mknod          = hostfs_mknod,
++      .rename         = hostfs_rename,
++      .truncate       = hostfs_truncate,
++      .permission     = hostfs_permission,
++      .setattr        = hostfs_setattr,
++      .getattr        = hostfs_getattr,
++};
++
++int hostfs_link_readpage(struct file *file, struct page *page)
++{
++      char *buffer, *name;
++      long long start;
++      int err;
++
++      start = (long long) page->index << PAGE_CACHE_SHIFT;
++      buffer = kmap(page);
++      name = inode_name(page->mapping->host, 0);
++      if(name == NULL) return(-ENOMEM);
++      err = do_readlink(name, buffer, PAGE_CACHE_SIZE);
++      kfree(name);
++      if(err == PAGE_CACHE_SIZE)
++              err = -E2BIG;
++      else if(err > 0){
++              flush_dcache_page(page);
++              SetPageUptodate(page);
++              if (PageError(page)) ClearPageError(page);
++              err = 0;
++      }
++      kunmap(page);
++      UnlockPage(page);
++      return(err);
++}
++
++static struct address_space_operations hostfs_link_aops = {
++      .readpage       = hostfs_link_readpage,
++};
++
++static struct super_block *hostfs_read_super_common(struct super_block *sb, 
++                                                  char *data)
++{
++      struct inode *root_inode;
++      char *name;
++
++      sb->s_blocksize = 1024;
++      sb->s_blocksize_bits = 10;
++      sb->s_magic = HOSTFS_SUPER_MAGIC;
++      sb->s_op = &hostfs_sbops;
++      if((data == NULL) || (*((char *) data) == '\0')) data = root_ino;
++      name = kmalloc(strlen(data) + 1, GFP_KERNEL);
++      if(name == NULL) return(NULL);
++      strcpy(name, data);
++      root_inode = get_inode(sb, NULL, NULL);
++      if(root_inode == NULL)
++              goto out_free;
++
++      root_inode->u.hostfs_i.host_filename = name;
++      sb->s_root = d_alloc_root(root_inode);
++      if(read_inode(root_inode))
++              goto out_put;
++      return(sb);
++
++ out_free:
++      kfree(name);
++ out_put:
++      iput(root_inode);
++      return(NULL);
++}
++
++struct super_block *hostfs_read_super(struct super_block *sb, void *data, 
++                                    int silent)
++{
++      return(hostfs_read_super_common(sb, data));
++}
++
++DECLARE_FSTYPE(hostfs_type, "hostfs", hostfs_read_super, 0);
++
++static int __init init_hostfs(void)
++{
++      return(register_filesystem(&hostfs_type));
++}
++
++static void __exit exit_hostfs(void)
++{
++      unregister_filesystem(&hostfs_type);
++}
++
++module_init(init_hostfs)
++module_exit(exit_hostfs)
++MODULE_LICENSE("GPL");
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/fs/hostfs/hostfs_user.c um/arch/um/fs/hostfs/hostfs_user.c
+--- orig/arch/um/fs/hostfs/hostfs_user.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/fs/hostfs/hostfs_user.c Fri Jan 31 21:48:30 2003
+@@ -0,0 +1,341 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <unistd.h>
++#include <stdio.h>
++#include <fcntl.h>
++#include <dirent.h>
++#include <errno.h>
++#include <utime.h>
++#include <string.h>
++#include <sys/stat.h>
++#include <sys/time.h>
++#include <sys/vfs.h>
++#include "hostfs.h"
++#include "kern_util.h"
++#include "user.h"
++
++int stat_file(const char *path, int *dev_out, unsigned long long *inode_out,
++            int *mode_out, int *nlink_out, int *uid_out, int *gid_out, 
++            unsigned long long *size_out, unsigned long *atime_out,
++            unsigned long *mtime_out, unsigned long *ctime_out,
++            int *blksize_out, unsigned long long *blocks_out)
++{
++      struct stat64 buf;
++
++      if(lstat64(path, &buf) < 0) 
++              return(-errno);
++      if(dev_out != NULL) *dev_out = buf.st_dev;
++
++      /* See the Makefile for why STAT64_INO_FIELD is passed in
++       * by the build
++       */
++      if(inode_out != NULL) *inode_out = buf.STAT64_INO_FIELD;
++      if(mode_out != NULL) *mode_out = buf.st_mode;
++      if(nlink_out != NULL) *nlink_out = buf.st_nlink;
++      if(uid_out != NULL) *uid_out = buf.st_uid;
++      if(gid_out != NULL) *gid_out = buf.st_gid;
++      if(size_out != NULL) *size_out = buf.st_size;
++      if(atime_out != NULL) *atime_out = buf.st_atime;
++      if(mtime_out != NULL) *mtime_out = buf.st_mtime;
++      if(ctime_out != NULL) *ctime_out = buf.st_ctime;
++      if(blksize_out != NULL) *blksize_out = buf.st_blksize;
++      if(blocks_out != NULL) *blocks_out = buf.st_blocks;
++      return(0);
++}
++
++int file_type(const char *path, int *rdev)
++{
++      struct stat64 buf;
++
++      if(lstat64(path, &buf) < 0) 
++              return(-errno);
++      if(rdev != NULL) 
++              *rdev = buf.st_rdev;
++
++      if(S_ISDIR(buf.st_mode)) return(OS_TYPE_DIR);
++      else if(S_ISLNK(buf.st_mode)) return(OS_TYPE_SYMLINK);
++      else if(S_ISCHR(buf.st_mode)) return(OS_TYPE_CHARDEV);
++      else if(S_ISBLK(buf.st_mode)) return(OS_TYPE_BLOCKDEV);
++      else if(S_ISFIFO(buf.st_mode))return(OS_TYPE_FIFO);
++      else if(S_ISSOCK(buf.st_mode))return(OS_TYPE_SOCK);
++      else return(OS_TYPE_FILE);
++}
++
++int access_file(char *path, int r, int w, int x)
++{
++      int mode = 0;
++
++      if(r) mode = R_OK;
++      if(w) mode |= W_OK;
++      if(x) mode |= X_OK;
++      if(access(path, mode) != 0) return(-errno);
++      else return(0);
++}
++
++int open_file(char *path, int r, int w)
++{
++      int mode = 0, fd;
++
++      if(r && !w) mode = O_RDONLY;
++      else if(!r && w) mode = O_WRONLY;
++      else if(r && w) mode = O_RDWR;
++      else panic("Impossible mode in open_file");
++      fd = open64(path, mode);
++      if(fd < 0) return(-errno);
++      else return(fd);
++}
++
++void *open_dir(char *path, int *err_out)
++{
++      DIR *dir;
++
++      dir = opendir(path);
++      *err_out = errno;
++      if(dir == NULL) return(NULL);
++      return(dir);
++}
++
++char *read_dir(void *stream, unsigned long long *pos, 
++             unsigned long long *ino_out, int *len_out)
++{
++      DIR *dir = stream;
++      struct dirent *ent;
++
++      seekdir(dir, *pos);
++      ent = readdir(dir);
++      if(ent == NULL) return(NULL);
++      *len_out = strlen(ent->d_name);
++      *ino_out = ent->d_ino;
++      *pos = telldir(dir);
++      return(ent->d_name);
++}
++
++int read_file(int fd, unsigned long long *offset, char *buf, int len)
++{
++      int n;
++
++      n = pread64(fd, buf, len, *offset);
++      if(n < 0) return(-errno);
++      *offset += n;
++      return(n);
++}
++
++int write_file(int fd, unsigned long long *offset, const char *buf, int len)
++{
++      int n;
++
++      n = pwrite64(fd, buf, len, *offset);
++      if(n < 0) return(-errno);
++      *offset += n;
++      return(n);
++}
++
++int lseek_file(int fd, long long offset, int whence)
++{
++      int ret;
++
++      ret = lseek64(fd, offset, whence);
++      if(ret < 0) return(-errno);
++      return(0);
++}
++
++void close_file(void *stream)
++{
++      close(*((int *) stream));
++}
++
++void close_dir(void *stream)
++{
++      closedir(stream);
++}
++
++int file_create(char *name, int ur, int uw, int ux, int gr, 
++              int gw, int gx, int or, int ow, int ox)
++{
++      int mode, fd;
++
++      mode = 0;
++      mode |= ur ? S_IRUSR : 0;
++      mode |= uw ? S_IWUSR : 0;
++      mode |= ux ? S_IXUSR : 0;
++      mode |= gr ? S_IRGRP : 0;
++      mode |= gw ? S_IWGRP : 0;
++      mode |= gx ? S_IXGRP : 0;
++      mode |= or ? S_IROTH : 0;
++      mode |= ow ? S_IWOTH : 0;
++      mode |= ox ? S_IXOTH : 0;
++      fd = open64(name, O_CREAT, mode);
++      if(fd < 0) return(-errno);
++      close(fd);
++      return(0);
++}
++
++int set_attr(const char *file, struct hostfs_iattr *attrs)
++{
++      struct utimbuf buf;
++      int err, ma;
++
++      if(attrs->ia_valid & HOSTFS_ATTR_MODE){
++              if(chmod(file, attrs->ia_mode) != 0) return(-errno);
++      }
++      if(attrs->ia_valid & HOSTFS_ATTR_UID){
++              if(chown(file, attrs->ia_uid, -1)) return(-errno);
++      }
++      if(attrs->ia_valid & HOSTFS_ATTR_GID){
++              if(chown(file, -1, attrs->ia_gid)) return(-errno);
++      }
++      if(attrs->ia_valid & HOSTFS_ATTR_SIZE){
++              if(truncate(file, attrs->ia_size)) return(-errno);
++      }
++      ma = HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET;
++      if((attrs->ia_valid & ma) == ma){
++              buf.actime = attrs->ia_atime;
++              buf.modtime = attrs->ia_mtime;
++              if(utime(file, &buf) != 0) return(-errno);
++      }
++      else {
++              if(attrs->ia_valid & HOSTFS_ATTR_ATIME_SET){
++                      err = stat_file(file, NULL, NULL, NULL, NULL, NULL, 
++                                      NULL, NULL, NULL, &buf.modtime, NULL,
++                                      NULL, NULL);
++                      if(err != 0) return(err);
++                      buf.actime = attrs->ia_atime;
++                      if(utime(file, &buf) != 0) return(-errno);
++              }
++              if(attrs->ia_valid & HOSTFS_ATTR_MTIME_SET){
++                      err = stat_file(file, NULL, NULL, NULL, NULL, NULL, 
++                                      NULL, NULL, &buf.actime, NULL, NULL, 
++                                      NULL, NULL);
++                      if(err != 0) return(err);
++                      buf.modtime = attrs->ia_mtime;
++                      if(utime(file, &buf) != 0) return(-errno);
++              }
++      }
++      if(attrs->ia_valid & HOSTFS_ATTR_CTIME) ;
++      if(attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)){
++              err = stat_file(file, NULL, NULL, NULL, NULL, NULL, 
++                              NULL, NULL, &attrs->ia_atime, &attrs->ia_mtime,
++                              NULL, NULL, NULL);
++              if(err != 0) return(err);
++      }
++      return(0);
++}
++
++int make_symlink(const char *from, const char *to)
++{
++      int err;
++
++      err = symlink(to, from);
++      if(err) return(-errno);
++      return(0);
++}
++
++int unlink_file(const char *file)
++{
++      int err;
++
++      err = unlink(file);
++      if(err) return(-errno);
++      return(0);
++}
++
++int do_mkdir(const char *file, int mode)
++{
++      int err;
++
++      err = mkdir(file, mode);
++      if(err) return(-errno);
++      return(0);
++}
++
++int do_rmdir(const char *file)
++{
++      int err;
++
++      err = rmdir(file);
++      if(err) return(-errno);
++      return(0);
++}
++
++int do_mknod(const char *file, int mode, int dev)
++{
++      int err;
++
++      err = mknod(file, mode, dev);
++      if(err) return(-errno);
++      return(0);
++}
++
++int link_file(const char *to, const char *from)
++{
++      int err;
++
++      err = link(to, from);
++      if(err) return(-errno);
++      return(0);
++}
++
++int do_readlink(char *file, char *buf, int size)
++{
++      int n;
++
++      n = readlink(file, buf, size);
++      if(n < 0) 
++              return(-errno);
++      if(n < size) 
++              buf[n] = '\0';
++      return(n);
++}
++
++int rename_file(char *from, char *to)
++{
++      int err;
++
++      err = rename(from, to);
++      if(err < 0) return(-errno);
++      return(0);      
++}
++
++int do_statfs(char *root, long *bsize_out, long long *blocks_out, 
++            long long *bfree_out, long long *bavail_out, 
++            long long *files_out, long long *ffree_out,
++            void *fsid_out, int fsid_size, long *namelen_out, 
++            long *spare_out)
++{
++      struct statfs64 buf;
++      int err;
++
++      err = statfs64(root, &buf);
++      if(err < 0) return(-errno);
++      *bsize_out = buf.f_bsize;
++      *blocks_out = buf.f_blocks;
++      *bfree_out = buf.f_bfree;
++      *bavail_out = buf.f_bavail;
++      *files_out = buf.f_files;
++      *ffree_out = buf.f_ffree;
++      memcpy(fsid_out, &buf.f_fsid, 
++             sizeof(buf.f_fsid) > fsid_size ? fsid_size : 
++             sizeof(buf.f_fsid));
++      *namelen_out = buf.f_namelen;
++      spare_out[0] = buf.f_spare[0];
++      spare_out[1] = buf.f_spare[1];
++      spare_out[2] = buf.f_spare[2];
++      spare_out[3] = buf.f_spare[3];
++      spare_out[4] = buf.f_spare[4];
++      spare_out[5] = buf.f_spare[5];
++      return(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/fs/hppfs/Makefile um/arch/um/fs/hppfs/Makefile
+--- orig/arch/um/fs/hppfs/Makefile     Wed Dec 31 19:00:00 1969
++++ um/arch/um/fs/hppfs/Makefile       Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,10 @@
++O_TARGET := hppfs.o
++obj-y = hppfs_kern.o #hppfs_user.o
++obj-m = $(O_TARGET)
++
++CFLAGS_hppfs_kern.o := $(CFLAGS)
++#CFLAGS_hppfs_user.o := $(USER_CFLAGS)
++
++override CFLAGS =  
++
++include $(TOPDIR)/Rules.make
+diff -Naur -X ../exclude-files orig/arch/um/fs/hppfs/hppfs_kern.c um/arch/um/fs/hppfs/hppfs_kern.c
+--- orig/arch/um/fs/hppfs/hppfs_kern.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/fs/hppfs/hppfs_kern.c   Thu Feb 27 13:14:26 2003
+@@ -0,0 +1,725 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <linux/fs.h>
++#include <linux/module.h>
++#include <linux/slab.h>
++#include <linux/list.h>
++#include <linux/kernel.h>
++#include <linux/ctype.h>
++#include <asm/uaccess.h>
++#include "os.h"
++
++struct hppfs_data {
++      struct list_head list;
++      char contents[PAGE_SIZE - sizeof(struct list_head)];
++};
++
++struct hppfs_private {
++      struct file proc_file;
++      int host_fd;
++      loff_t len;
++      struct hppfs_data *contents;
++};
++
++#define HPPFS_SUPER_MAGIC 0xb00000ee
++
++static struct super_operations hppfs_sbops;
++
++static struct inode *get_inode(struct super_block *sb, struct dentry *dentry,
++                             int *error);
++
++static int is_pid(struct dentry *dentry)
++{
++      struct super_block *sb;
++      int i;
++
++      sb = dentry->d_sb;
++      if((sb->s_op != &hppfs_sbops) || (dentry->d_parent != sb->s_root))
++              return(0);
++
++      for(i = 0; i < dentry->d_name.len; i++){
++              if(!isdigit(dentry->d_name.name[i]))
++                      return(0);
++      }
++      return(1);
++}
++
++static char *dentry_name(struct dentry *dentry, int extra)
++{
++      struct dentry *parent;
++      char *root, *name;
++      const char *seg_name;
++      int len, seg_len;
++
++      len = 0;
++      parent = dentry;
++      while(parent->d_parent != parent){
++              if(is_pid(parent))
++                      len += strlen("pid") + 1;
++              else len += parent->d_name.len + 1;
++              parent = parent->d_parent;
++      }
++      
++      root = "proc";
++      len += strlen(root);
++      name = kmalloc(len + extra + 1, GFP_KERNEL);
++      if(name == NULL) return(NULL);
++
++      name[len] = '\0';
++      parent = dentry;
++      while(parent->d_parent != parent){
++              if(is_pid(parent)){
++                      seg_name = "pid";
++                      seg_len = strlen("pid");
++              }
++              else {
++                      seg_name = parent->d_name.name;
++                      seg_len = parent->d_name.len;
++              }
++
++              len -= seg_len + 1;
++              name[len] = '/';
++              strncpy(&name[len + 1], seg_name, seg_len);
++              parent = parent->d_parent;
++      }
++      strncpy(name, root, strlen(root));
++      return(name);
++}
++
++struct dentry_operations hppfs_dentry_ops = {
++};
++
++static int file_removed(struct dentry *dentry, const char *file)
++{
++      char *host_file;
++      int extra, fd;
++
++      extra = 0;
++      if(file != NULL) extra += strlen(file) + 1;
++
++      host_file = dentry_name(dentry, extra + strlen("/remove"));
++      if(host_file == NULL){
++              printk("file_removed : allocation failed\n");
++              return(-ENOMEM);
++      }
++
++      if(file != NULL){
++              strcat(host_file, "/");
++              strcat(host_file, file);
++      }
++      strcat(host_file, "/remove");
++
++      fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
++      kfree(host_file);
++      if(fd > 0){
++              os_close_file(fd);
++              return(1);
++      }
++      return(0);
++}
++
++static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry)
++{
++      struct dentry *proc_dentry;
++      struct inode *inode;
++      int err, deleted;
++
++      deleted = file_removed(dentry, NULL);
++      if(deleted < 0)
++              return(ERR_PTR(deleted));
++      else if(deleted)
++              return(ERR_PTR(-ENOENT));
++
++      proc_dentry = lookup_hash(&dentry->d_name, ino->u.hppfs_i.proc_dentry);
++      if(IS_ERR(proc_dentry))
++              return(proc_dentry);
++
++      inode = get_inode(ino->i_sb, proc_dentry, &err);
++      if(err != 0) 
++              return(ERR_PTR(err));
++
++      d_add(dentry, inode);
++      dentry->d_op = &hppfs_dentry_ops;
++      return(NULL);
++}
++
++static struct inode_operations hppfs_file_iops = {
++};
++
++static struct inode_operations hppfs_dir_iops = {
++      .lookup         = hppfs_lookup,
++};
++
++static ssize_t read_proc(struct file *file, char *buf, ssize_t count, 
++                       loff_t *ppos, int is_user)
++{
++      ssize_t (*read)(struct file *, char *, size_t, loff_t *);
++      ssize_t n;
++
++      read = file->f_dentry->d_inode->i_fop->read;
++
++      if(!is_user)
++              set_fs(KERNEL_DS);
++              
++      n = (*read)(file, buf, count, &file->f_pos);
++
++      if(!is_user)
++              set_fs(USER_DS);
++
++      if(ppos) *ppos = file->f_pos;
++      return(n);
++}
++
++static ssize_t hppfs_read_file(int fd, char *buf, ssize_t count)
++{
++      ssize_t n;
++      int cur, err;
++      char *new_buf;
++
++      n = -ENOMEM;
++      new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
++      if(new_buf == NULL){
++              printk("hppfs_read_file : kmalloc failed\n");
++              goto out;
++      }
++      n = 0;
++      while(count > 0){
++              cur = min_t(ssize_t, count, PAGE_SIZE);
++              err = os_read_file(fd, new_buf, cur);
++              if(err < 0){
++                      printk("hppfs_read : read failed, errno = %d\n",
++                             err);
++                      n = err;
++                      goto out_free;
++              }
++              else if(err == 0)
++                      break;
++
++              if(copy_to_user(buf, new_buf, err)){
++                      n = -EFAULT;
++                      goto out_free;
++              }
++              n += err;
++              count -= err;
++      }
++ out_free:
++      kfree(new_buf);
++ out:
++      return(n);
++}
++
++static ssize_t hppfs_read(struct file *file, char *buf, size_t count, 
++                        loff_t *ppos)
++{
++      struct hppfs_private *hppfs = file->private_data;
++      struct hppfs_data *data;
++      loff_t off;
++      int err;
++
++      if(hppfs->contents != NULL){
++              if(*ppos >= hppfs->len) return(0);
++
++              data = hppfs->contents;
++              off = *ppos;
++              while(off >= sizeof(data->contents)){
++                      data = list_entry(data->list.next, struct hppfs_data,
++                                        list);
++                      off -= sizeof(data->contents);
++              }
++
++              if(off + count > hppfs->len)
++                      count = hppfs->len - off;
++              copy_to_user(buf, &data->contents[off], count);
++              *ppos += count;
++      }
++      else if(hppfs->host_fd != -1){
++              err = os_seek_file(hppfs->host_fd, *ppos);
++              if(err){
++                      printk("hppfs_read : seek failed, errno = %d\n", err);
++                      return(err);
++              }
++              count = hppfs_read_file(hppfs->host_fd, buf, count);
++              if(count > 0)
++                      *ppos += count;
++      }
++      else count = read_proc(&hppfs->proc_file, buf, count, ppos, 1);
++
++      return(count);
++}
++
++static ssize_t hppfs_write(struct file *file, const char *buf, size_t len, 
++                         loff_t *ppos)
++{
++      struct hppfs_private *data = file->private_data;
++      struct file *proc_file = &data->proc_file;
++      ssize_t (*write)(struct file *, const char *, size_t, loff_t *);
++      int err;
++
++      write = proc_file->f_dentry->d_inode->i_fop->write;
++
++      proc_file->f_pos = file->f_pos;
++      err = (*write)(proc_file, buf, len, &proc_file->f_pos);
++      file->f_pos = proc_file->f_pos;
++
++      return(err);
++}
++
++static int open_host_sock(char *host_file, int *filter_out)
++{
++      char *end;
++      int fd;
++
++      end = &host_file[strlen(host_file)];
++      strcpy(end, "/rw");
++      *filter_out = 1;
++      fd = os_connect_socket(host_file);
++      if(fd > 0)
++              return(fd);
++
++      strcpy(end, "/r");
++      *filter_out = 0;
++      fd = os_connect_socket(host_file);
++      return(fd);
++}
++
++static void free_contents(struct hppfs_data *head)
++{
++      struct hppfs_data *data;
++      struct list_head *ele, *next;
++
++      if(head == NULL) return;
++
++      list_for_each_safe(ele, next, &head->list){
++              data = list_entry(ele, struct hppfs_data, list);
++              kfree(data);
++      }
++      kfree(head);
++}
++
++static struct hppfs_data *hppfs_get_data(int fd, int filter, 
++                                       struct file *proc_file, 
++                                       struct file *hppfs_file, 
++                                       loff_t *size_out)
++{
++      struct hppfs_data *data, *new, *head;
++      int n, err;
++
++      err = -ENOMEM;
++      data = kmalloc(sizeof(*data), GFP_KERNEL);
++      if(data == NULL){
++              printk("hppfs_get_data : head allocation failed\n");
++              goto failed;
++      }
++
++      INIT_LIST_HEAD(&data->list);
++
++      head = data;
++      *size_out = 0;
++
++      if(filter){
++              while((n = read_proc(proc_file, data->contents,
++                                   sizeof(data->contents), NULL, 0)) > 0)
++                      os_write_file(fd, data->contents, n);
++              err = os_shutdown_socket(fd, 0, 1);
++              if(err){
++                      printk("hppfs_get_data : failed to shut down "
++                             "socket\n");
++                      goto failed_free;
++              }
++      }
++      while(1){
++              n = os_read_file(fd, data->contents, sizeof(data->contents));
++              if(n < 0){
++                      err = n;
++                      printk("hppfs_get_data : read failed, errno = %d\n",
++                             err);
++                      goto failed_free;
++              }
++              else if(n == 0)
++                      break;
++
++              *size_out += n;
++
++              if(n < sizeof(data->contents))
++                      break;
++
++              new = kmalloc(sizeof(*data), GFP_KERNEL);
++              if(new == 0){
++                      printk("hppfs_get_data : data allocation failed\n");
++                      err = -ENOMEM;
++                      goto failed_free;
++              }
++      
++              INIT_LIST_HEAD(&new->list);
++              list_add(&new->list, &data->list);
++              data = new;
++      }
++      return(head);
++
++ failed_free:
++      free_contents(head);
++ failed:              
++      return(ERR_PTR(err));
++}
++
++static struct hppfs_private *hppfs_data(void)
++{
++      struct hppfs_private *data;
++
++      data = kmalloc(sizeof(*data), GFP_KERNEL);
++      if(data == NULL)
++              return(data);
++
++      *data = ((struct hppfs_private ) { .host_fd             = -1,
++                                         .len                 = -1,
++                                         .contents            = NULL } );
++      return(data);
++}
++
++static int hppfs_open(struct inode *inode, struct file *file)
++{
++      struct hppfs_private *data;
++      struct dentry *proc_dentry;
++      char *host_file;
++      int err, fd, type, filter;
++
++      err = -ENOMEM;
++      data = hppfs_data();
++      if(data == NULL)
++              goto out;
++
++      host_file = dentry_name(file->f_dentry, strlen("/rw"));
++      if(host_file == NULL)
++              goto out_free2;
++
++      proc_dentry = inode->u.hppfs_i.proc_dentry;
++      err = init_private_file(&data->proc_file, proc_dentry, file->f_mode);
++      if(err)
++              goto out_free1;
++
++      type = os_file_type(host_file);
++      if(type == OS_TYPE_FILE){
++              fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
++              if(fd >= 0) 
++                      data->host_fd = fd;
++              else printk("hppfs_open : failed to open '%s', errno = %d\n",
++                          host_file, -fd);
++
++              data->contents = NULL;
++      }
++      else if(type == OS_TYPE_DIR){
++              fd = open_host_sock(host_file, &filter);
++              if(fd > 0){
++                      data->contents = hppfs_get_data(fd, filter, 
++                                                      &data->proc_file, 
++                                                      file, &data->len);
++                      if(!IS_ERR(data->contents))
++                              data->host_fd = fd;
++              }
++              else printk("hppfs_open : failed to open a socket in "
++                          "'%s', errno = %d\n", host_file, -fd);
++      }
++      kfree(host_file);
++
++      file->private_data = data;
++      return(0);
++
++ out_free1:
++      kfree(host_file);
++ out_free2:
++      free_contents(data->contents);
++      kfree(data);
++ out:
++      return(err);
++}
++
++static int hppfs_dir_open(struct inode *inode, struct file *file)
++{
++      struct hppfs_private *data;
++      struct dentry *proc_dentry;
++      int err;
++
++      err = -ENOMEM;
++      data = hppfs_data();
++      if(data == NULL)
++              goto out;
++
++      proc_dentry = inode->u.hppfs_i.proc_dentry;
++      err = init_private_file(&data->proc_file, proc_dentry, file->f_mode);
++      if(err)
++              goto out_free;
++
++      file->private_data = data;
++      return(0);
++
++ out_free:
++      kfree(data);
++ out:
++      return(err);
++}
++
++static loff_t hppfs_llseek(struct file *file, loff_t off, int where)
++{
++      struct hppfs_private *data = file->private_data;
++      struct file *proc_file = &data->proc_file;
++      loff_t (*llseek)(struct file *, loff_t, int);
++      loff_t ret;
++
++      llseek = proc_file->f_dentry->d_inode->i_fop->llseek;
++      if(llseek != NULL){
++              ret = (*llseek)(proc_file, off, where);
++              if(ret < 0)
++                      return(ret);
++      }
++
++      return(default_llseek(file, off, where));
++}
++
++struct hppfs_dirent {
++      void *vfs_dirent;
++      filldir_t filldir;
++      struct dentry *dentry;
++};
++
++static int hppfs_filldir(void *d, const char *name, int size, 
++                       loff_t offset, ino_t inode, unsigned int type)
++{
++      struct hppfs_dirent *dirent = d;
++
++      if(file_removed(dirent->dentry, name))
++              return(0);
++
++      return((*dirent->filldir)(dirent->vfs_dirent, name, size, offset, 
++                                inode, type));
++}
++
++static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
++{
++      struct hppfs_private *data = file->private_data;
++      struct file *proc_file = &data->proc_file;
++      int (*readdir)(struct file *, void *, filldir_t);
++      struct hppfs_dirent dirent = ((struct hppfs_dirent)
++                                    { .vfs_dirent     = ent,
++                                      .filldir        = filldir,
++                                      .dentry         = file->f_dentry } );
++      int err;
++
++      readdir = proc_file->f_dentry->d_inode->i_fop->readdir;
++
++      proc_file->f_pos = file->f_pos;
++      err = (*readdir)(proc_file, &dirent, hppfs_filldir);
++      file->f_pos = proc_file->f_pos;
++
++      return(err);
++}
++
++static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync)
++{
++      return(0);
++}
++
++static struct file_operations hppfs_file_fops = {
++      .owner          = NULL,
++      .llseek         = hppfs_llseek,
++      .read           = hppfs_read,
++      .write          = hppfs_write,
++      .open           = hppfs_open,
++};
++
++static struct file_operations hppfs_dir_fops = {
++      .owner          = NULL,
++      .readdir        = hppfs_readdir,
++      .open           = hppfs_dir_open,
++      .fsync          = hppfs_fsync,
++};
++
++static int hppfs_statfs(struct super_block *sb, struct statfs *sf)
++{
++      sf->f_blocks = 0;
++      sf->f_bfree = 0;
++      sf->f_bavail = 0;
++      sf->f_files = 0;
++      sf->f_ffree = 0;
++      sf->f_type = HPPFS_SUPER_MAGIC;
++      return(0);
++}
++
++static struct super_operations hppfs_sbops = { 
++      .put_inode      = force_delete,
++      .delete_inode   = NULL,
++      .statfs         = hppfs_statfs,
++};
++
++static int hppfs_readlink(struct dentry *dentry, char *buffer, int buflen)
++{
++      struct file proc_file;
++      struct dentry *proc_dentry;
++      int (*readlink)(struct dentry *, char *, int);
++      int err, n;
++
++      proc_dentry = dentry->d_inode->u.hppfs_i.proc_dentry;
++      err = init_private_file(&proc_file, proc_dentry, FMODE_READ);
++      if(err) 
++              return(err);
++
++      readlink = proc_dentry->d_inode->i_op->readlink;
++      n = (*readlink)(proc_dentry, buffer, buflen);
++
++      if(proc_file.f_op->release)
++              (*proc_file.f_op->release)(proc_dentry->d_inode, &proc_file);
++      
++      return(n);
++}
++
++static int hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
++{
++      struct file proc_file;
++      struct dentry *proc_dentry;
++      int (*follow_link)(struct dentry *, struct nameidata *);
++      int err, n;
++
++      proc_dentry = dentry->d_inode->u.hppfs_i.proc_dentry;
++      err = init_private_file(&proc_file, proc_dentry, FMODE_READ);
++      if(err) 
++              return(err);
++
++      follow_link = proc_dentry->d_inode->i_op->follow_link;
++      n = (*follow_link)(proc_dentry, nd);
++
++      if(proc_file.f_op->release)
++              (*proc_file.f_op->release)(proc_dentry->d_inode, &proc_file);
++      
++      return(n);
++}
++
++static struct inode_operations hppfs_link_iops = {
++      .readlink       = hppfs_readlink,
++      .follow_link    = hppfs_follow_link,
++};
++
++static void read_inode(struct inode *ino)
++{
++      struct inode *proc_ino;
++
++      proc_ino = ino->u.hppfs_i.proc_dentry->d_inode;
++      ino->i_uid = proc_ino->i_uid;
++      ino->i_gid = proc_ino->i_gid;
++      ino->i_atime = proc_ino->i_atime;
++      ino->i_mtime = proc_ino->i_mtime;
++      ino->i_ctime = proc_ino->i_ctime;
++      ino->i_ino = proc_ino->i_ino;
++      ino->i_dev = proc_ino->i_dev;
++      ino->i_mode = proc_ino->i_mode;
++      ino->i_nlink = proc_ino->i_nlink;
++      ino->i_size = proc_ino->i_size;
++      ino->i_blksize = proc_ino->i_blksize;
++      ino->i_blocks = proc_ino->i_blocks;
++}
++
++static struct inode *get_inode(struct super_block *sb, struct dentry *dentry,
++                             int *error)
++{
++      struct inode *inode;
++      int err = -ENOMEM;
++
++      inode = new_inode(sb);
++      if(inode == NULL) 
++              goto out;
++
++      insert_inode_hash(inode);
++      if(S_ISDIR(dentry->d_inode->i_mode)){
++              inode->i_op = &hppfs_dir_iops;
++              inode->i_fop = &hppfs_dir_fops;
++      }
++      else if(S_ISLNK(dentry->d_inode->i_mode)){
++              inode->i_op = &hppfs_link_iops;
++              inode->i_fop = &hppfs_file_fops;
++      }
++      else {
++              inode->i_op = &hppfs_file_iops;
++              inode->i_fop = &hppfs_file_fops;
++      }
++
++      inode->i_sb = sb;
++      inode->u.hppfs_i.proc_dentry = dentry;
++
++      read_inode(inode);
++      err = 0;
++
++      if(error) *error = err;
++      return(inode);
++ out:
++      if(error) *error = err;
++      return(NULL);
++}
++
++static struct super_block *hppfs_read_super(struct super_block *sb, void *d, 
++                                          int silent)
++{
++      struct inode *root_inode;
++      struct file_system_type *procfs;
++      struct super_block *proc_sb;
++
++      procfs = get_fs_type("proc");
++      if(procfs == NULL) 
++              goto out;
++
++      if(list_empty(&procfs->fs_supers))
++              goto out;
++
++      proc_sb = list_entry(procfs->fs_supers.next, struct super_block,
++                           s_instances);
++      
++      sb->s_blocksize = 1024;
++      sb->s_blocksize_bits = 10;
++      sb->s_magic = HPPFS_SUPER_MAGIC;
++      sb->s_op = &hppfs_sbops;
++
++      dget(proc_sb->s_root);
++      root_inode = get_inode(sb, proc_sb->s_root, NULL);
++      if(root_inode == NULL)
++              goto out_dput;
++
++      sb->s_root = d_alloc_root(root_inode);
++      if(sb->s_root == NULL)
++              goto out_put;
++
++      return(sb);
++
++ out_put:
++      iput(root_inode);
++ out_dput:
++      dput(proc_sb->s_root);
++ out:
++      return(NULL);
++}
++
++DECLARE_FSTYPE(hppfs_type, "hppfs", hppfs_read_super, 0);
++
++static int __init init_hppfs(void)
++{
++      return(register_filesystem(&hppfs_type));
++}
++
++static void __exit exit_hppfs(void)
++{
++      unregister_filesystem(&hppfs_type);
++}
++
++module_init(init_hppfs)
++module_exit(exit_hppfs)
++MODULE_LICENSE("GPL");
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/2_5compat.h um/arch/um/include/2_5compat.h
+--- orig/arch/um/include/2_5compat.h   Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/2_5compat.h     Thu Feb 27 20:15:19 2003
+@@ -0,0 +1,46 @@
++/* 
++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __2_5_COMPAT_H__
++#define __2_5_COMPAT_H__
++
++#include "linux/version.h"
++
++#define INIT_CONSOLE(dev_name, write_proc, device_proc, setup_proc, f) { \
++      name :          dev_name, \
++      write :         write_proc, \
++      read :          NULL, \
++      device :        device_proc, \
++      unblank :       NULL, \
++      setup :         setup_proc, \
++      flags :         f, \
++      index :         -1, \
++      cflag :         0, \
++      next :          NULL \
++}
++
++#define INIT_ELV(queue, elv) elevator_init(elv, ELV_NOOP)
++
++#define ELV_NOOP ELEVATOR_NOOP
++
++#define INIT_HARDSECT(arr, maj, sizes) arr[maj] = sizes
++
++#define IS_WRITE(req) ((req)->cmd == WRITE)
++
++#define SET_PRI(task) \
++      do { (task)->nice = 20; (task)->counter = -100; } while(0);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/Makefile um/arch/um/include/Makefile
+--- orig/arch/um/include/Makefile      Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/Makefile        Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,7 @@
++all : sc.h
++
++sc.h : ../util/mk_sc
++      ../util/mk_sc > $@
++
++../util/mk_sc :
++      $(MAKE) -C ../util mk_sc
+diff -Naur -X ../exclude-files orig/arch/um/include/chan_kern.h um/arch/um/include/chan_kern.h
+--- orig/arch/um/include/chan_kern.h   Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/chan_kern.h     Fri Nov 15 13:32:35 2002
+@@ -0,0 +1,56 @@
++/* 
++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __CHAN_KERN_H__
++#define __CHAN_KERN_H__
++
++#include "linux/tty.h"
++#include "linux/list.h"
++#include "chan_user.h"
++
++struct chan {
++      struct list_head list;
++      char *dev;
++      unsigned int primary:1;
++      unsigned int input:1;
++      unsigned int output:1;
++      unsigned int opened:1;
++      int fd;
++      enum chan_init_pri pri;
++      struct chan_ops *ops;
++      void *data;
++};
++
++extern void chan_interrupt(struct list_head *chans, struct tq_struct *task,
++                         struct tty_struct *tty, int irq, void *dev);
++extern int parse_chan_pair(char *str, struct list_head *chans, int pri, 
++                         int device, struct chan_opts *opts);
++extern int open_chan(struct list_head *chans);
++extern int write_chan(struct list_head *chans, const char *buf, int len,
++                           int write_irq);
++extern int console_write_chan(struct list_head *chans, const char *buf, 
++                            int len);
++extern void close_chan(struct list_head *chans);
++extern void chan_enable_winch(struct list_head *chans, void *line);
++extern void enable_chan(struct list_head *chans, void *data);
++extern int chan_window_size(struct list_head *chans, 
++                           unsigned short *rows_out, 
++                           unsigned short *cols_out);
++extern int chan_out_fd(struct list_head *chans);
++extern int chan_config_string(struct list_head *chans, char *str, int size,
++                            char **error_out);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/chan_user.h um/arch/um/include/chan_user.h
+--- orig/arch/um/include/chan_user.h   Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/chan_user.h     Wed Nov  6 16:44:00 2002
+@@ -0,0 +1,66 @@
++/* 
++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __CHAN_USER_H__
++#define __CHAN_USER_H__
++
++#include "init.h"
++
++struct chan_opts {
++      void (*announce)(char *dev_name, int dev);
++      char *xterm_title;
++      int raw;
++      unsigned long tramp_stack;
++      int in_kernel;
++};
++
++enum chan_init_pri { INIT_STATIC, INIT_ALL, INIT_ONE };
++
++struct chan_ops {
++      char *type;
++      void *(*init)(char *, int, struct chan_opts *);
++      int (*open)(int, int, int, void *, char **);
++      void (*close)(int, void *);
++      int (*read)(int, char *, void *);
++      int (*write)(int, const char *, int, void *);
++      int (*console_write)(int, const char *, int, void *);
++      int (*window_size)(int, void *, unsigned short *, unsigned short *);
++      void (*free)(void *);
++      int winch;
++};
++
++extern struct chan_ops fd_ops, null_ops, port_ops, pts_ops, pty_ops, tty_ops,
++      xterm_ops;
++
++extern void generic_close(int fd, void *unused);
++extern int generic_read(int fd, char *c_out, void *unused);
++extern int generic_write(int fd, const char *buf, int n, void *unused);
++extern int generic_console_write(int fd, const char *buf, int n, void *state);
++extern int generic_window_size(int fd, void *unused, unsigned short *rows_out,
++                             unsigned short *cols_out);
++extern void generic_free(void *data);
++
++extern void register_winch(int fd, void *device_data);
++extern void register_winch_irq(int fd, int tty_fd, int pid, void *line);
++
++#define __channel_help(fn, prefix) \
++__uml_help(fn, prefix "[0-9]*=<channel description>\n" \
++"    Attach a console or serial line to a host channel.  See\n" \
++"    http://user-mode-linux.sourceforge.net/input.html for a complete\n" \
++"    description of this switch.\n\n" \
++);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/choose-mode.h um/arch/um/include/choose-mode.h
+--- orig/arch/um/include/choose-mode.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/choose-mode.h   Fri Jan 17 13:23:32 2003
+@@ -0,0 +1,35 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __CHOOSE_MODE_H__
++#define __CHOOSE_MODE_H__
++
++#include "uml-config.h"
++
++#if defined(UML_CONFIG_MODE_TT) && defined(UML_CONFIG_MODE_SKAS)
++#define CHOOSE_MODE(tt, skas) (mode_tt ? (tt) : (skas))
++
++#elif defined(UML_CONFIG_MODE_SKAS)
++#define CHOOSE_MODE(tt, skas) (skas)
++
++#elif defined(UML_CONFIG_MODE_TT)
++#define CHOOSE_MODE(tt, skas) (tt)
++#endif
++
++#define CHOOSE_MODE_PROC(tt, skas, args...) \
++      CHOOSE_MODE(tt(args), skas(args))
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
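The CHOOSE_MODE macros above give a single call site a way to dispatch to the tt-mode or skas-mode implementation of an operation: a runtime check on mode_tt when both modes are compiled in, a direct call otherwise. A minimal sketch of a caller, assuming hypothetical per-mode functions (do_fork_tt, do_fork_skas) and the mode_tt flag declared elsewhere in the UML tree:

    #include "choose-mode.h"

    extern int mode_tt;                 /* runtime mode flag, assumed to be defined elsewhere */

    /* Hypothetical per-mode implementations of the same operation. */
    extern int do_fork_tt(int flags);
    extern int do_fork_skas(int flags);

    static int do_fork(int flags)
    {
            /* Expands to the tt call, the skas call, or a mode_tt-based
             * runtime choice, depending on which modes were configured. */
            return CHOOSE_MODE_PROC(do_fork_tt, do_fork_skas, flags);
    }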
+diff -Naur -X ../exclude-files orig/arch/um/include/frame.h um/arch/um/include/frame.h
+--- orig/arch/um/include/frame.h       Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/frame.h Mon Dec  2 21:43:03 2002
+@@ -0,0 +1,53 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __FRAME_H_
++#define __FRAME_H_
++
++#include "sysdep/frame.h"
++
++struct frame_common {
++      void *data;
++      int len;
++      int sig_index;
++      int sr_index;
++      int sr_relative;
++      int sp_index;
++      struct arch_frame_data arch;
++};
++
++struct sc_frame {
++      struct frame_common common;
++      int sc_index;
++};
++
++extern struct sc_frame signal_frame_sc;
++
++extern struct sc_frame signal_frame_sc_sr;
++
++struct si_frame {
++      struct frame_common common;
++      int sip_index;
++      int si_index;
++      int ucp_index;
++      int uc_index;
++};
++
++extern struct si_frame signal_frame_si;
++
++extern void capture_signal_stack(void);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/frame_kern.h um/arch/um/include/frame_kern.h
+--- orig/arch/um/include/frame_kern.h  Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/frame_kern.h    Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,34 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __FRAME_KERN_H_
++#define __FRAME_KERN_H_
++
++#include "frame.h"
++#include "sysdep/frame_kern.h"
++
++extern int setup_signal_stack_sc(unsigned long stack_top, int sig, 
++                               unsigned long handler,
++                               void (*restorer)(void), 
++                               struct pt_regs *regs, 
++                               sigset_t *mask);
++extern int setup_signal_stack_si(unsigned long stack_top, int sig, 
++                               unsigned long handler, 
++                               void (*restorer)(void), 
++                               struct pt_regs *regs, siginfo_t *info, 
++                               sigset_t *mask);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/frame_user.h um/arch/um/include/frame_user.h
+--- orig/arch/um/include/frame_user.h  Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/frame_user.h    Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,23 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __FRAME_USER_H_
++#define __FRAME_USER_H_
++
++#include "sysdep/frame_user.h"
++#include "frame.h"
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/helper.h um/arch/um/include/helper.h
+--- orig/arch/um/include/helper.h      Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/helper.h        Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,27 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __HELPER_H__
++#define __HELPER_H__
++
++extern int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv,
++                    unsigned long *stack_out);
++extern int run_helper_thread(int (*proc)(void *), void *arg, 
++                           unsigned int flags, unsigned long *stack_out,
++                           int stack_order);
++extern int helper_wait(int pid);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/hostaudio.h um/arch/um/include/hostaudio.h
+--- orig/arch/um/include/hostaudio.h   Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/hostaudio.h     Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,48 @@
++/* 
++ * Copyright (C) 2002 Steve Schmidtke 
++ * Licensed under the GPL
++ */
++
++#ifndef HOSTAUDIO_H
++#define HOSTAUDIO_H
++
++#define HOSTAUDIO_DEV_DSP "/dev/sound/dsp"
++#define HOSTAUDIO_DEV_MIXER "/dev/sound/mixer"
++
++struct hostaudio_state {
++  int fd;
++};
++
++struct hostmixer_state {
++  int fd;
++};
++
++/* UML user-side prototypes */
++extern ssize_t hostaudio_read_user(struct hostaudio_state *state, char *buffer,
++                                 size_t count, loff_t *ppos);
++extern ssize_t hostaudio_write_user(struct hostaudio_state *state, 
++                                  const char *buffer, size_t count, 
++                                  loff_t *ppos);
++extern int hostaudio_ioctl_user(struct hostaudio_state *state, 
++                              unsigned int cmd, unsigned long arg);
++extern int hostaudio_open_user(struct hostaudio_state *state, int r, int w, 
++                             char *dsp);
++extern int hostaudio_release_user(struct hostaudio_state *state);
++extern int hostmixer_ioctl_mixdev_user(struct hostmixer_state *state, 
++                              unsigned int cmd, unsigned long arg);
++extern int hostmixer_open_mixdev_user(struct hostmixer_state *state, int r, 
++                                    int w, char *mixer);
++extern int hostmixer_release_mixdev_user(struct hostmixer_state *state);
++
++#endif /* HOSTAUDIO_H */
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/init.h um/arch/um/include/init.h
+--- orig/arch/um/include/init.h        Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/init.h  Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,114 @@
++#ifndef _LINUX_UML_INIT_H
++#define _LINUX_UML_INIT_H
++
++/* These macros are used to mark some functions or
++ * initialized data (doesn't apply to uninitialized data)
++ * as `initialization' functions. The kernel can take this
++ * as a hint that the function is used only during the initialization
++ * phase and free up the used memory resources afterwards.
++ *
++ * Usage:
++ * For functions:
++ *
++ * You should add __init immediately before the function name, like:
++ *
++ * static void __init initme(int x, int y)
++ * {
++ *    extern int z; z = x * y;
++ * }
++ *
++ * If the function has a prototype somewhere, you can also add
++ * __init between the closing parenthesis of the prototype and the semicolon:
++ *
++ * extern int initialize_foobar_device(int, int, int) __init;
++ *
++ * For initialized data:
++ * You should insert __initdata between the variable name and the equals
++ * sign, followed by the value, e.g.:
++ *
++ * static int init_variable __initdata = 0;
++ * static char linux_logo[] __initdata = { 0x32, 0x36, ... };
++ *
++ * Don't forget to initialize data not at file scope, i.e. within a function,
++ * as gcc otherwise puts the data into the bss section and not into the init
++ * section.
++ *
++ * Also note that this data cannot be "const".
++ */
++
++#ifndef _LINUX_INIT_H
++typedef int (*initcall_t)(void);
++typedef void (*exitcall_t)(void);
++
++#define __init          __attribute__ ((__section__ (".text.init")))
++#define __exit          __attribute__ ((unused, __section__(".text.exit")))
++#define __initdata      __attribute__ ((__section__ (".data.init")))
++
++#endif
++
++#ifndef MODULE
++struct uml_param {
++        const char *str;
++        int (*setup_func)(char *, int *);
++};
++
++extern initcall_t __uml_initcall_start, __uml_initcall_end;
++extern initcall_t __uml_postsetup_start, __uml_postsetup_end;
++extern const char *__uml_help_start, *__uml_help_end;
++#endif
++
++#define __uml_initcall(fn)                                            \
++      static initcall_t __uml_initcall_##fn __uml_init_call = fn
++
++#define __uml_exitcall(fn)                                            \
++      static exitcall_t __uml_exitcall_##fn __uml_exit_call = fn
++
++extern struct uml_param __uml_setup_start, __uml_setup_end;
++
++#define __uml_postsetup(fn)                                           \
++      static initcall_t __uml_postsetup_##fn __uml_postsetup_call = fn
++
++#define __non_empty_string(dummyname,string)                          \
++      struct __uml_non_empty_string_struct_##dummyname                \
++      {                                                               \
++              char _string[sizeof(string)-2];                         \
++      }
++
++#ifndef MODULE
++#define __uml_setup(str, fn, help...)                                 \
++      __non_empty_string(fn ##_setup, str);                           \
++      __uml_help(fn, help);                                           \
++      static char __uml_setup_str_##fn[] __initdata = str;            \
++      static struct uml_param __uml_setup_##fn __uml_init_setup = { __uml_setup_str_##fn, fn }
++#else
++#define __uml_setup(str, fn, help...)                                 \
++
++#endif
++
++#define __uml_help(fn, help...)                                               \
++      __non_empty_string(fn ##__help, help);                          \
++      static char __uml_help_str_##fn[] __initdata = help;            \
++      static const char *__uml_help_##fn __uml_setup_help = __uml_help_str_##fn
++
++/*
++ * Mark functions and data as being only used at initialization
++ * or exit time.
++ */
++#define __uml_init_setup      __attribute__ ((unused,__section__ (".uml.setup.init")))
++#define __uml_setup_help      __attribute__ ((unused,__section__ (".uml.help.init")))
++#define __uml_init_call               __attribute__ ((unused,__section__ (".uml.initcall.init")))
++#define __uml_postsetup_call  __attribute__ ((unused,__section__ (".uml.postsetup.init")))
++#define __uml_exit_call               __attribute__ ((unused,__section__ (".uml.exitcall.exit")))
++
++#endif /* _LINUX_UML_INIT_H */
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
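The __uml_setup() and __uml_help() macros above are how UML registers a boot-time switch together with its help text: the string/handler pair is placed into the .uml.setup.init section, which the option parser walks at startup. A minimal sketch of a registration, with a made-up switch name and a deliberately trivial handler (the exact meaning of the return value and of *add belongs to the parser, which is outside this header):

    static int example_enabled __initdata = 0;

    /* Handler signature matches uml_param.setup_func above; str carries the
     * option text handed over by the parser. */
    static int __init example_setup(char *str, int *add)
    {
            example_enabled = 1;
            return 0;
    }

    __uml_setup("example=", example_setup,
    "example=<value>\n"
    "    A made-up switch, shown only to illustrate the macro.\n\n"
    );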
+diff -Naur -X ../exclude-files orig/arch/um/include/initrd.h um/arch/um/include/initrd.h
+--- orig/arch/um/include/initrd.h      Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/initrd.h        Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,22 @@
++/*
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __INITRD_USER_H__
++#define __INITRD_USER_H__
++
++extern int load_initrd(char *filename, void *buf, int size);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/irq_user.h um/arch/um/include/irq_user.h
+--- orig/arch/um/include/irq_user.h    Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/irq_user.h      Sun Dec  8 20:38:42 2002
+@@ -0,0 +1,35 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __IRQ_USER_H__
++#define __IRQ_USER_H__
++
++enum { IRQ_READ, IRQ_WRITE };
++
++extern void sigio_handler(int sig, union uml_pt_regs *regs);
++extern int activate_fd(int irq, int fd, int type, void *dev_id);
++extern void free_irq_by_irq_and_dev(int irq, void *dev_id);
++extern void free_irq_by_fd(int fd);
++extern void reactivate_fd(int fd, int irqnum);
++extern void deactivate_fd(int fd, int irqnum);
++extern void forward_interrupts(int pid);
++extern void init_irq_signals(int on_sigstack);
++extern void forward_ipi(int fd, int pid);
++extern void free_irq_later(int irq, void *dev_id);
++extern int activate_ipi(int fd, int pid);
++extern unsigned long irq_lock(void);
++extern void irq_unlock(unsigned long flags);
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/kern.h um/arch/um/include/kern.h
+--- orig/arch/um/include/kern.h        Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/kern.h  Sat Nov  2 21:38:02 2002
+@@ -0,0 +1,48 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __KERN_H__
++#define __KERN_H__
++
++/* These are all user-mode things which are convenient to call directly
++ * from kernel code and for which writing a wrapper is too much of a pain.
++ * The regular include files can't be included because this file is included
++ * only into kernel code, and user-space includes conflict with kernel
++ * includes.
++ */
++
++extern int errno;
++
++extern int clone(int (*proc)(void *), void *sp, int flags, void *data);
++extern int sleep(int);
++extern int printf(char *fmt, ...);
++extern char *strerror(int errnum);
++extern char *ptsname(int __fd);
++extern int munmap(void *, int);
++extern void *sbrk(int increment);
++extern void *malloc(int size);
++extern void perror(char *err);
++extern int kill(int pid, int sig);
++extern int getuid(void);
++extern int pause(void);
++extern int write(int, const void *, int);
++extern int exit(int);
++extern int close(int);
++extern int read(unsigned int, char *, int);
++extern int pipe(int *);
++extern int sched_yield(void);
++extern int ptrace(int op, int pid, long addr, long data);
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/kern_util.h um/arch/um/include/kern_util.h
+--- orig/arch/um/include/kern_util.h   Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/kern_util.h     Wed Apr 16 16:00:11 2003
+@@ -0,0 +1,121 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __KERN_UTIL_H__
++#define __KERN_UTIL_H__
++
++#include "sysdep/ptrace.h"
++
++extern int ncpus;
++extern char *linux_prog;
++extern char *gdb_init;
++extern int kmalloc_ok;
++extern int timer_irq_inited;
++extern int jail;
++extern int nsyscalls;
++
++#define UML_ROUND_DOWN(addr) ((void *)(((unsigned long) addr) & PAGE_MASK))
++#define UML_ROUND_UP(addr) \
++      UML_ROUND_DOWN(((unsigned long) addr) + PAGE_SIZE - 1)
++
++extern int kernel_fork(unsigned long flags, int (*fn)(void *), void * arg);
++extern unsigned long stack_sp(unsigned long page);
++extern int kernel_thread_proc(void *data);
++extern void syscall_segv(int sig);
++extern int current_pid(void);
++extern unsigned long alloc_stack(int order, int atomic);
++extern int do_signal(int error);
++extern int is_stack_fault(unsigned long sp);
++extern unsigned long segv(unsigned long address, unsigned long ip, 
++                        int is_write, int is_user, void *sc);
++extern unsigned long handle_page_fault(unsigned long address, unsigned long ip,
++                                     int is_write, int is_user, 
++                                     int *code_out);
++extern void syscall_ready(void);
++extern int segv_syscall(void);
++extern void kern_finish_exec(void *task, int new_pid, unsigned long stack);
++extern int page_size(void);
++extern int page_mask(void);
++extern int need_finish_fork(void);
++extern void free_stack(unsigned long stack, int order);
++extern void add_input_request(int op, void (*proc)(int), void *arg);
++extern int sys_execve(char *file, char **argv, char **env);
++extern char *current_cmd(void);
++extern void timer_handler(int sig, union uml_pt_regs *regs);
++extern int set_signals(int enable);
++extern void force_sigbus(void);
++extern int pid_to_processor_id(int pid);
++extern void block_signals(void);
++extern void unblock_signals(void);
++extern void deliver_signals(void *t);
++extern int next_syscall_index(int max);
++extern int next_trap_index(int max);
++extern void cpu_idle(void);
++extern void finish_fork(void);
++extern void paging_init(void);
++extern void init_flush_vm(void);
++extern void *syscall_sp(void *t);
++extern void syscall_trace(void);
++extern int hz(void);
++extern void idle_timer(void);
++extern unsigned int do_IRQ(int irq, union uml_pt_regs *regs);
++extern int external_pid(void *t);
++extern void boot_timer_handler(int sig);
++extern void interrupt_end(void);
++extern void initial_thread_cb(void (*proc)(void *), void *arg);
++extern int debugger_signal(int status, int pid);
++extern void debugger_parent_signal(int status, int pid);
++extern void child_signal(int pid, int status);
++extern int init_ptrace_proxy(int idle_pid, int startup, int stop);
++extern int init_parent_proxy(int pid);
++extern void check_stack_overflow(void *ptr);
++extern void relay_signal(int sig, union uml_pt_regs *regs);
++extern void not_implemented(void);
++extern int user_context(unsigned long sp);
++extern void timer_irq(union uml_pt_regs *regs);
++extern void unprotect_stack(unsigned long stack);
++extern void do_uml_exitcalls(void);
++extern int attach_debugger(int idle_pid, int pid, int stop);
++extern void bad_segv(unsigned long address, unsigned long ip, int is_write);
++extern int config_gdb(char *str);
++extern int remove_gdb(void);
++extern char *uml_strdup(char *string);
++extern void unprotect_kernel_mem(void);
++extern void protect_kernel_mem(void);
++extern void set_kmem_end(unsigned long);
++extern void uml_cleanup(void);
++extern void set_current(void *t);
++extern void lock_signalled_task(void *t);
++extern void IPI_handler(int cpu);
++extern int jail_setup(char *line, int *add);
++extern void *get_init_task(void);
++extern int clear_user_proc(void *buf, int size);
++extern int copy_to_user_proc(void *to, void *from, int size);
++extern int copy_from_user_proc(void *to, void *from, int size);
++extern int strlen_user_proc(char *str);
++extern void bus_handler(int sig, union uml_pt_regs *regs);
++extern void winch(int sig, union uml_pt_regs *regs);
++extern long execute_syscall(void *r);
++extern int smp_sigio_handler(void);
++extern void *get_current(void);
++extern struct task_struct *get_task(int pid, int require);
++extern void machine_halt(void);
++extern int is_syscall(unsigned long addr);
++extern void arch_switch(void);
++extern void free_irq(unsigned int, void *);
++extern int um_in_interrupt(void);
++extern int cpu(void);
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/line.h um/arch/um/include/line.h
+--- orig/arch/um/include/line.h        Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/line.h  Fri Nov 15 13:44:44 2002
+@@ -0,0 +1,106 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __LINE_H__
++#define __LINE_H__
++
++#include "linux/list.h"
++#include "linux/tqueue.h"
++#include "linux/tty.h"
++#include "asm/semaphore.h"
++#include "chan_user.h"
++#include "mconsole_kern.h"
++
++struct line_driver {
++      char *name;
++      char *devfs_name;
++      short major;
++      short minor_start;
++      short type;
++      short subtype;
++      int read_irq;
++      char *read_irq_name;
++      int write_irq;
++      char *write_irq_name;
++      char *symlink_from;
++      char *symlink_to;
++      struct mc_device mc;
++};
++
++struct line {
++      char *init_str;
++      int init_pri;
++      struct list_head chan_list;
++      int valid;
++      int count;
++      struct tty_struct *tty;
++      struct semaphore sem;
++      char *buffer;
++      char *head;
++      char *tail;
++      int sigio;
++      struct tq_struct task;
++      struct line_driver *driver;
++      int have_irq;
++};
++
++#define LINE_INIT(str, d) \
++      { init_str :    str, \
++        init_pri :    INIT_STATIC, \
++        chan_list :   { }, \
++        valid :       1, \
++        count :       0, \
++        tty :         NULL, \
++        sem :         { }, \
++        buffer :      NULL, \
++        head :        NULL, \
++        tail :        NULL, \
++        sigio :       0, \
++        driver :      d, \
++          have_irq :  0 }
++
++struct lines {
++      int num;
++};
++
++#define LINES_INIT(n) {  num :                n }
++
++extern void line_interrupt(int irq, void *data, struct pt_regs *unused);
++extern void line_write_interrupt(int irq, void *data, struct pt_regs *unused);
++extern void line_close(struct line *lines, struct tty_struct *tty);
++extern int line_open(struct line *lines, struct tty_struct *tty, 
++                   struct chan_opts *opts);
++extern int line_setup(struct line *lines, int num, char *init, 
++                    int all_allowed);
++extern int line_write(struct line *line, struct tty_struct *tty, int from_user,
++                    const char *buf, int len);
++extern int line_write_room(struct tty_struct *tty);
++extern char *add_xterm_umid(char *base);
++extern int line_setup_irq(int fd, int input, int output, void *data);
++extern void line_close_chan(struct line *line);
++extern void line_disable(struct line *line, int current_irq);
++extern void line_register_devfs(struct lines *set, 
++                              struct line_driver *line_driver, 
++                              struct tty_driver *driver, struct line *lines,
++                              int nlines);
++extern void lines_init(struct line *lines, int nlines);
++extern void close_lines(struct line *lines, int nlines);
++extern int line_config(struct line *lines, int num, char *str);
++extern int line_remove(struct line *lines, int num, char *str);
++extern int line_get_config(char *dev, struct line *lines, int num, char *str, 
++                         int size, char **error_out);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/mconsole.h um/arch/um/include/mconsole.h
+--- orig/arch/um/include/mconsole.h    Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/mconsole.h      Fri Jan 17 13:48:25 2003
+@@ -0,0 +1,99 @@
++/*
++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org)
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __MCONSOLE_H__
++#define __MCONSOLE_H__
++
++#ifndef __KERNEL__
++#include <stdint.h>
++#define u32 uint32_t
++#endif
++
++#define MCONSOLE_MAGIC (0xcafebabe)
++#define MCONSOLE_MAX_DATA (512)
++#define MCONSOLE_VERSION 2
++
++struct mconsole_request {
++      u32 magic;
++      u32 version;
++      u32 len;
++      char data[MCONSOLE_MAX_DATA];
++};
++
++struct mconsole_reply {
++      u32 err;
++      u32 more;
++      u32 len;
++      char data[MCONSOLE_MAX_DATA];
++};
++
++struct mconsole_notify {
++      u32 magic;
++      u32 version;    
++      enum { MCONSOLE_SOCKET, MCONSOLE_PANIC, MCONSOLE_HANG,
++             MCONSOLE_USER_NOTIFY } type;
++      u32 len;
++      char data[MCONSOLE_MAX_DATA];
++};
++
++struct mc_request;
++
++struct mconsole_command
++{
++      char *command;
++      void (*handler)(struct mc_request *req);
++      int as_interrupt;
++};
++
++struct mc_request
++{
++      int len;
++      int as_interrupt;
++
++      int originating_fd;
++      int originlen;
++      unsigned char origin[128];                      /* sockaddr_un */
++
++      struct mconsole_request request;
++      struct mconsole_command *cmd;
++};
++
++extern char mconsole_socket_name[];
++
++extern int mconsole_unlink_socket(void);
++extern int mconsole_reply(struct mc_request *req, char *reply, int err,
++                        int more);
++
++extern void mconsole_version(struct mc_request *req);
++extern void mconsole_help(struct mc_request *req);
++extern void mconsole_halt(struct mc_request *req);
++extern void mconsole_reboot(struct mc_request *req);
++extern void mconsole_config(struct mc_request *req);
++extern void mconsole_remove(struct mc_request *req);
++extern void mconsole_sysrq(struct mc_request *req);
++extern void mconsole_cad(struct mc_request *req);
++extern void mconsole_stop(struct mc_request *req);
++extern void mconsole_go(struct mc_request *req);
++
++extern int mconsole_get_request(int fd, struct mc_request *req);
++extern int mconsole_notify(char *sock_name, int type, const void *data, 
++                         int len);
++extern char *mconsole_notify_socket(void);
++extern void lock_notify(void);
++extern void unlock_notify(void);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
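Besides answering requests, a UML instance can push unsolicited messages out through the notify socket using mconsole_notify() and the MCONSOLE_* type codes above. A minimal sketch, assuming a hypothetical helper name and an arbitrary caller-supplied message:

    #include "mconsole.h"

    /* Send a free-form MCONSOLE_USER_NOTIFY message to whatever is driving
     * this UML instance over the management-console notify socket. */
    static int notify_driver(char *msg, int len)
    {
            return mconsole_notify(mconsole_notify_socket(), MCONSOLE_USER_NOTIFY,
                                   msg, len);
    }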
+diff -Naur -X ../exclude-files orig/arch/um/include/mconsole_kern.h um/arch/um/include/mconsole_kern.h
+--- orig/arch/um/include/mconsole_kern.h       Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/mconsole_kern.h Fri Nov 15 15:21:58 2002
+@@ -0,0 +1,62 @@
++/*
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __MCONSOLE_KERN_H__
++#define __MCONSOLE_KERN_H__
++
++#include "linux/config.h"
++#include "linux/list.h"
++#include "mconsole.h"
++
++struct mconsole_entry {
++      struct list_head list;
++      struct mc_request request;
++};
++
++struct mc_device {
++      struct list_head list;
++      char *name;
++      int (*config)(char *);
++      int (*get_config)(char *, char *, int, char **);
++      int (*remove)(char *);
++};
++
++#define CONFIG_CHUNK(str, size, current, chunk, end) \
++do { \
++      current += strlen(chunk); \
++      if(current >= size) \
++              str = NULL; \
++      if(str != NULL){ \
++              strcpy(str, chunk); \
++              str += strlen(chunk); \
++      } \
++      if(end) \
++              current++; \
++} while(0)
++
++#ifdef CONFIG_MCONSOLE
++
++extern void mconsole_register_dev(struct mc_device *new);
++
++#else
++
++static inline void mconsole_register_dev(struct mc_device *new)
++{
++}
++
++#endif
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
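A driver makes itself visible to the management console by filling in an mc_device and handing it to mconsole_register_dev(); its get_config() handler then uses CONFIG_CHUNK to append its piece of the reported configuration. A minimal sketch with hypothetical handlers (the parameter order of get_config follows the prototype above, but the argument semantics are an assumption here, not spelled out in this header):

    #include "linux/init.h"
    #include "linux/string.h"
    #include "mconsole_kern.h"

    static int example_config(char *str)
    {
            return 0;                       /* accept and ignore the request */
    }

    static int example_remove(char *str)
    {
            return 0;
    }

    static int example_get_config(char *name, char *str, int size,
                                  char **error_out)
    {
            int n = 0;

            /* CONFIG_CHUNK copies "example" into str if it still fits and
             * advances n, the running length of the assembled string. */
            CONFIG_CHUNK(str, size, n, "example", 1);
            return n;
    }

    static struct mc_device example_mc = {
            .name           = "example",
            .config         = example_config,
            .get_config     = example_get_config,
            .remove         = example_remove,
    };

    static int __init example_mc_init(void)
    {
            mconsole_register_dev(&example_mc);  /* no-op when CONFIG_MCONSOLE is off */
            return 0;
    }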
+diff -Naur -X ../exclude-files orig/arch/um/include/mem.h um/arch/um/include/mem.h
+--- orig/arch/um/include/mem.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/mem.h   Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,29 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __MEM_H__
++#define __MEM_H__
++
++struct vm_reserved {
++      struct list_head list;
++      unsigned long start;
++      unsigned long end;
++};
++
++extern void set_usable_vm(unsigned long start, unsigned long end);
++extern void set_kmem_end(unsigned long new);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/mem_user.h um/arch/um/include/mem_user.h
+--- orig/arch/um/include/mem_user.h    Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/mem_user.h      Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,87 @@
++/*
++ * arch/um/include/mem_user.h
++ *
++ * BRIEF MODULE DESCRIPTION
++ * user-side memory interface for supporting IO memory inside user mode linux
++ *
++ * Copyright (C) 2001 RidgeRun, Inc.
++ * Author: RidgeRun, Inc.
++ *         Greg Lonnon glonnon@ridgerun.com or info@ridgerun.com
++ *
++ *  This program is free software; you can redistribute  it and/or modify it
++ *  under  the terms of  the GNU General  Public License as published by the
++ *  Free Software Foundation;  either version 2 of the  License, or (at your
++ *  option) any later version.
++ *
++ *  THIS  SOFTWARE  IS PROVIDED   ``AS  IS'' AND   ANY  EXPRESS OR IMPLIED
++ *  WARRANTIES,   INCLUDING, BUT NOT  LIMITED  TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN
++ *  NO  EVENT  SHALL   THE AUTHOR  BE    LIABLE FOR ANY   DIRECT, INDIRECT,
++ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
++ *  NOT LIMITED   TO, PROCUREMENT OF  SUBSTITUTE GOODS  OR SERVICES; LOSS OF
++ *  USE, DATA,  OR PROFITS; OR  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
++ *  ANY THEORY OF LIABILITY, WHETHER IN  CONTRACT, STRICT LIABILITY, OR TORT
++ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
++ *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ *  You should have received a copy of the  GNU General Public License along
++ *  with this program; if not, write  to the Free Software Foundation, Inc.,
++ *  675 Mass Ave, Cambridge, MA 02139, USA.
++ */
++
++#ifndef _MEM_USER_H
++#define _MEM_USER_H
++
++struct mem_region {
++      char *driver;
++      unsigned long start_pfn;
++      unsigned long start;
++      unsigned long len;
++      void *mem_map;
++      int fd;
++};
++
++extern struct mem_region *regions[];
++extern struct mem_region physmem_region;
++
++#define ROUND_4M(n) ((((unsigned long) (n)) + (1 << 22)) & ~((1 << 22) - 1))
++
++extern unsigned long host_task_size;
++extern unsigned long task_size;
++
++extern int init_mem_user(void);
++extern int create_mem_file(unsigned long len);
++extern void setup_range(int fd, char *driver, unsigned long start,
++                      unsigned long pfn, unsigned long total, int need_vm, 
++                      struct mem_region *region, void *reserved);
++extern void setup_memory(void *entry);
++extern unsigned long find_iomem(char *driver, unsigned long *len_out);
++extern int init_maps(struct mem_region *region);
++extern int nregions(void);
++extern int reserve_vm(unsigned long start, unsigned long end, void *e);
++extern unsigned long get_vm(unsigned long len);
++extern void setup_physmem(unsigned long start, unsigned long usable,
++                        unsigned long len);
++extern int setup_region(struct mem_region *region, void *entry);
++extern void add_iomem(char *name, int fd, unsigned long size);
++extern struct mem_region *phys_region(unsigned long phys);
++extern unsigned long phys_offset(unsigned long phys);
++extern void unmap_physmem(void);
++extern int map_memory(unsigned long virt, unsigned long phys, 
++                    unsigned long len, int r, int w, int x);
++extern int protect_memory(unsigned long addr, unsigned long len, 
++                        int r, int w, int x, int must_succeed);
++extern unsigned long get_kmem_end(void);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/mode.h um/arch/um/include/mode.h
+--- orig/arch/um/include/mode.h        Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/mode.h  Fri Jan 17 13:23:32 2003
+@@ -0,0 +1,30 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __MODE_H__
++#define __MODE_H__
++
++#include "uml-config.h"
++
++#ifdef UML_CONFIG_MODE_TT
++#include "../kernel/tt/include/mode.h"
++#endif
++
++#ifdef UML_CONFIG_MODE_SKAS
++#include "../kernel/skas/include/mode.h"
++#endif
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/mode_kern.h um/arch/um/include/mode_kern.h
+--- orig/arch/um/include/mode_kern.h   Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/mode_kern.h     Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,30 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __MODE_KERN_H__
++#define __MODE_KERN_H__
++
++#include "linux/config.h"
++
++#ifdef CONFIG_MODE_TT
++#include "../kernel/tt/include/mode_kern.h"
++#endif
++
++#ifdef CONFIG_MODE_SKAS
++#include "../kernel/skas/include/mode_kern.h"
++#endif
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/net_kern.h um/arch/um/include/net_kern.h
+--- orig/arch/um/include/net_kern.h    Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/net_kern.h      Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,81 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_NET_KERN_H
++#define __UM_NET_KERN_H
++
++#include "linux/netdevice.h"
++#include "linux/skbuff.h"
++#include "linux/socket.h"
++#include "linux/list.h"
++
++struct uml_net {
++      struct list_head list;
++      struct net_device *dev;
++      int index;
++      unsigned char mac[ETH_ALEN];
++      int have_mac;
++};
++
++struct uml_net_private {
++      struct list_head list;
++      spinlock_t lock;
++      struct net_device *dev;
++      struct timer_list tl;
++      struct net_device_stats stats;
++      int fd;
++      unsigned char mac[ETH_ALEN];
++      int have_mac;
++      unsigned short (*protocol)(struct sk_buff *);
++      int (*open)(void *);
++      void (*close)(int, void *);
++      void (*remove)(void *);
++      int (*read)(int, struct sk_buff **skb, struct uml_net_private *);
++      int (*write)(int, struct sk_buff **skb, struct uml_net_private *);
++      
++      void (*add_address)(unsigned char *, unsigned char *, void *);
++      void (*delete_address)(unsigned char *, unsigned char *, void *);
++      int (*set_mtu)(int mtu, void *);
++      int user[1];
++};
++
++struct net_kern_info {
++      void (*init)(struct net_device *, void *);
++      unsigned short (*protocol)(struct sk_buff *);
++      int (*read)(int, struct sk_buff **skb, struct uml_net_private *);
++      int (*write)(int, struct sk_buff **skb, struct uml_net_private *);
++};
++
++struct transport {
++      struct list_head list;
++      char *name;
++      int (*setup)(char *, char **, void *);
++      struct net_user_info *user;
++      struct net_kern_info *kern;
++      int private_size;
++      int setup_size;
++};
++
++extern struct net_device *ether_init(int);
++extern unsigned short ether_protocol(struct sk_buff *);
++extern int setup_etheraddr(char *str, unsigned char *addr);
++extern struct sk_buff *ether_adjust_skb(struct sk_buff *skb, int extra);
++extern int tap_setup_common(char *str, char *type, char **dev_name, 
++                          char **mac_out, char **gate_addr);
++extern void register_transport(struct transport *new);
++extern unsigned short eth_protocol(struct sk_buff *skb);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/net_user.h um/arch/um/include/net_user.h
+--- orig/arch/um/include/net_user.h    Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/net_user.h      Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,66 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_NET_USER_H__
++#define __UM_NET_USER_H__
++
++#define ETH_ADDR_LEN (6)
++#define ETH_HEADER_ETHERTAP (16)
++#define ETH_HEADER_OTHER (14)
++#define ETH_MAX_PACKET (1500)
++
++#define UML_NET_VERSION (4)
++
++struct net_user_info {
++      void (*init)(void *, void *);
++      int (*open)(void *);
++      void (*close)(int, void *);
++      void (*remove)(void *);
++      int (*set_mtu)(int mtu, void *);
++      void (*add_address)(unsigned char *, unsigned char *, void *);
++      void (*delete_address)(unsigned char *, unsigned char *, void *);
++      int max_packet;
++};
++
++extern void ether_user_init(void *data, void *dev);
++extern void dev_ip_addr(void *d, char *buf, char *bin_buf);
++extern void set_ether_mac(void *d, unsigned char *addr);
++extern void iter_addresses(void *d, void (*cb)(unsigned char *, 
++                                             unsigned char *, void *), 
++                         void *arg);
++
++extern void *get_output_buffer(int *len_out);
++extern void free_output_buffer(void *buffer);
++
++extern int tap_open_common(void *dev, char *gate_addr);
++extern void tap_check_ips(char *gate_addr, char *eth_addr);
++
++extern void read_output(int fd, char *output_out, int len);
++
++extern int net_read(int fd, void *buf, int len);
++extern int net_recvfrom(int fd, void *buf, int len);
++extern int net_write(int fd, void *buf, int len);
++extern int net_send(int fd, void *buf, int len);
++extern int net_sendto(int fd, void *buf, int len, void *to, int sock_len);
++
++extern void open_addr(unsigned char *addr, unsigned char *netmask, void *arg);
++extern void close_addr(unsigned char *addr, unsigned char *netmask, void *arg);
++
++extern char *split_if_spec(char *str, ...);
++
++extern int dev_netmask(void *d, void *m);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/os.h um/arch/um/include/os.h
+--- orig/arch/um/include/os.h  Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/os.h    Tue Feb  4 19:11:32 2003
+@@ -0,0 +1,137 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __OS_H__
++#define __OS_H__
++
++#include "asm/types.h"
++#include "../os/include/file.h"
++
++#define OS_TYPE_FILE 1 
++#define OS_TYPE_DIR 2 
++#define OS_TYPE_SYMLINK 3 
++#define OS_TYPE_CHARDEV 4
++#define OS_TYPE_BLOCKDEV 5
++#define OS_TYPE_FIFO 6
++#define OS_TYPE_SOCK 7
++
++struct openflags {
++      unsigned int r : 1;
++      unsigned int w : 1;
++      unsigned int s : 1;     /* O_SYNC */
++      unsigned int c : 1;     /* O_CREAT */
++      unsigned int t : 1;     /* O_TRUNC */
++      unsigned int a : 1;     /* O_APPEND */
++      unsigned int e : 1;     /* O_EXCL */
++      unsigned int cl : 1;    /* FD_CLOEXEC */
++};
++
++#define OPENFLAGS() ((struct openflags) { .r = 0, .w = 0, .s = 0, .c = 0, \
++                                        .t = 0, .a = 0, .e = 0, .cl = 0 })
++
++static inline struct openflags of_read(struct openflags flags)
++{
++      flags.r = 1; 
++      return(flags);
++}
++
++static inline struct openflags of_write(struct openflags flags)
++{
++      flags.w = 1; 
++      return(flags); 
++}
++
++static inline struct openflags of_rdwr(struct openflags flags)
++{
++      return(of_read(of_write(flags)));
++}
++
++static inline struct openflags of_set_rw(struct openflags flags, int r, int w)
++{
++      flags.r = r;
++      flags.w = w;
++      return(flags);
++}
++
++static inline struct openflags of_sync(struct openflags flags)
++{ 
++      flags.s = 1; 
++      return(flags); 
++}
++
++static inline struct openflags of_create(struct openflags flags)
++{ 
++      flags.c = 1; 
++      return(flags); 
++}
++ 
++static inline struct openflags of_trunc(struct openflags flags)
++{ 
++      flags.t = 1; 
++      return(flags); 
++}
++ 
++static inline struct openflags of_append(struct openflags flags)
++{ 
++      flags.a = 1; 
++      return(flags); 
++}
++ 
++static inline struct openflags of_excl(struct openflags flags)
++{ 
++      flags.e = 1; 
++      return(flags); 
++}
++
++static inline struct openflags of_cloexec(struct openflags flags)
++{ 
++      flags.cl = 1; 
++      return(flags); 
++}
++ 
++extern int os_seek_file(int fd, __u64 offset);
++extern int os_open_file(char *file, struct openflags flags, int mode);
++extern int os_read_file(int fd, void *buf, int len);
++extern int os_write_file(int fd, void *buf, int count);
++extern int os_file_size(char *file, long long *size_out);
++extern int os_pipe(int *fd, int stream, int close_on_exec);
++extern int os_set_fd_async(int fd, int owner);
++extern int os_set_fd_block(int fd, int blocking);
++extern int os_accept_connection(int fd);
++extern int os_shutdown_socket(int fd, int r, int w);
++extern void os_close_file(int fd);
++extern int os_rcv_fd(int fd, int *helper_pid_out);
++extern int create_unix_socket(char *file, int len);
++extern int os_connect_socket(char *name);
++extern int os_file_type(char *file);
++extern int os_file_mode(char *file, struct openflags *mode_out);
++extern int os_lock_file(int fd, int excl);
++
++extern unsigned long os_process_pc(int pid);
++extern int os_process_parent(int pid);
++extern void os_stop_process(int pid);
++extern void os_kill_process(int pid, int reap_child);
++extern void os_usr1_process(int pid);
++extern int os_getpid(void);
++
++extern int os_map_memory(void *virt, int fd, unsigned long off, 
++                       unsigned long len, int r, int w, int x);
++extern int os_protect_memory(void *addr, unsigned long len, 
++                           int r, int w, int x);
++extern int os_unmap_memory(void *addr, int len);
++extern void os_flush_stdout(void);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
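The openflags helpers above are meant to be chained: start from OPENFLAGS(), set the bits you need, then hand the result to os_open_file(). A minimal sketch of a caller, assuming the usual negative-return error convention of these wrappers (the function name and buffer handling are illustrative only):

    #include "os.h"

    /* Read up to len bytes of a host file into buf; returns the byte count
     * from os_read_file() or the negative error from os_open_file(). */
    static int slurp_host_file(char *path, char *buf, int len)
    {
            struct openflags flags = of_rdwr(OPENFLAGS());
            int fd, n;

            fd = os_open_file(path, flags, 0);
            if(fd < 0)
                    return(fd);

            n = os_read_file(fd, buf, len);
            os_close_file(fd);
            return(n);
    }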
+diff -Naur -X ../exclude-files orig/arch/um/include/process.h um/arch/um/include/process.h
+--- orig/arch/um/include/process.h     Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/process.h       Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,25 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __PROCESS_H__
++#define __PROCESS_H__
++
++#include <asm/sigcontext.h>
++
++extern void sig_handler(int sig, struct sigcontext sc);
++extern void alarm_handler(int sig, struct sigcontext sc);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/ptrace_user.h um/arch/um/include/ptrace_user.h
+--- orig/arch/um/include/ptrace_user.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/ptrace_user.h   Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,18 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __PTRACE_USER_H__
++#define __PTRACE_USER_H__
++
++#include "sysdep/ptrace_user.h"
++
++extern int ptrace_getregs(long pid, unsigned long *regs_out);
++extern int ptrace_setregs(long pid, unsigned long *regs_in);
++extern int ptrace_getfpregs(long pid, unsigned long *regs_out);
++extern void arch_enter_kernel(void *task, int pid);
++extern void arch_leave_kernel(void *task, int pid);
++extern void ptrace_pokeuser(unsigned long addr, unsigned long data);
++
++#endif
+diff -Naur -X ../exclude-files orig/arch/um/include/sigcontext.h um/arch/um/include/sigcontext.h
+--- orig/arch/um/include/sigcontext.h  Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sigcontext.h    Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,25 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UML_SIGCONTEXT_H__
++#define __UML_SIGCONTEXT_H__
++
++#include "sysdep/sigcontext.h"
++
++extern int sc_size(void *data);
++extern void sc_to_sc(void *to_ptr, void *from_ptr);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/sigio.h um/arch/um/include/sigio.h
+--- orig/arch/um/include/sigio.h       Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sigio.h Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,28 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SIGIO_H__
++#define __SIGIO_H__
++
++extern int write_sigio_irq(int fd);
++extern int register_sigio_fd(int fd);
++extern int read_sigio_fd(int fd);
++extern int add_sigio_fd(int fd, int read);
++extern int ignore_sigio_fd(int fd);
++extern void sigio_lock(void);
++extern void sigio_unlock(void);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/signal_kern.h um/arch/um/include/signal_kern.h
+--- orig/arch/um/include/signal_kern.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/signal_kern.h   Thu Dec  5 18:08:47 2002
+@@ -0,0 +1,22 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SIGNAL_KERN_H__
++#define __SIGNAL_KERN_H__
++
++extern int have_signals(void *t);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/signal_user.h um/arch/um/include/signal_user.h
+--- orig/arch/um/include/signal_user.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/signal_user.h   Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,26 @@
++/* 
++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SIGNAL_USER_H__
++#define __SIGNAL_USER_H__
++
++extern int signal_stack_size;
++
++extern int change_sig(int signal, int on);
++extern void set_sigstack(void *stack, int size);
++extern void set_handler(int sig, void (*handler)(int), int flags, ...);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/skas_ptrace.h um/arch/um/include/skas_ptrace.h
+--- orig/arch/um/include/skas_ptrace.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/skas_ptrace.h   Mon Dec 16 11:54:52 2002
+@@ -0,0 +1,36 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SKAS_PTRACE_H
++#define __SKAS_PTRACE_H
++
++struct ptrace_faultinfo {
++      int is_write;
++      unsigned long addr;
++};
++
++struct ptrace_ldt {
++      int func;
++      void *ptr;
++      unsigned long bytecount;
++};
++
++#define PTRACE_FAULTINFO 52
++#define PTRACE_SIGPENDING 53
++#define PTRACE_LDT 54
++#define PTRACE_SWITCH_MM 55
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
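
The struct ptrace_faultinfo and the four request numbers above are the user-visible half of the skas host extension; the host kernel is expected to answer them through ptrace().  A sketch of the assumed tracer-side usage, not part of this patch and only meaningful on a host carrying the matching kernel change:

#include <sys/ptrace.h>

/* Ask the (patched) host for the fault information of a stopped, traced
 * child.  The kernel presumably fills in *fi with the faulting address and
 * whether the access was a write, matching struct ptrace_faultinfo above.
 */
static int get_faultinfo(int pid, struct ptrace_faultinfo *fi)
{
	if(ptrace(PTRACE_FAULTINFO, pid, 0, fi) < 0)
		return(-1);
	return(0);
}
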
+diff -Naur -X ../exclude-files orig/arch/um/include/syscall_user.h um/arch/um/include/syscall_user.h
+--- orig/arch/um/include/syscall_user.h        Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/syscall_user.h  Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,23 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SYSCALL_USER_H
++#define __SYSCALL_USER_H
++
++extern int record_syscall_start(int syscall);
++extern void record_syscall_end(int index, int result);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-i386/checksum.h um/arch/um/include/sysdep-i386/checksum.h
+--- orig/arch/um/include/sysdep-i386/checksum.h        Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sysdep-i386/checksum.h  Tue Oct 29 21:23:02 2002
+@@ -0,0 +1,217 @@
++/* 
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_SYSDEP_CHECKSUM_H
++#define __UM_SYSDEP_CHECKSUM_H
++
++#include "linux/string.h"
++
++/*
++ * computes the checksum of a memory block at buff, length len,
++ * and adds in "sum" (32-bit)
++ *
++ * returns a 32-bit number suitable for feeding into itself
++ * or csum_tcpudp_magic
++ *
++ * this function must be called with even lengths, except
++ * for the last fragment, which may be odd
++ *
++ * it's best to have buff aligned on a 32-bit boundary
++ */
++unsigned int csum_partial(const unsigned char * buff, int len, 
++                        unsigned int sum);
++
++/*
++ * the same as csum_partial, but copies from src while it
++ * checksums, and handles user-space pointer exceptions correctly, when needed.
++ *
++ * here it is even more important to align src and dst on a 32-bit (or,
++ * better yet, a 64-bit) boundary
++ */
++
++unsigned int csum_partial_copy_to(const char *src, char *dst, int len, 
++                                int sum, int *err_ptr);
++unsigned int csum_partial_copy_from(const char *src, char *dst, int len, 
++                                  int sum, int *err_ptr);
++
++/*
++ *    Note: when you get a NULL pointer exception here this means someone
++ *    passed in an incorrect kernel address to one of these functions.
++ *
++ *    If you use these functions directly please don't forget the
++ *    verify_area().
++ */
++
++static __inline__
++unsigned int csum_partial_copy_nocheck(const char *src, char *dst,
++                                     int len, int sum)
++{
++      memcpy(dst, src, len);
++      return(csum_partial(dst, len, sum));
++}
++
++static __inline__
++unsigned int csum_partial_copy_from_user(const char *src, char *dst,
++                                       int len, int sum, int *err_ptr)
++{
++      return csum_partial_copy_from(src, dst, len, sum, err_ptr);
++}
++
++/*
++ * These are the old (and unsafe) way of doing checksums; a warning message
++ * will be printed if they are used and an exception occurs.
++ *
++ * these functions should go away after some time.
++ */
++
++#define csum_partial_copy_fromuser csum_partial_copy_from_user
++unsigned int csum_partial_copy( const char *src, char *dst, int len, int sum);
++
++/*
++ *    This is a version of ip_compute_csum() optimized for IP headers,
++ *    which always checksum on 4 octet boundaries.
++ *
++ *    By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
++ *    Arnt Gulbrandsen.
++ */
++static inline unsigned short ip_fast_csum(unsigned char * iph,
++                                        unsigned int ihl)
++{
++      unsigned int sum;
++
++      __asm__ __volatile__(
++          "movl (%1), %0      ;\n"
++          "subl $4, %2        ;\n"
++          "jbe 2f             ;\n"
++          "addl 4(%1), %0     ;\n"
++          "adcl 8(%1), %0     ;\n"
++          "adcl 12(%1), %0    ;\n"
++"1:       adcl 16(%1), %0     ;\n"
++          "lea 4(%1), %1      ;\n"
++          "decl %2            ;\n"
++          "jne 1b             ;\n"
++          "adcl $0, %0        ;\n"
++          "movl %0, %2        ;\n"
++          "shrl $16, %0       ;\n"
++          "addw %w2, %w0      ;\n"
++          "adcl $0, %0        ;\n"
++          "notl %0            ;\n"
++"2:                           ;\n"
++      /* Since the input registers which are loaded with iph and ihl
++         are modified, we must also specify them as outputs, or gcc
++         will assume they contain their original values. */
++      : "=r" (sum), "=r" (iph), "=r" (ihl)
++      : "1" (iph), "2" (ihl));
++      return(sum);
++}
++
++/*
++ *    Fold a partial checksum
++ */
++
++static inline unsigned int csum_fold(unsigned int sum)
++{
++      __asm__(
++              "addl %1, %0            ;\n"
++              "adcl $0xffff, %0       ;\n"
++              : "=r" (sum)
++              : "r" (sum << 16), "0" (sum & 0xffff0000)
++      );
++      return (~sum) >> 16;
++}
++
++static inline unsigned long csum_tcpudp_nofold(unsigned long saddr,
++                                                 unsigned long daddr,
++                                                 unsigned short len,
++                                                 unsigned short proto,
++                                                 unsigned int sum)
++{
++    __asm__(
++      "addl %1, %0    ;\n"
++      "adcl %2, %0    ;\n"
++      "adcl %3, %0    ;\n"
++      "adcl $0, %0    ;\n"
++      : "=r" (sum)
++      : "g" (daddr), "g"(saddr), "g"((ntohs(len)<<16)+proto*256), "0"(sum));
++    return sum;
++}
++
++/*
++ * computes the checksum of the TCP/UDP pseudo-header
++ * returns a 16-bit checksum, already complemented
++ */
++static inline unsigned short int csum_tcpudp_magic(unsigned long saddr,
++                                                 unsigned long daddr,
++                                                 unsigned short len,
++                                                 unsigned short proto,
++                                                 unsigned int sum)
++{
++      return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
++}
++
++/*
++ * this routine is used for miscellaneous IP-like checksums, mainly
++ * in icmp.c
++ */
++
++static inline unsigned short ip_compute_csum(unsigned char * buff, int len)
++{
++    return csum_fold (csum_partial(buff, len, 0));
++}
++
++#define _HAVE_ARCH_IPV6_CSUM
++static __inline__ unsigned short int csum_ipv6_magic(struct in6_addr *saddr,
++                                                   struct in6_addr *daddr,
++                                                   __u32 len,
++                                                   unsigned short proto,
++                                                   unsigned int sum)
++{
++      __asm__(
++              "addl 0(%1), %0         ;\n"
++              "adcl 4(%1), %0         ;\n"
++              "adcl 8(%1), %0         ;\n"
++              "adcl 12(%1), %0        ;\n"
++              "adcl 0(%2), %0         ;\n"
++              "adcl 4(%2), %0         ;\n"
++              "adcl 8(%2), %0         ;\n"
++              "adcl 12(%2), %0        ;\n"
++              "adcl %3, %0            ;\n"
++              "adcl %4, %0            ;\n"
++              "adcl $0, %0            ;\n"
++              : "=&r" (sum)
++              : "r" (saddr), "r" (daddr),
++                "r"(htonl(len)), "r"(htonl(proto)), "0"(sum));
++
++      return csum_fold(sum);
++}
++
++/*
++ *    Copy and checksum to user
++ */
++#define HAVE_CSUM_COPY_USER
++static __inline__ unsigned int csum_and_copy_to_user(const char *src, 
++                                                   char *dst, int len,
++                                                   int sum, int *err_ptr)
++{
++      if (access_ok(VERIFY_WRITE, dst, len))
++              return(csum_partial_copy_to(src, dst, len, sum, err_ptr));
++
++      if (len)
++              *err_ptr = -EFAULT;
++
++      return -1; /* invalid checksum */
++}
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
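
csum_fold() above does the usual Internet-checksum fold in two inline-assembly instructions; a plain-C equivalent may be easier to read.  This is only a reading aid, assuming ordinary RFC 1071 ones-complement arithmetic, not a replacement for the asm version:

/* Fold a 32-bit partial checksum into 16 bits with end-around carry,
 * then complement; this yields the same result csum_fold() computes above.
 */
static inline unsigned short csum_fold_c(unsigned int sum)
{
	sum = (sum & 0xffff) + (sum >> 16);	/* add high half into low half */
	sum += (sum >> 16);			/* fold in any carry from above */
	return((unsigned short) ~sum);
}
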
+diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-i386/frame.h um/arch/um/include/sysdep-i386/frame.h
+--- orig/arch/um/include/sysdep-i386/frame.h   Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sysdep-i386/frame.h     Fri Dec  6 14:07:54 2002
+@@ -0,0 +1,29 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __FRAME_I386_H
++#define __FRAME_I386_H
++
++struct arch_frame_data_raw {
++      unsigned long fp_start;
++      unsigned long sr;
++};
++
++struct arch_frame_data {
++      int fpstate_size;
++};
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-i386/frame_kern.h um/arch/um/include/sysdep-i386/frame_kern.h
+--- orig/arch/um/include/sysdep-i386/frame_kern.h      Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sysdep-i386/frame_kern.h        Mon Dec  2 21:45:04 2002
+@@ -0,0 +1,69 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __FRAME_KERN_I386_H
++#define __FRAME_KERN_I386_H
++
++/* This is called from sys_sigreturn.  It takes the sp at the point of the
++ * sigreturn system call and returns the address of the sigcontext struct
++ * on the stack.
++ */
++
++static inline void *sp_to_sc(unsigned long sp)
++{
++      return((void *) sp);
++}
++
++static inline void *sp_to_uc(unsigned long sp)
++{
++      unsigned long uc;
++
++      uc = sp + signal_frame_si.uc_index - 
++              signal_frame_si.common.sp_index - 4;
++      return((void *) uc);
++}
++
++static inline void *sp_to_rt_sc(unsigned long sp)
++{
++      unsigned long sc;
++
++      sc = sp - signal_frame_si.common.sp_index + 
++              signal_frame_si.common.len - 4;
++      return((void *) sc);
++}
++
++static inline void *sp_to_mask(unsigned long sp)
++{
++      unsigned long mask;
++
++      mask = sp - signal_frame_sc.common.sp_index + 
++              signal_frame_sc.common.len - 8;
++      return((void *) mask);
++}
++
++extern int sc_size(void *data);
++
++static inline void *sp_to_rt_mask(unsigned long sp)
++{
++      unsigned long mask;
++
++      mask = sp - signal_frame_si.common.sp_index + 
++              signal_frame_si.common.len + 
++              sc_size(&signal_frame_si.common.arch) - 4;
++      return((void *) mask);
++}
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-i386/frame_user.h um/arch/um/include/sysdep-i386/frame_user.h
+--- orig/arch/um/include/sysdep-i386/frame_user.h      Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sysdep-i386/frame_user.h        Fri Dec  6 14:13:59 2002
+@@ -0,0 +1,91 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __FRAME_USER_I386_H
++#define __FRAME_USER_I386_H
++
++#include <asm/page.h>
++#include "sysdep/frame.h"
++
++/* This stuff is to calculate the size of the fp state struct at runtime
++ * because it has changed between 2.2 and 2.4 and it would be good for a
++ * UML compiled on one to work on the other.
++ * So, setup_arch_frame_raw fills in the arch struct with the raw data, which
++ * just contains the address of the end of the sigcontext.  This is invoked
++ * from the signal handler.
++ * setup_arch_frame uses that data to figure out what 
++ * arch_frame_data.fpstate_size should be.  It really has no idea, since it's
++ * not allowed to do sizeof(struct fpstate), but it's safe to consider that it's
++ * everything from the end of the sigcontext up to the top of the stack.  So,
++ * it masks off the page number to get the offset within the page and subtracts
++ * that from the page size, and that's how big the fpstate struct will be
++ * considered to be.
++ */
++
++static inline void setup_arch_frame_raw(struct arch_frame_data_raw *data,
++                                      void *end, unsigned long srp)
++{
++      unsigned long sr = *((unsigned long *) srp);
++
++      data->fp_start = (unsigned long) end;
++      if((sr & PAGE_MASK) == ((unsigned long) end & PAGE_MASK))
++              data->sr = sr;
++      else data->sr = 0;
++}
++
++static inline void setup_arch_frame(struct arch_frame_data_raw *in, 
++                                  struct arch_frame_data *out)
++{
++      unsigned long fpstate_start = in->fp_start;
++
++      if(in->sr == 0){
++              fpstate_start &= ~PAGE_MASK;
++              out->fpstate_size = PAGE_SIZE - fpstate_start;
++      }
++      else {
++              out->fpstate_size = in->sr - fpstate_start;
++      }
++}
++
++/* This figures out where on the stack the SA_RESTORER function address
++ * is stored.  For i386, it's the signal handler return address, so it's
++ * located next to the frame pointer.
++ * This is inlined, so __builtin_frame_address(0) is correct.  Otherwise,
++ * it would have to be __builtin_frame_address(1).
++ */
++
++static inline unsigned long frame_restorer(void)
++{
++      unsigned long *fp;
++
++      fp = __builtin_frame_address(0);
++      return((unsigned long) (fp + 1));
++}
++
++/* Similarly, this returns the value of sp when the handler was first
++ * entered.  This is used to calculate the proper sp when delivering
++ * signals.
++ */
++
++static inline unsigned long frame_sp(void)
++{
++      unsigned long *fp;
++
++      fp = __builtin_frame_address(0);
++      return((unsigned long) (fp + 1));
++}
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
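
To make the setup_arch_frame() arithmetic concrete (illustrative numbers only): with 4K pages, ~PAGE_MASK is 0xfff, so if the sigcontext ends at 0xbfffe340 and in->sr is 0, the in-page offset is 0xbfffe340 & 0xfff = 0x340 and fpstate_size becomes 0x1000 - 0x340 = 0xcc0 (3264 bytes); in other words, the fp state is assumed to fill the rest of that stack page.  When in->sr does fall on the same page, the size is simply in->sr - in->fp_start.
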
+diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-i386/ptrace.h um/arch/um/include/sysdep-i386/ptrace.h
+--- orig/arch/um/include/sysdep-i386/ptrace.h  Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sysdep-i386/ptrace.h    Fri Jan 17 13:23:31 2003
+@@ -0,0 +1,193 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SYSDEP_I386_PTRACE_H
++#define __SYSDEP_I386_PTRACE_H
++
++#include "uml-config.h"
++
++#ifdef UML_CONFIG_MODE_TT
++#include "ptrace-tt.h"
++#endif
++
++#ifdef UML_CONFIG_MODE_SKAS
++#include "ptrace-skas.h"
++#endif
++
++#include "choose-mode.h"
++
++union uml_pt_regs {
++#ifdef UML_CONFIG_MODE_TT
++      struct tt_regs {
++              long syscall;
++              void *sc;
++      } tt;
++#endif
++#ifdef UML_CONFIG_MODE_SKAS
++      struct skas_regs {
++              unsigned long regs[HOST_FRAME_SIZE];
++              unsigned long fp[HOST_FP_SIZE];
++              unsigned long xfp[HOST_XFP_SIZE];
++              unsigned long fault_addr;
++              unsigned long fault_type;
++              unsigned long trap_type;
++              long syscall;
++              int is_user;
++      } skas;
++#endif
++};
++
++#define EMPTY_UML_PT_REGS { }
++
++extern int mode_tt;
++
++#define UPT_SC(r) ((r)->tt.sc)
++#define UPT_IP(r) \
++      CHOOSE_MODE(SC_IP(UPT_SC(r)), REGS_IP((r)->skas.regs))
++#define UPT_SP(r) \
++      CHOOSE_MODE(SC_SP(UPT_SC(r)), REGS_SP((r)->skas.regs))
++#define UPT_EFLAGS(r) \
++      CHOOSE_MODE(SC_EFLAGS(UPT_SC(r)), REGS_EFLAGS((r)->skas.regs))
++#define UPT_EAX(r) \
++      CHOOSE_MODE(SC_EAX(UPT_SC(r)), REGS_EAX((r)->skas.regs))
++#define UPT_EBX(r) \
++      CHOOSE_MODE(SC_EBX(UPT_SC(r)), REGS_EBX((r)->skas.regs))
++#define UPT_ECX(r) \
++      CHOOSE_MODE(SC_ECX(UPT_SC(r)), REGS_ECX((r)->skas.regs))
++#define UPT_EDX(r) \
++      CHOOSE_MODE(SC_EDX(UPT_SC(r)), REGS_EDX((r)->skas.regs))
++#define UPT_ESI(r) \
++      CHOOSE_MODE(SC_ESI(UPT_SC(r)), REGS_ESI((r)->skas.regs))
++#define UPT_EDI(r) \
++      CHOOSE_MODE(SC_EDI(UPT_SC(r)), REGS_EDI((r)->skas.regs))
++#define UPT_EBP(r) \
++      CHOOSE_MODE(SC_EBP(UPT_SC(r)), REGS_EBP((r)->skas.regs))
++#define UPT_ORIG_EAX(r) \
++      CHOOSE_MODE((r)->tt.syscall, (r)->skas.syscall)
++#define UPT_CS(r) \
++      CHOOSE_MODE(SC_CS(UPT_SC(r)), REGS_CS((r)->skas.regs))
++#define UPT_SS(r) \
++      CHOOSE_MODE(SC_SS(UPT_SC(r)), REGS_SS((r)->skas.regs))
++#define UPT_DS(r) \
++      CHOOSE_MODE(SC_DS(UPT_SC(r)), REGS_DS((r)->skas.regs))
++#define UPT_ES(r) \
++      CHOOSE_MODE(SC_ES(UPT_SC(r)), REGS_ES((r)->skas.regs))
++#define UPT_FS(r) \
++      CHOOSE_MODE(SC_FS(UPT_SC(r)), REGS_FS((r)->skas.regs))
++#define UPT_GS(r) \
++      CHOOSE_MODE(SC_GS(UPT_SC(r)), REGS_GS((r)->skas.regs))
++
++#define UPT_SYSCALL_ARG1(r) UPT_EBX(r)
++#define UPT_SYSCALL_ARG2(r) UPT_ECX(r)
++#define UPT_SYSCALL_ARG3(r) UPT_EDX(r)
++#define UPT_SYSCALL_ARG4(r) UPT_ESI(r)
++#define UPT_SYSCALL_ARG5(r) UPT_EDI(r)
++#define UPT_SYSCALL_ARG6(r) UPT_EBP(r)
++
++extern int user_context(unsigned long sp);
++
++#define UPT_IS_USER(r) \
++      CHOOSE_MODE(user_context(UPT_SP(r)), (r)->skas.is_user)
++
++struct syscall_args {
++      unsigned long args[6];
++};
++
++#define SYSCALL_ARGS(r) ((struct syscall_args) \
++                        { .args = { UPT_SYSCALL_ARG1(r), \
++                                    UPT_SYSCALL_ARG2(r), \
++                                  UPT_SYSCALL_ARG3(r), \
++                                    UPT_SYSCALL_ARG4(r), \
++                                  UPT_SYSCALL_ARG5(r), \
++                                    UPT_SYSCALL_ARG6(r) } } )
++
++#define UPT_REG(regs, reg) \
++      ({      unsigned long val; \
++              switch(reg){ \
++              case EIP: val = UPT_IP(regs); break; \
++              case UESP: val = UPT_SP(regs); break; \
++              case EAX: val = UPT_EAX(regs); break; \
++              case EBX: val = UPT_EBX(regs); break; \
++              case ECX: val = UPT_ECX(regs); break; \
++              case EDX: val = UPT_EDX(regs); break; \
++              case ESI: val = UPT_ESI(regs); break; \
++              case EDI: val = UPT_EDI(regs); break; \
++              case EBP: val = UPT_EBP(regs); break; \
++              case ORIG_EAX: val = UPT_ORIG_EAX(regs); break; \
++              case CS: val = UPT_CS(regs); break; \
++              case SS: val = UPT_SS(regs); break; \
++              case DS: val = UPT_DS(regs); break; \
++              case ES: val = UPT_ES(regs); break; \
++              case FS: val = UPT_FS(regs); break; \
++              case GS: val = UPT_GS(regs); break; \
++              case EFL: val = UPT_EFLAGS(regs); break; \
++              default :  \
++                      panic("Bad register in UPT_REG : %d\n", reg);  \
++                      val = -1; \
++              } \
++              val; \
++      })
++      
++
++#define UPT_SET(regs, reg, val) \
++      do { \
++              switch(reg){ \
++              case EIP: UPT_IP(regs) = val; break; \
++              case UESP: UPT_SP(regs) = val; break; \
++              case EAX: UPT_EAX(regs) = val; break; \
++              case EBX: UPT_EBX(regs) = val; break; \
++              case ECX: UPT_ECX(regs) = val; break; \
++              case EDX: UPT_EDX(regs) = val; break; \
++              case ESI: UPT_ESI(regs) = val; break; \
++              case EDI: UPT_EDI(regs) = val; break; \
++              case EBP: UPT_EBP(regs) = val; break; \
++              case ORIG_EAX: UPT_ORIG_EAX(regs) = val; break; \
++              case CS: UPT_CS(regs) = val; break; \
++              case SS: UPT_SS(regs) = val; break; \
++              case DS: UPT_DS(regs) = val; break; \
++              case ES: UPT_ES(regs) = val; break; \
++              case FS: UPT_FS(regs) = val; break; \
++              case GS: UPT_GS(regs) = val; break; \
++              case EFL: UPT_EFLAGS(regs) = val; break; \
++              default :  \
++                      panic("Bad register in UPT_SET : %d\n", reg);  \
++                      break; \
++              } \
++      } while (0)
++
++#define UPT_SET_SYSCALL_RETURN(r, res) \
++      CHOOSE_MODE(SC_SET_SYSCALL_RETURN(UPT_SC(r), (res)), \
++                    REGS_SET_SYSCALL_RETURN((r)->skas.regs, (res)))
++
++#define UPT_RESTART_SYSCALL(r) \
++      CHOOSE_MODE(SC_RESTART_SYSCALL(UPT_SC(r)), \
++                  REGS_RESTART_SYSCALL((r)->skas.regs))
++
++#define UPT_ORIG_SYSCALL(r) UPT_EAX(r)
++#define UPT_SYSCALL_NR(r) UPT_ORIG_EAX(r)
++#define UPT_SYSCALL_RET(r) UPT_EAX(r)
++
++#define UPT_SEGV_IS_FIXABLE(r) \
++      CHOOSE_MODE(SC_SEGV_IS_FIXABLE(UPT_SC(r)), \
++                    REGS_SEGV_IS_FIXABLE(&r->skas))
++
++#define UPT_FAULT_ADDR(r) \
++      CHOOSE_MODE(SC_FAULT_ADDR(UPT_SC(r)), REGS_FAULT_ADDR(&r->skas))
++
++#define UPT_FAULT_WRITE(r) \
++      CHOOSE_MODE(SC_FAULT_WRITE(UPT_SC(r)), REGS_FAULT_WRITE(&r->skas))
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
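
The UPT_*() accessors above hide whether a union uml_pt_regs holds a tt-mode sigcontext pointer or a skas-mode register array; CHOOSE_MODE() and the SC_*/REGS_* helpers come from the headers included at the top of the file.  A small, hypothetical usage sketch (printk() is the declaration from arch/um/include/user.h; the casts only pin the printf argument types):

static void print_syscall(union uml_pt_regs *regs)
{
	printk("syscall %ld(0x%lx, 0x%lx, 0x%lx) at ip 0x%lx\n",
	       (long) UPT_SYSCALL_NR(regs),
	       (unsigned long) UPT_SYSCALL_ARG1(regs),
	       (unsigned long) UPT_SYSCALL_ARG2(regs),
	       (unsigned long) UPT_SYSCALL_ARG3(regs),
	       (unsigned long) UPT_IP(regs));
}
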
+diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-i386/ptrace_user.h um/arch/um/include/sysdep-i386/ptrace_user.h
+--- orig/arch/um/include/sysdep-i386/ptrace_user.h     Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sysdep-i386/ptrace_user.h       Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,62 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SYSDEP_I386_PTRACE_USER_H__
++#define __SYSDEP_I386_PTRACE_USER_H__
++
++#include <asm/ptrace.h>
++
++#define PT_OFFSET(r) ((r) * sizeof(long))
++
++#define PT_SYSCALL_NR(regs) ((regs)[ORIG_EAX])
++#define PT_SYSCALL_NR_OFFSET PT_OFFSET(ORIG_EAX)
++
++#define PT_SYSCALL_ARG1_OFFSET PT_OFFSET(EBX)
++#define PT_SYSCALL_ARG2_OFFSET PT_OFFSET(ECX)
++#define PT_SYSCALL_ARG3_OFFSET PT_OFFSET(EDX)
++#define PT_SYSCALL_ARG4_OFFSET PT_OFFSET(ESI)
++#define PT_SYSCALL_ARG5_OFFSET PT_OFFSET(EDI)
++
++#define PT_SYSCALL_RET_OFFSET PT_OFFSET(EAX)
++
++#define PT_IP_OFFSET PT_OFFSET(EIP)
++#define PT_IP(regs) ((regs)[EIP])
++#define PT_SP(regs) ((regs)[UESP])
++
++#ifndef FRAME_SIZE
++#define FRAME_SIZE (17)
++#endif
++#define FRAME_SIZE_OFFSET (FRAME_SIZE * sizeof(unsigned long))
++
++#define FP_FRAME_SIZE (27)
++#define FPX_FRAME_SIZE (128)
++
++#ifdef PTRACE_GETREGS
++#define UM_HAVE_GETREGS
++#endif
++
++#ifdef PTRACE_SETREGS
++#define UM_HAVE_SETREGS
++#endif
++
++#ifdef PTRACE_GETFPREGS
++#define UM_HAVE_GETFPREGS
++#endif
++
++#ifdef PTRACE_SETFPREGS
++#define UM_HAVE_SETFPREGS
++#endif
++
++#ifdef PTRACE_GETFPXREGS
++#define UM_HAVE_GETFPXREGS
++#endif
++
++#ifdef PTRACE_SETFPXREGS
++#define UM_HAVE_SETFPXREGS
++#endif
++
++extern void update_debugregs(int seq);
++
++#endif
+diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-i386/sigcontext.h um/arch/um/include/sysdep-i386/sigcontext.h
+--- orig/arch/um/include/sysdep-i386/sigcontext.h      Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sysdep-i386/sigcontext.h        Sun Dec  8 18:21:33 2002
+@@ -0,0 +1,49 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SYS_SIGCONTEXT_I386_H
++#define __SYS_SIGCONTEXT_I386_H
++
++#include "sc.h"
++
++#define IP_RESTART_SYSCALL(ip) ((ip) -= 2)
++
++#define SC_RESTART_SYSCALL(sc) IP_RESTART_SYSCALL(SC_IP(sc))
++#define SC_SET_SYSCALL_RETURN(sc, result) SC_EAX(sc) = (result)
++
++#define SC_FAULT_ADDR(sc) SC_CR2(sc)
++#define SC_FAULT_TYPE(sc) SC_ERR(sc)
++
++#define FAULT_WRITE(err) (err & 2)
++#define TO_SC_ERR(is_write) ((is_write) ? 2 : 0)
++
++#define SC_FAULT_WRITE(sc) (FAULT_WRITE(SC_ERR(sc)))
++
++#define SC_TRAP_TYPE(sc) SC_TRAPNO(sc)
++
++/* ptrace expects that, at the start of a system call, %eax contains
++ * -ENOSYS, so this makes it so.
++ */
++#define SC_START_SYSCALL(sc) do SC_EAX(sc) = -ENOSYS; while(0)
++
++/* These are General Protection and Page Fault */
++#define SEGV_IS_FIXABLE(trap) ((trap == 13) || (trap == 14))
++
++#define SC_SEGV_IS_FIXABLE(sc) (SEGV_IS_FIXABLE(SC_TRAPNO(sc)))
++
++extern unsigned long *sc_sigmask(void *sc_ptr);
++extern int sc_get_fpregs(unsigned long buf, void *sc_ptr);
++
++#endif
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-i386/syscalls.h um/arch/um/include/sysdep-i386/syscalls.h
+--- orig/arch/um/include/sysdep-i386/syscalls.h        Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sysdep-i386/syscalls.h  Sun Dec  8 18:04:15 2002
+@@ -0,0 +1,61 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "asm/unistd.h"
++#include "sysdep/ptrace.h"
++
++typedef long syscall_handler_t(struct pt_regs);
++
++#define EXECUTE_SYSCALL(syscall, regs) \
++      ((long (*)(struct syscall_args)) (*sys_call_table[syscall]))(SYSCALL_ARGS(&regs->regs))
++
++extern syscall_handler_t sys_modify_ldt;
++extern syscall_handler_t old_mmap_i386;
++extern syscall_handler_t old_select;
++extern syscall_handler_t sys_ni_syscall;
++
++#define ARCH_SYSCALLS \
++      [ __NR_mmap ] = old_mmap_i386, \
++      [ __NR_select ] = old_select, \
++      [ __NR_vm86old ] = sys_ni_syscall, \
++        [ __NR_modify_ldt ] = sys_modify_ldt, \
++      [ __NR_lchown32 ] = sys_lchown, \
++      [ __NR_getuid32 ] = sys_getuid, \
++      [ __NR_getgid32 ] = sys_getgid, \
++      [ __NR_geteuid32 ] = sys_geteuid, \
++      [ __NR_getegid32 ] = sys_getegid, \
++      [ __NR_setreuid32 ] = sys_setreuid, \
++      [ __NR_setregid32 ] = sys_setregid, \
++      [ __NR_getgroups32 ] = sys_getgroups, \
++      [ __NR_setgroups32 ] = sys_setgroups, \
++      [ __NR_fchown32 ] = sys_fchown, \
++      [ __NR_setresuid32 ] = sys_setresuid, \
++      [ __NR_getresuid32 ] = sys_getresuid, \
++      [ __NR_setresgid32 ] = sys_setresgid, \
++      [ __NR_getresgid32 ] = sys_getresgid, \
++      [ __NR_chown32 ] = sys_chown, \
++      [ __NR_setuid32 ] = sys_setuid, \
++      [ __NR_setgid32 ] = sys_setgid, \
++      [ __NR_setfsuid32 ] = sys_setfsuid, \
++      [ __NR_setfsgid32 ] = sys_setfsgid, \
++      [ __NR_pivot_root ] = sys_pivot_root, \
++      [ __NR_mincore ] = sys_mincore, \
++      [ __NR_madvise ] = sys_madvise, \
++        [ 222 ] = sys_ni_syscall, 
++        
++/* 222 doesn't yet have a name in include/asm-i386/unistd.h */
++
++#define LAST_ARCH_SYSCALL 222
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
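
ARCH_SYSCALLS above relies on C designated array initializers, so the generic UML code (not in this hunk) can splice the i386-specific entries into one big table and let every slot it does not name default to sys_ni_syscall.  A sketch of that pattern, assuming the usual extern declarations for the generic sys_* handlers are in scope; the [a ... b] range form is a GCC extension, and a later initializer for the same index overrides an earlier one:

syscall_handler_t *sys_call_table[] = {
	[ 0 ... LAST_ARCH_SYSCALL ] = sys_ni_syscall,	/* default every slot */
	ARCH_SYSCALLS					/* i386 overrides from this header */
};
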
+diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-ia64/ptrace.h um/arch/um/include/sysdep-ia64/ptrace.h
+--- orig/arch/um/include/sysdep-ia64/ptrace.h  Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sysdep-ia64/ptrace.h    Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,26 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SYSDEP_IA64_PTRACE_H
++#define __SYSDEP_IA64_PTRACE_H
++
++struct sys_pt_regs {
++  int foo;
++};
++
++#define EMPTY_REGS { 0 }
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-ia64/sigcontext.h um/arch/um/include/sysdep-ia64/sigcontext.h
+--- orig/arch/um/include/sysdep-ia64/sigcontext.h      Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sysdep-ia64/sigcontext.h        Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,20 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SYSDEP_IA64_SIGCONTEXT_H
++#define __SYSDEP_IA64_SIGCONTEXT_H
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-ia64/syscalls.h um/arch/um/include/sysdep-ia64/syscalls.h
+--- orig/arch/um/include/sysdep-ia64/syscalls.h        Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sysdep-ia64/syscalls.h  Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,20 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SYSDEP_IA64_SYSCALLS_H
++#define __SYSDEP_IA64_SYSCALLS_H
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-ppc/ptrace.h um/arch/um/include/sysdep-ppc/ptrace.h
+--- orig/arch/um/include/sysdep-ppc/ptrace.h   Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sysdep-ppc/ptrace.h     Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,104 @@
++/* 
++ * Licensed under the GPL
++ */
++
++#ifndef __SYS_PTRACE_PPC_H
++#define __SYS_PTRACE_PPC_H
++
++#include "linux/config.h"
++#include "linux/types.h"
++
++/* the following taken from <asm-ppc/ptrace.h> */
++
++#ifdef CONFIG_PPC64
++#define PPC_REG unsigned long /*long*/
++#else
++#define PPC_REG unsigned long
++#endif
++struct sys_pt_regs_s {
++      PPC_REG gpr[32];
++      PPC_REG nip;
++      PPC_REG msr;
++      PPC_REG orig_gpr3;      /* Used for restarting system calls */
++      PPC_REG ctr;
++      PPC_REG link;
++      PPC_REG xer;
++      PPC_REG ccr;
++      PPC_REG mq;             /* 601 only (not used at present) */
++                              /* Used on APUS to hold IPL value. */
++      PPC_REG trap;           /* Reason for being here */
++      PPC_REG dar;            /* Fault registers */
++      PPC_REG dsisr;
++      PPC_REG result;         /* Result of a system call */
++};
++
++#define NUM_REGS (sizeof(struct sys_pt_regs_s) / sizeof(PPC_REG))
++
++struct sys_pt_regs {
++    PPC_REG regs[sizeof(struct sys_pt_regs_s) / sizeof(PPC_REG)];
++};
++
++#define UM_MAX_REG (PT_FPR0)
++#define UM_MAX_REG_OFFSET (UM_MAX_REG * sizeof(PPC_REG))
++
++#define EMPTY_REGS { { [ 0 ... NUM_REGS - 1] = 0 } }
++
++#define UM_REG(r, n) ((r)->regs[n])
++
++#define UM_SYSCALL_RET(r) UM_REG(r, PT_R3)
++#define UM_SP(r) UM_REG(r, PT_R1)
++#define UM_IP(r) UM_REG(r, PT_NIP)
++#define UM_ELF_ZERO(r) UM_REG(r, PT_FPSCR)
++#define UM_SYSCALL_NR(r) UM_REG(r, PT_R0)
++#define UM_SYSCALL_ARG1(r) UM_REG(r, PT_ORIG_R3)
++#define UM_SYSCALL_ARG2(r) UM_REG(r, PT_R4)
++#define UM_SYSCALL_ARG3(r) UM_REG(r, PT_R5)
++#define UM_SYSCALL_ARG4(r) UM_REG(r, PT_R6)
++#define UM_SYSCALL_ARG5(r) UM_REG(r, PT_R7)
++#define UM_SYSCALL_ARG6(r) UM_REG(r, PT_R8)
++
++#define UM_SYSCALL_NR_OFFSET (PT_R0 * sizeof(PPC_REG))
++#define UM_SYSCALL_RET_OFFSET (PT_R3 * sizeof(PPC_REG))
++#define UM_SYSCALL_ARG1_OFFSET (PT_R3 * sizeof(PPC_REG))
++#define UM_SYSCALL_ARG2_OFFSET (PT_R4 * sizeof(PPC_REG))
++#define UM_SYSCALL_ARG3_OFFSET (PT_R5 * sizeof(PPC_REG))
++#define UM_SYSCALL_ARG4_OFFSET (PT_R6 * sizeof(PPC_REG))
++#define UM_SYSCALL_ARG5_OFFSET (PT_R7 * sizeof(PPC_REG))
++#define UM_SYSCALL_ARG6_OFFSET (PT_R8 * sizeof(PPC_REG))
++#define UM_SP_OFFSET (PT_R1 * sizeof(PPC_REG))
++#define UM_IP_OFFSET (PT_NIP * sizeof(PPC_REG))
++#define UM_ELF_ZERO_OFFSET (PT_R3 * sizeof(PPC_REG))
++
++#define UM_SET_SYSCALL_RETURN(_regs, result)          \
++do {                                                    \
++        if (result < 0) {                             \
++              (_regs)->regs[PT_CCR] |= 0x10000000;    \
++              UM_SYSCALL_RET((_regs)) = -result;      \
++        } else {                                      \
++              UM_SYSCALL_RET((_regs)) = result;       \
++        }                                               \
++} while(0)
++
++extern void shove_aux_table(unsigned long sp);
++#define UM_FIX_EXEC_STACK(sp) shove_aux_table(sp);
++
++/* These aren't actually defined.  The undefs are just to make sure
++ * everyone's clear on the concept.
++ */
++#undef UML_HAVE_GETREGS
++#undef UML_HAVE_GETFPREGS
++#undef UML_HAVE_SETREGS
++#undef UML_HAVE_SETFPREGS
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
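
UM_SET_SYSCALL_RETURN() above follows the PowerPC Linux syscall convention: an error is reported by setting the summary-overflow bit of CR0 (the 0x10000000 bit of CCR here) and returning the errno as a positive value in r3.  For example, a handler that returns -ENOENT (-2) ends up with 2 in r3 and that CCR bit set, which is what userland syscall stubs test before setting errno.
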
+diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-ppc/sigcontext.h um/arch/um/include/sysdep-ppc/sigcontext.h
+--- orig/arch/um/include/sysdep-ppc/sigcontext.h       Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sysdep-ppc/sigcontext.h Sat Nov 23 22:02:19 2002
+@@ -0,0 +1,62 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SYS_SIGCONTEXT_PPC_H
++#define __SYS_SIGCONTEXT_PPC_H
++
++#define DSISR_WRITE 0x02000000
++
++#define SC_FAULT_ADDR(sc) ({ \
++              struct sigcontext *_sc = (sc); \
++              long retval = -1; \
++              switch (_sc->regs->trap) { \
++              case 0x300: \
++                      /* data exception */ \
++                      retval = _sc->regs->dar; \
++                      break; \
++              case 0x400: \
++                      /* instruction exception */ \
++                      retval = _sc->regs->nip; \
++                      break; \
++              default: \
++                      panic("SC_FAULT_ADDR: unhandled trap type\n"); \
++              } \
++              retval; \
++      })
++
++#define SC_FAULT_WRITE(sc) ({ \
++              struct sigcontext *_sc = (sc); \
++              long retval = -1; \
++              switch (_sc->regs->trap) { \
++              case 0x300: \
++                      /* data exception */ \
++                      retval = !!(_sc->regs->dsisr & DSISR_WRITE); \
++                      break; \
++              case 0x400: \
++                      /* instruction exception: not a write */ \
++                      retval = 0; \
++                      break; \
++              default: \
++                      panic("SC_FAULT_WRITE: unhandled trap type\n"); \
++              } \
++              retval; \
++      })
++
++#define SC_IP(sc) ((sc)->regs->nip)
++#define SC_SP(sc) ((sc)->regs->gpr[1])
++#define SEGV_IS_FIXABLE(sc) (1)
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-ppc/syscalls.h um/arch/um/include/sysdep-ppc/syscalls.h
+--- orig/arch/um/include/sysdep-ppc/syscalls.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sysdep-ppc/syscalls.h   Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,50 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++typedef long syscall_handler_t(unsigned long arg1, unsigned long arg2,
++                             unsigned long arg3, unsigned long arg4,
++                             unsigned long arg5, unsigned long arg6);
++
++#define EXECUTE_SYSCALL(syscall, regs) \
++        (*sys_call_table[syscall])(UM_SYSCALL_ARG1(&regs), \
++                                 UM_SYSCALL_ARG2(&regs), \
++                                 UM_SYSCALL_ARG3(&regs), \
++                                 UM_SYSCALL_ARG4(&regs), \
++                                 UM_SYSCALL_ARG5(&regs), \
++                                 UM_SYSCALL_ARG6(&regs))
++
++extern syscall_handler_t sys_mincore;
++extern syscall_handler_t sys_madvise;
++
++/* old_mmap needs the correct prototype since syscall_kern.c includes
++ * this file.
++ */
++int old_mmap(unsigned long addr, unsigned long len,
++           unsigned long prot, unsigned long flags,
++           unsigned long fd, unsigned long offset);
++
++#define ARCH_SYSCALLS \
++      [ __NR_modify_ldt ] = sys_ni_syscall, \
++      [ __NR_pciconfig_read ] = sys_ni_syscall, \
++      [ __NR_pciconfig_write ] = sys_ni_syscall, \
++      [ __NR_pciconfig_iobase ] = sys_ni_syscall, \
++      [ __NR_pivot_root ] = sys_ni_syscall, \
++      [ __NR_multiplexer ] = sys_ni_syscall, \
++      [ __NR_mmap ] = old_mmap, \
++      [ __NR_madvise ] = sys_madvise, \
++      [ __NR_mincore ] = sys_mincore, 
++
++#define LAST_ARCH_SYSCALL __NR_mincore
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/sysrq.h um/arch/um/include/sysrq.h
+--- orig/arch/um/include/sysrq.h       Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/sysrq.h Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_SYSRQ_H
++#define __UM_SYSRQ_H
++
++extern void show_trace(unsigned long *stack);
++
++#endif
+diff -Naur -X ../exclude-files orig/arch/um/include/tempfile.h um/arch/um/include/tempfile.h
+--- orig/arch/um/include/tempfile.h    Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/tempfile.h      Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,21 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __TEMPFILE_H__
++#define __TEMPFILE_H__
++
++extern int make_tempfile(const char *template, char **tempname, int do_unlink);
++
++#endif
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/time_user.h um/arch/um/include/time_user.h
+--- orig/arch/um/include/time_user.h   Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/time_user.h     Wed Jan  8 12:55:47 2003
+@@ -0,0 +1,17 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __TIME_USER_H__
++#define __TIME_USER_H__
++
++extern void timer(void);
++extern void switch_timers(int to_real);
++extern void set_interval(int timer_type);
++extern void idle_sleep(int secs);
++extern void enable_timer(void);
++extern unsigned long time_lock(void);
++extern void time_unlock(unsigned long);
++
++#endif
+diff -Naur -X ../exclude-files orig/arch/um/include/tlb.h um/arch/um/include/tlb.h
+--- orig/arch/um/include/tlb.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/tlb.h   Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,23 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __TLB_H__
++#define __TLB_H__
++
++extern void mprotect_kernel_vm(int w);
++extern void force_flush_all(void);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/ubd_user.h um/arch/um/include/ubd_user.h
+--- orig/arch/um/include/ubd_user.h    Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/ubd_user.h      Thu Mar  6 18:09:14 2003
+@@ -0,0 +1,77 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Copyright (C) 2001 RidgeRun, Inc (glonnon@ridgerun.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_UBD_USER_H
++#define __UM_UBD_USER_H
++
++#include "os.h"
++
++enum ubd_req { UBD_READ, UBD_WRITE };
++
++struct io_thread_req {
++      enum ubd_req op;
++      int fds[2];
++      unsigned long offsets[2];
++      unsigned long long offset;
++      unsigned long length;
++      char *buffer;
++      int sectorsize;
++      unsigned long sector_mask;
++      unsigned long cow_offset;
++      unsigned long bitmap_words[2];
++      int error;
++};
++
++extern int open_ubd_file(char *file, struct openflags *openflags, 
++                       char **backing_file_out, int *bitmap_offset_out, 
++                       unsigned long *bitmap_len_out, int *data_offset_out,
++                       int *create_cow_out);
++extern int create_cow_file(char *cow_file, char *backing_file, 
++                         struct openflags flags, int sectorsize, 
++                         int *bitmap_offset_out, 
++                         unsigned long *bitmap_len_out,
++                         int *data_offset_out);
++extern int read_cow_bitmap(int fd, void *buf, int offset, int len);
++extern int read_ubd_fs(int fd, void *buffer, int len);
++extern int write_ubd_fs(int fd, char *buffer, int len);
++extern int start_io_thread(unsigned long sp, int *fds_out);
++extern void do_io(struct io_thread_req *req);
++
++static inline int ubd_test_bit(__u64 bit, unsigned char *data)
++{
++      __u64 n;
++      int bits, off;
++
++      bits = sizeof(data[0]) * 8;
++      n = bit / bits;
++      off = bit % bits;
++      return((data[n] & (1 << off)) != 0);
++}
++
++static inline void ubd_set_bit(__u64 bit, unsigned char *data)
++{
++      __u64 n;
++      int bits, off;
++
++      bits = sizeof(data[0]) * 8;
++      n = bit / bits;
++      off = bit % bits;
++      data[n] |= (1 << off);
++}
++
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
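
ubd_test_bit() and ubd_set_bit() above are all the COW support needs to track which sectors have been copied into the COW file.  A short usage sketch built only from those two helpers (the bitmap itself would really come from open_ubd_file()/read_cow_bitmap(); mark_sectors() is hypothetical):

/* Mark a run of sectors as present in the COW file, then confirm one. */
static int mark_sectors(unsigned char *bitmap, __u64 first, int count)
{
	__u64 sector;

	for(sector = first; sector < first + count; sector++)
		ubd_set_bit(sector, bitmap);

	return(ubd_test_bit(first, bitmap));
}
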
+diff -Naur -X ../exclude-files orig/arch/um/include/um_mmu.h um/arch/um/include/um_mmu.h
+--- orig/arch/um/include/um_mmu.h      Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/um_mmu.h        Sat Nov  9 12:51:43 2002
+@@ -0,0 +1,40 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __ARCH_UM_MMU_H
++#define __ARCH_UM_MMU_H
++
++#include "linux/config.h"
++#include "choose-mode.h"
++
++#ifdef CONFIG_MODE_TT
++#include "../kernel/tt/include/mmu.h"
++#endif
++
++#ifdef CONFIG_MODE_SKAS
++#include "../kernel/skas/include/mmu.h"
++#endif
++
++typedef union {
++#ifdef CONFIG_MODE_TT
++      struct mmu_context_tt tt;
++#endif
++#ifdef CONFIG_MODE_SKAS
++      struct mmu_context_skas skas;
++#endif
++} mm_context_t;
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/um_uaccess.h um/arch/um/include/um_uaccess.h
+--- orig/arch/um/include/um_uaccess.h  Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/um_uaccess.h    Sat Nov 23 22:03:02 2002
+@@ -0,0 +1,73 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __ARCH_UM_UACCESS_H
++#define __ARCH_UM_UACCESS_H
++
++#include "linux/config.h"
++#include "choose-mode.h"
++
++#ifdef CONFIG_MODE_TT
++#include "../kernel/tt/include/uaccess.h"
++#endif
++
++#ifdef CONFIG_MODE_SKAS
++#include "../kernel/skas/include/uaccess.h"
++#endif
++
++#define access_ok(type, addr, size) \
++      CHOOSE_MODE_PROC(access_ok_tt, access_ok_skas, type, addr, size)
++
++static inline int verify_area(int type, const void * addr, unsigned long size)
++{
++      return(CHOOSE_MODE_PROC(verify_area_tt, verify_area_skas, type, addr,
++                              size));
++}
++
++static inline int copy_from_user(void *to, const void *from, int n)
++{
++      return(CHOOSE_MODE_PROC(copy_from_user_tt, copy_from_user_skas, to,
++                              from, n));
++}
++
++static inline int copy_to_user(void *to, const void *from, int n)
++{
++      return(CHOOSE_MODE_PROC(copy_to_user_tt, copy_to_user_skas, to, 
++                              from, n));
++}
++
++static inline int strncpy_from_user(char *dst, const char *src, int count)
++{
++      return(CHOOSE_MODE_PROC(strncpy_from_user_tt, strncpy_from_user_skas,
++                              dst, src, count));
++}
++
++static inline int __clear_user(void *mem, int len)
++{
++      return(CHOOSE_MODE_PROC(__clear_user_tt, __clear_user_skas, mem, len));
++}
++
++static inline int clear_user(void *mem, int len)
++{
++      return(CHOOSE_MODE_PROC(clear_user_tt, clear_user_skas, mem, len));
++}
++
++static inline int strnlen_user(const void *str, int len)
++{
++      return(CHOOSE_MODE_PROC(strnlen_user_tt, strnlen_user_skas, str, len));
++}
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
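
Every wrapper above funnels through CHOOSE_MODE_PROC(), which is defined in choose-mode.h and therefore not visible in this hunk.  Purely as an illustration of what the dispatch presumably reduces to when both CONFIG_MODE_TT and CONFIG_MODE_SKAS are enabled, it amounts to a runtime test of mode_tt (hypothetical sketch, not the real macro):

#define CHOOSE_MODE_PROC_SKETCH(tt_proc, skas_proc, args...) \
	(mode_tt ? (tt_proc)(args) : (skas_proc)(args))
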
+diff -Naur -X ../exclude-files orig/arch/um/include/umid.h um/arch/um/include/umid.h
+--- orig/arch/um/include/umid.h        Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/umid.h  Mon Dec 16 20:52:19 2002
+@@ -0,0 +1,22 @@
++/*
++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UMID_H__
++#define __UMID_H__
++
++extern int umid_file_name(char *name, char *buf, int len);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/uml_uaccess.h um/arch/um/include/uml_uaccess.h
+--- orig/arch/um/include/uml_uaccess.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/uml_uaccess.h   Thu Dec 19 13:15:22 2002
+@@ -0,0 +1,28 @@
++/*
++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UML_UACCESS_H__
++#define __UML_UACCESS_H__
++
++extern int __do_copy_to_user(void *to, const void *from, int n,
++                                void **fault_addr, void **fault_catcher);
++extern unsigned long __do_user_copy(void *to, const void *from, int n,
++                                  void **fault_addr, void **fault_catcher,
++                                  void (*op)(void *to, const void *from,
++                                             int n), int *faulted_out);
++void __do_copy(void *to, const void *from, int n);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/umn.h um/arch/um/include/umn.h
+--- orig/arch/um/include/umn.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/umn.h   Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,27 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UMN_H
++#define __UMN_H
++
++extern int open_umn_tty(int *slave_out, int *slipno_out);
++extern void close_umn_tty(int master, int slave);
++extern int umn_send_packet(int fd, void *data, int len);
++extern int set_umn_addr(int fd, char *addr, char *ptp_addr);
++extern void slip_unesc(unsigned char s);
++extern void umn_read(int fd);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/user.h um/arch/um/include/user.h
+--- orig/arch/um/include/user.h        Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/user.h  Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,29 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __USER_H__
++#define __USER_H__
++
++extern void panic(const char *fmt, ...);
++extern int printk(const char *fmt, ...);
++extern void schedule(void);
++extern void *um_kmalloc(int size);
++extern void *um_kmalloc_atomic(int size);
++extern void kfree(void *ptr);
++extern int in_aton(char *str);
++extern int open_gdb_chan(void);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/include/user_util.h um/arch/um/include/user_util.h
+--- orig/arch/um/include/user_util.h   Wed Dec 31 19:00:00 1969
++++ um/arch/um/include/user_util.h     Wed Apr 23 20:42:00 2003
+@@ -0,0 +1,103 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __USER_UTIL_H__
++#define __USER_UTIL_H__
++
++#include "sysdep/ptrace.h"
++
++extern int mode_tt;
++
++extern int grantpt(int __fd);
++extern int unlockpt(int __fd);
++extern char *ptsname(int __fd);
++
++enum { OP_NONE, OP_EXEC, OP_FORK, OP_TRACE_ON, OP_REBOOT, OP_HALT, OP_CB };
++
++struct cpu_task {
++      int pid;
++      void *task;
++};
++
++extern struct cpu_task cpu_tasks[];
++
++struct signal_info {
++      void (*handler)(int, union uml_pt_regs *);
++      int is_irq;
++};
++
++extern struct signal_info sig_info[];
++
++extern unsigned long low_physmem;
++extern unsigned long high_physmem;
++extern unsigned long uml_physmem;
++extern unsigned long uml_reserved;
++extern unsigned long end_vm;
++extern unsigned long start_vm;
++extern unsigned long highmem;
++
++extern char host_info[];
++
++extern char saved_command_line[];
++extern char command_line[];
++
++extern char *tempdir;
++
++extern unsigned long _stext, _etext, _sdata, _edata, __bss_start, _end;
++extern unsigned long _unprotected_end;
++extern unsigned long brk_start;
++
++extern int pty_output_sigio;
++extern int pty_close_sigio;
++
++extern void stop(void);
++extern void stack_protections(unsigned long address);
++extern void task_protections(unsigned long address);
++extern int wait_for_stop(int pid, int sig, int cont_type, void *relay);
++extern void *add_signal_handler(int sig, void (*handler)(int));
++extern int start_fork_tramp(void *arg, unsigned long temp_stack, 
++                          int clone_flags, int (*tramp)(void *));
++extern int linux_main(int argc, char **argv);
++extern void set_cmdline(char *cmd);
++extern void input_cb(void (*proc)(void *), void *arg, int arg_len);
++extern int get_pty(void);
++extern void *um_kmalloc(int size);
++extern int raw(int fd, int complain);
++extern int switcheroo(int fd, int prot, void *from, void *to, int size);
++extern void setup_machinename(char *machine_out);
++extern void setup_hostinfo(void);
++extern void add_arg(char *cmd_line, char *arg);
++extern void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int));
++extern void init_new_thread_signals(int altstack);
++extern void do_exec(int old_pid, int new_pid);
++extern void tracer_panic(char *msg, ...);
++extern char *get_umid(int only_if_set);
++extern void do_longjmp(void *p, int val);
++extern void suspend_new_thread(int fd);
++extern int detach(int pid, int sig);
++extern int attach(int pid);
++extern void kill_child_dead(int pid);
++extern int cont(int pid);
++extern void check_ptrace(void);
++extern void check_sigio(void);
++extern int run_kernel_thread(int (*fn)(void *), void *arg, void **jmp_ptr);
++extern void write_sigio_workaround(void);
++extern void arch_check_bugs(void);
++extern int arch_handle_signal(int sig, union uml_pt_regs *regs);
++extern int arch_fixup(unsigned long address, void *sc_ptr);
++extern int can_do_skas(void);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/Makefile um/arch/um/kernel/Makefile
+--- orig/arch/um/kernel/Makefile       Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/Makefile Thu Apr 10 11:14:55 2003
+@@ -0,0 +1,73 @@
++# 
++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++O_TARGET = built-in.o
++
++obj-y = config.o checksum.o exec_kern.o exitcode.o frame_kern.o frame.o \
++      helper.o init_task.o irq.o irq_user.o ksyms.o mem.o mem_user.o \
++      process.o process_kern.o ptrace.o reboot.o resource.o sigio_user.o \
++      sigio_kern.o signal_kern.o signal_user.o smp.o syscall_kern.o \
++      syscall_user.o sysrq.o sys_call_table.o tempfile.o time.o \
++      time_kern.o tlb.o trap_kern.o trap_user.o uaccess_user.o um_arch.o \
++      umid.o user_syms.o user_util.o
++
++obj-$(CONFIG_BLK_DEV_INITRD) += initrd_kern.o initrd_user.o
++obj-$(CONFIG_GPROF) += gprof_syms.o
++obj-$(CONFIG_GCOV) += gmon_syms.o
++obj-$(CONFIG_TTY_LOG) += tty_log.o
++
++subdir-$(CONFIG_MODE_TT) += tt
++subdir-$(CONFIG_MODE_SKAS) += skas
++
++user-objs-$(CONFIG_TTY_LOG) += tty_log.o
++
++obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o))
++
++# user_syms.o not included here because Rules.make has its own ideas about
++# building anything in export-objs
++
++USER_OBJS = $(filter %_user.o,$(obj-y)) $(user-objs-y) config.o helper.o \
++      process.o tempfile.o time.o umid.o user_util.o 
++
++DMODULES-$(CONFIG_MODULES) = -D__CONFIG_MODULES__
++DMODVERSIONS-$(CONFIG_MODVERSIONS) = -D__CONFIG_MODVERSIONS__
++
++export-objs-$(CONFIG_GPROF) += gprof_syms.o
++export-objs-$(CONFIG_GCOV) += gmon_syms.o
++
++export-objs = ksyms.o process_kern.o signal_kern.o user_syms.o $(export-objs-y)
++
++CFLAGS_user_syms.o = -D__AUTOCONF_INCLUDED__ $(DMODULES-y) $(DMODVERSIONS-y) \
++      -I/usr/include -I../include
++
++CFLAGS_frame.o := $(patsubst -fomit-frame-pointer,,$(USER_CFLAGS))
++
++include $(TOPDIR)/Rules.make
++
++$(USER_OBJS) : %.o: %.c
++      $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $<
++
++# This has to be separate because it needs to be compiled with frame pointers
++# regardless of how the rest of the kernel is built.
++
++frame.o: frame.c
++      $(CC) $(CFLAGS_$@) -c -o $@ $<
++
++QUOTE = 'my $$config=`cat $(TOPDIR)/.config`; $$config =~ s/"/\\"/g ; $$config =~ s/\n/\\n"\n"/g ; while(<STDIN>) { $$_ =~ s/CONFIG/$$config/; print $$_ }'
++
++config.c : config.c.in $(TOPDIR)/.config
++      $(PERL) -e $(QUOTE) < config.c.in > $@
++
++clean:
++      $(RM) config.c
++      for dir in $(subdir-y) ; do $(MAKE) -C $$dir clean; done
++
++modules:
++
++fastdep:
++
++dep:
++
++archmrproper: clean
+diff -Naur -X ../exclude-files orig/arch/um/kernel/checksum.c um/arch/um/kernel/checksum.c
+--- orig/arch/um/kernel/checksum.c     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/checksum.c       Thu Oct 31 22:39:58 2002
+@@ -0,0 +1,42 @@
++#include "asm/uaccess.h"
++#include "linux/errno.h"
++
++extern unsigned int arch_csum_partial(const char *buff, int len, int sum);
++
++extern unsigned int csum_partial(char *buff, int len, int sum)
++{
++      return(arch_csum_partial(buff, len, sum));
++}
++
++unsigned int csum_partial_copy_to(const char *src, char *dst, int len, 
++                                int sum, int *err_ptr)
++{
++      if(copy_to_user(dst, src, len)){
++              *err_ptr = -EFAULT;
++              return(-1);
++      }
++
++      return(arch_csum_partial(src, len, sum));
++}
++
++unsigned int csum_partial_copy_from(const char *src, char *dst, int len, 
++                                  int sum, int *err_ptr)
++{
++      if(copy_from_user(dst, src, len)){
++              *err_ptr = -EFAULT;
++              return(-1);
++      }
++
++      return(arch_csum_partial(dst, len, sum));
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/config.c.in um/arch/um/kernel/config.c.in
+--- orig/arch/um/kernel/config.c.in    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/config.c.in      Thu Apr 10 11:17:55 2003
+@@ -0,0 +1,32 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <stdlib.h>
++#include "init.h"
++
++static __initdata char *config = "CONFIG";
++
++static int __init print_config(char *line, int *add)
++{
++      printf("%s", config);
++      exit(0);
++}
++
++__uml_setup("--showconfig", print_config,
++"--showconfig\n"
++"    Prints the config file that this UML binary was generated from.\n\n"
++);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
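For reference, the Perl QUOTE rule in the Makefile above rewrites the CONFIG placeholder in this template into a multi-line string literal, so the generated config.c comes out roughly as sketched below; the option names are invented for illustration, the real text is the build's .config. Passing --showconfig to the resulting UML binary then prints that string via print_config() and exits without booting.

/* Roughly what the generated config.c looks like (illustrative contents) */
static __initdata char *config = "CONFIG_MODE_TT=y\n"
"CONFIG_MODE_SKAS=y\n"
"# CONFIG_GPROF is not set\n"
"";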
+diff -Naur -X ../exclude-files orig/arch/um/kernel/exec_kern.c um/arch/um/kernel/exec_kern.c
+--- orig/arch/um/kernel/exec_kern.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/exec_kern.c      Wed Apr 16 16:35:05 2003
+@@ -0,0 +1,86 @@
++/* 
++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/slab.h"
++#include "linux/smp_lock.h"
++#include "asm/ptrace.h"
++#include "asm/pgtable.h"
++#include "asm/pgalloc.h"
++#include "asm/uaccess.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "mem_user.h"
++#include "kern.h"
++#include "irq_user.h"
++#include "tlb.h"
++#include "2_5compat.h"
++#include "os.h"
++#include "time_user.h"
++#include "choose-mode.h"
++#include "mode_kern.h"
++
++void flush_thread(void)
++{
++      CHOOSE_MODE(flush_thread_tt(), flush_thread_skas());
++}
++
++void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp)
++{
++      CHOOSE_MODE_PROC(start_thread_tt, start_thread_skas, regs, eip, esp);
++}
++
++extern void log_exec(char **argv, void *tty);
++
++static int execve1(char *file, char **argv, char **env)
++{
++        int error;
++
++#ifdef CONFIG_TTY_LOG
++      log_exec(argv, current->tty);
++#endif
++        error = do_execve(file, argv, env, &current->thread.regs);
++        if (error == 0){
++                current->ptrace &= ~PT_DTRACE;
++                set_cmdline(current_cmd());
++        }
++        return(error);
++}
++
++int um_execve(char *file, char **argv, char **env)
++{
++      int err;
++
++      err = execve1(file, argv, env);
++      if(!err) 
++              do_longjmp(current->thread.exec_buf, 1);
++      return(err);
++}
++
++int sys_execve(char *file, char **argv, char **env)
++{
++      int error;
++      char *filename;
++
++      lock_kernel();
++      filename = getname((char *) file);
++      error = PTR_ERR(filename);
++      if (IS_ERR(filename)) goto out;
++      error = execve1(filename, argv, env);
++      putname(filename);
++ out:
++      unlock_kernel();
++      return(error);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/exitcode.c um/arch/um/kernel/exitcode.c
+--- orig/arch/um/kernel/exitcode.c     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/exitcode.c       Thu Nov  7 18:22:04 2002
+@@ -0,0 +1,73 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/init.h"
++#include "linux/ctype.h"
++#include "linux/proc_fs.h"
++#include "asm/uaccess.h"
++
++/* If read and write race, the read will still atomically read a valid
++ * value.
++ */
++int uml_exitcode = 0;
++
++static int read_proc_exitcode(char *page, char **start, off_t off,
++                            int count, int *eof, void *data)
++{
++      int len;
++
++      len = sprintf(page, "%d\n", uml_exitcode);
++      len -= off;
++      if(len <= off+count) *eof = 1;
++      *start = page + off;
++      if(len > count) len = count;
++      if(len < 0) len = 0;
++      return(len);
++}
++
++static int write_proc_exitcode(struct file *file, const char *buffer,
++                             unsigned long count, void *data)
++{
++      char *end, buf[sizeof("nnnnn\0")];
++      int tmp;
++
++      if(copy_from_user(buf, buffer, count))
++              return(-EFAULT);
++      tmp = simple_strtol(buf, &end, 0);
++      if((*end != '\0') && !isspace(*end))
++              return(-EINVAL);
++      uml_exitcode = tmp;
++      return(count);
++}
++
++static int make_proc_exitcode(void)
++{
++      struct proc_dir_entry *ent;
++
++      ent = create_proc_entry("exitcode", 0600, &proc_root);
++      if(ent == NULL){
++              printk("make_proc_exitcode : Failed to register "
++                     "/proc/exitcode\n");
++              return(0);
++      }
++
++      ent->read_proc = read_proc_exitcode;
++      ent->write_proc = write_proc_exitcode;
++      
++      return(0);
++}
++
++__initcall(make_proc_exitcode);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
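A small user-space sketch of driving this interface from inside UML; the path /proc/exitcode and the decimal-plus-newline format come from the handlers above, while the value 3 is arbitrary and the program itself is not part of this patch.

/* Illustrative only: set the UML exit code from user space inside UML.
 * write_proc_exitcode() parses the buffer with simple_strtol(), so a
 * plain decimal number followed by a newline is accepted.
 */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/exitcode", "w");

        if (f == NULL) {
                perror("/proc/exitcode");
                return 1;
        }
        fprintf(f, "3\n");
        return fclose(f) ? 1 : 0;
}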
+diff -Naur -X ../exclude-files orig/arch/um/kernel/frame.c um/arch/um/kernel/frame.c
+--- orig/arch/um/kernel/frame.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/frame.c  Wed Dec 11 11:12:41 2002
+@@ -0,0 +1,342 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <string.h>
++#include <signal.h>
++#include <wait.h>
++#include <sched.h>
++#include <errno.h>
++#include <sys/ptrace.h>
++#include <sys/syscall.h>
++#include <sys/mman.h>
++#include <asm/page.h>
++#include <asm/ptrace.h>
++#include <asm/sigcontext.h>
++#include "sysdep/ptrace.h"
++#include "sysdep/sigcontext.h"
++#include "frame_user.h"
++#include "kern_util.h"
++#include "ptrace_user.h"
++#include "os.h"
++
++static int capture_stack(int (*child)(void *arg), void *arg, void *sp,
++                       unsigned long top, void **data_out)
++{
++      unsigned long regs[FRAME_SIZE];
++      int pid, status, n, len;
++
++      /* Start the child as a thread */
++      pid = clone(child, sp, CLONE_VM | SIGCHLD, arg);
++      if(pid < 0){
++              printf("capture_stack : clone failed - errno = %d\n", errno);
++              exit(1);
++      }
++
++      /* Wait for it to stop itself and continue it with a SIGUSR1 to force 
++       * it into the signal handler.
++       */
++      n = waitpid(pid, &status, WUNTRACED);
++      if(n < 0){
++              printf("capture_stack : waitpid failed - errno = %d\n", errno);
++              exit(1);
++      }
++      if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)){
++              fprintf(stderr, "capture_stack : Expected SIGSTOP, "
++                      "got status = 0x%x\n", status);
++              exit(1);
++      }
++      if(ptrace(PTRACE_CONT, pid, 0, SIGUSR1) < 0){
++              printf("capture_stack : PTRACE_CONT failed - errno = %d\n", 
++                     errno);
++              exit(1);
++      }
++
++      /* Wait for it to stop itself again and grab its registers again.  
++       * At this point, the handler has stuffed the addresses of
++       * sig, sc, and SA_RESTORER in raw.
++       */
++      n = waitpid(pid, &status, WUNTRACED);
++      if(n < 0){
++              printf("capture_stack : waitpid failed - errno = %d\n", errno);
++              exit(1);
++      }
++      if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)){
++              fprintf(stderr, "capture_stack : Expected SIGSTOP, "
++                      "got status = 0x%x\n", status);
++              exit(1);
++      }
++      if(ptrace(PTRACE_GETREGS, pid, 0, regs) < 0){
++              printf("capture_stack : PTRACE_GETREGS failed - errno = %d\n", 
++                     errno);
++              exit(1);
++      }
++
++      /* It has outlived its usefulness, so continue it so it can exit */
++      if(ptrace(PTRACE_CONT, pid, 0, 0) < 0){
++              printf("capture_stack : PTRACE_CONT failed - errno = %d\n", 
++                     errno);
++              exit(1);
++      }
++      if(waitpid(pid, &status, 0) < 0){
++              printf("capture_stack : waitpid failed - errno = %d\n", errno);
++              exit(1);
++      }
++      if(!WIFSIGNALED(status) || (WTERMSIG(status) != 9)){
++              printf("capture_stack : Expected exit signal 9, "
++                     "got status = 0x%x\n", status);
++              exit(1);
++      }
++
++      /* The frame that we want is the top of the signal stack */
++
++      len = top - PT_SP(regs);
++      *data_out = malloc(len);
++      if(*data_out == NULL){
++              printf("capture_stack : malloc failed - errno = %d\n", errno);
++              exit(1);
++      }
++      memcpy(*data_out, (void *) PT_SP(regs), len);
++
++      return(len);
++}
++
++struct common_raw {
++      void *stack;
++      int size;
++      unsigned long sig;
++      unsigned long sr;
++      unsigned long sp;       
++      struct arch_frame_data_raw arch;
++};
++
++#define SA_RESTORER (0x04000000)
++
++typedef unsigned long old_sigset_t;
++
++struct old_sigaction {
++      __sighandler_t handler;
++      old_sigset_t sa_mask;
++      unsigned long sa_flags;
++      void (*sa_restorer)(void);
++};
++
++static void child_common(struct common_raw *common, sighandler_t handler,
++                       int restorer, int flags)
++{
++      stack_t ss = ((stack_t) { .ss_sp        = common->stack,
++                                .ss_flags     = 0,
++                                .ss_size      = common->size });
++      int err;
++
++      if(ptrace(PTRACE_TRACEME, 0, 0, 0) < 0){
++              printf("PTRACE_TRACEME failed, errno = %d\n", errno);
++      }
++      if(sigaltstack(&ss, NULL) < 0){
++              printf("sigaltstack failed - errno = %d\n", errno);
++              kill(getpid(), SIGKILL);
++      }
++
++      if(restorer){
++              struct sigaction sa;
++
++              sa.sa_handler = handler;
++              sigemptyset(&sa.sa_mask);
++              sa.sa_flags = SA_ONSTACK | flags;
++              err = sigaction(SIGUSR1, &sa, NULL);
++      }
++      else {
++              struct old_sigaction sa;
++
++              sa.handler = handler;
++              sa.sa_mask = 0;
++              sa.sa_flags = (SA_ONSTACK | flags) & ~SA_RESTORER;
++              err = syscall(__NR_sigaction, SIGUSR1, &sa, NULL);
++      }
++      
++      if(err < 0){
++              printf("sigaction failed - errno = %d\n", errno);
++              kill(getpid(), SIGKILL);
++      }
++
++      os_stop_process(os_getpid());
++}
++
++/* Changed only during early boot */
++struct sc_frame signal_frame_sc;
++
++struct sc_frame signal_frame_sc_sr;
++
++struct sc_frame_raw {
++      struct common_raw common;
++      unsigned long sc;
++      int restorer;
++};
++
++/* Changed only during early boot */
++static struct sc_frame_raw *raw_sc = NULL;
++
++static void sc_handler(int sig, struct sigcontext sc)
++{
++      raw_sc->common.sig = (unsigned long) &sig;
++      raw_sc->common.sr = frame_restorer();
++      raw_sc->common.sp = frame_sp();
++      raw_sc->sc = (unsigned long) &sc;
++      setup_arch_frame_raw(&raw_sc->common.arch, &sc + 1, raw_sc->common.sr);
++
++      os_stop_process(os_getpid());
++      kill(getpid(), SIGKILL);
++}
++
++static int sc_child(void *arg)
++{
++      raw_sc = arg;
++      child_common(&raw_sc->common, (sighandler_t) sc_handler, 
++                   raw_sc->restorer, 0);
++      return(-1);
++}
++
++/* Changed only during early boot */
++struct si_frame signal_frame_si;
++
++struct si_frame_raw {
++      struct common_raw common;
++      unsigned long sip;
++      unsigned long si;
++      unsigned long ucp;
++      unsigned long uc;
++};
++
++/* Changed only during early boot */
++static struct si_frame_raw *raw_si = NULL;
++
++static void si_handler(int sig, siginfo_t *si, struct ucontext *ucontext)
++{
++      raw_si->common.sig = (unsigned long) &sig;
++      raw_si->common.sr = frame_restorer();
++      raw_si->common.sp = frame_sp();
++      raw_si->sip = (unsigned long) &si;
++      raw_si->si = (unsigned long) si;
++      raw_si->ucp = (unsigned long) &ucontext;
++      raw_si->uc = (unsigned long) ucontext;
++      setup_arch_frame_raw(&raw_si->common.arch, 
++                           ucontext->uc_mcontext.fpregs, raw_si->common.sr);
++      
++      os_stop_process(os_getpid());
++      kill(getpid(), SIGKILL);
++}
++
++static int si_child(void *arg)
++{
++      raw_si = arg;
++      child_common(&raw_si->common, (sighandler_t) si_handler, 1, 
++                   SA_SIGINFO);
++      return(-1);
++}
++
++static int relative_sr(unsigned long sr, int sr_index, void *stack, 
++                     void *framep)
++{
++      unsigned long *srp = (unsigned long *) sr;
++      unsigned long frame = (unsigned long) framep;
++
++      if((*srp & PAGE_MASK) == (unsigned long) stack){
++              *srp -= sr;
++              *((unsigned long *) (frame + sr_index)) = *srp;
++              return(1);
++      }
++      else return(0);
++}
++
++static unsigned long capture_stack_common(int (*proc)(void *), void *arg, 
++                                        struct common_raw *common_in, 
++                                        void *top, void *sigstack, 
++                                        int stack_len, 
++                                        struct frame_common *common_out)
++{
++      unsigned long sig_top = (unsigned long) sigstack + stack_len, base;
++
++      common_in->stack = (void *) sigstack;
++      common_in->size = stack_len;
++      common_out->len = capture_stack(proc, arg, top, sig_top, 
++                                      &common_out->data);
++      base = sig_top - common_out->len;
++      common_out->sig_index = common_in->sig - base;
++      common_out->sp_index = common_in->sp - base;
++      common_out->sr_index = common_in->sr - base;
++      common_out->sr_relative = relative_sr(common_in->sr, 
++                                            common_out->sr_index, sigstack, 
++                                            common_out->data);
++      return(base);
++}
++
++void capture_signal_stack(void)
++{
++      struct sc_frame_raw raw_sc;
++      struct si_frame_raw raw_si;
++      void *stack, *sigstack;
++      unsigned long top, sig_top, base;
++
++      stack = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC,
++                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
++      sigstack = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC,
++                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
++      if((stack == MAP_FAILED) || (sigstack == MAP_FAILED)){
++              printf("capture_signal_stack : mmap failed - errno = %d\n", 
++                     errno);
++              exit(1);
++      }
++
++      top = (unsigned long) stack + PAGE_SIZE - sizeof(void *);
++      sig_top = (unsigned long) sigstack + PAGE_SIZE;
++
++      /* Get the sigcontext, no sigrestorer layout */
++      raw_sc.restorer = 0;
++      base = capture_stack_common(sc_child, &raw_sc, &raw_sc.common, 
++                                  (void *) top, sigstack, PAGE_SIZE, 
++                                  &signal_frame_sc.common);
++
++      signal_frame_sc.sc_index = raw_sc.sc - base;
++      setup_arch_frame(&raw_sc.common.arch, &signal_frame_sc.common.arch);
++
++      /* Ditto for the sigcontext, sigrestorer layout */
++      raw_sc.restorer = 1;
++      base = capture_stack_common(sc_child, &raw_sc, &raw_sc.common, 
++                                  (void *) top, sigstack, PAGE_SIZE, 
++                                  &signal_frame_sc_sr.common);
++      signal_frame_sc_sr.sc_index = raw_sc.sc - base;
++      setup_arch_frame(&raw_sc.common.arch, &signal_frame_sc_sr.common.arch);
++
++      /* And the siginfo layout */
++
++      base = capture_stack_common(si_child, &raw_si, &raw_si.common, 
++                                  (void *) top, sigstack, PAGE_SIZE, 
++                                  &signal_frame_si.common);
++      signal_frame_si.sip_index = raw_si.sip - base;
++      signal_frame_si.si_index = raw_si.si - base;
++      signal_frame_si.ucp_index = raw_si.ucp - base;
++      signal_frame_si.uc_index = raw_si.uc - base;
++      setup_arch_frame(&raw_si.common.arch, &signal_frame_si.common.arch);
++
++      if((munmap(stack, PAGE_SIZE) < 0) || 
++         (munmap(sigstack, PAGE_SIZE) < 0)){
++              printf("capture_signal_stack : munmap failed - errno = %d\n", 
++                     errno);
++              exit(1);
++      }
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/frame_kern.c um/arch/um/kernel/frame_kern.c
+--- orig/arch/um/kernel/frame_kern.c   Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/frame_kern.c     Sun Dec  8 19:44:13 2002
+@@ -0,0 +1,171 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "asm/ptrace.h"
++#include "asm/uaccess.h"
++#include "asm/signal.h"
++#include "asm/uaccess.h"
++#include "asm/ucontext.h"
++#include "frame_kern.h"
++#include "sigcontext.h"
++#include "sysdep/ptrace.h"
++#include "choose-mode.h"
++#include "mode.h"
++
++int copy_siginfo_to_user(siginfo_t *to, siginfo_t *from)
++{
++      if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t)))
++              return -EFAULT;
++      if (from->si_code < 0)
++              return __copy_to_user(to, from, sizeof(siginfo_t));
++      else {
++              int err;
++
++              /* If you change siginfo_t structure, please be sure
++                 this code is fixed accordingly.
++                 It should never copy any pad contained in the structure
++                 to avoid security leaks, but must copy the generic
++                 3 ints plus the relevant union member.  */
++              err = __put_user(from->si_signo, &to->si_signo);
++              err |= __put_user(from->si_errno, &to->si_errno);
++              err |= __put_user((short)from->si_code, &to->si_code);
++              /* First 32bits of unions are always present.  */
++              err |= __put_user(from->si_pid, &to->si_pid);
++              switch (from->si_code >> 16) {
++              case __SI_FAULT >> 16:
++                      break;
++              case __SI_CHLD >> 16:
++                      err |= __put_user(from->si_utime, &to->si_utime);
++                      err |= __put_user(from->si_stime, &to->si_stime);
++                      err |= __put_user(from->si_status, &to->si_status);
++              default:
++                      err |= __put_user(from->si_uid, &to->si_uid);
++                      break;
++              }
++              return err;
++      }
++}
++
++static int copy_restorer(void (*restorer)(void), unsigned long start, 
++                       unsigned long sr_index, int sr_relative)
++{
++      unsigned long sr;
++
++      if(sr_relative){
++              sr = (unsigned long) restorer;
++              sr += start + sr_index;
++              restorer = (void (*)(void)) sr;
++      }
++
++      return(copy_to_user((void *) (start + sr_index), &restorer, 
++                          sizeof(restorer)));
++}
++
++static int copy_sc_to_user(void *to, void *fp, struct pt_regs *from, 
++                         struct arch_frame_data *arch)
++{
++      return(CHOOSE_MODE(copy_sc_to_user_tt(to, fp, UPT_SC(&from->regs), 
++                                            arch),
++                         copy_sc_to_user_skas(to, fp, &from->regs,
++                                              current->thread.cr2,
++                                              current->thread.err)));
++}
++
++static int copy_ucontext_to_user(struct ucontext *uc, void *fp, sigset_t *set,
++                               unsigned long sp)
++{
++      int err = 0;
++
++      err |= put_user(current->sas_ss_sp, &uc->uc_stack.ss_sp);
++      err |= put_user(sas_ss_flags(sp), &uc->uc_stack.ss_flags);
++      err |= put_user(current->sas_ss_size, &uc->uc_stack.ss_size);
++      err |= copy_sc_to_user(&uc->uc_mcontext, fp, &current->thread.regs,
++                             &signal_frame_si.common.arch);
++      err |= copy_to_user(&uc->uc_sigmask, set, sizeof(*set));
++      return(err);
++}
++
++int setup_signal_stack_si(unsigned long stack_top, int sig, 
++                        unsigned long handler, void (*restorer)(void), 
++                        struct pt_regs *regs, siginfo_t *info, 
++                        sigset_t *mask)
++{
++      unsigned long start;
++      void *sip, *ucp, *fp;
++
++      start = stack_top - signal_frame_si.common.len;
++      sip = (void *) (start + signal_frame_si.si_index);
++      ucp = (void *) (start + signal_frame_si.uc_index);
++      fp = (void *) (((unsigned long) ucp) + sizeof(struct ucontext));
++
++      if(restorer == NULL)
++              panic("setup_signal_stack_si - no restorer");
++
++      if(copy_to_user((void *) start, signal_frame_si.common.data,
++                      signal_frame_si.common.len) ||
++         copy_to_user((void *) (start + signal_frame_si.common.sig_index), 
++                      &sig, sizeof(sig)) ||
++         copy_siginfo_to_user(sip, info) ||
++         copy_to_user((void *) (start + signal_frame_si.sip_index), &sip,
++                      sizeof(sip)) ||
++         copy_ucontext_to_user(ucp, fp, mask, PT_REGS_SP(regs)) ||
++         copy_to_user((void *) (start + signal_frame_si.ucp_index), &ucp,
++                      sizeof(ucp)) ||
++         copy_restorer(restorer, start, signal_frame_si.common.sr_index,
++                       signal_frame_si.common.sr_relative))
++              return(1);
++      
++      PT_REGS_IP(regs) = handler;
++      PT_REGS_SP(regs) = start + signal_frame_si.common.sp_index;
++      return(0);
++}
++
++int setup_signal_stack_sc(unsigned long stack_top, int sig, 
++                        unsigned long handler, void (*restorer)(void), 
++                        struct pt_regs *regs, sigset_t *mask)
++{
++      struct frame_common *frame = &signal_frame_sc_sr.common;
++      void *user_sc;
++      int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long);
++      unsigned long sigs, sr;
++      unsigned long start = stack_top - frame->len - sig_size;
++
++      user_sc = (void *) (start + signal_frame_sc_sr.sc_index);
++      if(restorer == NULL){
++              frame = &signal_frame_sc.common;
++              user_sc = (void *) (start + signal_frame_sc.sc_index);
++              sr = (unsigned long) frame->data;
++              sr += frame->sr_index;
++              sr = *((unsigned long *) sr);
++              restorer = ((void (*)(void)) sr);
++      }
++
++      sigs = start + frame->len;
++      if(copy_to_user((void *) start, frame->data, frame->len) ||
++         copy_to_user((void *) (start + frame->sig_index), &sig, 
++                      sizeof(sig)) ||
++         copy_sc_to_user(user_sc, NULL, regs, 
++                         &signal_frame_sc.common.arch) ||
++         copy_to_user(sc_sigmask(user_sc), mask, sizeof(mask->sig[0])) ||
++         copy_to_user((void *) sigs, &mask->sig[1], sig_size) ||
++         copy_restorer(restorer, start, frame->sr_index, frame->sr_relative))
++              return(1);
++
++      PT_REGS_IP(regs) = handler;
++      PT_REGS_SP(regs) = start + frame->sp_index;
++
++      return(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/gmon_syms.c um/arch/um/kernel/gmon_syms.c
+--- orig/arch/um/kernel/gmon_syms.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/gmon_syms.c      Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,20 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/module.h"
++
++extern void __bb_init_func(void *);
++EXPORT_SYMBOL(__bb_init_func);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/gprof_syms.c um/arch/um/kernel/gprof_syms.c
+--- orig/arch/um/kernel/gprof_syms.c   Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/gprof_syms.c     Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,20 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/module.h"
++
++extern void mcount(void);
++EXPORT_SYMBOL(mcount);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/helper.c um/arch/um/kernel/helper.c
+--- orig/arch/um/kernel/helper.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/helper.c Thu Oct 31 10:34:23 2002
+@@ -0,0 +1,153 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <errno.h>
++#include <fcntl.h>
++#include <sched.h>
++#include <sys/signal.h>
++#include <sys/wait.h>
++#include "user.h"
++#include "kern_util.h"
++#include "os.h"
++
++struct helper_data {
++      void (*pre_exec)(void*);
++      void *pre_data;
++      char **argv;
++      int fd;
++};
++
++/* Debugging aid, changed only from gdb */
++int helper_pause = 0;
++
++static void helper_hup(int sig)
++{
++}
++
++static int helper_child(void *arg)
++{
++      struct helper_data *data = arg;
++      char **argv = data->argv;
++
++      if(helper_pause){
++              signal(SIGHUP, helper_hup);
++              pause();
++      }
++      if(data->pre_exec != NULL)
++              (*data->pre_exec)(data->pre_data);
++      execvp(argv[0], argv);
++      printk("execvp of '%s' failed - errno = %d\n", argv[0], errno);
++      write(data->fd, &errno, sizeof(errno));
++      os_kill_process(os_getpid(), 0);
++      return(0);
++}
++
++/* XXX The alloc_stack here breaks if this is called in the tracing thread */
++
++int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv,
++             unsigned long *stack_out)
++{
++      struct helper_data data;
++      unsigned long stack, sp;
++      int pid, fds[2], err, n;
++
++      if((stack_out != NULL) && (*stack_out != 0))
++              stack = *stack_out;
++      else stack = alloc_stack(0, um_in_interrupt());
++      if(stack == 0) return(-ENOMEM);
++
++      err = os_pipe(fds, 1, 0);
++      if(err){
++              printk("run_helper : pipe failed, errno = %d\n", -err);
++              return(err);
++      }
++      if(fcntl(fds[1], F_SETFD, 1) != 0){
++              printk("run_helper : setting FD_CLOEXEC failed, errno = %d\n",
++                     errno);
++              return(-errno);
++      }
++
++      sp = stack + page_size() - sizeof(void *);
++      data.pre_exec = pre_exec;
++      data.pre_data = pre_data;
++      data.argv = argv;
++      data.fd = fds[1];
++      pid = clone(helper_child, (void *) sp, CLONE_VM | SIGCHLD, &data);
++      if(pid < 0){
++              printk("run_helper : clone failed, errno = %d\n", errno);
++              return(-errno);
++      }
++      close(fds[1]);
++      n = read(fds[0], &err, sizeof(err));
++      if(n < 0){
++              printk("run_helper : read on pipe failed, errno = %d\n", 
++                     errno);
++              return(-errno);
++      }
++      else if(n != 0){
++              waitpid(pid, NULL, 0);
++              pid = -err;
++      }
++
++      if(stack_out == NULL) free_stack(stack, 0);
++        else *stack_out = stack;
++      return(pid);
++}
++
++int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, 
++                    unsigned long *stack_out, int stack_order)
++{
++      unsigned long stack, sp;
++      int pid, status;
++
++      stack = alloc_stack(stack_order, um_in_interrupt());
++      if(stack == 0) return(-ENOMEM);
++
++      sp = stack + (page_size() << stack_order) - sizeof(void *);
++      pid = clone(proc, (void *) sp, flags | SIGCHLD, arg);
++      if(pid < 0){
++              printk("run_helper_thread : clone failed, errno = %d\n", 
++                     errno);
++              return(-errno);
++      }
++      if(stack_out == NULL){
++              pid = waitpid(pid, &status, 0);
++              if(pid < 0)
++                      printk("run_helper_thread - wait failed, errno = %d\n",
++                             pid);
++              if(!WIFEXITED(status) || (WEXITSTATUS(status) != 0))
++                      printk("run_helper_thread - thread returned status "
++                             "0x%x\n", status);
++              free_stack(stack, stack_order);
++      }
++        else *stack_out = stack;
++      return(pid);
++}
++
++int helper_wait(int pid, int block)
++{
++      int ret;
++
++      ret = waitpid(pid, NULL, WNOHANG);
++      if(ret < 0){
++              printk("helper_wait : waitpid failed, errno = %d\n", errno);
++              return(-errno);
++      }
++      return(ret);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
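A hypothetical caller of run_helper(), not part of this patch, showing the simplest case: no pre-exec hook, no persistent stack, and /bin/true as the host program.

/* Illustrative only.  With pre_exec == NULL nothing runs between clone()
 * and execvp(), and with stack_out == NULL run_helper() frees the stack
 * itself.  On success the host pid is returned; on failure, -errno.
 */
static int spawn_true_on_host(void)
{
        char *argv[] = { "/bin/true", NULL };
        int pid;

        pid = run_helper(NULL, NULL, argv, NULL);
        if(pid < 0)
                printk("spawn_true_on_host failed, err = %d\n", -pid);
        return(pid);
}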
+diff -Naur -X ../exclude-files orig/arch/um/kernel/init_task.c um/arch/um/kernel/init_task.c
+--- orig/arch/um/kernel/init_task.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/init_task.c      Sat Dec 28 19:58:44 2002
+@@ -0,0 +1,61 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/config.h"
++#include "linux/mm.h"
++#include "linux/sched.h"
++#include "linux/version.h"
++#include "asm/uaccess.h"
++#include "asm/pgtable.h"
++#include "user_util.h"
++#include "mem_user.h"
++
++static struct fs_struct init_fs = INIT_FS;
++static struct files_struct init_files = INIT_FILES;
++static struct signal_struct init_signals = INIT_SIGNALS;
++struct mm_struct init_mm = INIT_MM(init_mm);
++
++/*
++ * Initial task structure.
++ *
++ * We need to make sure that this is 16384-byte aligned due to the
++ * way process stacks are handled. This is done by having a special
++ * "init_task" linker map entry..
++ */
++
++union task_union init_task_union 
++__attribute__((__section__(".data.init_task"))) = 
++{ INIT_TASK(init_task_union.task) };
++
++struct task_struct *alloc_task_struct(void)
++{
++      return((struct task_struct *) 
++             __get_free_pages(GFP_KERNEL, CONFIG_KERNEL_STACK_ORDER));
++}
++
++void unprotect_stack(unsigned long stack)
++{
++      protect_memory(stack, (1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE, 
++                     1, 1, 0, 1);
++}
++
++void free_task_struct(struct task_struct *task)
++{
++      /* free_pages decrements the page counter and only actually frees
++       * the pages if they are now not accessed by anything.
++       */
++      free_pages((unsigned long) task, CONFIG_KERNEL_STACK_ORDER);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/initrd_kern.c um/arch/um/kernel/initrd_kern.c
+--- orig/arch/um/kernel/initrd_kern.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/initrd_kern.c    Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,59 @@
++/*
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/init.h"
++#include "linux/bootmem.h"
++#include "linux/blk.h"
++#include "asm/types.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "initrd.h"
++#include "init.h"
++#include "os.h"
++
++/* Changed by uml_initrd_setup, which is a setup */
++static char *initrd __initdata = NULL;
++
++static int __init read_initrd(void)
++{
++      void *area;
++      long long size;
++      int err;
++
++      if(initrd == NULL) return 0;
++      err = os_file_size(initrd, &size);
++      if(err) return 0;
++      area = alloc_bootmem(size);
++      if(area == NULL) return 0;
++      if(load_initrd(initrd, area, size) == -1) return 0;
++      initrd_start = (unsigned long) area;
++      initrd_end = initrd_start + size;
++      return 0;
++}
++
++__uml_postsetup(read_initrd);
++
++static int __init uml_initrd_setup(char *line, int *add)
++{
++      initrd = line;
++      return 0;
++}
++
++__uml_setup("initrd=", uml_initrd_setup, 
++"initrd=<initrd image>\n"
++"    This is used to boot UML from an initrd image.  The argument is the\n"
++"    name of the file containing the image.\n\n"
++);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/initrd_user.c um/arch/um/kernel/initrd_user.c
+--- orig/arch/um/kernel/initrd_user.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/initrd_user.c    Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,43 @@
++/*
++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <unistd.h>
++#include <sys/types.h>
++#include <sys/stat.h>
++#include <fcntl.h>
++#include <errno.h>
++
++#include "user_util.h"
++#include "kern_util.h"
++#include "user.h"
++#include "initrd.h"
++#include "os.h"
++
++int load_initrd(char *filename, void *buf, int size)
++{
++      int fd, n;
++
++      if((fd = os_open_file(filename, of_read(OPENFLAGS()), 0)) < 0){
++              printk("Opening '%s' failed - errno = %d\n", filename, errno);
++              return(-1);
++      }
++      if((n = read(fd, buf, size)) != size){
++              printk("Read of %d bytes from '%s' returned %d, errno = %d\n",
++                     size, filename, n, errno);
++              return(-1);
++      }
++      return(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/irq.c um/arch/um/kernel/irq.c
+--- orig/arch/um/kernel/irq.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/irq.c    Wed Mar 26 14:45:29 2003
+@@ -0,0 +1,842 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ * Derived (i.e. mostly copied) from arch/i386/kernel/irq.c:
++ *    Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
++ */
++
++#include "linux/config.h"
++#include "linux/kernel.h"
++#include "linux/smp.h"
++#include "linux/irq.h"
++#include "linux/kernel_stat.h"
++#include "linux/interrupt.h"
++#include "linux/random.h"
++#include "linux/slab.h"
++#include "linux/file.h"
++#include "linux/proc_fs.h"
++#include "linux/init.h"
++#include "linux/seq_file.h"
++#include "asm/irq.h"
++#include "asm/hw_irq.h"
++#include "asm/hardirq.h"
++#include "asm/atomic.h"
++#include "asm/signal.h"
++#include "asm/system.h"
++#include "asm/errno.h"
++#include "asm/uaccess.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "irq_user.h"
++
++static void register_irq_proc (unsigned int irq);
++
++irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned =
++        { [0 ... NR_IRQS-1] = { 0, &no_irq_type, NULL, 0, SPIN_LOCK_UNLOCKED}};
++
++/*
++ * Generic no controller code
++ */
++
++static void enable_none(unsigned int irq) { }
++static unsigned int startup_none(unsigned int irq) { return 0; }
++static void disable_none(unsigned int irq) { }
++static void ack_none(unsigned int irq)
++{
++/*
++ * 'What should we do if we get a hw irq event on an illegal vector?'
++ * Each architecture has to answer this itself; it doesn't deserve
++ * a generic callback, I think.
++ */
++#if CONFIG_X86
++      printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
++#ifdef CONFIG_X86_LOCAL_APIC
++      /*
++       * Currently unexpected vectors happen only on SMP and APIC.
++       * We _must_ ack these because every local APIC has only N
++       * irq slots per priority level, and a 'hanging, unacked' IRQ
++       * holds up an irq slot - in excessive cases (when multiple
++       * unexpected vectors occur) that might lock up the APIC
++       * completely.
++       */
++      ack_APIC_irq();
++#endif
++#endif
++}
++
++/* startup is the same as "enable", shutdown is the same as "disable" */
++#define shutdown_none disable_none
++#define end_none      enable_none
++
++struct hw_interrupt_type no_irq_type = {
++      "none",
++      startup_none,
++      shutdown_none,
++      enable_none,
++      disable_none,
++      ack_none,
++      end_none
++};
++
++/* Not changed */
++volatile unsigned long irq_err_count;
++
++/*
++ * Generic, controller-independent functions:
++ */
++
++int get_irq_list(char *buf)
++{
++      int i, j;
++      unsigned long flags;
++      struct irqaction * action;
++      char *p = buf;
++
++      p += sprintf(p, "           ");
++      for (j=0; j<smp_num_cpus; j++)
++              p += sprintf(p, "CPU%d       ",j);
++      *p++ = '\n';
++
++      for (i = 0 ; i < NR_IRQS ; i++) {
++              spin_lock_irqsave(&irq_desc[i].lock, flags);
++              action = irq_desc[i].action;
++              if (!action) 
++                      goto end;
++              p += sprintf(p, "%3d: ",i);
++#ifndef CONFIG_SMP
++              p += sprintf(p, "%10u ", kstat_irqs(i));
++#else
++              for (j = 0; j < smp_num_cpus; j++)
++                      p += sprintf(p, "%10u ",
++                              kstat.irqs[cpu_logical_map(j)][i]);
++#endif
++              p += sprintf(p, " %14s", irq_desc[i].handler->typename);
++              p += sprintf(p, "  %s", action->name);
++
++              for (action=action->next; action; action = action->next)
++                      p += sprintf(p, ", %s", action->name);
++              *p++ = '\n';
++      end:
++              spin_unlock_irqrestore(&irq_desc[i].lock, flags);
++      }
++      p += sprintf(p, "\n");
++#ifdef notdef
++#if CONFIG_SMP
++      p += sprintf(p, "LOC: ");
++      for (j = 0; j < smp_num_cpus; j++)
++              p += sprintf(p, "%10u ",
++                      apic_timer_irqs[cpu_logical_map(j)]);
++      p += sprintf(p, "\n");
++#endif
++#endif
++      p += sprintf(p, "ERR: %10lu\n", irq_err_count);
++      return p - buf;
++}
++
++
++/*
++ * This should really return information about whether
++ * we should do bottom half handling etc. Right now we
++ * end up _always_ checking the bottom half, which is a
++ * waste of time and is not what some drivers would
++ * prefer.
++ */
++int handle_IRQ_event(unsigned int irq, struct pt_regs * regs, 
++                   struct irqaction * action)
++{
++      int status;
++      int cpu = smp_processor_id();
++
++      irq_enter(cpu, irq);
++
++      status = 1;     /* Force the "do bottom halves" bit */
++
++      if (!(action->flags & SA_INTERRUPT))
++              __sti();
++
++      do {
++              status |= action->flags;
++              action->handler(irq, action->dev_id, regs);
++              action = action->next;
++      } while (action);
++      if (status & SA_SAMPLE_RANDOM)
++              add_interrupt_randomness(irq);
++      __cli();
++
++      irq_exit(cpu, irq);
++
++      return status;
++}
++
++/*
++ * Generic enable/disable code: this just calls
++ * down into the PIC-specific version for the actual
++ * hardware disable after having gotten the irq
++ * controller lock. 
++ */
++ 
++/**
++ *    disable_irq_nosync - disable an irq without waiting
++ *    @irq: Interrupt to disable
++ *
++ *    Disable the selected interrupt line. Disables nest, so each
++ *    disable must be matched by an enable. Unlike disable_irq(), this
++ *    function does not ensure that existing instances of the IRQ
++ *    handler have completed before returning.
++ *
++ *    This function may be called from IRQ context.
++ */
++ 
++void inline disable_irq_nosync(unsigned int irq)
++{
++      irq_desc_t *desc = irq_desc + irq;
++      unsigned long flags;
++
++      spin_lock_irqsave(&desc->lock, flags);
++      if (!desc->depth++) {
++              desc->status |= IRQ_DISABLED;
++              desc->handler->disable(irq);
++      }
++      spin_unlock_irqrestore(&desc->lock, flags);
++}
++
++/**
++ *    disable_irq - disable an irq and wait for completion
++ *    @irq: Interrupt to disable
++ *
++ *    Disable the selected interrupt line. Disables nest: for two
++ *    disables you need two enables. This
++ *    function waits for any pending IRQ handlers for this interrupt
++ *    to complete before returning. If you use this function while
++ *    holding a resource the IRQ handler may need you will deadlock.
++ *
++ *    This function may be called - with care - from IRQ context.
++ */
++ 
++void disable_irq(unsigned int irq)
++{
++      disable_irq_nosync(irq);
++
++      if (!local_irq_count(smp_processor_id())) {
++              do {
++                      barrier();
++              } while (irq_desc[irq].status & IRQ_INPROGRESS);
++      }
++}
++
++/**
++ *    enable_irq - enable interrupt handling on an irq
++ *    @irq: Interrupt to enable
++ *
++ *    Re-enables the processing of interrupts on this IRQ line
++ *    provided no disable_irq() calls are still in effect.
++ *
++ *    This function may be called from IRQ context.
++ */
++ 
++void enable_irq(unsigned int irq)
++{
++      irq_desc_t *desc = irq_desc + irq;
++      unsigned long flags;
++
++      spin_lock_irqsave(&desc->lock, flags);
++      switch (desc->depth) {
++      case 1: {
++              unsigned int status = desc->status & ~IRQ_DISABLED;
++              desc->status = status;
++              if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
++                      desc->status = status | IRQ_REPLAY;
++                      hw_resend_irq(desc->handler,irq);
++              }
++              desc->handler->enable(irq);
++              /* fall-through */
++      }
++      default:
++              desc->depth--;
++              break;
++      case 0:
++              printk(KERN_ERR "enable_irq() unbalanced from %p\n",
++                     __builtin_return_address(0));
++      }
++      spin_unlock_irqrestore(&desc->lock, flags);
++}
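A hypothetical driver-side sketch of the nesting rule documented above; my_irq and the surrounding function are invented for illustration.

/* Each disable_irq() bumps desc->depth, and only the enable_irq() that
 * brings the depth back to zero actually re-enables the line.
 */
static void quiesce_and_resume(unsigned int my_irq)
{
        disable_irq(my_irq);            /* depth 0 -> 1, line masked */
        disable_irq(my_irq);            /* depth 1 -> 2, still masked */

        /* ... touch the hardware with the line guaranteed quiet ... */

        enable_irq(my_irq);             /* depth 2 -> 1, still masked */
        enable_irq(my_irq);             /* depth 1 -> 0, line live again */
}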
++
++/*
++ * do_IRQ handles all normal device IRQ's (the special
++ * SMP cross-CPU interrupts have their own specific
++ * handlers).
++ */
++unsigned int do_IRQ(int irq, union uml_pt_regs *regs)
++{     
++      /* 
++       * A return value of 0 means that this irq is already being
++       * handled by some other CPU (or is disabled).
++       */
++      int cpu = smp_processor_id();
++      irq_desc_t *desc = irq_desc + irq;
++      struct irqaction * action;
++      unsigned int status;
++
++      kstat.irqs[cpu][irq]++;
++      spin_lock(&desc->lock);
++      desc->handler->ack(irq);
++      /*
++       * REPLAY is when Linux resends an IRQ that was dropped earlier.
++       * WAITING is used by probe to mark irqs that are being tested.
++       */
++      status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
++      status |= IRQ_PENDING; /* we _want_ to handle it */
++
++      /*
++       * If the IRQ is disabled for whatever reason, we cannot
++       * use the action we have.
++       */
++      action = NULL;
++      if (!(status & (IRQ_DISABLED | IRQ_INPROGRESS))) {
++              action = desc->action;
++              status &= ~IRQ_PENDING; /* we commit to handling */
++              status |= IRQ_INPROGRESS; /* we are handling it */
++      }
++      desc->status = status;
++
++      /*
++       * If there is no IRQ handler or it was disabled, exit early.
++       * Since we set PENDING, if another processor is handling
++       * a different instance of this same irq, the other processor
++       * will take care of it.
++       */
++      if (!action)
++              goto out;
++
++      /*
++       * Edge triggered interrupts need to remember
++       * pending events.
++       * This applies to any hw interrupts that allow a second
++       * instance of the same irq to arrive while we are in do_IRQ
++       * or in the handler. But the code here only handles the _second_
++       * instance of the irq, not the third or fourth. So it is mostly
++       * useful for irq hardware that does not mask cleanly in an
++       * SMP environment.
++       */
++      for (;;) {
++              spin_unlock(&desc->lock);
++              handle_IRQ_event(irq, (struct pt_regs *) regs, action);
++              spin_lock(&desc->lock);
++              
++              if (!(desc->status & IRQ_PENDING))
++                      break;
++              desc->status &= ~IRQ_PENDING;
++      }
++      desc->status &= ~IRQ_INPROGRESS;
++out:
++      /*
++       * The ->end() handler has to deal with interrupts which got
++       * disabled while the handler was running.
++       */
++      desc->handler->end(irq);
++      spin_unlock(&desc->lock);
++
++      if (softirq_pending(cpu))
++              do_softirq();
++      return 1;
++}
++
++/**
++ *    request_irq - allocate an interrupt line
++ *    @irq: Interrupt line to allocate
++ *    @handler: Function to be called when the IRQ occurs
++ *    @irqflags: Interrupt type flags
++ *    @devname: An ascii name for the claiming device
++ *    @dev_id: A cookie passed back to the handler function
++ *
++ *    This call allocates interrupt resources and enables the
++ *    interrupt line and IRQ handling. From the point this
++ *    call is made your handler function may be invoked. Since
++ *    your handler function must clear any interrupt the board 
++ *    raises, you must take care both to initialise your hardware
++ *    and to set up the interrupt handler in the right order.
++ *
++ *    Dev_id must be globally unique. Normally the address of the
++ *    device data structure is used as the cookie. Since the handler
++ *    receives this value it makes sense to use it.
++ *
++ *	If your interrupt is shared you must pass a non-NULL dev_id,
++ *	as this is required when freeing the interrupt.
++ *
++ *    Flags:
++ *
++ *    SA_SHIRQ                Interrupt is shared
++ *
++ *    SA_INTERRUPT            Disable local interrupts while processing
++ *
++ *    SA_SAMPLE_RANDOM        The interrupt can be used for entropy
++ *
++ */
++ 
++int request_irq(unsigned int irq,
++              void (*handler)(int, void *, struct pt_regs *),
++              unsigned long irqflags, 
++              const char * devname,
++              void *dev_id)
++{
++      int retval;
++      struct irqaction * action;
++
++#if 1
++      /*
++       * Sanity-check: shared interrupts should REALLY pass in
++       * a real dev-ID, otherwise we'll have trouble later trying
++       * to figure out which interrupt is which (messes up the
++       * interrupt freeing logic etc).
++       */
++      if (irqflags & SA_SHIRQ) {
++              if (!dev_id)
++                      printk(KERN_ERR "Bad boy: %s (at 0x%x) called us "
++                             "without a dev_id!\n", devname, (&irq)[-1]);
++      }
++#endif
++
++      if (irq >= NR_IRQS)
++              return -EINVAL;
++      if (!handler)
++              return -EINVAL;
++
++      action = (struct irqaction *)
++                      kmalloc(sizeof(struct irqaction), GFP_KERNEL);
++      if (!action)
++              return -ENOMEM;
++
++      action->handler = handler;
++      action->flags = irqflags;
++      action->mask = 0;
++      action->name = devname;
++      action->next = NULL;
++      action->dev_id = dev_id;
++
++      retval = setup_irq(irq, action);
++      if (retval)
++              kfree(action);
++      return retval;
++}
++
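++/*
++ * Illustrative sketch, not part of the original patch: how a driver
++ * might pair request_irq()/free_irq() as documented above.  The names
++ * example_intr, example_attach, example_detach and the 'dev' cookie
++ * are hypothetical and exist only for this example.
++ */
++static void example_intr(int irq, void *dev_id, struct pt_regs *regs)
++{
++	/* dev_id is the cookie that was passed to request_irq() below */
++}
++
++static int example_attach(unsigned int irq, void *dev)
++{
++	/* a shared irq needs a non-NULL, unique dev_id cookie */
++	return(request_irq(irq, example_intr, SA_SHIRQ, "example", dev));
++}
++
++static void example_detach(unsigned int irq, void *dev)
++{
++	/* the same cookie identifies which handler to remove */
++	free_irq(irq, dev);
++}
++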
++int um_request_irq(unsigned int irq, int fd, int type,
++                 void (*handler)(int, void *, struct pt_regs *),
++                 unsigned long irqflags, const char * devname,
++                 void *dev_id)
++{
++      int err;
++
++      err = request_irq(irq, handler, irqflags, devname, dev_id);
++      if(err) 
++              return(err);
++
++      if(fd != -1)
++              err = activate_fd(irq, fd, type, dev_id);
++      return(err);
++}
++
++/* this was setup_x86_irq but it seems pretty generic */
++int setup_irq(unsigned int irq, struct irqaction * new)
++{
++      int shared = 0;
++      unsigned long flags;
++      struct irqaction *old, **p;
++      irq_desc_t *desc = irq_desc + irq;
++
++      /*
++       * Some drivers like serial.c use request_irq() heavily,
++       * so we have to be careful not to interfere with a
++       * running system.
++       */
++      if (new->flags & SA_SAMPLE_RANDOM) {
++		/*
++		 * This function might sleep, so we want to call it first,
++		 * outside of the atomic block.
++		 * Yes, this might clear the entropy pool if the wrong
++		 * driver is attempted to be loaded without actually
++		 * installing a new handler, but that is hardly a problem,
++		 * since only the sysadmin is able to do this.
++		 */
++              rand_initialize_irq(irq);
++      }
++
++      /*
++       * The following block of code has to be executed atomically
++       */
++      spin_lock_irqsave(&desc->lock,flags);
++      p = &desc->action;
++      if ((old = *p) != NULL) {
++              /* Can't share interrupts unless both agree to */
++              if (!(old->flags & new->flags & SA_SHIRQ)) {
++                      spin_unlock_irqrestore(&desc->lock,flags);
++                      return -EBUSY;
++              }
++
++              /* add new interrupt at end of irq queue */
++              do {
++                      p = &old->next;
++                      old = *p;
++              } while (old);
++              shared = 1;
++      }
++
++      *p = new;
++
++      if (!shared) {
++              desc->depth = 0;
++              desc->status &= ~IRQ_DISABLED;
++              desc->handler->startup(irq);
++      }
++      spin_unlock_irqrestore(&desc->lock,flags);
++
++      register_irq_proc(irq);
++      return 0;
++}
++
++/**
++ *    free_irq - free an interrupt
++ *    @irq: Interrupt line to free
++ *    @dev_id: Device identity to free
++ *
++ *    Remove an interrupt handler. The handler is removed and if the
++ *    interrupt line is no longer in use by any driver it is disabled.
++ *    On a shared IRQ the caller must ensure the interrupt is disabled
++ *    on the card it drives before calling this function. The function
++ *    does not return until any executing interrupts for this IRQ
++ *    have completed.
++ *
++ *    This function may be called from interrupt context. 
++ *
++ *    Bugs: Attempting to free an irq in a handler for the same irq hangs
++ *          the machine.
++ */
++ 
++void free_irq(unsigned int irq, void *dev_id)
++{
++      irq_desc_t *desc;
++      struct irqaction **p;
++      unsigned long flags;
++
++      if (irq >= NR_IRQS)
++              return;
++
++      desc = irq_desc + irq;
++      spin_lock_irqsave(&desc->lock,flags);
++      p = &desc->action;
++      for (;;) {
++              struct irqaction * action = *p;
++              if (action) {
++                      struct irqaction **pp = p;
++                      p = &action->next;
++                      if (action->dev_id != dev_id)
++                              continue;
++
++                      /* Found it - now remove it from the list of entries */
++                      *pp = action->next;
++                      if (!desc->action) {
++                              desc->status |= IRQ_DISABLED;
++                              desc->handler->shutdown(irq);
++                      }
++                      free_irq_by_irq_and_dev(irq, dev_id);
++                      spin_unlock_irqrestore(&desc->lock,flags);
++
++#ifdef CONFIG_SMP
++                      /* Wait to make sure it's not being used on another CPU */
++                      while (desc->status & IRQ_INPROGRESS)
++                              barrier();
++#endif
++                      kfree(action);
++                      return;
++              }
++		printk(KERN_ERR "Trying to free already-free IRQ%d\n", irq);
++              spin_unlock_irqrestore(&desc->lock,flags);
++              return;
++      }
++}
++
++/* These are initialized by sysctl_init, which is called from init/main.c */
++static struct proc_dir_entry * root_irq_dir;
++static struct proc_dir_entry * irq_dir [NR_IRQS];
++static struct proc_dir_entry * smp_affinity_entry [NR_IRQS];
++
++/* These are read and written as longs, so a read won't see a partial write
++ * even during a race.
++ */
++static unsigned long irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = ~0UL };
++
++#define HEX_DIGITS 8
++
++static int irq_affinity_read_proc (char *page, char **start, off_t off,
++                      int count, int *eof, void *data)
++{
++      if (count < HEX_DIGITS+1)
++              return -EINVAL;
++      return sprintf (page, "%08lx\n", irq_affinity[(long)data]);
++}
++
++static unsigned int parse_hex_value (const char *buffer,
++              unsigned long count, unsigned long *ret)
++{
++      unsigned char hexnum [HEX_DIGITS];
++      unsigned long value;
++      int i;
++
++      if (!count)
++              return -EINVAL;
++      if (count > HEX_DIGITS)
++              count = HEX_DIGITS;
++      if (copy_from_user(hexnum, buffer, count))
++              return -EFAULT;
++
++      /*
++       * Parse the first 8 characters as a hex string, any non-hex char
++       * is end-of-string. '00e1', 'e1', '00E1', 'E1' are all the same.
++       */
++      value = 0;
++
++      for (i = 0; i < count; i++) {
++              unsigned int c = hexnum[i];
++
++              switch (c) {
++                      case '0' ... '9': c -= '0'; break;
++                      case 'a' ... 'f': c -= 'a'-10; break;
++                      case 'A' ... 'F': c -= 'A'-10; break;
++              default:
++                      goto out;
++              }
++              value = (value << 4) | c;
++      }
++out:
++      *ret = value;
++      return 0;
++}
++
++static int irq_affinity_write_proc (struct file *file, const char *buffer,
++                                      unsigned long count, void *data)
++{
++      int irq = (long) data, full_count = count, err;
++      unsigned long new_value;
++
++      if (!irq_desc[irq].handler->set_affinity)
++              return -EIO;
++
++	err = parse_hex_value(buffer, count, &new_value);
++	if (err)
++		return err;
++
++#ifdef CONFIG_SMP
++      /*
++	 * Do not allow disabling IRQs completely - it's too easy a
++	 * way to make the system unusable accidentally :-) At least
++       * one online CPU still has to be targeted.
++       */
++      if (!(new_value & cpu_online_map))
++              return -EINVAL;
++#endif
++
++      irq_affinity[irq] = new_value;
++      irq_desc[irq].handler->set_affinity(irq, new_value);
++
++      return full_count;
++}
++
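++/*
++ * Illustrative usage, not part of the original patch: the affinity
++ * mask is written as up to HEX_DIGITS hex characters, e.g. from a
++ * shell (the IRQ number 10 is hypothetical):
++ *
++ *	echo 00000003 > /proc/irq/10/smp_affinity
++ *
++ * which asks the handler's set_affinity() hook to target CPUs 0 and 1.
++ */
++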
++static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
++                      int count, int *eof, void *data)
++{
++      unsigned long *mask = (unsigned long *) data;
++      if (count < HEX_DIGITS+1)
++              return -EINVAL;
++      return sprintf (page, "%08lx\n", *mask);
++}
++
++static int prof_cpu_mask_write_proc (struct file *file, const char *buffer,
++                                      unsigned long count, void *data)
++{
++      unsigned long *mask = (unsigned long *) data, full_count = count, err;
++      unsigned long new_value;
++
++      err = parse_hex_value(buffer, count, &new_value);
++      if (err)
++              return err;
++
++      *mask = new_value;
++      return full_count;
++}
++
++#define MAX_NAMELEN 10
++
++static void register_irq_proc (unsigned int irq)
++{
++      struct proc_dir_entry *entry;
++      char name [MAX_NAMELEN];
++
++      if (!root_irq_dir || (irq_desc[irq].handler == &no_irq_type) ||
++          irq_dir[irq])
++              return;
++
++      memset(name, 0, MAX_NAMELEN);
++      sprintf(name, "%d", irq);
++
++      /* create /proc/irq/1234 */
++      irq_dir[irq] = proc_mkdir(name, root_irq_dir);
++
++      /* create /proc/irq/1234/smp_affinity */
++      entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]);
++
++      entry->nlink = 1;
++      entry->data = (void *)(long)irq;
++      entry->read_proc = irq_affinity_read_proc;
++      entry->write_proc = irq_affinity_write_proc;
++
++      smp_affinity_entry[irq] = entry;
++}
++
++/* Read and written as a long */
++unsigned long prof_cpu_mask = -1;
++
++void __init init_irq_proc (void)
++{
++      struct proc_dir_entry *entry;
++      int i;
++
++      /* create /proc/irq */
++      root_irq_dir = proc_mkdir("irq", 0);
++
++      /* create /proc/irq/prof_cpu_mask */
++      entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
++
++      entry->nlink = 1;
++      entry->data = (void *)&prof_cpu_mask;
++      entry->read_proc = prof_cpu_mask_read_proc;
++      entry->write_proc = prof_cpu_mask_write_proc;
++
++      /*
++       * Create entries for all existing IRQs.
++       */
++      for (i = 0; i < NR_IRQS; i++)
++              register_irq_proc(i);
++}
++
++static spinlock_t irq_spinlock = SPIN_LOCK_UNLOCKED;
++
++unsigned long irq_lock(void)
++{
++      unsigned long flags;
++
++      spin_lock_irqsave(&irq_spinlock, flags);
++      return(flags);
++}
++
++void irq_unlock(unsigned long flags)
++{
++      spin_unlock_irqrestore(&irq_spinlock, flags);
++}
++
++unsigned long probe_irq_on(void)
++{
++      return(0);
++}
++
++int probe_irq_off(unsigned long val)
++{
++      return(0);
++}
++
++static unsigned int startup_SIGIO_irq(unsigned int irq)
++{
++      return(0);
++}
++
++static void shutdown_SIGIO_irq(unsigned int irq)
++{
++}
++
++static void enable_SIGIO_irq(unsigned int irq)
++{
++}
++
++static void disable_SIGIO_irq(unsigned int irq)
++{
++}
++
++static void mask_and_ack_SIGIO(unsigned int irq)
++{
++}
++
++static void end_SIGIO_irq(unsigned int irq)
++{
++}
++
++static unsigned int startup_SIGVTALRM_irq(unsigned int irq)
++{
++      return(0);
++}
++
++static void shutdown_SIGVTALRM_irq(unsigned int irq)
++{
++}
++
++static void enable_SIGVTALRM_irq(unsigned int irq)
++{
++}
++
++static void disable_SIGVTALRM_irq(unsigned int irq)
++{
++}
++
++static void mask_and_ack_SIGVTALRM(unsigned int irq)
++{
++}
++
++static void end_SIGVTALRM_irq(unsigned int irq)
++{
++}
++
++static struct hw_interrupt_type SIGIO_irq_type = {
++      "SIGIO",
++      startup_SIGIO_irq,
++      shutdown_SIGIO_irq,
++      enable_SIGIO_irq,
++      disable_SIGIO_irq,
++      mask_and_ack_SIGIO,
++      end_SIGIO_irq,
++      NULL
++};
++
++static struct hw_interrupt_type SIGVTALRM_irq_type = {
++      "SIGVTALRM",
++      startup_SIGVTALRM_irq,
++      shutdown_SIGVTALRM_irq,
++      enable_SIGVTALRM_irq,
++      disable_SIGVTALRM_irq,
++      mask_and_ack_SIGVTALRM,
++      end_SIGVTALRM_irq,
++      NULL
++};
++
++void __init init_IRQ(void)
++{
++      int i;
++
++      irq_desc[TIMER_IRQ].status = IRQ_DISABLED;
++      irq_desc[TIMER_IRQ].action = 0;
++      irq_desc[TIMER_IRQ].depth = 1;
++      irq_desc[TIMER_IRQ].handler = &SIGVTALRM_irq_type;
++      enable_irq(TIMER_IRQ);
++      for(i=1;i<NR_IRQS;i++){
++              irq_desc[i].status = IRQ_DISABLED;
++              irq_desc[i].action = 0;
++              irq_desc[i].depth = 1;
++              irq_desc[i].handler = &SIGIO_irq_type;
++              enable_irq(i);
++      }
++      init_irq_signals(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/irq_user.c um/arch/um/kernel/irq_user.c
+--- orig/arch/um/kernel/irq_user.c     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/irq_user.c       Sun Dec 22 15:49:46 2002
+@@ -0,0 +1,427 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdlib.h>
++#include <unistd.h>
++#include <errno.h>
++#include <fcntl.h>
++#include <signal.h>
++#include <string.h>
++#include <sys/poll.h>
++#include <sys/types.h>
++#include <sys/time.h>
++#include "user_util.h"
++#include "kern_util.h"
++#include "user.h"
++#include "process.h"
++#include "signal_user.h"
++#include "sigio.h"
++#include "irq_user.h"
++#include "os.h"
++
++struct irq_fd {
++      struct irq_fd *next;
++      void *id;
++      int fd;
++      int type;
++      int irq;
++      int pid;
++      int events;
++      int current_events;
++      int freed;
++};
++
++static struct irq_fd *active_fds = NULL;
++static struct irq_fd **last_irq_ptr = &active_fds;
++
++static struct pollfd *pollfds = NULL;
++static int pollfds_num = 0;
++static int pollfds_size = 0;
++
++extern int io_count, intr_count;
++
++void sigio_handler(int sig, union uml_pt_regs *regs)
++{
++      struct irq_fd *irq_fd, *next;
++      int i, n;
++
++      if(smp_sigio_handler()) return;
++      while(1){
++              if((n = poll(pollfds, pollfds_num, 0)) < 0){
++                      if(errno == EINTR) continue;
++                      printk("sigio_handler : poll returned %d, "
++                             "errno = %d\n", n, errno);
++                      break;
++              }
++              if(n == 0) break;
++
++              irq_fd = active_fds;
++              for(i = 0; i < pollfds_num; i++){
++                      if(pollfds[i].revents != 0){
++                              irq_fd->current_events = pollfds[i].revents;
++                              pollfds[i].fd = -1;
++                      }
++                      irq_fd = irq_fd->next;
++              }
++
++              for(irq_fd = active_fds; irq_fd != NULL; irq_fd = next){
++                      next = irq_fd->next;
++                      if(irq_fd->current_events != 0){
++                              irq_fd->current_events = 0;
++                              do_IRQ(irq_fd->irq, regs);
++
++                              /* This is here because the next irq may be
++                               * freed in the handler.  If a console goes
++                               * away, both the read and write irqs will be
++                               * freed.  After do_IRQ, ->next will point to
++                               * a good IRQ.
++                               * Irqs can't be freed inside their handlers,
++                               * so the next best thing is to have them
++                               * marked as needing freeing, so that they
++                               * can be freed here.
++                               */
++                              next = irq_fd->next;
++                              if(irq_fd->freed)
++                                      free_irq(irq_fd->irq, irq_fd->id);
++                      }
++              }
++      }
++}
++
++int activate_ipi(int fd, int pid)
++{
++      return(os_set_fd_async(fd, pid));
++}
++
++static void maybe_sigio_broken(int fd, int type)
++{
++      if(isatty(fd)){
++              if((type == IRQ_WRITE) && !pty_output_sigio){
++                      write_sigio_workaround();
++                      add_sigio_fd(fd, 0);
++              }
++              else if((type == IRQ_READ) && !pty_close_sigio){
++                      write_sigio_workaround();
++                      add_sigio_fd(fd, 1);                    
++              }
++      }
++}
++
++int activate_fd(int irq, int fd, int type, void *dev_id)
++{
++      struct pollfd *tmp_pfd;
++      struct irq_fd *new_fd, *irq_fd;
++      unsigned long flags;
++      int pid, events, err, n, size;
++
++      pid = os_getpid();
++      err = os_set_fd_async(fd, pid);
++      if(err < 0)
++              goto out;
++
++      new_fd = um_kmalloc(sizeof(*new_fd));
++      err = -ENOMEM;
++      if(new_fd == NULL)
++              goto out;
++
++      if(type == IRQ_READ) events = POLLIN | POLLPRI;
++      else events = POLLOUT;
++      *new_fd = ((struct irq_fd) { .next              = NULL,
++                                   .id                = dev_id,
++                                   .fd                = fd,
++                                   .type              = type,
++                                   .irq               = irq,
++                                   .pid               = pid,
++                                   .events            = events,
++                                   .current_events    = 0,
++                                   .freed             = 0  } );
++
++      /* Critical section - locked by a spinlock because this stuff can
++       * be changed from interrupt handlers.  The stuff above is done 
++       * outside the lock because it allocates memory.
++       */
++
++      /* Actually, it only looks like it can be called from interrupt
++       * context.  The culprit is reactivate_fd, which calls 
++       * maybe_sigio_broken, which calls write_sigio_workaround,
++       * which calls activate_fd.  However, write_sigio_workaround should
++       * only be called once, at boot time.  That would make it clear that
++       * this is called only from process context, and can be locked with
++       * a semaphore.
++       */
++      flags = irq_lock();
++      for(irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next){
++              if((irq_fd->fd == fd) && (irq_fd->type == type)){
++                      printk("Registering fd %d twice\n", fd);
++                      printk("Irqs : %d, %d\n", irq_fd->irq, irq);
++			printk("Ids : 0x%p, 0x%p\n", irq_fd->id, dev_id);
++                      goto out_unlock;
++              }
++      }
++
++      n = pollfds_num;
++      if(n == pollfds_size){
++              while(1){
++                      /* Here we have to drop the lock in order to call 
++                       * kmalloc, which might sleep.  If something else
++                       * came in and changed the pollfds array, we free
++                       * the buffer and try again.
++                       */
++                      irq_unlock(flags);
++                      size = (pollfds_num + 1) * sizeof(pollfds[0]);
++                      tmp_pfd = um_kmalloc(size);
++                      flags = irq_lock();
++                      if(tmp_pfd == NULL)
++                              goto out_unlock;
++                      if(n == pollfds_size)
++                              break;
++                      kfree(tmp_pfd);
++              }
++              if(pollfds != NULL){
++                      memcpy(tmp_pfd, pollfds,
++                             sizeof(pollfds[0]) * pollfds_size);
++                      kfree(pollfds);
++              }
++              pollfds = tmp_pfd;
++              pollfds_size++;
++      }
++
++      if(type == IRQ_WRITE) 
++              fd = -1;
++
++      pollfds[pollfds_num] = ((struct pollfd) { .fd   = fd,
++                                                .events       = events,
++                                                .revents      = 0 });
++      pollfds_num++;
++
++      *last_irq_ptr = new_fd;
++      last_irq_ptr = &new_fd->next;
++
++      irq_unlock(flags);
++
++      /* This calls activate_fd, so it has to be outside the critical
++       * section.
++       */
++      maybe_sigio_broken(fd, type);
++
++      return(0);
++
++ out_unlock:
++      irq_unlock(flags);
++      kfree(new_fd);
++ out:
++      return(err);
++}
++
++static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg)
++{
++      struct irq_fd **prev;
++      unsigned long flags;
++      int i = 0;
++
++      flags = irq_lock();
++      prev = &active_fds;
++      while(*prev != NULL){
++              if((*test)(*prev, arg)){
++                      struct irq_fd *old_fd = *prev;
++                      if((pollfds[i].fd != -1) && 
++                         (pollfds[i].fd != (*prev)->fd)){
++                              printk("free_irq_by_cb - mismatch between "
++                                     "active_fds and pollfds, fd %d vs %d\n",
++                                     (*prev)->fd, pollfds[i].fd);
++                              goto out;
++                      }
++                      memcpy(&pollfds[i], &pollfds[i + 1],
++                             (pollfds_num - i - 1) * sizeof(pollfds[0]));
++                      pollfds_num--;
++                      if(last_irq_ptr == &old_fd->next) 
++                              last_irq_ptr = prev;
++                      *prev = (*prev)->next;
++                      if(old_fd->type == IRQ_WRITE) 
++                              ignore_sigio_fd(old_fd->fd);
++                      kfree(old_fd);
++                      continue;
++              }
++              prev = &(*prev)->next;
++              i++;
++      }
++ out:
++      irq_unlock(flags);
++}
++
++struct irq_and_dev {
++      int irq;
++      void *dev;
++};
++
++static int same_irq_and_dev(struct irq_fd *irq, void *d)
++{
++      struct irq_and_dev *data = d;
++
++      return((irq->irq == data->irq) && (irq->id == data->dev));
++}
++
++void free_irq_by_irq_and_dev(int irq, void *dev)
++{
++      struct irq_and_dev data = ((struct irq_and_dev) { .irq  = irq,
++                                                        .dev  = dev });
++
++      free_irq_by_cb(same_irq_and_dev, &data);
++}
++
++static int same_fd(struct irq_fd *irq, void *fd)
++{
++      return(irq->fd == *((int *) fd));
++}
++
++void free_irq_by_fd(int fd)
++{
++      free_irq_by_cb(same_fd, &fd);
++}
++
++static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out)
++{
++      struct irq_fd *irq;
++      int i = 0;
++
++      for(irq=active_fds; irq != NULL; irq = irq->next){
++              if((irq->fd == fd) && (irq->irq == irqnum)) break;
++              i++;
++      }
++      if(irq == NULL){
++              printk("find_irq_by_fd doesn't have descriptor %d\n", fd);
++              goto out;
++      }
++      if((pollfds[i].fd != -1) && (pollfds[i].fd != fd)){
++              printk("find_irq_by_fd - mismatch between active_fds and "
++                     "pollfds, fd %d vs %d, need %d\n", irq->fd, 
++                     pollfds[i].fd, fd);
++              irq = NULL;
++              goto out;
++      }
++      *index_out = i;
++ out:
++      return(irq);
++}
++
++void free_irq_later(int irq, void *dev_id)
++{
++      struct irq_fd *irq_fd;
++      unsigned long flags;
++
++      flags = irq_lock();
++      for(irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next){
++              if((irq_fd->irq == irq) && (irq_fd->id == dev_id))
++                      break;
++      }
++      if(irq_fd == NULL){
++              printk("free_irq_later found no irq, irq = %d, "
++                     "dev_id = 0x%p\n", irq, dev_id);
++              goto out;
++      }
++      irq_fd->freed = 1;
++ out:
++      irq_unlock(flags);
++}
++
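++/*
++ * Illustrative note, not part of the original patch: since an irq
++ * cannot be freed from its own handler (see the comment in
++ * sigio_handler() above), a driver whose interrupt routine finds that
++ * its fd has gone away would call
++ *
++ *	free_irq_later(irq, dev_id);
++ *
++ * there, and sigio_handler() performs the real free_irq() on its next
++ * pass over the marked entry.
++ */
++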
++void reactivate_fd(int fd, int irqnum)
++{
++      struct irq_fd *irq;
++      unsigned long flags;
++      int i;
++
++      flags = irq_lock();
++      irq = find_irq_by_fd(fd, irqnum, &i);
++      if(irq == NULL){
++              irq_unlock(flags);
++              return;
++      }
++
++      pollfds[i].fd = irq->fd;
++
++      irq_unlock(flags);
++
++      /* This calls activate_fd, so it has to be outside the critical
++       * section.
++       */
++      maybe_sigio_broken(fd, irq->type);
++}
++
++void deactivate_fd(int fd, int irqnum)
++{
++      struct irq_fd *irq;
++      unsigned long flags;
++      int i;
++
++      flags = irq_lock();
++      irq = find_irq_by_fd(fd, irqnum, &i);
++      if(irq == NULL)
++              goto out;
++      pollfds[i].fd = -1;
++ out:
++      irq_unlock(flags);
++}
++
++void forward_ipi(int fd, int pid)
++{
++      if(fcntl(fd, F_SETOWN, pid) < 0){
++              int save_errno = errno;
++              if(fcntl(fd, F_GETOWN, 0) != pid){
++                      printk("forward_ipi: F_SETOWN failed, fd = %d, "
++                             "me = %d, target = %d, errno = %d\n", fd, 
++                             os_getpid(), pid, save_errno);
++              }
++      }
++}
++
++void forward_interrupts(int pid)
++{
++      struct irq_fd *irq;
++      unsigned long flags;
++
++      flags = irq_lock();
++      for(irq=active_fds;irq != NULL;irq = irq->next){
++              if(fcntl(irq->fd, F_SETOWN, pid) < 0){
++                      int save_errno = errno;
++                      if(fcntl(irq->fd, F_GETOWN, 0) != pid){
++                              /* XXX Just remove the irq rather than
++                               * print out an infinite stream of these
++                               */
++                              printk("Failed to forward %d to pid %d, "
++                                     "errno = %d\n", irq->fd, pid, 
++                                     save_errno);
++                      }
++              }
++              irq->pid = pid;
++      }
++      irq_unlock(flags);
++}
++
++void init_irq_signals(int on_sigstack)
++{
++      __sighandler_t h;
++      int flags;
++
++      flags = on_sigstack ? SA_ONSTACK : 0;
++      if(timer_irq_inited) h = (__sighandler_t) alarm_handler;
++      else h = boot_timer_handler;
++
++      set_handler(SIGVTALRM, h, flags | SA_RESTART, 
++                  SIGUSR1, SIGIO, SIGWINCH, SIGALRM, -1);
++      set_handler(SIGIO, (__sighandler_t) sig_handler, flags | SA_RESTART,
++                  SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1);
++      signal(SIGWINCH, SIG_IGN);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/ksyms.c um/arch/um/kernel/ksyms.c
+--- orig/arch/um/kernel/ksyms.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/ksyms.c  Tue Dec 17 13:29:43 2002
+@@ -0,0 +1,94 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/config.h"
++#include "linux/module.h"
++#include "linux/string.h"
++#include "linux/smp_lock.h"
++#include "linux/spinlock.h"
++#include "asm/current.h"
++#include "asm/delay.h"
++#include "asm/processor.h"
++#include "asm/unistd.h"
++#include "asm/pgalloc.h"
++#include "asm/pgtable.h"
++#include "asm/page.h"
++#include "kern_util.h"
++#include "user_util.h"
++#include "os.h"
++#include "helper.h"
++
++EXPORT_SYMBOL(stop);
++EXPORT_SYMBOL(strtok);
++EXPORT_SYMBOL(uml_physmem);
++EXPORT_SYMBOL(set_signals);
++EXPORT_SYMBOL(get_signals);
++EXPORT_SYMBOL(kernel_thread);
++EXPORT_SYMBOL(__const_udelay);
++EXPORT_SYMBOL(__udelay);
++EXPORT_SYMBOL(sys_waitpid);
++EXPORT_SYMBOL(task_size);
++EXPORT_SYMBOL(flush_tlb_range);
++EXPORT_SYMBOL(host_task_size);
++EXPORT_SYMBOL(arch_validate);
++
++EXPORT_SYMBOL(region_pa);
++EXPORT_SYMBOL(region_va);
++EXPORT_SYMBOL(phys_mem_map);
++EXPORT_SYMBOL(page_mem_map);
++EXPORT_SYMBOL(high_physmem);
++EXPORT_SYMBOL(empty_zero_page);
++EXPORT_SYMBOL(um_virt_to_phys);
++EXPORT_SYMBOL(mode_tt);
++EXPORT_SYMBOL(handle_page_fault);
++
++EXPORT_SYMBOL(os_getpid);
++EXPORT_SYMBOL(os_open_file);
++EXPORT_SYMBOL(os_read_file);
++EXPORT_SYMBOL(os_write_file);
++EXPORT_SYMBOL(os_seek_file);
++EXPORT_SYMBOL(os_pipe);
++EXPORT_SYMBOL(os_file_type);
++EXPORT_SYMBOL(os_close_file);
++EXPORT_SYMBOL(helper_wait);
++EXPORT_SYMBOL(os_shutdown_socket);
++EXPORT_SYMBOL(os_connect_socket);
++EXPORT_SYMBOL(run_helper);
++EXPORT_SYMBOL(start_thread);
++EXPORT_SYMBOL(dump_thread);
++
++/* This is here because UML expands open to sys_open, not to a system
++ * call instruction.
++ */
++EXPORT_SYMBOL(sys_open);
++EXPORT_SYMBOL(sys_lseek);
++EXPORT_SYMBOL(sys_read);
++EXPORT_SYMBOL(sys_wait4);
++
++#ifdef CONFIG_SMP
++
++/* required for SMP */
++
++extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
++EXPORT_SYMBOL_NOVERS(__write_lock_failed);
++
++extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
++EXPORT_SYMBOL_NOVERS(__read_lock_failed);
++
++EXPORT_SYMBOL(kernel_flag_cacheline);
++EXPORT_SYMBOL(smp_num_cpus);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/mem.c um/arch/um/kernel/mem.c
+--- orig/arch/um/kernel/mem.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/mem.c    Sun Mar 30 14:30:26 2003
+@@ -0,0 +1,852 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/config.h"
++#include "linux/types.h"
++#include "linux/mm.h"
++#include "linux/fs.h"
++#include "linux/init.h"
++#include "linux/bootmem.h"
++#include "linux/swap.h"
++#include "linux/slab.h"
++#include "linux/vmalloc.h"
++#include "linux/highmem.h"
++#include "asm/page.h"
++#include "asm/pgtable.h"
++#include "asm/pgalloc.h"
++#include "asm/bitops.h"
++#include "asm/uaccess.h"
++#include "asm/tlb.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "mem_user.h"
++#include "mem.h"
++#include "kern.h"
++#include "init.h"
++#include "os.h"
++#include "mode_kern.h"
++#include "uml_uaccess.h"
++
++/* Changed during early boot */
++pgd_t swapper_pg_dir[1024];
++unsigned long high_physmem;
++unsigned long vm_start;
++unsigned long vm_end;
++unsigned long highmem;
++unsigned long *empty_zero_page = NULL;
++unsigned long *empty_bad_page = NULL;
++
++/* Not modified */
++const char bad_pmd_string[] = "Bad pmd in pte_alloc: %08lx\n";
++
++/* Changed during early boot */
++static unsigned long totalram_pages = 0;
++
++extern char __init_begin, __init_end;
++extern long physmem_size;
++
++#ifdef CONFIG_SMP
++/* Not changed by UML */
++mmu_gather_t mmu_gathers[NR_CPUS];
++#endif
++
++/* Changed during early boot */
++int kmalloc_ok = 0;
++
++#define NREGIONS (phys_region_index(0xffffffff) - phys_region_index(0x0) + 1)
++struct mem_region *regions[NREGIONS] = { [ 0 ... NREGIONS - 1 ] = NULL };
++#define REGION_SIZE ((0xffffffff & ~REGION_MASK) + 1)
++
++/* Changed during early boot */
++static unsigned long brk_end;
++
++static void map_cb(void *unused)
++{
++      map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0);
++}
++
++void unmap_physmem(void)
++{
++      os_unmap_memory((void *) brk_end, uml_reserved - brk_end);
++}
++
++extern char __binary_start;
++
++void mem_init(void)
++{
++      unsigned long start;
++
++#ifdef CONFIG_HIGHMEM
++      highmem_start_page = phys_page(__pa(high_physmem));
++#endif
++
++        /* clear the zero-page */
++        memset((void *) empty_zero_page, 0, PAGE_SIZE);
++
++      /* Map in the area just after the brk now that kmalloc is about
++       * to be turned on.
++       */
++      brk_end = (unsigned long) UML_ROUND_UP(sbrk(0));
++      map_cb(NULL);
++      initial_thread_cb(map_cb, NULL);
++      free_bootmem(__pa(brk_end), uml_reserved - brk_end);
++      uml_reserved = brk_end;
++
++      /* Fill in any hole at the start of the binary */
++      start = (unsigned long) &__binary_start;
++      if(uml_physmem != start){
++              map_memory(uml_physmem, __pa(uml_physmem), start - uml_physmem,
++                  1, 1, 0);
++      }
++
++      /* this will put all low memory onto the freelists */
++      totalram_pages = free_all_bootmem();
++      totalram_pages += highmem >> PAGE_SHIFT;
++      max_mapnr = totalram_pages;
++      num_physpages = totalram_pages;
++      printk(KERN_INFO "Memory: %luk available\n", 
++             (unsigned long) nr_free_pages() << (PAGE_SHIFT-10));
++      kmalloc_ok = 1;
++}
++
++/* Changed during early boot */
++static unsigned long kmem_top = 0;
++
++unsigned long get_kmem_end(void)
++{
++      if(kmem_top == 0) 
++              kmem_top = CHOOSE_MODE(kmem_end_tt, kmem_end_skas);
++      return(kmem_top);
++}
++
++void set_kmem_end(unsigned long new)
++{
++      kmem_top = new;
++}
++
++#ifdef CONFIG_HIGHMEM
++/* Changed during early boot */
++pte_t *kmap_pte;
++pgprot_t kmap_prot;
++
++#define kmap_get_fixmap_pte(vaddr)                                    \
++      pte_offset(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr))
++
++void __init kmap_init(void)
++{
++      unsigned long kmap_vstart;
++
++      /* cache the first kmap pte */
++      kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
++      kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
++
++      kmap_prot = PAGE_KERNEL;
++}
++#endif /* CONFIG_HIGHMEM */
++
++static void __init fixrange_init(unsigned long start, unsigned long end, 
++                               pgd_t *pgd_base)
++{
++      pgd_t *pgd;
++      pmd_t *pmd;
++      pte_t *pte;
++      int i, j;
++      unsigned long vaddr;
++
++      vaddr = start;
++      i = __pgd_offset(vaddr);
++      j = __pmd_offset(vaddr);
++      pgd = pgd_base + i;
++
++      for ( ; (i < PTRS_PER_PGD) && (vaddr < end); pgd++, i++) {
++              pmd = (pmd_t *)pgd;
++              for (; (j < PTRS_PER_PMD) && (vaddr != end); pmd++, j++) {
++                      if (pmd_none(*pmd)) {
++                              pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
++                              set_pmd(pmd, __pmd(_KERNPG_TABLE + 
++                                                 (unsigned long) __pa(pte)));
++                              if (pte != pte_offset(pmd, 0))
++                                      BUG();
++                      }
++                      vaddr += PMD_SIZE;
++              }
++              j = 0;
++      }
++}
++
++int init_maps(struct mem_region *region)
++{
++      struct page *p, *map;
++      int i, n, len;
++
++      if(region == &physmem_region){
++              region->mem_map = mem_map;
++              return(0);
++      }
++      else if(region->mem_map != NULL) return(0);
++
++      n = region->len >> PAGE_SHIFT;
++      len = n * sizeof(struct page);
++      if(kmalloc_ok){
++              map = kmalloc(len, GFP_KERNEL);
++              if(map == NULL) map = vmalloc(len);
++      }
++      else map = alloc_bootmem_low_pages(len);
++
++      if(map == NULL)
++              return(-ENOMEM);
++      for(i = 0; i < n; i++){
++              p = &map[i];
++              set_page_count(p, 0);
++              SetPageReserved(p);
++              INIT_LIST_HEAD(&p->list);
++      }
++      region->mem_map = map;
++      return(0);
++}
++
++DECLARE_MUTEX(regions_sem);
++
++static int setup_one_range(int fd, char *driver, unsigned long start, 
++                         unsigned long pfn, int len, 
++                         struct mem_region *region)
++{
++      int i;
++
++      down(&regions_sem);
++      for(i = 0; i < NREGIONS; i++){
++              if(regions[i] == NULL) break;           
++      }
++      if(i == NREGIONS){
++              printk("setup_one_range : no free regions\n");
++              i = -1;
++              goto out;
++      }
++
++      if(fd == -1)
++              fd = create_mem_file(len);
++
++      if(region == NULL){
++              if(kmalloc_ok)
++                      region = kmalloc(sizeof(*region), GFP_KERNEL);
++              else region = alloc_bootmem_low_pages(sizeof(*region));
++              if(region == NULL)
++			panic("Failed to allocate mem_region");
++      }
++
++      *region = ((struct mem_region) { .driver        = driver,
++                                       .start_pfn     = pfn,
++                                       .start         = start, 
++                                       .len           = len, 
++                                       .fd            = fd } );
++      regions[i] = region;
++ out:
++      up(&regions_sem);
++      return(i);
++}
++
++#ifdef CONFIG_HIGHMEM
++static void init_highmem(void)
++{
++      pgd_t *pgd;
++      pmd_t *pmd;
++      pte_t *pte;
++      unsigned long vaddr;
++
++      /*
++       * Permanent kmaps:
++       */
++      vaddr = PKMAP_BASE;
++      fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, swapper_pg_dir);
++
++      pgd = swapper_pg_dir + __pgd_offset(vaddr);
++      pmd = pmd_offset(pgd, vaddr);
++      pte = pte_offset(pmd, vaddr);
++      pkmap_page_table = pte;
++
++      kmap_init();
++}
++
++void setup_highmem(unsigned long len)
++{
++      struct mem_region *region;
++      struct page *page, *map;
++      unsigned long phys;
++      int i, cur, index;
++
++      phys = physmem_size;
++      do {
++              cur = min(len, (unsigned long) REGION_SIZE);
++              i = setup_one_range(-1, NULL, -1, phys >> PAGE_SHIFT, cur,
++                                  NULL);
++              if(i == -1){
++                      printk("setup_highmem - setup_one_range failed\n");
++                      return;
++              }
++              region = regions[i];
++              index = phys / PAGE_SIZE;
++              region->mem_map = &mem_map[index];
++
++              map = region->mem_map;
++              for(i = 0; i < (cur >> PAGE_SHIFT); i++){
++                      page = &map[i];
++                      ClearPageReserved(page);
++                      set_bit(PG_highmem, &page->flags);
++                      atomic_set(&page->count, 1);
++                      __free_page(page);
++              }
++              phys += cur;
++              len -= cur;
++      } while(len > 0);
++}
++#endif
++
++void paging_init(void)
++{
++      struct mem_region *region;
++      unsigned long zones_size[MAX_NR_ZONES], start, end, vaddr;
++      int i, index;
++
++      empty_zero_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE);
++      empty_bad_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE);
++      for(i=0;i<sizeof(zones_size)/sizeof(zones_size[0]);i++) 
++              zones_size[i] = 0;
++      zones_size[0] = (high_physmem >> PAGE_SHIFT) - 
++              (uml_physmem >> PAGE_SHIFT);
++      zones_size[2] = highmem >> PAGE_SHIFT;
++      free_area_init(zones_size);
++      start = phys_region_index(__pa(uml_physmem));
++      end = phys_region_index(__pa(high_physmem - 1));
++      for(i = start; i <= end; i++){
++              region = regions[i];
++              index = (region->start - uml_physmem) / PAGE_SIZE;
++              region->mem_map = &mem_map[index];
++              if(i > start) free_bootmem(__pa(region->start), region->len);
++      }
++
++      /*
++       * Fixed mappings, only the page table structure has to be
++       * created - mappings will be set by set_fixmap():
++       */
++      vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
++      fixrange_init(vaddr, FIXADDR_TOP, swapper_pg_dir);
++
++#ifdef CONFIG_HIGHMEM
++      init_highmem();
++      setup_highmem(highmem);
++#endif
++}
++
++/* Changed by meminfo_compat, which is a setup */
++static int meminfo_22 = 0;
++
++static int meminfo_compat(char *str)
++{
++      meminfo_22 = 1;
++      return(1);
++}
++
++__setup("22_meminfo", meminfo_compat);
++
++void si_meminfo(struct sysinfo *val)
++{
++      val->totalram = totalram_pages;
++      val->sharedram = 0;
++      val->freeram = nr_free_pages();
++      val->bufferram = atomic_read(&buffermem_pages);
++      val->totalhigh = highmem >> PAGE_SHIFT;
++      val->freehigh = nr_free_highpages();
++      val->mem_unit = PAGE_SIZE;
++      if(meminfo_22){
++              val->freeram <<= PAGE_SHIFT;
++              val->bufferram <<= PAGE_SHIFT;
++              val->totalram <<= PAGE_SHIFT;
++              val->sharedram <<= PAGE_SHIFT;
++      }
++}
++
++pte_t __bad_page(void)
++{
++      clear_page(empty_bad_page);
++        return pte_mkdirty(mk_pte((struct page *) empty_bad_page, 
++                                PAGE_SHARED));
++}
++
++/* This can't do anything because nothing in the kernel image can be freed
++ * since it's not in kernel physical memory.
++ */
++
++void free_initmem(void)
++{
++}
++
++#ifdef CONFIG_BLK_DEV_INITRD
++
++void free_initrd_mem(unsigned long start, unsigned long end)
++{
++      if (start < end)
++              printk ("Freeing initrd memory: %ldk freed\n", 
++                      (end - start) >> 10);
++      for (; start < end; start += PAGE_SIZE) {
++              ClearPageReserved(virt_to_page(start));
++              set_page_count(virt_to_page(start), 1);
++              free_page(start);
++              totalram_pages++;
++      }
++}
++      
++#endif
++
++int do_check_pgt_cache(int low, int high)
++{
++        int freed = 0;
++        if(pgtable_cache_size > high) {
++                do {
++                        if (pgd_quicklist) {
++                                free_pgd_slow(get_pgd_fast());
++                                freed++;
++                        }
++                        if (pmd_quicklist) {
++                                pmd_free_slow(pmd_alloc_one_fast(NULL, 0));
++                                freed++;
++                        }
++                        if (pte_quicklist) {
++                                pte_free_slow(pte_alloc_one_fast(NULL, 0));
++                                freed++;
++                        }
++                } while(pgtable_cache_size > low);
++        }
++        return freed;
++}
++
++void show_mem(void)
++{
++        int i, total = 0, reserved = 0;
++        int shared = 0, cached = 0;
++        int highmem = 0;
++
++        printk("Mem-info:\n");
++        show_free_areas();
++        printk("Free swap:       %6dkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
++        i = max_mapnr;
++        while(i-- > 0) {
++                total++;
++                if(PageHighMem(mem_map + i))
++                        highmem++;
++                if(PageReserved(mem_map + i))
++                        reserved++;
++                else if(PageSwapCache(mem_map + i))
++                        cached++;
++                else if(page_count(mem_map + i))
++                        shared += page_count(mem_map + i) - 1;
++        }
++        printk("%d pages of RAM\n", total);
++        printk("%d pages of HIGHMEM\n", highmem);
++        printk("%d reserved pages\n", reserved);
++        printk("%d pages shared\n", shared);
++        printk("%d pages swap cached\n", cached);
++        printk("%ld pages in page table cache\n", pgtable_cache_size);
++        show_buffers();
++}
++
++static int __init uml_mem_setup(char *line, int *add)
++{
++      char *retptr;
++      physmem_size = memparse(line,&retptr);
++      return 0;
++}
++__uml_setup("mem=", uml_mem_setup,
++"mem=<Amount of desired ram>\n"
++"    This controls how much \"physical\" memory the kernel allocates\n"
++"    for the system. The size is specified as a number followed by\n"
++"    one of 'k', 'K', 'm', 'M', which have the obvious meanings.\n"
++"    This is not related to the amount of memory in the physical\n"
++"    machine. It can be more, and the excess, if it's ever used, will\n"
++"    just be swapped out.\n        Example: mem=64M\n\n"
++);
++
++struct page *arch_validate(struct page *page, int mask, int order)
++{
++      unsigned long addr, zero = 0;
++      int i;
++
++ again:
++      if(page == NULL) return(page);
++      if(PageHighMem(page)) return(page);
++
++      addr = (unsigned long) page_address(page);
++      for(i = 0; i < (1 << order); i++){
++              current->thread.fault_addr = (void *) addr;
++              if(__do_copy_to_user((void *) addr, &zero, 
++                                   sizeof(zero),
++                                   &current->thread.fault_addr,
++                                   &current->thread.fault_catcher)){
++                      if(!(mask & __GFP_WAIT)) return(NULL);
++                      else break;
++              }
++              addr += PAGE_SIZE;
++      }
++      if(i == (1 << order)) return(page);
++      page = _alloc_pages(mask, order);
++      goto again;
++}
++
++DECLARE_MUTEX(vm_reserved_sem);
++static struct list_head vm_reserved = LIST_HEAD_INIT(vm_reserved);
++
++/* Static structures, linked in to the list in early boot */
++static struct vm_reserved head = {
++      .list           = LIST_HEAD_INIT(head.list),
++      .start          = 0,
++      .end            = 0xffffffff
++};
++
++static struct vm_reserved tail = {
++      .list           = LIST_HEAD_INIT(tail.list),
++      .start          = 0,
++      .end            = 0xffffffff
++};
++
++void set_usable_vm(unsigned long start, unsigned long end)
++{
++      list_add(&head.list, &vm_reserved);
++      list_add(&tail.list, &head.list);
++      head.end = start;
++      tail.start = end;
++}
++
++int reserve_vm(unsigned long start, unsigned long end, void *e)
++             
++{
++      struct vm_reserved *entry = e, *reserved, *prev;
++      struct list_head *ele;
++      int err;
++
++      down(&vm_reserved_sem);
++      list_for_each(ele, &vm_reserved){
++              reserved = list_entry(ele, struct vm_reserved, list);
++              if(reserved->start >= end) goto found;
++      }
++      panic("Reserved vm out of range");
++ found:
++      prev = list_entry(ele->prev, struct vm_reserved, list);
++      if(prev->end > start)
++              panic("Can't reserve vm");
++      if(entry == NULL)
++              entry = kmalloc(sizeof(*entry), GFP_KERNEL);
++      if(entry == NULL){
++              printk("reserve_vm : Failed to allocate entry\n");
++              err = -ENOMEM;
++              goto out;
++      }
++      *entry = ((struct vm_reserved) 
++              { .list         = LIST_HEAD_INIT(entry->list),
++                .start        = start,
++                .end          = end });
++      list_add(&entry->list, &prev->list);
++      err = 0;
++ out:
++      up(&vm_reserved_sem);
++	return(err);
++}
++
++unsigned long get_vm(unsigned long len)
++{
++      struct vm_reserved *this, *next;
++      struct list_head *ele;
++      unsigned long start;
++      int err;
++      
++      down(&vm_reserved_sem);
++      list_for_each(ele, &vm_reserved){
++              this = list_entry(ele, struct vm_reserved, list);
++              next = list_entry(ele->next, struct vm_reserved, list);
++              if((this->start < next->start) && 
++                 (this->end + len + PAGE_SIZE <= next->start))
++                      goto found;
++      }
++      up(&vm_reserved_sem);
++      return(0);
++ found:
++      up(&vm_reserved_sem);
++      start = (unsigned long) UML_ROUND_UP(this->end) + PAGE_SIZE;
++      err = reserve_vm(start, start + len, NULL);
++      if(err) return(0);
++      return(start);
++}
++
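++/*
++ * Illustrative sketch, not part of the original patch: a caller that
++ * needs a fresh window of kernel virtual space could pair get_vm()
++ * with map_memory(), mirroring map_cb() above.  'example_map_window'
++ * and its arguments are hypothetical names used only for this example.
++ */
++static unsigned long example_map_window(unsigned long phys, unsigned long len)
++{
++	unsigned long addr = get_vm(len);
++
++	if(addr == 0)
++		return(0);
++	/* get_vm() already reserved [addr, addr + len), so mapping is safe */
++	map_memory(addr, phys, len, 1, 1, 0);
++	return(addr);
++}
++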
++int nregions(void)
++{
++      return(NREGIONS);
++}
++
++void setup_range(int fd, char *driver, unsigned long start, unsigned long pfn,
++               unsigned long len, int need_vm, struct mem_region *region, 
++               void *reserved)
++{
++      int i, cur;
++
++      do {
++              cur = min(len, (unsigned long) REGION_SIZE);
++              i = setup_one_range(fd, driver, start, pfn, cur, region);
++              region = regions[i];
++              if(need_vm && setup_region(region, reserved)){
++                      kfree(region);
++                      regions[i] = NULL;
++                      return;
++              }
++              start += cur;
++              if(pfn != -1) pfn += cur;
++              len -= cur;
++      } while(len > 0);
++}
++
++struct iomem {
++      char *name;
++      int fd;
++      unsigned long size;
++};
++
++/* iomem regions can only be added on the command line at the moment.  
++ * Locking will be needed when they can be added via mconsole.
++ */
++
++struct iomem iomem_regions[NREGIONS] = { [ 0 ... NREGIONS - 1 ] =
++                                       { .name        = NULL,
++                                         .fd          = -1,
++                                         .size        = 0 } };
++
++int num_iomem_regions = 0;
++
++void add_iomem(char *name, int fd, unsigned long size)
++{
++      if(num_iomem_regions == sizeof(iomem_regions)/sizeof(iomem_regions[0]))
++              return;
++      size = (size + PAGE_SIZE - 1) & PAGE_MASK;
++      iomem_regions[num_iomem_regions++] = 
++              ((struct iomem) { .name         = name,
++                                .fd           = fd,
++                                .size         = size } );
++}
++
++int setup_iomem(void)
++{
++      struct iomem *iomem;
++      int i;
++
++      for(i = 0; i < num_iomem_regions; i++){
++              iomem = &iomem_regions[i];
++              setup_range(iomem->fd, iomem->name, -1, -1, iomem->size, 1, 
++                          NULL, NULL);
++      }
++      return(0);
++}
++
++__initcall(setup_iomem);
++
++#define PFN_UP(x)     (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
++#define PFN_DOWN(x)   ((x) >> PAGE_SHIFT)
++
++/* Changed during early boot */
++static struct mem_region physmem_region;
++static struct vm_reserved physmem_reserved;
++
++void setup_physmem(unsigned long start, unsigned long reserve_end,
++                 unsigned long len)
++{
++      struct mem_region *region = &physmem_region;
++      struct vm_reserved *reserved = &physmem_reserved;
++      unsigned long cur, pfn = 0;
++      int do_free = 1, bootmap_size;
++
++      do {
++              cur = min(len, (unsigned long) REGION_SIZE);
++              if(region == NULL) 
++                      region = alloc_bootmem_low_pages(sizeof(*region));
++              if(reserved == NULL) 
++                      reserved = alloc_bootmem_low_pages(sizeof(*reserved));
++              if((region == NULL) || (reserved == NULL))
++                      panic("Couldn't allocate physmem region or vm "
++                            "reservation\n");
++              setup_range(-1, NULL, start, pfn, cur, 1, region, reserved);
++
++              if(do_free){
++                      unsigned long reserve = reserve_end - start;
++                      int pfn = PFN_UP(__pa(reserve_end));
++                      int delta = (len - reserve) >> PAGE_SHIFT;
++
++                      bootmap_size = init_bootmem(pfn, pfn + delta);
++                      free_bootmem(__pa(reserve_end) + bootmap_size,
++                                   cur - bootmap_size - reserve);
++                      do_free = 0;
++              }
++              start += cur;
++              pfn += cur >> PAGE_SHIFT;
++              len -= cur;
++              region = NULL;
++              reserved = NULL;
++      } while(len > 0);
++}
++
++struct mem_region *phys_region(unsigned long phys)
++{
++      unsigned int n = phys_region_index(phys);
++
++      if(regions[n] == NULL) 
++              panic("Physical address in uninitialized region");
++      return(regions[n]);
++}
++
++unsigned long phys_offset(unsigned long phys)
++{
++      return(phys_addr(phys));
++}
++
++struct page *phys_mem_map(unsigned long phys)
++{
++      return((struct page *) phys_region(phys)->mem_map);
++}
++
++struct page *pte_mem_map(pte_t pte)
++{
++      return(phys_mem_map(pte_val(pte)));
++}
++
++struct mem_region *page_region(struct page *page, int *index_out)
++{
++      int i;
++      struct mem_region *region;
++      struct page *map;
++
++      for(i = 0; i < NREGIONS; i++){
++              region = regions[i];
++              if(region == NULL) continue;
++              map = region->mem_map;
++              if((page >= map) && (page < &map[region->len >> PAGE_SHIFT])){
++                      if(index_out != NULL) *index_out = i;
++                      return(region);
++              }
++      }
++      panic("No region found for page");
++      return(NULL);
++}
++
++unsigned long page_to_pfn(struct page *page)
++{
++      struct mem_region *region = page_region(page, NULL);
++
++      return(region->start_pfn + (page - (struct page *) region->mem_map));
++}
++
++struct mem_region *pfn_to_region(unsigned long pfn, int *index_out)
++{
++      struct mem_region *region;
++      int i;
++
++      for(i = 0; i < NREGIONS; i++){
++              region = regions[i];
++              if(region == NULL)
++                      continue;
++
++              if((region->start_pfn <= pfn) &&
++                 (region->start_pfn + (region->len >> PAGE_SHIFT) > pfn)){
++                      if(index_out != NULL) 
++                              *index_out = i;
++                      return(region);
++              }
++      }
++      return(NULL);
++}
++
++struct page *pfn_to_page(unsigned long pfn)
++{
++      struct mem_region *region = pfn_to_region(pfn, NULL);
++      struct page *mem_map = (struct page *) region->mem_map;
++
++      return(&mem_map[pfn - region->start_pfn]);
++}
++
++unsigned long phys_to_pfn(unsigned long p)
++{
++      struct mem_region *region = regions[phys_region_index(p)];
++
++      return(region->start_pfn + (phys_addr(p) >> PAGE_SHIFT));
++}
++
++unsigned long pfn_to_phys(unsigned long pfn)
++{
++      int n;
++      struct mem_region *region = pfn_to_region(pfn, &n);
++
++      return(mk_phys((pfn - region->start_pfn) << PAGE_SHIFT, n));
++}
++
++struct page *page_mem_map(struct page *page)
++{
++      return((struct page *) page_region(page, NULL)->mem_map);
++}
++
++extern unsigned long region_pa(void *virt)
++{
++      struct mem_region *region;
++      unsigned long addr = (unsigned long) virt;
++      int i;
++
++      for(i = 0; i < NREGIONS; i++){
++              region = regions[i];
++              if(region == NULL) continue;
++              if((region->start <= addr) && 
++                 (addr <= region->start + region->len))
++                      return(mk_phys(addr - region->start, i));
++      }
++      panic("region_pa : no region for virtual address");
++      return(0);
++}
++
++extern void *region_va(unsigned long phys)
++{
++      return((void *) (phys_region(phys)->start + phys_addr(phys)));
++}
++
++unsigned long page_to_phys(struct page *page)
++{
++      int n;
++      struct mem_region *region = page_region(page, &n);
++      struct page *map = region->mem_map;
++      return(mk_phys((page - map) << PAGE_SHIFT, n));
++}
++
++struct page *phys_to_page(unsigned long phys)
++{
++      struct page *mem_map;
++
++      mem_map = phys_mem_map(phys);
++      return(mem_map + (phys_offset(phys) >> PAGE_SHIFT));
++}
++
++static int setup_mem_maps(void)
++{
++      struct mem_region *region;
++      int i;
++
++      for(i = 0; i < NREGIONS; i++){
++              region = regions[i];
++              if((region != NULL) && (region->fd > 0)) init_maps(region);
++      }
++      return(0);
++}
++
++__initcall(setup_mem_maps);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
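
The mem.c code above keeps an array of mem_region pointers and translates between physical addresses, page frame numbers, and struct page pointers by a linear search over the regions plus index arithmetic into each region's private mem_map. The user-space sketch below shows the same pfn <-> page round trip that page_to_pfn() and pfn_to_page() perform; the reduced struct page and mem_region layouts here are assumptions for illustration only (the real definitions live elsewhere in this patch).

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct page { int dummy; };             /* stand-in for the kernel's struct page */

struct mem_region {                     /* reduced mock of the patch's mem_region */
        unsigned long start_pfn;
        unsigned long npages;
        struct page *mem_map;           /* one struct page per frame in the region */
};

#define NREGIONS 4
static struct mem_region *regions[NREGIONS];

/* like page_region(): find the region whose mem_map contains this page */
static struct mem_region *page_region(struct page *page)
{
        for (int i = 0; i < NREGIONS; i++) {
                struct mem_region *r = regions[i];
                if (r && page >= r->mem_map && page < r->mem_map + r->npages)
                        return r;
        }
        return NULL;
}

static unsigned long page_to_pfn(struct page *page)
{
        struct mem_region *r = page_region(page);
        return r->start_pfn + (unsigned long) (page - r->mem_map);
}

static struct page *pfn_to_page(unsigned long pfn)
{
        for (int i = 0; i < NREGIONS; i++) {
                struct mem_region *r = regions[i];
                if (r && pfn >= r->start_pfn && pfn < r->start_pfn + r->npages)
                        return &r->mem_map[pfn - r->start_pfn];
        }
        return NULL;
}

int main(void)
{
        static struct page map[32];
        static struct mem_region physmem = {
                .start_pfn = 0x100, .npages = 32, .mem_map = map,
        };
        regions[0] = &physmem;

        struct page *p = pfn_to_page(0x110);
        assert(p == &map[0x10]);
        assert(page_to_pfn(p) == 0x110);
        printf("pfn 0x110 <-> page %p round-trips\n", (void *) p);
        return 0;
}
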
+diff -Naur -X ../exclude-files orig/arch/um/kernel/mem_user.c um/arch/um/kernel/mem_user.c
+--- orig/arch/um/kernel/mem_user.c     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/mem_user.c       Thu Mar  6 16:05:21 2003
+@@ -0,0 +1,232 @@
++/*
++ * arch/um/kernel/mem_user.c
++ *
++ * BRIEF MODULE DESCRIPTION
++ * user side memory routines for supporting IO memory inside user mode linux
++ *
++ * Copyright (C) 2001 RidgeRun, Inc.
++ * Author: RidgeRun, Inc.
++ *         Greg Lonnon glonnon@ridgerun.com or info@ridgerun.com
++ *
++ *  This program is free software; you can redistribute  it and/or modify it
++ *  under  the terms of  the GNU General  Public License as published by the
++ *  Free Software Foundation;  either version 2 of the  License, or (at your
++ *  option) any later version.
++ *
++ *  THIS  SOFTWARE  IS PROVIDED   ``AS  IS'' AND   ANY  EXPRESS OR IMPLIED
++ *  WARRANTIES,   INCLUDING, BUT NOT  LIMITED  TO, THE IMPLIED WARRANTIES OF
++ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN
++ *  NO  EVENT  SHALL   THE AUTHOR  BE    LIABLE FOR ANY   DIRECT, INDIRECT,
++ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
++ *  NOT LIMITED   TO, PROCUREMENT OF  SUBSTITUTE GOODS  OR SERVICES; LOSS OF
++ *  USE, DATA,  OR PROFITS; OR  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
++ *  ANY THEORY OF LIABILITY, WHETHER IN  CONTRACT, STRICT LIABILITY, OR TORT
++ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
++ *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ *  You should have received a copy of the  GNU General Public License along
++ *  with this program; if not, write  to the Free Software Foundation, Inc.,
++ *  675 Mass Ave, Cambridge, MA 02139, USA.
++ */
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <stddef.h>
++#include <stdarg.h>
++#include <unistd.h>
++#include <fcntl.h>
++#include <errno.h>
++#include <string.h>
++#include <sys/stat.h>
++#include <sys/types.h>
++#include <sys/mman.h>
++#include "kern_util.h"
++#include "user.h"
++#include "user_util.h"
++#include "mem_user.h"
++#include "init.h"
++#include "os.h"
++#include "tempfile.h"
++
++extern struct mem_region physmem_region;
++
++#define TEMPNAME_TEMPLATE "vm_file-XXXXXX"
++
++int create_mem_file(unsigned long len)
++{
++      int fd;
++      char zero;
++
++      fd = make_tempfile(TEMPNAME_TEMPLATE, NULL, 1);
++      if (fchmod(fd, 0777) < 0){
++              perror("fchmod");
++              exit(1);
++      }
++      if(os_seek_file(fd, len) < 0){
++              perror("lseek");
++              exit(1);
++      }
++      zero = 0;
++      if(write(fd, &zero, 1) != 1){
++              perror("write");
++              exit(1);
++      }
++      if(fcntl(fd, F_SETFD, 1) != 0)
++              perror("Setting FD_CLOEXEC failed");
++      return(fd);
++}
++
++int setup_region(struct mem_region *region, void *entry)
++{
++      void *loc, *start;
++      char *driver;
++      int err, offset;
++
++      if(region->start != -1){
++              err = reserve_vm(region->start, 
++                               region->start + region->len, entry);
++              if(err){
++                      printk("setup_region : failed to reserve "
++                             "0x%lx - 0x%lx for driver '%s'\n",
++                             region->start,
++                             region->start + region->len,
++                             region->driver);
++                      return(-1);
++              }
++      }
++      else region->start = get_vm(region->len);
++      if(region->start == 0){
++              if(region->driver == NULL) driver = "physmem";
++              else driver = region->driver;
++              printk("setup_region : failed to find vm for "
++                     "driver '%s' (length %lu)\n", driver, region->len);
++              return(-1);
++      }
++      if(region->start == uml_physmem){
++              start = (void *) uml_reserved;
++              offset = uml_reserved - uml_physmem;
++      }
++      else {
++              start = (void *) region->start;
++              offset = 0;
++      }
++
++      loc = mmap(start, region->len - offset, PROT_READ | PROT_WRITE, 
++                 MAP_SHARED | MAP_FIXED, region->fd, offset);
++      if(loc != start){
++              perror("Mapping memory");
++              exit(1);
++      }
++      return(0);
++}
++
++static int __init parse_iomem(char *str, int *add)
++{
++      struct stat64 buf;
++      char *file, *driver;
++      int fd;
++
++      driver = str;
++      file = strchr(str,',');
++      if(file == NULL){
++              printf("parse_iomem : failed to parse iomem\n");
++              return(1);
++      }
++      *file = '\0';
++      file++;
++      fd = os_open_file(file, of_rdwr(OPENFLAGS()), 0);
++      if(fd < 0){
++              printf("parse_iomem - Couldn't open io file, errno = %d\n", 
++                     errno);
++              return(1);
++      }
++      if(fstat64(fd, &buf) < 0) {
++              printf("parse_iomem - cannot fstat file, errno = %d\n", errno);
++              return(1);
++      }
++      add_iomem(driver, fd, buf.st_size);
++      return(0);
++}
++
++__uml_setup("iomem=", parse_iomem,
++"iomem=<name>,<file>\n"
++"    Configure <file> as an IO memory region named <name>.\n\n"
++);
++
++#ifdef notdef
++int logging = 0;
++int logging_fd = -1;
++
++int logging_line = 0;
++char logging_buf[256];
++
++void log(char *fmt, ...)
++{
++      va_list ap;
++      struct timeval tv;
++      struct openflags flags;
++
++      if(logging == 0) return;
++      if(logging_fd < 0){
++              flags = of_create(of_trunc(of_rdwr(OPENFLAGS())));
++              logging_fd = os_open_file("log", flags, 0644);
++      }
++      gettimeofday(&tv, NULL);
++      sprintf(logging_buf, "%d\t %ld.%06ld  ", logging_line++, tv.tv_sec, 
++              tv.tv_usec);
++      va_start(ap, fmt);
++      vsprintf(&logging_buf[strlen(logging_buf)], fmt, ap);
++      va_end(ap);
++      write(logging_fd, logging_buf, strlen(logging_buf));
++}
++#endif
++
++int map_memory(unsigned long virt, unsigned long phys, unsigned long len, 
++             int r, int w, int x)
++{
++      struct mem_region *region = phys_region(phys);
++
++      return(os_map_memory((void *) virt, region->fd, phys_offset(phys), len,
++                           r, w, x));
++}
++
++int protect_memory(unsigned long addr, unsigned long len, int r, int w, int x,
++                 int must_succeed)
++{
++      if(os_protect_memory((void *) addr, len, r, w, x) < 0){
++                if(must_succeed)
++                        panic("protect failed, errno = %d", errno);
++                else return(-errno);
++      }
++      return(0);
++}
++
++unsigned long find_iomem(char *driver, unsigned long *len_out)
++{
++      struct mem_region *region;
++      int i, n;
++
++      n = nregions();
++      for(i = 0; i < n; i++){
++              region = regions[i];
++              if(region == NULL) continue;
++              if((region->driver != NULL) &&
++                 !strcmp(region->driver, driver)){
++                      *len_out = region->len;
++                      return(region->start);
++              }
++      }
++      *len_out = 0;
++      return 0;
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
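
create_mem_file() and setup_region() above implement UML's core trick: "physical" memory is just a host temporary file mapped MAP_SHARED into the process. A minimal stand-alone version of the same sequence is sketched below; it uses mkstemp() and lets mmap() pick the address, instead of the patch's make_tempfile() helper and MAP_FIXED placement, so those two substitutions are assumptions of this sketch.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        const size_t len = 16 * 1024 * 1024;    /* 16 MB of "physical memory" */
        char name[] = "/tmp/vm_file-XXXXXX";

        int fd = mkstemp(name);
        if (fd < 0) { perror("mkstemp"); return 1; }
        unlink(name);                           /* keep the space, drop the name */

        /* size the file the same way create_mem_file() does:
         * seek to the end and write a single zero byte */
        if (lseek(fd, len - 1, SEEK_SET) < 0) { perror("lseek"); return 1; }
        char zero = 0;
        if (write(fd, &zero, 1) != 1) { perror("write"); return 1; }
        fcntl(fd, F_SETFD, FD_CLOEXEC);

        /* map it shared, like setup_region(), so later mappings of the same
         * file (for example in child processes) see the same contents */
        void *mem = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (mem == MAP_FAILED) { perror("mmap"); return 1; }

        strcpy(mem, "hello from the backing file");
        printf("%s\n", (char *) mem);
        return 0;
}
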
+diff -Naur -X ../exclude-files orig/arch/um/kernel/mprot.h um/arch/um/kernel/mprot.h
+--- orig/arch/um/kernel/mprot.h        Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/mprot.h  Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,6 @@
++#ifndef __MPROT_H__
++#define __MPROT_H__
++
++extern void no_access(unsigned long addr, unsigned int len);
++
++#endif
+diff -Naur -X ../exclude-files orig/arch/um/kernel/process.c um/arch/um/kernel/process.c
+--- orig/arch/um/kernel/process.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/process.c        Wed Apr 23 20:36:15 2003
+@@ -0,0 +1,286 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <unistd.h>
++#include <signal.h>
++#include <sched.h>
++#include <errno.h>
++#include <stdarg.h>
++#include <fcntl.h>
++#include <stdlib.h>
++#include <setjmp.h>
++#include <sys/time.h>
++#include <sys/ptrace.h>
++#include <sys/ioctl.h>
++#include <sys/wait.h>
++#include <sys/mman.h>
++#include <asm/ptrace.h>
++#include <asm/sigcontext.h>
++#include <asm/unistd.h>
++#include <asm/page.h>
++#include "user_util.h"
++#include "kern_util.h"
++#include "user.h"
++#include "process.h"
++#include "signal_kern.h"
++#include "signal_user.h"
++#include "sysdep/ptrace.h"
++#include "sysdep/sigcontext.h"
++#include "irq_user.h"
++#include "ptrace_user.h"
++#include "time_user.h"
++#include "init.h"
++#include "os.h"
++#include "uml-config.h"
++#include "choose-mode.h"
++#include "mode.h"
++#ifdef UML_CONFIG_MODE_SKAS
++#include "skas.h"
++#include "skas_ptrace.h"
++#endif
++
++void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int))
++{
++      int flags = 0, pages;
++
++      if(sig_stack != NULL){
++              pages = (1 << UML_CONFIG_KERNEL_STACK_ORDER) - 2;
++              set_sigstack(sig_stack, pages * page_size());
++              flags = SA_ONSTACK;
++      }
++      if(usr1_handler) set_handler(SIGUSR1, usr1_handler, flags, -1);
++}
++
++void init_new_thread_signals(int altstack)
++{
++      int flags = altstack ? SA_ONSTACK : 0;
++
++      set_handler(SIGSEGV, (__sighandler_t) sig_handler, flags,
++                  SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1);
++      set_handler(SIGTRAP, (__sighandler_t) sig_handler, flags, 
++                  SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1);
++      set_handler(SIGFPE, (__sighandler_t) sig_handler, flags, 
++                  SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1);
++      set_handler(SIGILL, (__sighandler_t) sig_handler, flags, 
++                  SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1);
++      set_handler(SIGBUS, (__sighandler_t) sig_handler, flags, 
++                  SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1);
++      set_handler(SIGWINCH, (__sighandler_t) sig_handler, flags, 
++                  SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1);
++      set_handler(SIGUSR2, (__sighandler_t) sig_handler, 
++                  SA_NOMASK | flags, -1);
++      signal(SIGHUP, SIG_IGN);
++
++      init_irq_signals(altstack);
++}
++
++struct tramp {
++      int (*tramp)(void *);
++      void *tramp_data;
++      unsigned long temp_stack;
++      int flags;
++      int pid;
++};
++
++/* See above for why sigkill is here */
++
++int sigkill = SIGKILL;
++
++int outer_tramp(void *arg)
++{
++      struct tramp *t;
++      int sig = sigkill;
++
++      t = arg;
++      t->pid = clone(t->tramp, (void *) t->temp_stack + page_size()/2,
++                     t->flags, t->tramp_data);
++      if(t->pid > 0) wait_for_stop(t->pid, SIGSTOP, PTRACE_CONT, NULL);
++      kill(os_getpid(), sig);
++      _exit(0);
++}
++
++int start_fork_tramp(void *thread_arg, unsigned long temp_stack, 
++                   int clone_flags, int (*tramp)(void *))
++{
++      struct tramp arg;
++      unsigned long sp;
++      int new_pid, status, err;
++
++      /* The trampoline will run on the temporary stack */
++      sp = stack_sp(temp_stack);
++
++      clone_flags |= CLONE_FILES | SIGCHLD;
++
++      arg.tramp = tramp;
++      arg.tramp_data = thread_arg;
++      arg.temp_stack = temp_stack;
++      arg.flags = clone_flags;
++
++      /* Start the process and wait for it to kill itself */
++      new_pid = clone(outer_tramp, (void *) sp, clone_flags, &arg);
++      if(new_pid < 0) return(-errno);
++      while(((err = waitpid(new_pid, &status, 0)) < 0) && (errno == EINTR)) ;
++      if(err < 0) panic("Waiting for outer trampoline failed - errno = %d", 
++                        errno);
++      if(!WIFSIGNALED(status) || (WTERMSIG(status) != SIGKILL))
++              panic("outer trampoline didn't exit with SIGKILL");
++
++      return(arg.pid);
++}
++
++void suspend_new_thread(int fd)
++{
++      char c;
++
++      os_stop_process(os_getpid());
++
++      if(read(fd, &c, sizeof(c)) != sizeof(c))
++              panic("read failed in suspend_new_thread");
++}
++
++static int ptrace_child(void *arg)
++{
++      int pid = os_getpid();
++
++      if(ptrace(PTRACE_TRACEME, 0, 0, 0) < 0){
++              perror("ptrace");
++              os_kill_process(pid, 0);
++      }
++      os_stop_process(pid);
++      _exit(os_getpid() == pid);
++}
++
++static int start_ptraced_child(void **stack_out)
++{
++      void *stack;
++      unsigned long sp;
++      int pid, n, status;
++      
++      stack = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC,
++                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
++      if(stack == MAP_FAILED)
++              panic("check_ptrace : mmap failed, errno = %d", errno);
++      sp = (unsigned long) stack + PAGE_SIZE - sizeof(void *);
++      pid = clone(ptrace_child, (void *) sp, SIGCHLD, NULL);
++      if(pid < 0)
++              panic("check_ptrace : clone failed, errno = %d", errno);
++      n = waitpid(pid, &status, WUNTRACED);
++      if(n < 0)
++              panic("check_ptrace : wait failed, errno = %d", errno);
++      if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP))
++              panic("check_ptrace : expected SIGSTOP, got status = %d",
++                    status);
++
++      *stack_out = stack;
++      return(pid);
++}
++
++static void stop_ptraced_child(int pid, void *stack, int exitcode)
++{
++      int status, n;
++
++      if(ptrace(PTRACE_CONT, pid, 0, 0) < 0)
++              panic("check_ptrace : ptrace failed, errno = %d", errno);
++      n = waitpid(pid, &status, 0);
++      if(!WIFEXITED(status) || (WEXITSTATUS(status) != exitcode))
++              panic("check_ptrace : child exited with status 0x%x", status);
++
++      if(munmap(stack, PAGE_SIZE) < 0)
++              panic("check_ptrace : munmap failed, errno = %d", errno);
++}
++
++void __init check_ptrace(void)
++{
++      void *stack;
++      int pid, syscall, n, status;
++
++      printk("Checking that ptrace can change system call numbers...");
++      pid = start_ptraced_child(&stack);
++
++      while(1){
++              if(ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
++                      panic("check_ptrace : ptrace failed, errno = %d", 
++                            errno);
++              n = waitpid(pid, &status, WUNTRACED);
++              if(n < 0)
++                      panic("check_ptrace : wait failed, errno = %d", errno);
++              if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP))
++                      panic("check_ptrace : expected SIGTRAP, "
++                            "got status = %d", status);
++              
++              syscall = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET,
++                               0);
++              if(syscall == __NR_getpid){
++                      n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET,
++                                 __NR_getppid);
++                      if(n < 0)
++                              panic("check_ptrace : failed to modify system "
++                                    "call, errno = %d", errno);
++                      break;
++              }
++      }
++      stop_ptraced_child(pid, stack, 0);
++      printk("OK\n");
++}
++
++int run_kernel_thread(int (*fn)(void *), void *arg, void **jmp_ptr)
++{
++      jmp_buf buf;
++      int n;
++
++      *jmp_ptr = &buf;
++      n = setjmp(buf);
++      if(n != 0)
++              return(n);
++      (*fn)(arg);
++      return(0);
++}
++
++int can_do_skas(void)
++{
++#ifdef UML_CONFIG_MODE_SKAS
++      struct ptrace_faultinfo fi;
++      void *stack;
++      int pid, n, ret = 1;
++
++      printf("Checking for the skas3 patch in the host...");
++      pid = start_ptraced_child(&stack);
++
++      n = ptrace(PTRACE_FAULTINFO, pid, 0, &fi);
++      if(n < 0){
++              if(errno == EIO)
++                      printf("not found\n");
++              else printf("No (unexpected errno - %d)\n", errno);
++              ret = 0;
++      }
++      else printf("found\n");
++
++      init_registers(pid);
++      stop_ptraced_child(pid, stack, 1);
++
++      printf("Checking for /proc/mm...");
++      if(access("/proc/mm", W_OK)){
++              printf("not found\n");
++              ret = 0;
++      }
++      else printf("found\n");
++
++      return(ret);
++#else
++      return(0);
++#endif
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
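
check_ptrace() above verifies that the host lets a tracer rewrite a syscall number at a PTRACE_SYSCALL stop, which is what UML's tracing-thread mode relies on to intercept a process's syscalls. The stand-alone sketch below repeats that probe from plain user space. It assumes an x86_64 host and uses offsetof(struct user, regs.orig_rax) where the patch uses its own PT_SYSCALL_NR_OFFSET macro; error handling is omitted for brevity.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/syscall.h>
#include <sys/user.h>
#include <sys/wait.h>

int main(void)
{
        pid_t pid = fork();
        if (pid == 0) {
                ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                raise(SIGSTOP);         /* wait for the parent to start tracing */
                long a = syscall(SYS_getpid);
                long b = syscall(SYS_getppid);
                _exit(a == b);          /* true only if getpid was rewritten */
        }

        int status;
        long off = offsetof(struct user, regs.orig_rax);   /* x86_64 only */

        waitpid(pid, &status, 0);       /* child stopped itself with SIGSTOP */
        for (;;) {
                ptrace(PTRACE_SYSCALL, pid, NULL, NULL);   /* run to next syscall stop */
                waitpid(pid, &status, 0);
                if (WIFEXITED(status))
                        break;
                long nr = ptrace(PTRACE_PEEKUSER, pid, off, NULL);
                if (nr == SYS_getpid)   /* same rewrite that check_ptrace() performs */
                        ptrace(PTRACE_POKEUSER, pid, off, SYS_getppid);
        }
        printf("syscall rewriting %s\n", WEXITSTATUS(status) ? "works" : "failed");
        return 0;
}
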
+diff -Naur -X ../exclude-files orig/arch/um/kernel/process_kern.c um/arch/um/kernel/process_kern.c
+--- orig/arch/um/kernel/process_kern.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/process_kern.c   Wed Apr 16 16:02:09 2003
+@@ -0,0 +1,391 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/config.h"
++#include "linux/kernel.h"
++#include "linux/sched.h"
++#include "linux/interrupt.h"
++#include "linux/mm.h"
++#include "linux/slab.h"
++#include "linux/utsname.h"
++#include "linux/fs.h"
++#include "linux/utime.h"
++#include "linux/smp_lock.h"
++#include "linux/module.h"
++#include "linux/init.h"
++#include "linux/capability.h"
++#include "asm/unistd.h"
++#include "asm/mman.h"
++#include "asm/segment.h"
++#include "asm/stat.h"
++#include "asm/pgtable.h"
++#include "asm/processor.h"
++#include "asm/pgalloc.h"
++#include "asm/spinlock.h"
++#include "asm/uaccess.h"
++#include "asm/user.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "kern.h"
++#include "signal_kern.h"
++#include "signal_user.h"
++#include "init.h"
++#include "irq_user.h"
++#include "mem_user.h"
++#include "time_user.h"
++#include "tlb.h"
++#include "frame_kern.h"
++#include "sigcontext.h"
++#include "2_5compat.h"
++#include "os.h"
++#include "mode.h"
++#include "mode_kern.h"
++#include "choose-mode.h"
++
++/* This is a per-cpu array.  A processor only modifies its entry and it only
++ * cares about its entry, so it's OK if another processor is modifying its
++ * entry.
++ */
++struct cpu_task cpu_tasks[NR_CPUS] = { [0 ... NR_CPUS - 1] = { -1, NULL } };
++
++struct task_struct *get_task(int pid, int require)
++{
++        struct task_struct *ret;
++
++        read_lock(&tasklist_lock);
++      ret = find_task_by_pid(pid);
++        read_unlock(&tasklist_lock);
++
++        if(require && (ret == NULL)) panic("get_task couldn't find a task\n");
++        return(ret);
++}
++
++int external_pid(void *t)
++{
++      struct task_struct *task = t ? t : current;
++
++      return(CHOOSE_MODE_PROC(external_pid_tt, external_pid_skas, task));
++}
++
++int pid_to_processor_id(int pid)
++{
++      int i;
++
++      for(i = 0; i < smp_num_cpus; i++){
++              if(cpu_tasks[i].pid == pid) return(i);
++      }
++      return(-1);
++}
++
++void free_stack(unsigned long stack, int order)
++{
++      free_pages(stack, order);
++}
++
++unsigned long alloc_stack(int order, int atomic)
++{
++      unsigned long page;
++      int flags = GFP_KERNEL;
++
++      if(atomic) flags |= GFP_ATOMIC;
++      if((page = __get_free_pages(flags, order)) == 0)
++              return(0);
++      stack_protections(page);
++      return(page);
++}
++
++int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
++{
++      int pid;
++
++      current->thread.request.u.thread.proc = fn;
++      current->thread.request.u.thread.arg = arg;
++      pid = do_fork(CLONE_VM | flags, 0, NULL, 0);
++      if(pid < 0) panic("do_fork failed in kernel_thread");
++      return(pid);
++}
++
++void switch_mm(struct mm_struct *prev, struct mm_struct *next, 
++             struct task_struct *tsk, unsigned cpu)
++{
++      if (prev != next) 
++              clear_bit(cpu, &prev->cpu_vm_mask);
++      set_bit(cpu, &next->cpu_vm_mask);
++}
++
++void set_current(void *t)
++{
++      struct task_struct *task = t;
++
++      cpu_tasks[task->processor] = ((struct cpu_task) 
++              { external_pid(task), task });
++}
++
++void *_switch_to(void *prev, void *next)
++{
++      return(CHOOSE_MODE(_switch_to_tt(prev, next), 
++                         _switch_to_skas(prev, next)));
++}
++
++void interrupt_end(void)
++{
++      if(current->need_resched) schedule();
++      if(current->sigpending != 0) do_signal(0);
++}
++
++void release_thread(struct task_struct *task)
++{
++      CHOOSE_MODE(release_thread_tt(task), release_thread_skas(task));
++}
++
++void exit_thread(void)
++{
++      CHOOSE_MODE(exit_thread_tt(), exit_thread_skas());
++      unprotect_stack((unsigned long) current);
++}
++
++void *get_current(void)
++{
++      return(current);
++}
++
++int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
++              unsigned long stack_top, struct task_struct * p, 
++              struct pt_regs *regs)
++{
++      p->thread = (struct thread_struct) INIT_THREAD;
++      p->thread.kernel_stack = (unsigned long) p + 2 * PAGE_SIZE;
++
++      return(CHOOSE_MODE_PROC(copy_thread_tt, copy_thread_skas, nr, 
++                              clone_flags, sp, stack_top, p, regs));
++}
++
++void initial_thread_cb(void (*proc)(void *), void *arg)
++{
++      int save_kmalloc_ok = kmalloc_ok;
++
++      kmalloc_ok = 0;
++      CHOOSE_MODE_PROC(initial_thread_cb_tt, initial_thread_cb_skas, proc, 
++                       arg);
++      kmalloc_ok = save_kmalloc_ok;
++}
++
++unsigned long stack_sp(unsigned long page)
++{
++      return(page + PAGE_SIZE - sizeof(void *));
++}
++
++int current_pid(void)
++{
++      return(current->pid);
++}
++
++void cpu_idle(void)
++{
++      CHOOSE_MODE(init_idle_tt(), init_idle_skas());
++
++      atomic_inc(&init_mm.mm_count);
++      current->mm = &init_mm;
++      current->active_mm = &init_mm;
++
++      while(1){
++              /* endless idle loop with no priority at all */
++              SET_PRI(current);
++
++              /*
++               * although we are an idle CPU, we do not want to
++               * get into the scheduler unnecessarily.
++               */
++              if (current->need_resched) {
++                      schedule();
++                      check_pgt_cache();
++              }
++              idle_sleep(10);
++      }
++}
++
++int page_size(void)
++{
++      return(PAGE_SIZE);
++}
++
++int page_mask(void)
++{
++      return(PAGE_MASK);
++}
++
++void *um_virt_to_phys(struct task_struct *task, unsigned long addr, 
++                    pte_t *pte_out)
++{
++      pgd_t *pgd;
++      pmd_t *pmd;
++      pte_t *pte;
++
++      if(task->mm == NULL) 
++              return(ERR_PTR(-EINVAL));
++      pgd = pgd_offset(task->mm, addr);
++      pmd = pmd_offset(pgd, addr);
++      if(!pmd_present(*pmd)) 
++              return(ERR_PTR(-EINVAL));
++      pte = pte_offset(pmd, addr);
++      if(!pte_present(*pte)) 
++              return(ERR_PTR(-EINVAL));
++      if(pte_out != NULL)
++              *pte_out = *pte;
++      return((void *) (pte_val(*pte) & PAGE_MASK) + (addr & ~PAGE_MASK));
++}
++
++char *current_cmd(void)
++{
++#if defined(CONFIG_SMP) || defined(CONFIG_HIGHMEM)
++      return("(Unknown)");
++#else
++      void *addr = um_virt_to_phys(current, current->mm->arg_start, NULL);
++      return IS_ERR(addr) ? "(Unknown)": __va((unsigned long) addr);
++#endif
++}
++
++void force_sigbus(void)
++{
++      printk(KERN_ERR "Killing pid %d because of a lack of memory\n", 
++             current->pid);
++      lock_kernel();
++      sigaddset(&current->pending.signal, SIGBUS);
++      recalc_sigpending(current);
++      current->flags |= PF_SIGNALED;
++      do_exit(SIGBUS | 0x80);
++}
++
++void dump_thread(struct pt_regs *regs, struct user *u)
++{
++}
++
++void enable_hlt(void)
++{
++      panic("enable_hlt");
++}
++
++void disable_hlt(void)
++{
++      panic("disable_hlt");
++}
++
++extern int signal_frame_size;
++
++void *um_kmalloc(int size)
++{
++      return(kmalloc(size, GFP_KERNEL));
++}
++
++void *um_kmalloc_atomic(int size)
++{
++      return(kmalloc(size, GFP_ATOMIC));
++}
++
++unsigned long get_fault_addr(void)
++{
++      return((unsigned long) current->thread.fault_addr);
++}
++
++EXPORT_SYMBOL(get_fault_addr);
++
++void not_implemented(void)
++{
++      printk(KERN_DEBUG "Something isn't implemented in here\n");
++}
++
++EXPORT_SYMBOL(not_implemented);
++
++int user_context(unsigned long sp)
++{
++      unsigned long stack;
++
++      stack = sp & (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER);
++      stack += 2 * PAGE_SIZE;
++      return(stack != current->thread.kernel_stack);
++}
++
++extern void remove_umid_dir(void);
++
++__uml_exitcall(remove_umid_dir);
++
++extern exitcall_t __uml_exitcall_begin, __uml_exitcall_end;
++
++void do_uml_exitcalls(void)
++{
++      exitcall_t *call;
++
++      call = &__uml_exitcall_end;
++      while (--call >= &__uml_exitcall_begin)
++              (*call)();
++}
++
++char *uml_strdup(char *string)
++{
++      char *new;
++
++      new = kmalloc(strlen(string) + 1, GFP_KERNEL);
++      if(new == NULL) return(NULL);
++      strcpy(new, string);
++      return(new);
++}
++
++void *get_init_task(void)
++{
++      return(&init_task_union.task);
++}
++
++int copy_to_user_proc(void *to, void *from, int size)
++{
++      return(copy_to_user(to, from, size));
++}
++
++int copy_from_user_proc(void *to, void *from, int size)
++{
++      return(copy_from_user(to, from, size));
++}
++
++int clear_user_proc(void *buf, int size)
++{
++      return(clear_user(buf, size));
++}
++
++int strlen_user_proc(char *str)
++{
++      return(strlen_user(str));
++}
++
++int smp_sigio_handler(void)
++{
++#ifdef CONFIG_SMP
++      int cpu = current->processor;
++
++      IPI_handler(cpu);
++      if(cpu != 0)
++              return(1);
++#endif
++      return(0);
++}
++
++int um_in_interrupt(void)
++{
++      return(in_interrupt());
++}
++
++int cpu(void)
++{
++        return(current->processor);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
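
do_uml_exitcalls() above walks a table of exitcall_t pointers backwards, so that the call registered first (via __uml_exitcall, which places the pointer in a linker section between __uml_exitcall_begin and __uml_exitcall_end) runs last. The sketch below reproduces that reverse walk with an ordinary array standing in for the linker section; of the registered functions, only remove_umid_dir appears in the patch, and close_devices is a made-up example.

#include <stddef.h>
#include <stdio.h>

typedef void (*exitcall_t)(void);

/* stand-in exitcalls, listed in registration order */
static void remove_umid_dir(void) { puts("removing umid directory"); }
static void close_devices(void)   { puts("closing devices"); }

static exitcall_t exitcalls[] = { remove_umid_dir, close_devices };

static void do_exitcalls(void)
{
        /* reverse order, like do_uml_exitcalls(): the cleanup registered
         * last runs first */
        for (size_t i = sizeof(exitcalls) / sizeof(exitcalls[0]); i-- > 0; )
                exitcalls[i]();
}

int main(void)
{
        do_exitcalls();     /* prints "closing devices", then "removing umid directory" */
        return 0;
}
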
+diff -Naur -X ../exclude-files orig/arch/um/kernel/ptrace.c um/arch/um/kernel/ptrace.c
+--- orig/arch/um/kernel/ptrace.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/ptrace.c Sat Dec 28 22:50:21 2002
+@@ -0,0 +1,325 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/sched.h"
++#include "linux/mm.h"
++#include "linux/errno.h"
++#include "linux/smp_lock.h"
++#ifdef CONFIG_PROC_MM
++#include "linux/proc_mm.h"
++#endif
++#include "asm/ptrace.h"
++#include "asm/uaccess.h"
++#include "kern_util.h"
++#include "ptrace_user.h"
++
++/*
++ * Called by kernel/ptrace.c when detaching..
++ */
++void ptrace_disable(struct task_struct *child)
++{ 
++}
++
++extern long do_mmap2(struct task_struct *task, unsigned long addr, 
++                   unsigned long len, unsigned long prot, 
++                   unsigned long flags, unsigned long fd,
++                   unsigned long pgoff);
++
++int sys_ptrace(long request, long pid, long addr, long data)
++{
++      struct task_struct *child;
++      int i, ret;
++
++      lock_kernel();
++      ret = -EPERM;
++      if (request == PTRACE_TRACEME) {
++              /* are we already being traced? */
++              if (current->ptrace & PT_PTRACED)
++                      goto out;
++              /* set the ptrace bit in the process flags. */
++              current->ptrace |= PT_PTRACED;
++              ret = 0;
++              goto out;
++      }
++      ret = -ESRCH;
++      read_lock(&tasklist_lock);
++      child = find_task_by_pid(pid);
++      if (child)
++              get_task_struct(child);
++      read_unlock(&tasklist_lock);
++      if (!child)
++              goto out;
++
++      ret = -EPERM;
++      if (pid == 1)           /* you may not mess with init */
++              goto out_tsk;
++
++      if (request == PTRACE_ATTACH) {
++              ret = ptrace_attach(child);
++              goto out_tsk;
++      }
++
++      ret = ptrace_check_attach(child, request == PTRACE_KILL);
++      if (ret < 0)
++              goto out_tsk;
++
++      switch (request) {
++              /* when I and D space are separate, these will need to be fixed. */
++      case PTRACE_PEEKTEXT: /* read word at location addr. */ 
++      case PTRACE_PEEKDATA: {
++              unsigned long tmp;
++              int copied;
++
++              ret = -EIO;
++              copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
++              if (copied != sizeof(tmp))
++                      break;
++              ret = put_user(tmp,(unsigned long *) data);
++              break;
++      }
++
++      /* read the word at location addr in the USER area. */
++      case PTRACE_PEEKUSR: {
++              unsigned long tmp;
++
++              ret = -EIO;
++              if ((addr & 3) || addr < 0) 
++                      break;
++
++              tmp = 0;  /* Default return condition */
++              if(addr < FRAME_SIZE_OFFSET){
++                      tmp = getreg(child, addr);
++              }
++              else if((addr >= offsetof(struct user, u_debugreg[0])) &&
++                      (addr <= offsetof(struct user, u_debugreg[7]))){
++                      addr -= offsetof(struct user, u_debugreg[0]);
++                      addr = addr >> 2;
++                      tmp = child->thread.arch.debugregs[addr];
++              }
++              ret = put_user(tmp, (unsigned long *) data);
++              break;
++      }
++
++      /* when I and D space are separate, this will have to be fixed. */
++      case PTRACE_POKETEXT: /* write the word at location addr. */
++      case PTRACE_POKEDATA:
++              ret = -EIO;
++              if (access_process_vm(child, addr, &data, sizeof(data), 
++                                    1) != sizeof(data))
++                      break;
++              ret = 0;
++              break;
++
++      case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
++              ret = -EIO;
++              if ((addr & 3) || addr < 0)
++                      break;
++
++              if (addr < FRAME_SIZE_OFFSET) {
++                      ret = putreg(child, addr, data);
++                      break;
++              }
++              else if((addr >= offsetof(struct user, u_debugreg[0])) &&
++                      (addr <= offsetof(struct user, u_debugreg[7]))){
++                        addr -= offsetof(struct user, u_debugreg[0]);
++                        addr = addr >> 2;
++                        if((addr == 4) || (addr == 5)) break;
++                        child->thread.arch.debugregs[addr] = data;
++                        ret = 0;
++              }
++
++              break;
++
++      case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
++      case PTRACE_CONT: { /* restart after signal. */
++              ret = -EIO;
++              if ((unsigned long) data > _NSIG)
++                      break;
++              if (request == PTRACE_SYSCALL)
++                      child->ptrace |= PT_TRACESYS;
++              else
++                      child->ptrace &= ~PT_TRACESYS;
++              child->exit_code = data;
++              wake_up_process(child);
++              ret = 0;
++              break;
++      }
++
++/*
++ * make the child exit.  Best I can do is send it a sigkill. 
++ * perhaps it should be put in the status that it wants to 
++ * exit.
++ */
++      case PTRACE_KILL: {
++              ret = 0;
++              if (child->state == TASK_ZOMBIE)        /* already dead */
++                      break;
++              child->exit_code = SIGKILL;
++              wake_up_process(child);
++              break;
++      }
++
++      case PTRACE_SINGLESTEP: {  /* set the trap flag. */
++              ret = -EIO;
++              if ((unsigned long) data > _NSIG)
++                      break;
++              child->ptrace &= ~PT_TRACESYS;
++              child->ptrace |= PT_DTRACE;
++              child->exit_code = data;
++              /* give it a chance to run. */
++              wake_up_process(child);
++              ret = 0;
++              break;
++      }
++
++      case PTRACE_DETACH:
++              /* detach a process that was attached. */
++              ret = ptrace_detach(child, data);
++              break;
++
++#ifdef PTRACE_GETREGS
++      case PTRACE_GETREGS: { /* Get all gp regs from the child. */
++              if (!access_ok(VERIFY_WRITE, (unsigned long *)data, 
++                             FRAME_SIZE_OFFSET)) {
++                      ret = -EIO;
++                      break;
++              }
++              for ( i = 0; i < FRAME_SIZE_OFFSET; i += sizeof(long) ) {
++                      __put_user(getreg(child, i), (unsigned long *) data);
++                      data += sizeof(long);
++              }
++              ret = 0;
++              break;
++      }
++#endif
++#ifdef PTRACE_SETREGS
++      case PTRACE_SETREGS: { /* Set all gp regs in the child. */
++              unsigned long tmp = 0;
++              if (!access_ok(VERIFY_READ, (unsigned *)data, 
++                             FRAME_SIZE_OFFSET)) {
++                      ret = -EIO;
++                      break;
++              }
++              for ( i = 0; i < FRAME_SIZE_OFFSET; i += sizeof(long) ) {
++                      __get_user(tmp, (unsigned long *) data);
++                      putreg(child, i, tmp);
++                      data += sizeof(long);
++              }
++              ret = 0;
++              break;
++      }
++#endif
++#ifdef PTRACE_GETFPREGS
++      case PTRACE_GETFPREGS: /* Get the child FPU state. */
++              ret = get_fpregs(data, child);
++              break;
++#endif
++#ifdef PTRACE_SETFPREGS
++      case PTRACE_SETFPREGS: /* Set the child FPU state. */
++              ret = set_fpregs(data, child);
++              break;
++#endif
++#ifdef PTRACE_GETFPXREGS
++      case PTRACE_GETFPXREGS: /* Get the child FPU state. */
++              ret = get_fpxregs(data, child);
++              break;
++#endif
++#ifdef PTRACE_SETFPXREGS
++      case PTRACE_SETFPXREGS: /* Set the child FPU state. */
++              ret = set_fpxregs(data, child);
++              break;
++#endif
++      case PTRACE_FAULTINFO: {
++              struct ptrace_faultinfo fault;
++
++              fault = ((struct ptrace_faultinfo) 
++                      { .is_write     = child->thread.err,
++                        .addr         = child->thread.cr2 });
++              ret = copy_to_user((unsigned long *) data, &fault, 
++                                 sizeof(fault));
++              if(ret)
++                      ret = -EFAULT;
++              break;
++      }
++      case PTRACE_SIGPENDING:
++              ret = copy_to_user((unsigned long *) data, 
++                                 &child->pending.signal,
++                                 sizeof(child->pending.signal)) ? -EFAULT : 0;
++              break;
++
++      case PTRACE_LDT: {
++              struct ptrace_ldt ldt;
++
++              if(copy_from_user(&ldt, (unsigned long *) data, 
++                                sizeof(ldt))){
++                      ret = -EIO;
++                      break;
++              }
++
++              /* This one is confusing, so just punt and return -EIO for 
++               * now
++               */
++              ret = -EIO;
++              break;
++      }
++#ifdef CONFIG_PROC_MM
++      case PTRACE_SWITCH_MM: {
++              struct mm_struct *old = child->mm;
++              struct mm_struct *new = proc_mm_get_mm(data);
++
++              if(IS_ERR(new)){
++                      ret = PTR_ERR(new);
++                      break;
++              }
++
++              atomic_inc(&new->mm_users);
++              child->mm = new;
++              child->active_mm = new;
++              mmput(old);
++              ret = 0;
++              break;
++      }
++#endif
++      default:
++              ret = -EIO;
++              break;
++      }
++ out_tsk:
++      free_task_struct(child);
++ out:
++      unlock_kernel();
++      return ret;
++}
++
++void syscall_trace(void)
++{
++      if ((current->ptrace & (PT_PTRACED|PT_TRACESYS))
++          != (PT_PTRACED|PT_TRACESYS))
++              return;
++      current->exit_code = SIGTRAP;
++      current->state = TASK_STOPPED;
++      notify_parent(current, SIGCHLD);
++      schedule();
++      /*
++       * this isn't the same as continuing with a signal, but it will do
++       * for normal use.  strace only continues with a signal if the
++       * stopping signal is not SIGTRAP.  -brl
++       */
++      if (current->exit_code) {
++              send_sig(current->exit_code, current, 1);
++              current->exit_code = 0;
++      }
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
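
sys_ptrace() above services PTRACE_PEEKTEXT/PEEKDATA by copying a word out of the child with access_process_vm(). From the tracer's side the same operation is a single ptrace(PTRACE_PEEKDATA, ...) call; the sketch below attaches to a forked child and peeks one long from it. The shared_word variable and the sleep()-based synchronisation are simplifications for illustration, not part of the patch.

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

static volatile long shared_word;

int main(void)
{
        pid_t pid = fork();
        if (pid == 0) {
                shared_word = 0x1234;   /* value for the tracer to read */
                for (;;)
                        pause();
        }

        sleep(1);                       /* crude: give the child time to store it */

        int status;
        if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) < 0) { perror("attach"); return 1; }
        waitpid(pid, &status, 0);       /* wait for the attach stop */

        /* same address in both processes, since the child is a fork of us */
        errno = 0;
        long word = ptrace(PTRACE_PEEKDATA, pid, (void *) &shared_word, NULL);
        if (word == -1 && errno)
                perror("peekdata");
        else
                printf("peeked 0x%lx from child %d\n", word, (int) pid);

        ptrace(PTRACE_DETACH, pid, NULL, NULL);
        kill(pid, SIGKILL);
        waitpid(pid, &status, 0);
        return 0;
}
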
+diff -Naur -X ../exclude-files orig/arch/um/kernel/reboot.c um/arch/um/kernel/reboot.c
+--- orig/arch/um/kernel/reboot.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/reboot.c Mon Dec 30 20:57:42 2002
+@@ -0,0 +1,71 @@
++/* 
++ * Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/sched.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "kern.h"
++#include "os.h"
++#include "mode.h"
++#include "choose-mode.h"
++
++#ifdef CONFIG_SMP
++static void kill_idlers(int me)
++{
++      struct task_struct *p;
++      int i;
++
++      for(i = 0; i < sizeof(init_tasks)/sizeof(init_tasks[0]); i++){
++              p = init_tasks[i];
++              if((p != NULL) && (p->thread.mode.tt.extern_pid != me) &&
++                 (p->thread.mode.tt.extern_pid != -1))
++                      os_kill_process(p->thread.mode.tt.extern_pid, 0);
++      }
++}
++#endif
++
++static void kill_off_processes(void)
++{
++      CHOOSE_MODE(kill_off_processes_tt(), kill_off_processes_skas());
++#ifdef CONFIG_SMP
++      kill_idlers(os_getpid());
++#endif
++}
++
++void uml_cleanup(void)
++{
++      kill_off_processes();
++      do_uml_exitcalls();
++}
++
++void machine_restart(char * __unused)
++{
++      do_uml_exitcalls();
++      kill_off_processes();
++      CHOOSE_MODE(reboot_tt(), reboot_skas());
++}
++
++void machine_power_off(void)
++{
++      do_uml_exitcalls();
++      kill_off_processes();
++      CHOOSE_MODE(halt_tt(), halt_skas());
++}
++
++void machine_halt(void)
++{
++      machine_power_off();
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/resource.c um/arch/um/kernel/resource.c
+--- orig/arch/um/kernel/resource.c     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/resource.c       Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,23 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/pci.h"
++
++unsigned long resource_fixup(struct pci_dev * dev, struct resource * res,
++                           unsigned long start, unsigned long size)
++{
++      return start;
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/sigio_kern.c um/arch/um/kernel/sigio_kern.c
+--- orig/arch/um/kernel/sigio_kern.c   Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/sigio_kern.c     Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,56 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/kernel.h"
++#include "linux/list.h"
++#include "linux/slab.h"
++#include "asm/irq.h"
++#include "init.h"
++#include "sigio.h"
++#include "irq_user.h"
++
++/* Protected by sigio_lock() called from write_sigio_workaround */
++static int sigio_irq_fd = -1;
++
++void sigio_interrupt(int irq, void *data, struct pt_regs *unused)
++{
++      read_sigio_fd(sigio_irq_fd);
++      reactivate_fd(sigio_irq_fd, SIGIO_WRITE_IRQ);
++}
++
++int write_sigio_irq(int fd)
++{
++      if(um_request_irq(SIGIO_WRITE_IRQ, fd, IRQ_READ, sigio_interrupt,
++                        SA_INTERRUPT | SA_SAMPLE_RANDOM, "write sigio", 
++                        NULL)){
++              printk("write_sigio_irq : um_request_irq failed\n");
++              return(-1);
++      }
++      sigio_irq_fd = fd;
++      return(0);
++}
++
++static spinlock_t sigio_spinlock = SPIN_LOCK_UNLOCKED;
++
++void sigio_lock(void)
++{
++      spin_lock(&sigio_spinlock);
++}
++
++void sigio_unlock(void)
++{
++      spin_unlock(&sigio_spinlock);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
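
write_sigio_irq() above turns SIGIO-driven readability of a file descriptor into a UML interrupt; the user-side probe in sigio_user.c just below enables that notification with the usual fcntl(F_SETOWN) plus O_ASYNC sequence. The sketch shows that mechanism in isolation on a pipe (Linux delivers SIGIO for pipes as well as ptys); using a pipe rather than the pty the patch actually probes, and the simple signal()/sigsuspend() handling, are assumptions of this sketch.

#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_sigio;

static void handler(int sig) { (void) sig; got_sigio = 1; }

int main(void)
{
        int fds[2];
        if (pipe(fds) < 0) { perror("pipe"); return 1; }

        /* hold SIGIO until we are ready to wait for it */
        sigset_t mask, old;
        sigemptyset(&mask);
        sigaddset(&mask, SIGIO);
        sigprocmask(SIG_BLOCK, &mask, &old);
        signal(SIGIO, handler);

        /* same steps as check_one_sigio(): set the owner, then async + nonblocking */
        fcntl(fds[0], F_SETOWN, getpid());
        int flags = fcntl(fds[0], F_GETFL);
        fcntl(fds[0], F_SETFL, flags | O_ASYNC | O_NONBLOCK);

        char c = 'x';
        write(fds[1], &c, 1);           /* readability should raise SIGIO */

        while (!got_sigio)
                sigsuspend(&old);       /* atomically unblock SIGIO and wait */

        printf("SIGIO delivered for fd %d\n", fds[0]);
        return 0;
}
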
+diff -Naur -X ../exclude-files orig/arch/um/kernel/sigio_user.c um/arch/um/kernel/sigio_user.c
+--- orig/arch/um/kernel/sigio_user.c   Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/sigio_user.c     Sun Dec 29 23:36:35 2002
+@@ -0,0 +1,440 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <unistd.h>
++#include <stdlib.h>
++#include <termios.h>
++#include <pty.h>
++#include <fcntl.h>
++#include <signal.h>
++#include <errno.h>
++#include <string.h>
++#include <sched.h>
++#include <sys/socket.h>
++#include <sys/poll.h>
++#include "init.h"
++#include "user.h"
++#include "kern_util.h"
++#include "sigio.h"
++#include "helper.h"
++#include "os.h"
++
++/* Changed during early boot */
++int pty_output_sigio = 0;
++int pty_close_sigio = 0;
++
++/* Used as a flag during SIGIO testing early in boot */
++static int got_sigio = 0;
++
++void __init handler(int sig)
++{
++      got_sigio = 1;
++}
++
++struct openpty_arg {
++      int master;
++      int slave;
++      int err;
++};
++
++static void openpty_cb(void *arg)
++{
++      struct openpty_arg *info = arg;
++
++      info->err = 0;
++      if(openpty(&info->master, &info->slave, NULL, NULL, NULL))
++              info->err = errno;
++}
++
++void __init check_one_sigio(void (*proc)(int, int))
++{
++      struct sigaction old, new;
++      struct termios tt;
++      struct openpty_arg pty = { .master = -1, .slave = -1 };
++      int master, slave, flags;
++
++      initial_thread_cb(openpty_cb, &pty);
++      if(pty.err){
++              printk("openpty failed, errno = %d\n", pty.err);
++              return;
++      }
++
++      master = pty.master;
++      slave = pty.slave;
++
++      if((master == -1) || (slave == -1)){
++              printk("openpty failed to allocate a pty\n");
++              return;
++      }
++
++      if(tcgetattr(master, &tt) < 0)
++              panic("check_sigio : tcgetattr failed, errno = %d\n", errno);
++      cfmakeraw(&tt);
++      if(tcsetattr(master, TCSADRAIN, &tt) < 0)
++              panic("check_sigio : tcsetattr failed, errno = %d\n", errno);
++
++      if((flags = fcntl(master, F_GETFL)) < 0)
++              panic("tty_fds : fcntl F_GETFL failed, errno = %d\n", errno);
++
++      if((fcntl(master, F_SETFL, flags | O_NONBLOCK | O_ASYNC) < 0) ||
++         (fcntl(master, F_SETOWN, os_getpid()) < 0))
++              panic("check_sigio : fcntl F_SETFL or F_SETOWN failed, "
++                    "errno = %d\n", errno);
++
++      if((fcntl(slave, F_SETFL, flags | O_NONBLOCK) < 0))
++              panic("check_sigio : fcntl F_SETFL failed, errno = %d\n", 
++                    errno);
++
++      if(sigaction(SIGIO, NULL, &old) < 0)
++              panic("check_sigio : sigaction 1 failed, errno = %d\n", errno);
++      new = old;
++      new.sa_handler = handler;
++      if(sigaction(SIGIO, &new, NULL) < 0)
++              panic("check_sigio : sigaction 2 failed, errno = %d\n", errno);
++
++      got_sigio = 0;
++      (*proc)(master, slave);
++              
++      close(master);
++      close(slave);
++
++      if(sigaction(SIGIO, &old, NULL) < 0)
++              panic("check_sigio : sigaction 3 failed, errno = %d\n", errno);
++}
++
++static void tty_output(int master, int slave)
++{
++      int n;
++      char buf[512];
++
++      printk("Checking that host ptys support output SIGIO...");
++
++      memset(buf, 0, sizeof(buf));
++      while(write(master, buf, sizeof(buf)) > 0) ;
++      if(errno != EAGAIN)
++              panic("check_sigio : write failed, errno = %d\n", errno);
++
++      while(((n = read(slave, buf, sizeof(buf))) > 0) && !got_sigio) ;
++
++      if(got_sigio){
++              printk("Yes\n");
++              pty_output_sigio = 1;
++      }
++      else if(errno == EAGAIN) printk("No, enabling workaround\n");
++      else panic("check_sigio : read failed, errno = %d\n", errno);
++}
++
++static void tty_close(int master, int slave)
++{
++      printk("Checking that host ptys support SIGIO on close...");
++
++      close(slave);
++      if(got_sigio){
++              printk("Yes\n");
++              pty_close_sigio = 1;
++      }
++      else printk("No, enabling workaround\n");
++}
++
++void __init check_sigio(void)
++{
++      if(access("/dev/ptmx", R_OK) && access("/dev/ptyp0", R_OK)){
++              printk("No pseudo-terminals available - skipping pty SIGIO "
++                     "check\n");
++              return;
++      }
++      check_one_sigio(tty_output);
++      check_one_sigio(tty_close);
++}
++
++/* Protected by sigio_lock(), also used by sigio_cleanup, which is an 
++ * exitcall.
++ */
++static int write_sigio_pid = -1;
++
++/* These arrays are initialized before the sigio thread is started, and
++ * the descriptors closed after it is killed.  So, it can't see them change.
++ * On the UML side, they are changed under the sigio_lock.
++ */
++static int write_sigio_fds[2] = { -1, -1 };
++static int sigio_private[2] = { -1, -1 };
++
++struct pollfds {
++      struct pollfd *poll;
++      int size;
++      int used;
++};
++
++/* Protected by sigio_lock().  Used by the sigio thread, but the UML thread
++ * synchronizes with it.
++ */
++struct pollfds current_poll = {
++      .poll           = NULL,
++      .size           = 0,
++      .used           = 0
++};
++
++struct pollfds next_poll = {
++      .poll           = NULL,
++      .size           = 0,
++      .used           = 0
++};
++
++static int write_sigio_thread(void *unused)
++{
++      struct pollfds *fds, tmp;
++      struct pollfd *p;
++      int i, n, respond_fd;
++      char c;
++
++      fds = &current_poll;
++      while(1){
++              n = poll(fds->poll, fds->used, -1);
++              if(n < 0){
++                      if(errno == EINTR) continue;
++                      printk("write_sigio_thread : poll returned %d, "
++                             "errno = %d\n", n, errno);
++              }
++              for(i = 0; i < fds->used; i++){
++                      p = &fds->poll[i];
++                      if(p->revents == 0) continue;
++                      if(p->fd == sigio_private[1]){
++                              n = read(sigio_private[1], &c, sizeof(c));
++                              if(n != sizeof(c))
++                                      printk("write_sigio_thread : "
++                                             "read failed, errno = %d\n",
++                                             errno);
++                              tmp = current_poll;
++                              current_poll = next_poll;
++                              next_poll = tmp;
++                              respond_fd = sigio_private[1];
++                      }
++                      else {
++                              respond_fd = write_sigio_fds[1];
++                              fds->used--;
++                              memmove(&fds->poll[i], &fds->poll[i + 1],
++                                      (fds->used - i) * sizeof(*fds->poll));
++                      }
++
++                      n = write(respond_fd, &c, sizeof(c));
++                      if(n != sizeof(c))
++                              printk("write_sigio_thread : write failed, "
++                                     "errno = %d\n", errno);
++              }
++      }
++}
++
++static int need_poll(int n)
++{
++      if(n <= next_poll.size){
++              next_poll.used = n;
++              return(0);
++      }
++      if(next_poll.poll != NULL) kfree(next_poll.poll);
++      next_poll.poll = um_kmalloc_atomic(n * sizeof(struct pollfd));
++      if(next_poll.poll == NULL){
++              printk("need_poll : failed to allocate new pollfds\n");
++              next_poll.size = 0;
++              next_poll.used = 0;
++              return(-1);
++      }
++      next_poll.size = n;
++      next_poll.used = n;
++      return(0);
++}
++
++static void update_thread(void)
++{
++      unsigned long flags;
++      int n;
++      char c;
++
++      flags = set_signals(0);
++      n = write(sigio_private[0], &c, sizeof(c));
++      if(n != sizeof(c)){
++              printk("update_thread : write failed, errno = %d\n", errno);
++              goto fail;
++      }
++
++      n = read(sigio_private[0], &c, sizeof(c));
++      if(n != sizeof(c)){
++              printk("update_thread : read failed, errno = %d\n", errno);
++              goto fail;
++      }
++
++      set_signals(flags);
++      return;
++ fail:
++      sigio_lock();
++      if(write_sigio_pid != -1) 
++              os_kill_process(write_sigio_pid, 1);
++      write_sigio_pid = -1;
++      close(sigio_private[0]);
++      close(sigio_private[1]);        
++      close(write_sigio_fds[0]);
++      close(write_sigio_fds[1]);
++      sigio_unlock();
++      set_signals(flags);
++}
++
++int add_sigio_fd(int fd, int read)
++{
++      int err = 0, i, n, events;
++
++      sigio_lock();
++      for(i = 0; i < current_poll.used; i++){
++              if(current_poll.poll[i].fd == fd) 
++                      goto out;
++      }
++
++      n = current_poll.used + 1;
++      err = need_poll(n);
++      if(err) 
++              goto out;
++
++      for(i = 0; i < current_poll.used; i++)
++              next_poll.poll[i] = current_poll.poll[i];
++
++      if(read) events = POLLIN;
++      else events = POLLOUT;
++
++      next_poll.poll[n - 1] = ((struct pollfd) { .fd          = fd,
++                                                 .events      = events,
++                                                 .revents     = 0 });
++      update_thread();
++ out:
++      sigio_unlock();
++      return(err);
++}
++
++int ignore_sigio_fd(int fd)
++{
++      struct pollfd *p;
++      int err = 0, i, n = 0;
++
++      sigio_lock();
++      for(i = 0; i < current_poll.used; i++){
++              if(current_poll.poll[i].fd == fd) break;
++      }
++      if(i == current_poll.used)
++              goto out;
++      
++      err = need_poll(current_poll.used - 1);
++      if(err)
++              goto out;
++
++      for(i = 0; i < current_poll.used; i++){
++              p = &current_poll.poll[i];
++              if(p->fd != fd) next_poll.poll[n++] = current_poll.poll[i];
++      }
++      if(n == i){
++              printk("ignore_sigio_fd : fd %d not found\n", fd);
++              err = -1;
++              goto out;
++      }
++
++      update_thread();
++ out:
++      sigio_unlock();
++      return(err);
++}
++
++static int setup_initial_poll(int fd)
++{
++      struct pollfd *p;
++
++      p = um_kmalloc(sizeof(struct pollfd));
++      if(p == NULL){
++              printk("setup_initial_poll : failed to allocate poll\n");
++              return(-1);
++      }
++      *p = ((struct pollfd) { .fd     = fd,
++                              .events         = POLLIN,
++                              .revents        = 0 });
++      current_poll = ((struct pollfds) { .poll        = p,
++                                         .used        = 1,
++                                         .size        = 1 });
++      return(0);
++}
++
++void write_sigio_workaround(void)
++{
++      unsigned long stack;
++      int err;
++
++      sigio_lock();
++      if(write_sigio_pid != -1)
++              goto out;
++
++      err = os_pipe(write_sigio_fds, 1, 1);
++      if(err){
++              printk("write_sigio_workaround - os_pipe 1 failed, "
++                     "errno = %d\n", -err);
++              goto out;
++      }
++      err = os_pipe(sigio_private, 1, 1);
++      if(err){
++              printk("write_sigio_workaround - os_pipe 2 failed, "
++                     "errno = %d\n", -err);
++              goto out_close1;
++      }
++      if(setup_initial_poll(sigio_private[1]))
++              goto out_close2;
++
++      write_sigio_pid = run_helper_thread(write_sigio_thread, NULL, 
++                                          CLONE_FILES | CLONE_VM, &stack, 0);
++
++      if(write_sigio_pid < 0) goto out_close2;
++
++      if(write_sigio_irq(write_sigio_fds[0])) 
++              goto out_kill;
++
++ out:
++      sigio_unlock();
++      return;
++
++ out_kill:
++      os_kill_process(write_sigio_pid, 1);
++      write_sigio_pid = -1;
++ out_close2:
++      close(sigio_private[0]);
++      close(sigio_private[1]);        
++ out_close1:
++      close(write_sigio_fds[0]);
++      close(write_sigio_fds[1]);
++      sigio_unlock();
++}
++
++int read_sigio_fd(int fd)
++{
++      int n;
++      char c;
++
++      n = read(fd, &c, sizeof(c));
++      if(n != sizeof(c)){
++              printk("read_sigio_fd - read failed, errno = %d\n", errno);
++              return(-errno);
++      }
++      return(n);
++}
++
++static void sigio_cleanup(void)
++{
++      if(write_sigio_pid != -1)
++              os_kill_process(write_sigio_pid, 1);
++}
++
++__uml_exitcall(sigio_cleanup);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
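
The write-SIGIO workaround above boils down to a helper thread that poll()s the watched descriptors for POLLOUT and is poked through a private pipe whenever the descriptor set changes (update_thread() writing into sigio_private[0]). Below is a minimal user-space sketch of that pattern; it is an illustration only, not code from the patch, and it uses pthreads instead of run_helper_thread()/clone():

#include <poll.h>
#include <pthread.h>
#include <unistd.h>

static int control[2];          /* helper reads control[0]; updaters write control[1] */

static void *poll_helper(void *arg)
{
        struct pollfd *fds = arg;       /* fds[0] is the control pipe, fds[1] the watched fd */
        char c;

        for (;;) {
                if (poll(fds, 2, -1) < 0)
                        continue;
                if (fds[0].revents & POLLIN)
                        read(fds[0].fd, &c, 1); /* fd set changed; caller rebuilt fds */
                /* fds[1].revents & POLLOUT would be turned into a SIGIO here */
        }
        return NULL;
}

static int start_helper(struct pollfd *fds)
{
        pthread_t tid;

        if (pipe(control) < 0)
                return -1;
        fds[0] = (struct pollfd) { .fd = control[0], .events = POLLIN };
        return pthread_create(&tid, NULL, poll_helper, fds);
}

Writing one byte to control[1] from the updater is the analogue of update_thread() writing to sigio_private[0] above.
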
+diff -Naur -X ../exclude-files orig/arch/um/kernel/signal_kern.c um/arch/um/kernel/signal_kern.c
+--- orig/arch/um/kernel/signal_kern.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/signal_kern.c    Sun Dec  8 19:44:13 2002
+@@ -0,0 +1,367 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/config.h"
++#include "linux/stddef.h"
++#include "linux/sys.h"
++#include "linux/sched.h"
++#include "linux/wait.h"
++#include "linux/kernel.h"
++#include "linux/smp_lock.h"
++#include "linux/module.h"
++#include "linux/slab.h"
++#include "asm/signal.h"
++#include "asm/uaccess.h"
++#include "asm/ucontext.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "signal_kern.h"
++#include "signal_user.h"
++#include "kern.h"
++#include "frame_kern.h"
++#include "sigcontext.h"
++#include "mode.h"
++
++EXPORT_SYMBOL(block_signals);
++EXPORT_SYMBOL(unblock_signals);
++
++static void force_segv(int sig)
++{
++      if(sig == SIGSEGV){
++              struct k_sigaction *ka;
++
++              ka = &current->sig->action[SIGSEGV - 1];
++              ka->sa.sa_handler = SIG_DFL;
++      }
++      force_sig(SIGSEGV, current);
++}
++
++#define _S(nr) (1<<((nr)-1))
++
++#define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP)))
++
++/*
++ * OK, we're invoking a handler
++ */   
++static int handle_signal(struct pt_regs *regs, unsigned long signr, 
++                       struct k_sigaction *ka, siginfo_t *info, 
++                       sigset_t *oldset, int error)
++{
++        __sighandler_t handler;
++      void (*restorer)(void);
++      unsigned long sp;
++      sigset_t save;
++      int err, ret;
++
++      ret = 0;
++      switch(error){
++      case -ERESTARTNOHAND:
++              ret = -EINTR;
++              break;
++
++      case -ERESTARTSYS:
++              if (!(ka->sa.sa_flags & SA_RESTART)) {
++                      ret = -EINTR;
++                      break;
++              }
++              /* fallthrough */
++      case -ERESTARTNOINTR:
++              PT_REGS_RESTART_SYSCALL(regs);
++              PT_REGS_ORIG_SYSCALL(regs) = PT_REGS_SYSCALL_NR(regs);
++
++              /* This is because of the UM_SET_SYSCALL_RETURN and the fact
++               * that on i386 the system call number and return value are
++               * in the same register.  When the system call restarts, %eax
++               * had better have the system call number in it.  Since the
++               * return value doesn't matter (except that it shouldn't be
++               * -ERESTART*), we'll stick the system call number there.
++               */
++              ret = PT_REGS_SYSCALL_NR(regs);
++              break;
++      }
++
++      handler = ka->sa.sa_handler;
++      save = *oldset;
++
++      if (ka->sa.sa_flags & SA_ONESHOT)
++              ka->sa.sa_handler = SIG_DFL;
++
++      if (!(ka->sa.sa_flags & SA_NODEFER)) {
++              spin_lock_irq(&current->sigmask_lock);
++              sigorsets(&current->blocked, &current->blocked, 
++                        &ka->sa.sa_mask);
++              sigaddset(&current->blocked, signr);
++              recalc_sigpending(current);
++              spin_unlock_irq(&current->sigmask_lock);
++      }
++
++      sp = PT_REGS_SP(regs);
++
++      if((ka->sa.sa_flags & SA_ONSTACK) && (sas_ss_flags(sp) == 0))
++              sp = current->sas_ss_sp + current->sas_ss_size;
++      
++      if(error != 0) PT_REGS_SET_SYSCALL_RETURN(regs, ret);
++
++      if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer;
++      else restorer = NULL;
++
++      if(ka->sa.sa_flags & SA_SIGINFO)
++              err = setup_signal_stack_si(sp, signr, (unsigned long) handler,
++                                          restorer, regs, info, &save);
++      else
++              err = setup_signal_stack_sc(sp, signr, (unsigned long) handler,
++                                          restorer, regs, &save);
++      if(err) goto segv;
++
++      return(0);
++ segv:
++      force_segv(signr);
++      return(1);
++}
++
++/*
++ * Note that 'init' is a special process: it doesn't get signals it doesn't
++ * want to handle. Thus you cannot kill init even with a SIGKILL even by
++ * mistake.
++ */
++
++static int kern_do_signal(struct pt_regs *regs, sigset_t *oldset, int error)
++{
++      siginfo_t info;
++      struct k_sigaction *ka;
++      int err;
++
++      if (!oldset)
++              oldset = &current->blocked;
++
++      for (;;) {
++              unsigned long signr;
++
++              spin_lock_irq(&current->sigmask_lock);
++              signr = dequeue_signal(&current->blocked, &info);
++              spin_unlock_irq(&current->sigmask_lock);
++
++              if (!signr)
++                      break;
++
++              if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) {
++                      /* Let the debugger run.  */
++                      current->exit_code = signr;
++                      current->state = TASK_STOPPED;
++                      notify_parent(current, SIGCHLD);
++                      schedule();
++
++                      /* We're back.  Did the debugger cancel the sig?  */
++                      if (!(signr = current->exit_code))
++                              continue;
++                      current->exit_code = 0;
++
++                      /* The debugger continued.  Ignore SIGSTOP.  */
++                      if (signr == SIGSTOP)
++                              continue;
++
++                      /* Update the siginfo structure.  Is this good?  */
++                      if (signr != info.si_signo) {
++                              info.si_signo = signr;
++                              info.si_errno = 0;
++                              info.si_code = SI_USER;
++                              info.si_pid = current->p_pptr->pid;
++                              info.si_uid = current->p_pptr->uid;
++                      }
++
++                      /* If the (new) signal is now blocked, requeue it.  */
++                      if (sigismember(&current->blocked, signr)) {
++                              send_sig_info(signr, &info, current);
++                              continue;
++                      }
++              }
++
++              ka = &current->sig->action[signr-1];
++              if (ka->sa.sa_handler == SIG_IGN) {
++                      if (signr != SIGCHLD)
++                              continue;
++                      /* Check for SIGCHLD: it's special.  */
++                      while (sys_wait4(-1, NULL, WNOHANG, NULL) > 0)
++                              /* nothing */;
++                      continue;
++              }
++
++              if (ka->sa.sa_handler == SIG_DFL) {
++                      int exit_code = signr;
++
++                      /* Init gets no signals it doesn't want.  */
++                      if (current->pid == 1)
++                              continue;
++
++                      switch (signr) {
++                      case SIGCONT: case SIGCHLD: case SIGWINCH: case SIGURG:
++                              continue;
++
++                      case SIGTSTP: case SIGTTIN: case SIGTTOU:
++                              if (is_orphaned_pgrp(current->pgrp))
++                                      continue;
++                              /* FALLTHRU */
++
++                        case SIGSTOP: {
++                                struct signal_struct *sig;
++                              current->state = TASK_STOPPED;
++                              current->exit_code = signr;
++                                sig = current->p_pptr->sig;
++                                if (sig && !(sig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
++                                      notify_parent(current, SIGCHLD);
++                              schedule();
++                              continue;
++                      }
++                      case SIGQUIT: case SIGILL: case SIGTRAP:
++                      case SIGABRT: case SIGFPE: case SIGSEGV:
++                      case SIGBUS: case SIGSYS: case SIGXCPU: case SIGXFSZ:
++                              if (do_coredump(signr, &current->thread.regs))
++                                      exit_code |= 0x80;
++                              /* FALLTHRU */
++
++                      default:
++                              sig_exit(signr, exit_code, &info);
++                              /* NOTREACHED */
++                      }
++              }
++
++              /* Whee!  Actually deliver the signal.  */
++              err = handle_signal(regs, signr, ka, &info, oldset, error);
++              if(!err) return(1);
++      }
++
++      /* Did we come from a system call? */
++      if(PT_REGS_SYSCALL_NR(regs) >= 0){
++              /* Restart the system call - no handlers present */
++              if(PT_REGS_SYSCALL_RET(regs) == -ERESTARTNOHAND ||
++                 PT_REGS_SYSCALL_RET(regs) == -ERESTARTSYS ||
++                 PT_REGS_SYSCALL_RET(regs) == -ERESTARTNOINTR){
++                      PT_REGS_ORIG_SYSCALL(regs) = PT_REGS_SYSCALL_NR(regs);
++                      PT_REGS_RESTART_SYSCALL(regs);
++              }
++      }
++
++      /* This closes a way to execute a system call on the host.  If
++       * you set a breakpoint on a system call instruction and singlestep
++       * from it, the tracing thread used to PTRACE_SINGLESTEP the process
++       * rather than PTRACE_SYSCALL it, allowing the system call to execute
++       * on the host.  The tracing thread will check this flag and 
++       * PTRACE_SYSCALL if necessary.
++       */
++      if((current->ptrace & PT_DTRACE) && 
++         is_syscall(PT_REGS_IP(&current->thread.regs)))
++              (void) CHOOSE_MODE(current->thread.mode.tt.singlestep_syscall = 1, 0);
++
++      return(0);
++}
++
++int do_signal(int error)
++{
++      return(kern_do_signal(&current->thread.regs, NULL, error));
++}
++
++/*
++ * Atomically swap in the new signal mask, and wait for a signal.
++ */
++int sys_sigsuspend(int history0, int history1, old_sigset_t mask)
++{
++      sigset_t saveset;
++
++      mask &= _BLOCKABLE;
++      spin_lock_irq(&current->sigmask_lock);
++      saveset = current->blocked;
++      siginitset(&current->blocked, mask);
++      recalc_sigpending(current);
++      spin_unlock_irq(&current->sigmask_lock);
++
++      while (1) {
++              current->state = TASK_INTERRUPTIBLE;
++              schedule();
++              if(kern_do_signal(&current->thread.regs, &saveset, -EINTR))
++                      return(-EINTR);
++      }
++}
++
++int sys_rt_sigsuspend(sigset_t *unewset, size_t sigsetsize)
++{
++      sigset_t saveset, newset;
++
++      /* XXX: Don't preclude handling different sized sigset_t's.  */
++      if (sigsetsize != sizeof(sigset_t))
++              return -EINVAL;
++
++      if (copy_from_user(&newset, unewset, sizeof(newset)))
++              return -EFAULT;
++      sigdelsetmask(&newset, ~_BLOCKABLE);
++
++      spin_lock_irq(&current->sigmask_lock);
++      saveset = current->blocked;
++      current->blocked = newset;
++      recalc_sigpending(current);
++      spin_unlock_irq(&current->sigmask_lock);
++
++      while (1) {
++              current->state = TASK_INTERRUPTIBLE;
++              schedule();
++              if (kern_do_signal(&current->thread.regs, &saveset, -EINTR))
++                      return(-EINTR);
++      }
++}
++
++static int copy_sc_from_user(struct pt_regs *to, void *from, 
++                           struct arch_frame_data *arch)
++{
++      int ret;
++
++      ret = CHOOSE_MODE(copy_sc_from_user_tt(UPT_SC(&to->regs), from, arch),
++                        copy_sc_from_user_skas(&to->regs, from));
++      return(ret);
++}
++
++int sys_sigreturn(struct pt_regs regs)
++{
++      void *sc = sp_to_sc(PT_REGS_SP(&current->thread.regs));
++      void *mask = sp_to_mask(PT_REGS_SP(&current->thread.regs));
++      int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long);
++
++      spin_lock_irq(&current->sigmask_lock);
++      copy_from_user(&current->blocked.sig[0], sc_sigmask(sc), 
++                     sizeof(current->blocked.sig[0]));
++      copy_from_user(&current->blocked.sig[1], mask, sig_size);
++      sigdelsetmask(&current->blocked, ~_BLOCKABLE);
++      recalc_sigpending(current);
++      spin_unlock_irq(&current->sigmask_lock);
++      copy_sc_from_user(&current->thread.regs, sc, 
++                        &signal_frame_sc.common.arch);
++      return(PT_REGS_SYSCALL_RET(&current->thread.regs));
++}
++
++int sys_rt_sigreturn(struct pt_regs regs)
++{
++      struct ucontext *uc = sp_to_uc(PT_REGS_SP(&current->thread.regs));
++      void *fp;
++      int sig_size = _NSIG_WORDS * sizeof(unsigned long);
++
++      spin_lock_irq(&current->sigmask_lock);
++      copy_from_user(&current->blocked, &uc->uc_sigmask, sig_size);
++      sigdelsetmask(&current->blocked, ~_BLOCKABLE);
++      recalc_sigpending(current);
++      spin_unlock_irq(&current->sigmask_lock);
++      fp = (void *) (((unsigned long) uc) + sizeof(struct ucontext));
++      copy_sc_from_user(&current->thread.regs, &uc->uc_mcontext,
++                        &signal_frame_si.common.arch);
++      return(PT_REGS_SYSCALL_RET(&current->thread.regs));
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
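
handle_signal() above folds the system-call restart rules into its switch on "error". The same decision written out in isolation, as a sketch only (restart_policy is a made-up name, and the ERESTART* constants are the kernel-internal values, restated here so the fragment is self-contained):

#include <errno.h>
#include <signal.h>

enum { ERESTARTSYS = 512, ERESTARTNOINTR = 513, ERESTARTNOHAND = 514 };

/* Returns -EINTR if the interrupted syscall should fail, a positive value if
 * it should be restarted (syscall number reloaded into %eax), 0 otherwise. */
static long restart_policy(long error, int sa_flags)
{
        switch (error) {
        case -ERESTARTNOHAND:
                return -EINTR;                  /* never restarted across a handler */
        case -ERESTARTSYS:
                if (!(sa_flags & SA_RESTART))
                        return -EINTR;          /* handler did not ask for restart */
                /* fall through */
        case -ERESTARTNOINTR:
                return 1;                       /* restart the syscall */
        default:
                return 0;
        }
}
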
+diff -Naur -X ../exclude-files orig/arch/um/kernel/signal_user.c um/arch/um/kernel/signal_user.c
+--- orig/arch/um/kernel/signal_user.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/signal_user.c    Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,142 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <unistd.h>
++#include <stdlib.h>
++#include <signal.h>
++#include <errno.h>
++#include <stdarg.h>
++#include <string.h>
++#include <sys/mman.h>
++#include "user_util.h"
++#include "kern_util.h"
++#include "user.h"
++#include "signal_user.h"
++#include "signal_kern.h"
++#include "sysdep/sigcontext.h"
++#include "sigcontext.h"
++
++void set_sigstack(void *sig_stack, int size)
++{
++      stack_t stack = ((stack_t) { .ss_flags  = 0,
++                                   .ss_sp     = (__ptr_t) sig_stack,
++                                   .ss_size   = size - sizeof(void *) });
++
++      if(sigaltstack(&stack, NULL) != 0)
++              panic("enabling signal stack failed, errno = %d\n", errno);
++}
++
++void set_handler(int sig, void (*handler)(int), int flags, ...)
++{
++      struct sigaction action;
++      va_list ap;
++      int mask;
++
++      va_start(ap, flags);
++      action.sa_handler = handler;
++      sigemptyset(&action.sa_mask);
++      while((mask = va_arg(ap, int)) != -1){
++              sigaddset(&action.sa_mask, mask);
++      }
++      action.sa_flags = flags;
++      action.sa_restorer = NULL;
++      if(sigaction(sig, &action, NULL) < 0)
++              panic("sigaction failed");
++}
++
++int change_sig(int signal, int on)
++{
++      sigset_t sigset, old;
++
++      sigemptyset(&sigset);
++      sigaddset(&sigset, signal);
++      sigprocmask(on ? SIG_UNBLOCK : SIG_BLOCK, &sigset, &old);
++      return(!sigismember(&old, signal));
++}
++
++static void change_signals(int type)
++{
++      sigset_t mask;
++
++      sigemptyset(&mask);
++      sigaddset(&mask, SIGVTALRM);
++      sigaddset(&mask, SIGALRM);
++      sigaddset(&mask, SIGIO);
++      sigaddset(&mask, SIGPROF);
++      if(sigprocmask(type, &mask, NULL) < 0)
++              panic("Failed to change signal mask - errno = %d", errno);
++}
++
++void block_signals(void)
++{
++      change_signals(SIG_BLOCK);
++}
++
++void unblock_signals(void)
++{
++      change_signals(SIG_UNBLOCK);
++}
++
++#define SIGIO_BIT 0
++#define SIGVTALRM_BIT 1
++
++static int enable_mask(sigset_t *mask)
++{
++      int sigs;
++
++      sigs = sigismember(mask, SIGIO) ? 0 : 1 << SIGIO_BIT;
++      sigs |= sigismember(mask, SIGVTALRM) ? 0 : 1 << SIGVTALRM_BIT;
++      sigs |= sigismember(mask, SIGALRM) ? 0 : 1 << SIGVTALRM_BIT;
++      return(sigs);
++}
++
++int get_signals(void)
++{
++      sigset_t mask;
++      
++      if(sigprocmask(SIG_SETMASK, NULL, &mask) < 0)
++              panic("Failed to get signal mask");
++      return(enable_mask(&mask));
++}
++
++int set_signals(int enable)
++{
++      sigset_t mask;
++      int ret;
++
++      sigemptyset(&mask);
++      if(enable & (1 << SIGIO_BIT)) 
++              sigaddset(&mask, SIGIO);
++      if(enable & (1 << SIGVTALRM_BIT)){
++              sigaddset(&mask, SIGVTALRM);
++              sigaddset(&mask, SIGALRM);
++      }
++      if(sigprocmask(SIG_UNBLOCK, &mask, &mask) < 0)
++              panic("Failed to enable signals");
++      ret = enable_mask(&mask);
++      sigemptyset(&mask);
++      if((enable & (1 << SIGIO_BIT)) == 0) 
++              sigaddset(&mask, SIGIO);
++      if((enable & (1 << SIGVTALRM_BIT)) == 0){
++              sigaddset(&mask, SIGVTALRM);
++              sigaddset(&mask, SIGALRM);
++      }
++      if(sigprocmask(SIG_BLOCK, &mask, NULL) < 0)
++              panic("Failed to block signals");
++
++      return(ret);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
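
A usage note on set_handler() above: the variadic tail is the list of extra signals added to sa_mask while the handler runs, terminated by -1. A call would look like the fragment below (sigio_handler is a placeholder name, not a symbol from this patch); change_sig() in turn returns nonzero when the signal was previously unblocked, so callers can restore the prior state.

set_handler(SIGIO, sigio_handler, SA_RESTART,
            SIGIO, SIGALRM, SIGVTALRM, SIGPROF, -1);
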
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/Makefile um/arch/um/kernel/skas/Makefile
+--- orig/arch/um/kernel/skas/Makefile  Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/Makefile    Fri Nov  1 16:05:44 2002
+@@ -0,0 +1,30 @@
++# 
++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++O_TARGET = skas.o
++
++obj-y = exec_kern.o exec_user.o mem.o mem_user.o mmu.o process.o \
++      process_kern.o syscall_kern.o syscall_user.o time.o tlb.o trap_user.o
++
++subdir-y = sys-$(SUBARCH)
++
++obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o))
++
++USER_OBJS = $(filter %_user.o,$(obj-y)) process.o time.o
++
++include $(TOPDIR)/Rules.make
++
++include/skas_ptregs.h : util/mk_ptregs
++      util/mk_ptregs > $@
++
++util/mk_ptregs :
++      $(MAKE) -C util
++
++$(USER_OBJS) : %.o: %.c
++      $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $<
++
++clean :
++      $(MAKE) -C util clean
++      $(RM) -f include/skas_ptregs.h
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/exec_kern.c um/arch/um/kernel/skas/exec_kern.c
+--- orig/arch/um/kernel/skas/exec_kern.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/exec_kern.c Mon Nov 11 18:57:19 2002
+@@ -0,0 +1,41 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/kernel.h"
++#include "asm/current.h"
++#include "asm/page.h"
++#include "asm/signal.h"
++#include "asm/ptrace.h"
++#include "asm/uaccess.h"
++#include "asm/mmu_context.h"
++#include "tlb.h"
++#include "skas.h"
++#include "mmu.h"
++#include "os.h"
++
++void flush_thread_skas(void)
++{
++      force_flush_all();
++      switch_mm_skas(current->mm->context.skas.mm_fd);
++}
++
++void start_thread_skas(struct pt_regs *regs, unsigned long eip, 
++                     unsigned long esp)
++{
++      set_fs(USER_DS);
++        PT_REGS_IP(regs) = eip;
++      PT_REGS_SP(regs) = esp;
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/exec_user.c um/arch/um/kernel/skas/exec_user.c
+--- orig/arch/um/kernel/skas/exec_user.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/exec_user.c Sun Nov  3 19:23:01 2002
+@@ -0,0 +1,61 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdlib.h>
++#include <errno.h>
++#include <signal.h>
++#include <sched.h>
++#include <sys/wait.h>
++#include <sys/ptrace.h>
++#include "user.h"
++#include "kern_util.h"
++#include "os.h"
++#include "time_user.h"
++
++static int user_thread_tramp(void *arg)
++{
++      if(ptrace(PTRACE_TRACEME, 0, 0, 0) < 0)
++              panic("user_thread_tramp - PTRACE_TRACEME failed, "
++                    "errno = %d\n", errno);
++      enable_timer();
++      os_stop_process(os_getpid());
++      return(0);
++}
++
++int user_thread(unsigned long stack, int flags)
++{
++      int pid, status;
++
++      pid = clone(user_thread_tramp, (void *) stack_sp(stack), 
++                  flags | CLONE_FILES | SIGCHLD, NULL);
++      if(pid < 0){
++              printk("user_thread - clone failed, errno = %d\n", errno);
++              return(pid);
++      }
++
++      if(waitpid(pid, &status, WUNTRACED) < 0){
++              printk("user_thread - waitpid failed, errno = %d\n", errno);
++              return(-errno);
++      }
++
++      if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)){
++              printk("user_thread - trampoline didn't stop, status = %d\n", 
++                     status);
++              return(-EINVAL);
++      }
++
++      return(pid);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
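
user_thread() above hands back a pid that is stopped under ptrace; a tracer then drives it with PTRACE_SYSCALL or PTRACE_CONT, which is exactly what skas/process.c does further down. The minimal shape of such a loop, as a sketch with error handling elided:

#include <sys/ptrace.h>
#include <sys/wait.h>

static void drive(int pid)
{
        int status;

        for (;;) {
                if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
                        break;                          /* child is gone */
                if (waitpid(pid, &status, WUNTRACED) < 0 || WIFEXITED(status))
                        break;
                /* inspect registers / handle WSTOPSIG(status) here */
        }
}
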
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/include/mmu.h um/arch/um/kernel/skas/include/mmu.h
+--- orig/arch/um/kernel/skas/include/mmu.h     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/include/mmu.h       Sun Nov 10 21:21:50 2002
+@@ -0,0 +1,27 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SKAS_MMU_H
++#define __SKAS_MMU_H
++
++#include "linux/list.h"
++#include "linux/spinlock.h"
++
++struct mmu_context_skas {
++      int mm_fd;
++};
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/include/mode.h um/arch/um/kernel/skas/include/mode.h
+--- orig/arch/um/kernel/skas/include/mode.h    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/include/mode.h      Wed Mar 26 13:27:46 2003
+@@ -0,0 +1,36 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __MODE_SKAS_H__
++#define __MODE_SKAS_H__
++
++extern unsigned long exec_regs[];
++extern unsigned long exec_fp_regs[];
++extern unsigned long exec_fpx_regs[];
++extern int have_fpx_regs;
++
++extern void user_time_init_skas(void);
++extern int copy_sc_from_user_skas(union uml_pt_regs *regs, void *from_ptr);
++extern int copy_sc_to_user_skas(void *to_ptr, void *fp, 
++                              union uml_pt_regs *regs, 
++                              unsigned long fault_addr, int fault_type);
++extern void sig_handler_common_skas(int sig, void *sc_ptr);
++extern void halt_skas(void);
++extern void reboot_skas(void);
++extern void kill_off_processes_skas(void);
++extern int is_skas_winch(int pid, int fd, void *data);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/include/mode_kern.h um/arch/um/kernel/skas/include/mode_kern.h
+--- orig/arch/um/kernel/skas/include/mode_kern.h       Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/include/mode_kern.h Mon Dec 16 21:49:11 2002
+@@ -0,0 +1,51 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SKAS_MODE_KERN_H__
++#define __SKAS_MODE_KERN_H__
++
++#include "linux/sched.h"
++#include "asm/page.h"
++#include "asm/ptrace.h"
++
++extern void flush_thread_skas(void);
++extern void *_switch_to_skas(void *prev, void *next);
++extern void start_thread_skas(struct pt_regs *regs, unsigned long eip, 
++                            unsigned long esp);
++extern int copy_thread_skas(int nr, unsigned long clone_flags, 
++                          unsigned long sp, unsigned long stack_top, 
++                          struct task_struct *p, struct pt_regs *regs);
++extern void release_thread_skas(struct task_struct *task);
++extern void exit_thread_skas(void);
++extern void initial_thread_cb_skas(void (*proc)(void *), void *arg);
++extern void init_idle_skas(void);
++extern void flush_tlb_kernel_vm_skas(void);
++extern void __flush_tlb_one_skas(unsigned long addr);
++extern void flush_tlb_range_skas(struct mm_struct *mm, unsigned long start, 
++                               unsigned long end);
++extern void flush_tlb_mm_skas(struct mm_struct *mm);
++extern void force_flush_all_skas(void);
++extern long execute_syscall_skas(void *r);
++extern void before_mem_skas(unsigned long unused);
++extern unsigned long set_task_sizes_skas(int arg, unsigned long *host_size_out,
++                                       unsigned long *task_size_out);
++extern int start_uml_skas(void);
++extern int external_pid_skas(struct task_struct *task);
++extern int thread_pid_skas(struct thread_struct *thread);
++
++#define kmem_end_skas (host_task_size - 1024 * 1024)
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/include/proc_mm.h um/arch/um/kernel/skas/include/proc_mm.h
+--- orig/arch/um/kernel/skas/include/proc_mm.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/include/proc_mm.h   Wed Nov 13 11:57:23 2002
+@@ -0,0 +1,55 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SKAS_PROC_MM_H
++#define __SKAS_PROC_MM_H
++
++#define MM_MMAP 54
++#define MM_MUNMAP 55
++#define MM_MPROTECT 56
++#define MM_COPY_SEGMENTS 57
++
++struct mm_mmap {
++      unsigned long addr;
++      unsigned long len;
++      unsigned long prot;
++      unsigned long flags;
++      unsigned long fd;
++      unsigned long offset;
++};
++
++struct mm_munmap {
++      unsigned long addr;
++      unsigned long len;      
++};
++
++struct mm_mprotect {
++      unsigned long addr;
++      unsigned long len;
++        unsigned int prot;
++};
++
++struct proc_mm_op {
++      int op;
++      union {
++              struct mm_mmap mmap;
++              struct mm_munmap munmap;
++              struct mm_mprotect mprotect;
++              int copy_segments;
++      } u;
++};
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
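
Each /proc/mm operation defined above is submitted as a single write() of a struct proc_mm_op; the mem_user.c hunk further down builds the requests with compound literals. Composed by hand, an mmap request looks like the sketch below (mm_fd and the mapping parameters are placeholders, and the fragment assumes this header plus the usual userspace headers):

#include <sys/mman.h>
#include <unistd.h>

static int request_mmap(int mm_fd, unsigned long virt, unsigned long len,
                        int fd, unsigned long offset)
{
        struct proc_mm_op req = { .op = MM_MMAP,
                                  .u  = { .mmap = { .addr   = virt,
                                                    .len    = len,
                                                    .prot   = PROT_READ | PROT_WRITE,
                                                    .flags  = MAP_SHARED | MAP_FIXED,
                                                    .fd     = fd,
                                                    .offset = offset } } };

        return (write(mm_fd, &req, sizeof(req)) == sizeof(req)) ? 0 : -1;
}
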
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/include/ptrace-skas.h um/arch/um/kernel/skas/include/ptrace-skas.h
+--- orig/arch/um/kernel/skas/include/ptrace-skas.h     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/include/ptrace-skas.h       Fri Jan 17 13:22:09 2003
+@@ -0,0 +1,57 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __PTRACE_SKAS_H
++#define __PTRACE_SKAS_H
++
++#include "uml-config.h"
++
++#ifdef UML_CONFIG_MODE_SKAS
++
++#include "skas_ptregs.h"
++
++#define HOST_FRAME_SIZE 17
++
++#define REGS_IP(r) ((r)[HOST_IP])
++#define REGS_SP(r) ((r)[HOST_SP])
++#define REGS_EFLAGS(r) ((r)[HOST_EFLAGS])
++#define REGS_EAX(r) ((r)[HOST_EAX])
++#define REGS_EBX(r) ((r)[HOST_EBX])
++#define REGS_ECX(r) ((r)[HOST_ECX])
++#define REGS_EDX(r) ((r)[HOST_EDX])
++#define REGS_ESI(r) ((r)[HOST_ESI])
++#define REGS_EDI(r) ((r)[HOST_EDI])
++#define REGS_EBP(r) ((r)[HOST_EBP])
++#define REGS_CS(r) ((r)[HOST_CS])
++#define REGS_SS(r) ((r)[HOST_SS])
++#define REGS_DS(r) ((r)[HOST_DS])
++#define REGS_ES(r) ((r)[HOST_ES])
++#define REGS_FS(r) ((r)[HOST_FS])
++#define REGS_GS(r) ((r)[HOST_GS])
++
++#define REGS_SET_SYSCALL_RETURN(r, res) REGS_EAX(r) = (res)
++
++#define REGS_RESTART_SYSCALL(r) IP_RESTART_SYSCALL(REGS_IP(r))
++
++#define REGS_SEGV_IS_FIXABLE(r) SEGV_IS_FIXABLE((r)->trap_type)
++
++#define REGS_FAULT_ADDR(r) ((r)->fault_addr)
++
++#define REGS_FAULT_WRITE(r) FAULT_WRITE((r)->fault_type)
++
++#endif
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
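
The HOST_* indices used above come from the generated include/skas_ptregs.h (see the skas Makefile earlier), so every REGS_* macro is a plain array access into the buffer filled by PTRACE_GETREGS. As a fragment for illustration (pid is a placeholder):

unsigned long regs[HOST_FRAME_SIZE];

if (ptrace(PTRACE_GETREGS, pid, 0, regs) == 0)
        REGS_SET_SYSCALL_RETURN(regs, -ENOSYS);         /* i.e. regs[HOST_EAX] = -ENOSYS */
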
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/include/skas.h um/arch/um/kernel/skas/include/skas.h
+--- orig/arch/um/kernel/skas/include/skas.h    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/include/skas.h      Sun Dec  8 21:00:12 2002
+@@ -0,0 +1,49 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SKAS_H
++#define __SKAS_H
++
++#include "sysdep/ptrace.h"
++
++extern int userspace_pid;
++
++extern void switch_threads(void *me, void *next);
++extern void thread_wait(void *sw, void *fb);
++extern void new_thread(void *stack, void **switch_buf_ptr, void **fork_buf_ptr,
++                       void (*handler)(int));
++extern int start_idle_thread(void *stack, void *switch_buf_ptr, 
++                           void **fork_buf_ptr);
++extern int user_thread(unsigned long stack, int flags);
++extern void userspace(union uml_pt_regs *regs);
++extern void new_thread_proc(void *stack, void (*handler)(int sig));
++extern void remove_sigstack(void);
++extern void new_thread_handler(int sig);
++extern void handle_syscall(union uml_pt_regs *regs);
++extern void map(int fd, unsigned long virt, unsigned long phys, 
++              unsigned long len, int r, int w, int x);
++extern int unmap(int fd, void *addr, int len);
++extern int protect(int fd, unsigned long addr, unsigned long len, 
++                 int r, int w, int x, int must_succeed);
++extern void user_signal(int sig, union uml_pt_regs *regs);
++extern int singlestepping_skas(void);
++extern int new_mm(int from);
++extern void save_registers(union uml_pt_regs *regs);
++extern void restore_registers(union uml_pt_regs *regs);
++extern void start_userspace(void);
++extern void init_registers(int pid);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/include/uaccess.h um/arch/um/kernel/skas/include/uaccess.h
+--- orig/arch/um/kernel/skas/include/uaccess.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/include/uaccess.h   Fri Jan 31 23:05:56 2003
+@@ -0,0 +1,232 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __SKAS_UACCESS_H
++#define __SKAS_UACCESS_H
++
++#include "linux/string.h"
++#include "linux/sched.h"
++#include "asm/processor.h"
++#include "asm/pgtable.h"
++#include "asm/errno.h"
++#include "asm/current.h"
++#include "asm/a.out.h"
++#include "kern_util.h"
++
++#define access_ok_skas(type, addr, size) \
++      ((segment_eq(get_fs(), KERNEL_DS)) || \
++       (((unsigned long) (addr) < TASK_SIZE) && \
++        ((unsigned long) (addr) + (size) <= TASK_SIZE)))
++
++static inline int verify_area_skas(int type, const void * addr, 
++                                 unsigned long size)
++{
++      return(access_ok_skas(type, addr, size) ? 0 : -EFAULT);
++}
++
++static inline unsigned long maybe_map(unsigned long virt, int is_write)
++{
++      pte_t pte;
++
++      void *phys = um_virt_to_phys(current, virt, &pte);
++      int dummy_code;
++
++      if(IS_ERR(phys) || (is_write && !pte_write(pte))){
++              if(!handle_page_fault(virt, 0, is_write, 0, &dummy_code))
++                      return(0);
++              phys = um_virt_to_phys(current, virt, NULL);
++      }
++      return((unsigned long) __va((unsigned long) phys));
++}
++
++static inline int buffer_op(unsigned long addr, int len, 
++                          int (*op)(unsigned long addr, int len, void *arg),
++                          void *arg)
++{
++      int size = min(PAGE_ALIGN(addr) - addr, (unsigned long) len);
++      int remain = len, n;
++
++      n = (*op)(addr, size, arg);
++      if(n != 0)
++              return(n < 0 ? remain : 0);
++
++      addr += size;
++      remain -= size;
++      if(remain == 0) 
++              return(0);
++
++      while(addr < ((addr + remain) & PAGE_MASK)){
++              n = (*op)(addr, PAGE_SIZE, arg);
++              if(n != 0)
++                      return(n < 0 ? remain : 0);
++
++              addr += PAGE_SIZE;
++              remain -= PAGE_SIZE;
++      }
++      if(remain == 0)
++              return(0);
++
++      n = (*op)(addr, remain, arg);
++      if(n != 0)
++              return(n < 0 ? remain : 0);
++      return(0);
++}
++
++static inline int copy_chunk_from_user(unsigned long from, int len, void *arg)
++{
++      unsigned long *to_ptr = arg, to = *to_ptr;
++
++      from = maybe_map(from, 0);
++      if(from == 0)
++              return(-1);
++
++      memcpy((void *) to, (void *) from, len);
++      *to_ptr += len;
++      return(0);
++}
++
++static inline int copy_from_user_skas(void *to, const void *from, int n)
++{
++      if(segment_eq(get_fs(), KERNEL_DS)){
++              memcpy(to, from, n);
++              return(0);
++      }
++
++      return(access_ok_skas(VERIFY_READ, from, n) ?
++             buffer_op((unsigned long) from, n, copy_chunk_from_user, &to) :
++             n);
++}
++
++static inline int copy_chunk_to_user(unsigned long to, int len, void *arg)
++{
++      unsigned long *from_ptr = arg, from = *from_ptr;
++
++      to = maybe_map(to, 1);
++      if(to == 0)
++              return(-1);
++
++      memcpy((void *) to, (void *) from, len);
++      *from_ptr += len;
++      return(0);
++}
++
++static inline int copy_to_user_skas(void *to, const void *from, int n)
++{
++      if(segment_eq(get_fs(), KERNEL_DS)){
++              memcpy(to, from, n);
++              return(0);
++      }
++
++      return(access_ok_skas(VERIFY_WRITE, to, n) ?
++             buffer_op((unsigned long) to, n, copy_chunk_to_user, &from) :
++             n);
++}
++
++static inline int strncpy_chunk_from_user(unsigned long from, int len, 
++                                        void *arg)
++{
++        char **to_ptr = arg, *to = *to_ptr;
++      int n;
++
++      from = maybe_map(from, 0);
++      if(from == 0)
++              return(-1);
++
++      strncpy(to, (void *) from, len);
++      n = strnlen(to, len);
++      *to_ptr += n;
++
++      if(n < len) 
++              return(1);
++      return(0);
++}
++
++static inline int strncpy_from_user_skas(char *dst, const char *src, int count)
++{
++      int n;
++      char *ptr = dst;
++
++      if(segment_eq(get_fs(), KERNEL_DS)){
++              strncpy(dst, src, count);
++              return(strnlen(dst, count));
++      }
++
++      if(!access_ok_skas(VERIFY_READ, src, 1))
++              return(-EFAULT);
++
++      n = buffer_op((unsigned long) src, count, strncpy_chunk_from_user, 
++                    &ptr);
++      if(n != 0)
++              return(-EFAULT);
++      return(strnlen(dst, count));
++}
++
++static inline int clear_chunk(unsigned long addr, int len, void *unused)
++{
++      addr = maybe_map(addr, 1);
++      if(addr == 0) 
++              return(-1);
++
++      memset((void *) addr, 0, len);
++      return(0);
++}
++
++static inline int __clear_user_skas(void *mem, int len)
++{
++      return(buffer_op((unsigned long) mem, len, clear_chunk, NULL));
++}
++
++static inline int clear_user_skas(void *mem, int len)
++{
++      if(segment_eq(get_fs(), KERNEL_DS)){
++              memset(mem, 0, len);
++              return(0);
++      }
++
++      return(access_ok_skas(VERIFY_WRITE, mem, len) ? 
++             buffer_op((unsigned long) mem, len, clear_chunk, NULL) : len);
++}
++
++static inline int strnlen_chunk(unsigned long str, int len, void *arg)
++{
++      int *len_ptr = arg, n;
++
++      str = maybe_map(str, 0);
++      if(str == 0) 
++              return(-1);
++
++      n = strnlen((void *) str, len);
++      *len_ptr += n;
++
++      if(n < len)
++              return(1);
++      return(0);
++}
++
++static inline int strnlen_user_skas(const void *str, int len)
++{
++      int count = 0, n;
++
++      if(segment_eq(get_fs(), KERNEL_DS))
++              return(strnlen(str, len) + 1);
++
++      n = buffer_op((unsigned long) str, len, strnlen_chunk, &count);
++      if(n == 0)
++              return(count + 1);
++      return(-EFAULT);
++}
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
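
buffer_op() above walks an arbitrary [addr, addr+len) range as a partial leading chunk, then whole pages, then a partial trailing chunk, so that maybe_map() can translate and fault in one page at a time. The same arithmetic in isolation, as a sketch only (4 KiB pages assumed; SK_ prefixes avoid clashing with the real kernel macros):

#define SK_PAGE_SIZE    4096UL
#define SK_PAGE_ALIGN(a) (((a) + SK_PAGE_SIZE - 1) & ~(SK_PAGE_SIZE - 1))

static void for_each_chunk(unsigned long addr, unsigned long len,
                           void (*op)(unsigned long addr, unsigned long len))
{
        unsigned long first = SK_PAGE_ALIGN(addr) - addr;       /* bytes to the next page */

        if (first > len)
                first = len;
        if (first) {                            /* partial leading chunk */
                op(addr, first);
                addr += first;
                len -= first;
        }
        while (len >= SK_PAGE_SIZE) {           /* whole pages */
                op(addr, SK_PAGE_SIZE);
                addr += SK_PAGE_SIZE;
                len -= SK_PAGE_SIZE;
        }
        if (len)                                /* partial trailing chunk */
                op(addr, len);
}
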
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/mem.c um/arch/um/kernel/skas/mem.c
+--- orig/arch/um/kernel/skas/mem.c     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/mem.c       Mon Dec 16 21:49:39 2002
+@@ -0,0 +1,30 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/config.h"
++#include "linux/mm.h"
++#include "mem_user.h"
++
++unsigned long set_task_sizes_skas(int arg, unsigned long *host_size_out, 
++                                unsigned long *task_size_out)
++{
++      /* Round up to the nearest 4M */
++      unsigned long top = ROUND_4M((unsigned long) &arg);
++
++      *host_size_out = top;
++      *task_size_out = top;
++      return(((unsigned long) set_task_sizes_skas) & ~0xffffff);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
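
ROUND_4M comes from mem_user.h, which is not part of this hunk; given the comment above, it presumably rounds an address up to the next 4 MB boundary, i.e. something along the lines of:

#define ROUND_4M(n) ((((unsigned long)(n)) + (1UL << 22) - 1) & ~((1UL << 22) - 1))
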
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/mem_user.c um/arch/um/kernel/skas/mem_user.c
+--- orig/arch/um/kernel/skas/mem_user.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/mem_user.c  Tue Dec 31 00:13:18 2002
+@@ -0,0 +1,95 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <errno.h>
++#include <sys/mman.h>
++#include <sys/ptrace.h>
++#include "mem_user.h"
++#include "user.h"
++#include "os.h"
++#include "proc_mm.h"
++
++void map(int fd, unsigned long virt, unsigned long phys, unsigned long len, 
++       int r, int w, int x)
++{
++      struct proc_mm_op map;
++      struct mem_region *region;
++      int prot, n;
++
++      prot = (r ? PROT_READ : 0) | (w ? PROT_WRITE : 0) | 
++              (x ? PROT_EXEC : 0);
++      region = phys_region(phys);
++
++      map = ((struct proc_mm_op) { .op        = MM_MMAP,
++                                   .u         = 
++                                   { .mmap    = 
++                                     { .addr          = virt,
++                                       .len           = len,
++                                       .prot          = prot,
++                                       .flags         = MAP_SHARED | 
++                                                        MAP_FIXED,
++                                       .fd            = region->fd,
++                                       .offset        = phys_offset(phys)
++                                     } } } );
++      n = os_write_file(fd, &map, sizeof(map));
++      if(n != sizeof(map)) 
++              printk("map : /proc/mm map failed, errno = %d\n", errno);
++}
++
++int unmap(int fd, void *addr, int len)
++{
++      struct proc_mm_op unmap;
++      int n;
++
++      unmap = ((struct proc_mm_op) { .op      = MM_MUNMAP,
++                                     .u       = 
++                                     { .munmap        = 
++                                       { .addr        = (unsigned long) addr,
++                                         .len         = len } } } );
++      n = os_write_file(fd, &unmap, sizeof(unmap));
++      if((n != 0) && (n != sizeof(unmap)))
++              return(-errno);
++      return(0);
++}
++
++int protect(int fd, unsigned long addr, unsigned long len, int r, int w, 
++          int x, int must_succeed)
++{
++      struct proc_mm_op protect;
++      int prot, n;
++
++      prot = (r ? PROT_READ : 0) | (w ? PROT_WRITE : 0) | 
++              (x ? PROT_EXEC : 0);
++
++      protect = ((struct proc_mm_op) { .op    = MM_MPROTECT,
++                                     .u       = 
++                                     { .mprotect      = 
++                                       { .addr        = (unsigned long) addr,
++                                         .len         = len,
++                                         .prot        = prot } } } );
++
++      n = os_write_file(fd, &protect, sizeof(protect));
++      if((n != 0) && (n != sizeof(protect))){
++              if(must_succeed)
++                      panic("protect failed, errno = %d", errno);
++              return(-errno);
++      }
++      return(0);
++}
++
++void before_mem_skas(unsigned long unused)
++{
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
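
A usage note on map() above: with the /proc/mm descriptor returned by new_mm() (see skas/process.c below), populating a child address space is a sequence of such calls, for example (mm_fd, virt and phys are placeholders):

map(mm_fd, virt, phys, PAGE_SIZE, 1, 1, 0);     /* readable and writable, not executable */
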
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/mmu.c um/arch/um/kernel/skas/mmu.c
+--- orig/arch/um/kernel/skas/mmu.c     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/mmu.c       Wed Nov 13 13:09:57 2002
+@@ -0,0 +1,44 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/list.h"
++#include "linux/spinlock.h"
++#include "linux/slab.h"
++#include "asm/segment.h"
++#include "asm/mmu.h"
++#include "os.h"
++#include "skas.h"
++
++int init_new_context_skas(struct task_struct *task, struct mm_struct *mm)
++{
++      int from;
++
++      if((current->mm != NULL) && (current->mm != &init_mm))
++              from = current->mm->context.skas.mm_fd;
++      else from = -1;
++
++      mm->context.skas.mm_fd = new_mm(from);
++      if(mm->context.skas.mm_fd < 0)
++              panic("init_new_context_skas - new_mm failed, errno = %d\n",
++                    mm->context.skas.mm_fd);
++
++      return(0);
++}
++
++void destroy_context_skas(struct mm_struct *mm)
++{
++      os_close_file(mm->context.skas.mm_fd);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/process.c um/arch/um/kernel/skas/process.c
+--- orig/arch/um/kernel/skas/process.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/process.c   Wed Mar 26 14:43:19 2003
+@@ -0,0 +1,407 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdlib.h>
++#include <unistd.h>
++#include <errno.h>
++#include <signal.h>
++#include <setjmp.h>
++#include <sched.h>
++#include <sys/wait.h>
++#include <sys/ptrace.h>
++#include <sys/mman.h>
++#include <sys/user.h>
++#include <asm/unistd.h>
++#include "user.h"
++#include "ptrace_user.h"
++#include "time_user.h"
++#include "sysdep/ptrace.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "skas.h"
++#include "sysdep/sigcontext.h"
++#include "os.h"
++#include "proc_mm.h"
++#include "skas_ptrace.h"
++#include "chan_user.h"
++
++int is_skas_winch(int pid, int fd, void *data)
++{
++      if(pid != getpid())
++              return(0);
++
++      register_winch_irq(-1, fd, -1, data);
++      return(1);
++}
++
++unsigned long exec_regs[FRAME_SIZE];
++unsigned long exec_fp_regs[HOST_FP_SIZE];
++unsigned long exec_fpx_regs[HOST_XFP_SIZE];
++int have_fpx_regs = 1;
++
++static void handle_segv(int pid)
++{
++      struct ptrace_faultinfo fault;
++      int err;
++
++      err = ptrace(PTRACE_FAULTINFO, pid, 0, &fault);
++      if(err)
++              panic("handle_segv - PTRACE_FAULTINFO failed, errno = %d\n",
++                    errno);
++
++      segv(fault.addr, 0, FAULT_WRITE(fault.is_write), 1, NULL);
++}
++
++static void handle_trap(int pid, union uml_pt_regs *regs)
++{
++      int err, syscall_nr, status;
++
++      syscall_nr = PT_SYSCALL_NR(regs->skas.regs);
++      if(syscall_nr < 1){
++              relay_signal(SIGTRAP, regs);
++              return;
++      }
++      UPT_SYSCALL_NR(regs) = syscall_nr;
++
++      err = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, __NR_getpid);
++      if(err < 0)
++              panic("handle_trap - nullifying syscall failed, errno = %d\n",
++                    errno);
++
++      err = ptrace(PTRACE_SYSCALL, pid, 0, 0);
++      if(err < 0)
++              panic("handle_trap - continuing to end of syscall failed, "
++                    "errno = %d\n", errno);
++
++      err = waitpid(pid, &status, WUNTRACED);
++      if((err < 0) || !WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP))
++              panic("handle_trap - failed to wait at end of syscall, "
++                    "errno = %d, status = %d\n", errno, status);
++
++      handle_syscall(regs);
++}
++
++static int userspace_tramp(void *arg)
++{
++      init_new_thread_signals(0);
++      enable_timer();
++      ptrace(PTRACE_TRACEME, 0, 0, 0);
++      os_stop_process(os_getpid());
++      return(0);
++}
++
++int userspace_pid;
++
++void start_userspace(void)
++{
++      void *stack;
++      unsigned long sp;
++      int pid, status, n;
++
++      stack = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC,
++                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
++      if(stack == MAP_FAILED)
++              panic("start_userspace : mmap failed, errno = %d", errno);
++      sp = (unsigned long) stack + PAGE_SIZE - sizeof(void *);
++
++      pid = clone(userspace_tramp, (void *) sp, 
++                  CLONE_FILES | CLONE_VM | SIGCHLD, NULL);
++      if(pid < 0)
++              panic("start_userspace : clone failed, errno = %d", errno);
++
++      do {
++              n = waitpid(pid, &status, WUNTRACED);
++              if(n < 0)
++                      panic("start_userspace : wait failed, errno = %d", 
++                            errno);
++      } while(WIFSTOPPED(status) && (WSTOPSIG(status) == SIGVTALRM));
++
++      if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP))
++              panic("start_userspace : expected SIGSTOP, got status = %d",
++                    status);
++
++      if(munmap(stack, PAGE_SIZE) < 0)
++              panic("start_userspace : munmap failed, errno = %d\n", errno);
++
++      userspace_pid = pid;
++}
++
++void userspace(union uml_pt_regs *regs)
++{
++      int err, status, op;
++
++      restore_registers(regs);
++              
++      err = ptrace(PTRACE_SYSCALL, userspace_pid, 0, 0);
++      if(err)
++              panic("userspace - PTRACE_SYSCALL failed, errno = %d\n", 
++                     errno);
++      while(1){
++              err = waitpid(userspace_pid, &status, WUNTRACED);
++              if(err < 0)
++                      panic("userspace - waitpid failed, errno = %d\n", 
++                            errno);
++
++              regs->skas.is_user = 1;
++              save_registers(regs);
++
++              if(WIFSTOPPED(status)){
++                      switch(WSTOPSIG(status)){
++                      case SIGSEGV:
++                              handle_segv(userspace_pid);
++                              break;
++                      case SIGTRAP:
++                              handle_trap(userspace_pid, regs);
++                              break;
++                      case SIGIO:
++                      case SIGVTALRM:
++                      case SIGILL:
++                      case SIGBUS:
++                      case SIGFPE:
++                      case SIGWINCH:
++                              user_signal(WSTOPSIG(status), regs);
++                              break;
++                      default:
++                              printk("userspace - child stopped with signal "
++                                     "%d\n", WSTOPSIG(status));
++                      }
++                      interrupt_end();
++              }
++
++              restore_registers(regs);
++
++              op = singlestepping_skas() ? PTRACE_SINGLESTEP : 
++                      PTRACE_SYSCALL;
++              err = ptrace(op, userspace_pid, 0, 0);
++              if(err)
++                      panic("userspace - PTRACE_SYSCALL failed, "
++                            "errno = %d\n", errno);
++      }
++}
++
++void new_thread(void *stack, void **switch_buf_ptr, void **fork_buf_ptr,
++              void (*handler)(int))
++{
++      jmp_buf switch_buf, fork_buf;
++
++      *switch_buf_ptr = &switch_buf;
++      *fork_buf_ptr = &fork_buf;
++
++      if(setjmp(fork_buf) == 0)
++              new_thread_proc(stack, handler);
++
++      remove_sigstack();
++}
++
++void thread_wait(void *sw, void *fb)
++{
++      jmp_buf buf, **switch_buf = sw, *fork_buf;
++
++      *switch_buf = &buf;
++      fork_buf = fb;
++      if(setjmp(buf) == 0)
++              longjmp(*fork_buf, 1);
++}
++
++static int move_registers(int int_op, int fp_op, union uml_pt_regs *regs,
++                        unsigned long *fp_regs)
++{
++      if(ptrace(int_op, userspace_pid, 0, regs->skas.regs) < 0)
++              return(-errno);
++      if(ptrace(fp_op, userspace_pid, 0, fp_regs) < 0)
++              return(-errno);
++      return(0);
++}
++
++void save_registers(union uml_pt_regs *regs)
++{
++      unsigned long *fp_regs;
++      int err, fp_op;
++
++      if(have_fpx_regs){
++              fp_op = PTRACE_GETFPXREGS;
++              fp_regs = regs->skas.xfp;
++      }
++      else {
++              fp_op = PTRACE_GETFPREGS;
++              fp_regs = regs->skas.fp;
++      }
++
++      err = move_registers(PTRACE_GETREGS, fp_op, regs, fp_regs);
++      if(err)
++              panic("save_registers - saving registers failed, errno = %d\n",
++                    err);
++}
++
++void restore_registers(union uml_pt_regs *regs)
++{
++      unsigned long *fp_regs;
++      int err, fp_op;
++
++      if(have_fpx_regs){
++              fp_op = PTRACE_SETFPXREGS;
++              fp_regs = regs->skas.xfp;
++      }
++      else {
++              fp_op = PTRACE_SETFPREGS;
++              fp_regs = regs->skas.fp;
++      }
++
++      err = move_registers(PTRACE_SETREGS, fp_op, regs, fp_regs);
++      if(err)
++              panic("restore_registers - restoring registers failed, "
++                    "errno = %d\n", err);
++}
++
++void switch_threads(void *me, void *next)
++{
++      jmp_buf my_buf, **me_ptr = me, *next_buf = next;
++      
++      *me_ptr = &my_buf;
++      if(setjmp(my_buf) == 0)
++              longjmp(*next_buf, 1);
++}
++
++static jmp_buf initial_jmpbuf;
++
++/* XXX Make these percpu */
++static void (*cb_proc)(void *arg);
++static void *cb_arg;
++static jmp_buf *cb_back;
++
++int start_idle_thread(void *stack, void *switch_buf_ptr, void **fork_buf_ptr)
++{
++      jmp_buf **switch_buf = switch_buf_ptr;
++      int n;
++
++      *fork_buf_ptr = &initial_jmpbuf;
++      n = setjmp(initial_jmpbuf);
++      if(n == 0)
++              new_thread_proc((void *) stack, new_thread_handler);
++      else if(n == 1)
++              remove_sigstack();
++      else if(n == 2){
++              (*cb_proc)(cb_arg);
++              longjmp(*cb_back, 1);
++      }
++      else if(n == 3){
++              kmalloc_ok = 0;
++              return(0);
++      }
++      else if(n == 4){
++              kmalloc_ok = 0;
++              return(1);
++      }
++      longjmp(**switch_buf, 1);
++}
++
++void remove_sigstack(void)
++{
++      stack_t stack = ((stack_t) { .ss_flags  = SS_DISABLE,
++                                   .ss_sp     = NULL,
++                                   .ss_size   = 0 });
++
++      if(sigaltstack(&stack, NULL) != 0)
++              panic("disabling signal stack failed, errno = %d\n", errno);
++}
++
++void initial_thread_cb_skas(void (*proc)(void *), void *arg)
++{
++      jmp_buf here;
++
++      cb_proc = proc;
++      cb_arg = arg;
++      cb_back = &here;
++
++      block_signals();
++      if(setjmp(here) == 0)
++              longjmp(initial_jmpbuf, 2);
++      unblock_signals();
++
++      cb_proc = NULL;
++      cb_arg = NULL;
++      cb_back = NULL;
++}
++
++void halt_skas(void)
++{
++      block_signals();
++      longjmp(initial_jmpbuf, 3);
++}
++
++void reboot_skas(void)
++{
++      block_signals();
++      longjmp(initial_jmpbuf, 4);
++}
++
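++/* Ask the host's /proc/mm interface for a new address space.  The returned
++ * file descriptor identifies the mm; if 'from' is a valid descriptor, its
++ * segments are copied into the new address space first.
++ */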
++int new_mm(int from)
++{
++      struct proc_mm_op copy;
++      int n, fd = os_open_file("/proc/mm", of_write(OPENFLAGS()), 0);
++
++      if(fd < 0)
++              return(-errno);
++
++      if(from != -1){
++              copy = ((struct proc_mm_op) { .op       = MM_COPY_SEGMENTS,
++                                            .u        = 
++                                            { .copy_segments  = from } } );
++              n = os_write_file(fd, &copy, sizeof(copy));
++              if(n != sizeof(copy)) 
++                      printk("new_mm : /proc/mm copy_segments failed, "
++                             "errno = %d\n", errno);
++      }
++      return(fd);
++}
++
++void switch_mm_skas(int mm_fd)
++{
++      int err;
++
++      err = ptrace(PTRACE_SWITCH_MM, userspace_pid, 0, mm_fd);
++      if(err)
++              panic("switch_mm_skas - PTRACE_SWITCH_MM failed, errno = %d\n",
++                    errno);
++}
++
++void kill_off_processes_skas(void)
++{
++      os_kill_process(userspace_pid, 1);
++}
++
++void init_registers(int pid)
++{
++      int err;
++
++      if(ptrace(PTRACE_GETREGS, pid, 0, exec_regs) < 0)
++		panic("init_registers : PTRACE_GETREGS failed, errno = %d",
++                    errno);
++
++      err = ptrace(PTRACE_GETFPXREGS, pid, 0, exec_fpx_regs);
++      if(!err)
++              return;
++
++      have_fpx_regs = 0;
++      if(errno != EIO)
++		panic("init_registers : PTRACE_GETFPXREGS failed, errno = %d",
++                    errno);
++
++      err = ptrace(PTRACE_GETFPREGS, pid, 0, exec_fp_regs);
++      if(err)
++		panic("init_registers : PTRACE_GETFPREGS failed, errno = %d",
++                    errno);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/process_kern.c um/arch/um/kernel/skas/process_kern.c
+--- orig/arch/um/kernel/skas/process_kern.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/process_kern.c      Sun Dec 22 20:37:39 2002
+@@ -0,0 +1,191 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/sched.h"
++#include "linux/slab.h"
++#include "kern_util.h"
++#include "time_user.h"
++#include "signal_user.h"
++#include "skas.h"
++#include "os.h"
++#include "user_util.h"
++#include "tlb.h"
++#include "frame.h"
++#include "kern.h"
++#include "mode.h"
++
++int singlestepping_skas(void)
++{
++      int ret = current->ptrace & PT_DTRACE;
++
++      current->ptrace &= ~PT_DTRACE;
++      return(ret);
++}
++
++void *_switch_to_skas(void *prev, void *next)
++{
++      struct task_struct *from, *to;
++
++      from = prev;
++      to = next;
++
++      /* XXX need to check runqueues[cpu].idle */
++      if(current->pid == 0)
++              switch_timers(0);
++
++      to->thread.prev_sched = from;
++      set_current(to);
++
++      switch_threads(&from->thread.mode.skas.switch_buf, 
++                     to->thread.mode.skas.switch_buf);
++
++      if(current->pid == 0)
++              switch_timers(1);
++
++      return(current->thread.prev_sched);
++}
++
++extern void schedule_tail(struct task_struct *prev);
++
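++/* First code run by a newly created kernel thread: wait to be scheduled in,
++ * run the requested function, then either drop into userspace or exit,
++ * depending on its return value.
++ */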
++void new_thread_handler(int sig)
++{
++      int (*fn)(void *), n;
++      void *arg;
++
++      fn = current->thread.request.u.thread.proc;
++      arg = current->thread.request.u.thread.arg;
++      change_sig(SIGUSR1, 1);
++      thread_wait(&current->thread.mode.skas.switch_buf, 
++                  current->thread.mode.skas.fork_buf);
++
++      if(current->thread.prev_sched != NULL)
++              schedule_tail(current->thread.prev_sched);
++      current->thread.prev_sched = NULL;
++
++      n = run_kernel_thread(fn, arg, &current->thread.exec_buf);
++      if(n == 1)
++              userspace(&current->thread.regs.regs);
++      else do_exit(0);
++}
++
++void new_thread_proc(void *stack, void (*handler)(int sig))
++{
++      init_new_thread_stack(stack, handler);
++      os_usr1_process(os_getpid());
++}
++
++void release_thread_skas(struct task_struct *task)
++{
++}
++
++void exit_thread_skas(void)
++{
++}
++
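++/* Completion path for a forked process: wait to be scheduled in, resync the
++ * host mappings with force_flush_all(), and return to userspace.
++ */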
++void fork_handler(int sig)
++{
++	change_sig(SIGUSR1, 1);
++      thread_wait(&current->thread.mode.skas.switch_buf, 
++                  current->thread.mode.skas.fork_buf);
++      
++      force_flush_all();
++      if(current->thread.prev_sched != NULL)
++              schedule_tail(current->thread.prev_sched);
++      current->thread.prev_sched = NULL;
++      unblock_signals();
++
++      userspace(&current->thread.regs.regs);
++}
++
++int copy_thread_skas(int nr, unsigned long clone_flags, unsigned long sp,
++                   unsigned long stack_top, struct task_struct * p, 
++                   struct pt_regs *regs)
++{
++      void (*handler)(int);
++
++      if(current->thread.forking){
++              memcpy(&p->thread.regs.regs.skas, 
++                     &current->thread.regs.regs.skas, 
++                     sizeof(p->thread.regs.regs.skas));
++              REGS_SET_SYSCALL_RETURN(p->thread.regs.regs.skas.regs, 0);
++              if(sp != 0) REGS_SP(p->thread.regs.regs.skas.regs) = sp;
++
++              handler = fork_handler;
++      }
++      else {
++              memcpy(p->thread.regs.regs.skas.regs, exec_regs, 
++                     sizeof(p->thread.regs.regs.skas.regs));
++              memcpy(p->thread.regs.regs.skas.fp, exec_fp_regs, 
++                     sizeof(p->thread.regs.regs.skas.fp));
++              memcpy(p->thread.regs.regs.skas.xfp, exec_fpx_regs, 
++                     sizeof(p->thread.regs.regs.skas.xfp));
++		p->thread.request.u.thread = current->thread.request.u.thread;
++              handler = new_thread_handler;
++      }
++
++      new_thread((void *) p->thread.kernel_stack, 
++                 &p->thread.mode.skas.switch_buf, 
++                 &p->thread.mode.skas.fork_buf, handler);
++      return(0);
++}
++
++void init_idle_skas(void)
++{
++      cpu_tasks[current->processor].pid = os_getpid();
++}
++
++extern void start_kernel(void);
++
++static int start_kernel_proc(void *unused)
++{
++      int pid;
++
++      block_signals();
++      pid = os_getpid();
++
++      cpu_tasks[0].pid = pid;
++      cpu_tasks[0].task = current;
++#ifdef CONFIG_SMP
++      cpu_online_map = 1;
++#endif
++      start_kernel();
++      return(0);
++}
++
++int start_uml_skas(void)
++{
++      start_userspace();
++      capture_signal_stack();
++
++      init_new_thread_signals(1);
++      idle_timer();
++
++      init_task.thread.request.u.thread.proc = start_kernel_proc;
++      init_task.thread.request.u.thread.arg = NULL;
++      return(start_idle_thread((void *) init_task.thread.kernel_stack,
++                               &init_task.thread.mode.skas.switch_buf,
++                               &init_task.thread.mode.skas.fork_buf));
++}
++
++int external_pid_skas(struct task_struct *task)
++{
++      return(userspace_pid);
++}
++
++int thread_pid_skas(struct thread_struct *thread)
++{
++      return(userspace_pid);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/sys-i386/Makefile um/arch/um/kernel/skas/sys-i386/Makefile
+--- orig/arch/um/kernel/skas/sys-i386/Makefile Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/sys-i386/Makefile   Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,17 @@
++# 
++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++O_TARGET = sys-i386.o
++
++obj-y = sigcontext.o
++
++USER_OBJS = sigcontext.o
++
++include $(TOPDIR)/Rules.make
++
++$(USER_OBJS) : %.o: %.c
++      $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $<
++
++clean :
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/sys-i386/sigcontext.c um/arch/um/kernel/skas/sys-i386/sigcontext.c
+--- orig/arch/um/kernel/skas/sys-i386/sigcontext.c     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/sys-i386/sigcontext.c       Sun Dec  8 20:38:46 2002
+@@ -0,0 +1,115 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <errno.h>
++#include <asm/sigcontext.h>
++#include <sys/ptrace.h>
++#include <linux/ptrace.h>
++#include "sysdep/ptrace.h"
++#include "sysdep/ptrace_user.h"
++#include "kern_util.h"
++#include "user.h"
++#include "sigcontext.h"
++
++extern int userspace_pid;
++
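++/* Load a signal frame supplied by the process back into the skas register
++ * set: copy the sigcontext and FP state from user space, fill in the
++ * registers and fault fields, and push the FP state into the child with
++ * PTRACE_SETFPREGS.
++ */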
++int copy_sc_from_user_skas(union uml_pt_regs *regs, void *from_ptr)
++{
++      struct sigcontext sc, *from = from_ptr;
++      unsigned long fpregs[FP_FRAME_SIZE];
++      int err;
++
++      err = copy_from_user_proc(&sc, from, sizeof(sc));
++      err |= copy_from_user_proc(fpregs, sc.fpstate, sizeof(fpregs));
++      if(err)
++              return(err);
++
++      regs->skas.regs[GS] = sc.gs;
++      regs->skas.regs[FS] = sc.fs;
++      regs->skas.regs[ES] = sc.es;
++      regs->skas.regs[DS] = sc.ds;
++      regs->skas.regs[EDI] = sc.edi;
++      regs->skas.regs[ESI] = sc.esi;
++      regs->skas.regs[EBP] = sc.ebp;
++      regs->skas.regs[UESP] = sc.esp;
++      regs->skas.regs[EBX] = sc.ebx;
++      regs->skas.regs[EDX] = sc.edx;
++      regs->skas.regs[ECX] = sc.ecx;
++      regs->skas.regs[EAX] = sc.eax;
++      regs->skas.regs[EIP] = sc.eip;
++      regs->skas.regs[CS] = sc.cs;
++      regs->skas.regs[EFL] = sc.eflags;
++      regs->skas.regs[UESP] = sc.esp_at_signal;
++      regs->skas.regs[SS] = sc.ss;
++      regs->skas.fault_addr = sc.cr2;
++      regs->skas.fault_type = FAULT_WRITE(sc.err);
++      regs->skas.trap_type = sc.trapno;
++
++      err = ptrace(PTRACE_SETFPREGS, userspace_pid, 0, fpregs);
++      if(err < 0){
++		printk("copy_sc_from_user - PTRACE_SETFPREGS failed, "
++		       "errno = %d\n", errno);
++              return(1);
++      }
++
++      return(0);
++}
++
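++/* Build a sigcontext for delivery to the process: fill it in from the skas
++ * registers and fault information, fetch the FP state from the child with
++ * PTRACE_GETFPREGS, and copy both out to user space.
++ */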
++int copy_sc_to_user_skas(void *to_ptr, void *fp, union uml_pt_regs *regs, 
++                       unsigned long fault_addr, int fault_type)
++{
++      struct sigcontext sc, *to = to_ptr;
++      struct _fpstate *to_fp;
++      unsigned long fpregs[FP_FRAME_SIZE];
++      int err;
++
++      sc.gs = regs->skas.regs[GS];
++      sc.fs = regs->skas.regs[FS];
++      sc.es = regs->skas.regs[ES];
++      sc.ds = regs->skas.regs[DS];
++      sc.edi = regs->skas.regs[EDI];
++      sc.esi = regs->skas.regs[ESI];
++      sc.ebp = regs->skas.regs[EBP];
++      sc.esp = regs->skas.regs[UESP];
++      sc.ebx = regs->skas.regs[EBX];
++      sc.edx = regs->skas.regs[EDX];
++      sc.ecx = regs->skas.regs[ECX];
++      sc.eax = regs->skas.regs[EAX];
++      sc.eip = regs->skas.regs[EIP];
++      sc.cs = regs->skas.regs[CS];
++      sc.eflags = regs->skas.regs[EFL];
++      sc.esp_at_signal = regs->skas.regs[UESP];
++      sc.ss = regs->skas.regs[SS];
++      sc.cr2 = fault_addr;
++      sc.err = TO_SC_ERR(fault_type);
++      sc.trapno = regs->skas.trap_type;
++
++      err = ptrace(PTRACE_GETFPREGS, userspace_pid, 0, fpregs);
++      if(err < 0){
++              printk("copy_sc_to_user - PTRACE_GETFPREGS failed, "
++                     "errno = %d\n", errno);
++              return(1);
++      }
++      to_fp = (struct _fpstate *) 
++              (fp ? (unsigned long) fp : ((unsigned long) to + sizeof(*to)));
++      sc.fpstate = to_fp;
++
++      if(err)
++              return(err);
++
++      return(copy_to_user_proc(to, &sc, sizeof(sc)) ||
++             copy_to_user_proc(to_fp, fpregs, sizeof(fpregs)));
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/syscall_kern.c um/arch/um/kernel/skas/syscall_kern.c
+--- orig/arch/um/kernel/skas/syscall_kern.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/syscall_kern.c      Sun Dec  8 21:01:44 2002
+@@ -0,0 +1,42 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/sys.h"
++#include "asm/errno.h"
++#include "asm/unistd.h"
++#include "asm/ptrace.h"
++#include "asm/current.h"
++#include "sysdep/syscalls.h"
++#include "kern_util.h"
++
++extern syscall_handler_t *sys_call_table[];
++
++long execute_syscall_skas(void *r)
++{
++      struct pt_regs *regs = r;
++      long res;
++      int syscall;
++
++      current->thread.nsyscalls++;
++      nsyscalls++;
++      syscall = UPT_SYSCALL_NR(&regs->regs);
++
++      if((syscall >= NR_syscalls) || (syscall < 0))
++              res = -ENOSYS;
++      else res = EXECUTE_SYSCALL(syscall, regs);
++
++      return(res);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/syscall_user.c um/arch/um/kernel/skas/syscall_user.c
+--- orig/arch/um/kernel/skas/syscall_user.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/syscall_user.c      Sun Dec  8 21:00:12 2002
+@@ -0,0 +1,46 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdlib.h>
++#include <signal.h>
++#include "kern_util.h"
++#include "syscall_user.h"
++#include "sysdep/ptrace.h"
++#include "sysdep/sigcontext.h"
++
++/* XXX Bogus */
++#define ERESTARTSYS   512
++#define ERESTARTNOINTR        513
++#define ERESTARTNOHAND        514
++
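++/* Run one system call on behalf of the process: record it, show the tracer
++ * the entry and exit, execute the call, store the result in the registers,
++ * and hand off to do_signal() when the result asks for a restart.
++ */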
++void handle_syscall(union uml_pt_regs *regs)
++{
++      long result;
++      int index;
++
++      index = record_syscall_start(UPT_SYSCALL_NR(regs));
++
++      syscall_trace();
++      result = execute_syscall(regs);
++
++      REGS_SET_SYSCALL_RETURN(regs->skas.regs, result);
++      if((result == -ERESTARTNOHAND) || (result == -ERESTARTSYS) || 
++         (result == -ERESTARTNOINTR))
++              do_signal(result);
++
++      syscall_trace();
++      record_syscall_end(index, result);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/time.c um/arch/um/kernel/skas/time.c
+--- orig/arch/um/kernel/skas/time.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/time.c      Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,30 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <sys/signal.h>
++#include <sys/time.h>
++#include "time_user.h"
++#include "process.h"
++#include "user.h"
++
++void user_time_init_skas(void)
++{
++	if(signal(SIGALRM, (__sighandler_t) alarm_handler) == SIG_ERR)
++		panic("Couldn't set SIGALRM handler");
++      if(signal(SIGVTALRM, (__sighandler_t) alarm_handler) == SIG_ERR)
++              panic("Couldn't set SIGVTALRM handler");
++      set_interval(ITIMER_VIRTUAL);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/tlb.c um/arch/um/kernel/skas/tlb.c
+--- orig/arch/um/kernel/skas/tlb.c     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/tlb.c       Sun Dec 22 18:30:35 2002
+@@ -0,0 +1,153 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/stddef.h"
++#include "linux/sched.h"
++#include "asm/page.h"
++#include "asm/pgtable.h"
++#include "asm/mmu.h"
++#include "user_util.h"
++#include "mem_user.h"
++#include "skas.h"
++#include "os.h"
++
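++/* Walk the page tables for [start_addr, end_addr) and bring the host's
++ * mappings for this mm (identified by its /proc/mm descriptor) into line
++ * with them: unmap stale pages, map present ones with the permissions
++ * encoded in the pte, and update protections where only those changed.
++ */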
++static void fix_range(struct mm_struct *mm, unsigned long start_addr,
++                    unsigned long end_addr, int force)
++{
++      pgd_t *npgd;
++      pmd_t *npmd;
++      pte_t *npte;
++      unsigned long addr;
++      int r, w, x, err, fd;
++
++      if(mm == NULL) return;
++      fd = mm->context.skas.mm_fd;
++      for(addr = start_addr; addr < end_addr;){
++              npgd = pgd_offset(mm, addr);
++              npmd = pmd_offset(npgd, addr);
++              if(pmd_present(*npmd)){
++                      npte = pte_offset(npmd, addr);
++                      r = pte_read(*npte);
++                      w = pte_write(*npte);
++                      x = pte_exec(*npte);
++                      if(!pte_dirty(*npte)) w = 0;
++                      if(!pte_young(*npte)){
++                              r = 0;
++                              w = 0;
++                      }
++                      if(force || pte_newpage(*npte)){
++                              err = unmap(fd, (void *) addr, PAGE_SIZE);
++                              if(err < 0)
++                                      panic("munmap failed, errno = %d\n",
++                                            -err);
++                              if(pte_present(*npte))
++                                      map(fd, addr, 
++                                          pte_val(*npte) & PAGE_MASK,
++                                          PAGE_SIZE, r, w, x);
++                      }
++                      else if(pte_newprot(*npte)){
++                              protect(fd, addr, PAGE_SIZE, r, w, x, 1);
++                      }
++                      *npte = pte_mkuptodate(*npte);
++                      addr += PAGE_SIZE;
++              }
++              else {
++                      if(force || pmd_newpage(*npmd)){
++                              err = unmap(fd, (void *) addr, PMD_SIZE);
++                              if(err < 0)
++                                      panic("munmap failed, errno = %d\n",
++                                            -err);
++                              pmd_mkuptodate(*npmd);
++                      }
++                      addr += PMD_SIZE;
++              }
++      }
++}
++
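++/* Same idea as fix_range(), but for init_mm's kernel virtual area, using
++ * direct os_unmap_memory()/map_memory() calls rather than a /proc/mm
++ * descriptor.
++ */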
++static void flush_kernel_vm_range(unsigned long start, unsigned long end)
++{
++      struct mm_struct *mm;
++      pgd_t *pgd;
++      pmd_t *pmd;
++      pte_t *pte;
++      unsigned long addr;
++      int updated = 0, err;
++
++      mm = &init_mm;
++	for(addr = start; addr < end;){
++              pgd = pgd_offset(mm, addr);
++              pmd = pmd_offset(pgd, addr);
++              if(pmd_present(*pmd)){
++                      pte = pte_offset(pmd, addr);
++                      if(!pte_present(*pte) || pte_newpage(*pte)){
++                              updated = 1;
++                              err = os_unmap_memory((void *) addr, 
++                                                    PAGE_SIZE);
++                              if(err < 0)
++                                      panic("munmap failed, errno = %d\n",
++                                            -err);
++                              if(pte_present(*pte))
++                                      map_memory(addr, 
++                                                 pte_val(*pte) & PAGE_MASK,
++                                                 PAGE_SIZE, 1, 1, 1);
++                      }
++                      else if(pte_newprot(*pte)){
++                              updated = 1;
++                              protect_memory(addr, PAGE_SIZE, 1, 1, 1, 1);
++                      }
++                      addr += PAGE_SIZE;
++              }
++              else {
++                      if(pmd_newpage(*pmd)){
++                              updated = 1;
++                              err = os_unmap_memory((void *) addr, PMD_SIZE);
++                              if(err < 0)
++                                      panic("munmap failed, errno = %d\n",
++                                            -err);
++                      }
++                      addr += PMD_SIZE;
++              }
++      }
++}
++
++void flush_tlb_kernel_vm_skas(void)
++{
++      flush_kernel_vm_range(start_vm, end_vm);
++}
++
++void __flush_tlb_one_skas(unsigned long addr)
++{
++      flush_kernel_vm_range(addr, addr + PAGE_SIZE);
++}
++
++void flush_tlb_range_skas(struct mm_struct *mm, unsigned long start, 
++                   unsigned long end)
++{
++      if(mm == NULL)
++              flush_kernel_vm_range(start, end);
++      else fix_range(mm, start, end, 0);
++}
++
++void flush_tlb_mm_skas(struct mm_struct *mm)
++{
++      flush_tlb_kernel_vm_skas();
++      fix_range(mm, 0, host_task_size, 0);
++}
++
++void force_flush_all_skas(void)
++{
++      fix_range(current->mm, 0, host_task_size, 1);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/trap_user.c um/arch/um/kernel/skas/trap_user.c
+--- orig/arch/um/kernel/skas/trap_user.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/trap_user.c Sun Dec 15 13:28:41 2002
+@@ -0,0 +1,65 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <signal.h>
++#include <errno.h>
++#include <asm/sigcontext.h>
++#include "sysdep/ptrace.h"
++#include "signal_user.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "task.h"
++#include "sigcontext.h"
++
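++/* Common signal entry for skas mode: capture the fault information from the
++ * sigcontext into the task's register set, then dispatch to the handler
++ * registered in sig_info[], unblocking signals first unless the signal is
++ * an IRQ.
++ */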
++void sig_handler_common_skas(int sig, void *sc_ptr)
++{
++      struct sigcontext *sc = sc_ptr;
++      struct skas_regs *r;
++      struct signal_info *info;
++      int save_errno = errno;
++
++      r = &TASK_REGS(get_current())->skas;
++      r->is_user = 0;
++      r->fault_addr = SC_FAULT_ADDR(sc);
++      r->fault_type = SC_FAULT_TYPE(sc);
++      r->trap_type = SC_TRAP_TYPE(sc);
++
++      change_sig(SIGUSR1, 1);
++      info = &sig_info[sig];
++      if(!info->is_irq) unblock_signals();
++
++      (*info->handler)(sig, (union uml_pt_regs *) r);
++
++      errno = save_errno;
++}
++
++extern int missed_ticks[];
++
++void user_signal(int sig, union uml_pt_regs *regs)
++{
++      struct signal_info *info;
++
++      if(sig == SIGVTALRM)
++              missed_ticks[cpu()]++;
++      regs->skas.is_user = 1;
++      regs->skas.fault_addr = 0;
++      regs->skas.fault_type = 0;
++      regs->skas.trap_type = 0;
++      info = &sig_info[sig];
++      (*info->handler)(sig, regs);
++
++      unblock_signals();
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/util/Makefile um/arch/um/kernel/skas/util/Makefile
+--- orig/arch/um/kernel/skas/util/Makefile     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/util/Makefile       Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,10 @@
++all: mk_ptregs
++
++mk_ptregs : mk_ptregs.o
++      $(CC) -o mk_ptregs mk_ptregs.o
++
++mk_ptregs.o : mk_ptregs.c
++      $(CC) -c $< 
++
++clean : 
++      $(RM) -f mk_ptregs *.o *~
+diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/util/mk_ptregs.c um/arch/um/kernel/skas/util/mk_ptregs.c
+--- orig/arch/um/kernel/skas/util/mk_ptregs.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/skas/util/mk_ptregs.c    Mon Nov 11 12:10:06 2002
+@@ -0,0 +1,50 @@
++#include <stdio.h>
++#include <asm/ptrace.h>
++#include <asm/user.h>
++
++#define PRINT_REG(name, val) printf("#define HOST_%s %d\n", (name), (val))
++
++int main(int argc, char **argv)
++{
++      printf("/* Automatically generated by "
++             "arch/um/kernel/skas/util/mk_ptregs */\n");
++      printf("\n");
++      printf("#ifndef __SKAS_PT_REGS_\n");
++      printf("#define __SKAS_PT_REGS_\n");
++      printf("\n");
++      printf("#define HOST_FRAME_SIZE %d\n", FRAME_SIZE);
++      printf("#define HOST_FP_SIZE %d\n", 
++             sizeof(struct user_i387_struct) / sizeof(unsigned long));
++      printf("#define HOST_XFP_SIZE %d\n", 
++             sizeof(struct user_fxsr_struct) / sizeof(unsigned long));
++
++      PRINT_REG("IP", EIP);
++      PRINT_REG("SP", UESP);
++      PRINT_REG("EFLAGS", EFL);
++      PRINT_REG("EAX", EAX);
++      PRINT_REG("EBX", EBX);
++      PRINT_REG("ECX", ECX);
++      PRINT_REG("EDX", EDX);
++      PRINT_REG("ESI", ESI);
++      PRINT_REG("EDI", EDI);
++      PRINT_REG("EBP", EBP);
++      PRINT_REG("CS", CS);
++      PRINT_REG("SS", SS);
++      PRINT_REG("DS", DS);
++      PRINT_REG("FS", FS);
++      PRINT_REG("ES", ES);
++      PRINT_REG("GS", GS);
++      printf("\n");
++      printf("#endif\n");
++      return(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/smp.c um/arch/um/kernel/smp.c
+--- orig/arch/um/kernel/smp.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/smp.c    Sat Feb 22 14:28:45 2003
+@@ -0,0 +1,324 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/config.h"
++
++
++#ifdef CONFIG_SMP
++
++#include "linux/sched.h"
++#include "linux/threads.h"
++#include "linux/interrupt.h"
++#include "asm/smp.h"
++#include "asm/processor.h"
++#include "asm/spinlock.h"
++#include "asm/softirq.h"
++#include "asm/hardirq.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "irq_user.h"
++#include "kern.h"
++#include "os.h"
++
++/* Total count of live CPUs, set by smp_boot_cpus */
++int smp_num_cpus = 1;
++
++/* The 'big kernel lock' */
++spinlock_cacheline_t kernel_flag_cacheline = {SPIN_LOCK_UNLOCKED};
++
++/* Per CPU bogomips and other parameters */
++
++/* The only piece used here is the ipi pipe, which is set before SMP is
++ * started and never changed.
++ */
++struct cpuinfo_um cpu_data[NR_CPUS];
++
++/* CPU online map, set by smp_boot_cpus */
++unsigned long cpu_online_map;
++
++atomic_t global_bh_count;
++
++/* Set when the idlers are all forked */
++int smp_threads_ready = 0;
++
++/* Not used by UML */
++unsigned char global_irq_holder = 0;
++unsigned volatile long global_irq_lock;
++
++/* A statistic, can be a little off */
++static int num_reschedules_sent = 0;
++
++void smp_send_reschedule(int cpu)
++{
++      write(cpu_data[cpu].ipi_pipe[1], "R", 1);
++      num_reschedules_sent++;
++}
++
++static void show(char * str)
++{
++      int cpu = smp_processor_id();
++
++      printk(KERN_INFO "\n%s, CPU %d:\n", str, cpu);
++}
++      
++#define MAXCOUNT 100000000
++
++static inline void wait_on_bh(void)
++{
++      int count = MAXCOUNT;
++      do {
++              if (!--count) {
++                      show("wait_on_bh");
++                      count = ~0;
++              }
++              /* nothing .. wait for the other bh's to go away */
++      } while (atomic_read(&global_bh_count) != 0);
++}
++
++/*
++ * This is called when we want to synchronize with
++ * bottom half handlers. We need to wait until
++ * no other CPU is executing any bottom half handler.
++ *
++ * Don't wait if we're already running in an interrupt
++ * context or are inside a bh handler. 
++ */
++void synchronize_bh(void)
++{
++      if (atomic_read(&global_bh_count) && !in_interrupt())
++              wait_on_bh();
++}
++
++void smp_send_stop(void)
++{
++      int i;
++ 
++      printk(KERN_INFO "Stopping all CPUs...");
++      for(i = 0; i < ncpus; i++){
++              if(i == current->processor)
++                      continue;
++              write(cpu_data[i].ipi_pipe[1], "S", 1);
++      }
++      printk("done\n");
++}
++
++
++static atomic_t smp_commenced = ATOMIC_INIT(0);
++static volatile unsigned long smp_callin_map = 0;
++
++void smp_commence(void)
++{
++      printk("All CPUs are go!\n");
++
++      wmb();
++      atomic_set(&smp_commenced, 1);
++}
++
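++/* Body of each secondary CPU's idle thread: set up its IPI pipe, mark the
++ * CPU as called in, wait for smp_commence(), then enter cpu_idle().
++ */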
++static int idle_proc(void *unused)
++{
++      int cpu, err;
++
++      set_current(current);
++      del_from_runqueue(current);
++      unhash_process(current);
++
++      cpu = current->processor;
++      err = os_pipe(cpu_data[cpu].ipi_pipe, 1, 1);
++      if(err)
++              panic("CPU#%d failed to create IPI pipe, errno = %d", cpu, 
++                    -err);
++
++      activate_ipi(cpu_data[cpu].ipi_pipe[0], 
++                   current->thread.mode.tt.extern_pid);
++ 
++      wmb();
++      if (test_and_set_bit(current->processor, &smp_callin_map)) {
++              printk("huh, CPU#%d already present??\n", current->processor);
++              BUG();
++      }
++
++      while (!atomic_read(&smp_commenced))
++              cpu_relax();
++
++      init_idle();
++      cpu_idle();
++      return(0);
++}
++
++static int idle_thread(int (*fn)(void *), int cpu)
++{
++      struct task_struct *new_task;
++      int pid;
++      unsigned char c;
++
++	current->thread.request.u.thread.proc = fn;
++	current->thread.request.u.thread.arg = NULL;
++      pid = do_fork(CLONE_VM | CLONE_PID, 0, NULL, 0);
++      if(pid < 0) panic("do_fork failed in idle_thread");
++      new_task = get_task(pid, 1);
++
++      cpu_tasks[cpu].pid = new_task->thread.mode.tt.extern_pid;
++      cpu_tasks[cpu].task = new_task;
++      init_tasks[cpu] = new_task;
++      new_task->processor = cpu;
++      new_task->cpus_allowed = 1 << cpu;
++      new_task->cpus_runnable = new_task->cpus_allowed;
++      CHOOSE_MODE(write(new_task->thread.mode.tt.switch_pipe[1], &c, 
++                        sizeof(c)),
++                  ({ panic("skas mode doesn't support SMP"); }));
++      return(new_task->thread.mode.tt.extern_pid);
++}
++
++void smp_boot_cpus(void)
++{
++      int err;
++
++      set_bit(0, &cpu_online_map);
++      set_bit(0, &smp_callin_map);
++
++      err = os_pipe(cpu_data[0].ipi_pipe, 1, 1);
++      if(err) panic("CPU#0 failed to create IPI pipe, errno = %d", -err);
++
++      activate_ipi(cpu_data[0].ipi_pipe[0], 
++                   current->thread.mode.tt.extern_pid);
++
++      if(ncpus < 1){
++              printk(KERN_INFO "ncpus set to 1\n");
++              ncpus = 1;
++      }
++      else if(ncpus > NR_CPUS){
++              printk(KERN_INFO 
++                     "ncpus can't be greater than NR_CPUS, set to %d\n",
++                     NR_CPUS);
++              ncpus = NR_CPUS;
++      }
++
++      if(ncpus > 1){
++              int i, pid;
++
++              printk(KERN_INFO "Starting up other processors:\n");
++              for(i=1;i<ncpus;i++){
++                      int waittime;
++
++                      /* Do this early, for hard_smp_processor_id()  */
++                      cpu_tasks[i].pid = -1;
++                      set_bit(i, &cpu_online_map);
++                      smp_num_cpus++;
++
++                      pid = idle_thread(idle_proc, i);
++                      printk(KERN_INFO "\t#%d - idle thread pid = %d.. ",
++                             i, pid);
++
++                      waittime = 200000000;
++                      while (waittime-- && !test_bit(i, &smp_callin_map))
++                              cpu_relax();
++
++                      if (test_bit(i, &smp_callin_map))
++                              printk("online\n");
++                      else {
++                              printk("failed\n");
++                              clear_bit(i, &cpu_online_map);
++                      }
++              }
++      }
++}
++
++int setup_profiling_timer(unsigned int multiplier)
++{
++      printk(KERN_INFO "setup_profiling_timer\n");
++      return(0);
++}
++
++void smp_call_function_slave(int cpu);
++
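++/* Drain this CPU's IPI pipe and act on each single-character command:
++ * 'C' runs the queued smp_call_function() payload, 'R' requests a
++ * reschedule, and 'S' stops the CPU.
++ */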
++void IPI_handler(int cpu)
++{
++      unsigned char c;
++      int fd;
++
++      fd = cpu_data[cpu].ipi_pipe[0];
++      while (read(fd, &c, 1) == 1) {
++              switch (c) {
++              case 'C':
++                      smp_call_function_slave(cpu);
++                      break;
++
++              case 'R':
++                      current->need_resched = 1;
++                      break;
++
++              case 'S':
++                      printk("CPU#%d stopping\n", cpu);
++                      while(1)
++                              pause();
++                      break;
++
++              default:
++                      printk("CPU#%d received unknown IPI [%c]!\n", cpu, c);
++                      break;
++              }
++      }
++}
++
++int hard_smp_processor_id(void)
++{
++      return(pid_to_processor_id(os_getpid()));
++}
++
++static spinlock_t call_lock = SPIN_LOCK_UNLOCKED;
++static atomic_t scf_started;
++static atomic_t scf_finished;
++static void (*func)(void *info);
++static void *info;
++
++void smp_call_function_slave(int cpu)
++{
++      atomic_inc(&scf_started);
++      (*func)(info);
++      atomic_inc(&scf_finished);
++}
++
++int smp_call_function(void (*_func)(void *info), void *_info, int nonatomic, 
++                    int wait)
++{
++      int cpus = smp_num_cpus - 1;
++      int i;
++
++      if (!cpus)
++              return 0;
++
++      spin_lock_bh(&call_lock);
++      atomic_set(&scf_started, 0);
++      atomic_set(&scf_finished, 0);
++      func = _func;
++      info = _info;
++
++      for (i=0;i<NR_CPUS;i++)
++              if (i != current->processor && test_bit(i, &cpu_online_map))
++                      write(cpu_data[i].ipi_pipe[1], "C", 1);
++
++      while (atomic_read(&scf_started) != cpus)
++              barrier();
++
++      if (wait)
++              while (atomic_read(&scf_finished) != cpus)
++                      barrier();
++
++      spin_unlock_bh(&call_lock);
++      return 0;
++}
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/sys_call_table.c um/arch/um/kernel/sys_call_table.c
+--- orig/arch/um/kernel/sys_call_table.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/sys_call_table.c Thu Feb 27 13:33:23 2003
+@@ -0,0 +1,485 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/unistd.h"
++#include "linux/version.h"
++#include "linux/sys.h"
++#include "asm/signal.h"
++#include "sysdep/syscalls.h"
++#include "kern_util.h"
++
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_exit;
++extern syscall_handler_t sys_fork;
++extern syscall_handler_t sys_creat;
++extern syscall_handler_t sys_link;
++extern syscall_handler_t sys_unlink;
++extern syscall_handler_t sys_chdir;
++extern syscall_handler_t sys_mknod;
++extern syscall_handler_t sys_chmod;
++extern syscall_handler_t sys_lchown16;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_stat;
++extern syscall_handler_t sys_getpid;
++extern syscall_handler_t sys_oldumount;
++extern syscall_handler_t sys_setuid16;
++extern syscall_handler_t sys_getuid16;
++extern syscall_handler_t sys_ptrace;
++extern syscall_handler_t sys_alarm;
++extern syscall_handler_t sys_fstat;
++extern syscall_handler_t sys_pause;
++extern syscall_handler_t sys_utime;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_access;
++extern syscall_handler_t sys_nice;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_sync;
++extern syscall_handler_t sys_kill;
++extern syscall_handler_t sys_rename;
++extern syscall_handler_t sys_mkdir;
++extern syscall_handler_t sys_rmdir;
++extern syscall_handler_t sys_pipe;
++extern syscall_handler_t sys_times;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_brk;
++extern syscall_handler_t sys_setgid16;
++extern syscall_handler_t sys_getgid16;
++extern syscall_handler_t sys_signal;
++extern syscall_handler_t sys_geteuid16;
++extern syscall_handler_t sys_getegid16;
++extern syscall_handler_t sys_acct;
++extern syscall_handler_t sys_umount;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_ioctl;
++extern syscall_handler_t sys_fcntl;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_setpgid;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_olduname;
++extern syscall_handler_t sys_umask;
++extern syscall_handler_t sys_chroot;
++extern syscall_handler_t sys_ustat;
++extern syscall_handler_t sys_dup2;
++extern syscall_handler_t sys_getppid;
++extern syscall_handler_t sys_getpgrp;
++extern syscall_handler_t sys_sigaction;
++extern syscall_handler_t sys_sgetmask;
++extern syscall_handler_t sys_ssetmask;
++extern syscall_handler_t sys_setreuid16;
++extern syscall_handler_t sys_setregid16;
++extern syscall_handler_t sys_sigsuspend;
++extern syscall_handler_t sys_sigpending;
++extern syscall_handler_t sys_sethostname;
++extern syscall_handler_t sys_setrlimit;
++extern syscall_handler_t sys_old_getrlimit;
++extern syscall_handler_t sys_getrusage;
++extern syscall_handler_t sys_gettimeofday;
++extern syscall_handler_t sys_settimeofday;
++extern syscall_handler_t sys_getgroups16;
++extern syscall_handler_t sys_setgroups16;
++extern syscall_handler_t sys_symlink;
++extern syscall_handler_t sys_lstat;
++extern syscall_handler_t sys_readlink;
++extern syscall_handler_t sys_uselib;
++extern syscall_handler_t sys_swapon;
++extern syscall_handler_t sys_reboot;
++extern syscall_handler_t old_readdir;
++extern syscall_handler_t sys_munmap;
++extern syscall_handler_t sys_truncate;
++extern syscall_handler_t sys_ftruncate;
++extern syscall_handler_t sys_fchmod;
++extern syscall_handler_t sys_fchown16;
++extern syscall_handler_t sys_getpriority;
++extern syscall_handler_t sys_setpriority;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_statfs;
++extern syscall_handler_t sys_fstatfs;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_socketcall;
++extern syscall_handler_t sys_syslog;
++extern syscall_handler_t sys_setitimer;
++extern syscall_handler_t sys_getitimer;
++extern syscall_handler_t sys_newstat;
++extern syscall_handler_t sys_newlstat;
++extern syscall_handler_t sys_newfstat;
++extern syscall_handler_t sys_uname;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_vhangup;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_swapoff;
++extern syscall_handler_t sys_sysinfo;
++extern syscall_handler_t sys_ipc;
++extern syscall_handler_t sys_fsync;
++extern syscall_handler_t sys_sigreturn;
++extern syscall_handler_t sys_rt_sigreturn;
++extern syscall_handler_t sys_clone;
++extern syscall_handler_t sys_setdomainname;
++extern syscall_handler_t sys_newuname;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_adjtimex;
++extern syscall_handler_t sys_mprotect;
++extern syscall_handler_t sys_sigprocmask;
++extern syscall_handler_t sys_create_module;
++extern syscall_handler_t sys_init_module;
++extern syscall_handler_t sys_delete_module;
++extern syscall_handler_t sys_get_kernel_syms;
++extern syscall_handler_t sys_quotactl;
++extern syscall_handler_t sys_getpgid;
++extern syscall_handler_t sys_fchdir;
++extern syscall_handler_t sys_bdflush;
++extern syscall_handler_t sys_sysfs;
++extern syscall_handler_t sys_personality;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_setfsuid16;
++extern syscall_handler_t sys_setfsgid16;
++extern syscall_handler_t sys_llseek;
++extern syscall_handler_t sys_getdents;
++extern syscall_handler_t sys_flock;
++extern syscall_handler_t sys_msync;
++extern syscall_handler_t sys_readv;
++extern syscall_handler_t sys_writev;
++extern syscall_handler_t sys_getsid;
++extern syscall_handler_t sys_fdatasync;
++extern syscall_handler_t sys_sysctl;
++extern syscall_handler_t sys_mlock;
++extern syscall_handler_t sys_munlock;
++extern syscall_handler_t sys_mlockall;
++extern syscall_handler_t sys_munlockall;
++extern syscall_handler_t sys_sched_setparam;
++extern syscall_handler_t sys_sched_getparam;
++extern syscall_handler_t sys_sched_setscheduler;
++extern syscall_handler_t sys_sched_getscheduler;
++extern syscall_handler_t sys_sched_get_priority_max;
++extern syscall_handler_t sys_sched_get_priority_min;
++extern syscall_handler_t sys_sched_rr_get_interval;
++extern syscall_handler_t sys_nanosleep;
++extern syscall_handler_t sys_mremap;
++extern syscall_handler_t sys_setresuid16;
++extern syscall_handler_t sys_getresuid16;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_query_module;
++extern syscall_handler_t sys_poll;
++extern syscall_handler_t sys_nfsservctl;
++extern syscall_handler_t sys_setresgid16;
++extern syscall_handler_t sys_getresgid16;
++extern syscall_handler_t sys_prctl;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_rt_sigaction;
++extern syscall_handler_t sys_rt_sigprocmask;
++extern syscall_handler_t sys_rt_sigpending;
++extern syscall_handler_t sys_rt_sigtimedwait;
++extern syscall_handler_t sys_rt_sigqueueinfo;
++extern syscall_handler_t sys_rt_sigsuspend;
++extern syscall_handler_t sys_pread;
++extern syscall_handler_t sys_pwrite;
++extern syscall_handler_t sys_chown16;
++extern syscall_handler_t sys_getcwd;
++extern syscall_handler_t sys_capget;
++extern syscall_handler_t sys_capset;
++extern syscall_handler_t sys_sigaltstack;
++extern syscall_handler_t sys_sendfile;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_ni_syscall;
++extern syscall_handler_t sys_vfork;
++extern syscall_handler_t sys_getrlimit;
++extern syscall_handler_t sys_mmap2;
++extern syscall_handler_t sys_truncate64;
++extern syscall_handler_t sys_ftruncate64;
++extern syscall_handler_t sys_stat64;
++extern syscall_handler_t sys_lstat64;
++extern syscall_handler_t sys_fstat64;
++extern syscall_handler_t sys_lchown;
++extern syscall_handler_t sys_getuid;
++extern syscall_handler_t sys_getgid;
++extern syscall_handler_t sys_geteuid;
++extern syscall_handler_t sys_getegid;
++extern syscall_handler_t sys_setreuid;
++extern syscall_handler_t sys_setregid;
++extern syscall_handler_t sys_getgroups;
++extern syscall_handler_t sys_setgroups;
++extern syscall_handler_t sys_fchown;
++extern syscall_handler_t sys_setresuid;
++extern syscall_handler_t sys_getresuid;
++extern syscall_handler_t sys_setresgid;
++extern syscall_handler_t sys_getresgid;
++extern syscall_handler_t sys_chown;
++extern syscall_handler_t sys_setuid;
++extern syscall_handler_t sys_setgid;
++extern syscall_handler_t sys_setfsuid;
++extern syscall_handler_t sys_setfsgid;
++extern syscall_handler_t sys_pivot_root;
++extern syscall_handler_t sys_mincore;
++extern syscall_handler_t sys_madvise;
++extern syscall_handler_t sys_fcntl64;
++extern syscall_handler_t sys_getdents64;
++extern syscall_handler_t sys_gettid;
++extern syscall_handler_t sys_readahead;
++extern syscall_handler_t sys_tkill;
++extern syscall_handler_t sys_setxattr;
++extern syscall_handler_t sys_lsetxattr;
++extern syscall_handler_t sys_fsetxattr;
++extern syscall_handler_t sys_getxattr;
++extern syscall_handler_t sys_lgetxattr;
++extern syscall_handler_t sys_fgetxattr;
++extern syscall_handler_t sys_listxattr;
++extern syscall_handler_t sys_llistxattr;
++extern syscall_handler_t sys_flistxattr;
++extern syscall_handler_t sys_removexattr;
++extern syscall_handler_t sys_lremovexattr;
++extern syscall_handler_t sys_fremovexattr;
++
++extern syscall_handler_t um_mount;
++extern syscall_handler_t um_time;
++extern syscall_handler_t um_stime;
++
++#define LAST_GENERIC_SYSCALL __NR_sched_getaffinity
++
++#if LAST_GENERIC_SYSCALL > LAST_ARCH_SYSCALL
++#define LAST_SYSCALL LAST_GENERIC_SYSCALL
++#else
++#define LAST_SYSCALL LAST_ARCH_SYSCALL
++#endif
++
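++/* The system call dispatch table, indexed by syscall number.  Calls UML
++ * does not implement point at sys_ni_syscall, ARCH_SYSCALLS supplies the
++ * architecture-specific entries, and the tail up to NR_syscalls defaults
++ * to sys_ni_syscall as well.
++ */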
++syscall_handler_t *sys_call_table[] = {
++      [ 0 ] = sys_ni_syscall,
++      [ __NR_exit ] = sys_exit,
++      [ __NR_fork ] = sys_fork,
++      [ __NR_read ] = (syscall_handler_t *) sys_read,
++      [ __NR_write ] = (syscall_handler_t *) sys_write,
++
++      /* These three are declared differently in asm/unistd.h */
++      [ __NR_open ] = (syscall_handler_t *) sys_open,
++      [ __NR_close ] = (syscall_handler_t *) sys_close,
++      [ __NR_waitpid ] = (syscall_handler_t *) sys_waitpid,
++      [ __NR_creat ] = sys_creat,
++      [ __NR_link ] = sys_link,
++      [ __NR_unlink ] = sys_unlink,
++
++      /* declared differently in kern_util.h */
++      [ __NR_execve ] = (syscall_handler_t *) sys_execve,
++      [ __NR_chdir ] = sys_chdir,
++      [ __NR_time ] = um_time,
++      [ __NR_mknod ] = sys_mknod,
++      [ __NR_chmod ] = sys_chmod,
++      [ __NR_lchown ] = sys_lchown16,
++      [ __NR_break ] = sys_ni_syscall,
++      [ __NR_oldstat ] = sys_stat,
++      [ __NR_lseek ] = (syscall_handler_t *) sys_lseek,
++      [ __NR_getpid ] = sys_getpid,
++      [ __NR_mount ] = um_mount,
++      [ __NR_umount ] = sys_oldumount,
++      [ __NR_setuid ] = sys_setuid16,
++      [ __NR_getuid ] = sys_getuid16,
++      [ __NR_stime ] = um_stime,
++      [ __NR_ptrace ] = sys_ptrace,
++      [ __NR_alarm ] = sys_alarm,
++      [ __NR_oldfstat ] = sys_fstat,
++      [ __NR_pause ] = sys_pause,
++      [ __NR_utime ] = sys_utime,
++      [ __NR_stty ] = sys_ni_syscall,
++      [ __NR_gtty ] = sys_ni_syscall,
++      [ __NR_access ] = sys_access,
++      [ __NR_nice ] = sys_nice,
++      [ __NR_ftime ] = sys_ni_syscall,
++      [ __NR_sync ] = sys_sync,
++      [ __NR_kill ] = sys_kill,
++      [ __NR_rename ] = sys_rename,
++      [ __NR_mkdir ] = sys_mkdir,
++      [ __NR_rmdir ] = sys_rmdir,
++
++      /* Declared differently in asm/unistd.h */
++      [ __NR_dup ] = (syscall_handler_t *) sys_dup,
++      [ __NR_pipe ] = sys_pipe,
++      [ __NR_times ] = sys_times,
++      [ __NR_prof ] = sys_ni_syscall,
++      [ __NR_brk ] = sys_brk,
++      [ __NR_setgid ] = sys_setgid16,
++      [ __NR_getgid ] = sys_getgid16,
++      [ __NR_signal ] = sys_signal,
++      [ __NR_geteuid ] = sys_geteuid16,
++      [ __NR_getegid ] = sys_getegid16,
++      [ __NR_acct ] = sys_acct,
++      [ __NR_umount2 ] = sys_umount,
++      [ __NR_lock ] = sys_ni_syscall,
++      [ __NR_ioctl ] = sys_ioctl,
++      [ __NR_fcntl ] = sys_fcntl,
++      [ __NR_mpx ] = sys_ni_syscall,
++      [ __NR_setpgid ] = sys_setpgid,
++      [ __NR_ulimit ] = sys_ni_syscall,
++      [ __NR_oldolduname ] = sys_olduname,
++      [ __NR_umask ] = sys_umask,
++      [ __NR_chroot ] = sys_chroot,
++      [ __NR_ustat ] = sys_ustat,
++      [ __NR_dup2 ] = sys_dup2,
++      [ __NR_getppid ] = sys_getppid,
++      [ __NR_getpgrp ] = sys_getpgrp,
++      [ __NR_setsid ] = (syscall_handler_t *) sys_setsid,
++      [ __NR_sigaction ] = sys_sigaction,
++      [ __NR_sgetmask ] = sys_sgetmask,
++      [ __NR_ssetmask ] = sys_ssetmask,
++      [ __NR_setreuid ] = sys_setreuid16,
++      [ __NR_setregid ] = sys_setregid16,
++      [ __NR_sigsuspend ] = sys_sigsuspend,
++      [ __NR_sigpending ] = sys_sigpending,
++      [ __NR_sethostname ] = sys_sethostname,
++      [ __NR_setrlimit ] = sys_setrlimit,
++      [ __NR_getrlimit ] = sys_old_getrlimit,
++      [ __NR_getrusage ] = sys_getrusage,
++      [ __NR_gettimeofday ] = sys_gettimeofday,
++      [ __NR_settimeofday ] = sys_settimeofday,
++      [ __NR_getgroups ] = sys_getgroups16,
++      [ __NR_setgroups ] = sys_setgroups16,
++      [ __NR_symlink ] = sys_symlink,
++      [ __NR_oldlstat ] = sys_lstat,
++      [ __NR_readlink ] = sys_readlink,
++      [ __NR_uselib ] = sys_uselib,
++      [ __NR_swapon ] = sys_swapon,
++      [ __NR_reboot ] = sys_reboot,
++      [ __NR_readdir ] = old_readdir,
++      [ __NR_munmap ] = sys_munmap,
++      [ __NR_truncate ] = sys_truncate,
++      [ __NR_ftruncate ] = sys_ftruncate,
++      [ __NR_fchmod ] = sys_fchmod,
++      [ __NR_fchown ] = sys_fchown16,
++      [ __NR_getpriority ] = sys_getpriority,
++      [ __NR_setpriority ] = sys_setpriority,
++      [ __NR_profil ] = sys_ni_syscall,
++      [ __NR_statfs ] = sys_statfs,
++      [ __NR_fstatfs ] = sys_fstatfs,
++      [ __NR_ioperm ] = sys_ni_syscall,
++      [ __NR_socketcall ] = sys_socketcall,
++      [ __NR_syslog ] = sys_syslog,
++      [ __NR_setitimer ] = sys_setitimer,
++      [ __NR_getitimer ] = sys_getitimer,
++      [ __NR_stat ] = sys_newstat,
++      [ __NR_lstat ] = sys_newlstat,
++      [ __NR_fstat ] = sys_newfstat,
++      [ __NR_olduname ] = sys_uname,
++      [ __NR_iopl ] = sys_ni_syscall,
++      [ __NR_vhangup ] = sys_vhangup,
++      [ __NR_idle ] = sys_ni_syscall,
++      [ __NR_wait4 ] = (syscall_handler_t *) sys_wait4,
++      [ __NR_swapoff ] = sys_swapoff,
++      [ __NR_sysinfo ] = sys_sysinfo,
++      [ __NR_ipc ] = sys_ipc,
++      [ __NR_fsync ] = sys_fsync,
++      [ __NR_sigreturn ] = sys_sigreturn,
++      [ __NR_clone ] = sys_clone,
++      [ __NR_setdomainname ] = sys_setdomainname,
++      [ __NR_uname ] = sys_newuname,
++      [ __NR_adjtimex ] = sys_adjtimex,
++      [ __NR_mprotect ] = sys_mprotect,
++      [ __NR_sigprocmask ] = sys_sigprocmask,
++      [ __NR_create_module ] = sys_create_module,
++      [ __NR_init_module ] = sys_init_module,
++      [ __NR_delete_module ] = sys_delete_module,
++      [ __NR_get_kernel_syms ] = sys_get_kernel_syms,
++      [ __NR_quotactl ] = sys_quotactl,
++      [ __NR_getpgid ] = sys_getpgid,
++      [ __NR_fchdir ] = sys_fchdir,
++      [ __NR_bdflush ] = sys_bdflush,
++      [ __NR_sysfs ] = sys_sysfs,
++      [ __NR_personality ] = sys_personality,
++      [ __NR_afs_syscall ] = sys_ni_syscall,
++      [ __NR_setfsuid ] = sys_setfsuid16,
++      [ __NR_setfsgid ] = sys_setfsgid16,
++      [ __NR__llseek ] = sys_llseek,
++      [ __NR_getdents ] = sys_getdents,
++      [ __NR__newselect ] = (syscall_handler_t *) sys_select,
++      [ __NR_flock ] = sys_flock,
++      [ __NR_msync ] = sys_msync,
++      [ __NR_readv ] = sys_readv,
++      [ __NR_writev ] = sys_writev,
++      [ __NR_getsid ] = sys_getsid,
++      [ __NR_fdatasync ] = sys_fdatasync,
++      [ __NR__sysctl ] = sys_sysctl,
++      [ __NR_mlock ] = sys_mlock,
++      [ __NR_munlock ] = sys_munlock,
++      [ __NR_mlockall ] = sys_mlockall,
++      [ __NR_munlockall ] = sys_munlockall,
++      [ __NR_sched_setparam ] = sys_sched_setparam,
++      [ __NR_sched_getparam ] = sys_sched_getparam,
++      [ __NR_sched_setscheduler ] = sys_sched_setscheduler,
++      [ __NR_sched_getscheduler ] = sys_sched_getscheduler,
++      [ __NR_sched_yield ] = (syscall_handler_t *) yield,
++      [ __NR_sched_get_priority_max ] = sys_sched_get_priority_max,
++      [ __NR_sched_get_priority_min ] = sys_sched_get_priority_min,
++      [ __NR_sched_rr_get_interval ] = sys_sched_rr_get_interval,
++      [ __NR_nanosleep ] = sys_nanosleep,
++      [ __NR_mremap ] = sys_mremap,
++      [ __NR_setresuid ] = sys_setresuid16,
++      [ __NR_getresuid ] = sys_getresuid16,
++      [ __NR_vm86 ] = sys_ni_syscall,
++      [ __NR_query_module ] = sys_query_module,
++      [ __NR_poll ] = sys_poll,
++      [ __NR_nfsservctl ] = sys_nfsservctl,
++      [ __NR_setresgid ] = sys_setresgid16,
++      [ __NR_getresgid ] = sys_getresgid16,
++      [ __NR_prctl ] = sys_prctl,
++      [ __NR_rt_sigreturn ] = sys_rt_sigreturn,
++      [ __NR_rt_sigaction ] = sys_rt_sigaction,
++      [ __NR_rt_sigprocmask ] = sys_rt_sigprocmask,
++      [ __NR_rt_sigpending ] = sys_rt_sigpending,
++      [ __NR_rt_sigtimedwait ] = sys_rt_sigtimedwait,
++      [ __NR_rt_sigqueueinfo ] = sys_rt_sigqueueinfo,
++      [ __NR_rt_sigsuspend ] = sys_rt_sigsuspend,
++      [ __NR_pread ] = sys_pread,
++      [ __NR_pwrite ] = sys_pwrite,
++      [ __NR_chown ] = sys_chown16,
++      [ __NR_getcwd ] = sys_getcwd,
++      [ __NR_capget ] = sys_capget,
++      [ __NR_capset ] = sys_capset,
++      [ __NR_sigaltstack ] = sys_sigaltstack,
++      [ __NR_sendfile ] = sys_sendfile,
++      [ __NR_getpmsg ] = sys_ni_syscall,
++      [ __NR_putpmsg ] = sys_ni_syscall,
++      [ __NR_vfork ] = sys_vfork,
++      [ __NR_ugetrlimit ] = sys_getrlimit,
++      [ __NR_mmap2 ] = sys_mmap2,
++      [ __NR_truncate64 ] = sys_truncate64,
++      [ __NR_ftruncate64 ] = sys_ftruncate64,
++      [ __NR_stat64 ] = sys_stat64,
++      [ __NR_lstat64 ] = sys_lstat64,
++      [ __NR_fstat64 ] = sys_fstat64,
++      [ __NR_fcntl64 ] = sys_fcntl64,
++      [ __NR_getdents64 ] = sys_getdents64,
++	[ __NR_security ] = sys_ni_syscall,
++      [ __NR_gettid ] = sys_gettid,
++      [ __NR_readahead ] = sys_readahead,
++      [ __NR_setxattr ] = sys_setxattr,
++      [ __NR_lsetxattr ] = sys_lsetxattr,
++      [ __NR_fsetxattr ] = sys_fsetxattr,
++      [ __NR_getxattr ] = sys_getxattr,
++      [ __NR_lgetxattr ] = sys_lgetxattr,
++      [ __NR_fgetxattr ] = sys_fgetxattr,
++      [ __NR_listxattr ] = sys_listxattr,
++      [ __NR_llistxattr ] = sys_llistxattr,
++      [ __NR_flistxattr ] = sys_flistxattr,
++      [ __NR_removexattr ] = sys_removexattr,
++      [ __NR_lremovexattr ] = sys_lremovexattr,
++      [ __NR_fremovexattr ] = sys_fremovexattr,
++      [ __NR_tkill ] = sys_tkill,
++      [ __NR_sendfile64 ] = sys_ni_syscall,
++      [ __NR_futex ] = sys_ni_syscall,
++      [ __NR_sched_setaffinity ] = sys_ni_syscall,
++      [ __NR_sched_getaffinity ] = sys_ni_syscall,
++
++      ARCH_SYSCALLS
++      [ LAST_SYSCALL + 1 ... NR_syscalls ] = 
++              (syscall_handler_t *) sys_ni_syscall
++};
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/syscall_kern.c um/arch/um/kernel/syscall_kern.c
+--- orig/arch/um/kernel/syscall_kern.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/syscall_kern.c   Fri Nov  8 14:04:10 2002
+@@ -0,0 +1,343 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/sched.h"
++#include "linux/file.h"
++#include "linux/smp_lock.h"
++#include "linux/mm.h"
++#include "linux/utsname.h"
++#include "linux/msg.h"
++#include "linux/shm.h"
++#include "linux/sys.h"
++#include "linux/unistd.h"
++#include "linux/slab.h"
++#include "linux/utime.h"
++#include "asm/mman.h"
++#include "asm/uaccess.h"
++#include "asm/ipc.h"
++#include "kern_util.h"
++#include "user_util.h"
++#include "sysdep/syscalls.h"
++#include "mode_kern.h"
++#include "choose-mode.h"
++
++/*  Unlocked, I don't care if this is a bit off */
++int nsyscalls = 0;
++
++long um_mount(char * dev_name, char * dir_name, char * type,
++            unsigned long new_flags, void * data)
++{
++      if(type == NULL) type = "";
++      return(sys_mount(dev_name, dir_name, type, new_flags, data));
++}
++
++long sys_fork(void)
++{
++      long ret;
++
++      current->thread.forking = 1;
++      ret = do_fork(SIGCHLD, 0, NULL, 0);
++      current->thread.forking = 0;
++      return(ret);
++}
++
++long sys_clone(unsigned long clone_flags, unsigned long newsp)
++{
++      long ret;
++
++      current->thread.forking = 1;
++      ret = do_fork(clone_flags, newsp, NULL, 0);
++      current->thread.forking = 0;
++      return(ret);
++}
++
++long sys_vfork(void)
++{
++      long ret;
++
++      current->thread.forking = 1;
++      ret = do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, NULL, 0);
++      current->thread.forking = 0;
++      return(ret);
++}
++
++/* common code for old and new mmaps */
++long do_mmap2(struct mm_struct *mm, unsigned long addr, unsigned long len,
++            unsigned long prot, unsigned long flags, unsigned long fd,
++            unsigned long pgoff)
++{
++      int error = -EBADF;
++      struct file * file = NULL;
++
++      flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
++      if (!(flags & MAP_ANONYMOUS)) {
++              file = fget(fd);
++              if (!file)
++                      goto out;
++      }
++
++      down_write(&mm->mmap_sem);
++      error = do_mmap_pgoff(mm, file, addr, len, prot, flags, pgoff);
++      up_write(&mm->mmap_sem);
++
++      if (file)
++              fput(file);
++ out:
++      return error;
++}
++
++long sys_mmap2(unsigned long addr, unsigned long len,
++             unsigned long prot, unsigned long flags,
++             unsigned long fd, unsigned long pgoff)
++{
++      return do_mmap2(current->mm, addr, len, prot, flags, fd, pgoff);
++}
++
++/*
++ * Perform the select(nd, in, out, ex, tv) and mmap() system
++ * calls. Linux/i386 didn't use to be able to handle more than
++ * 4 system call parameters, so these system calls used a memory
++ * block for parameter passing..
++ */
++
++struct mmap_arg_struct {
++      unsigned long addr;
++      unsigned long len;
++      unsigned long prot;
++      unsigned long flags;
++      unsigned long fd;
++      unsigned long offset;
++};
++
++int old_mmap(unsigned long addr, unsigned long len,
++           unsigned long prot, unsigned long flags,
++           unsigned long fd, unsigned long offset)
++{
++      int err = -EINVAL;
++      if (offset & ~PAGE_MASK)
++              goto out;
++
++      err = do_mmap2(current->mm, addr, len, prot, flags, fd, 
++                     offset >> PAGE_SHIFT);
++ out:
++      return err;
++}
++/*
++ * sys_pipe() is the normal C calling standard for creating
++ * a pipe. It's not the way unix traditionally does this, though.
++ */
++int sys_pipe(unsigned long * fildes)
++{
++        int fd[2];
++        int error;
++
++        error = do_pipe(fd);
++        if (!error) {
++                if (copy_to_user(fildes, fd, 2*sizeof(int)))
++                        error = -EFAULT;
++        }
++        return error;
++}
++
++int sys_pause(void)
++{
++      current->state = TASK_INTERRUPTIBLE;
++      schedule();
++      return -ERESTARTNOHAND;
++}
++
++int sys_sigaction(int sig, const struct old_sigaction *act,
++                       struct old_sigaction *oact)
++{
++      struct k_sigaction new_ka, old_ka;
++      int ret;
++
++      if (act) {
++              old_sigset_t mask;
++              if (verify_area(VERIFY_READ, act, sizeof(*act)) ||
++                  __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
++                  __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
++                      return -EFAULT;
++              __get_user(new_ka.sa.sa_flags, &act->sa_flags);
++              __get_user(mask, &act->sa_mask);
++              siginitset(&new_ka.sa.sa_mask, mask);
++      }
++
++      ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
++
++      if (!ret && oact) {
++              if (verify_area(VERIFY_WRITE, oact, sizeof(*oact)) ||
++                  __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
++                  __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
++                      return -EFAULT;
++              __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
++              __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
++      }
++
++      return ret;
++}
++
++/*
++ * sys_ipc() is the de-multiplexer for the SysV IPC calls..
++ *
++ * This is really horribly ugly.
++ */
++int sys_ipc (uint call, int first, int second,
++           int third, void *ptr, long fifth)
++{
++      int version, ret;
++
++      version = call >> 16; /* hack for backward compatibility */
++      call &= 0xffff;
++
++      switch (call) {
++      case SEMOP:
++              return sys_semop (first, (struct sembuf *)ptr, second);
++      case SEMGET:
++              return sys_semget (first, second, third);
++      case SEMCTL: {
++              union semun fourth;
++              if (!ptr)
++                      return -EINVAL;
++              if (get_user(fourth.__pad, (void **) ptr))
++                      return -EFAULT;
++              return sys_semctl (first, second, third, fourth);
++      }
++
++      case MSGSND:
++              return sys_msgsnd (first, (struct msgbuf *) ptr, 
++                                 second, third);
++      case MSGRCV:
++              switch (version) {
++              case 0: {
++                      struct ipc_kludge tmp;
++                      if (!ptr)
++                              return -EINVAL;
++                      
++                      if (copy_from_user(&tmp,
++                                         (struct ipc_kludge *) ptr, 
++                                         sizeof (tmp)))
++                              return -EFAULT;
++                      return sys_msgrcv (first, tmp.msgp, second,
++                                         tmp.msgtyp, third);
++              }
++              default:
++                      panic("msgrcv with version != 0");
++                      return sys_msgrcv (first,
++                                         (struct msgbuf *) ptr,
++                                         second, fifth, third);
++              }
++      case MSGGET:
++              return sys_msgget ((key_t) first, second);
++      case MSGCTL:
++              return sys_msgctl (first, second, (struct msqid_ds *) ptr);
++
++      case SHMAT:
++              switch (version) {
++              default: {
++                      ulong raddr;
++                      ret = sys_shmat (first, (char *) ptr, second, &raddr);
++                      if (ret)
++                              return ret;
++                      return put_user (raddr, (ulong *) third);
++              }
++              case 1: /* iBCS2 emulator entry point */
++                      if (!segment_eq(get_fs(), get_ds()))
++                              return -EINVAL;
++                      return sys_shmat (first, (char *) ptr, second, (ulong *) third);
++              }
++      case SHMDT: 
++              return sys_shmdt ((char *)ptr);
++      case SHMGET:
++              return sys_shmget (first, second, third);
++      case SHMCTL:
++              return sys_shmctl (first, second,
++                                 (struct shmid_ds *) ptr);
++      default:
++              return -EINVAL;
++      }
++}
++
++int sys_uname(struct old_utsname * name)
++{
++      int err;
++      if (!name)
++              return -EFAULT;
++      down_read(&uts_sem);
++      err=copy_to_user(name, &system_utsname, sizeof (*name));
++      up_read(&uts_sem);
++      return err?-EFAULT:0;
++}
++
++int sys_olduname(struct oldold_utsname * name)
++{
++      int error;
++
++      if (!name)
++              return -EFAULT;
++      if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
++              return -EFAULT;
++  
++      down_read(&uts_sem);
++      
++      error = __copy_to_user(&name->sysname,&system_utsname.sysname,
++                             __OLD_UTS_LEN);
++      error |= __put_user(0,name->sysname+__OLD_UTS_LEN);
++      error |= __copy_to_user(&name->nodename,&system_utsname.nodename,
++                              __OLD_UTS_LEN);
++      error |= __put_user(0,name->nodename+__OLD_UTS_LEN);
++      error |= __copy_to_user(&name->release,&system_utsname.release,
++                              __OLD_UTS_LEN);
++      error |= __put_user(0,name->release+__OLD_UTS_LEN);
++      error |= __copy_to_user(&name->version,&system_utsname.version,
++                              __OLD_UTS_LEN);
++      error |= __put_user(0,name->version+__OLD_UTS_LEN);
++      error |= __copy_to_user(&name->machine,&system_utsname.machine,
++                              __OLD_UTS_LEN);
++      error |= __put_user(0,name->machine+__OLD_UTS_LEN);
++      
++      up_read(&uts_sem);
++      
++      error = error ? -EFAULT : 0;
++
++      return error;
++}
++
++int sys_sigaltstack(const stack_t *uss, stack_t *uoss)
++{
++      return(do_sigaltstack(uss, uoss, PT_REGS_SP(&current->thread.regs)));
++}
++
++long execute_syscall(void *r)
++{
++      return(CHOOSE_MODE_PROC(execute_syscall_tt, execute_syscall_skas, r));
++}
++
++spinlock_t syscall_lock = SPIN_LOCK_UNLOCKED;
++
++static int syscall_index = 0;
++
++int next_syscall_index(int limit)
++{
++      int ret;
++
++      spin_lock(&syscall_lock);
++      ret = syscall_index;
++      if(++syscall_index == limit)
++              syscall_index = 0;
++      spin_unlock(&syscall_lock);
++      return(ret);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
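
The comment above old_mmap() explains why the six mmap arguments historically arrived packed into a single memory block on i386. A standalone sketch of just that packing convention (all names invented; the real path copies the block from user space, which is skipped here):

#include <stdio.h>

/* mirrors the six-field block old_mmap() reads */
struct mmap_arg_demo {
        unsigned long addr, len, prot, flags, fd, offset;
};

/* stands in for the kernel side: unpack the block and use the values */
static long handle_old_mmap(const struct mmap_arg_demo *a)
{
        if (a->offset & 0xfff)          /* like the PAGE_MASK check above */
                return -22;             /* -EINVAL */
        return (long) a->len;           /* placeholder for the real mapping work */
}

int main(void)
{
        struct mmap_arg_demo args = {
                .addr = 0, .len = 4096, .prot = 3, .flags = 0x22,
                .fd = (unsigned long) -1, .offset = 0,
        };

        /* the caller hands over one pointer instead of six separate arguments */
        printf("%ld\n", handle_old_mmap(&args));
        return 0;
}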
+diff -Naur -X ../exclude-files orig/arch/um/kernel/syscall_user.c um/arch/um/kernel/syscall_user.c
+--- orig/arch/um/kernel/syscall_user.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/syscall_user.c   Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,48 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdlib.h>
++#include <sys/time.h>
++#include "kern_util.h"
++#include "syscall_user.h"
++
++struct {
++      int syscall;
++      int pid;
++      int result;
++      struct timeval start;
++      struct timeval end;
++} syscall_record[1024];
++
++int record_syscall_start(int syscall)
++{
++      int max, index;
++      
++      max = sizeof(syscall_record)/sizeof(syscall_record[0]);
++      index = next_syscall_index(max);
++
++      syscall_record[index].syscall = syscall;
++      syscall_record[index].pid = current_pid();
++      syscall_record[index].result = 0xdeadbeef;
++      gettimeofday(&syscall_record[index].start, NULL);
++      return(index);
++}
++
++void record_syscall_end(int index, int result)
++{
++      syscall_record[index].result = result;
++      gettimeofday(&syscall_record[index].end, NULL);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
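
syscall_record[] above is a fixed ring: record_syscall_start() grabs the next slot, stamps a start time and a 0xdeadbeef sentinel, and record_syscall_end() fills in the result later. A single-threaded userspace sketch of the same pattern (names invented; the spinlock taken by next_syscall_index() in syscall_kern.c is omitted because this demo has one thread):

#include <stdio.h>
#include <sys/time.h>

#define NRECORDS 8

struct record {
        int op;
        int result;
        struct timeval start, end;
};

static struct record ring[NRECORDS];
static int next_index;

static int record_start(int op)
{
        int i = next_index;

        next_index = (next_index + 1) % NRECORDS;  /* wrap, like next_syscall_index() */
        ring[i].op = op;
        ring[i].result = (int) 0xdeadbeef;         /* sentinel: still in flight */
        gettimeofday(&ring[i].start, NULL);
        return i;
}

static void record_end(int i, int result)
{
        ring[i].result = result;
        gettimeofday(&ring[i].end, NULL);
}

int main(void)
{
        int i = record_start(42);

        record_end(i, 0);
        printf("slot %d: op %d, result %d\n", i, ring[i].op, ring[i].result);
        return 0;
}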
+diff -Naur -X ../exclude-files orig/arch/um/kernel/sysrq.c um/arch/um/kernel/sysrq.c
+--- orig/arch/um/kernel/sysrq.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/sysrq.c  Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,98 @@
++/* 
++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/sched.h"
++#include "linux/kernel.h"
++#include "linux/module.h"
++#include "asm/page.h"
++#include "asm/processor.h"
++#include "sysrq.h"
++#include "user_util.h"
++
++ /*
++  * If the address is either in the .text section of the
++  * kernel, or in the vmalloc'ed module regions, it *may* 
++  * be the address of a calling routine
++  */
++ 
++#ifdef CONFIG_MODULES
++
++extern struct module *module_list;
++extern struct module kernel_module;
++
++static inline int kernel_text_address(unsigned long addr)
++{
++      int retval = 0;
++      struct module *mod;
++
++      if (addr >= (unsigned long) &_stext &&
++          addr <= (unsigned long) &_etext)
++              return 1;
++
++      for (mod = module_list; mod != &kernel_module; mod = mod->next) {
++              /* mod_bound tests for addr being inside the vmalloc'ed
++               * module area. Of course it'd be better to test only
++               * for the .text subset... */
++              if (mod_bound(addr, 0, mod)) {
++                      retval = 1;
++                      break;
++              }
++      }
++
++      return retval;
++}
++
++#else
++
++static inline int kernel_text_address(unsigned long addr)
++{
++      return (addr >= (unsigned long) &_stext &&
++              addr <= (unsigned long) &_etext);
++}
++
++#endif
++
++void show_trace(unsigned long * stack)
++{
++        int i;
++        unsigned long addr;
++
++        if (!stack)
++                stack = (unsigned long*) &stack;
++
++        printk("Call Trace: ");
++        i = 1;
++        while (((long) stack & (THREAD_SIZE-1)) != 0) {
++                addr = *stack++;
++              if (kernel_text_address(addr)) {
++                      if (i && ((i % 6) == 0))
++                              printk("\n   ");
++                      printk("[<%08lx>] ", addr);
++                      i++;
++                }
++        }
++        printk("\n");
++}
++
++void show_trace_task(struct task_struct *tsk)
++{
++      unsigned long esp = PT_REGS_SP(&tsk->thread.regs);
++
++      /* User space on another CPU? */
++      if ((esp ^ (unsigned long)tsk) & (PAGE_MASK<<1))
++              return;
++      show_trace((unsigned long *)esp);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
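
show_trace() above is a heuristic walker: it scans words on the stack and prints any value that kernel_text_address() accepts as a plausible return address. A userspace sketch of that filter, with a made-up text range and a fake stack:

#include <stdio.h>

static int looks_like_text(unsigned long addr,
                           unsigned long text_start, unsigned long text_end)
{
        return addr >= text_start && addr < text_end;
}

int main(void)
{
        unsigned long text_start = 0x08048000, text_end = 0x08060000;
        unsigned long stack[] = { 0x1234, 0x0804c3f0, 0xbffff000, 0x08051122 };
        unsigned int i;

        printf("Call Trace:");
        for (i = 0; i < sizeof(stack) / sizeof(stack[0]); i++)
                if (looks_like_text(stack[i], text_start, text_end))
                        printf(" [<%08lx>]", stack[i]);
        printf("\n");
        return 0;
}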
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tempfile.c um/arch/um/kernel/tempfile.c
+--- orig/arch/um/kernel/tempfile.c     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tempfile.c       Fri Jan 17 23:16:19 2003
+@@ -0,0 +1,80 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <string.h>
++#include <errno.h>
++#include <sys/param.h>
++#include "init.h"
++
++/* Modified from create_mem_file and start_debugger */
++static char *tempdir = NULL;
++
++static void __init find_tempdir(void)
++{
++      char *dirs[] = { "TMP", "TEMP", "TMPDIR", NULL };
++      int i;
++      char *dir = NULL;
++
++      if(tempdir != NULL) return;     /* We've already been called */
++      for(i = 0; dirs[i]; i++){
++              dir = getenv(dirs[i]);
++              if((dir != NULL) && (*dir != '\0'))
++                      break;
++      }
++      if((dir == NULL) || (*dir == '\0')) 
++              dir = "/tmp";
++
++      tempdir = malloc(strlen(dir) + 2);
++      if(tempdir == NULL){
++              fprintf(stderr, "Failed to malloc tempdir, "
++                      "errno = %d\n", errno);
++              return;
++      }
++      strcpy(tempdir, dir);
++      strcat(tempdir, "/");
++}
++
++int make_tempfile(const char *template, char **out_tempname, int do_unlink)
++{
++      char tempname[MAXPATHLEN];
++      int fd;
++
++      find_tempdir();
++      if (*template != '/')
++              strcpy(tempname, tempdir);
++      else
++              *tempname = 0;
++      strcat(tempname, template);
++      if((fd = mkstemp(tempname)) < 0){
++              fprintf(stderr, "open - cannot create %s: %s\n", tempname, 
++                      strerror(errno));
++              return -1;
++      }
++      if(do_unlink && (unlink(tempname) < 0)){
++              perror("unlink");
++              return -1;
++      }
++      if(out_tempname){
++              if((*out_tempname = strdup(tempname)) == NULL){
++                      perror("strdup");
++                      return -1;
++              }
++      }
++      return(fd);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
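
make_tempfile() above combines a TMP/TEMP/TMPDIR search with mkstemp() and an optional unlink(), leaving an open descriptor with no name in the filesystem. A trimmed standalone version of the same sequence (error handling shortened, file name pattern invented):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        const char *dirs[] = { "TMP", "TEMP", "TMPDIR", NULL };
        const char *dir = "/tmp";
        char path[4096];
        int i, fd;

        for (i = 0; dirs[i] != NULL; i++) {
                const char *d = getenv(dirs[i]);

                if (d != NULL && *d != '\0') {
                        dir = d;
                        break;
                }
        }

        snprintf(path, sizeof(path), "%s/demo_XXXXXX", dir);
        fd = mkstemp(path);       /* creates and opens a unique file */
        if (fd < 0) {
                perror("mkstemp");
                return 1;
        }
        unlink(path);             /* drop the name; the open fd keeps the file alive */
        printf("anonymous temp fd = %d\n", fd);
        close(fd);
        return 0;
}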
+diff -Naur -X ../exclude-files orig/arch/um/kernel/time.c um/arch/um/kernel/time.c
+--- orig/arch/um/kernel/time.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/time.c   Wed Apr 23 20:45:19 2003
+@@ -0,0 +1,127 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <unistd.h>
++#include <time.h>
++#include <sys/time.h>
++#include <signal.h>
++#include <errno.h>
++#include "user_util.h"
++#include "kern_util.h"
++#include "user.h"
++#include "process.h"
++#include "signal_user.h"
++#include "time_user.h"
++
++extern struct timeval xtime;
++
++struct timeval local_offset = { 0, 0 };
++
++void timer(void)
++{
++      gettimeofday(&xtime, NULL);
++      timeradd(&xtime, &local_offset, &xtime);
++}
++
++void set_interval(int timer_type)
++{
++      int usec = 1000000/hz();
++      struct itimerval interval = ((struct itimerval) { { 0, usec },
++                                                        { 0, usec } });
++
++      if(setitimer(timer_type, &interval, NULL) == -1)
++              panic("setitimer failed - errno = %d\n", errno);
++}
++
++void enable_timer(void)
++{
++      int usec = 1000000/hz();
++      struct itimerval enable = ((struct itimerval) { { 0, usec },
++                                                      { 0, usec }});
++      if(setitimer(ITIMER_VIRTUAL, &enable, NULL))
++              printk("enable_timer - setitimer failed, errno = %d\n",
++                     errno);
++}
++
++void switch_timers(int to_real)
++{
++      struct itimerval disable = ((struct itimerval) { { 0, 0 }, { 0, 0 }});
++      struct itimerval enable = ((struct itimerval) { { 0, 1000000/hz() },
++                                                      { 0, 1000000/hz() }});
++      int old, new;
++
++      if(to_real){
++              old = ITIMER_VIRTUAL;
++              new = ITIMER_REAL;
++      }
++      else {
++              old = ITIMER_REAL;
++              new = ITIMER_VIRTUAL;
++      }
++
++      if((setitimer(old, &disable, NULL) < 0) ||
++         (setitimer(new, &enable, NULL)))
++              printk("switch_timers - setitimer failed, errno = %d\n",
++                     errno);
++}
++
++void idle_timer(void)
++{
++      if(signal(SIGVTALRM, SIG_IGN) == SIG_ERR)
++              panic("Couldn't unset SIGVTALRM handler");
++      
++      set_handler(SIGALRM, (__sighandler_t) alarm_handler, 
++                  SA_RESTART, SIGUSR1, SIGIO, SIGWINCH, SIGVTALRM, -1);
++      set_interval(ITIMER_REAL);
++}
++
++void time_init(void)
++{
++      if(signal(SIGVTALRM, boot_timer_handler) == SIG_ERR)
++              panic("Couldn't set SIGVTALRM handler");
++      set_interval(ITIMER_VIRTUAL);
++}
++
++void do_gettimeofday(struct timeval *tv)
++{
++      unsigned long flags;
++
++      flags = time_lock();
++      gettimeofday(tv, NULL);
++      timeradd(tv, &local_offset, tv);
++      time_unlock(flags);
++}
++
++void do_settimeofday(struct timeval *tv)
++{
++      struct timeval now;
++      unsigned long flags;
++
++      flags = time_lock();
++      gettimeofday(&now, NULL);
++      timersub(tv, &now, &local_offset);
++      time_unlock(flags);
++}
++
++void idle_sleep(int secs)
++{
++      struct timespec ts;
++
++      ts.tv_sec = secs;
++      ts.tv_nsec = 0;
++      nanosleep(&ts, NULL);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
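
time.c above never changes the host clock: do_settimeofday() only records local_offset, and do_gettimeofday() adds it back on every read, using the BSD timeradd()/timersub() macros from <sys/time.h>. A userspace sketch of that offset clock (virtual_settime/virtual_gettime are invented names):

#include <stdio.h>
#include <sys/time.h>

static struct timeval local_offset;      /* virtual time minus host time */

static void virtual_settime(const struct timeval *tv)
{
        struct timeval now;

        gettimeofday(&now, NULL);
        timersub(tv, &now, &local_offset);   /* remember only the difference */
}

static void virtual_gettime(struct timeval *tv)
{
        gettimeofday(tv, NULL);
        timeradd(tv, &local_offset, tv);     /* host time plus remembered offset */
}

int main(void)
{
        struct timeval want = { .tv_sec = 1000000000, .tv_usec = 0 }, got;

        virtual_settime(&want);
        virtual_gettime(&got);
        printf("virtual seconds: %ld\n", (long) got.tv_sec);
        return 0;
}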
+diff -Naur -X ../exclude-files orig/arch/um/kernel/time_kern.c um/arch/um/kernel/time_kern.c
+--- orig/arch/um/kernel/time_kern.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/time_kern.c      Wed Apr 23 22:19:08 2003
+@@ -0,0 +1,172 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/kernel.h"
++#include "linux/unistd.h"
++#include "linux/stddef.h"
++#include "linux/spinlock.h"
++#include "linux/sched.h"
++#include "linux/interrupt.h"
++#include "linux/init.h"
++#include "linux/delay.h"
++#include "asm/irq.h"
++#include "asm/param.h"
++#include "asm/current.h"
++#include "kern_util.h"
++#include "user_util.h"
++#include "time_user.h"
++#include "mode.h"
++
++extern rwlock_t xtime_lock;
++
++int hz(void)
++{
++      return(HZ);
++}
++
++/* Changed at early boot */
++int timer_irq_inited = 0;
++
++/* missed_ticks will be modified after kernel memory has been 
++ * write-protected, so this puts it in a section which will be left 
++ * write-enabled.
++ */
++int __attribute__ ((__section__ (".unprotected"))) missed_ticks[NR_CPUS];
++
++void timer_irq(union uml_pt_regs *regs)
++{
++      int cpu = current->processor, ticks = missed_ticks[cpu];
++
++        if(!timer_irq_inited) return;
++      missed_ticks[cpu] = 0;
++      while(ticks--) do_IRQ(TIMER_IRQ, regs);
++}
++
++void boot_timer_handler(int sig)
++{
++      struct pt_regs regs;
++
++      CHOOSE_MODE((void) 
++                  (UPT_SC(&regs.regs) = (struct sigcontext *) (&sig + 1)),
++                  (void) (regs.regs.skas.is_user = 0));
++      do_timer(&regs);
++}
++
++void um_timer(int irq, void *dev, struct pt_regs *regs)
++{
++      do_timer(regs);
++      write_lock(&xtime_lock);
++      vxtime_lock();
++      timer();
++      vxtime_unlock();
++      write_unlock(&xtime_lock);
++}
++
++long um_time(int * tloc)
++{
++      struct timeval now;
++
++      do_gettimeofday(&now);
++      if (tloc) {
++              if (put_user(now.tv_sec,tloc))
++                      now.tv_sec = -EFAULT;
++      }
++      return now.tv_sec;
++}
++
++long um_stime(int * tptr)
++{
++      int value;
++      struct timeval new;
++
++      if (get_user(value, tptr))
++                return -EFAULT;
++      new.tv_sec = value;
++      new.tv_usec = 0;
++      do_settimeofday(&new);
++      return 0;
++}
++
++/* XXX Needs to be moved under sys-i386 */
++void __delay(um_udelay_t time)
++{
++      /* Stolen from the i386 __loop_delay */
++      int d0;
++      __asm__ __volatile__(
++              "\tjmp 1f\n"
++              ".align 16\n"
++              "1:\tjmp 2f\n"
++              ".align 16\n"
++              "2:\tdecl %0\n\tjns 2b"
++              :"=&a" (d0)
++              :"0" (time));
++}
++
++void __udelay(um_udelay_t usecs)
++{
++      int i, n;
++
++      n = (loops_per_jiffy * HZ * usecs) / 1000000;
++      for(i=0;i<n;i++) ;
++}
++
++void __const_udelay(um_udelay_t usecs)
++{
++      int i, n;
++
++      n = (loops_per_jiffy * HZ * usecs) / 1000000;
++      for(i=0;i<n;i++) ;
++}
++
++void timer_handler(int sig, union uml_pt_regs *regs)
++{
++#ifdef CONFIG_SMP
++      update_process_times(user_context(UPT_SP(regs)));
++#endif
++      if(current->processor == 0)
++              timer_irq(regs);
++}
++
++static spinlock_t timer_spinlock = SPIN_LOCK_UNLOCKED;
++
++unsigned long time_lock(void)
++{
++      unsigned long flags;
++
++      spin_lock_irqsave(&timer_spinlock, flags);
++      return(flags);
++}
++
++void time_unlock(unsigned long flags)
++{
++      spin_unlock_irqrestore(&timer_spinlock, flags);
++}
++
++int __init timer_init(void)
++{
++      int err;
++
++      CHOOSE_MODE(user_time_init_tt(), user_time_init_skas());
++      if((err = request_irq(TIMER_IRQ, um_timer, SA_INTERRUPT, "timer", 
++                            NULL)) != 0)
++              printk(KERN_ERR "timer_init : request_irq failed - "
++                     "errno = %d\n", -err);
++      timer_irq_inited = 1;
++      return(0);
++}
++
++__initcall(timer_init);
++
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
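
The missed_ticks[] scheme above splits timer handling in two: the signal-level handler only bumps a per-CPU counter, and timer_irq() later replays each accumulated tick through do_IRQ(). A single-CPU userspace sketch of the same bookkeeping (invented names, no real signals involved):

#include <stdio.h>

static int missed_ticks;
static long jiffies;

/* stands in for alarm_handler(): do the cheapest possible thing */
static void low_level_tick(void)
{
        missed_ticks++;
}

/* stands in for timer_irq(): replay everything that piled up */
static void drain_ticks(void)
{
        int ticks = missed_ticks;

        missed_ticks = 0;
        while (ticks--)
                jiffies++;        /* the real code calls do_IRQ(TIMER_IRQ, regs) here */
}

int main(void)
{
        low_level_tick();
        low_level_tick();
        low_level_tick();
        drain_ticks();
        printf("jiffies = %ld\n", jiffies);
        return 0;
}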
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tlb.c um/arch/um/kernel/tlb.c
+--- orig/arch/um/kernel/tlb.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tlb.c    Wed Oct 23 22:15:51 2002
+@@ -0,0 +1,80 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/mm.h"
++#include "asm/page.h"
++#include "asm/pgalloc.h"
++#include "choose-mode.h"
++#include "mode_kern.h"
++
++void flush_tlb_page(struct vm_area_struct *vma, unsigned long address)
++{
++      address &= PAGE_MASK;
++      flush_tlb_range(vma->vm_mm, address, address + PAGE_SIZE);
++}
++
++void flush_tlb_all(void)
++{
++      flush_tlb_mm(current->mm);
++}
++
++void flush_tlb_kernel_vm(void)
++{
++      CHOOSE_MODE(flush_tlb_kernel_vm_tt(), flush_tlb_kernel_vm_skas());
++}
++
++void __flush_tlb_one(unsigned long addr)
++{
++      CHOOSE_MODE_PROC(__flush_tlb_one_tt, __flush_tlb_one_skas, addr);
++}
++
++void flush_tlb_range(struct mm_struct *mm, unsigned long start, 
++                   unsigned long end)
++{
++      CHOOSE_MODE_PROC(flush_tlb_range_tt, flush_tlb_range_skas, mm, start, 
++                       end);
++}
++
++void flush_tlb_mm(struct mm_struct *mm)
++{
++      CHOOSE_MODE_PROC(flush_tlb_mm_tt, flush_tlb_mm_skas, mm);
++}
++
++void force_flush_all(void)
++{
++      CHOOSE_MODE(force_flush_all_tt(), force_flush_all_skas());
++}
++
++
++pgd_t *pgd_offset_proc(struct mm_struct *mm, unsigned long address)
++{
++      return(pgd_offset(mm, address));
++}
++
++pmd_t *pmd_offset_proc(pgd_t *pgd, unsigned long address)
++{
++      return(pmd_offset(pgd, address));
++}
++
++pte_t *pte_offset_proc(pmd_t *pmd, unsigned long address)
++{
++      return(pte_offset(pmd, address));
++}
++
++pte_t *addr_pte(struct task_struct *task, unsigned long addr)
++{
++      return(pte_offset(pmd_offset(pgd_offset(task->mm, addr), addr), addr));
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
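
Every function in tlb.c above defers to CHOOSE_MODE()/CHOOSE_MODE_PROC(), defined in choose-mode.h (not shown in this hunk), to pick either the tt or the skas implementation. The sketch below only illustrates the general shape of that dispatch, with invented names and a plain if/else standing in for the macro:

#include <stdio.h>

static int mode_tt = 1;    /* which backend is active for this run */

static void flush_backend_tt(void)   { printf("tt flush\n"); }
static void flush_backend_skas(void) { printf("skas flush\n"); }

/* one public entry point, two backends, as in the wrappers above */
static void flush_tlb_demo(void)
{
        if (mode_tt)
                flush_backend_tt();
        else
                flush_backend_skas();
}

int main(void)
{
        flush_tlb_demo();
        mode_tt = 0;
        flush_tlb_demo();
        return 0;
}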
+diff -Naur -X ../exclude-files orig/arch/um/kernel/trap_kern.c um/arch/um/kernel/trap_kern.c
+--- orig/arch/um/kernel/trap_kern.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/trap_kern.c      Wed Mar 26 13:26:00 2003
+@@ -0,0 +1,192 @@
++/* 
++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/kernel.h"
++#include "linux/sched.h"
++#include "linux/mm.h"
++#include "linux/spinlock.h"
++#include "linux/config.h"
++#include "linux/init.h"
++#include "asm/semaphore.h"
++#include "asm/pgtable.h"
++#include "asm/pgalloc.h"
++#include "asm/a.out.h"
++#include "asm/current.h"
++#include "asm/irq.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "kern.h"
++#include "chan_kern.h"
++#include "mconsole_kern.h"
++#include "2_5compat.h"
++
++unsigned long handle_page_fault(unsigned long address, unsigned long ip, 
++                              int is_write, int is_user, int *code_out)
++{
++      struct mm_struct *mm = current->mm;
++      struct vm_area_struct *vma;
++      pgd_t *pgd;
++      pmd_t *pmd;
++      pte_t *pte;
++      unsigned long page;
++      int handled = 0;
++
++      *code_out = SEGV_MAPERR;
++      down_read(&mm->mmap_sem);
++      vma = find_vma(mm, address);
++      if(!vma) 
++              goto out;
++      else if(vma->vm_start <= address) 
++              goto good_area;
++      else if(!(vma->vm_flags & VM_GROWSDOWN)) 
++              goto out;
++      else if(expand_stack(vma, address)) 
++              goto out;
++
++ good_area:
++      *code_out = SEGV_ACCERR;
++      if(is_write && !(vma->vm_flags & VM_WRITE)) 
++              goto out;
++      page = address & PAGE_MASK;
++      if(page == (unsigned long) current + PAGE_SIZE)
++              panic("Kernel stack overflow");
++      pgd = pgd_offset(mm, page);
++      pmd = pmd_offset(pgd, page);
++      do {
++      survive:
++              switch (handle_mm_fault(mm, vma, address, is_write)) {
++              case 1:
++                      current->min_flt++;
++                      break;
++              case 2:
++                      current->maj_flt++;
++                      break;
++              default:
++                      if (current->pid == 1) {
++                              up_read(&mm->mmap_sem);
++                              yield();
++                              down_read(&mm->mmap_sem);
++                              goto survive;
++                      }
++                      /* Fall through to bad area case */
++              case 0:
++                      goto out;
++              }
++              pte = pte_offset(pmd, page);
++      } while(!pte_present(*pte));
++      handled = 1;
++      *pte = pte_mkyoung(*pte);
++      if(pte_write(*pte)) *pte = pte_mkdirty(*pte);
++      flush_tlb_page(vma, page);
++ out:
++      up_read(&mm->mmap_sem);
++      return(handled);
++}
++
++unsigned long segv(unsigned long address, unsigned long ip, int is_write, 
++                 int is_user, void *sc)
++{
++      struct siginfo si;
++      void *catcher;
++      int handled;
++
++        if(!is_user && (address >= start_vm) && (address < end_vm)){
++                flush_tlb_kernel_vm();
++                return(0);
++        }
++        if(current->mm == NULL)
++              panic("Segfault with no mm");
++
++      handled = handle_page_fault(address, ip, is_write, is_user, 
++                                  &si.si_code);
++
++      catcher = current->thread.fault_catcher;
++      if(handled)
++              return(0);
++      else if(catcher != NULL){
++              current->thread.fault_addr = (void *) address;
++              do_longjmp(catcher, 1);
++      } 
++      else if(current->thread.fault_addr != NULL){
++              panic("fault_addr set but no fault catcher");
++      }
++      else if(arch_fixup(ip, sc))
++              return(0);
++
++      if(!is_user) 
++              panic("Kernel mode fault at addr 0x%lx, ip 0x%lx", 
++                    address, ip);
++      si.si_signo = SIGSEGV;
++      si.si_addr = (void *) address;
++      current->thread.cr2 = address;
++      current->thread.err = is_write;
++      force_sig_info(SIGSEGV, &si, current);
++      return(0);
++}
++
++void bad_segv(unsigned long address, unsigned long ip, int is_write)
++{
++      struct siginfo si;
++
++      printk(KERN_ERR "Unfixable SEGV in '%s' (pid %d) at 0x%lx "
++             "(ip 0x%lx)\n", current->comm, current->pid, address, ip);
++      si.si_signo = SIGSEGV;
++      si.si_code = SEGV_ACCERR;
++      si.si_addr = (void *) address;
++      current->thread.cr2 = address;
++      current->thread.err = is_write;
++      force_sig_info(SIGSEGV, &si, current);
++}
++
++void relay_signal(int sig, union uml_pt_regs *regs)
++{
++      if(arch_handle_signal(sig, regs)) return;
++      if(!UPT_IS_USER(regs))
++              panic("Kernel mode signal %d", sig);
++      force_sig(sig, current);
++}
++
++void bus_handler(int sig, union uml_pt_regs *regs)
++{
++      if(current->thread.fault_catcher != NULL)
++              do_longjmp(current->thread.fault_catcher, 1);
++      else relay_signal(sig, regs);
++}
++
++void winch(int sig, union uml_pt_regs *regs)
++{
++      do_IRQ(WINCH_IRQ, regs);
++}
++
++void trap_init(void)
++{
++}
++
++spinlock_t trap_lock = SPIN_LOCK_UNLOCKED;
++
++static int trap_index = 0;
++
++int next_trap_index(int limit)
++{
++      int ret;
++
++      spin_lock(&trap_lock);
++      ret = trap_index;
++      if(++trap_index == limit)
++              trap_index = 0;
++      spin_unlock(&trap_lock);
++      return(ret);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
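
handle_page_fault() above begins by classifying the fault: find the containing VMA, let a VM_GROWSDOWN stack expand, then reject writes to read-only areas (SEGV_ACCERR) or unmapped addresses (SEGV_MAPERR). A userspace sketch of just that classification step (the region list is an invented stand-in for vm_area_struct, and the handle_mm_fault() loop is not modelled):

#include <stdio.h>

#define R_WRITE     1
#define R_GROWSDOWN 2

struct region { unsigned long start, end; int flags; };

static struct region regions[] = {
        { 0x08048000, 0x08060000, 0 },                     /* text: read-only */
        { 0xbff00000, 0xc0000000, R_WRITE | R_GROWSDOWN }, /* stack           */
};

static const char *classify(unsigned long addr, int is_write)
{
        unsigned int i;

        for (i = 0; i < sizeof(regions) / sizeof(regions[0]); i++) {
                struct region *r = &regions[i];

                if (addr >= r->end)
                        continue;                   /* like find_vma() walking on */
                if (addr < r->start) {
                        if (!(r->flags & R_GROWSDOWN))
                                return "SEGV_MAPERR";
                        r->start = addr;            /* like expand_stack()        */
                }
                if (is_write && !(r->flags & R_WRITE))
                        return "SEGV_ACCERR";
                return "fault handled";
        }
        return "SEGV_MAPERR";
}

int main(void)
{
        printf("%s\n", classify(0x08050000, 1));    /* write to read-only text */
        printf("%s\n", classify(0xbfe00000, 1));    /* touch below the stack   */
        printf("%s\n", classify(0xc0100000, 0));    /* no region contains this */
        return 0;
}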
+diff -Naur -X ../exclude-files orig/arch/um/kernel/trap_user.c um/arch/um/kernel/trap_user.c
+--- orig/arch/um/kernel/trap_user.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/trap_user.c      Wed Mar 26 13:25:50 2003
+@@ -0,0 +1,140 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdlib.h>
++#include <errno.h>
++#include <fcntl.h>
++#include <setjmp.h>
++#include <signal.h>
++#include <sys/time.h>
++#include <sys/ioctl.h>
++#include <sys/ptrace.h>
++#include <sys/wait.h>
++#include <asm/page.h>
++#include <asm/unistd.h>
++#include <asm/ptrace.h>
++#include "init.h"
++#include "sysdep/ptrace.h"
++#include "sigcontext.h"
++#include "sysdep/sigcontext.h"
++#include "irq_user.h"
++#include "frame_user.h"
++#include "signal_user.h"
++#include "time_user.h"
++#include "task.h"
++#include "mode.h"
++#include "choose-mode.h"
++#include "kern_util.h"
++#include "user_util.h"
++#include "os.h"
++
++void kill_child_dead(int pid)
++{
++      kill(pid, SIGKILL);
++      kill(pid, SIGCONT);
++      while(waitpid(pid, NULL, 0) > 0) kill(pid, SIGCONT);
++}
++
++/* Unlocked - don't care if this is a bit off */
++int nsegfaults = 0;
++
++struct {
++      unsigned long address;
++      int is_write;
++      int pid;
++      unsigned long sp;
++      int is_user;
++} segfault_record[1024];
++
++void segv_handler(int sig, union uml_pt_regs *regs)
++{
++      int index, max;
++
++      if(UPT_IS_USER(regs) && !UPT_SEGV_IS_FIXABLE(regs)){
++              bad_segv(UPT_FAULT_ADDR(regs), UPT_IP(regs), 
++                       UPT_FAULT_WRITE(regs));
++              return;
++      }
++      max = sizeof(segfault_record)/sizeof(segfault_record[0]);
++      index = next_trap_index(max);
++
++      nsegfaults++;
++      segfault_record[index].address = UPT_FAULT_ADDR(regs);
++      segfault_record[index].pid = os_getpid();
++      segfault_record[index].is_write = UPT_FAULT_WRITE(regs);
++      segfault_record[index].sp = UPT_SP(regs);
++      segfault_record[index].is_user = UPT_IS_USER(regs);
++      segv(UPT_FAULT_ADDR(regs), UPT_IP(regs), UPT_FAULT_WRITE(regs),
++           UPT_IS_USER(regs), regs);
++}
++
++void usr2_handler(int sig, union uml_pt_regs *regs)
++{
++      CHOOSE_MODE(syscall_handler_tt(sig, regs), (void) 0);
++}
++
++struct signal_info sig_info[] = {
++      [ SIGTRAP ] { .handler          = relay_signal,
++                    .is_irq           = 0 },
++      [ SIGFPE ] { .handler           = relay_signal,
++                   .is_irq            = 0 },
++      [ SIGILL ] { .handler           = relay_signal,
++                   .is_irq            = 0 },
++      [ SIGWINCH ] { .handler         = winch,
++                     .is_irq          = 1 },
++      [ SIGBUS ] { .handler           = bus_handler,
++                   .is_irq            = 0 },
++      [ SIGSEGV] { .handler           = segv_handler,
++                   .is_irq            = 0 },
++      [ SIGIO ] { .handler            = sigio_handler,
++                  .is_irq             = 1 },
++      [ SIGVTALRM ] { .handler        = timer_handler,
++                      .is_irq         = 1 },
++      [ SIGALRM ] { .handler          = timer_handler,
++                    .is_irq           = 1 },
++      [ SIGUSR2 ] { .handler          = usr2_handler,
++                    .is_irq           = 0 },
++};
++
++void sig_handler(int sig, struct sigcontext sc)
++{
++      CHOOSE_MODE_PROC(sig_handler_common_tt, sig_handler_common_skas,
++                       sig, &sc);
++}
++
++extern int timer_irq_inited, missed_ticks[];
++
++void alarm_handler(int sig, struct sigcontext sc)
++{
++      if(!timer_irq_inited) return;
++      missed_ticks[cpu()]++;
++
++      if(sig == SIGALRM)
++              switch_timers(0);
++
++      CHOOSE_MODE_PROC(sig_handler_common_tt, sig_handler_common_skas,
++                       sig, &sc);
++
++      if(sig == SIGALRM)
++              switch_timers(1);
++}
++
++void do_longjmp(void *b, int val)
++{
++      jmp_buf *buf = b;
++
++      longjmp(*buf, val);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
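
sig_info[] above maps signal numbers straight to a handler plus an is_irq flag, using the old "[index] { ... }" initializer spelling that newer GCC writes as "[index] = { ... }". A userspace sketch of the same table-driven dispatch (handlers and the table size are invented; the signal constants come from <signal.h>):

#include <stdio.h>
#include <signal.h>

#define NSIGNALS 64     /* enough slots for Linux signal numbers */

struct sig_entry {
        void (*handler)(int sig);
        int is_irq;
};

static void relay_demo(int sig) { printf("relayed signal %d\n", sig); }
static void timer_demo(int sig) { printf("timer tick (signal %d)\n", sig); }

/* indexed directly by signal number, like sig_info[] above */
static struct sig_entry table[NSIGNALS] = {
        [SIGILL]    = { .handler = relay_demo, .is_irq = 0 },
        [SIGVTALRM] = { .handler = timer_demo, .is_irq = 1 },
};

static void dispatch(int sig)
{
        struct sig_entry *e = &table[sig];

        if (e->handler == NULL) {
                printf("signal %d: no entry\n", sig);
                return;
        }
        printf("%s context: ", e->is_irq ? "irq" : "process");
        e->handler(sig);
}

int main(void)
{
        dispatch(SIGILL);
        dispatch(SIGVTALRM);
        dispatch(SIGUSR1);
        return 0;
}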
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/Makefile um/arch/um/kernel/tt/Makefile
+--- orig/arch/um/kernel/tt/Makefile    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/Makefile      Fri Dec 20 23:29:42 2002
+@@ -0,0 +1,39 @@
++# 
++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++O_TARGET = tt.o
++
++obj-y = exec_kern.o exec_user.o gdb.o ksyms.o mem.o mem_user.o process_kern.o \
++      syscall_kern.o syscall_user.o time.o tlb.o tracer.o trap_user.o \
++      uaccess_user.o
++
++obj-$(CONFIG_PT_PROXY) += gdb_kern.o 
++
++subdir-y = sys-$(SUBARCH)
++subdir-$(CONFIG_PT_PROXY) += ptproxy
++
++obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o))
++
++export-objs = ksyms.o
++
++USER_OBJS = $(filter %_user.o,$(obj-y)) gdb.o time.o tracer.o
++
++UNMAP_CFLAGS := $(patsubst -pg -DPROFILING,,$(USER_CFLAGS))
++UNMAP_CFLAGS := $(patsubst -fprofile-arcs -ftest-coverage,,$(UNMAP_CFLAGS))
++
++include $(TOPDIR)/Rules.make
++
++$(USER_OBJS) : %.o: %.c
++      $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $<
++
++$(O_TARGET) : unmap_fin.o
++
++unmap.o: unmap.c
++      $(CC) $(UNMAP_CFLAGS) -c -o $@ $<
++
++unmap_fin.o : unmap.o
++      ld -r -o $@ $< -lc -L/usr/lib
++
++clean :
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/exec_kern.c um/arch/um/kernel/tt/exec_kern.c
+--- orig/arch/um/kernel/tt/exec_kern.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/exec_kern.c   Thu Oct 24 19:22:17 2002
+@@ -0,0 +1,83 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/kernel.h"
++#include "linux/mm.h"
++#include "asm/signal.h"
++#include "asm/ptrace.h"
++#include "asm/uaccess.h"
++#include "asm/pgalloc.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "irq_user.h"
++#include "time_user.h"
++#include "mem_user.h"
++#include "os.h"
++#include "tlb.h"
++
++static int exec_tramp(void *sig_stack)
++{
++      init_new_thread_stack(sig_stack, NULL);
++      init_new_thread_signals(1);
++      os_stop_process(os_getpid());
++      return(0);
++}
++
++void flush_thread_tt(void)
++{
++      unsigned long stack;
++      int new_pid;
++
++      stack = alloc_stack(0, 0);
++      if(stack == 0){
++              printk(KERN_ERR 
++                     "flush_thread : failed to allocate temporary stack\n");
++              do_exit(SIGKILL);
++      }
++              
++      new_pid = start_fork_tramp((void *) current->thread.kernel_stack,
++                                 stack, 0, exec_tramp);
++      if(new_pid < 0){
++              printk(KERN_ERR 
++                     "flush_thread : new thread failed, errno = %d\n",
++                     -new_pid);
++              do_exit(SIGKILL);
++      }
++
++      if(current->processor == 0)
++              forward_interrupts(new_pid);
++      current->thread.request.op = OP_EXEC;
++      current->thread.request.u.exec.pid = new_pid;
++      unprotect_stack((unsigned long) current);
++      os_usr1_process(os_getpid());
++
++      enable_timer();
++      free_page(stack);
++      protect_memory(uml_reserved, high_physmem - uml_reserved, 1, 1, 0, 1);
++      task_protections((unsigned long) current);
++      force_flush_all();
++      unblock_signals();
++}
++
++void start_thread_tt(struct pt_regs *regs, unsigned long eip, 
++                   unsigned long esp)
++{
++      set_fs(USER_DS);
++      flush_tlb_mm(current->mm);
++      PT_REGS_IP(regs) = eip;
++      PT_REGS_SP(regs) = esp;
++      PT_FIX_EXEC_STACK(esp);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/exec_user.c um/arch/um/kernel/tt/exec_user.c
+--- orig/arch/um/kernel/tt/exec_user.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/exec_user.c   Thu Dec  5 19:36:57 2002
+@@ -0,0 +1,49 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <unistd.h>
++#include <stdlib.h>
++#include <sched.h>
++#include <errno.h>
++#include <sys/wait.h>
++#include <sys/ptrace.h>
++#include <signal.h>
++#include "user_util.h"
++#include "kern_util.h"
++#include "user.h"
++#include "ptrace_user.h"
++
++void do_exec(int old_pid, int new_pid)
++{
++      unsigned long regs[FRAME_SIZE];
++
++      if((ptrace(PTRACE_ATTACH, new_pid, 0, 0) < 0) ||
++         (ptrace(PTRACE_CONT, new_pid, 0, 0) < 0) ||
++         (waitpid(new_pid, 0, WUNTRACED) < 0))
++              tracer_panic("do_exec failed to attach proc - errno = %d",
++                           errno);
++
++      if(ptrace_getregs(old_pid, regs) < 0)
++              tracer_panic("do_exec failed to get registers - errno = %d",
++                           errno);
++
++      kill(old_pid, SIGKILL);
++
++      if(ptrace_setregs(new_pid, regs) < 0)
++              tracer_panic("do_exec failed to start new proc - errno = %d",
++                           errno);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/gdb.c um/arch/um/kernel/tt/gdb.c
+--- orig/arch/um/kernel/tt/gdb.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/gdb.c Fri Jan 17 13:23:31 2003
+@@ -0,0 +1,278 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <errno.h>
++#include <string.h>
++#include <signal.h>
++#include <sys/ptrace.h>
++#include <sys/types.h>
++#include "uml-config.h"
++#include "kern_constants.h"
++#include "chan_user.h"
++#include "init.h"
++#include "user.h"
++#include "debug.h"
++#include "kern_util.h"
++#include "user_util.h"
++#include "tt.h"
++#include "sysdep/thread.h"
++
++extern int debugger_pid;
++extern int debugger_fd;
++extern int debugger_parent;
++
++int detach(int pid, int sig)
++{
++      return(ptrace(PTRACE_DETACH, pid, 0, sig));
++}
++
++int attach(int pid)
++{
++      int err;
++
++      err = ptrace(PTRACE_ATTACH, pid, 0, 0);
++      if(err < 0) return(-errno);
++      else return(err);
++}
++
++int cont(int pid)
++{
++      return(ptrace(PTRACE_CONT, pid, 0, 0));
++}
++
++#ifdef UML_CONFIG_PT_PROXY
++
++int debugger_signal(int status, pid_t pid)
++{
++      return(debugger_proxy(status, pid));
++}
++
++void child_signal(pid_t pid, int status)
++{
++      child_proxy(pid, status);
++}
++
++static void gdb_announce(char *dev_name, int dev)
++{
++      printf("gdb assigned device '%s'\n", dev_name);
++}
++
++static struct chan_opts opts = {
++      .announce       = gdb_announce,
++      .xterm_title    = "UML kernel debugger",
++      .raw            = 0,
++      .tramp_stack    = 0,
++      .in_kernel      = 0,
++};
++
++/* Accessed by the tracing thread, which automatically serializes access */
++static void *xterm_data;
++static int xterm_fd;
++
++extern void *xterm_init(char *, int, struct chan_opts *);
++extern int xterm_open(int, int, int, void *, char **);
++extern void xterm_close(int, void *);
++
++int open_gdb_chan(void)
++{
++      char stack[UM_KERN_PAGE_SIZE], *dummy;
++
++      opts.tramp_stack = (unsigned long) stack;
++      xterm_data = xterm_init("", 0, &opts);
++      xterm_fd = xterm_open(1, 1, 1, xterm_data, &dummy);
++      return(xterm_fd);
++}
++
++static void exit_debugger_cb(void *unused)
++{
++      if(debugger_pid != -1){
++              if(gdb_pid != -1){
++                      fake_child_exit();
++                      gdb_pid = -1;
++              }
++              else kill_child_dead(debugger_pid);
++              debugger_pid = -1;
++              if(debugger_parent != -1)
++                      detach(debugger_parent, SIGINT);
++      }
++      if(xterm_data != NULL) xterm_close(xterm_fd, xterm_data);
++}
++
++static void exit_debugger(void)
++{
++      initial_thread_cb(exit_debugger_cb, NULL);
++}
++
++__uml_exitcall(exit_debugger);
++
++struct gdb_data {
++      char *str;
++      int err;
++};
++
++static void config_gdb_cb(void *arg)
++{
++      struct gdb_data *data = arg;
++      void *task;
++      int pid;
++
++      data->err = -1;
++      if(debugger_pid != -1) exit_debugger_cb(NULL);
++      if(!strncmp(data->str, "pid,", strlen("pid,"))){
++              data->str += strlen("pid,");
++              pid = strtoul(data->str, NULL, 0);
++              task = cpu_tasks[0].task;
++              debugger_pid = attach_debugger(TASK_EXTERN_PID(task), pid, 0);
++              if(debugger_pid != -1){
++                      data->err = 0;
++                      gdb_pid = pid;
++              }
++              return;
++      }
++      data->err = 0;
++      debugger_pid = start_debugger(linux_prog, 0, 0, &debugger_fd);
++      init_proxy(debugger_pid, 0, 0);
++}
++
++int gdb_config(char *str)
++{
++      struct gdb_data data;
++
++      if(*str++ != '=') return(-1);
++      data.str = str;
++      initial_thread_cb(config_gdb_cb, &data);
++      return(data.err);
++}
++
++void remove_gdb_cb(void *unused)
++{
++      exit_debugger_cb(NULL);
++}
++
++int gdb_remove(char *unused)
++{
++      initial_thread_cb(remove_gdb_cb, NULL);
++      return(0);
++}
++
++void signal_usr1(int sig)
++{
++      if(debugger_pid != -1){
++              printk(UM_KERN_ERR "The debugger is already running\n");
++              return;
++      }
++      debugger_pid = start_debugger(linux_prog, 0, 0, &debugger_fd);
++      init_proxy(debugger_pid, 0, 0);
++}
++
++int init_ptrace_proxy(int idle_pid, int startup, int stop)
++{
++      int pid, status;
++
++      pid = start_debugger(linux_prog, startup, stop, &debugger_fd);
++      status = wait_for_stop(idle_pid, SIGSTOP, PTRACE_CONT, NULL);
++      if(pid < 0){
++              cont(idle_pid);
++              return(-1);
++      }
++      init_proxy(pid, 1, status);
++      return(pid);
++}
++
++int attach_debugger(int idle_pid, int pid, int stop)
++{
++      int status = 0, err;
++
++      err = attach(pid);
++      if(err < 0){
++              printf("Failed to attach pid %d, errno = %d\n", pid, -err);
++              return(-1);
++      }
++      if(stop) status = wait_for_stop(idle_pid, SIGSTOP, PTRACE_CONT, NULL);
++      init_proxy(pid, 1, status);
++      return(pid);
++}
++
++#ifdef notdef /* Put this back in when it does something useful */
++static int __init uml_gdb_init_setup(char *line, int *add)
++{
++      gdb_init = uml_strdup(line);
++      return 0;
++}
++
++__uml_setup("gdb=", uml_gdb_init_setup, 
++"gdb=<channel description>\n\n"
++);
++#endif
++
++static int __init uml_gdb_pid_setup(char *line, int *add)
++{
++      gdb_pid = strtoul(line, NULL, 0);
++      *add = 0;
++      return 0;
++}
++
++__uml_setup("gdb-pid=", uml_gdb_pid_setup, 
++"gdb-pid=<pid>\n"
++"    gdb-pid is used to attach an external debugger to UML.  This may be\n"
++"    an already-running gdb or a debugger-like process like strace.\n\n"
++);
++
++#else
++
++int debugger_signal(int status, pid_t pid){ return(0); }
++void child_signal(pid_t pid, int status){ }
++int init_ptrace_proxy(int idle_pid, int startup, int stop)
++{
++      printk(UM_KERN_ERR "debug requested when CONFIG_PT_PROXY is off\n");
++      kill_child_dead(idle_pid);
++      exit(1);
++}
++
++void signal_usr1(int sig)
++{
++      printk(UM_KERN_ERR "debug requested when CONFIG_PT_PROXY is off\n");
++}
++
++int attach_debugger(int idle_pid, int pid, int stop)
++{
++      printk(UM_KERN_ERR "attach_debugger called when CONFIG_PT_PROXY "
++             "is off\n");
++      return(-1);
++}
++
++int config_gdb(char *str)
++{
++      return(-1);
++}
++
++int remove_gdb(void)
++{
++      return(-1);
++}
++
++int init_parent_proxy(int pid)
++{
++      return(-1);
++}
++
++void debugger_parent_signal(int status, int pid)
++{
++}
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/gdb_kern.c um/arch/um/kernel/tt/gdb_kern.c
+--- orig/arch/um/kernel/tt/gdb_kern.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/gdb_kern.c    Sun Dec 15 21:16:17 2002
+@@ -0,0 +1,40 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/init.h"
++#include "linux/config.h"
++#include "mconsole_kern.h"
++
++#ifdef CONFIG_MCONSOLE
++
++extern int gdb_config(char *str);
++extern int gdb_remove(char *unused);
++
++static struct mc_device gdb_mc = {
++      .name           = "gdb",
++      .config         = gdb_config,
++      .remove         = gdb_remove,
++};
++
++int gdb_mc_init(void)
++{
++      mconsole_register_dev(&gdb_mc);
++      return(0);
++}
++
++__initcall(gdb_mc_init);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
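
gdb_mc above only registers a named pair of config/remove callbacks; the list handling and the lookup by name live in UML's mconsole code, which is not shown in this hunk. A standalone sketch of that registration-plus-lookup pattern (the list, the dispatcher, and the argument string are all invented for the demo):

#include <stdio.h>
#include <string.h>

struct mc_device_demo {
        const char *name;
        int (*config)(char *str);
        int (*remove)(char *unused);
        struct mc_device_demo *next;
};

static struct mc_device_demo *devices;

static void register_dev(struct mc_device_demo *dev)
{
        dev->next = devices;
        devices = dev;
}

static int dispatch_config(const char *name, char *arg)
{
        struct mc_device_demo *dev;

        for (dev = devices; dev != NULL; dev = dev->next)
                if (strcmp(dev->name, name) == 0)
                        return dev->config(arg);
        return -1;
}

static int gdb_config_demo(char *str)    { printf("gdb config: %s\n", str); return 0; }
static int gdb_remove_demo(char *unused) { (void) unused; return 0; }

static struct mc_device_demo gdb_mc_demo = {
        .name   = "gdb",
        .config = gdb_config_demo,
        .remove = gdb_remove_demo,
};

int main(void)
{
        char arg[] = "=pid,1234";

        register_dev(&gdb_mc_demo);
        return dispatch_config("gdb", arg);
}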
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/include/debug.h um/arch/um/kernel/tt/include/debug.h
+--- orig/arch/um/kernel/tt/include/debug.h     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/include/debug.h       Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,29 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002  Jeff Dike (jdike@karaya.com) and
++ * Lars Brinkhoff.
++ * Licensed under the GPL
++ */
++
++#ifndef __DEBUG_H
++#define __DEBUG_H
++
++extern int debugger_proxy(int status, pid_t pid);
++extern void child_proxy(pid_t pid, int status);
++extern void init_proxy (pid_t pid, int waiting, int status);
++extern int start_debugger(char *prog, int startup, int stop, int *debugger_fd);
++extern void fake_child_exit(void);
++extern int gdb_config(char *str);
++extern int gdb_remove(char *unused);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/include/mmu.h um/arch/um/kernel/tt/include/mmu.h
+--- orig/arch/um/kernel/tt/include/mmu.h       Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/include/mmu.h Sat Nov  9 12:51:32 2002
+@@ -0,0 +1,23 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __TT_MMU_H
++#define __TT_MMU_H
++
++struct mmu_context_tt {
++};
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/include/mode.h um/arch/um/kernel/tt/include/mode.h
+--- orig/arch/um/kernel/tt/include/mode.h      Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/include/mode.h        Mon Dec  9 00:34:40 2002
+@@ -0,0 +1,36 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __MODE_TT_H__
++#define __MODE_TT_H__
++
++#include "sysdep/ptrace.h"
++
++extern int tracing_pid;
++
++extern int tracer(int (*init_proc)(void *), void *sp);
++extern void user_time_init_tt(void);
++extern int copy_sc_from_user_tt(void *to_ptr, void *from_ptr, void *data);
++extern int copy_sc_to_user_tt(void *to_ptr, void *fp, void *from_ptr, 
++                            void *data);
++extern void sig_handler_common_tt(int sig, void *sc);
++extern void syscall_handler_tt(int sig, union uml_pt_regs *regs);
++extern void reboot_tt(void);
++extern void halt_tt(void);
++extern int is_tracer_winch(int pid, int fd, void *data);
++extern void kill_off_processes_tt(void);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/include/mode_kern.h um/arch/um/kernel/tt/include/mode_kern.h
+--- orig/arch/um/kernel/tt/include/mode_kern.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/include/mode_kern.h   Mon Dec 16 21:49:18 2002
+@@ -0,0 +1,52 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __TT_MODE_KERN_H__
++#define __TT_MODE_KERN_H__
++
++#include "linux/sched.h"
++#include "asm/page.h"
++#include "asm/ptrace.h"
++#include "asm/uaccess.h"
++
++extern void *_switch_to_tt(void *prev, void *next);
++extern void flush_thread_tt(void);
++extern void start_thread_tt(struct pt_regs *regs, unsigned long eip, 
++                         unsigned long esp);
++extern int copy_thread_tt(int nr, unsigned long clone_flags, unsigned long sp,
++                        unsigned long stack_top, struct task_struct *p, 
++                        struct pt_regs *regs);
++extern void release_thread_tt(struct task_struct *task);
++extern void exit_thread_tt(void);
++extern void initial_thread_cb_tt(void (*proc)(void *), void *arg);
++extern void init_idle_tt(void);
++extern void flush_tlb_kernel_vm_tt(void);
++extern void __flush_tlb_one_tt(unsigned long addr);
++extern void flush_tlb_range_tt(struct mm_struct *mm, unsigned long start, 
++                             unsigned long end);
++extern void flush_tlb_mm_tt(struct mm_struct *mm);
++extern void force_flush_all_tt(void);
++extern long execute_syscall_tt(void *r);
++extern void before_mem_tt(unsigned long brk_start);
++extern unsigned long set_task_sizes_tt(int arg, unsigned long *host_size_out, 
++                                     unsigned long *task_size_out);
++extern int start_uml_tt(void);
++extern int external_pid_tt(struct task_struct *task);
++extern int thread_pid_tt(struct thread_struct *thread);
++
++#define kmem_end_tt (host_task_size - ABOVE_KMEM)
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/include/ptrace-tt.h um/arch/um/kernel/tt/include/ptrace-tt.h
+--- orig/arch/um/kernel/tt/include/ptrace-tt.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/include/ptrace-tt.h   Fri Jan 17 13:23:30 2003
+@@ -0,0 +1,26 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __PTRACE_TT_H
++#define __PTRACE_TT_H
++
++#include "uml-config.h"
++
++#ifdef UML_CONFIG_MODE_TT
++#include "sysdep/sc.h"
++#endif
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/include/tt.h um/arch/um/kernel/tt/include/tt.h
+--- orig/arch/um/kernel/tt/include/tt.h        Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/include/tt.h  Fri Dec 20 23:29:11 2002
+@@ -0,0 +1,46 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __TT_H__
++#define __TT_H__
++
++#include "sysdep/ptrace.h"
++
++extern int gdb_pid;
++extern int debug;
++extern int debug_stop;
++extern int debug_trace;
++
++extern int honeypot;
++
++extern int fork_tramp(void *sig_stack);
++extern int do_proc_op(void *t, int proc_id);
++extern int tracer(int (*init_proc)(void *), void *sp);
++extern void attach_process(int pid);
++extern void tracer_panic(char *format, ...);
++extern void set_init_pid(int pid);
++extern int set_user_mode(void *task);
++extern void set_tracing(void *t, int tracing);
++extern int is_tracing(void *task);
++extern int singlestepping_tt(void *t);
++extern void clear_singlestep(void *t);
++extern void syscall_handler(int sig, union uml_pt_regs *regs);
++extern void exit_kernel(int pid, void *task);
++extern int do_syscall(void *task, int pid);
++extern int is_valid_pid(int pid);
++extern void remap_data(void *segment_start, void *segment_end, int w);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/include/uaccess.h um/arch/um/kernel/tt/include/uaccess.h
+--- orig/arch/um/kernel/tt/include/uaccess.h   Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/include/uaccess.h     Tue Mar 25 16:58:42 2003
+@@ -0,0 +1,122 @@
++/* 
++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __TT_UACCESS_H
++#define __TT_UACCESS_H
++
++#include "linux/string.h"
++#include "linux/sched.h"
++#include "asm/processor.h"
++#include "asm/errno.h"
++#include "asm/current.h"
++#include "asm/a.out.h"
++#include "uml_uaccess.h"
++
++#define ABOVE_KMEM (16 * 1024 * 1024)
++
++extern unsigned long end_vm;
++extern unsigned long uml_physmem;
++
++#define under_task_size(addr, size) \
++      (((unsigned long) (addr) < TASK_SIZE) && \
++         (((unsigned long) (addr) + (size)) < TASK_SIZE))
++
++#define is_stack(addr, size) \
++      (((unsigned long) (addr) < STACK_TOP) && \
++       ((unsigned long) (addr) >= STACK_TOP - ABOVE_KMEM) && \
++       (((unsigned long) (addr) + (size)) <= STACK_TOP))
++
++#define access_ok_tt(type, addr, size) \
++      ((type == VERIFY_READ) || (segment_eq(get_fs(), KERNEL_DS)) || \
++         (((unsigned long) (addr) <= ((unsigned long) (addr) + (size))) && \
++          (under_task_size(addr, size) || is_stack(addr, size))))
++
++static inline int verify_area_tt(int type, const void * addr, 
++                               unsigned long size)
++{
++      return(access_ok_tt(type, addr, size) ? 0 : -EFAULT);
++}
++
++extern unsigned long get_fault_addr(void);
++
++extern int __do_copy_from_user(void *to, const void *from, int n,
++                             void **fault_addr, void **fault_catcher);
++
++static inline int copy_from_user_tt(void *to, const void *from, int n)
++{
++      if(!access_ok_tt(VERIFY_READ, from, n)) 
++              return(n);
++
++      return(__do_copy_from_user(to, from, n, &current->thread.fault_addr,
++                                 &current->thread.fault_catcher));
++}
++
++static inline int copy_to_user_tt(void *to, const void *from, int n)
++{
++      if(!access_ok_tt(VERIFY_WRITE, to, n))
++              return(n);
++              
++      return(__do_copy_to_user(to, from, n, &current->thread.fault_addr,
++                               &current->thread.fault_catcher));
++}
++
++extern int __do_strncpy_from_user(char *dst, const char *src, size_t n,
++                                void **fault_addr, void **fault_catcher);
++
++static inline int strncpy_from_user_tt(char *dst, const char *src, int count)
++{
++      int n;
++
++      if(!access_ok_tt(VERIFY_READ, src, 1)) 
++              return(-EFAULT);
++
++      n = __do_strncpy_from_user(dst, src, count, 
++                                 &current->thread.fault_addr,
++                                 &current->thread.fault_catcher);
++      if(n < 0) return(-EFAULT);
++      return(n);
++}
++
++extern int __do_clear_user(void *mem, size_t len, void **fault_addr,
++                         void **fault_catcher);
++
++static inline int __clear_user_tt(void *mem, int len)
++{
++      return(__do_clear_user(mem, len,
++                             &current->thread.fault_addr,
++                             &current->thread.fault_catcher));
++}
++
++static inline int clear_user_tt(void *mem, int len)
++{
++      if(!access_ok_tt(VERIFY_WRITE, mem, len))
++              return(len);
++
++      return(__do_clear_user(mem, len, &current->thread.fault_addr,
++                             &current->thread.fault_catcher));
++}
++
++extern int __do_strnlen_user(const char *str, unsigned long n,
++                           void **fault_addr, void **fault_catcher);
++
++static inline int strnlen_user_tt(const void *str, int len)
++{
++      return(__do_strnlen_user(str, len,
++                               &current->thread.fault_addr,
++                               &current->thread.fault_catcher));
++}
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
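
A note on the range check above: access_ok_tt() rejects ranges whose end address wraps around before it ever compares them against TASK_SIZE or the stack window; without the "(addr) <= ((addr) + (size))" test, a huge size could wrap the end address back below TASK_SIZE and slip through. The following is a standalone restatement of that guard, a sketch only, with hypothetical names and forced 32-bit arithmetic (TASK_SIZE_DEMO is an assumed stand-in, not a value taken from this patch):

/* Sketch: restates the wraparound guard from access_ok_tt() with
 * hypothetical names; TASK_SIZE_DEMO is assumed, not from the patch. */
#include <stdint.h>
#include <stdio.h>

#define TASK_SIZE_DEMO 0xc0000000U

int range_ok(uint32_t addr, uint32_t size)
{
	if(addr > (uint32_t) (addr + size))	/* end wrapped past 2^32 */
		return(0);
	return((addr < TASK_SIZE_DEMO) && (addr + size < TASK_SIZE_DEMO));
}

int main(void)
{
	/* ordinary user-space range: accepted */
	printf("%d\n", range_ok(0x08048000U, 0x1000U));
	/* size large enough to wrap the end address: rejected by the guard */
	printf("%d\n", range_ok(0x00001000U, 0xffffffffU));
	return(0);
}
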
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ksyms.c um/arch/um/kernel/tt/ksyms.c
+--- orig/arch/um/kernel/tt/ksyms.c     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/ksyms.c       Sun Oct 27 17:01:56 2002
+@@ -0,0 +1,28 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/module.h"
++#include "asm/uaccess.h"
++#include "mode.h"
++
++EXPORT_SYMBOL(__do_copy_from_user);
++EXPORT_SYMBOL(__do_copy_to_user);
++EXPORT_SYMBOL(__do_strncpy_from_user);
++EXPORT_SYMBOL(__do_strnlen_user); 
++EXPORT_SYMBOL(__do_clear_user);
++
++EXPORT_SYMBOL(tracing_pid);
++EXPORT_SYMBOL(honeypot);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/mem.c um/arch/um/kernel/tt/mem.c
+--- orig/arch/um/kernel/tt/mem.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/mem.c Mon Dec 16 21:49:51 2002
+@@ -0,0 +1,51 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/stddef.h"
++#include "linux/config.h"
++#include "linux/mm.h"
++#include "asm/uaccess.h"
++#include "mem_user.h"
++#include "kern_util.h"
++#include "user_util.h"
++#include "kern.h"
++#include "tt.h"
++
++void before_mem_tt(unsigned long brk_start)
++{
++      if(!jail || debug)
++              remap_data(UML_ROUND_DOWN(&_stext), UML_ROUND_UP(&_etext), 1);
++      remap_data(UML_ROUND_DOWN(&_sdata), UML_ROUND_UP(&_edata), 1);
++      remap_data(UML_ROUND_DOWN(&__bss_start), UML_ROUND_UP(brk_start), 1);
++}
++
++#ifdef CONFIG_HOST_2G_2G
++#define TOP 0x80000000
++#else
++#define TOP 0xc0000000
++#endif
++
++#define SIZE ((CONFIG_NEST_LEVEL + CONFIG_KERNEL_HALF_GIGS) * 0x20000000)
++#define START (TOP - SIZE)
++
++unsigned long set_task_sizes_tt(int arg, unsigned long *host_size_out, 
++                              unsigned long *task_size_out)
++{
++      /* Round up to the nearest 4M */
++      *host_size_out = ROUND_4M((unsigned long) &arg);
++      *task_size_out = START;
++      return(START);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
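
To make the TOP/SIZE/START arithmetic in mem.c above concrete, take one assumed configuration, CONFIG_NEST_LEVEL = 0 and CONFIG_KERNEL_HALF_GIGS = 1 on a 3G/1G host: SIZE comes out to 0x20000000 and START to 0xc0000000 - 0x20000000 = 0xa0000000, which is what set_task_sizes_tt() returns as the task size. A throwaway sketch that just evaluates those constants under that assumption:

/* Sketch: evaluates the TOP/SIZE/START arithmetic for one assumed
 * configuration (CONFIG_NEST_LEVEL=0, CONFIG_KERNEL_HALF_GIGS=1, 3G/1G
 * host); the values are illustrative, not from any particular .config. */
#include <stdio.h>

#define TOP		0xc0000000UL
#define NEST_LEVEL	0UL
#define KERNEL_HALF_GIGS 1UL
#define SIZE		((NEST_LEVEL + KERNEL_HALF_GIGS) * 0x20000000UL)
#define START		(TOP - SIZE)

int main(void)
{
	printf("SIZE  = 0x%lx\n", SIZE);	/* 0x20000000 */
	printf("START = 0x%lx\n", START);	/* 0xa0000000 */
	return(0);
}
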
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/mem_user.c um/arch/um/kernel/tt/mem_user.c
+--- orig/arch/um/kernel/tt/mem_user.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/mem_user.c    Fri Jan 17 22:07:31 2003
+@@ -0,0 +1,50 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdlib.h>
++#include <stdio.h>
++#include <unistd.h>
++#include <string.h>
++#include <errno.h>
++#include <sys/mman.h>
++#include "tt.h"
++#include "mem_user.h"
++#include "user_util.h"
++
++void remap_data(void *segment_start, void *segment_end, int w)
++{
++      void *addr;
++      unsigned long size;
++      int data, prot;
++
++      if(w) prot = PROT_WRITE;
++      else prot = 0;
++      prot |= PROT_READ | PROT_EXEC;
++      size = (unsigned long) segment_end - 
++              (unsigned long) segment_start;
++      data = create_mem_file(size);
++      if((addr = mmap(NULL, size, PROT_WRITE | PROT_READ, 
++                      MAP_SHARED, data, 0)) == MAP_FAILED){
++              perror("mapping new data segment");
++              exit(1);
++      }
++      memcpy(addr, segment_start, size);
++      if(switcheroo(data, prot, addr, segment_start, 
++                    size) < 0){
++              printf("switcheroo failed\n");
++              exit(1);
++      }
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/process_kern.c um/arch/um/kernel/tt/process_kern.c
+--- orig/arch/um/kernel/tt/process_kern.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/process_kern.c        Sun Feb 16 21:34:23 2003
+@@ -0,0 +1,516 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/sched.h"
++#include "linux/signal.h"
++#include "linux/kernel.h"
++#include "asm/system.h"
++#include "asm/pgalloc.h"
++#include "asm/ptrace.h"
++#include "irq_user.h"
++#include "signal_user.h"
++#include "kern_util.h"
++#include "user_util.h"
++#include "os.h"
++#include "kern.h"
++#include "sigcontext.h"
++#include "time_user.h"
++#include "mem_user.h"
++#include "tlb.h"
++#include "mode.h"
++#include "init.h"
++#include "tt.h"
++
++void *_switch_to_tt(void *prev, void *next)
++{
++      struct task_struct *from, *to;
++      unsigned long flags;
++      int err, vtalrm, alrm, prof, cpu;
++      char c;
++      /* jailing and SMP are incompatible, so this doesn't need to be 
++       * made per-cpu 
++       */
++      static int reading;
++
++      from = prev;
++      to = next;
++
++      to->thread.prev_sched = from;
++
++      cpu = from->processor;
++      if(cpu == 0)
++              forward_interrupts(to->thread.mode.tt.extern_pid);
++#ifdef CONFIG_SMP
++      forward_ipi(cpu_data[cpu].ipi_pipe[0], to->thread.mode.tt.extern_pid);
++#endif
++      local_irq_save(flags);
++
++      vtalrm = change_sig(SIGVTALRM, 0);
++      alrm = change_sig(SIGALRM, 0);
++      prof = change_sig(SIGPROF, 0);
++
++      c = 0;
++      set_current(to);
++
++      reading = 0;
++      err = os_write_file(to->thread.mode.tt.switch_pipe[1], &c, sizeof(c));
++      if(err != sizeof(c))
++              panic("write of switch_pipe failed, errno = %d", -err);
++
++      reading = 1;
++      if(from->state == TASK_ZOMBIE)
++              os_kill_process(os_getpid(), 0);
++
++      err = os_read_file(from->thread.mode.tt.switch_pipe[0], &c, sizeof(c));
++      if(err != sizeof(c))
++              panic("read of switch_pipe failed, errno = %d", -err);
++
++      /* This works around a nasty race with 'jail'.  If we are switching
++       * between two threads of a threaded app and the incoming process 
++       * runs before the outgoing process reaches the read, and it makes
++       * it all the way out to userspace, then it will have write-protected 
++       * the outgoing process stack.  Then, when the outgoing process 
++       * returns from the write, it will segfault because it can no longer
++       * write its own stack.  So, in order to avoid that, the incoming 
++       * thread sits in a loop yielding until 'reading' is set.  This 
++       * isn't entirely safe, since there may be a reschedule from a timer
++       * happening between setting 'reading' and sleeping in read.  But,
++       * it should get a whole quantum in which to reach the read and sleep,
++       * which should be enough.
++       */
++
++      if(jail){
++              while(!reading) sched_yield();
++      }
++
++      change_sig(SIGVTALRM, vtalrm);
++      change_sig(SIGALRM, alrm);
++      change_sig(SIGPROF, prof);
++
++      arch_switch();
++
++      flush_tlb_all();
++      local_irq_restore(flags);
++
++      return(current->thread.prev_sched);
++}
++
++void release_thread_tt(struct task_struct *task)
++{
++      os_kill_process(task->thread.mode.tt.extern_pid, 0);
++}
++
++void exit_thread_tt(void)
++{
++      close(current->thread.mode.tt.switch_pipe[0]);
++      close(current->thread.mode.tt.switch_pipe[1]);
++}
++
++extern void schedule_tail(struct task_struct *prev);
++
++static void new_thread_handler(int sig)
++{
++      int (*fn)(void *);
++      void *arg;
++
++      fn = current->thread.request.u.thread.proc;
++      arg = current->thread.request.u.thread.arg;
++      UPT_SC(&current->thread.regs.regs) = (void *) (&sig + 1);
++      suspend_new_thread(current->thread.mode.tt.switch_pipe[0]);
++
++      init_new_thread_signals(1);
++      enable_timer();
++      free_page(current->thread.temp_stack);
++      set_cmdline("(kernel thread)");
++      force_flush_all();
++
++      if(current->thread.prev_sched != NULL)
++              schedule_tail(current->thread.prev_sched);
++      current->thread.prev_sched = NULL;
++
++      change_sig(SIGUSR1, 1);
++      change_sig(SIGVTALRM, 1);
++      change_sig(SIGPROF, 1);
++      sti();
++      if(!run_kernel_thread(fn, arg, &current->thread.exec_buf))
++              do_exit(0);
++}
++
++static int new_thread_proc(void *stack)
++{
++      cli();
++      init_new_thread_stack(stack, new_thread_handler);
++      os_usr1_process(os_getpid());
++      return(0);
++}
++
++/* Signal masking - signals are blocked at the start of fork_tramp.  They
++ * are re-enabled when finish_fork_handler is entered by fork_tramp hitting
++ * itself with a SIGUSR1.  set_user_mode has to be run with SIGUSR1 off,
++ * so it is blocked before it's called.  They are re-enabled on sigreturn
++ * despite the fact that they were blocked when the SIGUSR1 was issued because
++ * copy_thread copies the parent's sigcontext, including the signal mask
++ * onto the signal frame.
++ */
++
++static void finish_fork_handler(int sig)
++{
++      UPT_SC(&current->thread.regs.regs) = (void *) (&sig + 1);
++      suspend_new_thread(current->thread.mode.tt.switch_pipe[0]);
++      
++      init_new_thread_signals(1);
++      enable_timer();
++      sti();
++      force_flush_all();
++      if(current->mm != current->p_pptr->mm)
++              protect_memory(uml_reserved, high_physmem - uml_reserved, 1, 
++                             1, 0, 1);
++      task_protections((unsigned long) current);
++
++      if(current->thread.prev_sched != NULL)
++              schedule_tail(current->thread.prev_sched);
++      current->thread.prev_sched = NULL;
++
++      free_page(current->thread.temp_stack);
++      cli();
++      change_sig(SIGUSR1, 0);
++      set_user_mode(current);
++}
++
++int fork_tramp(void *stack)
++{
++      cli();
++      init_new_thread_stack(stack, finish_fork_handler);
++      os_usr1_process(os_getpid());
++      return(0);
++}
++
++int copy_thread_tt(int nr, unsigned long clone_flags, unsigned long sp,
++                 unsigned long stack_top, struct task_struct * p, 
++                 struct pt_regs *regs)
++{
++      int (*tramp)(void *);
++      int new_pid, err;
++      unsigned long stack;
++      
++      if(current->thread.forking)
++              tramp = fork_tramp;
++      else {
++              tramp = new_thread_proc;
++              p->thread.request.u.thread = current->thread.request.u.thread;
++      }
++
++      err = os_pipe(p->thread.mode.tt.switch_pipe, 1, 1);
++      if(err){
++              printk("copy_thread : pipe failed, errno = %d\n", -err);
++              return(err);
++      }
++
++      stack = alloc_stack(0, 0);
++      if(stack == 0){
++              printk(KERN_ERR "copy_thread : failed to allocate "
++                     "temporary stack\n");
++              return(-ENOMEM);
++      }
++
++      clone_flags &= CLONE_VM;
++      p->thread.temp_stack = stack;
++      new_pid = start_fork_tramp((void *) p->thread.kernel_stack, stack,
++                                 clone_flags, tramp);
++      if(new_pid < 0){
++              printk(KERN_ERR "copy_thread : clone failed - errno = %d\n", 
++                     -new_pid);
++              return(new_pid);
++      }
++
++      if(current->thread.forking){
++              sc_to_sc(UPT_SC(&p->thread.regs.regs), 
++                       UPT_SC(&current->thread.regs.regs));
++              SC_SET_SYSCALL_RETURN(UPT_SC(&p->thread.regs.regs), 0);
++              if(sp != 0) SC_SP(UPT_SC(&p->thread.regs.regs)) = sp;
++      }
++      p->thread.mode.tt.extern_pid = new_pid;
++
++      current->thread.request.op = OP_FORK;
++      current->thread.request.u.fork.pid = new_pid;
++      os_usr1_process(os_getpid());
++      return(0);
++}
++
++void reboot_tt(void)
++{
++      current->thread.request.op = OP_REBOOT;
++      os_usr1_process(os_getpid());
++      os_kill_process(os_getpid(), 0);
++}
++
++void halt_tt(void)
++{
++      current->thread.request.op = OP_HALT;
++      os_usr1_process(os_getpid());
++      os_kill_process(os_getpid(), 0);
++}
++
++void kill_off_processes_tt(void)
++{
++      struct task_struct *p;
++      int me;
++
++      me = os_getpid();
++      for_each_task(p){
++              int pid = p->thread.mode.tt.extern_pid;
++              if((pid != me) && (pid != -1))
++                      os_kill_process(p->thread.mode.tt.extern_pid, 0);
++      }
++      if((init_task.thread.mode.tt.extern_pid != me) &&
++         (init_task.thread.mode.tt.extern_pid != -1))
++              os_kill_process(init_task.thread.mode.tt.extern_pid, 0);
++}
++
++void initial_thread_cb_tt(void (*proc)(void *), void *arg)
++{
++      if(os_getpid() == tracing_pid){
++              (*proc)(arg);
++      }
++      else {
++              current->thread.request.op = OP_CB;
++              current->thread.request.u.cb.proc = proc;
++              current->thread.request.u.cb.arg = arg;
++              os_usr1_process(os_getpid());
++      }
++}
++
++int do_proc_op(void *t, int proc_id)
++{
++      struct task_struct *task;
++      struct thread_struct *thread;
++      int op, pid;
++
++      task = t;
++      thread = &task->thread;
++      op = thread->request.op;
++      switch(op){
++      case OP_NONE:
++      case OP_TRACE_ON:
++              break;
++      case OP_EXEC:
++              pid = thread->request.u.exec.pid;
++              do_exec(thread->mode.tt.extern_pid, pid);
++              thread->mode.tt.extern_pid = pid;
++              cpu_tasks[task->processor].pid = pid;
++              break;
++      case OP_FORK:
++              attach_process(thread->request.u.fork.pid);
++              break;
++      case OP_CB:
++              (*thread->request.u.cb.proc)(thread->request.u.cb.arg);
++              break;
++      case OP_REBOOT:
++      case OP_HALT:
++              break;
++      default:
++              tracer_panic("Bad op in do_proc_op");
++              break;
++      }
++      thread->request.op = OP_NONE;
++      return(op);
++}
++
++void init_idle_tt(void)
++{
++      idle_timer();
++}
++
++/* Changed by jail_setup, which is a setup */
++int jail = 0;
++
++int __init jail_setup(char *line, int *add)
++{
++      int ok = 1;
++
++      if(jail) return(0);
++#ifdef CONFIG_SMP
++	printf("'jail' may not be used in a kernel with CONFIG_SMP "
++             "enabled\n");
++      ok = 0;
++#endif
++#ifdef CONFIG_HOSTFS
++	printf("'jail' may not be used in a kernel with CONFIG_HOSTFS "
++             "enabled\n");
++      ok = 0;
++#endif
++#ifdef CONFIG_MODULES
++	printf("'jail' may not be used in a kernel with CONFIG_MODULES "
++             "enabled\n");
++      ok = 0;
++#endif        
++      if(!ok) exit(1);
++
++      /* CAP_SYS_RAWIO controls the ability to open /dev/mem and /dev/kmem.
++       * Removing it from the bounding set eliminates the ability of anything
++       * to acquire it, and thus read or write kernel memory.
++       */
++      cap_lower(cap_bset, CAP_SYS_RAWIO);
++      jail = 1;
++      return(0);
++}
++
++__uml_setup("jail", jail_setup,
++"jail\n"
++"    Enables the protection of kernel memory from processes.\n\n"
++);
++
++static void mprotect_kernel_mem(int w)
++{
++      unsigned long start, end;
++      int pages;
++
++      if(!jail || (current == &init_task)) return;
++
++      pages = (1 << CONFIG_KERNEL_STACK_ORDER);
++
++      start = (unsigned long) current + PAGE_SIZE;
++      end = (unsigned long) current + PAGE_SIZE * pages;
++      protect_memory(uml_reserved, start - uml_reserved, 1, w, 1, 1);
++      protect_memory(end, high_physmem - end, 1, w, 1, 1);
++
++      start = (unsigned long) UML_ROUND_DOWN(&_stext);
++      end = (unsigned long) UML_ROUND_UP(&_etext);
++      protect_memory(start, end - start, 1, w, 1, 1);
++
++      start = (unsigned long) UML_ROUND_DOWN(&_unprotected_end);
++      end = (unsigned long) UML_ROUND_UP(&_edata);
++      protect_memory(start, end - start, 1, w, 1, 1);
++
++      start = (unsigned long) UML_ROUND_DOWN(&__bss_start);
++      end = (unsigned long) UML_ROUND_UP(brk_start);
++      protect_memory(start, end - start, 1, w, 1, 1);
++
++      mprotect_kernel_vm(w);
++}
++
++void unprotect_kernel_mem(void)
++{
++      mprotect_kernel_mem(1);
++}
++
++void protect_kernel_mem(void)
++{
++      mprotect_kernel_mem(0);
++}
++
++extern void start_kernel(void);
++
++static int start_kernel_proc(void *unused)
++{
++      int pid;
++
++      block_signals();
++      pid = os_getpid();
++
++      cpu_tasks[0].pid = pid;
++      cpu_tasks[0].task = current;
++#ifdef CONFIG_SMP
++      cpu_online_map = 1;
++#endif
++      if(debug) os_stop_process(pid);
++      start_kernel();
++      return(0);
++}
++
++void set_tracing(void *task, int tracing)
++{
++      ((struct task_struct *) task)->thread.mode.tt.tracing = tracing;
++}
++
++int is_tracing(void *t)
++{
++      return (((struct task_struct *) t)->thread.mode.tt.tracing);
++}
++
++int set_user_mode(void *t)
++{
++      struct task_struct *task;
++
++      task = t ? t : current;
++      if(task->thread.mode.tt.tracing) 
++              return(1);
++      task->thread.request.op = OP_TRACE_ON;
++      os_usr1_process(os_getpid());
++      return(0);
++}
++
++void set_init_pid(int pid)
++{
++      int err;
++
++      init_task.thread.mode.tt.extern_pid = pid;
++      err = os_pipe(init_task.thread.mode.tt.switch_pipe, 1, 1);
++      if(err) panic("Can't create switch pipe for init_task, errno = %d", 
++                    err);
++}
++
++int singlestepping_tt(void *t)
++{
++      struct task_struct *task = t;
++
++      if(task->thread.mode.tt.singlestep_syscall)
++              return(0);
++      return(task->ptrace & PT_DTRACE);
++}
++
++void clear_singlestep(void *t)
++{
++      struct task_struct *task = t;
++
++      task->ptrace &= ~PT_DTRACE;
++}
++
++int start_uml_tt(void)
++{
++      void *sp;
++      int pages;
++
++      pages = (1 << CONFIG_KERNEL_STACK_ORDER) - 2;
++      sp = (void *) init_task.thread.kernel_stack + pages * PAGE_SIZE - 
++              sizeof(unsigned long);
++      return(tracer(start_kernel_proc, sp));
++}
++
++int external_pid_tt(struct task_struct *task)
++{
++      return(task->thread.mode.tt.extern_pid);
++}
++
++int thread_pid_tt(struct thread_struct *thread)
++{
++      return(thread->mode.tt.extern_pid);
++}
++
++int is_valid_pid(int pid)
++{
++      struct task_struct *task;
++
++        read_lock(&tasklist_lock);
++        for_each_task(task){
++                if(task->thread.mode.tt.extern_pid == pid){
++                      read_unlock(&tasklist_lock);
++                      return(1);
++                }
++        }
++      read_unlock(&tasklist_lock);
++      return(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
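
The heart of _switch_to_tt() above is a pipe handshake: the outgoing thread writes one byte into the incoming thread's switch_pipe and then blocks reading its own, so exactly one UML thread is runnable at a time. Below is a minimal standalone sketch of that handshake idea using two ordinary processes; the names are made up for illustration and nothing here comes from the patch itself:

/* Sketch: two processes hand control back and forth the way _switch_to_tt()
 * hands the CPU between UML threads; each blocks in read() on its own pipe
 * until the other side writes a byte to it. */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
	int to_parent[2], to_child[2];
	char c = 0;
	int i;

	if((pipe(to_parent) < 0) || (pipe(to_child) < 0)){
		perror("pipe");
		exit(1);
	}
	if(fork() == 0){
		for(i = 0; i < 3; i++){
			read(to_child[0], &c, 1);	/* sleep until handed control */
			printf("child runs\n");
			write(to_parent[1], &c, 1);	/* hand control back */
		}
		exit(0);
	}
	for(i = 0; i < 3; i++){
		printf("parent runs\n");
		write(to_child[1], &c, 1);		/* wake the child ... */
		read(to_parent[0], &c, 1);		/* ... and block until it yields */
	}
	wait(NULL);
	return(0);
}

In the patch itself the same pattern runs through os_write_file() and os_read_file() on switch_pipe, with the extra 'reading' flag covering the jail race described in the comment above.
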
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ptproxy/Makefile um/arch/um/kernel/tt/ptproxy/Makefile
+--- orig/arch/um/kernel/tt/ptproxy/Makefile    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/ptproxy/Makefile      Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,12 @@
++O_TARGET = ptproxy.o
++
++obj-y = proxy.o ptrace.o sysdep.o wait.o
++
++USER_OBJS = $(obj-y)
++
++include $(TOPDIR)/Rules.make
++
++$(USER_OBJS) : %.o: %.c
++      $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $<
++
++clean:
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ptproxy/proxy.c um/arch/um/kernel/tt/ptproxy/proxy.c
+--- orig/arch/um/kernel/tt/ptproxy/proxy.c     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/ptproxy/proxy.c       Wed Apr 16 14:01:03 2003
+@@ -0,0 +1,370 @@
++/**********************************************************************
++proxy.c
++
++Copyright (C) 1999 Lars Brinkhoff.  See the file COPYING for licensing
++terms and conditions.
++
++Jeff Dike (jdike@karaya.com) : Modified for integration into uml
++**********************************************************************/
++
++/* XXX This file shouldn't refer to CONFIG_* */
++
++#include <errno.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <signal.h>
++#include <string.h>
++#include <fcntl.h>
++#include <termios.h>
++#include <sys/wait.h>
++#include <sys/types.h>
++#include <sys/ptrace.h>
++#include <sys/ioctl.h>
++#include <asm/unistd.h>
++
++#include "ptproxy.h"
++#include "sysdep.h"
++#include "wait.h"
++
++#include "user_util.h"
++#include "user.h"
++#include "os.h"
++#include "tempfile.h"
++
++static int debugger_wait(debugger_state *debugger, int *status, int options,
++                       int (*syscall)(debugger_state *debugger, pid_t child),
++                       int (*normal_return)(debugger_state *debugger, 
++                                            pid_t unused),
++                       int (*wait_return)(debugger_state *debugger, 
++                                          pid_t unused))
++{
++      if(debugger->real_wait){
++              debugger->handle_trace = normal_return;
++              syscall_continue(debugger->pid);
++              debugger->real_wait = 0;
++              return(1);
++      }
++      debugger->wait_status_ptr = status;
++      debugger->wait_options = options;
++      if((debugger->debugee != NULL) && debugger->debugee->event){
++              syscall_continue(debugger->pid);
++              wait_for_stop(debugger->pid, SIGTRAP, PTRACE_SYSCALL,
++                            NULL);
++              (*wait_return)(debugger, -1);
++              return(0);
++      }
++      else if(debugger->wait_options & WNOHANG){
++              syscall_cancel(debugger->pid, 0);
++              debugger->handle_trace = syscall;
++              return(0);
++      }
++      else {
++              syscall_pause(debugger->pid);
++              debugger->handle_trace = wait_return;
++              debugger->waiting = 1;
++      }
++      return(1);
++}
++
++/*
++ * Handle debugger trap, i.e. syscall.
++ */
++
++int debugger_syscall(debugger_state *debugger, pid_t child)
++{
++      long arg1, arg2, arg3, arg4, arg5, result;
++      int syscall, ret = 0;
++
++      syscall = get_syscall(debugger->pid, &arg1, &arg2, &arg3, &arg4, 
++                            &arg5);
++
++      switch(syscall){
++      case __NR_execve:
++              /* execve never returns */
++              debugger->handle_trace = debugger_syscall; 
++              break;
++
++      case __NR_ptrace:
++              if(debugger->debugee->pid != 0) arg2 = debugger->debugee->pid;
++              if(!debugger->debugee->in_context) 
++                      child = debugger->debugee->pid;
++              result = proxy_ptrace(debugger, arg1, arg2, arg3, arg4, child,
++                                    &ret);
++              syscall_cancel(debugger->pid, result);
++              debugger->handle_trace = debugger_syscall;
++              return(ret);
++
++      case __NR_waitpid:
++      case __NR_wait4:
++              if(!debugger_wait(debugger, (int *) arg2, arg3, 
++                                debugger_syscall, debugger_normal_return, 
++                                proxy_wait_return))
++                      return(0);
++              break;
++
++      case __NR_kill:
++              if(!debugger->debugee->in_context) 
++                      child = debugger->debugee->pid;
++              if(arg1 == debugger->debugee->pid){
++                      result = kill(child, arg2);
++                      syscall_cancel(debugger->pid, result);
++                      debugger->handle_trace = debugger_syscall;
++                      return(0);
++              }
++              else debugger->handle_trace = debugger_normal_return;
++              break;
++
++      default:
++              debugger->handle_trace = debugger_normal_return;
++      }
++
++      syscall_continue(debugger->pid);
++      return(0);
++}
++
++/* Used by the tracing thread */
++static debugger_state parent;
++static int parent_syscall(debugger_state *debugger, int pid);
++
++int init_parent_proxy(int pid)
++{
++      parent = ((debugger_state) { .pid               = pid,
++                                   .wait_options      = 0,
++                                   .wait_status_ptr   = NULL,
++                                   .waiting           = 0,
++                                   .real_wait         = 0,
++                                   .expecting_child   = 0,
++                                   .handle_trace      = parent_syscall,
++                                   .debugee           = NULL } );
++      return(0);
++}
++
++int parent_normal_return(debugger_state *debugger, pid_t unused)
++{
++      debugger->handle_trace = parent_syscall;
++      syscall_continue(debugger->pid);
++      return(0);
++}
++
++static int parent_syscall(debugger_state *debugger, int pid)
++{
++      long arg1, arg2, arg3, arg4, arg5;
++      int syscall;
++
++      syscall = get_syscall(pid, &arg1, &arg2, &arg3, &arg4, &arg5);
++              
++      if((syscall == __NR_waitpid) || (syscall == __NR_wait4)){
++              debugger_wait(&parent, (int *) arg2, arg3, parent_syscall,
++                            parent_normal_return, parent_wait_return);
++      }
++      else ptrace(PTRACE_SYSCALL, pid, 0, 0);
++      return(0);
++}
++
++int debugger_normal_return(debugger_state *debugger, pid_t unused)
++{
++      debugger->handle_trace = debugger_syscall;
++      syscall_continue(debugger->pid);
++      return(0);
++}
++
++void debugger_cancelled_return(debugger_state *debugger, int result)
++{
++      debugger->handle_trace = debugger_syscall;
++      syscall_set_result(debugger->pid, result);
++      syscall_continue(debugger->pid);
++}
++
++/* Used by the tracing thread */
++static debugger_state debugger;
++static debugee_state debugee;
++
++void init_proxy (pid_t debugger_pid, int stopped, int status)
++{
++      debugger.pid = debugger_pid;
++      debugger.handle_trace = debugger_syscall;
++      debugger.debugee = &debugee;
++      debugger.waiting = 0;
++      debugger.real_wait = 0;
++      debugger.expecting_child = 0;
++
++      debugee.pid = 0;
++      debugee.traced = 0;
++      debugee.stopped = stopped;
++      debugee.event = 0;
++      debugee.zombie = 0;
++      debugee.died = 0;
++      debugee.wait_status = status;
++      debugee.in_context = 1;
++}
++
++int debugger_proxy(int status, int pid)
++{
++      int ret = 0, sig;
++
++      if(WIFSTOPPED(status)){
++              sig = WSTOPSIG(status);
++              if (sig == SIGTRAP)
++                      ret = (*debugger.handle_trace)(&debugger, pid);
++                                                     
++              else if(sig == SIGCHLD){
++                      if(debugger.expecting_child){
++                              ptrace(PTRACE_SYSCALL, debugger.pid, 0, sig);
++                              debugger.expecting_child = 0;
++                      }
++                      else if(debugger.waiting)
++                              real_wait_return(&debugger);
++                      else {
++                              ptrace(PTRACE_SYSCALL, debugger.pid, 0, sig);
++                              debugger.real_wait = 1;
++                      }
++              }
++              else ptrace(PTRACE_SYSCALL, debugger.pid, 0, sig);
++      }
++      else if(WIFEXITED(status)){
++              tracer_panic("debugger (pid %d) exited with status %d", 
++                           debugger.pid, WEXITSTATUS(status));
++      }
++      else if(WIFSIGNALED(status)){
++              tracer_panic("debugger (pid %d) exited with signal %d", 
++                           debugger.pid, WTERMSIG(status));
++      }
++      else {
++              tracer_panic("proxy got unknown status (0x%x) on debugger "
++                           "(pid %d)", status, debugger.pid);
++      }
++      return(ret);
++}
++
++void child_proxy(pid_t pid, int status)
++{
++      debugee.event = 1;
++      debugee.wait_status = status;
++
++      if(WIFSTOPPED(status)){
++              debugee.stopped = 1;
++              debugger.expecting_child = 1;
++              kill(debugger.pid, SIGCHLD);
++      }
++      else if(WIFEXITED(status) || WIFSIGNALED(status)){
++              debugee.zombie = 1;
++              debugger.expecting_child = 1;
++              kill(debugger.pid, SIGCHLD);
++      }
++      else panic("proxy got unknown status (0x%x) on child (pid %d)", 
++                 status, pid);
++}
++
++void debugger_parent_signal(int status, int pid)
++{
++      int sig;
++
++      if(WIFSTOPPED(status)){
++              sig = WSTOPSIG(status);
++              if(sig == SIGTRAP) (*parent.handle_trace)(&parent, pid);
++              else ptrace(PTRACE_SYSCALL, pid, 0, sig);
++      }
++}
++
++void fake_child_exit(void)
++{
++      int status, pid;
++
++      child_proxy(1, W_EXITCODE(0, 0));
++      while(debugger.waiting == 1){
++              pid = waitpid(debugger.pid, &status, WUNTRACED);
++              if(pid != debugger.pid){
++                      printk("fake_child_exit - waitpid failed, "
++                             "errno = %d\n", errno);
++                      return;
++              }
++              debugger_proxy(status, debugger.pid);
++      }
++      pid = waitpid(debugger.pid, &status, WUNTRACED);
++      if(pid != debugger.pid){
++              printk("fake_child_exit - waitpid failed, "
++                     "errno = %d\n", errno);
++              return;
++      }
++      if(ptrace(PTRACE_DETACH, debugger.pid, 0, SIGCONT) < 0)
++              printk("fake_child_exit - PTRACE_DETACH failed, errno = %d\n",
++                     errno);
++}
++
++char gdb_init_string[] = 
++"att 1 \n\
++b panic \n\
++b stop \n\
++handle SIGWINCH nostop noprint pass \n\
++";
++
++int start_debugger(char *prog, int startup, int stop, int *fd_out)
++{
++      int slave, child;
++
++      slave = open_gdb_chan();
++      if((child = fork()) == 0){
++              char *tempname = NULL;
++              int fd;
++
++              if(setsid() < 0) perror("setsid");
++              if((dup2(slave, 0) < 0) || (dup2(slave, 1) < 0) || 
++                 (dup2(slave, 2) < 0)){
++                      printk("start_debugger : dup2 failed, errno = %d\n",
++                             errno);
++                      exit(1);
++              }
++              if(ioctl(0, TIOCSCTTY, 0) < 0){
++                      printk("start_debugger : TIOCSCTTY failed, "
++                             "errno = %d\n", errno);
++                      exit(1);
++              }
++              if(tcsetpgrp (1, os_getpid()) < 0){
++                      printk("start_debugger : tcsetpgrp failed, "
++                             "errno = %d\n", errno);
++#ifdef notdef
++                      exit(1);
++#endif
++              }
++              if((fd = make_tempfile("/tmp/gdb_init-XXXXXX", &tempname, 0)) < 0){
++                      printk("start_debugger : make_tempfile failed, errno = %d\n",
++                             errno);
++                      exit(1);
++              }
++              write(fd, gdb_init_string, sizeof(gdb_init_string) - 1);
++              if(startup){
++                      if(stop){
++                              write(fd, "b start_kernel\n",
++                                    strlen("b start_kernel\n"));
++                      }
++                      write(fd, "c\n", strlen("c\n"));
++              }
++              if(ptrace(PTRACE_TRACEME, 0, 0, 0) < 0){
++                      printk("start_debugger :  PTRACE_TRACEME failed, "
++                             "errno = %d\n", errno);
++                      exit(1);
++              }
++              execlp("gdb", "gdb", "--command", tempname, prog, NULL);
++              printk("start_debugger : exec of gdb failed, errno = %d\n",
++                     errno);
++      }
++      if(child < 0){
++              printk("start_debugger : fork for gdb failed, errno = %d\n",
++                     errno);
++              return(-1);
++      }
++      *fd_out = slave;
++      return(child);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ptproxy/ptproxy.h um/arch/um/kernel/tt/ptproxy/ptproxy.h
+--- orig/arch/um/kernel/tt/ptproxy/ptproxy.h   Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/ptproxy/ptproxy.h     Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,61 @@
++/**********************************************************************
++ptproxy.h
++
++Copyright (C) 1999 Lars Brinkhoff.  See the file COPYING for licensing
++terms and conditions.
++**********************************************************************/
++
++#ifndef __PTPROXY_H
++#define __PTPROXY_H
++
++#include <sys/types.h>
++
++typedef struct debugger debugger_state;
++typedef struct debugee debugee_state;
++
++struct debugger
++{
++      pid_t pid;
++      int wait_options;
++      int *wait_status_ptr;
++      unsigned int waiting : 1;
++      unsigned int real_wait : 1;
++      unsigned int expecting_child : 1;
++      int (*handle_trace) (debugger_state *, pid_t);
++
++      debugee_state *debugee;
++};
++
++struct debugee
++{
++      pid_t pid;
++      int wait_status;
++      unsigned int died : 1;
++      unsigned int event : 1;
++      unsigned int stopped : 1;
++      unsigned int trace_singlestep : 1;
++      unsigned int trace_syscall : 1;
++      unsigned int traced : 1;
++      unsigned int zombie : 1;
++      unsigned int in_context : 1;
++};
++
++extern int debugger_syscall(debugger_state *debugger, pid_t pid);
++extern int debugger_normal_return (debugger_state *debugger, pid_t unused);
++
++extern long proxy_ptrace (struct debugger *, int, pid_t, long, long, pid_t,
++                        int *strace_out);
++extern void debugger_cancelled_return(debugger_state *debugger, int result);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ptproxy/ptrace.c um/arch/um/kernel/tt/ptproxy/ptrace.c
+--- orig/arch/um/kernel/tt/ptproxy/ptrace.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/ptproxy/ptrace.c      Mon Nov 11 13:06:03 2002
+@@ -0,0 +1,239 @@
++/**********************************************************************
++ptrace.c
++
++Copyright (C) 1999 Lars Brinkhoff.  See the file COPYING for licensing
++terms and conditions.
++
++Jeff Dike (jdike@karaya.com) : Modified for integration into uml
++**********************************************************************/
++
++#include <errno.h>
++#include <unistd.h>
++#include <signal.h>
++#include <sys/types.h>
++#include <sys/time.h>
++#include <sys/ptrace.h>
++#include <sys/wait.h>
++#include <asm/ptrace.h>
++
++#include "ptproxy.h"
++#include "debug.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "ptrace_user.h"
++#include "tt.h"
++
++long proxy_ptrace(struct debugger *debugger, int arg1, pid_t arg2,
++                long arg3, long arg4, pid_t child, int *ret)
++{
++      sigset_t relay;
++      long result;
++      int status;
++
++      *ret = 0;
++      if(debugger->debugee->died) return(-ESRCH);
++
++      switch(arg1){
++      case PTRACE_ATTACH:
++              if(debugger->debugee->traced) return(-EPERM);
++
++              debugger->debugee->pid = arg2;
++              debugger->debugee->traced = 1;
++
++              if(is_valid_pid(arg2) && (arg2 != child)){
++                      debugger->debugee->in_context = 0;
++                      kill(arg2, SIGSTOP);
++                      debugger->debugee->event = 1;
++                      debugger->debugee->wait_status = W_STOPCODE(SIGSTOP);
++              }
++              else {
++                      debugger->debugee->in_context = 1;
++                      if(debugger->debugee->stopped) 
++                              child_proxy(child, W_STOPCODE(SIGSTOP));
++                      else kill(child, SIGSTOP);
++              }
++
++              return(0);
++
++      case PTRACE_DETACH:
++              if(!debugger->debugee->traced) return(-EPERM);
++              
++              debugger->debugee->traced = 0;
++              debugger->debugee->pid = 0;
++              if(!debugger->debugee->in_context)
++                      kill(child, SIGCONT);
++
++              return(0);
++
++      case PTRACE_CONT:
++              if(!debugger->debugee->in_context) return(-EPERM);
++              *ret = PTRACE_CONT;
++              return(ptrace(PTRACE_CONT, child, arg3, arg4));
++
++#ifdef UM_HAVE_GETFPREGS
++      case PTRACE_GETFPREGS:
++      {
++              long regs[FP_FRAME_SIZE];
++              int i, result;
++
++              result = ptrace(PTRACE_GETFPREGS, child, 0, regs);
++              if(result == -1) return(-errno);
++              
++              for (i = 0; i < sizeof(regs)/sizeof(regs[0]); i++)
++                      ptrace(PTRACE_POKEDATA, debugger->pid, arg4 + 4 * i,
++                             regs[i]);
++              return(result);
++      }
++#endif
++
++#ifdef UM_HAVE_GETFPXREGS
++      case PTRACE_GETFPXREGS:
++      {
++              long regs[FPX_FRAME_SIZE];
++              int i, result;
++
++              result = ptrace(PTRACE_GETFPXREGS, child, 0, regs);
++              if(result == -1) return(-errno);
++              
++              for (i = 0; i < sizeof(regs)/sizeof(regs[0]); i++)
++                      ptrace(PTRACE_POKEDATA, debugger->pid, arg4 + 4 * i,
++                             regs[i]);
++              return(result);
++      }
++#endif
++
++#ifdef UM_HAVE_GETREGS
++      case PTRACE_GETREGS:
++      {
++              long regs[FRAME_SIZE];
++              int i, result;
++
++              result = ptrace(PTRACE_GETREGS, child, 0, regs);
++              if(result == -1) return(-errno);
++
++              for (i = 0; i < sizeof(regs)/sizeof(regs[0]); i++)
++                      ptrace (PTRACE_POKEDATA, debugger->pid,
++                              arg4 + 4 * i, regs[i]);
++              return(result);
++      }
++      break;
++#endif
++
++      case PTRACE_KILL:
++              result = ptrace(PTRACE_KILL, child, arg3, arg4);
++              if(result == -1) return(-errno);
++
++              return(result);
++
++      case PTRACE_PEEKDATA:
++      case PTRACE_PEEKTEXT:
++      case PTRACE_PEEKUSER:
++              /* The value being read out could be -1, so we have to 
++               * check errno to see if there's an error, and zero it
++               * beforehand so we're not faked out by an old error
++               */
++
++              errno = 0;
++              result = ptrace(arg1, child, arg3, 0);
++              if((result == -1) && (errno != 0)) return(-errno);
++
++              result = ptrace(PTRACE_POKEDATA, debugger->pid, arg4, result);
++              if(result == -1) return(-errno);
++                      
++              return(result);
++
++      case PTRACE_POKEDATA:
++      case PTRACE_POKETEXT:
++      case PTRACE_POKEUSER:
++              result = ptrace(arg1, child, arg3, arg4);
++              if(result == -1) return(-errno);
++
++              if(arg1 == PTRACE_POKEUSER) ptrace_pokeuser(arg3, arg4);
++              return(result);
++
++#ifdef UM_HAVE_SETFPREGS
++      case PTRACE_SETFPREGS:
++      {
++              long regs[FP_FRAME_SIZE];
++              int i;
++
++              for (i = 0; i < sizeof(regs)/sizeof(regs[0]); i++)
++                      regs[i] = ptrace (PTRACE_PEEKDATA, debugger->pid,
++                                        arg4 + 4 * i, 0);
++              result = ptrace(PTRACE_SETFPREGS, child, 0, regs);
++              if(result == -1) return(-errno);
++
++              return(result);
++      }
++#endif
++
++#ifdef UM_HAVE_SETFPXREGS
++      case PTRACE_SETFPXREGS:
++      {
++              long regs[FPX_FRAME_SIZE];
++              int i;
++
++              for (i = 0; i < sizeof(regs)/sizeof(regs[0]); i++)
++                      regs[i] = ptrace (PTRACE_PEEKDATA, debugger->pid,
++                                        arg4 + 4 * i, 0);
++              result = ptrace(PTRACE_SETFPXREGS, child, 0, regs);
++              if(result == -1) return(-errno);
++
++              return(result);
++      }
++#endif
++
++#ifdef UM_HAVE_SETREGS
++      case PTRACE_SETREGS:
++      {
++              long regs[FRAME_SIZE];
++              int i;
++
++              for (i = 0; i < sizeof(regs)/sizeof(regs[0]); i++)
++                      regs[i] = ptrace(PTRACE_PEEKDATA, debugger->pid,
++                                       arg4 + 4 * i, 0);
++              result = ptrace(PTRACE_SETREGS, child, 0, regs);
++              if(result == -1) return(-errno);
++
++              return(result);
++      }
++#endif
++
++      case PTRACE_SINGLESTEP:
++              if(!debugger->debugee->in_context) return(-EPERM);
++              sigemptyset(&relay);
++              sigaddset(&relay, SIGSEGV);
++              sigaddset(&relay, SIGILL);
++              sigaddset(&relay, SIGBUS);
++              result = ptrace(PTRACE_SINGLESTEP, child, arg3, arg4);
++              if(result == -1) return(-errno);
++              
++              status = wait_for_stop(child, SIGTRAP, PTRACE_SINGLESTEP,
++                                     &relay);
++              child_proxy(child, status);
++              return(result);
++
++      case PTRACE_SYSCALL:
++              if(!debugger->debugee->in_context) return(-EPERM);
++              result = ptrace(PTRACE_SYSCALL, child, arg3, arg4);
++              if(result == -1) return(-errno);
++
++              *ret = PTRACE_SYSCALL;
++              return(result);
++
++      case PTRACE_TRACEME:
++      default:
++              return(-EINVAL);
++      }
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
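
The PTRACE_PEEK* comment in proxy_ptrace() above ("the value being read out could be -1") is the standard errno-clearing pattern for ptrace peeks: -1 is a legal word value, so only errno distinguishes data from failure. A self-contained sketch of that pattern follows; peek_word() is a hypothetical helper, not something this patch defines:

/* Sketch of the errno-zeroing pattern for PTRACE_PEEKDATA; peek_word()
 * is a made-up helper name, not part of the patch. */
#include <errno.h>
#include <sys/ptrace.h>
#include <sys/types.h>

int peek_word(pid_t pid, void *addr, long *out)
{
	long val;

	errno = 0;			/* -1 may be real data, so clear errno first */
	val = ptrace(PTRACE_PEEKDATA, pid, addr, 0);
	if((val == -1) && (errno != 0))
		return(-errno);		/* genuine failure */
	*out = val;
	return(0);
}

The same precaution applies to PTRACE_PEEKTEXT and PTRACE_PEEKUSER, which is why proxy_ptrace() handles the three requests in one case block.
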
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ptproxy/sysdep.c um/arch/um/kernel/tt/ptproxy/sysdep.c
+--- orig/arch/um/kernel/tt/ptproxy/sysdep.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/ptproxy/sysdep.c      Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,71 @@
++/**********************************************************************
++sysdep.c
++
++Copyright (C) 1999 Lars Brinkhoff.  See the file COPYING for licensing
++terms and conditions.
++**********************************************************************/
++
++#include <stdio.h>
++#include <string.h>
++#include <stdlib.h>
++#include <signal.h>
++#include <sys/types.h>
++#include <sys/ptrace.h>
++#include <asm/ptrace.h>
++#include <linux/unistd.h>
++#include "ptrace_user.h"
++#include "user_util.h"
++#include "user.h"
++
++int get_syscall(pid_t pid, long *arg1, long *arg2, long *arg3, long *arg4, 
++              long *arg5)
++{
++      *arg1 = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_ARG1_OFFSET, 0);
++      *arg2 = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_ARG2_OFFSET, 0);
++      *arg3 = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_ARG3_OFFSET, 0);
++      *arg4 = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_ARG4_OFFSET, 0);
++      *arg5 = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_ARG5_OFFSET, 0);
++      return(ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET, 0));
++}
++
++void syscall_cancel(pid_t pid, int result)
++{
++      if((ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, 
++                 __NR_getpid) < 0) ||
++         (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) ||
++         (wait_for_stop(pid, SIGTRAP, PTRACE_SYSCALL, NULL) < 0) ||
++         (ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_RET_OFFSET, result) < 0) ||
++         (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0))
++              printk("ptproxy: couldn't cancel syscall: errno = %d\n", 
++                     errno);
++}
++
++void syscall_set_result(pid_t pid, long result)
++{
++      ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_RET_OFFSET, result);
++}
++
++void syscall_continue(pid_t pid)
++{
++      ptrace(PTRACE_SYSCALL, pid, 0, 0);
++}
++
++int syscall_pause(pid_t pid) 
++{
++      if(ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, __NR_pause) < 0){
++              printk("syscall_change - ptrace failed, errno = %d\n", errno);
++              return(-1);
++      }
++      return(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ptproxy/sysdep.h um/arch/um/kernel/tt/ptproxy/sysdep.h
+--- orig/arch/um/kernel/tt/ptproxy/sysdep.h    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/ptproxy/sysdep.h      Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,25 @@
++/**********************************************************************
++sysdep.h
++
++Copyright (C) 1999 Lars Brinkhoff.
++Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++See the file COPYING for licensing terms and conditions.
++**********************************************************************/
++
++extern int get_syscall(pid_t pid, long *arg1, long *arg2, long *arg3, 
++                     long *arg4, long *arg5);
++extern void syscall_cancel (pid_t pid, long result);
++extern void syscall_set_result (pid_t pid, long result);
++extern void syscall_continue (pid_t pid);
++extern int syscall_pause(pid_t pid);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ptproxy/wait.c um/arch/um/kernel/tt/ptproxy/wait.c
+--- orig/arch/um/kernel/tt/ptproxy/wait.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/ptproxy/wait.c        Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,86 @@
++/**********************************************************************
++wait.c
++
++Copyright (C) 1999 Lars Brinkhoff.  See the file COPYING for licensing
++terms and conditions.
++
++**********************************************************************/
++
++#include <errno.h>
++#include <signal.h>
++#include <sys/wait.h>
++#include <sys/ptrace.h>
++#include <asm/ptrace.h>
++
++#include "ptproxy.h"
++#include "sysdep.h"
++#include "wait.h"
++#include "user_util.h"
++#include "sysdep/ptrace.h"
++#include "sysdep/ptrace_user.h"
++#include "sysdep/sigcontext.h"
++
++int proxy_wait_return(struct debugger *debugger, pid_t unused)
++{
++      debugger->waiting = 0;
++
++      if(debugger->debugee->died || (debugger->wait_options & __WCLONE)){
++              debugger_cancelled_return(debugger, -ECHILD);
++              return(0);
++      }
++
++      if(debugger->debugee->zombie && debugger->debugee->event)
++              debugger->debugee->died = 1;
++
++      if(debugger->debugee->event){
++              debugger->debugee->event = 0;
++              ptrace(PTRACE_POKEDATA, debugger->pid,
++                     debugger->wait_status_ptr, 
++                     debugger->debugee->wait_status);
++              /* if (wait4)
++                 ptrace (PTRACE_POKEDATA, pid, rusage_ptr, ...); */
++              debugger_cancelled_return(debugger, debugger->debugee->pid);
++              return(0);
++      }
++
++      /* pause will return -EINTR, which happens to be right for wait */
++      debugger_normal_return(debugger, -1);
++      return(0);
++}
++
++int parent_wait_return(struct debugger *debugger, pid_t unused)
++{
++      return(debugger_normal_return(debugger, -1));
++}
++
++int real_wait_return(struct debugger *debugger)
++{
++      unsigned long ip;
++      int err, pid;
++
++      pid = debugger->pid;
++      ip = ptrace(PTRACE_PEEKUSER, pid, PT_IP_OFFSET, 0);
++      ip = IP_RESTART_SYSCALL(ip);
++      err = ptrace(PTRACE_POKEUSER, pid, PT_IP_OFFSET, ip);
++      if(err < 0)
++              tracer_panic("real_wait_return : Failed to restart system "
++                           "call, errno = %d\n", errno);
++      if((ptrace(PTRACE_SYSCALL, debugger->pid, 0, SIGCHLD) < 0) ||
++         (ptrace(PTRACE_SYSCALL, debugger->pid, 0, 0) < 0) ||
++         (ptrace(PTRACE_SYSCALL, debugger->pid, 0, 0) < 0) ||
++         debugger_normal_return(debugger, -1))
++              tracer_panic("real_wait_return : gdb failed to wait, "
++                           "errno = %d\n", errno);
++      return(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ptproxy/wait.h um/arch/um/kernel/tt/ptproxy/wait.h
+--- orig/arch/um/kernel/tt/ptproxy/wait.h      Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/ptproxy/wait.h        Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,15 @@
++/**********************************************************************
++wait.h
++
++Copyright (C) 1999 Lars Brinkhoff.  See the file COPYING for licensing
++terms and conditions.
++**********************************************************************/
++
++#ifndef __PTPROXY_WAIT_H
++#define __PTPROXY_WAIT_H
++
++extern int proxy_wait_return(struct debugger *debugger, pid_t unused);
++extern int real_wait_return(struct debugger *debugger);
++extern int parent_wait_return(struct debugger *debugger, pid_t unused);
++
++#endif
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/sys-i386/Makefile um/arch/um/kernel/tt/sys-i386/Makefile
+--- orig/arch/um/kernel/tt/sys-i386/Makefile   Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/sys-i386/Makefile     Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,17 @@
++# 
++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++O_TARGET = sys-i386.o
++
++obj-y = sigcontext.o
++
++USER_OBJS = sigcontext.o
++
++include $(TOPDIR)/Rules.make
++
++$(USER_OBJS) : %.o: %.c
++      $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $<
++
++clean :
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/sys-i386/sigcontext.c um/arch/um/kernel/tt/sys-i386/sigcontext.c
+--- orig/arch/um/kernel/tt/sys-i386/sigcontext.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/sys-i386/sigcontext.c Sun Dec  1 23:33:52 2002
+@@ -0,0 +1,60 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdlib.h>
++#include <asm/sigcontext.h>
++#include "kern_util.h"
++#include "sysdep/frame.h"
++
++int copy_sc_from_user_tt(void *to_ptr, void *from_ptr, void *data)
++{
++      struct arch_frame_data *arch = data;
++      struct sigcontext *to = to_ptr, *from = from_ptr;
++      struct _fpstate *to_fp, *from_fp;
++      unsigned long sigs;
++      int err;
++
++      to_fp = to->fpstate;
++      from_fp = from->fpstate;
++      sigs = to->oldmask;
++      err = copy_from_user_proc(to, from, sizeof(*to));
++      to->oldmask = sigs;
++      if(to_fp != NULL){
++              err |= copy_from_user_proc(&to->fpstate, &to_fp,
++                                         sizeof(to->fpstate));
++              err |= copy_from_user_proc(to_fp, from_fp, arch->fpstate_size);
++      }
++      return(err);
++}
++
++int copy_sc_to_user_tt(void *to_ptr, void *fp, void *from_ptr, void *data)
++{
++      struct arch_frame_data *arch = data;
++      struct sigcontext *to = to_ptr, *from = from_ptr;
++      struct _fpstate *to_fp, *from_fp;
++      int err;
++
++      to_fp = (struct _fpstate *) 
++              (fp ? (unsigned long) fp : ((unsigned long) to + sizeof(*to)));
++      from_fp = from->fpstate;
++      err = copy_to_user_proc(to, from, sizeof(*to));
++      if(from_fp != NULL){
++              err |= copy_to_user_proc(&to->fpstate, &to_fp,
++                                       sizeof(to->fpstate));
++              err |= copy_to_user_proc(to_fp, from_fp, arch->fpstate_size);
++      }
++      return(err);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/syscall_kern.c um/arch/um/kernel/tt/syscall_kern.c
+--- orig/arch/um/kernel/tt/syscall_kern.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/syscall_kern.c        Sun Dec  8 19:32:53 2002
+@@ -0,0 +1,142 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/types.h"
++#include "linux/utime.h"
++#include "linux/sys.h"
++#include "asm/unistd.h"
++#include "asm/ptrace.h"
++#include "asm/uaccess.h"
++#include "sysdep/syscalls.h"
++#include "kern_util.h"
++
++static inline int check_area(void *ptr, int size)
++{
++      return(verify_area(VERIFY_WRITE, ptr, size));
++}
++
++static int check_readlink(struct pt_regs *regs)
++{
++      return(check_area((void *) UPT_SYSCALL_ARG1(&regs->regs),
++                        UPT_SYSCALL_ARG2(&regs->regs)));
++}
++
++static int check_utime(struct pt_regs *regs)
++{
++      return(check_area((void *) UPT_SYSCALL_ARG1(&regs->regs),
++                        sizeof(struct utimbuf)));
++}
++
++static int check_oldstat(struct pt_regs *regs)
++{
++      return(check_area((void *) UPT_SYSCALL_ARG1(&regs->regs), 
++                        sizeof(struct __old_kernel_stat)));
++}
++
++static int check_stat(struct pt_regs *regs)
++{
++      return(check_area((void *) UPT_SYSCALL_ARG1(&regs->regs), 
++                        sizeof(struct stat)));
++}
++
++static int check_stat64(struct pt_regs *regs)
++{
++      return(check_area((void *) UPT_SYSCALL_ARG1(&regs->regs), 
++                        sizeof(struct stat64)));
++}
++
++struct bogus {
++      int kernel_ds;
++      int (*check_params)(struct pt_regs *);
++};
++
++struct bogus this_is_bogus[256] = {
++      [ __NR_mknod ] = { 1, NULL },
++      [ __NR_mkdir ] = { 1, NULL },
++      [ __NR_rmdir ] = { 1, NULL },
++      [ __NR_unlink ] = { 1, NULL },
++      [ __NR_symlink ] = { 1, NULL },
++      [ __NR_link ] = { 1, NULL },
++      [ __NR_rename ] = { 1, NULL },
++      [ __NR_umount ] = { 1, NULL },
++      [ __NR_mount ] = { 1, NULL },
++      [ __NR_pivot_root ] = { 1, NULL },
++      [ __NR_chdir ] = { 1, NULL },
++      [ __NR_chroot ] = { 1, NULL },
++      [ __NR_open ] = { 1, NULL },
++      [ __NR_quotactl ] = { 1, NULL },
++      [ __NR_sysfs ] = { 1, NULL },
++      [ __NR_readlink ] = { 1, check_readlink },
++      [ __NR_acct ] = { 1, NULL },
++      [ __NR_execve ] = { 1, NULL },
++      [ __NR_uselib ] = { 1, NULL },
++      [ __NR_statfs ] = { 1, NULL },
++      [ __NR_truncate ] = { 1, NULL },
++      [ __NR_access ] = { 1, NULL },
++      [ __NR_chmod ] = { 1, NULL },
++      [ __NR_chown ] = { 1, NULL },
++      [ __NR_lchown ] = { 1, NULL },
++      [ __NR_utime ] = { 1, check_utime },
++      [ __NR_oldlstat ] = { 1, check_oldstat },
++      [ __NR_oldstat ] = { 1, check_oldstat },
++      [ __NR_stat ] = { 1, check_stat },
++      [ __NR_lstat ] = { 1, check_stat },
++      [ __NR_stat64 ] = { 1, check_stat64 },
++      [ __NR_lstat64 ] = { 1, check_stat64 },
++      [ __NR_chown32 ] = { 1, NULL },
++};
++
++/* sys_utimes */
++
++static int check_bogosity(struct pt_regs *regs)
++{
++      struct bogus *bogon = &this_is_bogus[UPT_SYSCALL_NR(&regs->regs)];
++
++      if(!bogon->kernel_ds) return(0);
++      if(bogon->check_params && (*bogon->check_params)(regs))
++              return(-EFAULT);
++      set_fs(KERNEL_DS);
++      return(0);
++}
++
++extern syscall_handler_t *sys_call_table[];
++
++long execute_syscall_tt(void *r)
++{
++      struct pt_regs *regs = r;
++      long res;
++      int syscall;
++
++      current->thread.nsyscalls++;
++      nsyscalls++;
++      syscall = UPT_SYSCALL_NR(&regs->regs);
++
++      if((syscall >= NR_syscalls) || (syscall < 0))
++              res = -ENOSYS;
++      else if(honeypot && check_bogosity(regs))
++              res = -EFAULT;
++      else res = EXECUTE_SYSCALL(syscall, regs);
++
++      set_fs(USER_DS);
++
++      if(current->thread.mode.tt.singlestep_syscall){
++              current->thread.mode.tt.singlestep_syscall = 0;
++              current->ptrace &= ~PT_DTRACE;
++              force_sig(SIGTRAP, current);
++      }
++
++      return(res);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/syscall_user.c um/arch/um/kernel/tt/syscall_user.c
+--- orig/arch/um/kernel/tt/syscall_user.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/syscall_user.c        Sun Dec  8 21:00:11 2002
+@@ -0,0 +1,89 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <unistd.h>
++#include <signal.h>
++#include <errno.h>
++#include <sys/ptrace.h>
++#include <asm/unistd.h>
++#include "sysdep/ptrace.h"
++#include "sigcontext.h"
++#include "ptrace_user.h"
++#include "task.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "syscall_user.h"
++#include "tt.h"
++
++/* XXX Bogus */
++#define ERESTARTSYS   512
++#define ERESTARTNOINTR        513
++#define ERESTARTNOHAND        514
++
++void syscall_handler_tt(int sig, union uml_pt_regs *regs)
++{
++      void *sc;
++      long result;
++      int index, syscall;
++
++      syscall = UPT_SYSCALL_NR(regs);
++      sc = UPT_SC(regs);
++      SC_START_SYSCALL(sc);
++
++      index = record_syscall_start(syscall);
++      syscall_trace();
++      result = execute_syscall(regs);
++
++      /* regs->sc may have changed while the system call ran (there may
++       * have been an interrupt or segfault), so it needs to be refreshed.
++       */
++      UPT_SC(regs) = sc;
++
++      SC_SET_SYSCALL_RETURN(sc, result);
++      if((result == -ERESTARTNOHAND) || (result == -ERESTARTSYS) || 
++         (result == -ERESTARTNOINTR))
++              do_signal(result);
++
++      syscall_trace();
++      record_syscall_end(index, result);
++}
++
++int do_syscall(void *task, int pid)
++{
++      unsigned long proc_regs[FRAME_SIZE];
++      union uml_pt_regs *regs;
++      int syscall;
++
++      if(ptrace_getregs(pid, proc_regs) < 0)
++              tracer_panic("Couldn't read registers");
++      syscall = PT_SYSCALL_NR(proc_regs);
++
++      regs = TASK_REGS(task);
++      UPT_SYSCALL_NR(regs) = syscall;
++
++      if(syscall < 1) return(0);
++
++      if((syscall != __NR_sigreturn) &&
++         ((unsigned long *) PT_IP(proc_regs) >= &_stext) && 
++         ((unsigned long *) PT_IP(proc_regs) <= &_etext))
++              tracer_panic("I'm tracing myself and I can't get out");
++
++      if(ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, 
++                __NR_getpid) < 0)
++              tracer_panic("do_syscall : Nullifying syscall failed, "
++                           "errno = %d", errno);
++      return(1);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/time.c um/arch/um/kernel/tt/time.c
+--- orig/arch/um/kernel/tt/time.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/time.c        Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,28 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <signal.h>
++#include <sys/time.h>
++#include <time_user.h>
++#include "process.h"
++#include "user.h"
++
++void user_time_init_tt(void)
++{
++      if(signal(SIGVTALRM, (__sighandler_t) alarm_handler) == SIG_ERR)
++              panic("Couldn't set SIGVTALRM handler");
++      set_interval(ITIMER_VIRTUAL);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/tlb.c um/arch/um/kernel/tt/tlb.c
+--- orig/arch/um/kernel/tt/tlb.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/tlb.c Thu Dec 19 13:03:11 2002
+@@ -0,0 +1,220 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/stddef.h"
++#include "linux/kernel.h"
++#include "linux/sched.h"
++#include "asm/page.h"
++#include "asm/pgtable.h"
++#include "asm/uaccess.h"
++#include "user_util.h"
++#include "mem_user.h"
++#include "os.h"
++
++static void fix_range(struct mm_struct *mm, unsigned long start_addr, 
++                    unsigned long end_addr, int force)
++{
++      pgd_t *npgd;
++      pmd_t *npmd;
++      pte_t *npte;
++      unsigned long addr;
++      int r, w, x, err;
++
++      if((current->thread.mode.tt.extern_pid != -1) && 
++         (current->thread.mode.tt.extern_pid != os_getpid()))
++              panic("fix_range fixing wrong address space, current = 0x%p",
++                    current);
++      if(mm == NULL) return;
++      for(addr=start_addr;addr<end_addr;){
++              if(addr == TASK_SIZE){
++                      /* Skip over kernel text, kernel data, and physical
++                       * memory, which don't have ptes, plus kernel virtual
++                       * memory, which is flushed separately, and remap
++                       * the process stack.  The only way to get here is
++                       * if (end_addr == STACK_TOP) > TASK_SIZE, which is
++                       * only true in the honeypot case.
++                       */
++                      addr = STACK_TOP - ABOVE_KMEM;
++                      continue;
++              }
++              npgd = pgd_offset(mm, addr);
++              npmd = pmd_offset(npgd, addr);
++              if(pmd_present(*npmd)){
++                      npte = pte_offset(npmd, addr);
++                      r = pte_read(*npte);
++                      w = pte_write(*npte);
++                      x = pte_exec(*npte);
++                      if(!pte_dirty(*npte)) w = 0;
++                      if(!pte_young(*npte)){
++                              r = 0;
++                              w = 0;
++                      }
++                      if(force || pte_newpage(*npte)){
++                              err = os_unmap_memory((void *) addr, 
++                                                    PAGE_SIZE);
++                              if(err < 0)
++                                      panic("munmap failed, errno = %d\n",
++                                            -err);
++                              if(pte_present(*npte))
++                                      map_memory(addr, 
++                                                 pte_val(*npte) & PAGE_MASK,
++                                                 PAGE_SIZE, r, w, x);
++                      }
++                      else if(pte_newprot(*npte)){
++                              protect_memory(addr, PAGE_SIZE, r, w, x, 1);
++                      }
++                      *npte = pte_mkuptodate(*npte);
++                      addr += PAGE_SIZE;
++              }
++              else {
++                      if(force || pmd_newpage(*npmd)){
++                              err = os_unmap_memory((void *) addr, PMD_SIZE);
++                              if(err < 0)
++                                      panic("munmap failed, errno = %d\n",
++                                            -err);
++                              pmd_mkuptodate(*npmd);
++                      }
++                      addr += PMD_SIZE;
++              }
++      }
++}
++
++atomic_t vmchange_seq = ATOMIC_INIT(1);
++
++static void flush_kernel_vm_range(unsigned long start, unsigned long end,
++                                int update_seq)
++{
++      struct mm_struct *mm;
++      pgd_t *pgd;
++      pmd_t *pmd;
++      pte_t *pte;
++      unsigned long addr;
++      int updated = 0, err;
++
++      mm = &init_mm;
++      for(addr = start; addr < end;){
++              pgd = pgd_offset(mm, addr);
++              pmd = pmd_offset(pgd, addr);
++              if(pmd_present(*pmd)){
++                      pte = pte_offset(pmd, addr);
++                      if(!pte_present(*pte) || pte_newpage(*pte)){
++                              updated = 1;
++                              err = os_unmap_memory((void *) addr, 
++                                                    PAGE_SIZE);
++                              if(err < 0)
++                                      panic("munmap failed, errno = %d\n",
++                                            -err);
++                              if(pte_present(*pte))
++                                      map_memory(addr, 
++                                                 pte_val(*pte) & PAGE_MASK,
++                                                 PAGE_SIZE, 1, 1, 1);
++                      }
++                      else if(pte_newprot(*pte)){
++                              updated = 1;
++                              protect_memory(addr, PAGE_SIZE, 1, 1, 1, 1);
++                      }
++                      addr += PAGE_SIZE;
++              }
++              else {
++                      if(pmd_newpage(*pmd)){
++                              updated = 1;
++                              err = os_unmap_memory((void *) addr, PMD_SIZE);
++                              if(err < 0)
++                                      panic("munmap failed, errno = %d\n",
++                                            -err);
++                      }
++                      addr += PMD_SIZE;
++              }
++      }
++      if(updated && update_seq) atomic_inc(&vmchange_seq);
++}
++
++static void protect_vm_page(unsigned long addr, int w, int must_succeed)
++{
++      int err;
++
++      err = protect_memory(addr, PAGE_SIZE, 1, w, 1, must_succeed);
++      if(err == 0) return;
++      else if((err == -EFAULT) || (err == -ENOMEM)){
++              flush_kernel_vm_range(addr, addr + PAGE_SIZE, 1);
++              protect_vm_page(addr, w, 1);
++      }
++      else panic("protect_vm_page : protect failed, errno = %d\n", err);
++}
++
++void mprotect_kernel_vm(int w)
++{
++      struct mm_struct *mm;
++      pgd_t *pgd;
++      pmd_t *pmd;
++      pte_t *pte;
++      unsigned long addr;
++      
++      mm = &init_mm;
++      for(addr = start_vm; addr < end_vm;){
++              pgd = pgd_offset(mm, addr);
++              pmd = pmd_offset(pgd, addr);
++              if(pmd_present(*pmd)){
++                      pte = pte_offset(pmd, addr);
++                      if(pte_present(*pte)) protect_vm_page(addr, w, 0);
++                      addr += PAGE_SIZE;
++              }
++              else addr += PMD_SIZE;
++      }
++}
++
++void flush_tlb_kernel_vm_tt(void)
++{
++      flush_kernel_vm_range(start_vm, end_vm, 1);
++}
++
++void __flush_tlb_one_tt(unsigned long addr)
++{
++      flush_kernel_vm_range(addr, addr + PAGE_SIZE, 1);
++}
++
++void flush_tlb_range_tt(struct mm_struct *mm, unsigned long start, 
++                   unsigned long end)
++{
++      if(mm != current->mm) return;
++
++      /* Assumes that the range start ... end is entirely within
++       * either process memory or kernel vm
++       */
++      if((start >= start_vm) && (start < end_vm)) 
++              flush_kernel_vm_range(start, end, 1);
++      else fix_range(mm, start, end, 0);
++}
++
++void flush_tlb_mm_tt(struct mm_struct *mm)
++{
++      unsigned long seq;
++
++      if(mm != current->mm) return;
++
++      fix_range(mm, 0, STACK_TOP, 0);
++
++      seq = atomic_read(&vmchange_seq);
++      if(current->thread.mode.tt.vm_seq == seq) return;
++      current->thread.mode.tt.vm_seq = seq;
++      flush_kernel_vm_range(start_vm, end_vm, 0);
++}
++
++void force_flush_all_tt(void)
++{
++      fix_range(current->mm, 0, STACK_TOP, 1);
++      flush_kernel_vm_range(start_vm, end_vm, 0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/tracer.c um/arch/um/kernel/tt/tracer.c
+--- orig/arch/um/kernel/tt/tracer.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/tracer.c      Wed Mar 26 10:01:33 2003
+@@ -0,0 +1,453 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <stdarg.h>
++#include <unistd.h>
++#include <signal.h>
++#include <errno.h>
++#include <sched.h>
++#include <string.h>
++#include <sys/mman.h>
++#include <sys/ptrace.h>
++#include <sys/time.h>
++#include <sys/wait.h>
++#include "user.h"
++#include "sysdep/ptrace.h"
++#include "sigcontext.h"
++#include "sysdep/sigcontext.h"
++#include "os.h"
++#include "signal_user.h"
++#include "user_util.h"
++#include "mem_user.h"
++#include "process.h"
++#include "kern_util.h"
++#include "frame.h"
++#include "chan_user.h"
++#include "ptrace_user.h"
++#include "mode.h"
++#include "tt.h"
++
++static int tracer_winch[2];
++
++int is_tracer_winch(int pid, int fd, void *data)
++{
++      if(pid != tracing_pid)
++              return(0);
++
++      register_winch_irq(tracer_winch[0], fd, -1, data);
++      return(1);
++}
++
++static void tracer_winch_handler(int sig)
++{
++      char c = 1;
++
++      if(write(tracer_winch[1], &c, sizeof(c)) != sizeof(c))
++              printk("tracer_winch_handler - write failed, errno = %d\n",
++                     errno);
++}
++
++/* Called only by the tracing thread during initialization */
++
++static void setup_tracer_winch(void)
++{
++      int err;
++
++      err = os_pipe(tracer_winch, 1, 1);
++      if(err){
++              printk("setup_tracer_winch : os_pipe failed, errno = %d\n", 
++                     -err);
++              return;
++      }
++      signal(SIGWINCH, tracer_winch_handler);
++}
++
++void attach_process(int pid)
++{
++      if((ptrace(PTRACE_ATTACH, pid, 0, 0) < 0) ||
++         (ptrace(PTRACE_CONT, pid, 0, 0) < 0))
++              tracer_panic("OP_FORK failed to attach pid");
++      wait_for_stop(pid, SIGSTOP, PTRACE_CONT, NULL);
++      if(ptrace(PTRACE_CONT, pid, 0, 0) < 0)
++              tracer_panic("OP_FORK failed to continue process");
++}
++
++void tracer_panic(char *format, ...)
++{
++      va_list ap;
++
++      va_start(ap, format);
++      vprintf(format, ap);
++      printf("\n");
++      while(1) pause();
++}
++
++static void tracer_segv(int sig, struct sigcontext sc)
++{
++      printf("Tracing thread segfault at address 0x%lx, ip 0x%lx\n",
++             SC_FAULT_ADDR(&sc), SC_IP(&sc));
++      while(1)
++              pause();
++}
++
++/* Changed early in boot, and then only read */
++int debug = 0;
++int debug_stop = 1;
++int debug_parent = 0;
++int honeypot = 0;
++
++static int signal_tramp(void *arg)
++{
++      int (*proc)(void *);
++
++      if(honeypot && munmap((void *) (host_task_size - 0x10000000),
++                            0x10000000)) 
++              panic("Unmapping stack failed");
++      if(ptrace(PTRACE_TRACEME, 0, 0, 0) < 0)
++              panic("ptrace PTRACE_TRACEME failed");
++      os_stop_process(os_getpid());
++      change_sig(SIGWINCH, 0);
++      signal(SIGUSR1, SIG_IGN);
++      change_sig(SIGCHLD, 0);
++      signal(SIGSEGV, (__sighandler_t) sig_handler);
++      set_cmdline("(idle thread)");
++      set_init_pid(os_getpid());
++      proc = arg;
++      return((*proc)(NULL));
++}
++
++static void sleeping_process_signal(int pid, int sig)
++{
++      switch(sig){
++      /* These two result from UML being ^Z-ed and bg-ed.  PTRACE_CONT is
++       * right because the process must be in the kernel already.
++       */
++      case SIGCONT:
++      case SIGTSTP:
++              if(ptrace(PTRACE_CONT, pid, 0, sig) < 0)
++                      tracer_panic("sleeping_process_signal : Failed to "
++                                   "continue pid %d, errno = %d\n", pid,
++                                   errno);
++              break;
++
++      /* This happens when the debugger (e.g. strace) is doing system call 
++       * tracing on the kernel.  During a context switch, the current task
++       * will be set to the incoming process and the outgoing process will
++       * hop into write and then read.  Since it's not the current process
++       * any more, the trace of those will land here.  So, we need to just 
++       * PTRACE_SYSCALL it.
++       */
++      case SIGTRAP:
++              if(ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
++                      tracer_panic("sleeping_process_signal : Failed to "
++                                   "PTRACE_SYSCALL pid %d, errno = %d\n",
++                                   pid, errno);
++              break;
++      case SIGSTOP:
++              break;
++      default:
++              tracer_panic("sleeping process %d got unexpected "
++                           "signal : %d\n", pid, sig);
++              break;
++      }
++}
++
++/* Accessed only by the tracing thread */
++int debugger_pid = -1;
++int debugger_parent = -1;
++int debugger_fd = -1;
++int gdb_pid = -1;
++
++struct {
++      int pid;
++      int signal;
++      unsigned long addr;
++      struct timeval time;
++} signal_record[1024][32];
++
++int signal_index[32];
++int nsignals = 0;
++int debug_trace = 0;
++extern int io_nsignals, io_count, intr_count;
++
++extern void signal_usr1(int sig);
++
++int tracing_pid = -1;
++
++int tracer(int (*init_proc)(void *), void *sp)
++{
++      void *task = NULL;
++      unsigned long eip = 0;
++      int status, pid = 0, sig = 0, cont_type, tracing = 0, op = 0;
++      int last_index, proc_id = 0, n, err, old_tracing = 0, strace = 0;
++
++      capture_signal_stack();
++      signal(SIGPIPE, SIG_IGN);
++      setup_tracer_winch();
++      tracing_pid = os_getpid();
++      printf("tracing thread pid = %d\n", tracing_pid);
++
++      pid = clone(signal_tramp, sp, CLONE_FILES | SIGCHLD, init_proc);
++      n = waitpid(pid, &status, WUNTRACED);
++      if(n < 0){
++              printf("waitpid on idle thread failed, errno = %d\n", errno);
++              exit(1);
++      }
++      if((ptrace(PTRACE_CONT, pid, 0, 0) < 0)){
++              printf("Failed to continue idle thread, errno = %d\n", errno);
++              exit(1);
++      }
++
++      signal(SIGSEGV, (sighandler_t) tracer_segv);
++      signal(SIGUSR1, signal_usr1);
++      if(debug_trace){
++              printf("Tracing thread pausing to be attached\n");
++              stop();
++      }
++      if(debug){
++              if(gdb_pid != -1) 
++                      debugger_pid = attach_debugger(pid, gdb_pid, 1);
++              else debugger_pid = init_ptrace_proxy(pid, 1, debug_stop);
++              if(debug_parent){
++                      debugger_parent = os_process_parent(debugger_pid);
++                      init_parent_proxy(debugger_parent);
++                      err = attach(debugger_parent);
++                      if(err){
++                              printf("Failed to attach debugger parent %d, "
++                                     "errno = %d\n", debugger_parent, err);
++                              debugger_parent = -1;
++                      }
++                      else {
++                              if(ptrace(PTRACE_SYSCALL, debugger_parent, 
++                                        0, 0) < 0){
++                                      printf("Failed to continue debugger "
++                                             "parent, errno = %d\n", errno);
++                                      debugger_parent = -1;
++                              }
++                      }
++              }
++      }
++      set_cmdline("(tracing thread)");
++      while(1){
++              if((pid = waitpid(-1, &status, WUNTRACED)) <= 0){
++                      if(errno != ECHILD){
++                              printf("wait failed - errno = %d\n", errno);
++                      }
++                      continue;
++              }
++              if(pid == debugger_pid){
++                      int cont = 0;
++
++                      if(WIFEXITED(status) || WIFSIGNALED(status))
++                              debugger_pid = -1;
++                      /* XXX Figure out how to deal with gdb and SMP */
++                      else cont = debugger_signal(status, cpu_tasks[0].pid);
++                      if(cont == PTRACE_SYSCALL) strace = 1;
++                      continue;
++              }
++              else if(pid == debugger_parent){
++                      debugger_parent_signal(status, pid);
++                      continue;
++              }
++              nsignals++;
++              if(WIFEXITED(status)) ;
++#ifdef notdef
++              {
++                      printf("Child %d exited with status %d\n", pid, 
++                             WEXITSTATUS(status));
++              }
++#endif
++              else if(WIFSIGNALED(status)){
++                      sig = WTERMSIG(status);
++                      if(sig != 9){
++                              printf("Child %d exited with signal %d\n", pid,
++                                     sig);
++                      }
++              }
++              else if(WIFSTOPPED(status)){
++                      proc_id = pid_to_processor_id(pid);
++                      sig = WSTOPSIG(status);
++                      if(signal_index[proc_id] == 1024){
++                              signal_index[proc_id] = 0;
++                              last_index = 1023;
++                      }
++                      else last_index = signal_index[proc_id] - 1;
++                      if(((sig == SIGPROF) || (sig == SIGVTALRM) || 
++                          (sig == SIGALRM)) &&
++                         (signal_record[proc_id][last_index].signal == sig)&&
++                         (signal_record[proc_id][last_index].pid == pid))
++                              signal_index[proc_id] = last_index;
++                      signal_record[proc_id][signal_index[proc_id]].pid = pid;
++                      gettimeofday(&signal_record[proc_id][signal_index[proc_id]].time, NULL);
++                      eip = ptrace(PTRACE_PEEKUSER, pid, PT_IP_OFFSET, 0);
++                      signal_record[proc_id][signal_index[proc_id]].addr = eip;
++                      signal_record[proc_id][signal_index[proc_id]++].signal = sig;
++                      
++                      if(proc_id == -1){
++                              sleeping_process_signal(pid, sig);
++                              continue;
++                      }
++
++                      task = cpu_tasks[proc_id].task;
++                      tracing = is_tracing(task);
++                      old_tracing = tracing;
++
++                      switch(sig){
++                      case SIGUSR1:
++                              sig = 0;
++                              op = do_proc_op(task, proc_id);
++                              switch(op){
++                              case OP_TRACE_ON:
++                                      arch_leave_kernel(task, pid);
++                                      tracing = 1;
++                                      break;
++                              case OP_REBOOT:
++                              case OP_HALT:
++                                      unmap_physmem();
++                                      kmalloc_ok = 0;
++                                      ptrace(PTRACE_KILL, pid, 0, 0);
++                                      return(op == OP_REBOOT);
++                              case OP_NONE:
++                                      printf("Detaching pid %d\n", pid);
++                                      detach(pid, SIGSTOP);
++                                      continue;
++                              default:
++                                      break;
++                              }
++                              /* OP_EXEC switches host processes on us,
++                               * we want to continue the new one.
++                               */
++                              pid = cpu_tasks[proc_id].pid;
++                              break;
++                      case SIGTRAP:
++                              if(!tracing && (debugger_pid != -1)){
++                                      child_signal(pid, status);
++                                      continue;
++                              }
++                              tracing = 0;
++                              if(do_syscall(task, pid)) sig = SIGUSR2;
++                              else clear_singlestep(task);
++                              break;
++                      case SIGPROF:
++                              if(tracing) sig = 0;
++                              break;
++                      case SIGCHLD:
++                      case SIGHUP:
++                              sig = 0;
++                              break;
++                      case SIGSEGV:
++                      case SIGIO:
++                      case SIGALRM:
++                      case SIGVTALRM:
++                      case SIGFPE:
++                      case SIGBUS:
++                      case SIGILL:
++                      case SIGWINCH:
++                      default:
++                              tracing = 0;
++                              break;
++                      }
++                      set_tracing(task, tracing);
++
++                      if(!tracing && old_tracing)
++                              arch_enter_kernel(task, pid);
++
++                      if(!tracing && (debugger_pid != -1) && (sig != 0) &&
++                              (sig != SIGALRM) && (sig != SIGVTALRM) &&
++                              (sig != SIGSEGV) && (sig != SIGTRAP) &&
++                              (sig != SIGUSR2) && (sig != SIGIO) &&
++                              (sig != SIGFPE)){
++                              child_signal(pid, status);
++                              continue;
++                      }
++
++                      if(tracing){
++                              if(singlestepping_tt(task))
++                                      cont_type = PTRACE_SINGLESTEP;
++                              else cont_type = PTRACE_SYSCALL;
++                      }
++                      else cont_type = PTRACE_CONT;
++
++                      if((cont_type == PTRACE_CONT) && 
++                         (debugger_pid != -1) && strace)
++                              cont_type = PTRACE_SYSCALL;
++
++                      if(ptrace(cont_type, pid, 0, sig) != 0){
++                              tracer_panic("ptrace failed to continue "
++                                           "process - errno = %d\n", 
++                                           errno);
++                      }
++              }
++      }
++      return(0);
++}
++
++static int __init uml_debug_setup(char *line, int *add)
++{
++      char *next;
++
++      debug = 1;
++      *add = 0;
++      if(*line != '=') return(0);
++      line++;
++
++      while(line != NULL){
++              next = strchr(line, ',');
++              if(next) *next++ = '\0';
++              
++              if(!strcmp(line, "go")) debug_stop = 0;
++              else if(!strcmp(line, "parent")) debug_parent = 1;
++              else printf("Unknown debug option : '%s'\n", line);
++
++              line = next;
++      }
++      return(0);
++}
++
++__uml_setup("debug", uml_debug_setup,
++"debug\n"
++"    Starts up the kernel under the control of gdb. See the \n"
++"    kernel debugging tutorial and the debugging session pages\n"
++"    at http://user-mode-linux.sourceforge.net/ for more information.\n\n"
++);
++
++static int __init uml_debugtrace_setup(char *line, int *add)
++{
++      debug_trace = 1;
++      return 0;
++}
++__uml_setup("debugtrace", uml_debugtrace_setup,
++"debugtrace\n"
++"    Causes the tracing thread to pause until it is attached by a\n"
++"    debugger and continued.  This is mostly for debugging crashes\n"
++"    early during boot, and should be pretty much obsoleted by\n"
++"    the debug switch.\n\n"
++);
++
++static int __init uml_honeypot_setup(char *line, int *add)
++{
++      jail_setup("", add);
++      honeypot = 1;
++      return 0;
++}
++__uml_setup("honeypot", uml_honeypot_setup, 
++"honeypot\n"
++"    This makes UML put process stacks in the same location as they are\n"
++"    on the host, allowing expoits such as stack smashes to work against\n"
++"    UML.  This implies 'jail'.\n\n"
++);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/trap_user.c um/arch/um/kernel/tt/trap_user.c
+--- orig/arch/um/kernel/tt/trap_user.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/trap_user.c   Mon Dec  9 13:14:42 2002
+@@ -0,0 +1,59 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdlib.h>
++#include <errno.h>
++#include <signal.h>
++#include <asm/sigcontext.h>
++#include "sysdep/ptrace.h"
++#include "signal_user.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "task.h"
++#include "tt.h"
++
++void sig_handler_common_tt(int sig, void *sc_ptr)
++{
++      struct sigcontext *sc = sc_ptr;
++      struct tt_regs save_regs, *r;
++      struct signal_info *info;
++      int save_errno = errno, is_user;
++
++      unprotect_kernel_mem();
++
++      r = &TASK_REGS(get_current())->tt;
++      save_regs = *r;
++      is_user = user_context(SC_SP(sc));
++      r->sc = sc;
++      if(sig != SIGUSR2) 
++              r->syscall = -1;
++
++      change_sig(SIGUSR1, 1);
++      info = &sig_info[sig];
++      if(!info->is_irq) unblock_signals();
++
++      (*info->handler)(sig, (union uml_pt_regs *) r);
++
++      if(is_user){
++              interrupt_end();
++              block_signals();
++              change_sig(SIGUSR1, 0);
++              set_user_mode(NULL);
++      }
++      *r = save_regs;
++      errno = save_errno;
++      if(is_user) protect_kernel_mem();
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/uaccess_user.c um/arch/um/kernel/tt/uaccess_user.c
+--- orig/arch/um/kernel/tt/uaccess_user.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/uaccess_user.c        Tue Mar 25 17:10:54 2003
+@@ -0,0 +1,100 @@
++/* 
++ * Copyright (C) 2001 Chris Emerson (cemerson@chiark.greenend.org.uk)
++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <setjmp.h>
++#include <string.h>
++#include "user_util.h"
++#include "uml_uaccess.h"
++#include "task.h"
++#include "kern_util.h"
++
++int __do_copy_from_user(void *to, const void *from, int n,
++                      void **fault_addr, void **fault_catcher)
++{
++      struct tt_regs save = TASK_REGS(get_current())->tt;
++      unsigned long fault;
++      int faulted;
++
++      fault = __do_user_copy(to, from, n, fault_addr, fault_catcher,
++                             __do_copy, &faulted);
++      TASK_REGS(get_current())->tt = save;
++
++      if(!faulted) return(0);
++      else return(n - (fault - (unsigned long) from));
++}
++
++static void __do_strncpy(void *dst, const void *src, int count)
++{
++      strncpy(dst, src, count);
++}     
++
++int __do_strncpy_from_user(char *dst, const char *src, unsigned long count,
++                         void **fault_addr, void **fault_catcher)
++{
++      struct tt_regs save = TASK_REGS(get_current())->tt;
++      unsigned long fault;
++      int faulted;
++
++      fault = __do_user_copy(dst, src, count, fault_addr, fault_catcher,
++                             __do_strncpy, &faulted);
++      TASK_REGS(get_current())->tt = save;
++
++      if(!faulted) return(strlen(dst));
++      else return(-1);
++}
++
++static void __do_clear(void *to, const void *from, int n)
++{
++      memset(to, 0, n);
++}     
++
++int __do_clear_user(void *mem, unsigned long len,
++                  void **fault_addr, void **fault_catcher)
++{
++      struct tt_regs save = TASK_REGS(get_current())->tt;
++      unsigned long fault;
++      int faulted;
++
++      fault = __do_user_copy(mem, NULL, len, fault_addr, fault_catcher,
++                             __do_clear, &faulted);
++      TASK_REGS(get_current())->tt = save;
++
++      if(!faulted) return(0);
++      else return(len - (fault - (unsigned long) mem));
++}
++
++int __do_strnlen_user(const char *str, unsigned long n,
++                    void **fault_addr, void **fault_catcher)
++{
++      struct tt_regs save = TASK_REGS(get_current())->tt;
++      int ret;
++      unsigned long *faddrp = (unsigned long *)fault_addr;
++      jmp_buf jbuf;
++
++      *fault_catcher = &jbuf;
++      if(setjmp(jbuf) == 0){
++              ret = strlen(str) + 1;
++      } 
++      else {
++              ret = *faddrp - (unsigned long) str;
++      }
++      *fault_addr = NULL;
++      *fault_catcher = NULL;
++
++      TASK_REGS(get_current())->tt = save;
++      return ret;
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/unmap.c um/arch/um/kernel/tt/unmap.c
+--- orig/arch/um/kernel/tt/unmap.c     Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/unmap.c       Wed Dec 11 10:42:21 2002
+@@ -0,0 +1,31 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <sys/mman.h>
++
++int switcheroo(int fd, int prot, void *from, void *to, int size)
++{
++      if(munmap(to, size) < 0){
++              return(-1);
++      }
++      if(mmap(to, size, prot, MAP_SHARED | MAP_FIXED, fd, 0) != to){
++              return(-1);
++      }
++      if(munmap(from, size) < 0){
++              return(-1);
++      }
++      return(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tty_log.c um/arch/um/kernel/tty_log.c
+--- orig/arch/um/kernel/tty_log.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tty_log.c        Wed Apr 16 16:35:20 2003
+@@ -0,0 +1,213 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) and 
++ * geoffrey hing <ghing@net.ohio-state.edu>
++ * Licensed under the GPL
++ */
++
++#include <errno.h>
++#include <string.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <fcntl.h>
++#include <sys/time.h>
++#include "init.h"
++#include "user.h"
++#include "kern_util.h"
++#include "os.h"
++
++#define TTY_LOG_DIR "./"
++
++/* Set early in boot and then unchanged */
++static char *tty_log_dir = TTY_LOG_DIR;
++static int tty_log_fd = -1;
++
++#define TTY_LOG_OPEN 1
++#define TTY_LOG_CLOSE 2
++#define TTY_LOG_WRITE 3
++#define TTY_LOG_EXEC 4
++
++#define TTY_READ 1
++#define TTY_WRITE 2
++
++struct tty_log_buf {
++      int what;
++      unsigned long tty;
++      int len;
++      int direction;
++      unsigned long sec;
++      unsigned long usec;
++};
++
++int open_tty_log(void *tty, void *current_tty)
++{
++      struct timeval tv;
++      struct tty_log_buf data;
++      char buf[strlen(tty_log_dir) + sizeof("01234567890-01234567\0")];
++      int fd;
++
++      gettimeofday(&tv, NULL);
++      if(tty_log_fd != -1){
++              data = ((struct tty_log_buf) { .what    = TTY_LOG_OPEN,
++                                             .tty  = (unsigned long) tty,
++                                             .len  = sizeof(current_tty),
++                                             .direction = 0,
++                                             .sec = tv.tv_sec,
++                                             .usec = tv.tv_usec } );
++              write(tty_log_fd, &data, sizeof(data));
++              write(tty_log_fd, &current_tty, data.len);
++              return(tty_log_fd);
++      }
++
++      sprintf(buf, "%s/%0u-%0u", tty_log_dir, (unsigned int) tv.tv_sec, 
++              (unsigned int) tv.tv_usec);
++
++      fd = os_open_file(buf, of_append(of_create(of_rdwr(OPENFLAGS()))),
++                        0644);
++      if(fd < 0){
++              printk("open_tty_log : couldn't open '%s', errno = %d\n",
++                     buf, -fd);
++      }
++      return(fd);
++}
++
++void close_tty_log(int fd, void *tty)
++{
++      struct tty_log_buf data;
++      struct timeval tv;
++
++      if(tty_log_fd != -1){
++              gettimeofday(&tv, NULL);
++              data = ((struct tty_log_buf) { .what    = TTY_LOG_CLOSE,
++                                             .tty  = (unsigned long) tty,
++                                             .len  = 0,
++                                             .direction = 0,
++                                             .sec = tv.tv_sec,
++                                             .usec = tv.tv_usec } );
++              write(tty_log_fd, &data, sizeof(data));
++              return;
++      }
++      close(fd);
++}
++
++static int log_chunk(int fd, char *buf, int len)
++{
++      int total = 0, try, missed, n;
++      char chunk[64];
++
++      while(len > 0){
++              try = (len > sizeof(chunk)) ? sizeof(chunk) : len;
++              missed = copy_from_user_proc(chunk, buf, try);
++              try -= missed;
++              n = write(fd, chunk, try);
++              if(n != try)
++                      return(-errno);
++              if(missed != 0)
++                      return(-EFAULT);
++
++              len -= try;
++              total += try;
++              buf += try;
++      }
++
++      return(total);
++}
++
++int write_tty_log(int fd, char *buf, int len, void *tty, int is_read)
++{
++      struct timeval tv;
++      struct tty_log_buf data;
++      int direction;
++
++      if(fd == tty_log_fd){
++              gettimeofday(&tv, NULL);
++              direction = is_read ? TTY_READ : TTY_WRITE;
++              data = ((struct tty_log_buf) { .what    = TTY_LOG_WRITE,
++                                             .tty  = (unsigned long) tty,
++                                             .len  = len,
++                                             .direction = direction,
++                                             .sec = tv.tv_sec,
++                                             .usec = tv.tv_usec } );
++              write(tty_log_fd, &data, sizeof(data));
++      }
++
++      return(log_chunk(fd, buf, len));
++}
++
++void log_exec(char **argv, void *tty)
++{
++      struct timeval tv;
++      struct tty_log_buf data;
++      char **ptr,*arg;
++      int len;
++      
++      if(tty_log_fd == -1) return;
++
++      gettimeofday(&tv, NULL);
++
++      len = 0;
++      for(ptr = argv; ; ptr++){
++              if(copy_from_user_proc(&arg, ptr, sizeof(arg)))
++                      return;
++              if(arg == NULL) break;
++              len += strlen_user_proc(arg);
++      }
++
++      data = ((struct tty_log_buf) { .what    = TTY_LOG_EXEC,
++                                     .tty  = (unsigned long) tty,
++                                     .len  = len,
++                                     .direction = 0,
++                                     .sec = tv.tv_sec,
++                                     .usec = tv.tv_usec } );
++      write(tty_log_fd, &data, sizeof(data));
++
++      for(ptr = argv; ; ptr++){
++              if(copy_from_user_proc(&arg, ptr, sizeof(arg)))
++                      return;
++              if(arg == NULL) break;
++              log_chunk(tty_log_fd, arg, strlen_user_proc(arg));
++      }
++}
++
++static int __init set_tty_log_dir(char *name, int *add)
++{
++      tty_log_dir = name;
++      return 0;
++}
++
++__uml_setup("tty_log_dir=", set_tty_log_dir,
++"tty_log_dir=<directory>\n"
++"    This is used to specify the directory where the logs of all pty\n"
++"    data from this UML machine will be written.\n\n"
++);
++
++static int __init set_tty_log_fd(char *name, int *add)
++{
++      char *end;
++
++      tty_log_fd = strtoul(name, &end, 0);
++      if((*end != '\0') || (end == name)){
++              printf("set_tty_log_fd - strtoul failed on '%s'\n", name);
++              tty_log_fd = -1;
++      }
++      return 0;
++}
++
++__uml_setup("tty_log_fd=", set_tty_log_fd,
++"tty_log_fd=<fd>\n"
++"    This is used to specify a preconfigured file descriptor to which all\n"
++"    tty data will be written.  Preconfigure the descriptor with something\n"
++"    like '10>tty_log tty_log_fd=10'.\n\n"
++);
++
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/uaccess_user.c um/arch/um/kernel/uaccess_user.c
+--- orig/arch/um/kernel/uaccess_user.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/uaccess_user.c   Tue Mar 25 17:06:05 2003
+@@ -0,0 +1,64 @@
++/* 
++ * Copyright (C) 2001 Chris Emerson (cemerson@chiark.greenend.org.uk)
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <setjmp.h>
++#include <string.h>
++
++/* These are here rather than tt/uaccess.c because skas mode needs them in
++ * order to do SIGBUS recovery when a tmpfs mount runs out of room.
++ */
++
++unsigned long __do_user_copy(void *to, const void *from, int n,
++                           void **fault_addr, void **fault_catcher,
++                           void (*op)(void *to, const void *from,
++                                      int n), int *faulted_out)
++{
++      unsigned long *faddrp = (unsigned long *) fault_addr, ret;
++
++      jmp_buf jbuf;
++      *fault_catcher = &jbuf;
++      if(setjmp(jbuf) == 0){
++              (*op)(to, from, n);
++              ret = 0;
++              *faulted_out = 0;
++      } 
++      else {
++              ret = *faddrp;
++              *faulted_out = 1;
++      }
++      *fault_addr = NULL;
++      *fault_catcher = NULL;
++      return ret;
++}
++
++void __do_copy(void *to, const void *from, int n)
++{
++      memcpy(to, from, n);
++}     
++
++
++int __do_copy_to_user(void *to, const void *from, int n,
++                    void **fault_addr, void **fault_catcher)
++{
++      unsigned long fault;
++      int faulted;
++
++      fault = __do_user_copy(to, from, n, fault_addr, fault_catcher,
++                             __do_copy, &faulted);
++      if(!faulted) return(0);
++      else return(n - (fault - (unsigned long) to));
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/um_arch.c um/arch/um/kernel/um_arch.c
+--- orig/arch/um/kernel/um_arch.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/um_arch.c        Thu Mar  6 19:06:09 2003
+@@ -0,0 +1,425 @@
++/* 
++ * Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/config.h"
++#include "linux/kernel.h"
++#include "linux/sched.h"
++#include "linux/notifier.h"
++#include "linux/mm.h"
++#include "linux/types.h"
++#include "linux/tty.h"
++#include "linux/init.h"
++#include "linux/bootmem.h"
++#include "linux/spinlock.h"
++#include "linux/utsname.h"
++#include "linux/sysrq.h"
++#include "linux/seq_file.h"
++#include "linux/delay.h"
++#include "asm/page.h"
++#include "asm/pgtable.h"
++#include "asm/ptrace.h"
++#include "asm/elf.h"
++#include "asm/user.h"
++#include "ubd_user.h"
++#include "asm/current.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "kern.h"
++#include "mprot.h"
++#include "mem_user.h"
++#include "mem.h"
++#include "umid.h"
++#include "initrd.h"
++#include "init.h"
++#include "os.h"
++#include "choose-mode.h"
++#include "mode_kern.h"
++#include "mode.h"
++
++#define DEFAULT_COMMAND_LINE "root=/dev/ubd0"
++
++struct cpuinfo_um boot_cpu_data = { 
++      .loops_per_jiffy        = 0,
++      .pgd_quick              = NULL,
++      .pmd_quick              = NULL,
++      .pte_quick              = NULL,
++      .pgtable_cache_sz       = 0,
++      .ipi_pipe               = { -1, -1 }
++};
++
++unsigned long thread_saved_pc(struct thread_struct *thread)
++{
++      return(os_process_pc(CHOOSE_MODE_PROC(thread_pid_tt, thread_pid_skas,
++                                            thread)));
++}
++
++static int show_cpuinfo(struct seq_file *m, void *v)
++{
++      int index;
++
++      index = (struct cpuinfo_um *)v - cpu_data;
++#ifdef CONFIG_SMP
++      if (!(cpu_online_map & (1 << index)))
++              return 0;
++#endif
++
++      seq_printf(m, "processor\t: %d\n", index);
++      seq_printf(m, "vendor_id\t: User Mode Linux\n");
++      seq_printf(m, "model name\t: UML\n");
++      seq_printf(m, "mode\t\t: %s\n", CHOOSE_MODE("tt", "skas"));
++      seq_printf(m, "host\t\t: %s\n", host_info);
++      seq_printf(m, "bogomips\t: %lu.%02lu\n\n",
++                 loops_per_jiffy/(500000/HZ),
++                 (loops_per_jiffy/(5000/HZ)) % 100);
++
++      return(0);
++}
++
++static void *c_start(struct seq_file *m, loff_t *pos)
++{
++      return *pos < NR_CPUS ? cpu_data + *pos : NULL;
++}
++
++static void *c_next(struct seq_file *m, void *v, loff_t *pos)
++{
++      ++*pos;
++      return c_start(m, pos);
++}
++
++static void c_stop(struct seq_file *m, void *v)
++{
++}
++
++struct seq_operations cpuinfo_op = {
++      .start  = c_start,
++      .next   = c_next,
++      .stop   = c_stop,
++      .show   = show_cpuinfo,
++};
++
++pte_t * __bad_pagetable(void)
++{
++      panic("Someone should implement __bad_pagetable");
++      return(NULL);
++}
++
++/* Set in linux_main */
++unsigned long host_task_size;
++unsigned long task_size;
++unsigned long uml_start;
++
++/* Set in early boot */
++unsigned long uml_physmem;
++unsigned long uml_reserved;
++unsigned long start_vm;
++unsigned long end_vm;
++int ncpus = 1;
++
++#ifdef CONFIG_MODE_TT
++/* Pointer set in linux_main, the array itself is private to each thread,
++ * and changed at address space creation time so this poses no concurrency
++ * problems.
++ */
++static char *argv1_begin = NULL;
++static char *argv1_end = NULL;
++#endif
++
++/* Set in early boot */
++static int have_root __initdata = 0;
++long physmem_size = 32 * 1024 * 1024;
++
++void set_cmdline(char *cmd)
++{
++#ifdef CONFIG_MODE_TT
++      char *umid, *ptr;
++
++      if(CHOOSE_MODE(honeypot, 0)) return;
++
++      umid = get_umid(1);
++      if(umid != NULL){
++              snprintf(argv1_begin, 
++                       (argv1_end - argv1_begin) * sizeof(*ptr), 
++                       "(%s) ", umid);
++              ptr = &argv1_begin[strlen(argv1_begin)];
++      }
++      else ptr = argv1_begin;
++
++      snprintf(ptr, (argv1_end - ptr) * sizeof(*ptr), "[%s]", cmd);
++      memset(argv1_begin + strlen(argv1_begin), '\0', 
++             argv1_end - argv1_begin - strlen(argv1_begin));
++#endif
++}
++
++static char *usage_string = 
++"User Mode Linux v%s\n"
++"     available at http://user-mode-linux.sourceforge.net/\n\n";
++
++static int __init uml_version_setup(char *line, int *add)
++{
++      printf("%s\n", system_utsname.release);
++      exit(0);
++}
++
++__uml_setup("--version", uml_version_setup,
++"--version\n"
++"    Prints the version number of the kernel.\n\n"
++);
++
++static int __init uml_root_setup(char *line, int *add)
++{
++      have_root = 1;
++      return 0;
++}
++
++__uml_setup("root=", uml_root_setup,
++"root=<file containing the root fs>\n"
++"    This is actually used by the generic kernel in exactly the same\n"
++"    way as in any other kernel. If you configure a number of block\n"
++"    devices and want to boot off something other than ubd0, you \n"
++"    would use something like:\n"
++"        root=/dev/ubd5\n\n"
++);
++
++#ifdef CONFIG_SMP
++static int __init uml_ncpus_setup(char *line, int *add)
++{
++       if (!sscanf(line, "%d", &ncpus)) {
++               printf("Couldn't parse [%s]\n", line);
++               return -1;
++       }
++
++       return 0;
++}
++
++__uml_setup("ncpus=", uml_ncpus_setup,
++"ncpus=<# of desired CPUs>\n"
++"    This tells an SMP kernel how many virtual processors to start.\n\n" 
++);
++#endif
++
++int force_tt = 0;
++
++#if defined(CONFIG_MODE_TT) && defined(CONFIG_MODE_SKAS)
++#define DEFAULT_TT 0
++
++static int __init mode_tt_setup(char *line, int *add)
++{
++      force_tt = 1;
++      return(0);
++}
++
++#else
++#ifdef CONFIG_MODE_SKAS
++
++#define DEFAULT_TT 0
++
++static int __init mode_tt_setup(char *line, int *add)
++{
++      printf("CONFIG_MODE_TT disabled - 'mode=tt' ignored\n");
++      return(0);
++}
++
++#else
++#ifdef CONFIG_MODE_TT
++
++#define DEFAULT_TT 1
++
++static int __init mode_tt_setup(char *line, int *add)
++{
++      printf("CONFIG_MODE_SKAS disabled - 'mode=tt' redundant\n");
++      return(0);
++}
++
++#else
++
++#error Either CONFIG_MODE_TT or CONFIG_MODE_SKAS must be enabled
++
++#endif
++#endif
++#endif
++
++__uml_setup("mode=tt", mode_tt_setup,
++"mode=tt\n"
++"    When both CONFIG_MODE_TT and CONFIG_MODE_SKAS are enabled, this option\n"
++"    forces UML to run in tt (tracing thread) mode.  It is not the default\n"
++"    because it's slower and less secure than skas mode.\n\n"
++);
++
++int mode_tt = DEFAULT_TT;
++
++static int __init Usage(char *line, int *add)
++{
++      const char **p;
++
++      printf(usage_string, system_utsname.release);
++      p = &__uml_help_start;
++      while (p < &__uml_help_end) {
++              printf("%s", *p);
++              p++;
++      }
++      exit(0);
++}
++
++__uml_setup("--help", Usage,
++"--help\n"
++"    Prints this message.\n\n"
++);
++
++static int __init uml_checksetup(char *line, int *add)
++{
++      struct uml_param *p;
++
++      p = &__uml_setup_start;
++      while(p < &__uml_setup_end) {
++              int n;
++
++              n = strlen(p->str);
++              if(!strncmp(line, p->str, n)){
++                      if (p->setup_func(line + n, add)) return 1;
++              }
++              p++;
++      }
++      return 0;
++}
++
++static void __init uml_postsetup(void)
++{
++      initcall_t *p;
++
++      p = &__uml_postsetup_start;
++      while(p < &__uml_postsetup_end){
++              (*p)();
++              p++;
++      }
++      return;
++}
++
++/* Set during early boot */
++unsigned long brk_start;
++static struct vm_reserved kernel_vm_reserved;
++
++#define MIN_VMALLOC (32 * 1024 * 1024)
++
++int linux_main(int argc, char **argv)
++{
++      unsigned long avail;
++      unsigned long virtmem_size, max_physmem;
++      unsigned int i, add, err;
++
++      for (i = 1; i < argc; i++){
++              if((i == 1) && (argv[i][0] == ' ')) continue;
++              add = 1;
++              uml_checksetup(argv[i], &add);
++              if(add) add_arg(saved_command_line, argv[i]);
++      }
++      if(have_root == 0) add_arg(saved_command_line, DEFAULT_COMMAND_LINE);
++
++      mode_tt = force_tt ? 1 : !can_do_skas();
++      uml_start = CHOOSE_MODE_PROC(set_task_sizes_tt, set_task_sizes_skas, 0,
++                                   &host_task_size, &task_size);
++
++      brk_start = (unsigned long) sbrk(0);
++      CHOOSE_MODE_PROC(before_mem_tt, before_mem_skas, brk_start);
++
++      uml_physmem = uml_start;
++
++      /* Reserve up to 4M after the current brk */
++      uml_reserved = ROUND_4M(brk_start) + (1 << 22);
++
++      setup_machinename(system_utsname.machine);
++
++#ifdef CONFIG_MODE_TT
++      argv1_begin = argv[1];
++      argv1_end = &argv[1][strlen(argv[1])];
++#endif
++  
++      set_usable_vm(uml_physmem, get_kmem_end());
++
++      highmem = 0;
++      max_physmem = get_kmem_end() - uml_physmem - MIN_VMALLOC;
++      if(physmem_size > max_physmem){
++              highmem = physmem_size - max_physmem;
++              physmem_size -= highmem;
++#ifndef CONFIG_HIGHMEM
++              highmem = 0;
++              printf("CONFIG_HIGHMEM not enabled - physical memory shrunk "
++                     "to %ld bytes\n", physmem_size);
++#endif
++      }
++
++      high_physmem = uml_physmem + physmem_size;
++      high_memory = (void *) high_physmem;
++
++      start_vm = VMALLOC_START;
++
++      setup_physmem(uml_physmem, uml_reserved, physmem_size);
++      virtmem_size = physmem_size;
++      avail = get_kmem_end() - start_vm;
++      if(physmem_size > avail) virtmem_size = avail;
++      end_vm = start_vm + virtmem_size;
++
++      if(virtmem_size < physmem_size)
++              printf("Kernel virtual memory size shrunk to %ld bytes\n",
++                     virtmem_size);
++
++      err = reserve_vm(high_physmem, end_vm, &kernel_vm_reserved);
++      if(err){
++              printf("Failed to reserve VM area for kernel VM\n");
++              exit(1);
++      }
++
++      uml_postsetup();
++
++      init_task.thread.kernel_stack = (unsigned long) &init_task + 
++              2 * PAGE_SIZE;
++
++      task_protections((unsigned long) &init_task);
++      os_flush_stdout();
++
++      return(CHOOSE_MODE(start_uml_tt(), start_uml_skas()));
++}
++
++static int panic_exit(struct notifier_block *self, unsigned long unused1,
++                    void *unused2)
++{
++#ifdef CONFIG_SYSRQ
++      handle_sysrq('p', &current->thread.regs, NULL, NULL);
++#endif
++      machine_halt();
++      return(0);
++}
++
++static struct notifier_block panic_exit_notifier = {
++      .notifier_call          = panic_exit,
++      .next                   = NULL,
++      .priority               = 0
++};
++
++void __init setup_arch(char **cmdline_p)
++{
++      notifier_chain_register(&panic_notifier_list, &panic_exit_notifier);
++      paging_init();
++      strcpy(command_line, saved_command_line);
++      *cmdline_p = command_line;
++      setup_hostinfo();
++}
++
++void __init check_bugs(void)
++{
++      arch_check_bugs();
++      check_ptrace();
++      check_sigio();
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
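
uml_checksetup() and Usage() above rely on every __uml_setup() use placing a struct uml_param in one linker-provided table (bounded by __uml_setup_start/__uml_setup_end) and its help string in another (__uml_help_start/__uml_help_end). A registration macro along the following lines would satisfy both loops; the section names and exact spelling are illustrative, not necessarily what UML's init.h uses:

struct uml_param {
	const char *str;			/* option prefix, e.g. "root=" */
	int (*setup_func)(char *, int *);	/* gets the text after the prefix */
};

#define __uml_setup(text, fn, help_text)				\
	static const char *__uml_help_##fn				\
	__attribute__((__section__(".uml.help"), used)) = help_text;	\
	static struct uml_param __uml_param_##fn			\
	__attribute__((__section__(".uml.setup"), used)) = { text, fn }
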
+diff -Naur -X ../exclude-files orig/arch/um/kernel/umid.c um/arch/um/kernel/umid.c
+--- orig/arch/um/kernel/umid.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/umid.c   Mon Feb 24 23:11:23 2003
+@@ -0,0 +1,319 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <unistd.h>
++#include <fcntl.h>
++#include <errno.h>
++#include <string.h>
++#include <stdlib.h>
++#include <dirent.h>
++#include <signal.h>
++#include <sys/stat.h>
++#include <sys/param.h>
++#include "user.h"
++#include "umid.h"
++#include "init.h"
++#include "os.h"
++#include "user_util.h"
++#include "choose-mode.h"
++
++#define UMID_LEN 64
++#define UML_DIR "~/.uml/"
++
++/* Changed by set_umid and make_umid, which are run early in boot */
++static char umid[UMID_LEN] = { 0 };
++
++/* Changed by set_uml_dir and make_uml_dir, which are run early in boot */
++static char *uml_dir = UML_DIR;
++
++/* Changed by set_umid */
++static int umid_is_random = 1;
++static int umid_inited = 0;
++
++static int make_umid(int (*printer)(const char *fmt, ...));
++
++static int __init set_umid(char *name, int is_random, 
++                         int (*printer)(const char *fmt, ...))
++{
++      if(umid_inited){
++              (*printer)("Unique machine name can't be set twice\n");
++              return(-1);
++      }
++
++      if(strlen(name) > UMID_LEN - 1)
++		(*printer)("Unique machine name is being truncated to %d "
++                         "characters\n", UMID_LEN);
++      strncpy(umid, name, UMID_LEN - 1);
++      umid[UMID_LEN - 1] = '\0';
++
++      umid_is_random = is_random;
++      umid_inited = 1;
++      return 0;
++}
++
++static int __init set_umid_arg(char *name, int *add)
++{
++      return(set_umid(name, 0, printf));
++}
++
++__uml_setup("umid=", set_umid_arg,
++"umid=<name>\n"
++"    This is used to assign a unique identity to this UML machine and\n"
++"    is used for naming the pid file and management console socket.\n\n"
++);
++
++int __init umid_file_name(char *name, char *buf, int len)
++{
++      int n;
++
++      if(!umid_inited && make_umid(printk)) return(-1);
++
++      n = strlen(uml_dir) + strlen(umid) + strlen(name) + 1;
++      if(n > len){
++              printk("umid_file_name : buffer too short\n");
++              return(-1);
++      }
++
++      sprintf(buf, "%s%s/%s", uml_dir, umid, name);
++      return(0);
++}
++
++extern int tracing_pid;
++
++static int __init create_pid_file(void)
++{
++      char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")];
++      char pid[sizeof("nnnnn\0")];
++      int fd;
++
++      if(umid_file_name("pid", file, sizeof(file))) return 0;
++
++      fd = os_open_file(file, of_create(of_excl(of_rdwr(OPENFLAGS()))), 
++                        0644);
++      if(fd < 0){
++              printf("Open of machine pid file \"%s\" failed - "
++                     "errno = %d\n", file, -fd);
++              return 0;
++      }
++
++      sprintf(pid, "%d\n", os_getpid());
++      if(write(fd, pid, strlen(pid)) != strlen(pid))
++              printf("Write of pid file failed - errno = %d\n", errno);
++      close(fd);
++      return 0;
++}
++
++static int actually_do_remove(char *dir)
++{
++      DIR *directory;
++      struct dirent *ent;
++      int len;
++      char file[256];
++
++      if((directory = opendir(dir)) == NULL){
++              printk("actually_do_remove : couldn't open directory '%s', "
++                     "errno = %d\n", dir, errno);
++              return(1);
++      }
++      while((ent = readdir(directory)) != NULL){
++              if(!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, ".."))
++                      continue;
++              len = strlen(dir) + sizeof("/") + strlen(ent->d_name) + 1;
++              if(len > sizeof(file)){
++                      printk("Not deleting '%s' from '%s' - name too long\n",
++                             ent->d_name, dir);
++                      continue;
++              }
++              sprintf(file, "%s/%s", dir, ent->d_name);
++              if(unlink(file) < 0){
++                      printk("actually_do_remove : couldn't remove '%s' "
++                             "from '%s', errno = %d\n", ent->d_name, dir, 
++                             errno);
++                      return(1);
++              }
++      }
++      if(rmdir(dir) < 0){
++              printk("actually_do_remove : couldn't rmdir '%s', "
++                     "errno = %d\n", dir, errno);
++              return(1);
++      }
++      return(0);
++}
++
++void remove_umid_dir(void)
++{
++      char dir[strlen(uml_dir) + UMID_LEN + 1];
++      if(!umid_inited) return;
++
++      sprintf(dir, "%s%s", uml_dir, umid);
++      actually_do_remove(dir);
++}
++
++char *get_umid(int only_if_set)
++{
++      if(only_if_set && umid_is_random) return(NULL);
++      return(umid);
++}
++
++int not_dead_yet(char *dir)
++{
++      char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")];
++      char pid[sizeof("nnnnn\0")], *end;
++      int dead, fd, p;
++
++      sprintf(file, "%s/pid", dir);
++      dead = 0;
++      if((fd = os_open_file(file, of_read(OPENFLAGS()), 0)) < 0){
++              if(fd != -ENOENT){
++                      printk("not_dead_yet : couldn't open pid file '%s', "
++                             "errno = %d\n", file, -fd);
++                      return(1);
++              }
++              dead = 1;
++      }
++      if(fd > 0){
++              if(read(fd, pid, sizeof(pid)) < 0){
++                      printk("not_dead_yet : couldn't read pid file '%s', "
++                             "errno = %d\n", file, errno);
++                      return(1);
++              }
++              p = strtoul(pid, &end, 0);
++              if(end == pid){
++                      printk("not_dead_yet : couldn't parse pid file '%s', "
++                             "errno = %d\n", file, errno);
++                      dead = 1;
++              }
++              if(((kill(p, 0) < 0) && (errno == ESRCH)) ||
++                 (p == CHOOSE_MODE(tracing_pid, os_getpid())))
++                      dead = 1;
++      }
++      if(!dead) return(1);
++      return(actually_do_remove(dir));
++}
++
++static int __init set_uml_dir(char *name, int *add)
++{
++      if((strlen(name) > 0) && (name[strlen(name) - 1] != '/')){
++		uml_dir = malloc(strlen(name) + 2);	/* name + '/' + NUL */
++              if(uml_dir == NULL){
++                      printf("Failed to malloc uml_dir - error = %d\n",
++                             errno);
++                      uml_dir = name;
++                      return(0);
++              }
++              sprintf(uml_dir, "%s/", name);
++      }
++      else uml_dir = name;
++      return 0;
++}
++
++static int __init make_uml_dir(void)
++{
++      char dir[MAXPATHLEN + 1] = { '\0' };
++      int len;
++
++      if(*uml_dir == '~'){
++              char *home = getenv("HOME");
++
++              if(home == NULL){
++                      printf("make_uml_dir : no value in environment for "
++                             "$HOME\n");
++                      exit(1);
++              }
++		strncpy(dir, home, sizeof(dir) - 1);
++              uml_dir++;
++      }
++      len = strlen(dir);
++      strncat(dir, uml_dir, sizeof(dir) - len);
++      len = strlen(dir);
++      if((len > 0) && (len < sizeof(dir) - 1) && (dir[len - 1] != '/')){
++              dir[len] = '/';
++              dir[len + 1] = '\0';
++      }
++
++      if((uml_dir = malloc(strlen(dir) + 1)) == NULL){
++              printf("make_uml_dir : malloc failed, errno = %d\n", errno);
++              exit(1);
++      }
++      strcpy(uml_dir, dir);
++      
++      if((mkdir(uml_dir, 0777) < 0) && (errno != EEXIST)){
++              printf("Failed to mkdir %s - errno = %i\n", uml_dir, errno);
++              return(-1);
++      }
++      return 0;
++}
++
++static int __init make_umid(int (*printer)(const char *fmt, ...))
++{
++      int fd, err;
++      char tmp[strlen(uml_dir) + UMID_LEN + 1];
++
++      strncpy(tmp, uml_dir, sizeof(tmp) - 1);
++      tmp[sizeof(tmp) - 1] = '\0';
++
++      if(!umid_inited){
++              strcat(tmp, "XXXXXX");
++              fd = mkstemp(tmp);
++              if(fd < 0){
++                      (*printer)("make_umid - mkstemp failed, errno = %d\n",
++                                 errno);
++                      return(1);
++              }
++
++              close(fd);
++              /* There's a nice tiny little race between this unlink and
++               * the mkdir below.  It'd be nice if there were a mkstemp
++               * for directories.
++               */
++              unlink(tmp);
++              set_umid(&tmp[strlen(uml_dir)], 1, printer);
++      }
++      
++      sprintf(tmp, "%s%s", uml_dir, umid);
++
++      if((err = mkdir(tmp, 0777)) < 0){
++              if(errno == EEXIST){
++                      if(not_dead_yet(tmp)){
++                              (*printer)("umid '%s' is in use\n", umid);
++                              return(-1);
++                      }
++                      err = mkdir(tmp, 0777);
++              }
++      }
++      if(err < 0){
++              (*printer)("Failed to create %s - errno = %d\n", umid, errno);
++              return(-1);
++      }
++
++      return(0);
++}
++
++__uml_setup("uml_dir=", set_uml_dir,
++"uml_dir=<directory>\n"
++"    The location to place the pid and umid files.\n\n"
++);
++
++__uml_postsetup(make_uml_dir);
++
++static int __init make_umid_setup(void)
++{
++      return(make_umid(printf));
++}
++
++__uml_postsetup(make_umid_setup);
++__uml_postsetup(create_pid_file);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
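
The window that the comment in make_umid() points out, between unlink()ing the mkstemp() file and mkdir()ing a directory of the same name, can be closed with mkdtemp(), which creates a uniquely named directory in one step. A sketch of that variant for the random-umid case, assuming mkdtemp() is acceptable on the build host:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative replacement for the mkstemp()/unlink()/mkdir() sequence:
 * the directory itself is created atomically, so no other UML instance
 * can claim the name in between.  Error handling trimmed for brevity. */
static int make_random_umid_dir(const char *dir, char *umid_out, int len)
{
	char tmp[strlen(dir) + sizeof("XXXXXX")];

	sprintf(tmp, "%sXXXXXX", dir);
	if (mkdtemp(tmp) == NULL)
		return -1;			/* errno says why */
	strncpy(umid_out, &tmp[strlen(dir)], len - 1);
	umid_out[len - 1] = '\0';
	return 0;
}
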
+diff -Naur -X ../exclude-files orig/arch/um/kernel/user_syms.c um/arch/um/kernel/user_syms.c
+--- orig/arch/um/kernel/user_syms.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/user_syms.c      Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,116 @@
++#include <stdio.h>
++#include <unistd.h>
++#include <fcntl.h>
++#include <dirent.h>
++#include <errno.h>
++#include <utime.h>
++#include <string.h>
++#include <sys/stat.h>
++#include <sys/vfs.h>
++#include <sys/ioctl.h>
++#include "user_util.h"
++#include "mem_user.h"
++
++/* XXX All the __CONFIG_* stuff is broken because this file can't include
++ * config.h
++ */
++
++/* Had to steal this from linux/module.h because that file can't be included
++ * since this includes various user-level headers.
++ */
++
++struct module_symbol
++{
++      unsigned long value;
++      const char *name;
++};
++
++/* Indirect stringification.  */
++
++#define __MODULE_STRING_1(x)  #x
++#define __MODULE_STRING(x)    __MODULE_STRING_1(x)
++
++#if !defined(__AUTOCONF_INCLUDED__)
++
++#define __EXPORT_SYMBOL(sym,str)   error config_must_be_included_before_module
++#define EXPORT_SYMBOL(var)       error config_must_be_included_before_module
++#define EXPORT_SYMBOL_NOVERS(var)  error config_must_be_included_before_module
++
++#elif !defined(__CONFIG_MODULES__)
++
++#define __EXPORT_SYMBOL(sym,str)
++#define EXPORT_SYMBOL(var)
++#define EXPORT_SYMBOL_NOVERS(var)
++
++#else
++
++#define __EXPORT_SYMBOL(sym, str)                     \
++const char __kstrtab_##sym[]                          \
++__attribute__((section(".kstrtab"))) = str;           \
++const struct module_symbol __ksymtab_##sym            \
++__attribute__((section("__ksymtab"))) =                       \
++{ (unsigned long)&sym, __kstrtab_##sym }
++
++#if defined(__MODVERSIONS__) || !defined(__CONFIG_MODVERSIONS__)
++#define EXPORT_SYMBOL(var)  __EXPORT_SYMBOL(var, __MODULE_STRING(var))
++#else
++#define EXPORT_SYMBOL(var)  __EXPORT_SYMBOL(var, __MODULE_STRING(__VERSIONED_SYMBOL(var)))
++#endif
++
++#define EXPORT_SYMBOL_NOVERS(var)  __EXPORT_SYMBOL(var, __MODULE_STRING(var))
++
++#endif
++
++EXPORT_SYMBOL(__errno_location);
++
++EXPORT_SYMBOL(access);
++EXPORT_SYMBOL(open);
++EXPORT_SYMBOL(open64);
++EXPORT_SYMBOL(close);
++EXPORT_SYMBOL(read);
++EXPORT_SYMBOL(write);
++EXPORT_SYMBOL(dup2);
++EXPORT_SYMBOL(__xstat);
++EXPORT_SYMBOL(__lxstat);
++EXPORT_SYMBOL(__lxstat64);
++EXPORT_SYMBOL(lseek);
++EXPORT_SYMBOL(lseek64);
++EXPORT_SYMBOL(chown);
++EXPORT_SYMBOL(truncate);
++EXPORT_SYMBOL(utime);
++EXPORT_SYMBOL(chmod);
++EXPORT_SYMBOL(rename);
++EXPORT_SYMBOL(__xmknod);
++
++EXPORT_SYMBOL(symlink);
++EXPORT_SYMBOL(link);
++EXPORT_SYMBOL(unlink);
++EXPORT_SYMBOL(readlink);
++
++EXPORT_SYMBOL(mkdir);
++EXPORT_SYMBOL(rmdir);
++EXPORT_SYMBOL(opendir);
++EXPORT_SYMBOL(readdir);
++EXPORT_SYMBOL(closedir);
++EXPORT_SYMBOL(seekdir);
++EXPORT_SYMBOL(telldir);
++
++EXPORT_SYMBOL(ioctl);
++
++extern ssize_t pread64 (int __fd, void *__buf, size_t __nbytes,
++                      __off64_t __offset);
++extern ssize_t pwrite64 (int __fd, __const void *__buf, size_t __n,
++                       __off64_t __offset);
++EXPORT_SYMBOL(pread64);
++EXPORT_SYMBOL(pwrite64);
++
++EXPORT_SYMBOL(statfs);
++EXPORT_SYMBOL(statfs64);
++
++EXPORT_SYMBOL(memcpy);
++EXPORT_SYMBOL(getuid);
++
++EXPORT_SYMBOL(memset);
++EXPORT_SYMBOL(strstr);
++
++EXPORT_SYMBOL(find_iomem);
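
Under the __CONFIG_MODULES__ branch of the macros above (with modversions disabled), each of these exports expands into a name string in .kstrtab plus a module_symbol entry in __ksymtab. Written out by hand for one symbol, EXPORT_SYMBOL(open) becomes:

const char __kstrtab_open[]
__attribute__((section(".kstrtab"))) = "open";
const struct module_symbol __ksymtab_open
__attribute__((section("__ksymtab"))) =
{ (unsigned long)&open, __kstrtab_open };
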
+diff -Naur -X ../exclude-files orig/arch/um/kernel/user_util.c um/arch/um/kernel/user_util.c
+--- orig/arch/um/kernel/user_util.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/user_util.c      Wed Apr 23 20:41:54 2003
+@@ -0,0 +1,164 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <fcntl.h>
++#include <unistd.h>
++#include <limits.h>
++#include <sys/mman.h> 
++#include <sys/stat.h>
++#include <sys/ptrace.h>
++#include <sys/utsname.h>
++#include <sys/param.h>
++#include <sys/time.h>
++#include "asm/types.h"
++#include <ctype.h>
++#include <signal.h>
++#include <wait.h>
++#include <errno.h>
++#include <stdarg.h>
++#include <sched.h>
++#include <termios.h>
++#include <string.h>
++#include "user_util.h"
++#include "kern_util.h"
++#include "user.h"
++#include "mem_user.h"
++#include "init.h"
++#include "helper.h"
++#include "uml-config.h"
++
++#define COMMAND_LINE_SIZE _POSIX_ARG_MAX
++
++/* Changed in linux_main and setup_arch, which run before SMP is started */
++char saved_command_line[COMMAND_LINE_SIZE] = { 0 };
++char command_line[COMMAND_LINE_SIZE] = { 0 };
++
++void add_arg(char *cmd_line, char *arg)
++{
++      if (strlen(cmd_line) + strlen(arg) + 1 > COMMAND_LINE_SIZE) {
++              printf("add_arg: Too much command line!\n");
++              exit(1);
++      }
++      if(strlen(cmd_line) > 0) strcat(cmd_line, " ");
++      strcat(cmd_line, arg);
++}
++
++void stop(void)
++{
++      while(1) sleep(1000000);
++}
++
++void stack_protections(unsigned long address)
++{
++      int prot = PROT_READ | PROT_WRITE | PROT_EXEC;
++
++        if(mprotect((void *) address, page_size(), prot) < 0)
++              panic("protecting stack failed, errno = %d", errno);
++}
++
++void task_protections(unsigned long address)
++{
++      unsigned long guard = address + page_size();
++      unsigned long stack = guard + page_size();
++      int prot = 0, pages;
++#ifdef notdef
++      if(mprotect((void *) guard, page_size(), prot) < 0)
++              panic("protecting guard page failed, errno = %d", errno);
++#endif
++      pages = (1 << UML_CONFIG_KERNEL_STACK_ORDER) - 2;
++      prot = PROT_READ | PROT_WRITE | PROT_EXEC;
++      if(mprotect((void *) stack, pages * page_size(), prot) < 0)
++              panic("protecting stack failed, errno = %d", errno);
++}
++
++int wait_for_stop(int pid, int sig, int cont_type, void *relay)
++{
++      sigset_t *relay_signals = relay;
++      int status, ret;
++
++      while(1){
++              if(((ret = waitpid(pid, &status, WUNTRACED)) < 0) ||
++                 !WIFSTOPPED(status) || (WSTOPSIG(status) != sig)){
++                      if(ret < 0){
++                              if(errno == EINTR) continue;
++                              printk("wait failed, errno = %d\n",
++                                     errno);
++                      }
++                      else if(WIFEXITED(status)) 
++                              printk("process exited with status %d\n", 
++                                     WEXITSTATUS(status));
++                      else if(WIFSIGNALED(status))
++                              printk("process exited with signal %d\n", 
++                                     WTERMSIG(status));
++                      else if((WSTOPSIG(status) == SIGVTALRM) ||
++                              (WSTOPSIG(status) == SIGALRM) ||
++                              (WSTOPSIG(status) == SIGIO) ||
++                              (WSTOPSIG(status) == SIGPROF) ||
++                              (WSTOPSIG(status) == SIGCHLD) ||
++                              (WSTOPSIG(status) == SIGWINCH) ||
++                              (WSTOPSIG(status) == SIGINT)){
++                              ptrace(cont_type, pid, 0, WSTOPSIG(status));
++                              continue;
++                      }
++                      else if((relay_signals != NULL) &&
++                              sigismember(relay_signals, WSTOPSIG(status))){
++                              ptrace(cont_type, pid, 0, WSTOPSIG(status));
++                              continue;
++                      }
++                      else printk("process stopped with signal %d\n", 
++                                  WSTOPSIG(status));
++                      panic("wait_for_stop failed to wait for %d to stop "
++                            "with %d\n", pid, sig);
++              }
++              return(status);
++      }
++}
++
++int raw(int fd, int complain)
++{
++      struct termios tt;
++      int err;
++
++      tcgetattr(fd, &tt);
++      cfmakeraw(&tt);
++      err = tcsetattr(fd, TCSANOW, &tt);
++      if((err < 0) && complain){
++              printk("tcsetattr failed, errno = %d\n", errno);
++              return(-errno);
++      }
++      return(0);
++}
++
++void setup_machinename(char *machine_out)
++{
++      struct utsname host;
++
++      uname(&host);
++      strcpy(machine_out, host.machine);
++}
++
++char host_info[(_UTSNAME_LENGTH + 1) * 4 + _UTSNAME_NODENAME_LENGTH + 1];
++
++void setup_hostinfo(void)
++{
++      struct utsname host;
++
++      uname(&host);
++      sprintf(host_info, "%s %s %s %s %s", host.sysname, host.nodename,
++              host.release, host.version, host.machine);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
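
wait_for_stop() above is the synchronisation primitive the tracing code needs after creating or signalling a traced child: it relays timer, I/O and other benign signals back into the child and only returns once the child stops with the expected signal. A hypothetical caller, not taken from UML itself, would look like:

#include <signal.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <unistd.h>

extern int wait_for_stop(int pid, int sig, int cont_type, void *relay);

/* Fork a child that arranges to be traced and stops itself, then wait for
 * that stop before driving it with ptrace().  Illustrative only. */
static int start_stopped_child(void)
{
	int pid = fork();

	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, 0, 0);
		kill(getpid(), SIGSTOP);	/* parent sees this stop */
		_exit(0);			/* reached only if the parent
						 * continues the child */
	}
	wait_for_stop(pid, SIGSTOP, PTRACE_CONT, NULL);
	return pid;
}
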
+diff -Naur -X ../exclude-files orig/arch/um/link.ld.in um/arch/um/link.ld.in
+--- orig/arch/um/link.ld.in    Wed Dec 31 19:00:00 1969
++++ um/arch/um/link.ld.in      Fri Jan 17 23:11:30 2003
+@@ -0,0 +1,95 @@
++OUTPUT_FORMAT("ELF_FORMAT")
++OUTPUT_ARCH(ELF_ARCH)
++ENTRY(_start)
++
++SECTIONS
++{
++  . = START() + SIZEOF_HEADERS;
++
++  . = ALIGN(4096);
++  __binary_start = .;
++ifdef(`MODE_TT', `
++  .thread_private : {
++    __start_thread_private = .;
++    errno = .;
++    . += 4;
++    arch/um/kernel/tt/unmap_fin.o (.data)
++    __end_thread_private = .;
++  }
++  . = ALIGN(4096);
++  .remap : { arch/um/kernel/tt/unmap_fin.o (.text) }
++')
++  . = ALIGN(4096);            /* Init code and data */
++  _stext = .;
++  __init_begin = .;
++  .text.init : { *(.text.init) }
++  . = ALIGN(4096);
++  .text      :
++  {
++    *(.text)
++    /* .gnu.warning sections are handled specially by elf32.em.  */
++    *(.gnu.warning)
++    *(.gnu.linkonce.t*)
++  }
++  .fini      : { *(.fini)    } =0x9090
++  .rodata    : { *(.rodata) *(.gnu.linkonce.r*) }
++  .rodata1   : { *(.rodata1) }
++  _etext = .;
++  PROVIDE (etext = .);
++
++  . = ALIGN(4096);
++  PROVIDE (_sdata = .);
++
++include(`arch/um/common.ld.in')
++
++  .data    :
++  {
++    . = ALIGN(KERNEL_STACK_SIZE);             /* init_task */
++    *(.data.init_task)
++    *(.data)
++    *(.gnu.linkonce.d*)
++    CONSTRUCTORS
++  }
++  .data1   : { *(.data1) }
++  .ctors         :
++  {
++    *(.ctors)
++  }
++  .dtors         :
++  {
++    *(.dtors)
++  }
++
++  .got           : { *(.got.plt) *(.got) }
++  .dynamic       : { *(.dynamic) }
++  /* We want the small data sections together, so single-instruction offsets
++     can access them all, and initialized data all before uninitialized, so
++     we can shorten the on-disk segment size.  */
++  .sdata     : { *(.sdata) }
++  _edata  =  .;
++  PROVIDE (edata = .);
++  . = ALIGN(0x1000);
++  .sbss      : 
++  {
++   __bss_start = .;
++   PROVIDE(_bss_start = .);
++   *(.sbss) 
++   *(.scommon) 
++  }
++  .bss       :
++  {
++   *(.dynbss)
++   *(.bss)
++   *(COMMON)
++  }
++  _end = . ;
++  PROVIDE (end = .);
++  /* Stabs debugging sections.  */
++  .stab 0 : { *(.stab) }
++  .stabstr 0 : { *(.stabstr) }
++  .stab.excl 0 : { *(.stab.excl) }
++  .stab.exclstr 0 : { *(.stab.exclstr) }
++  .stab.index 0 : { *(.stab.index) }
++  .stab.indexstr 0 : { *(.stab.indexstr) }
++  .comment 0 : { *(.comment) }
++}
+diff -Naur -X ../exclude-files orig/arch/um/main.c um/arch/um/main.c
+--- orig/arch/um/main.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/main.c  Fri Jan 17 13:22:40 2003
+@@ -0,0 +1,195 @@
++/* 
++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <unistd.h>
++#include <stdio.h> 
++#include <stdlib.h>
++#include <string.h>
++#include <signal.h>
++#include <sys/resource.h>
++#include <sys/mman.h>
++#include <sys/user.h>
++#include <asm/page.h>
++#include "user_util.h"
++#include "kern_util.h"
++#include "mem_user.h"
++#include "signal_user.h"
++#include "user.h"
++#include "init.h"
++#include "mode.h"
++#include "choose-mode.h"
++#include "uml-config.h"
++
++/* Set in set_stklim, which is called from main and __wrap_malloc.  
++ * __wrap_malloc only calls it if main hasn't started.
++ */
++unsigned long stacksizelim;
++
++/* Set in main */
++char *linux_prog;
++
++#define PGD_BOUND (4 * 1024 * 1024)
++#define STACKSIZE (8 * 1024 * 1024)
++#define THREAD_NAME_LEN (256)
++
++static void set_stklim(void)
++{
++      struct rlimit lim;
++
++      if(getrlimit(RLIMIT_STACK, &lim) < 0){
++              perror("getrlimit");
++              exit(1);
++      }
++      if((lim.rlim_cur == RLIM_INFINITY) || (lim.rlim_cur > STACKSIZE)){
++              lim.rlim_cur = STACKSIZE;
++              if(setrlimit(RLIMIT_STACK, &lim) < 0){
++                      perror("setrlimit");
++                      exit(1);
++              }
++      }
++      stacksizelim = (lim.rlim_cur + PGD_BOUND - 1) & ~(PGD_BOUND - 1);
++}
++
++static __init void do_uml_initcalls(void)
++{
++      initcall_t *call;
++
++      call = &__uml_initcall_start;
++	while (call < &__uml_initcall_end){
++              (*call)();
++              call++;
++      }
++}
++
++static void last_ditch_exit(int sig)
++{
++      CHOOSE_MODE(kmalloc_ok = 0, (void) 0);
++      signal(SIGINT, SIG_DFL);
++      signal(SIGTERM, SIG_DFL);
++      signal(SIGHUP, SIG_DFL);
++      uml_cleanup();
++      exit(1);
++}
++
++extern int uml_exitcode;
++
++int main(int argc, char **argv, char **envp)
++{
++      char **new_argv;
++      sigset_t mask;
++      int ret, i;
++
++      /* Enable all signals except SIGIO - in some environments, we can 
++       * enter with some signals blocked
++       */
++
++      sigemptyset(&mask);
++      sigaddset(&mask, SIGIO);
++      if(sigprocmask(SIG_SETMASK, &mask, NULL) < 0){
++              perror("sigprocmask");
++              exit(1);
++      }
++
++#ifdef UML_CONFIG_MODE_TT
++      /* Allocate memory for thread command lines */
++      if(argc < 2 || strlen(argv[1]) < THREAD_NAME_LEN - 1){
++
++              char padding[THREAD_NAME_LEN] = { 
++                      [ 0 ...  THREAD_NAME_LEN - 2] = ' ', '\0' 
++              };
++
++              new_argv = malloc((argc + 2) * sizeof(char*));
++              if(!new_argv) {
++                      perror("Allocating extended argv");
++                      exit(1);
++              }       
++              
++              new_argv[0] = argv[0];
++              new_argv[1] = padding;
++              
++              for(i = 2; i <= argc; i++)
++                      new_argv[i] = argv[i - 1];
++              new_argv[argc + 1] = NULL;
++              
++              execvp(new_argv[0], new_argv);
++              perror("execing with extended args");
++              exit(1);
++      }       
++#endif
++
++      linux_prog = argv[0];
++
++      set_stklim();
++
++      if((new_argv = malloc((argc + 1) * sizeof(char *))) == NULL){
++              perror("Mallocing argv");
++              exit(1);
++      }
++      for(i=0;i<argc;i++){
++              if((new_argv[i] = strdup(argv[i])) == NULL){
++                      perror("Mallocing an arg");
++                      exit(1);
++              }
++      }
++      new_argv[argc] = NULL;
++
++      set_handler(SIGINT, last_ditch_exit, SA_ONESHOT | SA_NODEFER, -1);
++      set_handler(SIGTERM, last_ditch_exit, SA_ONESHOT | SA_NODEFER, -1);
++      set_handler(SIGHUP, last_ditch_exit, SA_ONESHOT | SA_NODEFER, -1);
++
++      do_uml_initcalls();
++      ret = linux_main(argc, argv);
++      
++      /* Reboot */
++      if(ret){
++              printf("\n");
++              execvp(new_argv[0], new_argv);
++              perror("Failed to exec kernel");
++              ret = 1;
++      }
++      printf("\n");
++      return(uml_exitcode);
++}
++
++#define CAN_KMALLOC() \
++      (kmalloc_ok && CHOOSE_MODE((getpid() != tracing_pid), 1))
++
++extern void *__real_malloc(int);
++
++void *__wrap_malloc(int size)
++{
++      if(CAN_KMALLOC())
++              return(um_kmalloc(size));
++      else
++              return(__real_malloc(size));
++}
++
++void *__wrap_calloc(int n, int size)
++{
++      void *ptr = __wrap_malloc(n * size);
++
++      if(ptr == NULL) return(NULL);
++      memset(ptr, 0, n * size);
++      return(ptr);
++}
++
++extern void __real_free(void *);
++
++void __wrap_free(void *ptr)
++{
++      if(CAN_KMALLOC()) kfree(ptr);
++      else __real_free(ptr);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
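
__wrap_malloc(), __wrap_calloc(), __wrap_free() and the __real_* declarations above depend on GNU ld's symbol wrapping: when the binary is linked with --wrap for each of those symbols, every reference to malloc() from the linked objects is redirected to __wrap_malloc(), and __real_malloc() resolves to the original libc function. This patch does not show where UML's build adds those flags; a standalone demonstration of the mechanism, unrelated to UML, is:

/* Build with:  cc -o wrapdemo wrapdemo.c -Wl,--wrap,malloc
 * Every malloc() call made by this object then enters __wrap_malloc(). */
#include <stdlib.h>
#include <unistd.h>

void *__real_malloc(size_t size);	/* bound by ld to libc's malloc */

void *__wrap_malloc(size_t size)
{
	write(2, "malloc\n", 7);	/* stdio avoided inside the wrapper */
	return __real_malloc(size);
}

int main(void)
{
	free(malloc(16));		/* prints "malloc" once on stderr */
	return 0;
}
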
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/Makefile um/arch/um/os-Linux/Makefile
+--- orig/arch/um/os-Linux/Makefile     Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/Makefile       Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,17 @@
++# 
++# Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++O_TARGET = built-in.o
++
++obj-y = file.o process.o tty.o
++
++include $(TOPDIR)/Rules.make
++
++$(obj-y) : %.o: %.c
++      $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $<
++
++clean :
++
++archmrproper:
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/drivers/Makefile um/arch/um/os-Linux/drivers/Makefile
+--- orig/arch/um/os-Linux/drivers/Makefile     Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/drivers/Makefile       Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,31 @@
++# 
++# Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++O_TARGET := drivers.o
++
++list-multi := tuntap.o ethertap.o
++
++ethertap-objs := ethertap_kern.o ethertap_user.o
++tuntap-objs := tuntap_kern.o tuntap_user.o
++
++obj-y = 
++obj-$(CONFIG_UML_NET_ETHERTAP) += ethertap.o
++obj-$(CONFIG_UML_NET_TUNTAP) += tuntap.o
++
++USER_SINGLE_OBJS = $(foreach f,$(patsubst %.o,%,$(obj-y)),$($(f)-objs))
++
++USER_OBJS = $(filter %_user.o,$(obj-y) $(USER_SINGLE_OBJS))
++
++include $(TOPDIR)/Rules.make
++
++$(USER_OBJS) : %.o: %.c
++      $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $<
++
++ethertap.o : $(ethertap-objs)
++
++tuntap.o : $(tuntap-objs)
++
++$(list-multi) : # This doesn't work, but should : '%.o : $(%-objs)'
++      $(LD) $(LD_RFLAG) -r -o $@ $($(patsubst %.o,%,$@)-objs)
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/drivers/etap.h um/arch/um/os-Linux/drivers/etap.h
+--- orig/arch/um/os-Linux/drivers/etap.h       Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/drivers/etap.h Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,27 @@
++/* 
++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "net_user.h"
++
++struct ethertap_data {
++      char *dev_name;
++      char *gate_addr;
++      int data_fd;
++      int control_fd;
++      void *dev;
++};
++
++extern struct net_user_info ethertap_user_info;
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/drivers/ethertap_kern.c um/arch/um/os-Linux/drivers/ethertap_kern.c
+--- orig/arch/um/os-Linux/drivers/ethertap_kern.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/drivers/ethertap_kern.c        Sun Dec 15 21:17:37 2002
+@@ -0,0 +1,122 @@
++/*
++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and 
++ * James Leu (jleu@mindspring.net).
++ * Copyright (C) 2001 by various other people who didn't put their name here.
++ * Licensed under the GPL.
++ */
++
++#include "linux/init.h"
++#include "linux/netdevice.h"
++#include "linux/etherdevice.h"
++#include "linux/init.h"
++#include "net_kern.h"
++#include "net_user.h"
++#include "etap.h"
++
++struct ethertap_init {
++      char *dev_name;
++      char *gate_addr;
++};
++
++static void etap_init(struct net_device *dev, void *data)
++{
++      struct uml_net_private *pri;
++      struct ethertap_data *epri;
++      struct ethertap_init *init = data;
++
++      init_etherdev(dev, 0);
++      pri = dev->priv;
++      epri = (struct ethertap_data *) pri->user;
++      *epri = ((struct ethertap_data)
++              { .dev_name             = init->dev_name,
++                .gate_addr            = init->gate_addr,
++                .data_fd              = -1,
++                .control_fd           = -1,
++                .dev                  = dev });
++
++      printk("ethertap backend - %s", epri->dev_name);
++      if(epri->gate_addr != NULL) 
++              printk(", IP = %s", epri->gate_addr);
++      printk("\n");
++}
++
++static int etap_read(int fd, struct sk_buff **skb, struct uml_net_private *lp)
++{
++      int len;
++
++      *skb = ether_adjust_skb(*skb, ETH_HEADER_ETHERTAP);
++      if(*skb == NULL) return(-ENOMEM);
++      len = net_recvfrom(fd, (*skb)->mac.raw, 
++                         (*skb)->dev->mtu + 2 * ETH_HEADER_ETHERTAP);
++      if(len <= 0) return(len);
++      skb_pull(*skb, 2);
++      len -= 2;
++      return(len);
++}
++
++static int etap_write(int fd, struct sk_buff **skb, struct uml_net_private *lp)
++{
++      if(skb_headroom(*skb) < 2){
++              struct sk_buff *skb2;
++
++              skb2 = skb_realloc_headroom(*skb, 2);
++              dev_kfree_skb(*skb);
++              if (skb2 == NULL) return(-ENOMEM);
++              *skb = skb2;
++      }
++      skb_push(*skb, 2);
++      return(net_send(fd, (*skb)->data, (*skb)->len));
++}
++
++struct net_kern_info ethertap_kern_info = {
++      .init                   = etap_init,
++      .protocol               = eth_protocol,
++      .read                   = etap_read,
++      .write                  = etap_write,
++};
++
++int ethertap_setup(char *str, char **mac_out, void *data)
++{
++      struct ethertap_init *init = data;
++
++      *init = ((struct ethertap_init)
++              { .dev_name     = NULL,
++                .gate_addr    = NULL });
++      if(tap_setup_common(str, "ethertap", &init->dev_name, mac_out,
++                          &init->gate_addr))
++              return(0);
++      if(init->dev_name == NULL){
++              printk("ethertap_setup : Missing tap device name\n");
++              return(0);
++      }
++
++      return(1);
++}
++
++static struct transport ethertap_transport = {
++      .list           = LIST_HEAD_INIT(ethertap_transport.list),
++      .name           = "ethertap",
++      .setup          = ethertap_setup,
++      .user           = &ethertap_user_info,
++      .kern           = &ethertap_kern_info,
++      .private_size   = sizeof(struct ethertap_data),
++};
++
++static int register_ethertap(void)
++{
++      register_transport(&ethertap_transport);
++      return(1);
++}
++
++__initcall(register_ethertap);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/drivers/ethertap_user.c um/arch/um/os-Linux/drivers/ethertap_user.c
+--- orig/arch/um/os-Linux/drivers/ethertap_user.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/drivers/ethertap_user.c        Sun Dec 15 21:17:52 2002
+@@ -0,0 +1,238 @@
++/*
++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and 
++ * James Leu (jleu@mindspring.net).
++ * Copyright (C) 2001 by various other people who didn't put their name here.
++ * Licensed under the GPL.
++ */
++
++#include <stdio.h>
++#include <unistd.h>
++#include <stddef.h>
++#include <fcntl.h>
++#include <stdlib.h>
++#include <sys/errno.h>
++#include <sys/socket.h>
++#include <sys/wait.h>
++#include <sys/un.h>
++#include <net/if.h>
++#include "user.h"
++#include "kern_util.h"
++#include "net_user.h"
++#include "etap.h"
++#include "helper.h"
++#include "os.h"
++
++#define MAX_PACKET ETH_MAX_PACKET
++
++void etap_user_init(void *data, void *dev)
++{
++      struct ethertap_data *pri = data;
++
++      pri->dev = dev;
++}
++
++struct addr_change {
++      enum { ADD_ADDR, DEL_ADDR } what;
++      unsigned char addr[4];
++      unsigned char netmask[4];
++};
++
++static void etap_change(int op, unsigned char *addr, unsigned char *netmask,
++                      int fd)
++{
++      struct addr_change change;
++      void *output;
++
++      change.what = op;
++      memcpy(change.addr, addr, sizeof(change.addr));
++      memcpy(change.netmask, netmask, sizeof(change.netmask));
++      if(write(fd, &change, sizeof(change)) != sizeof(change))
++              printk("etap_change - request failed, errno = %d\n",
++                     errno);
++      output = um_kmalloc(page_size());
++      if(output == NULL)
++              printk("etap_change : Failed to allocate output buffer\n");
++      read_output(fd, output, page_size());
++      if(output != NULL){
++              printk("%s", output);
++              kfree(output);
++      }
++}
++
++static void etap_open_addr(unsigned char *addr, unsigned char *netmask,
++                         void *arg)
++{
++      etap_change(ADD_ADDR, addr, netmask, *((int *) arg));
++}
++
++static void etap_close_addr(unsigned char *addr, unsigned char *netmask,
++                          void *arg)
++{
++      etap_change(DEL_ADDR, addr, netmask, *((int *) arg));
++}
++
++struct etap_pre_exec_data {
++      int control_remote;
++      int control_me;
++      int data_me;
++};
++
++static void etap_pre_exec(void *arg)
++{
++      struct etap_pre_exec_data *data = arg;
++
++      dup2(data->control_remote, 1);
++      close(data->data_me);
++      close(data->control_me);
++}
++
++static int etap_tramp(char *dev, char *gate, int control_me, 
++                    int control_remote, int data_me, int data_remote)
++{
++      struct etap_pre_exec_data pe_data;
++      int pid, status, err;
++      char version_buf[sizeof("nnnnn\0")];
++      char data_fd_buf[sizeof("nnnnnn\0")];
++      char gate_buf[sizeof("nnn.nnn.nnn.nnn\0")];
++      char *setup_args[] = { "uml_net", version_buf, "ethertap", dev,
++                             data_fd_buf, gate_buf, NULL };
++      char *nosetup_args[] = { "uml_net", version_buf, "ethertap", 
++                               dev, data_fd_buf, NULL };
++      char **args, c;
++
++      sprintf(data_fd_buf, "%d", data_remote);
++      sprintf(version_buf, "%d", UML_NET_VERSION);
++      if(gate != NULL){
++              strcpy(gate_buf, gate);
++              args = setup_args;
++      }
++      else args = nosetup_args;
++
++      err = 0;
++      pe_data.control_remote = control_remote;
++      pe_data.control_me = control_me;
++      pe_data.data_me = data_me;
++      pid = run_helper(etap_pre_exec, &pe_data, args, NULL);
++
++      if(pid < 0) err = errno;
++      close(data_remote);
++      close(control_remote);
++      if(read(control_me, &c, sizeof(c)) != sizeof(c)){
++              printk("etap_tramp : read of status failed, errno = %d\n",
++                     errno);
++              return(EINVAL);
++      }
++      if(c != 1){
++              printk("etap_tramp : uml_net failed\n");
++              err = EINVAL;
++              if(waitpid(pid, &status, 0) < 0) err = errno;
++              else if(!WIFEXITED(status) || (WEXITSTATUS(status) != 1)){
++                      printk("uml_net didn't exit with status 1\n");
++              }
++      }
++      return(err);
++}
++
++static int etap_open(void *data)
++{
++      struct ethertap_data *pri = data;
++      char *output;
++      int data_fds[2], control_fds[2], err, output_len;
++
++      err = tap_open_common(pri->dev, pri->gate_addr);
++      if(err) return(err);
++
++      err = os_pipe(data_fds, 0, 0);
++      if(err){
++              printk("data os_pipe failed - errno = %d\n", -err);
++              return(err);
++      }
++
++      err = os_pipe(control_fds, 1, 0);
++      if(err){
++              printk("control os_pipe failed - errno = %d\n", -err);
++              return(err);
++      }
++      
++      err = etap_tramp(pri->dev_name, pri->gate_addr, control_fds[0], 
++                       control_fds[1], data_fds[0], data_fds[1]);
++      output_len = page_size();
++      output = um_kmalloc(output_len);
++      read_output(control_fds[0], output, output_len);
++
++      if(output == NULL)
++              printk("etap_open : failed to allocate output buffer\n");
++      else {
++              printk("%s", output);
++              kfree(output);
++      }
++
++      if(err != 0){
++              printk("etap_tramp failed - errno = %d\n", err);
++              return(-err);
++      }
++
++      pri->data_fd = data_fds[0];
++      pri->control_fd = control_fds[0];
++      iter_addresses(pri->dev, etap_open_addr, &pri->control_fd);
++      return(data_fds[0]);
++}
++
++static void etap_close(int fd, void *data)
++{
++      struct ethertap_data *pri = data;
++
++      iter_addresses(pri->dev, etap_close_addr, &pri->control_fd);
++      close(fd);
++      os_shutdown_socket(pri->data_fd, 1, 1);
++      close(pri->data_fd);
++      pri->data_fd = -1;
++      close(pri->control_fd);
++      pri->control_fd = -1;
++}
++
++static int etap_set_mtu(int mtu, void *data)
++{
++      return(mtu);
++}
++
++static void etap_add_addr(unsigned char *addr, unsigned char *netmask,
++                        void *data)
++{
++      struct ethertap_data *pri = data;
++
++      tap_check_ips(pri->gate_addr, addr);
++      if(pri->control_fd == -1) return;
++      etap_open_addr(addr, netmask, &pri->control_fd);
++}
++
++static void etap_del_addr(unsigned char *addr, unsigned char *netmask, 
++                        void *data)
++{
++      struct ethertap_data *pri = data;
++
++      if(pri->control_fd == -1) return;
++      etap_close_addr(addr, netmask, &pri->control_fd);
++}
++
++struct net_user_info ethertap_user_info = {
++      .init           = etap_user_init,
++      .open           = etap_open,
++      .close          = etap_close,
++      .remove         = NULL,
++      .set_mtu        = etap_set_mtu,
++      .add_address    = etap_add_addr,
++      .delete_address = etap_del_addr,
++      .max_packet     = MAX_PACKET - ETH_HEADER_ETHERTAP
++};
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
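
etap_tramp() above treats the uml_net helper as having succeeded only when it reads a single byte with value 1 from the control pipe that etap_pre_exec() dup2()ed onto the helper's stdout. The helper's half of that handshake therefore reduces to something like the following; this illustrates the protocol implied here, not the actual uml_net source:

#include <unistd.h>

/* After bringing the ethertap device up, report success to the UML
 * process, which is blocked reading this status byte. */
static void report_success_to_uml(void)
{
	char ok = 1;

	if (write(1, &ok, sizeof(ok)) != sizeof(ok))
		_exit(1);	/* UML then treats the setup as failed */
}
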
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/drivers/tuntap.h um/arch/um/os-Linux/drivers/tuntap.h
+--- orig/arch/um/os-Linux/drivers/tuntap.h     Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/drivers/tuntap.h       Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,32 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_TUNTAP_H
++#define __UM_TUNTAP_H
++
++#include "net_user.h"
++
++struct tuntap_data {
++      char *dev_name;
++      int fixed_config;
++      char *gate_addr;
++      int fd;
++      void *dev;
++};
++
++extern struct net_user_info tuntap_user_info;
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/drivers/tuntap_kern.c um/arch/um/os-Linux/drivers/tuntap_kern.c
+--- orig/arch/um/os-Linux/drivers/tuntap_kern.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/drivers/tuntap_kern.c  Sun Dec 15 21:18:16 2002
+@@ -0,0 +1,105 @@
++/* 
++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/stddef.h"
++#include "linux/netdevice.h"
++#include "linux/etherdevice.h"
++#include "linux/skbuff.h"
++#include "linux/init.h"
++#include "asm/errno.h"
++#include "net_kern.h"
++#include "net_user.h"
++#include "tuntap.h"
++
++struct tuntap_init {
++      char *dev_name;
++      char *gate_addr;
++};
++
++static void tuntap_init(struct net_device *dev, void *data)
++{
++      struct uml_net_private *pri;
++      struct tuntap_data *tpri;
++      struct tuntap_init *init = data;
++
++      init_etherdev(dev, 0);
++      pri = dev->priv;
++      tpri = (struct tuntap_data *) pri->user;
++      *tpri = ((struct tuntap_data)
++              { .dev_name             = init->dev_name,
++                .fixed_config         = (init->dev_name != NULL),
++                .gate_addr            = init->gate_addr,
++                .fd                   = -1,
++                .dev                  = dev });
++      printk("TUN/TAP backend - ");
++      if(tpri->gate_addr != NULL) 
++              printk("IP = %s", tpri->gate_addr);
++      printk("\n");
++}
++
++static int tuntap_read(int fd, struct sk_buff **skb, 
++                     struct uml_net_private *lp)
++{
++      *skb = ether_adjust_skb(*skb, ETH_HEADER_OTHER);
++      if(*skb == NULL) return(-ENOMEM);
++      return(net_read(fd, (*skb)->mac.raw, 
++                      (*skb)->dev->mtu + ETH_HEADER_OTHER));
++}
++
++static int tuntap_write(int fd, struct sk_buff **skb, 
++                      struct uml_net_private *lp)
++{
++      return(net_write(fd, (*skb)->data, (*skb)->len));
++}
++
++struct net_kern_info tuntap_kern_info = {
++      .init                   = tuntap_init,
++      .protocol               = eth_protocol,
++      .read                   = tuntap_read,
++      .write                  = tuntap_write,
++};
++
++int tuntap_setup(char *str, char **mac_out, void *data)
++{
++      struct tuntap_init *init = data;
++
++      *init = ((struct tuntap_init)
++              { .dev_name     = NULL,
++                .gate_addr    = NULL });
++      if(tap_setup_common(str, "tuntap", &init->dev_name, mac_out,
++                          &init->gate_addr))
++              return(0);
++
++      return(1);
++}
++
++static struct transport tuntap_transport = {
++      .list           = LIST_HEAD_INIT(tuntap_transport.list),
++      .name           = "tuntap",
++      .setup          = tuntap_setup,
++      .user           = &tuntap_user_info,
++      .kern           = &tuntap_kern_info,
++      .private_size   = sizeof(struct tuntap_data),
++      .setup_size     = sizeof(struct tuntap_init),
++};
++
++static int register_tuntap(void)
++{
++      register_transport(&tuntap_transport);
++      return(1);
++}
++
++__initcall(register_tuntap);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/drivers/tuntap_user.c um/arch/um/os-Linux/drivers/tuntap_user.c
+--- orig/arch/um/os-Linux/drivers/tuntap_user.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/drivers/tuntap_user.c  Sun Dec 15 21:18:25 2002
+@@ -0,0 +1,223 @@
++/* 
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <stddef.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <errno.h>
++#include <fcntl.h>
++#include <sys/wait.h>
++#include <sys/socket.h>
++#include <sys/un.h>
++#include <sys/uio.h>
++#include <sys/ioctl.h>
++#include <net/if.h>
++#include <linux/if_tun.h>
++#include "net_user.h"
++#include "tuntap.h"
++#include "kern_util.h"
++#include "user.h"
++#include "helper.h"
++#include "os.h"
++
++#define MAX_PACKET ETH_MAX_PACKET
++
++void tuntap_user_init(void *data, void *dev)
++{
++      struct tuntap_data *pri = data;
++
++      pri->dev = dev;
++}
++
++static void tuntap_add_addr(unsigned char *addr, unsigned char *netmask,
++                          void *data)
++{
++      struct tuntap_data *pri = data;
++
++      tap_check_ips(pri->gate_addr, addr);
++      if((pri->fd == -1) || pri->fixed_config) return;
++      open_addr(addr, netmask, pri->dev_name);
++}
++
++static void tuntap_del_addr(unsigned char *addr, unsigned char *netmask,
++                          void *data)
++{
++      struct tuntap_data *pri = data;
++
++      if((pri->fd == -1) || pri->fixed_config) return;
++      close_addr(addr, netmask, pri->dev_name);
++}
++
++struct tuntap_pre_exec_data {
++      int stdout;
++      int close_me;
++};
++
++static void tuntap_pre_exec(void *arg)
++{
++      struct tuntap_pre_exec_data *data = arg;
++      
++      dup2(data->stdout, 1);
++      close(data->close_me);
++}
++
++static int tuntap_open_tramp(char *gate, int *fd_out, int me, int remote,
++                           char *buffer, int buffer_len, int *used_out)
++{
++      struct tuntap_pre_exec_data data;
++      char version_buf[sizeof("nnnnn\0")];
++      char *argv[] = { "uml_net", version_buf, "tuntap", "up", gate,
++                       NULL };
++      char buf[CMSG_SPACE(sizeof(*fd_out))];
++      struct msghdr msg;
++      struct cmsghdr *cmsg;
++      struct iovec iov;
++      int pid, n;
++
++      sprintf(version_buf, "%d", UML_NET_VERSION);
++
++      data.stdout = remote;
++      data.close_me = me;
++
++      pid = run_helper(tuntap_pre_exec, &data, argv, NULL);
++
++      if(pid < 0) return(-pid);
++
++      close(remote);
++
++      msg.msg_name = NULL;
++      msg.msg_namelen = 0;
++      if(buffer != NULL){
++              iov = ((struct iovec) { buffer, buffer_len });
++              msg.msg_iov = &iov;
++              msg.msg_iovlen = 1;
++      }
++      else {
++              msg.msg_iov = NULL;
++              msg.msg_iovlen = 0;
++      }
++      msg.msg_control = buf;
++      msg.msg_controllen = sizeof(buf);
++      msg.msg_flags = 0;
++      n = recvmsg(me, &msg, 0);
++      *used_out = n;
++      if(n < 0){
++              printk("tuntap_open_tramp : recvmsg failed - errno = %d\n", 
++                     errno);
++              return(errno);
++      }
++      waitpid(pid, NULL, 0);
++
++      cmsg = CMSG_FIRSTHDR(&msg);
++      if(cmsg == NULL){
++              printk("tuntap_open_tramp : didn't receive a message\n");
++              return(EINVAL);
++      }
++      if((cmsg->cmsg_level != SOL_SOCKET) || 
++         (cmsg->cmsg_type != SCM_RIGHTS)){
++              printk("tuntap_open_tramp : didn't receive a descriptor\n");
++              return(EINVAL);
++      }
++      *fd_out = ((int *) CMSG_DATA(cmsg))[0];
++      return(0);
++}
++
++static int tuntap_open(void *data)
++{
++      struct ifreq ifr;
++      struct tuntap_data *pri = data;
++      char *output, *buffer;
++      int err, fds[2], len, used;
++
++      err = tap_open_common(pri->dev, pri->gate_addr);
++      if(err) return(err);
++
++      if(pri->fixed_config){
++              if((pri->fd = open("/dev/net/tun", O_RDWR)) < 0){
++                      printk("Failed to open /dev/net/tun, errno = %d\n",
++                             errno);
++                      return(-errno);
++              }
++              memset(&ifr, 0, sizeof(ifr));
++              ifr.ifr_flags = IFF_TAP;
++              strncpy(ifr.ifr_name, pri->dev_name, sizeof(ifr.ifr_name) - 1);
++              if(ioctl(pri->fd, TUNSETIFF, (void *) &ifr) < 0){
++                      printk("TUNSETIFF failed, errno = %d\n", errno);
++                      close(pri->fd);
++                      return(-errno);
++              }
++      }
++      else {
++              err = os_pipe(fds, 0, 0);
++              if(err){
++                      printk("tuntap_open : os_pipe failed - errno = %d\n",
++                             -err);
++                      return(err);
++              }
++
++              buffer = get_output_buffer(&len);
++              if(buffer != NULL) len--;
++              used = 0;
++
++              err = tuntap_open_tramp(pri->gate_addr, &pri->fd, fds[0],
++                                      fds[1], buffer, len, &used);
++
++              output = buffer;
++              if(err == 0){
++                      pri->dev_name = uml_strdup(buffer);
++                      output += IFNAMSIZ;
++                      printk("%s", output);
++                      free_output_buffer(buffer);
++              }
++              else {
++                      printk("%s", output);
++                      free_output_buffer(buffer);
++                      printk("tuntap_open_tramp failed - errno = %d\n", err);
++                      return(-err);
++              }
++              close(fds[0]);
++              iter_addresses(pri->dev, open_addr, pri->dev_name);
++      }
++
++      return(pri->fd);
++}
++
++static void tuntap_close(int fd, void *data)
++{
++      struct tuntap_data *pri = data;
++
++      if(!pri->fixed_config) 
++              iter_addresses(pri->dev, close_addr, pri->dev_name);
++      close(fd);
++      pri->fd = -1;
++}
++
++static int tuntap_set_mtu(int mtu, void *data)
++{
++      return(mtu);
++}
++
++struct net_user_info tuntap_user_info = {
++      .init           = tuntap_user_init,
++      .open           = tuntap_open,
++      .close          = tuntap_close,
++      .remove         = NULL,
++      .set_mtu        = tuntap_set_mtu,
++      .add_address    = tuntap_add_addr,
++      .delete_address = tuntap_del_addr,
++      .max_packet     = MAX_PACKET
++};
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/file.c um/arch/um/os-Linux/file.c
+--- orig/arch/um/os-Linux/file.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/file.c Tue Feb  4 19:32:10 2003
+@@ -0,0 +1,384 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <unistd.h>
++#include <errno.h>
++#include <fcntl.h>
++#include <signal.h>
++#include <sys/socket.h>
++#include <sys/un.h>
++#include <sys/ioctl.h>
++#include <sys/mount.h>
++#include <sys/uio.h>
++#include "os.h"
++#include "user.h"
++#include "kern_util.h"
++
++int os_file_type(char *file)
++{
++      struct stat64 buf;
++
++      if(stat64(file, &buf) == -1)
++              return(-errno);
++
++      if(S_ISDIR(buf.st_mode)) return(OS_TYPE_DIR);
++      else if(S_ISLNK(buf.st_mode)) return(OS_TYPE_SYMLINK);
++      else if(S_ISCHR(buf.st_mode)) return(OS_TYPE_CHARDEV);
++      else if(S_ISBLK(buf.st_mode)) return(OS_TYPE_BLOCKDEV);
++      else if(S_ISFIFO(buf.st_mode)) return(OS_TYPE_FIFO);
++      else if(S_ISSOCK(buf.st_mode)) return(OS_TYPE_SOCK);
++      else return(OS_TYPE_FILE);
++}
++
++int os_file_mode(char *file, struct openflags *mode_out)
++{
++      *mode_out = OPENFLAGS();
++
++      if(!access(file, W_OK)) *mode_out = of_write(*mode_out);
++      else if(errno != EACCES) 
++              return(-errno);
++
++      if(!access(file, R_OK)) *mode_out = of_read(*mode_out);
++      else if(errno != EACCES) 
++              return(-errno);
++
++      return(0);
++}
++
++int os_open_file(char *file, struct openflags flags, int mode)
++{
++      int fd, f = 0;
++
++      if(flags.r && flags.w) f = O_RDWR;
++      else if(flags.r) f = O_RDONLY;
++      else if(flags.w) f = O_WRONLY;
++      else f = 0;
++
++      if(flags.s) f |= O_SYNC;
++      if(flags.c) f |= O_CREAT;
++      if(flags.t) f |= O_TRUNC;
++      if(flags.e) f |= O_EXCL;
++
++      fd = open64(file, f, mode);
++      if(fd < 0) return(-errno);
++
++      if(flags.cl){
++              if(fcntl(fd, F_SETFD, 1)){
++                      close(fd);
++                      return(-errno);
++              }
++      }
++
++      return(fd);
++}
++
++int os_connect_socket(char *name)
++{
++      struct sockaddr_un sock;
++      int fd, err;
++
++      sock.sun_family = AF_UNIX;
++      snprintf(sock.sun_path, sizeof(sock.sun_path), "%s", name);
++
++      fd = socket(AF_UNIX, SOCK_STREAM, 0);
++      if(fd < 0)
++              return(fd);
++
++      err = connect(fd, (struct sockaddr *) &sock, sizeof(sock));
++      if(err)
++              return(err);
++
++      return(fd);
++}
++
++void os_close_file(int fd)
++{
++      close(fd);
++}
++
++int os_seek_file(int fd, __u64 offset)
++{
++      __u64 actual;
++
++      actual = lseek64(fd, offset, SEEK_SET);
++      if(actual != offset) return(-errno);
++      return(0);
++}
++
++int os_read_file(int fd, void *buf, int len)
++{
++      int n;
++
++      /* Force buf into memory if it's not already. */
++
++      /* XXX This fails if buf is kernel memory */
++#ifdef notdef
++      if(copy_to_user_proc(buf, &c, sizeof(c)))
++              return(-EFAULT);
++#endif
++
++      n = read(fd, buf, len);
++      if(n < 0)
++              return(-errno);
++      return(n);
++}
++
++int os_write_file(int fd, void *buf, int count)
++{
++      int n;
++
++      /* Force buf into memory if it's not already. */
++      
++      /* XXX This fails if buf is kernel memory */
++#ifdef notdef
++      if(copy_to_user_proc(buf, buf, buf[0]))
++              return(-EFAULT);
++#endif
++
++      n = write(fd, buf, count);
++      if(n < 0)
++              return(-errno);
++      return(n);
++}
++
++int os_file_size(char *file, long long *size_out)
++{
++      struct stat64 buf;
++
++      if(stat64(file, &buf) == -1){
++              printk("Couldn't stat \"%s\" : errno = %d\n", file, errno);
++              return(-errno);
++      }
++      if(S_ISBLK(buf.st_mode)){
++              int fd, blocks;
++
++              if((fd = open64(file, O_RDONLY)) < 0){
++                      printk("Couldn't open \"%s\", errno = %d\n", file,
++                             errno);
++                      return(-errno);
++              }
++              if(ioctl(fd, BLKGETSIZE, &blocks) < 0){
++                      printk("Couldn't get the block size of \"%s\", "
++                             "errno = %d\n", file, errno);
++                      close(fd);
++                      return(-errno);
++              }
++              *size_out = ((long long) blocks) * 512;
++              close(fd);
++              return(0);
++      }
++      *size_out = buf.st_size;
++      return(0);
++}
++
++int os_pipe(int *fds, int stream, int close_on_exec)
++{
++      int err, type = stream ? SOCK_STREAM : SOCK_DGRAM;
++
++      err = socketpair(AF_UNIX, type, 0, fds);
++      if(err) 
++              return(-errno);
++
++      if(!close_on_exec)
++              return(0);
++
++      if((fcntl(fds[0], F_SETFD, 1) < 0) || (fcntl(fds[1], F_SETFD, 1) < 0))
++              printk("os_pipe : Setting FD_CLOEXEC failed, errno = %d\n", 
++                     errno);
++
++      return(0);
++}
++
++int os_set_fd_async(int fd, int owner)
++{
++      /* XXX This should do F_GETFL first */
++      if(fcntl(fd, F_SETFL, O_ASYNC | O_NONBLOCK) < 0){
++              printk("os_set_fd_async : failed to set O_ASYNC and "
++                     "O_NONBLOCK on fd # %d, errno = %d\n", fd, errno);
++              return(-errno);
++      }
++#ifdef notdef
++      if(fcntl(fd, F_SETFD, 1) < 0){
++              printk("os_set_fd_async : Setting FD_CLOEXEC failed, "
++                     "errno = %d\n", errno);
++      }
++#endif
++
++      if((fcntl(fd, F_SETSIG, SIGIO) < 0) ||
++         (fcntl(fd, F_SETOWN, owner) < 0)){
++              printk("os_set_fd_async : Failed to fcntl F_SETOWN "
++                     "(or F_SETSIG) fd %d to pid %d, errno = %d\n", fd, 
++                     owner, errno);
++              return(-errno);
++      }
++
++      return(0);
++}
++
++int os_set_fd_block(int fd, int blocking)
++{
++      int flags;
++
++      flags = fcntl(fd, F_GETFL);
++
++      if(blocking) flags &= ~O_NONBLOCK;
++      else flags |= O_NONBLOCK;
++
++      if(fcntl(fd, F_SETFL, flags) < 0){
++              printk("Failed to change blocking on fd # %d, errno = %d\n",
++                     fd, errno);
++              return(-errno);
++      }
++      return(0);
++}
++
++int os_accept_connection(int fd)
++{
++      int new;
++
++      new = accept(fd, NULL, 0);
++      if(new < 0) 
++              return(-errno);
++      return(new);
++}
++
++#ifndef SHUT_RD
++#define SHUT_RD 0
++#endif
++
++#ifndef SHUT_WR
++#define SHUT_WR 1
++#endif
++
++#ifndef SHUT_RDWR
++#define SHUT_RDWR 2
++#endif
++
++int os_shutdown_socket(int fd, int r, int w)
++{
++      int what, err;
++
++      if(r && w) what = SHUT_RDWR;
++      else if(r) what = SHUT_RD;
++      else if(w) what = SHUT_WR;
++      else {
++              printk("os_shutdown_socket : neither r nor w was set\n");
++              return(-EINVAL);
++      }
++      err = shutdown(fd, what);
++      if(err)
++              return(-errno);
++      return(0);
++}
++
++int os_rcv_fd(int fd, int *helper_pid_out)
++{
++      int new, n;
++      char buf[CMSG_SPACE(sizeof(new))];
++      struct msghdr msg;
++      struct cmsghdr *cmsg;
++      struct iovec iov;
++
++      msg.msg_name = NULL;
++      msg.msg_namelen = 0;
++      iov = ((struct iovec) { .iov_base  = helper_pid_out,
++                              .iov_len   = sizeof(*helper_pid_out) });
++      msg.msg_iov = &iov;
++      msg.msg_iovlen = 1;
++      msg.msg_control = buf;
++      msg.msg_controllen = sizeof(buf);
++      msg.msg_flags = 0;
++
++      n = recvmsg(fd, &msg, 0);
++      if(n < 0)
++              return(-errno);
++
++      else if(n != sizeof(iov.iov_len))
++              *helper_pid_out = -1;
++
++      cmsg = CMSG_FIRSTHDR(&msg);
++      if(cmsg == NULL){
++              printk("rcv_fd didn't receive anything, error = %d\n", errno);
++              return(-1);
++      }
++      if((cmsg->cmsg_level != SOL_SOCKET) || 
++         (cmsg->cmsg_type != SCM_RIGHTS)){
++              printk("rcv_fd didn't receive a descriptor\n");
++              return(-1);
++      }
++
++      new = ((int *) CMSG_DATA(cmsg))[0];
++      return(new);
++}
++
++int create_unix_socket(char *file, int len)
++{
++      struct sockaddr_un addr;
++      int sock, err;
++
++      sock = socket(PF_UNIX, SOCK_DGRAM, 0);
++      if (sock < 0){
++              printk("create_unix_socket - socket failed, errno = %d\n",
++                     errno);
++              return(-errno);
++      }
++
++      addr.sun_family = AF_UNIX;
++
++      /* XXX Be more careful about overflow */
++      snprintf(addr.sun_path, len, "%s", file);
++
++      err = bind(sock, (struct sockaddr *) &addr, sizeof(addr));
++      if (err < 0){
++              printk("create_unix_socket - bind failed, errno = %d\n",
++                     errno);
++              return(-errno);
++      }
++
++      return(sock);
++}
++
++void os_flush_stdout(void)
++{
++      fflush(stdout);
++}
++
++int os_lock_file(int fd, int excl)
++{
++      int type = excl ? F_WRLCK : F_RDLCK;
++      struct flock lock = ((struct flock) { .l_type   = type,
++                                            .l_whence = SEEK_SET,
++                                            .l_start  = 0,
++                                            .l_len    = 0 } );
++      int err, save;
++
++      err = fcntl(fd, F_SETLK, &lock);
++      if(!err)
++              goto out;
++
++      save = -errno;
++      err = fcntl(fd, F_GETLK, &lock);
++      if(err){
++              err = -errno;
++              goto out;
++      }
++              
++      printk("F_SETLK failed, file already locked by pid %d\n", lock.l_pid);
++      err = save;
++ out:
++      return(err);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/include/file.h um/arch/um/os-Linux/include/file.h
+--- orig/arch/um/os-Linux/include/file.h       Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/include/file.h Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,22 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __OS_FILE_H__
++#define __OS_FILE_H__
++
++#define DEV_NULL "/dev/null"
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/process.c um/arch/um/os-Linux/process.c
+--- orig/arch/um/os-Linux/process.c    Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/process.c      Wed Jan  8 14:19:00 2003
+@@ -0,0 +1,142 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <unistd.h>
++#include <stdio.h>
++#include <errno.h>
++#include <signal.h>
++#include <sys/mman.h>
++#include <sys/wait.h>
++#include "os.h"
++#include "user.h"
++
++unsigned long os_process_pc(int pid)
++{
++      char proc_stat[sizeof("/proc/#####/stat\0")], buf[256];
++      unsigned long pc;
++      int fd;
++
++      sprintf(proc_stat, "/proc/%d/stat", pid);
++      fd = os_open_file(proc_stat, of_read(OPENFLAGS()), 0);
++      if(fd < 0){
++              printk("os_process_pc - couldn't open '%s', errno = %d\n", 
++                     proc_stat, errno);
++              return(-1);
++      }
++      if(read(fd, buf, sizeof(buf)) < 0){
++              printk("os_process_pc - couldn't read '%s', errno = %d\n", 
++                     proc_stat, errno);
++              close(fd);
++              return(-1);
++      }
++      close(fd);
++      pc = -1;
++      if(sscanf(buf, "%*d %*s %*c %*d %*d %*d %*d %*d %*d %*d %*d "
++                "%*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d "
++                "%*d %*d %*d %*d %ld", &pc) != 1){
++              printk("os_process_pc - couldn't find pc in '%s'\n", buf);
++      }
++      return(pc);
++}
++
++int os_process_parent(int pid)
++{
++      char stat[sizeof("/proc/nnnnn/stat\0")];
++      char data[256];
++      int parent, n, fd;
++
++      if(pid == -1) return(-1);
++
++      snprintf(stat, sizeof(stat), "/proc/%d/stat", pid);
++      fd = os_open_file(stat, of_read(OPENFLAGS()), 0);
++      if(fd < 0){
++              printk("Couldn't open '%s', errno = %d\n", stat, -fd);
++              return(-1);
++      }
++
++      n = read(fd, data, sizeof(data));
++      close(fd);
++
++      if(n < 0){
++              printk("Couldn't read '%s', errno = %d\n", stat, errno);
++              return(-1);
++      }
++
++      parent = -1;
++      /* XXX This will break if there is a space in the command */
++      n = sscanf(data, "%*d %*s %*c %d", &parent);
++      if(n != 1) printk("Failed to scan '%s'\n", data);
++
++      return(parent);
++}
++
++void os_stop_process(int pid)
++{
++      kill(pid, SIGSTOP);
++}
++
++void os_kill_process(int pid, int reap_child)
++{
++      kill(pid, SIGKILL);
++      if(reap_child)
++              waitpid(pid, NULL, 0);
++              
++}
++
++void os_usr1_process(int pid)
++{
++      kill(pid, SIGUSR1);
++}
++
++int os_getpid(void)
++{
++      return(getpid());
++}
++
++int os_map_memory(void *virt, int fd, unsigned long off, unsigned long len, 
++                int r, int w, int x)
++{
++      void *loc;
++      int prot;
++
++      prot = (r ? PROT_READ : 0) | (w ? PROT_WRITE : 0) | 
++              (x ? PROT_EXEC : 0);
++
++      loc = mmap((void *) virt, len, prot, MAP_SHARED | MAP_FIXED, 
++                 fd, off);
++      if(loc == MAP_FAILED)
++              return(-errno);
++      return(0);
++}
++
++int os_protect_memory(void *addr, unsigned long len, int r, int w, int x)
++{
++        int prot = ((r ? PROT_READ : 0) | (w ? PROT_WRITE : 0) | 
++                  (x ? PROT_EXEC : 0));
++
++        if(mprotect(addr, len, prot) < 0)
++              return(-errno);
++        return(0);
++}
++
++int os_unmap_memory(void *addr, int len)
++{
++        int err;
++
++        err = munmap(addr, len);
++        if(err < 0) return(-errno);
++        return(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/tty.c um/arch/um/os-Linux/tty.c
+--- orig/arch/um/os-Linux/tty.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/tty.c  Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,61 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdlib.h>
++#include <errno.h>
++#include "os.h"
++#include "user.h"
++#include "kern_util.h"
++
++struct grantpt_info {
++      int fd;
++      int res;
++      int err;
++};
++
++static void grantpt_cb(void *arg)
++{
++      struct grantpt_info *info = arg;
++
++      info->res = grantpt(info->fd);
++      info->err = errno;
++}
++
++int get_pty(void)
++{
++      struct grantpt_info info;
++      int fd;
++
++      if((fd = os_open_file("/dev/ptmx", of_rdwr(OPENFLAGS()), 0)) < 0){
++              printk("get_pty : Couldn't open /dev/ptmx - errno = %d\n",
++                     errno);
++              return(-1);
++      }
++
++      info.fd = fd;
++      initial_thread_cb(grantpt_cb, &info);
++
++      if(info.res < 0){
++              printk("get_pty : Couldn't grant pty - errno = %d\n", 
++                     info.err);
++              return(-1);
++      }
++      if(unlockpt(fd) < 0){
++              printk("get_pty : Couldn't unlock pty - errno = %d\n", errno);
++              return(-1);
++      }
++      return(fd);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/Makefile um/arch/um/sys-i386/Makefile
+--- orig/arch/um/sys-i386/Makefile     Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/Makefile       Sat Nov 23 23:34:24 2002
+@@ -0,0 +1,46 @@
++# 
++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++O_TARGET = built-in.o
++
++obj-y = bugs.o checksum.o extable.o fault.o ksyms.o ldt.o ptrace.o \
++      ptrace_user.o semaphore.o sigcontext.o syscalls.o sysrq.o
++export-objs = ksyms.o
++
++USER_OBJS = bugs.o ptrace_user.o sigcontext.o fault.o
++
++SYMLINKS = semaphore.c extable.c
++
++semaphore.c-dir = kernel
++extable.c-dir = mm
++
++include $(TOPDIR)/Rules.make
++
++$(USER_OBJS) : %.o: %.c
++      $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $<
++
++define make_link
++      -rm -f $1
++      ln -sf $(TOPDIR)/arch/i386/$($1-dir)/$1 $1
++endef
++
++$(SYMLINKS): 
++      $(call make_link,$@)
++
++clean:
++      $(MAKE) -C util clean
++      rm -f $(SYMLINKS)
++
++fastdep:
++
++dep:
++
++archmrproper:
++
++archclean:
++
++archdep:
++
++modules:
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/bugs.c um/arch/um/sys-i386/bugs.c
+--- orig/arch/um/sys-i386/bugs.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/bugs.c Sun Dec  8 20:38:45 2002
+@@ -0,0 +1,157 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <unistd.h>
++#include <fcntl.h>
++#include <errno.h>
++#include <string.h>
++#include <sys/signal.h>
++#include "kern_util.h"
++#include "user.h"
++#include "sysdep/ptrace.h"
++#include "task.h"
++
++#define MAXTOKEN 64
++
++/* Set during early boot */
++int cpu_has_cmov = 1;
++int cpu_has_xmm = 0;
++
++static char token(int fd, char *buf, int len, char stop)
++{
++      int n;
++      char *ptr, *end, c;
++
++      ptr = buf;
++      end = &buf[len];
++      do {
++              n = read(fd, ptr, sizeof(*ptr));
++              c = *ptr++;
++              if(n == 0) return(0);
++              else if(n != sizeof(*ptr)){
++                      printk("Reading /proc/cpuinfo failed, "
++                             "errno = %d\n", errno);
++                      return(-errno);
++              }
++      } while((c != '\n') && (c != stop) && (ptr < end));
++
++      if(ptr == end){
++              printk("Failed to find '%c' in /proc/cpuinfo\n", stop);
++              return(-1);
++      }
++      *(ptr - 1) = '\0';
++      return(c);
++}
++
++static int check_cpu_feature(char *feature, int *have_it)
++{
++      char buf[MAXTOKEN], c;
++      int fd, len = sizeof(buf)/sizeof(buf[0]), n;
++
++      printk("Checking for host processor %s support...", feature);
++      fd = open("/proc/cpuinfo", O_RDONLY);
++      if(fd < 0){
++              printk("Couldn't open /proc/cpuinfo, errno = %d\n", errno);
++              return(0);
++      }
++
++      *have_it = 0;
++      buf[len - 1] = '\0';
++      while(1){
++              c = token(fd, buf, len - 1, ':');
++              if(c <= 0) goto out;
++              else if(c != ':'){
++                      printk("Failed to find ':' in /proc/cpuinfo\n");
++                      goto out;
++              }
++
++              if(!strncmp(buf, "flags", strlen("flags"))) break;
++
++              do {
++                      n = read(fd, &c, sizeof(c));
++                      if(n != sizeof(c)){
++                              printk("Failed to find newline in "
++                                     "/proc/cpuinfo, n = %d, errno = %d\n",
++                                     n, errno);
++                              goto out;
++                      }
++              } while(c != '\n');
++      }
++
++      c = token(fd, buf, len - 1, ' ');
++      if(c < 0) goto out;
++      else if(c != ' '){
++              printk("Failed to find ' ' in /proc/cpuinfo\n");
++              goto out;
++      }
++
++      while(1){
++              c = token(fd, buf, len - 1, ' ');
++              if(c < 0) goto out;
++              else if(c == '\n') break;
++
++              if(!strcmp(buf, feature)){
++                      *have_it = 1;
++                      goto out;
++              }
++      }
++ out:
++      if(*have_it == 0) printk("No\n");
++      else if(*have_it == 1) printk("Yes\n");
++      close(fd);
++      return(1);
++}
++
++void arch_check_bugs(void)
++{
++      int have_it;
++
++      if(access("/proc/cpuinfo", R_OK)){
++              printk("/proc/cpuinfo not available - skipping CPU capability "
++                     "checks\n");
++              return;
++      }
++      if(check_cpu_feature("cmov", &have_it)) cpu_has_cmov = have_it;
++      if(check_cpu_feature("xmm", &have_it)) cpu_has_xmm = have_it;
++}
++
++int arch_handle_signal(int sig, union uml_pt_regs *regs)
++{
++      unsigned long ip;
++
++      /* This is testing for a cmov (0x0f 0x4x) instruction causing a
++       * SIGILL in init.
++       */
++      if((sig != SIGILL) || (TASK_PID(get_current()) != 1)) return(0);
++
++      ip = UPT_IP(regs);
++      if((*((char *) ip) != 0x0f) || ((*((char *) (ip + 1)) & 0xf0) != 0x40))
++              return(0);
++
++      if(cpu_has_cmov == 0)
++              panic("SIGILL caused by cmov, which this processor doesn't "
++                    "implement, boot a filesystem compiled for older "
++                    "processors");
++      else if(cpu_has_cmov == 1)
++              panic("SIGILL caused by cmov, which this processor claims to "
++                    "implement");
++      else if(cpu_has_cmov == -1)
++              panic("SIGILL caused by cmov, couldn't tell if this processor "
++                    "implements it, boot a filesystem compiled for older "
++                    "processors");
++      else panic("Bad value for cpu_has_cmov (%d)", cpu_has_cmov);
++      return(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/checksum.S um/arch/um/sys-i386/checksum.S
+--- orig/arch/um/sys-i386/checksum.S   Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/checksum.S     Thu Oct 31 20:17:50 2002
+@@ -0,0 +1,460 @@
++/*
++ * INET               An implementation of the TCP/IP protocol suite for the LINUX
++ *            operating system.  INET is implemented using the  BSD Socket
++ *            interface as the means of communication with the user level.
++ *
++ *            IP/TCP/UDP checksumming routines
++ *
++ * Authors:   Jorge Cwik, <jorge@laser.satlink.net>
++ *            Arnt Gulbrandsen, <agulbra@nvg.unit.no>
++ *            Tom May, <ftom@netcom.com>
++ *              Pentium Pro/II routines:
++ *              Alexander Kjeldaas <astor@guardian.no>
++ *              Finn Arne Gangstad <finnag@guardian.no>
++ *            Lots of code moved from tcp.c and ip.c; see those files
++ *            for more names.
++ *
++ * Changes:     Ingo Molnar, converted csum_partial_copy() to 2.1 exception
++ *                         handling.
++ *            Andi Kleen,  add zeroing on error
++ *                   converted to pure assembler
++ *
++ *            This program is free software; you can redistribute it and/or
++ *            modify it under the terms of the GNU General Public License
++ *            as published by the Free Software Foundation; either version
++ *            2 of the License, or (at your option) any later version.
++ */
++
++#include <linux/config.h>
++#include <asm/errno.h>
++                              
++/*
++ * computes a partial checksum, e.g. for TCP/UDP fragments
++ */
++
++/*    
++unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
++ */
++              
++.text
++.align 4
++.globl arch_csum_partial                                                              
++              
++#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
++
++        /*            
++         * Experiments with Ethernet and SLIP connections show that buff
++         * is aligned on either a 2-byte or 4-byte boundary.  We get at
++         * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
++         * Fortunately, it is easy to convert 2-byte alignment to 4-byte
++         * alignment for the unrolled loop.
++         */           
++arch_csum_partial:    
++      pushl %esi
++      pushl %ebx
++      movl 20(%esp),%eax      # Function arg: unsigned int sum
++      movl 16(%esp),%ecx      # Function arg: int len
++      movl 12(%esp),%esi      # Function arg: unsigned char *buff
++      testl $2, %esi          # Check alignment.
++      jz 2f                   # Jump if alignment is ok.
++      subl $2, %ecx           # Alignment uses up two bytes.
++      jae 1f                  # Jump if we had at least two bytes.
++      addl $2, %ecx           # ecx was < 2.  Deal with it.
++      jmp 4f
++1:    movw (%esi), %bx
++      addl $2, %esi
++      addw %bx, %ax
++      adcl $0, %eax
++2:
++      movl %ecx, %edx
++      shrl $5, %ecx
++      jz 2f
++      testl %esi, %esi
++1:    movl (%esi), %ebx
++      adcl %ebx, %eax
++      movl 4(%esi), %ebx
++      adcl %ebx, %eax
++      movl 8(%esi), %ebx
++      adcl %ebx, %eax
++      movl 12(%esi), %ebx
++      adcl %ebx, %eax
++      movl 16(%esi), %ebx
++      adcl %ebx, %eax
++      movl 20(%esi), %ebx
++      adcl %ebx, %eax
++      movl 24(%esi), %ebx
++      adcl %ebx, %eax
++      movl 28(%esi), %ebx
++      adcl %ebx, %eax
++      lea 32(%esi), %esi
++      dec %ecx
++      jne 1b
++      adcl $0, %eax
++2:    movl %edx, %ecx
++      andl $0x1c, %edx
++      je 4f
++      shrl $2, %edx           # This clears CF
++3:    adcl (%esi), %eax
++      lea 4(%esi), %esi
++      dec %edx
++      jne 3b
++      adcl $0, %eax
++4:    andl $3, %ecx
++      jz 7f
++      cmpl $2, %ecx
++      jb 5f
++      movw (%esi),%cx
++      leal 2(%esi),%esi
++      je 6f
++      shll $16,%ecx
++5:    movb (%esi),%cl
++6:    addl %ecx,%eax
++      adcl $0, %eax 
++7:    
++      popl %ebx
++      popl %esi
++      ret
++
++#else
++
++/* Version for PentiumII/PPro */
++
++arch_csum_partial:
++      pushl %esi
++      pushl %ebx
++      movl 20(%esp),%eax      # Function arg: unsigned int sum
++      movl 16(%esp),%ecx      # Function arg: int len
++      movl 12(%esp),%esi      # Function arg: const unsigned char *buf
++
++      testl $2, %esi         
++      jnz 30f                 
++10:
++      movl %ecx, %edx
++      movl %ecx, %ebx
++      andl $0x7c, %ebx
++      shrl $7, %ecx
++      addl %ebx,%esi
++      shrl $2, %ebx  
++      negl %ebx
++      lea 45f(%ebx,%ebx,2), %ebx
++      testl %esi, %esi
++      jmp *%ebx
++
++      # Handle 2-byte-aligned regions
++20:   addw (%esi), %ax
++      lea 2(%esi), %esi
++      adcl $0, %eax
++      jmp 10b
++
++30:   subl $2, %ecx          
++      ja 20b                 
++      je 32f
++      movzbl (%esi),%ebx      # csumming 1 byte, 2-aligned
++      addl %ebx, %eax
++      adcl $0, %eax
++      jmp 80f
++32:
++      addw (%esi), %ax        # csumming 2 bytes, 2-aligned
++      adcl $0, %eax
++      jmp 80f
++
++40: 
++      addl -128(%esi), %eax
++      adcl -124(%esi), %eax
++      adcl -120(%esi), %eax
++      adcl -116(%esi), %eax   
++      adcl -112(%esi), %eax   
++      adcl -108(%esi), %eax
++      adcl -104(%esi), %eax
++      adcl -100(%esi), %eax
++      adcl -96(%esi), %eax
++      adcl -92(%esi), %eax
++      adcl -88(%esi), %eax
++      adcl -84(%esi), %eax
++      adcl -80(%esi), %eax
++      adcl -76(%esi), %eax
++      adcl -72(%esi), %eax
++      adcl -68(%esi), %eax
++      adcl -64(%esi), %eax     
++      adcl -60(%esi), %eax     
++      adcl -56(%esi), %eax     
++      adcl -52(%esi), %eax   
++      adcl -48(%esi), %eax   
++      adcl -44(%esi), %eax
++      adcl -40(%esi), %eax
++      adcl -36(%esi), %eax
++      adcl -32(%esi), %eax
++      adcl -28(%esi), %eax
++      adcl -24(%esi), %eax
++      adcl -20(%esi), %eax
++      adcl -16(%esi), %eax
++      adcl -12(%esi), %eax
++      adcl -8(%esi), %eax
++      adcl -4(%esi), %eax
++45:
++      lea 128(%esi), %esi
++      adcl $0, %eax
++      dec %ecx
++      jge 40b
++      movl %edx, %ecx
++50:   andl $3, %ecx
++      jz 80f
++
++      # Handle the last 1-3 bytes without jumping
++      notl %ecx               # 1->2, 2->1, 3->0, higher bits are masked
++      movl $0xffffff,%ebx     # by the shll and shrl instructions
++      shll $3,%ecx
++      shrl %cl,%ebx
++      andl -128(%esi),%ebx    # esi is 4-aligned so should be ok
++      addl %ebx,%eax
++      adcl $0,%eax
++80: 
++      popl %ebx
++      popl %esi
++      ret
++                              
++#endif
++
++/*
++unsigned int csum_partial_copy_generic (const char *src, char *dst,
++                                int len, int sum, int *src_err_ptr, int *dst_err_ptr)
++ */ 
++
++/*
++ * Copy from ds while checksumming, otherwise like csum_partial
++ *
++ * The macros SRC and DST specify the type of access for the instruction.
++ * Thus we can call a custom exception handler for all access types.
++ *
++ * FIXME: could someone double-check whether I haven't mixed up some SRC and
++ *      DST definitions? It's damn hard to trigger all cases.  I hope I got
++ *      them all but there's no guarantee.
++ */
++
++#define SRC(y...)                     \
++      9999: y;                        \
++      .section __ex_table, "a";       \
++      .long 9999b, 6001f      ;       \
++      .previous
++
++#define DST(y...)                     \
++      9999: y;                        \
++      .section __ex_table, "a";       \
++      .long 9999b, 6002f      ;       \
++      .previous
++
++.align 4
++.globl csum_partial_copy_generic_i386
++                              
++#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
++
++#define ARGBASE 16            
++#define FP            12
++              
++csum_partial_copy_generic_i386:
++      subl  $4,%esp   
++      pushl %edi
++      pushl %esi
++      pushl %ebx
++      movl ARGBASE+16(%esp),%eax      # sum
++      movl ARGBASE+12(%esp),%ecx      # len
++      movl ARGBASE+4(%esp),%esi       # src
++      movl ARGBASE+8(%esp),%edi       # dst
++
++      testl $2, %edi                  # Check alignment. 
++      jz 2f                           # Jump if alignment is ok.
++      subl $2, %ecx                   # Alignment uses up two bytes.
++      jae 1f                          # Jump if we had at least two bytes.
++      addl $2, %ecx                   # ecx was < 2.  Deal with it.
++      jmp 4f
++SRC(1:        movw (%esi), %bx        )
++      addl $2, %esi
++DST(  movw %bx, (%edi)        )
++      addl $2, %edi
++      addw %bx, %ax   
++      adcl $0, %eax
++2:
++      movl %ecx, FP(%esp)
++      shrl $5, %ecx
++      jz 2f
++      testl %esi, %esi
++SRC(1:        movl (%esi), %ebx       )
++SRC(  movl 4(%esi), %edx      )
++      adcl %ebx, %eax
++DST(  movl %ebx, (%edi)       )
++      adcl %edx, %eax
++DST(  movl %edx, 4(%edi)      )
++
++SRC(  movl 8(%esi), %ebx      )
++SRC(  movl 12(%esi), %edx     )
++      adcl %ebx, %eax
++DST(  movl %ebx, 8(%edi)      )
++      adcl %edx, %eax
++DST(  movl %edx, 12(%edi)     )
++
++SRC(  movl 16(%esi), %ebx     )
++SRC(  movl 20(%esi), %edx     )
++      adcl %ebx, %eax
++DST(  movl %ebx, 16(%edi)     )
++      adcl %edx, %eax
++DST(  movl %edx, 20(%edi)     )
++
++SRC(  movl 24(%esi), %ebx     )
++SRC(  movl 28(%esi), %edx     )
++      adcl %ebx, %eax
++DST(  movl %ebx, 24(%edi)     )
++      adcl %edx, %eax
++DST(  movl %edx, 28(%edi)     )
++
++      lea 32(%esi), %esi
++      lea 32(%edi), %edi
++      dec %ecx
++      jne 1b
++      adcl $0, %eax
++2:    movl FP(%esp), %edx
++      movl %edx, %ecx
++      andl $0x1c, %edx
++      je 4f
++      shrl $2, %edx                   # This clears CF
++SRC(3:        movl (%esi), %ebx       )
++      adcl %ebx, %eax
++DST(  movl %ebx, (%edi)       )
++      lea 4(%esi), %esi
++      lea 4(%edi), %edi
++      dec %edx
++      jne 3b
++      adcl $0, %eax
++4:    andl $3, %ecx
++      jz 7f
++      cmpl $2, %ecx
++      jb 5f
++SRC(  movw (%esi), %cx        )
++      leal 2(%esi), %esi
++DST(  movw %cx, (%edi)        )
++      leal 2(%edi), %edi
++      je 6f
++      shll $16,%ecx
++SRC(5:        movb (%esi), %cl        )
++DST(  movb %cl, (%edi)        )
++6:    addl %ecx, %eax
++      adcl $0, %eax
++7:
++5000:
++
++# Exception handler:
++.section .fixup, "ax"                                                 
++
++6001:
++      movl ARGBASE+20(%esp), %ebx     # src_err_ptr
++      movl $-EFAULT, (%ebx)
++
++      # zero the complete destination - computing the rest
++      # is too much work 
++      movl ARGBASE+8(%esp), %edi      # dst
++      movl ARGBASE+12(%esp), %ecx     # len
++      xorl %eax,%eax
++      rep ; stosb
++
++      jmp 5000b
++
++6002:
++      movl ARGBASE+24(%esp), %ebx     # dst_err_ptr
++      movl $-EFAULT,(%ebx)
++      jmp 5000b
++
++.previous
++
++      popl %ebx
++      popl %esi
++      popl %edi
++      popl %ecx                       # equivalent to addl $4,%esp
++      ret     
++
++#else
++
++/* Version for PentiumII/PPro */
++
++#define ROUND1(x) \
++      SRC(movl x(%esi), %ebx  )       ;       \
++      addl %ebx, %eax                 ;       \
++      DST(movl %ebx, x(%edi)  )       ; 
++
++#define ROUND(x) \
++      SRC(movl x(%esi), %ebx  )       ;       \
++      adcl %ebx, %eax                 ;       \
++      DST(movl %ebx, x(%edi)  )       ;
++
++#define ARGBASE 12
++              
++csum_partial_copy_generic_i386:
++      pushl %ebx
++      pushl %edi
++      pushl %esi
++      movl ARGBASE+4(%esp),%esi       #src
++      movl ARGBASE+8(%esp),%edi       #dst    
++      movl ARGBASE+12(%esp),%ecx      #len
++      movl ARGBASE+16(%esp),%eax      #sum
++#     movl %ecx, %edx  
++      movl %ecx, %ebx  
++      movl %esi, %edx
++      shrl $6, %ecx     
++      andl $0x3c, %ebx  
++      negl %ebx
++      subl %ebx, %esi  
++      subl %ebx, %edi  
++      lea  -1(%esi),%edx
++      andl $-32,%edx
++      lea 3f(%ebx,%ebx), %ebx
++      testl %esi, %esi 
++      jmp *%ebx
++1:    addl $64,%esi
++      addl $64,%edi 
++      SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)
++      ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)    
++      ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)    
++      ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)    
++      ROUND (-16) ROUND(-12) ROUND(-8)  ROUND(-4)     
++3:    adcl $0,%eax
++      addl $64, %edx
++      dec %ecx
++      jge 1b
++4:    movl ARGBASE+12(%esp),%edx      #len
++      andl $3, %edx
++      jz 7f
++      cmpl $2, %edx
++      jb 5f
++SRC(  movw (%esi), %dx         )
++      leal 2(%esi), %esi
++DST(  movw %dx, (%edi)         )
++      leal 2(%edi), %edi
++      je 6f
++      shll $16,%edx
++5:
++SRC(  movb (%esi), %dl         )
++DST(  movb %dl, (%edi)         )
++6:    addl %edx, %eax
++      adcl $0, %eax
++7:
++.section .fixup, "ax"
++6001: movl    ARGBASE+20(%esp), %ebx  # src_err_ptr   
++      movl $-EFAULT, (%ebx)
++      # zero the complete destination (computing the rest is too much work)
++      movl ARGBASE+8(%esp),%edi       # dst
++      movl ARGBASE+12(%esp),%ecx      # len
++      xorl %eax,%eax
++      rep; stosb
++      jmp 7b
++6002: movl ARGBASE+24(%esp), %ebx     # dst_err_ptr
++      movl $-EFAULT, (%ebx)
++      jmp  7b                 
++.previous                             
++
++      popl %esi
++      popl %edi
++      popl %ebx
++      ret
++                              
++#undef ROUND
++#undef ROUND1         
++              
++#endif
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/fault.c um/arch/um/sys-i386/fault.c
+--- orig/arch/um/sys-i386/fault.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/fault.c        Sun Oct 27 16:49:35 2002
+@@ -0,0 +1,34 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <signal.h>
++#include "sysdep/ptrace.h"
++#include "sysdep/sigcontext.h"
++
++extern unsigned long search_exception_table(unsigned long addr);
++
++int arch_fixup(unsigned long address, void *sc_ptr)
++{
++      struct sigcontext *sc = sc_ptr;
++      unsigned long fixup;
++
++      fixup = search_exception_table(address);
++      if(fixup != 0){
++              sc->eip = fixup;
++              return(1);
++      }
++      return(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/ksyms.c um/arch/um/sys-i386/ksyms.c
+--- orig/arch/um/sys-i386/ksyms.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/ksyms.c        Tue Oct 29 21:01:45 2002
+@@ -0,0 +1,17 @@
++#include "linux/module.h"
++#include "linux/in6.h"
++#include "linux/rwsem.h"
++#include "asm/byteorder.h"
++#include "asm/semaphore.h"
++#include "asm/uaccess.h"
++#include "asm/checksum.h"
++#include "asm/errno.h"
++
++EXPORT_SYMBOL(__down_failed);
++EXPORT_SYMBOL(__down_failed_interruptible);
++EXPORT_SYMBOL(__down_failed_trylock);
++EXPORT_SYMBOL(__up_wakeup);
++
++/* Networking helper routines. */
++EXPORT_SYMBOL(csum_partial_copy_from);
++EXPORT_SYMBOL(csum_partial_copy_to);
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/ldt.c um/arch/um/sys-i386/ldt.c
+--- orig/arch/um/sys-i386/ldt.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/ldt.c  Wed Nov 13 12:43:04 2002
+@@ -0,0 +1,92 @@
++/*
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/config.h"
++#include "linux/slab.h"
++#include "asm/uaccess.h"
++#include "asm/ptrace.h"
++#include "choose-mode.h"
++#include "kern.h"
++
++#ifdef CONFIG_MODE_TT
++extern int modify_ldt(int func, void *ptr, unsigned long bytecount);
++
++int sys_modify_ldt_tt(int func, void *ptr, unsigned long bytecount)
++{
++      if(verify_area(VERIFY_READ, ptr, bytecount)) return(-EFAULT);
++      return(modify_ldt(func, ptr, bytecount));
++}
++#endif
++
++#ifdef CONFIG_MODE_SKAS
++extern int userspace_pid;
++
++int sys_modify_ldt_skas(int func, void *ptr, unsigned long bytecount)
++{
++      struct ptrace_ldt ldt;
++      void *buf;
++      int res, n;
++
++      buf = kmalloc(bytecount, GFP_KERNEL);
++      if(buf == NULL)
++              return(-ENOMEM);
++
++      res = 0;
++
++      switch(func){
++      case 1:
++      case 0x11:
++              res = copy_from_user(buf, ptr, bytecount);
++              break;
++      }
++
++      if(res != 0){
++              res = -EFAULT;
++              goto out;
++      }
++
++      ldt = ((struct ptrace_ldt) { .func      = func,
++                                   .ptr       = buf,
++                                   .bytecount = bytecount });
++      res = ptrace(PTRACE_LDT, userspace_pid, 0, (unsigned long) &ldt);
++      if(res < 0)
++              goto out;
++
++      switch(func){
++      case 0:
++      case 2:
++              n = res;
++              res = copy_to_user(ptr, buf, n);
++              if(res != 0)
++                      res = -EFAULT;
++              else 
++                      res = n;
++              break;
++      }
++
++ out:
++      kfree(buf);
++      return(res);
++}
++#endif
++
++int sys_modify_ldt(int func, void *ptr, unsigned long bytecount)
++{
++      return(CHOOSE_MODE_PROC(sys_modify_ldt_tt, sys_modify_ldt_skas, func, 
++                              ptr, bytecount));
++}
++
++
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/ptrace.c um/arch/um/sys-i386/ptrace.c
+--- orig/arch/um/sys-i386/ptrace.c     Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/ptrace.c       Sun Oct 27 16:49:35 2002
+@@ -0,0 +1,365 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/sched.h"
++#include "asm/elf.h"
++#include "asm/ptrace.h"
++#include "asm/uaccess.h"
++#include "ptrace_user.h"
++#include "sysdep/sigcontext.h"
++#include "sysdep/sc.h"
++
++void arch_switch(void)
++{
++      update_debugregs(current->thread.arch.debugregs_seq);
++}
++
++int is_syscall(unsigned long addr)
++{
++      unsigned short instr;
++      int n;
++
++      n = copy_from_user(&instr, (void *) addr, sizeof(instr));
++      if(n){
++              printk("is_syscall : failed to read instruction from 0x%lx\n", 
++                     addr);
++              return(0);
++      }
++      return(instr == 0x80cd);
++}
++
++/* determines which flags the user has access to. */
++/* 1 = access 0 = no access */
++#define FLAG_MASK 0x00044dd5
++
++int putreg(struct task_struct *child, int regno, unsigned long value)
++{
++      regno >>= 2;
++      switch (regno) {
++      case FS:
++              if (value && (value & 3) != 3)
++                      return -EIO;
++              PT_REGS_FS(&child->thread.regs) = value;
++              return 0;
++      case GS:
++              if (value && (value & 3) != 3)
++                      return -EIO;
++              PT_REGS_GS(&child->thread.regs) = value;
++              return 0;
++      case DS:
++      case ES:
++              if (value && (value & 3) != 3)
++                      return -EIO;
++              value &= 0xffff;
++              break;
++      case SS:
++      case CS:
++              if ((value & 3) != 3)
++                      return -EIO;
++              value &= 0xffff;
++              break;
++      case EFL:
++              value &= FLAG_MASK;
++              value |= PT_REGS_EFLAGS(&child->thread.regs);
++              break;
++      }
++      PT_REGS_SET(&child->thread.regs, regno, value);
++      return 0;
++}
++
++unsigned long getreg(struct task_struct *child, int regno)
++{
++      unsigned long retval = ~0UL;
++
++      regno >>= 2;
++      switch (regno) {
++      case FS:
++      case GS:
++      case DS:
++      case ES:
++      case SS:
++      case CS:
++              retval = 0xffff;
++              /* fall through */
++      default:
++              retval &= PT_REG(&child->thread.regs, regno);
++      }
++      return retval;
++}
++
++struct i387_fxsave_struct {
++      unsigned short  cwd;
++      unsigned short  swd;
++      unsigned short  twd;
++      unsigned short  fop;
++      long    fip;
++      long    fcs;
++      long    foo;
++      long    fos;
++      long    mxcsr;
++      long    reserved;
++      long    st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
++      long    xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
++      long    padding[56];
++};
++
++/*
++ * FPU tag word conversions.
++ */
++
++static inline unsigned short twd_i387_to_fxsr( unsigned short twd )
++{
++      unsigned int tmp; /* to avoid 16 bit prefixes in the code */
++ 
++      /* Transform each pair of bits into 01 (valid) or 00 (empty) */
++        tmp = ~twd;
++        tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
++        /* and move the valid bits to the lower byte. */
++        tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
++        tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
++        tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
++        return tmp;
++}
++
++static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave )
++{
++      struct _fpxreg *st = NULL;
++      unsigned long twd = (unsigned long) fxsave->twd;
++      unsigned long tag;
++      unsigned long ret = 0xffff0000;
++      int i;
++
++#define FPREG_ADDR(f, n)      ((char *)&(f)->st_space + (n) * 16);
++
++      for ( i = 0 ; i < 8 ; i++ ) {
++              if ( twd & 0x1 ) {
++                      st = (struct _fpxreg *) FPREG_ADDR( fxsave, i );
++
++                      switch ( st->exponent & 0x7fff ) {
++                      case 0x7fff:
++                              tag = 2;                /* Special */
++                              break;
++                      case 0x0000:
++                              if ( !st->significand[0] &&
++                                   !st->significand[1] &&
++                                   !st->significand[2] &&
++                                   !st->significand[3] ) {
++                                      tag = 1;        /* Zero */
++                              } else {
++                                      tag = 2;        /* Special */
++                              }
++                              break;
++                      default:
++                              if ( st->significand[3] & 0x8000 ) {
++                                      tag = 0;        /* Valid */
++                              } else {
++                                      tag = 2;        /* Special */
++                              }
++                              break;
++                      }
++              } else {
++                      tag = 3;                        /* Empty */
++              }
++              ret |= (tag << (2 * i));
++              twd = twd >> 1;
++      }
++      return ret;
++}
++
++/*
++ * FXSR floating point environment conversions.
++ */
++
++#ifdef CONFIG_MODE_TT
++static inline int convert_fxsr_to_user_tt(struct _fpstate *buf, 
++                                        struct pt_regs *regs)
++{
++      struct i387_fxsave_struct *fxsave = SC_FXSR_ENV(PT_REGS_SC(regs));
++      unsigned long env[7];
++      struct _fpreg *to;
++      struct _fpxreg *from;
++      int i;
++
++      env[0] = (unsigned long)fxsave->cwd | 0xffff0000;
++      env[1] = (unsigned long)fxsave->swd | 0xffff0000;
++      env[2] = twd_fxsr_to_i387(fxsave);
++      env[3] = fxsave->fip;
++      env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16);
++      env[5] = fxsave->foo;
++      env[6] = fxsave->fos;
++
++      if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) )
++              return 1;
++
++      to = &buf->_st[0];
++      from = (struct _fpxreg *) &fxsave->st_space[0];
++      for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
++              if ( __copy_to_user( to, from, sizeof(*to) ) )
++                      return 1;
++      }
++      return 0;
++}
++#endif
++
++static inline int convert_fxsr_to_user(struct _fpstate *buf, 
++                                     struct pt_regs *regs)
++{
++      return(CHOOSE_MODE(convert_fxsr_to_user_tt(buf, regs), 0));
++}
++
++#ifdef CONFIG_MODE_TT
++static inline int convert_fxsr_from_user_tt(struct pt_regs *regs,
++                                          struct _fpstate *buf)
++{
++      struct i387_fxsave_struct *fxsave = SC_FXSR_ENV(PT_REGS_SC(regs));
++      unsigned long env[7];
++      struct _fpxreg *to;
++      struct _fpreg *from;
++      int i;
++
++      if ( __copy_from_user( env, buf, 7 * sizeof(long) ) )
++              return 1;
++
++      fxsave->cwd = (unsigned short)(env[0] & 0xffff);
++      fxsave->swd = (unsigned short)(env[1] & 0xffff);
++      fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff));
++      fxsave->fip = env[3];
++      fxsave->fop = (unsigned short)((env[4] & 0xffff0000) >> 16);
++      fxsave->fcs = (env[4] & 0xffff);
++      fxsave->foo = env[5];
++      fxsave->fos = env[6];
++
++      to = (struct _fpxreg *) &fxsave->st_space[0];
++      from = &buf->_st[0];
++      for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
++              if ( __copy_from_user( to, from, sizeof(*from) ) )
++                      return 1;
++      }
++      return 0;
++}
++#endif
++
++static inline int convert_fxsr_from_user(struct pt_regs *regs, 
++                                       struct _fpstate *buf)
++{
++      return(CHOOSE_MODE(convert_fxsr_from_user_tt(regs, buf), 0));
++}
++
++int get_fpregs(unsigned long buf, struct task_struct *child)
++{
++      int err;
++
++      err = convert_fxsr_to_user((struct _fpstate *) buf, 
++                                 &child->thread.regs);
++      if(err) return(-EFAULT);
++      else return(0);
++}
++
++int set_fpregs(unsigned long buf, struct task_struct *child)
++{
++      int err;
++
++      err = convert_fxsr_from_user(&child->thread.regs, 
++                                   (struct _fpstate *) buf);
++      if(err) return(-EFAULT);
++      else return(0);
++}
++
++#ifdef CONFIG_MODE_TT
++int get_fpxregs_tt(unsigned long buf, struct task_struct *tsk)
++{
++      struct pt_regs *regs = &tsk->thread.regs;
++      struct i387_fxsave_struct *fxsave = SC_FXSR_ENV(PT_REGS_SC(regs));
++      int err;
++
++      err = __copy_to_user((void *) buf, fxsave,
++                           sizeof(struct user_fxsr_struct));
++      if(err) return -EFAULT;
++      else return 0;
++}
++#endif
++
++int get_fpxregs(unsigned long buf, struct task_struct *tsk)
++{
++      return(CHOOSE_MODE(get_fpxregs_tt(buf, tsk), 0));
++}
++
++#ifdef CONFIG_MODE_TT
++int set_fpxregs_tt(unsigned long buf, struct task_struct *tsk)
++{
++      struct pt_regs *regs = &tsk->thread.regs;
++      struct i387_fxsave_struct *fxsave = SC_FXSR_ENV(PT_REGS_SC(regs));
++      int err;
++
++      err = __copy_from_user(fxsave, (void *) buf,
++                             sizeof(struct user_fxsr_struct) );
++      if(err) return -EFAULT;
++      else return 0;
++}
++#endif
++
++int set_fpxregs(unsigned long buf, struct task_struct *tsk)
++{
++      return(CHOOSE_MODE(set_fpxregs_tt(buf, tsk), 0));
++}
++
++#ifdef notdef
++int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu)
++{
++      fpu->cwd = (((SC_FP_CW(PT_REGS_SC(regs)) & 0xffff) << 16) |
++                  (SC_FP_SW(PT_REGS_SC(regs)) & 0xffff));
++      fpu->swd = SC_FP_CSSEL(PT_REGS_SC(regs)) & 0xffff;
++      fpu->twd = SC_FP_IPOFF(PT_REGS_SC(regs));
++      fpu->fip = SC_FP_CSSEL(PT_REGS_SC(regs)) & 0xffff;
++      fpu->fcs = SC_FP_DATAOFF(PT_REGS_SC(regs));
++      fpu->foo = SC_FP_DATASEL(PT_REGS_SC(regs));
++      fpu->fos = 0;
++      memcpy(fpu->st_space, (void *) SC_FP_ST(PT_REGS_SC(regs)),
++             sizeof(fpu->st_space));
++      return(1);
++}
++#endif
++
++#ifdef CONFIG_MODE_TT
++static inline void copy_fpu_fxsave_tt(struct pt_regs *regs,
++                                    struct user_i387_struct *buf)
++{
++      struct i387_fxsave_struct *fpu = SC_FXSR_ENV(PT_REGS_SC(regs));
++      unsigned short *to;
++      unsigned short *from;
++      int i;
++
++      memcpy( buf, fpu, 7 * sizeof(long) );
++
++      to = (unsigned short *) &buf->st_space[0];
++      from = (unsigned short *) &fpu->st_space[0];
++      for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) {
++              memcpy( to, from, 5 * sizeof(unsigned short) );
++      }
++}
++#endif
++
++static inline void copy_fpu_fxsave(struct pt_regs *regs,
++                                 struct user_i387_struct *buf)
++{
++      (void) CHOOSE_MODE(copy_fpu_fxsave_tt(regs, buf), 0);
++}
++
++int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu )
++{
++      copy_fpu_fxsave(regs, (struct user_i387_struct *) fpu);
++      return(1);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
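
The hunk above repacks FPU state between two layouts: the FXSR save area keeps the
eight x87 registers in 16-byte slots, while the legacy user-space _fpstate layout packs
them as 10-byte entries. A minimal user-space sketch of that repacking follows; the
struct names are illustrative stand-ins, not the kernel's definitions.

    /* Repack eight x87 registers from 16-byte FXSR slots into the 10-byte
     * legacy i387 layout -- the same loop convert_fxsr_to_user() runs with
     * __copy_to_user().  Hypothetical types, user space only. */
    #include <stdio.h>
    #include <string.h>

    struct fxsr_reg { unsigned char bytes[16]; };   /* 10 live bytes + 6 pad */
    struct i387_reg { unsigned char bytes[10]; };

    static void fxsr_to_i387(struct i387_reg to[8], const struct fxsr_reg from[8])
    {
            int i;

            for (i = 0; i < 8; i++)         /* copy only the 10 significant bytes */
                    memcpy(to[i].bytes, from[i].bytes, sizeof(to[i].bytes));
    }

    int main(void)
    {
            struct fxsr_reg fx[8] = { { { 0x41 } } };
            struct i387_reg st[8];

            fxsr_to_i387(st, fx);
            printf("st(0) byte 0 = 0x%02x\n", st[0].bytes[0]);
            return 0;
    }
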
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/ptrace_user.c um/arch/um/sys-i386/ptrace_user.c
+--- orig/arch/um/sys-i386/ptrace_user.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/ptrace_user.c  Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,117 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <errno.h>
++#include <unistd.h>
++#include <linux/stddef.h>
++#include <sys/ptrace.h>
++#include <asm/ptrace.h>
++#include <asm/user.h>
++#include "kern_util.h"
++#include "sysdep/thread.h"
++#include "user.h"
++#include "os.h"
++
++int ptrace_getregs(long pid, unsigned long *regs_out)
++{
++      return(ptrace(PTRACE_GETREGS, pid, 0, regs_out));
++}
++
++int ptrace_setregs(long pid, unsigned long *regs)
++{
++      return(ptrace(PTRACE_SETREGS, pid, 0, regs));
++}
++
++int ptrace_getfpregs(long pid, unsigned long *regs)
++{
++      return(ptrace(PTRACE_GETFPREGS, pid, 0, regs));
++}
++
++static void write_debugregs(int pid, unsigned long *regs)
++{
++      struct user *dummy;
++      int nregs, i;
++
++      dummy = NULL;
++      nregs = sizeof(dummy->u_debugreg)/sizeof(dummy->u_debugreg[0]);
++      for(i = 0; i < nregs; i++){
++              if((i == 4) || (i == 5)) continue;
++              if(ptrace(PTRACE_POKEUSR, pid, &dummy->u_debugreg[i],
++                        regs[i]) < 0)
++                      printk("write_debugregs - ptrace failed, "
++                             "errno = %d\n", errno);
++      }
++}
++
++static void read_debugregs(int pid, unsigned long *regs)
++{
++      struct user *dummy;
++      int nregs, i;
++
++      dummy = NULL;
++      nregs = sizeof(dummy->u_debugreg)/sizeof(dummy->u_debugreg[0]);
++      for(i = 0; i < nregs; i++){
++              regs[i] = ptrace(PTRACE_PEEKUSR, pid, 
++                               &dummy->u_debugreg[i], 0);
++      }
++}
++
++/* Accessed only by the tracing thread */
++static unsigned long kernel_debugregs[8] = { [ 0 ... 7 ] = 0 };
++static int debugregs_seq = 0;
++
++void arch_enter_kernel(void *task, int pid)
++{
++      read_debugregs(pid, TASK_DEBUGREGS(task));
++      write_debugregs(pid, kernel_debugregs);
++}
++
++void arch_leave_kernel(void *task, int pid)
++{
++      read_debugregs(pid, kernel_debugregs);
++      write_debugregs(pid, TASK_DEBUGREGS(task));
++}
++
++void ptrace_pokeuser(unsigned long addr, unsigned long data)
++{
++      if((addr < offsetof(struct user, u_debugreg[0])) ||
++         (addr > offsetof(struct user, u_debugreg[7])))
++              return;
++      addr -= offsetof(struct user, u_debugreg[0]);
++      addr = addr >> 2;
++      if(kernel_debugregs[addr] == data) return;
++
++      kernel_debugregs[addr] = data;
++      debugregs_seq++;
++}
++
++static void update_debugregs_cb(void *arg)
++{
++      int pid = *((int *) arg);
++
++      write_debugregs(pid, kernel_debugregs);
++}
++
++void update_debugregs(int seq)
++{
++      int me;
++
++      if(seq == debugregs_seq) return;
++
++      me = os_getpid();
++      initial_thread_cb(update_debugregs_cb, &me);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
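
read_debugregs() and write_debugregs() above shuttle the host's debug registers through
PTRACE_PEEKUSR/POKEUSR at offsets computed from struct user. Below is a stand-alone
sketch of the read side only: it assumes a Linux/x86 host, uses glibc's PTRACE_PEEKUSER
spelling, and omits error handling.

    #include <stdio.h>
    #include <stddef.h>
    #include <signal.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/ptrace.h>
    #include <sys/user.h>
    #include <sys/wait.h>

    int main(void)
    {
            pid_t pid = fork();

            if (pid == 0) {                         /* child: let the parent trace us */
                    ptrace(PTRACE_TRACEME, 0, 0, 0);
                    raise(SIGSTOP);
                    _exit(0);
            }
            waitpid(pid, NULL, 0);                  /* child is now stopped */

            for (int i = 0; i < 8; i++) {
                    if (i == 4 || i == 5)           /* DR4/DR5 are not real registers */
                            continue;
                    size_t off = offsetof(struct user, u_debugreg) +
                            i * sizeof(((struct user *)0)->u_debugreg[0]);
                    printf("dr%d = %#lx\n", i,
                           ptrace(PTRACE_PEEKUSER, pid, (void *)off, 0));
            }
            kill(pid, SIGKILL);
            return 0;
    }
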
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/sigcontext.c um/arch/um/sys-i386/sigcontext.c
+--- orig/arch/um/sys-i386/sigcontext.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/sigcontext.c   Mon Dec  2 23:20:13 2002
+@@ -0,0 +1,80 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stddef.h>
++#include <string.h>
++#include <asm/ptrace.h>
++#include <asm/sigcontext.h>
++#include "sysdep/ptrace.h"
++#include "kern_util.h"
++#include "frame_user.h"
++
++int sc_size(void *data)
++{
++      struct arch_frame_data *arch = data;
++
++      return(sizeof(struct sigcontext) + arch->fpstate_size);
++}
++
++void sc_to_sc(void *to_ptr, void *from_ptr)
++{
++      struct sigcontext *to = to_ptr, *from = from_ptr;
++      int size = sizeof(*to) + signal_frame_sc.common.arch.fpstate_size;
++
++      memcpy(to, from, size);
++      if(from->fpstate != NULL) to->fpstate = (struct _fpstate *) (to + 1);
++}
++
++unsigned long *sc_sigmask(void *sc_ptr)
++{
++      struct sigcontext *sc = sc_ptr;
++
++      return(&sc->oldmask);
++}
++
++int sc_get_fpregs(unsigned long buf, void *sc_ptr)
++{
++      struct sigcontext *sc = sc_ptr;
++      struct _fpstate *from = sc->fpstate, *to = (struct _fpstate *) buf;
++      int err = 0;
++
++      if(from == NULL){
++              err |= clear_user_proc(&to->cw, sizeof(to->cw));
++              err |= clear_user_proc(&to->sw, sizeof(to->sw));
++              err |= clear_user_proc(&to->tag, sizeof(to->tag));
++              err |= clear_user_proc(&to->ipoff, sizeof(to->ipoff));
++              err |= clear_user_proc(&to->cssel, sizeof(to->cssel));
++              err |= clear_user_proc(&to->dataoff, sizeof(to->dataoff));
++              err |= clear_user_proc(&to->datasel, sizeof(to->datasel));
++              err |= clear_user_proc(&to->_st, sizeof(to->_st));
++      }
++      else {
++              err |= copy_to_user_proc(&to->cw, &from->cw, sizeof(to->cw));
++              err |= copy_to_user_proc(&to->sw, &from->sw, sizeof(to->sw));
++              err |= copy_to_user_proc(&to->tag, &from->tag, 
++                                       sizeof(to->tag));
++              err |= copy_to_user_proc(&to->ipoff, &from->ipoff, 
++                                       sizeof(to->ipoff));
++              err |= copy_to_user_proc(&to->cssel,& from->cssel, 
++                                       sizeof(to->cssel));
++              err |= copy_to_user_proc(&to->dataoff, &from->dataoff, 
++                                  sizeof(to->dataoff));
++              err |= copy_to_user_proc(&to->datasel, &from->datasel, 
++                                  sizeof(to->datasel));
++              err |= copy_to_user_proc(to->_st, from->_st, sizeof(to->_st));
++      }
++      return(err);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
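
sc_to_sc() above copies a sigcontext together with the _fpstate that sits right behind
it, then re-points the copy's fpstate pointer just past the copied sigcontext. The same
pointer fixup in isolation, using a stand-in struct rather than the real sigcontext:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct ctx {
            int regs[4];
            void *extra;            /* points at a trailing blob, or NULL */
    };

    static void ctx_copy(struct ctx *to, const struct ctx *from, size_t extra_size)
    {
            memcpy(to, from, sizeof(*to) + extra_size);
            if (from->extra != NULL)
                    to->extra = to + 1;     /* the blob was copied right after the struct */
    }

    int main(void)
    {
            size_t extra = 16;
            struct ctx *src = calloc(1, sizeof(*src) + extra);
            struct ctx *dst = calloc(1, sizeof(*dst) + extra);

            src->extra = src + 1;
            ctx_copy(dst, src, extra);
            printf("extra now points at %p (dst + 1 is %p)\n",
                   dst->extra, (void *)(dst + 1));
            free(src);
            free(dst);
            return 0;
    }
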
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/syscalls.c um/arch/um/sys-i386/syscalls.c
+--- orig/arch/um/sys-i386/syscalls.c   Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/syscalls.c     Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,68 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "asm/mman.h"
++#include "asm/uaccess.h"
++#include "asm/unistd.h"
++
++/*
++ * Perform the select(nd, in, out, ex, tv) and mmap() system
++ * calls. Linux/i386 originally could not handle more than
++ * 4 system call parameters, so these system calls used a memory
++ * block for parameter passing.
++ */
++
++struct mmap_arg_struct {
++      unsigned long addr;
++      unsigned long len;
++      unsigned long prot;
++      unsigned long flags;
++      unsigned long fd;
++      unsigned long offset;
++};
++
++extern int old_mmap(unsigned long addr, unsigned long len,
++                  unsigned long prot, unsigned long flags,
++                  unsigned long fd, unsigned long offset);
++
++int old_mmap_i386(struct mmap_arg_struct *arg)
++{
++      struct mmap_arg_struct a;
++      int err = -EFAULT;
++
++      if (copy_from_user(&a, arg, sizeof(a)))
++              goto out;
++
++      err = old_mmap(a.addr, a.len, a.prot, a.flags, a.fd, a.offset);
++ out:
++      return err;
++}
++
++struct sel_arg_struct {
++      unsigned long n;
++      fd_set *inp, *outp, *exp;
++      struct timeval *tvp;
++};
++
++int old_select(struct sel_arg_struct *arg)
++{
++      struct sel_arg_struct a;
++
++      if (copy_from_user(&a, arg, sizeof(a)))
++              return -EFAULT;
++      /* sys_select() does the appropriate kernel locking */
++      return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
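
old_mmap_i386() and old_select() above handle the historical i386 convention in which
the six (or five) arguments are packed into a struct and a single pointer is passed. A
user-space illustration of the pack-and-unpack step only -- it does not issue the old
syscall, and fake_old_mmap() is an invented stand-in for the kernel entry point.

    #include <stdio.h>
    #include <string.h>

    struct mmap_arg_struct {
            unsigned long addr, len, prot, flags, fd, offset;
    };

    static long fake_old_mmap(const struct mmap_arg_struct *user_arg)
    {
            struct mmap_arg_struct a;

            memcpy(&a, user_arg, sizeof(a));        /* the kernel uses copy_from_user() */
            printf("mmap(addr=%#lx, len=%lu, prot=%#lx, flags=%#lx, fd=%ld, off=%lu)\n",
                   a.addr, a.len, a.prot, a.flags, (long)a.fd, a.offset);
            return 0;
    }

    int main(void)
    {
            struct mmap_arg_struct args = { 0, 4096, 0x3, 0x22, (unsigned long)-1, 0 };

            return (int)fake_old_mmap(&args);
    }
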
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/sysrq.c um/arch/um/sys-i386/sysrq.c
+--- orig/arch/um/sys-i386/sysrq.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/sysrq.c        Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,30 @@
++#include "linux/kernel.h"
++#include "linux/smp.h"
++#include "linux/sched.h"
++#include "asm/ptrace.h"
++#include "sysrq.h"
++
++void show_regs(struct pt_regs *regs)
++{
++        printk("\n");
++        printk("EIP: %04lx:[<%08lx>] CPU: %d %s", 
++             0xffff & PT_REGS_CS(regs), PT_REGS_IP(regs),
++             smp_processor_id(), print_tainted());
++        if (PT_REGS_CS(regs) & 3)
++                printk(" ESP: %04lx:%08lx", 0xffff & PT_REGS_SS(regs),
++                     PT_REGS_SP(regs));
++        printk(" EFLAGS: %08lx\n    %s\n", PT_REGS_EFLAGS(regs),
++             print_tainted());
++        printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
++                PT_REGS_EAX(regs), PT_REGS_EBX(regs), 
++             PT_REGS_ECX(regs), 
++             PT_REGS_EDX(regs));
++        printk("ESI: %08lx EDI: %08lx EBP: %08lx",
++             PT_REGS_ESI(regs), PT_REGS_EDI(regs), 
++             PT_REGS_EBP(regs));
++        printk(" DS: %04lx ES: %04lx\n",
++             0xffff & PT_REGS_DS(regs), 
++             0xffff & PT_REGS_ES(regs));
++
++        show_trace((unsigned long *) &regs);
++}
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/util/Makefile um/arch/um/sys-i386/util/Makefile
+--- orig/arch/um/sys-i386/util/Makefile        Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/util/Makefile  Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,28 @@
++EXE = mk_sc mk_thread
++
++include $(TOPDIR)/Rules.make
++
++all : $(EXE)
++
++mk_sc : mk_sc.o
++      $(CC) -o mk_sc mk_sc.o
++
++mk_sc.o : mk_sc.c
++      $(CC) -c $< 
++
++mk_thread : mk_thread_user.o mk_thread_kern.o
++      $(CC) -o mk_thread mk_thread_user.o mk_thread_kern.o
++
++mk_thread_user.o : mk_thread_user.c
++      $(CC) -c $< 
++
++mk_thread_kern.o : mk_thread_kern.c
++      $(CC) $(CFLAGS) -c $< 
++
++clean :
++      $(RM) $(EXE) *.o
++
++archmrproper : clean
++
++fastdep :
++
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/util/mk_sc.c um/arch/um/sys-i386/util/mk_sc.c
+--- orig/arch/um/sys-i386/util/mk_sc.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/util/mk_sc.c   Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,51 @@
++#include <stdio.h>
++#include <signal.h>
++#include <linux/stddef.h>
++
++#define SC_OFFSET(name, field) \
++  printf("#define " name "(sc) *((unsigned long *) &(((char *) (sc))[%d]))\n",\
++       offsetof(struct sigcontext, field))
++
++#define SC_FP_OFFSET(name, field) \
++  printf("#define " name \
++       "(sc) *((unsigned long *) &(((char *) (SC_FPSTATE(sc)))[%d]))\n",\
++       offsetof(struct _fpstate, field))
++
++#define SC_FP_OFFSET_PTR(name, field, type) \
++  printf("#define " name \
++       "(sc) ((" type " *) &(((char *) (SC_FPSTATE(sc)))[%d]))\n",\
++       offsetof(struct _fpstate, field))
++
++int main(int argc, char **argv)
++{
++  SC_OFFSET("SC_IP", eip);
++  SC_OFFSET("SC_SP", esp);
++  SC_OFFSET("SC_FS", fs);
++  SC_OFFSET("SC_GS", gs);
++  SC_OFFSET("SC_DS", ds);
++  SC_OFFSET("SC_ES", es);
++  SC_OFFSET("SC_SS", ss);
++  SC_OFFSET("SC_CS", cs);
++  SC_OFFSET("SC_EFLAGS", eflags);
++  SC_OFFSET("SC_EAX", eax);
++  SC_OFFSET("SC_EBX", ebx);
++  SC_OFFSET("SC_ECX", ecx);
++  SC_OFFSET("SC_EDX", edx);
++  SC_OFFSET("SC_EDI", edi);
++  SC_OFFSET("SC_ESI", esi);
++  SC_OFFSET("SC_EBP", ebp);
++  SC_OFFSET("SC_TRAPNO", trapno);
++  SC_OFFSET("SC_ERR", err);
++  SC_OFFSET("SC_CR2", cr2);
++  SC_OFFSET("SC_FPSTATE", fpstate);
++  SC_FP_OFFSET("SC_FP_CW", cw);
++  SC_FP_OFFSET("SC_FP_SW", sw);
++  SC_FP_OFFSET("SC_FP_TAG", tag);
++  SC_FP_OFFSET("SC_FP_IPOFF", ipoff);
++  SC_FP_OFFSET("SC_FP_CSSEL", cssel);
++  SC_FP_OFFSET("SC_FP_DATAOFF", dataoff);
++  SC_FP_OFFSET("SC_FP_DATASEL", datasel);
++  SC_FP_OFFSET_PTR("SC_FP_ST", _st, "struct _fpstate");
++  SC_FP_OFFSET_PTR("SC_FXSR_ENV", _fxsr_env, "void");
++  return(0);
++}
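
mk_sc.c above is a build-time generator: it is compiled against the real sigcontext
definition and prints #define macros encoding each field's byte offset, so kernel-side
code can poke a sigcontext without pulling in libc headers. The same pattern on an
example struct (the struct and macro names here are made up):

    #include <stdio.h>
    #include <stddef.h>

    struct example_frame {
            unsigned long ip;
            unsigned long sp;
            unsigned long flags;
    };

    #define GEN_OFFSET(name, field) \
            printf("#define " name "(f) " \
                   "(*((unsigned long *) &(((char *) (f))[%zu])))\n", \
                   offsetof(struct example_frame, field))

    int main(void)
    {
            GEN_OFFSET("FRAME_IP", ip);
            GEN_OFFSET("FRAME_SP", sp);
            GEN_OFFSET("FRAME_FLAGS", flags);
            return 0;
    }

Redirecting the generator's output yields a header of accessor macros, which is how the
mk_sc, mk_thread, mk_task and mk_constants utilities in this patch are meant to be used.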
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/util/mk_thread_kern.c um/arch/um/sys-i386/util/mk_thread_kern.c
+--- orig/arch/um/sys-i386/util/mk_thread_kern.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/util/mk_thread_kern.c  Mon Dec  9 23:24:38 2002
+@@ -0,0 +1,22 @@
++#include "linux/config.h"
++#include "linux/stddef.h"
++#include "linux/sched.h"
++
++extern void print_head(void);
++extern void print_constant_ptr(char *name, int value);
++extern void print_constant(char *name, char *type, int value);
++extern void print_tail(void);
++
++#define THREAD_OFFSET(field) offsetof(struct task_struct, thread.field)
++
++int main(int argc, char **argv)
++{
++  print_head();
++  print_constant_ptr("TASK_DEBUGREGS", THREAD_OFFSET(arch.debugregs));
++#ifdef CONFIG_MODE_TT
++  print_constant("TASK_EXTERN_PID", "int", THREAD_OFFSET(mode.tt.extern_pid));
++#endif
++  print_tail();
++  return(0);
++}
++
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/util/mk_thread_user.c um/arch/um/sys-i386/util/mk_thread_user.c
+--- orig/arch/um/sys-i386/util/mk_thread_user.c        Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/util/mk_thread_user.c  Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,30 @@
++#include <stdio.h>
++
++void print_head(void)
++{
++  printf("/*\n");
++  printf(" * Generated by mk_thread\n");
++  printf(" */\n");
++  printf("\n");
++  printf("#ifndef __UM_THREAD_H\n");
++  printf("#define __UM_THREAD_H\n");
++  printf("\n");
++}
++
++void print_constant_ptr(char *name, int value)
++{
++  printf("#define %s(task) ((unsigned long *) "
++       "&(((char *) (task))[%d]))\n", name, value);
++}
++
++void print_constant(char *name, char *type, int value)
++{
++  printf("#define %s(task) *((%s *) &(((char *) (task))[%d]))\n", name, type, 
++       value);
++}
++
++void print_tail(void)
++{
++  printf("\n");
++  printf("#endif\n");
++}
+diff -Naur -X ../exclude-files orig/arch/um/sys-ia64/Makefile um/arch/um/sys-ia64/Makefile
+--- orig/arch/um/sys-ia64/Makefile     Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-ia64/Makefile       Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,26 @@
++OBJ = sys.o
++
++OBJS =
++
++all: $(OBJ)
++
++$(OBJ): $(OBJS)
++      rm -f $@
++      $(LD) $(LINKFLAGS) --start-group $^ --end-group -o $@
++clean:
++      rm -f $(OBJS)
++
++fastdep:
++
++archmrproper:
++
++archclean:
++      rm -f link.ld
++      @$(MAKEBOOT) clean
++
++archdep:
++      @$(MAKEBOOT) dep
++
++modules:
++
++include $(TOPDIR)/Rules.make
+diff -Naur -X ../exclude-files orig/arch/um/sys-ppc/Makefile um/arch/um/sys-ppc/Makefile
+--- orig/arch/um/sys-ppc/Makefile      Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-ppc/Makefile        Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,80 @@
++OBJ = sys.o
++
++.S.o:
++      $(CC) $(AFLAGS) -D__ASSEMBLY__ -D__UM_PPC__ -c $< -o $*.o
++
++OBJS = ptrace.o sigcontext.o semaphore.o checksum.o miscthings.o misc.o \
++      ptrace_user.o sysrq.o
++
++EXTRA_AFLAGS := -DCONFIG_ALL_PPC -I. -I$(TOPDIR)/arch/ppc/kernel
++
++all: $(OBJ)
++
++$(OBJ): $(OBJS)
++      rm -f $@
++      $(LD) $(LINKFLAGS) --start-group $^ --end-group -o $@
++
++ptrace_user.o: ptrace_user.c
++      $(CC) -D__KERNEL__ $(USER_CFLAGS) $(EXTRA_CFLAGS) -c -o $@ $<
++
++sigcontext.o: sigcontext.c
++      $(CC) $(USER_CFLAGS) $(EXTRA_CFLAGS) -c -o $@ $<
++
++semaphore.c:
++      rm -f $@
++      ln -s $(TOPDIR)/arch/ppc/kernel/$@ $@
++
++checksum.S:
++      rm -f $@
++      ln -s $(TOPDIR)/arch/ppc/lib/$@ $@
++
++mk_defs.c:
++      rm -f $@
++      ln -s $(TOPDIR)/arch/ppc/kernel/$@ $@
++
++ppc_defs.head:
++      rm -f $@
++      ln -s $(TOPDIR)/arch/ppc/kernel/$@ $@
++
++ppc_defs.h: mk_defs.c ppc_defs.head \
++              $(TOPDIR)/include/asm-ppc/mmu.h \
++              $(TOPDIR)/include/asm-ppc/processor.h \
++              $(TOPDIR)/include/asm-ppc/pgtable.h \
++              $(TOPDIR)/include/asm-ppc/ptrace.h
++#     $(CC) $(CFLAGS) -S mk_defs.c
++      cp ppc_defs.head ppc_defs.h
++# for bk, this way we can write to the file even if it's not checked out
++      echo '#define THREAD 608' >> ppc_defs.h
++      echo '#define PT_REGS 8' >> ppc_defs.h
++      echo '#define CLONE_VM 256' >> ppc_defs.h
++#     chmod u+w ppc_defs.h
++#     grep '^#define' mk_defs.s >> ppc_defs.h
++#     rm mk_defs.s
++
++# the asm link is horrible, and breaks the other targets.  This is also
++# not going to work with parallel makes.
++
++checksum.o: checksum.S
++      rm -f asm
++      ln -s $(TOPDIR)/include/asm-ppc asm
++      $(CC) $(EXTRA_AFLAGS) $(AFLAGS) -D__ASSEMBLY__ -D__UM_PPC__ -c $< -o $*.o
++      rm -f asm
++
++misc.o: misc.S ppc_defs.h
++      rm -f asm
++      ln -s $(TOPDIR)/include/asm-ppc asm
++      $(CC) $(EXTRA_AFLAGS) $(AFLAGS) -D__ASSEMBLY__ -D__UM_PPC__ -c $< -o $*.o
++      rm -f asm
++
++clean:
++      rm -f $(OBJS)
++      rm -f ppc_defs.h
++      rm -f checksum.S semaphore.c mk_defs.c
++
++fastdep:
++
++dep:
++
++modules:
++
++include $(TOPDIR)/Rules.make
+diff -Naur -X ../exclude-files orig/arch/um/sys-ppc/misc.S um/arch/um/sys-ppc/misc.S
+--- orig/arch/um/sys-ppc/misc.S        Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-ppc/misc.S  Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,116 @@
++/*
++ * This file contains miscellaneous low-level functions.
++ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
++ *
++ * Largely rewritten by Cort Dougan (cort@cs.nmt.edu)
++ * and Paul Mackerras.
++ *
++ * A couple of functions stolen from arch/ppc/kernel/misc.S for UML
++ * by Chris Emerson.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ *
++ */
++
++#include <linux/config.h>
++#include <asm/processor.h>
++#include "ppc_asm.h"
++
++#if defined(CONFIG_4xx) || defined(CONFIG_8xx)
++#define CACHE_LINE_SIZE               16
++#define LG_CACHE_LINE_SIZE    4
++#define MAX_COPY_PREFETCH     1
++#elif !defined(CONFIG_PPC64BRIDGE)
++#define CACHE_LINE_SIZE               32
++#define LG_CACHE_LINE_SIZE    5
++#define MAX_COPY_PREFETCH     4
++#else
++#define CACHE_LINE_SIZE               128
++#define LG_CACHE_LINE_SIZE    7
++#define MAX_COPY_PREFETCH     1
++#endif /* CONFIG_4xx || CONFIG_8xx */
++
++      .text
++
++/*
++ * Clear a page using the dcbz instruction, which doesn't cause any
++ * memory traffic (except to write out any cache lines which get
++ * displaced).  This only works on cacheable memory.
++ */
++_GLOBAL(clear_page)
++      li      r0,4096/CACHE_LINE_SIZE
++      mtctr   r0
++#ifdef CONFIG_8xx
++      li      r4, 0
++1:    stw     r4, 0(r3)
++      stw     r4, 4(r3)
++      stw     r4, 8(r3)
++      stw     r4, 12(r3)
++#else
++1:    dcbz    0,r3
++#endif
++      addi    r3,r3,CACHE_LINE_SIZE
++      bdnz    1b
++      blr
++
++/*
++ * Copy a whole page.  We use the dcbz instruction on the destination
++ * to reduce memory traffic (it eliminates the unnecessary reads of
++ * the destination into cache).  This requires that the destination
++ * is cacheable.
++ */
++#define COPY_16_BYTES         \
++      lwz     r6,4(r4);       \
++      lwz     r7,8(r4);       \
++      lwz     r8,12(r4);      \
++      lwzu    r9,16(r4);      \
++      stw     r6,4(r3);       \
++      stw     r7,8(r3);       \
++      stw     r8,12(r3);      \
++      stwu    r9,16(r3)
++
++_GLOBAL(copy_page)
++      addi    r3,r3,-4
++      addi    r4,r4,-4
++      li      r5,4
++
++#ifndef CONFIG_8xx
++#if MAX_COPY_PREFETCH > 1
++      li      r0,MAX_COPY_PREFETCH
++      li      r11,4
++      mtctr   r0
++11:   dcbt    r11,r4
++      addi    r11,r11,CACHE_LINE_SIZE
++      bdnz    11b
++#else /* MAX_COPY_PREFETCH == 1 */
++      dcbt    r5,r4
++      li      r11,CACHE_LINE_SIZE+4
++#endif /* MAX_COPY_PREFETCH */
++#endif /* CONFIG_8xx */
++
++      li      r0,4096/CACHE_LINE_SIZE
++      mtctr   r0
++1:
++#ifndef CONFIG_8xx
++      dcbt    r11,r4
++      dcbz    r5,r3
++#endif
++      COPY_16_BYTES
++#if CACHE_LINE_SIZE >= 32
++      COPY_16_BYTES
++#if CACHE_LINE_SIZE >= 64
++      COPY_16_BYTES
++      COPY_16_BYTES
++#if CACHE_LINE_SIZE >= 128
++      COPY_16_BYTES
++      COPY_16_BYTES
++      COPY_16_BYTES
++      COPY_16_BYTES
++#endif
++#endif
++#endif
++      bdnz    1b
++      blr
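
clear_page above zeroes a page one cache line at a time, using dcbz so a line is zeroed
in the cache without first being read from memory. A plain C analogue of the loop
structure only (C has no dcbz, so this reproduces the shape, not the performance trick):

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE        4096
    #define CACHE_LINE_SIZE  32     /* the "classic PPC" value from the #ifdefs above */

    static void clear_page_c(void *page)
    {
            char *p = page;
            int i;

            for (i = 0; i < PAGE_SIZE / CACHE_LINE_SIZE; i++, p += CACHE_LINE_SIZE)
                    memset(p, 0, CACHE_LINE_SIZE);  /* dcbz avoids the read-for-ownership */
    }

    int main(void)
    {
            static char page[PAGE_SIZE];

            memset(page, 0xff, sizeof(page));
            clear_page_c(page);
            printf("page[0] = %d, page[%d] = %d\n",
                   page[0], PAGE_SIZE - 1, page[PAGE_SIZE - 1]);
            return 0;
    }
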
+diff -Naur -X ../exclude-files orig/arch/um/sys-ppc/miscthings.c um/arch/um/sys-ppc/miscthings.c
+--- orig/arch/um/sys-ppc/miscthings.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-ppc/miscthings.c    Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,53 @@
++#include "linux/threads.h"
++#include "linux/stddef.h"  // for NULL
++#include "linux/elf.h"  // for AT_NULL
++
++/* The following function was nicked from arch/ppc/kernel/process.c and
++ * adapted slightly */
++/*
++ * XXX ld.so expects the auxiliary table to start on
++ * a 16-byte boundary, so we have to find it and
++ * move it up. :-(
++ */
++void shove_aux_table(unsigned long sp)
++{
++      int argc;
++      char *p;
++      unsigned long e;
++      unsigned long aux_start, offset;
++
++      argc = *(int *)sp;
++      sp += sizeof(int) + (argc + 1) * sizeof(char *);
++      /* skip over the environment pointers */
++      do {
++              p = *(char **)sp;
++              sp += sizeof(char *);
++      } while (p != NULL);
++      aux_start = sp;
++      /* skip to the end of the auxiliary table */
++      do {
++              e = *(unsigned long *)sp;
++              sp += 2 * sizeof(unsigned long);
++      } while (e != AT_NULL);
++      offset = ((aux_start + 15) & ~15) - aux_start;
++      if (offset != 0) {
++              do {
++                      sp -= sizeof(unsigned long);
++                      e = *(unsigned long *)sp;
++                      *(unsigned long *)(sp + offset) = e;
++              } while (sp > aux_start);
++      }
++}
++/* END stuff taken from arch/ppc/kernel/process.c */
++
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
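
shove_aux_table() above walks the initial process stack -- argc, the argv pointers, the
NULL-terminated environment -- to find the start of the ELF auxiliary table before
sliding it up to a 16-byte boundary. The walk in isolation, against a fabricated stack
image (a little-endian host is assumed, and the argc slot here occupies a full long):

    #include <stdio.h>

    #define AT_NULL 0

    static unsigned long find_auxv(unsigned long sp)
    {
            int argc = *(int *)sp;
            char **p;

            sp += sizeof(long) + (argc + 1) * sizeof(char *);
            p = (char **)sp;
            while (*p++ != NULL)            /* skip the environment pointers */
                    ;
            return (unsigned long)p;        /* first auxv entry */
    }

    int main(void)
    {
            unsigned long stack[8];

            stack[0] = 2;                   /* argc */
            stack[1] = stack[2] = 1;        /* dummy argv[0], argv[1] */
            stack[3] = 0;                   /* argv terminator */
            stack[4] = 0;                   /* empty environment, just the NULL */
            stack[5] = AT_NULL;             /* auxv terminator pair */
            stack[6] = 0;

            printf("auxv starts at stack slot %ld\n",
                   (long)((find_auxv((unsigned long)stack) -
                           (unsigned long)stack) / sizeof(long)));
            return 0;
    }
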
+diff -Naur -X ../exclude-files orig/arch/um/sys-ppc/ptrace.c um/arch/um/sys-ppc/ptrace.c
+--- orig/arch/um/sys-ppc/ptrace.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-ppc/ptrace.c        Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,28 @@
++#include "linux/sched.h"
++#include "asm/ptrace.h"
++
++int putreg(struct task_struct *child, unsigned long regno, 
++                unsigned long value)
++{
++      child->thread.process_regs.regs[regno >> 2] = value;
++      return 0;
++}
++
++unsigned long getreg(struct task_struct *child, unsigned long regno)
++{
++      unsigned long retval = ~0UL;
++
++      retval &= child->thread.process_regs.regs[regno >> 2];
++      return retval;
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/sys-ppc/ptrace_user.c um/arch/um/sys-ppc/ptrace_user.c
+--- orig/arch/um/sys-ppc/ptrace_user.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-ppc/ptrace_user.c   Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,40 @@
++#include <sys/ptrace.h>
++#include <errno.h>
++#include <asm/ptrace.h>
++#include "sysdep/ptrace.h"
++
++int ptrace_getregs(long pid, unsigned long *regs_out)
++{
++    int i;
++    for (i=0; i < sizeof(struct sys_pt_regs)/sizeof(PPC_REG); ++i) {
++      errno = 0;
++      regs_out->regs[i] = ptrace(PTRACE_PEEKUSER, pid, i*4, 0);
++      if (errno) {
++          return -errno;
++      }
++    }
++    return 0;
++}
++
++int ptrace_setregs(long pid, unsigned long *regs_in)
++{
++    int i;
++    for (i=0; i < sizeof(struct sys_pt_regs)/sizeof(PPC_REG); ++i) {
++      if (i != 34 /* FIXME: PT_ORIG_R3 */ && i <= PT_MQ) {
++          if (ptrace(PTRACE_POKEUSER, pid, i*4, regs_in->regs[i]) < 0) {
++              return -errno;
++          }
++      }
++    }
++    return 0;
++}
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/sys-ppc/sigcontext.c um/arch/um/sys-ppc/sigcontext.c
+--- orig/arch/um/sys-ppc/sigcontext.c  Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-ppc/sigcontext.c    Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,15 @@
++#include "asm/ptrace.h"
++#include "asm/sigcontext.h"
++#include "sysdep/ptrace.h"
++#include "user_util.h"
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/sys-ppc/sysrq.c um/arch/um/sys-ppc/sysrq.c
+--- orig/arch/um/sys-ppc/sysrq.c       Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-ppc/sysrq.c Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,43 @@
++/* 
++ * Copyright (C) 2001 Chris Emerson (cemerson@chiark.greenend.org.uk)
++ * Licensed under the GPL
++ */
++
++#include "linux/kernel.h"
++#include "linux/smp.h"
++#include "asm/ptrace.h"
++#include "sysrq.h"
++
++void show_regs(struct pt_regs_subarch *regs)
++{
++      printk("\n");
++      printk("show_regs(): insert regs here.\n");
++#if 0
++        printk("\n");
++        printk("EIP: %04x:[<%08lx>] CPU: %d",0xffff & regs->xcs, regs->eip,
++             smp_processor_id());
++        if (regs->xcs & 3)
++                printk(" ESP: %04x:%08lx",0xffff & regs->xss, regs->esp);
++        printk(" EFLAGS: %08lx\n", regs->eflags);
++        printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
++                regs->eax, regs->ebx, regs->ecx, regs->edx);
++        printk("ESI: %08lx EDI: %08lx EBP: %08lx",
++                regs->esi, regs->edi, regs->ebp);
++        printk(" DS: %04x ES: %04x\n",
++                0xffff & regs->xds, 0xffff & regs->xes);
++#endif
++
++        show_trace(&regs->gpr[1]);
++}
++
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/util/Makefile um/arch/um/util/Makefile
+--- orig/arch/um/util/Makefile Wed Dec 31 19:00:00 1969
++++ um/arch/um/util/Makefile   Wed Oct 23 21:09:14 2002
+@@ -0,0 +1,26 @@
++ALL = mk_task mk_constants
++
++all : $(ALL)
++
++mk_task : mk_task_user.o mk_task_kern.o
++      $(CC) -o mk_task mk_task_user.o mk_task_kern.o
++
++mk_task_user.o : mk_task_user.c
++      $(CC) -c $< 
++
++mk_task_kern.o : mk_task_kern.c
++      $(CC) $(CFLAGS) -c $< 
++
++mk_constants : mk_constants_user.o mk_constants_kern.o
++      $(CC) -o mk_constants mk_constants_user.o mk_constants_kern.o
++
++mk_constants_user.o : mk_constants_user.c
++      $(CC) -c $< 
++
++mk_constants_kern.o : mk_constants_kern.c
++      $(CC) $(CFLAGS) -c $< 
++
++clean :
++      $(RM) $(ALL) *.o *~
++
++archmrproper : clean
+diff -Naur -X ../exclude-files orig/arch/um/util/mk_constants_kern.c um/arch/um/util/mk_constants_kern.c
+--- orig/arch/um/util/mk_constants_kern.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/util/mk_constants_kern.c        Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,24 @@
++#include "linux/kernel.h"
++#include "linux/stringify.h"
++#include "asm/page.h"
++
++extern void print_head(void);
++extern void print_constant_str(char *name, char *value);
++extern void print_constant_int(char *name, int value);
++extern void print_tail(void);
++
++int main(int argc, char **argv)
++{
++  print_head();
++  print_constant_int("UM_KERN_PAGE_SIZE", PAGE_SIZE);
++  print_constant_str("UM_KERN_EMERG", KERN_EMERG);
++  print_constant_str("UM_KERN_ALERT", KERN_ALERT);
++  print_constant_str("UM_KERN_CRIT", KERN_CRIT);
++  print_constant_str("UM_KERN_ERR", KERN_ERR);
++  print_constant_str("UM_KERN_WARNING", KERN_WARNING);
++  print_constant_str("UM_KERN_NOTICE", KERN_NOTICE);
++  print_constant_str("UM_KERN_INFO", KERN_INFO);
++  print_constant_str("UM_KERN_DEBUG", KERN_DEBUG);
++  print_tail();
++  return(0);
++}
+diff -Naur -X ../exclude-files orig/arch/um/util/mk_constants_user.c um/arch/um/util/mk_constants_user.c
+--- orig/arch/um/util/mk_constants_user.c      Wed Dec 31 19:00:00 1969
++++ um/arch/um/util/mk_constants_user.c        Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,28 @@
++#include <stdio.h>
++
++void print_head(void)
++{
++  printf("/*\n");
++  printf(" * Generated by mk_constants\n");
++  printf(" */\n");
++  printf("\n");
++  printf("#ifndef __UM_CONSTANTS_H\n");
++  printf("#define __UM_CONSTANTS_H\n");
++  printf("\n");
++}
++
++void print_constant_str(char *name, char *value)
++{
++  printf("#define %s \"%s\"\n", name, value);
++}
++
++void print_constant_int(char *name, int value)
++{
++  printf("#define %s %d\n", name, value);
++}
++
++void print_tail(void)
++{
++  printf("\n");
++  printf("#endif\n");
++}
+diff -Naur -X ../exclude-files orig/arch/um/util/mk_task_kern.c um/arch/um/util/mk_task_kern.c
+--- orig/arch/um/util/mk_task_kern.c   Wed Dec 31 19:00:00 1969
++++ um/arch/um/util/mk_task_kern.c     Sun Dec  8 21:03:34 2002
+@@ -0,0 +1,17 @@
++#include "linux/sched.h"
++#include "linux/stddef.h"
++
++extern void print(char *name, char *type, int offset);
++extern void print_ptr(char *name, char *type, int offset);
++extern void print_head(void);
++extern void print_tail(void);
++
++int main(int argc, char **argv)
++{
++  print_head();
++  print_ptr("TASK_REGS", "union uml_pt_regs", 
++          offsetof(struct task_struct, thread.regs));
++  print("TASK_PID", "int", offsetof(struct task_struct, pid));
++  print_tail();
++  return(0);
++}
+diff -Naur -X ../exclude-files orig/arch/um/util/mk_task_user.c um/arch/um/util/mk_task_user.c
+--- orig/arch/um/util/mk_task_user.c   Wed Dec 31 19:00:00 1969
++++ um/arch/um/util/mk_task_user.c     Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,30 @@
++#include <stdio.h>
++
++void print(char *name, char *type, int offset)
++{
++  printf("#define %s(task) *((%s *) &(((char *) (task))[%d]))\n", name, type,
++       offset);
++}
++
++void print_ptr(char *name, char *type, int offset)
++{
++  printf("#define %s(task) ((%s *) &(((char *) (task))[%d]))\n", name, type,
++       offset);
++}
++
++void print_head(void)
++{
++  printf("/*\n");
++  printf(" * Generated by mk_task\n");
++  printf(" */\n");
++  printf("\n");
++  printf("#ifndef __TASK_H\n");
++  printf("#define __TASK_H\n");
++  printf("\n");
++}
++
++void print_tail(void)
++{
++  printf("\n");
++  printf("#endif\n");
++}
+diff -Naur -X ../exclude-files orig/drivers/char/Makefile um/drivers/char/Makefile
+--- orig/drivers/char/Makefile Thu Feb 27 13:04:15 2003
++++ um/drivers/char/Makefile   Thu Feb 27 13:05:21 2003
+@@ -95,6 +95,12 @@
+   endif
+ endif
++ifeq ($(ARCH),um)
++  KEYMAP   =
++  KEYBD    =
++  CONSOLE  =
++endif
++
+ ifeq ($(ARCH),sh)
+   KEYMAP   =
+   KEYBD    =
+diff -Naur -X ../exclude-files orig/drivers/char/tty_io.c um/drivers/char/tty_io.c
+--- orig/drivers/char/tty_io.c Thu Feb 27 13:04:15 2003
++++ um/drivers/char/tty_io.c   Thu Feb 27 13:05:21 2003
+@@ -637,6 +637,9 @@
+       wake_up_interruptible(&tty->write_wait);
+ }
++extern int write_tty_log(int fd, const unsigned char *buf, int len, void *tty,
++                       int direction);
++
+ static ssize_t tty_read(struct file * file, char * buf, size_t count, 
+                       loff_t *ppos)
+ {
+@@ -677,8 +680,13 @@
+       else
+               i = -EIO;
+       unlock_kernel();
+-      if (i > 0)
++      if (i > 0){
+               inode->i_atime = CURRENT_TIME;
++#ifdef CONFIG_TTY_LOG
++              if(tty->log_fd >= 0) 
++                write_tty_log(tty->log_fd, buf, i, tty, 1);
++#endif
++      }
+       return i;
+ }
+@@ -732,6 +740,10 @@
+       if (written) {
+               file->f_dentry->d_inode->i_mtime = CURRENT_TIME;
+               ret = written;
++#ifdef CONFIG_TTY_LOG
++              if(tty->log_fd >= 0) 
++                write_tty_log(tty->log_fd, buf - ret, ret, tty, 0);
++#endif
+       }
+       up(&tty->atomic_write);
+       return ret;
+@@ -945,6 +957,9 @@
+                       goto release_mem_out;
+               }
+       }
++#ifdef CONFIG_TTY_LOG
++      tty->log_fd = -1;
++#endif
+       goto success;
+       /*
+@@ -1039,6 +1054,8 @@
+       free_tty_struct(tty);
+ }
++extern int close_tty_log(int fd, void *tty);
++
+ /*
+  * Even releasing the tty structures is a tricky business.. We have
+  * to be very careful that the structures are all released at the
+@@ -1267,6 +1284,10 @@
+       run_task_queue(&tq_timer);
+       flush_scheduled_tasks();
++#ifdef CONFIG_TTY_LOG
++      if(tty->log_fd >= 0) close_tty_log(tty->log_fd, tty);
++#endif
++
+       /* 
+        * The release_mem function takes care of the details of clearing
+        * the slots and preserving the termios structure.
+@@ -1274,6 +1295,8 @@
+       release_mem(tty, idx);
+ }
++extern int open_tty_log(void *tty, void *current_tty); 
++
+ /*
+  * tty_open and tty_release keep up the tty count that contains the
+  * number of opens done on a tty. We cannot use the inode-count, as
+@@ -1425,6 +1448,11 @@
+                       nr_warns++;
+               }
+       }
++
++#ifdef CONFIG_TTY_LOG
++      if(tty->log_fd < 0)
++             tty->log_fd = open_tty_log(tty, current->tty);
++#endif
+       return 0;
+ }
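
The tty_io.c hunks above only add hooks: open_tty_log(), write_tty_log() and
close_tty_log() are declared extern and live elsewhere in the UML tree. Purely as an
illustration of the interface -- this is not UML's actual tty_log implementation, and
the record format below is invented for the example -- a minimal write_tty_log() could
look like:

    #include <unistd.h>

    struct tty_log_rec {
            void *tty;              /* which tty the traffic belongs to */
            int direction;          /* 1 = data read from the tty, 0 = data written */
            int len;
    };

    int write_tty_log(int fd, const unsigned char *buf, int len, void *tty,
                      int direction)
    {
            struct tty_log_rec rec = { tty, direction, len };

            if (write(fd, &rec, sizeof(rec)) != (ssize_t) sizeof(rec))
                    return -1;
            if (write(fd, buf, len) != (ssize_t) len)
                    return -1;
            return 0;
    }

    int main(void)
    {
            unsigned char msg[] = "hello from the tty\n";

            /* log to stdout just to exercise the function */
            return write_tty_log(STDOUT_FILENO, msg, sizeof(msg) - 1, (void *)0, 1);
    }
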
+diff -Naur -X ../exclude-files orig/drivers/net/setup.c um/drivers/net/setup.c
+--- orig/drivers/net/setup.c   Sun Sep 15 12:13:19 2002
++++ um/drivers/net/setup.c     Wed Oct 23 21:08:05 2002
+@@ -28,7 +28,6 @@
+ extern int lmc_setup(void);
+ extern int madgemc_probe(void);
+-extern int uml_net_probe(void);
+ /* Pad device name to IFNAMSIZ=16. F.e. __PAD6 is string of 9 zeros. */
+ #define __PAD6 "\0\0\0\0\0\0\0\0\0"
+@@ -102,9 +101,6 @@
+  */  
+ #ifdef CONFIG_MADGEMC
+       {madgemc_probe, 0},
+-#endif
+-#ifdef CONFIG_UML_NET
+-      {uml_net_probe, 0},
+ #endif
+  
+       {NULL, 0},
+diff -Naur -X ../exclude-files orig/include/asm-i386/hardirq.h um/include/asm-i386/hardirq.h
+--- orig/include/asm-i386/hardirq.h    Sun Sep 15 12:13:19 2002
++++ um/include/asm-i386/hardirq.h      Wed Apr 16 13:59:04 2003
+@@ -4,6 +4,7 @@
+ #include <linux/config.h>
+ #include <linux/threads.h>
+ #include <linux/irq.h>
++#include <asm/processor.h>            /* for cpu_relax */
+ /* assembly code in softirq.h is sensitive to the offsets of these fields */
+ typedef struct {
+diff -Naur -X ../exclude-files orig/include/asm-um/a.out.h um/include/asm-um/a.out.h
+--- orig/include/asm-um/a.out.h        Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/a.out.h  Sun Oct 27 11:54:50 2002
+@@ -0,0 +1,20 @@
++#ifndef __UM_A_OUT_H
++#define __UM_A_OUT_H
++
++#include "linux/config.h"
++#include "asm/arch/a.out.h"
++#include "choose-mode.h"
++
++#undef STACK_TOP
++
++extern unsigned long stacksizelim;
++
++extern unsigned long host_task_size;
++
++#define STACK_ROOM (stacksizelim)
++
++extern int honeypot;
++#define STACK_TOP \
++      CHOOSE_MODE((honeypot ? host_task_size : task_size), task_size)
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/arch-signal-i386.h um/include/asm-um/arch-signal-i386.h
+--- orig/include/asm-um/arch-signal-i386.h     Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/arch-signal-i386.h       Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,24 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_ARCH_SIGNAL_I386_H
++#define __UM_ARCH_SIGNAL_I386_H
++
++struct arch_signal_context {
++      unsigned long extrasigs[_NSIG_WORDS];
++};
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/include/asm-um/archparam-i386.h um/include/asm-um/archparam-i386.h
+--- orig/include/asm-um/archparam-i386.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/archparam-i386.h Sun Dec  8 20:09:11 2002
+@@ -0,0 +1,80 @@
++/* 
++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_ARCHPARAM_I386_H
++#define __UM_ARCHPARAM_I386_H
++
++/********* Bits for asm-um/elf.h ************/
++
++#include "user.h"
++
++#define ELF_PLATFORM "i586"
++
++#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3)
++
++typedef struct user_i387_struct elf_fpregset_t;
++typedef unsigned long elf_greg_t;
++
++#define ELF_NGREG (sizeof (struct user_regs_struct) / sizeof(elf_greg_t))
++typedef elf_greg_t elf_gregset_t[ELF_NGREG];
++
++#define ELF_DATA        ELFDATA2LSB
++#define ELF_ARCH        EM_386
++
++#define ELF_PLAT_INIT(regs) do { \
++      PT_REGS_EBX(regs) = 0; \
++      PT_REGS_ECX(regs) = 0; \
++      PT_REGS_EDX(regs) = 0; \
++      PT_REGS_ESI(regs) = 0; \
++      PT_REGS_EDI(regs) = 0; \
++      PT_REGS_EBP(regs) = 0; \
++      PT_REGS_EAX(regs) = 0; \
++} while(0)
++
++/* Shamelessly stolen from include/asm-i386/elf.h */
++
++#define ELF_CORE_COPY_REGS(pr_reg, regs) do { \
++      pr_reg[0] = PT_REGS_EBX(regs);          \
++      pr_reg[1] = PT_REGS_ECX(regs);          \
++      pr_reg[2] = PT_REGS_EDX(regs);          \
++      pr_reg[3] = PT_REGS_ESI(regs);          \
++      pr_reg[4] = PT_REGS_EDI(regs);          \
++      pr_reg[5] = PT_REGS_EBP(regs);          \
++      pr_reg[6] = PT_REGS_EAX(regs);          \
++      pr_reg[7] = PT_REGS_DS(regs);           \
++      pr_reg[8] = PT_REGS_ES(regs);           \
++      /* fake once used fs and gs selectors? */       \
++      pr_reg[9] = PT_REGS_DS(regs);           \
++      pr_reg[10] = PT_REGS_DS(regs);          \
++      pr_reg[11] = PT_REGS_SYSCALL_NR(regs);  \
++      pr_reg[12] = PT_REGS_IP(regs);          \
++      pr_reg[13] = PT_REGS_CS(regs);          \
++      pr_reg[14] = PT_REGS_EFLAGS(regs);      \
++      pr_reg[15] = PT_REGS_SP(regs);          \
++      pr_reg[16] = PT_REGS_SS(regs);          \
++} while(0);
++
++/********* Bits for asm-um/delay.h **********/
++
++typedef unsigned long um_udelay_t;
++
++/********* Nothing for asm-um/hardirq.h **********/
++
++/********* Nothing for asm-um/hw_irq.h **********/
++
++/********* Nothing for asm-um/string.h **********/
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/include/asm-um/archparam-ppc.h um/include/asm-um/archparam-ppc.h
+--- orig/include/asm-um/archparam-ppc.h        Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/archparam-ppc.h  Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,41 @@
++#ifndef __UM_ARCHPARAM_PPC_H
++#define __UM_ARCHPARAM_PPC_H
++
++/********* Bits for asm-um/elf.h ************/
++
++#define ELF_PLATFORM (0)
++
++#define ELF_ET_DYN_BASE (0x08000000)
++
++/* the following stolen from asm-ppc/elf.h */
++#define ELF_NGREG     48      /* includes nip, msr, lr, etc. */
++#define ELF_NFPREG    33      /* includes fpscr */
++/* General registers */
++typedef unsigned long elf_greg_t;
++typedef elf_greg_t elf_gregset_t[ELF_NGREG];
++
++/* Floating point registers */
++typedef double elf_fpreg_t;
++typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
++
++#define ELF_DATA        ELFDATA2MSB
++#define ELF_ARCH      EM_PPC
++
++/********* Bits for asm-um/delay.h **********/
++
++typedef unsigned int um_udelay_t;
++
++/********* Bits for asm-um/hw_irq.h **********/
++
++struct hw_interrupt_type;
++
++/********* Bits for asm-um/hardirq.h **********/
++
++#define irq_enter(cpu, irq) hardirq_enter(cpu)
++#define irq_exit(cpu, irq) hardirq_exit(cpu)
++
++/********* Bits for asm-um/string.h **********/
++
++#define __HAVE_ARCH_STRRCHR
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/atomic.h um/include/asm-um/atomic.h
+--- orig/include/asm-um/atomic.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/atomic.h Wed Mar 26 22:01:25 2003
+@@ -0,0 +1,6 @@
++#ifndef __UM_ATOMIC_H
++#define __UM_ATOMIC_H
++
++#include "asm/arch/atomic.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/bitops.h um/include/asm-um/bitops.h
+--- orig/include/asm-um/bitops.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/bitops.h Wed Mar 26 22:01:25 2003
+@@ -0,0 +1,6 @@
++#ifndef __UM_BITOPS_H
++#define __UM_BITOPS_H
++
++#include "asm/arch/bitops.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/boot.h um/include/asm-um/boot.h
+--- orig/include/asm-um/boot.h Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/boot.h   Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_BOOT_H
++#define __UM_BOOT_H
++
++#include "asm/arch/boot.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/bugs.h um/include/asm-um/bugs.h
+--- orig/include/asm-um/bugs.h Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/bugs.h   Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_BUGS_H
++#define __UM_BUGS_H
++
++void check_bugs(void);
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/byteorder.h um/include/asm-um/byteorder.h
+--- orig/include/asm-um/byteorder.h    Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/byteorder.h      Thu Feb 27 13:20:12 2003
+@@ -0,0 +1,6 @@
++#ifndef __UM_BYTEORDER_H
++#define __UM_BYTEORDER_H
++
++#include "asm/arch/byteorder.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/cache.h um/include/asm-um/cache.h
+--- orig/include/asm-um/cache.h        Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/cache.h  Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_CACHE_H
++#define __UM_CACHE_H
++
++#define        L1_CACHE_BYTES  32
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/checksum.h um/include/asm-um/checksum.h
+--- orig/include/asm-um/checksum.h     Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/checksum.h       Tue Oct 29 17:25:12 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_CHECKSUM_H
++#define __UM_CHECKSUM_H
++
++#include "sysdep/checksum.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/cobalt.h um/include/asm-um/cobalt.h
+--- orig/include/asm-um/cobalt.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/cobalt.h Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_COBALT_H
++#define __UM_COBALT_H
++
++#include "asm/arch/cobalt.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/current.h um/include/asm-um/current.h
+--- orig/include/asm-um/current.h      Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/current.h        Wed Mar 26 22:01:25 2003
+@@ -0,0 +1,34 @@
++/* 
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_CURRENT_H
++#define __UM_CURRENT_H
++
++#ifndef __ASSEMBLY__
++
++#include "linux/config.h"
++#include "asm/page.h"
++
++struct task_struct;
++
++#define CURRENT_TASK(dummy) (((unsigned long) &dummy) & \
++                           (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER))
++
++#define current ({ int dummy; (struct task_struct *) CURRENT_TASK(dummy); })
++
++#endif /* __ASSEMBLY__ */
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
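
current.h above recovers the current task_struct by masking an on-stack address: kernel
stacks are allocated on a power-of-two boundary with the task structure at the base, so
clearing the low bits of any stack address yields the task pointer. The same arithmetic
in user space, with an aligned allocation standing in for a kernel stack (STACK_ORDER
below is an arbitrary stand-in for CONFIG_KERNEL_STACK_ORDER):

    #include <stdio.h>
    #include <stdlib.h>

    #define STACK_ORDER 2
    #define PAGE_SIZE   4096UL
    #define STACK_SIZE  (PAGE_SIZE << STACK_ORDER)
    #define STACK_MASK  (~(STACK_SIZE - 1))

    struct task { int pid; };

    int main(void)
    {
            void *stack = aligned_alloc(STACK_SIZE, STACK_SIZE);
            struct task *task = stack;              /* the task lives at the stack base */
            unsigned long somewhere_on_stack = (unsigned long)stack + 1234;

            if (stack == NULL)
                    return 1;
            task->pid = 42;
            printf("current->pid = %d\n",
                   ((struct task *)(somewhere_on_stack & STACK_MASK))->pid);
            free(stack);
            return 0;
    }
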
+diff -Naur -X ../exclude-files orig/include/asm-um/delay.h um/include/asm-um/delay.h
+--- orig/include/asm-um/delay.h        Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/delay.h  Sun Dec  8 20:09:15 2002
+@@ -0,0 +1,7 @@
++#ifndef __UM_DELAY_H
++#define __UM_DELAY_H
++
++#include "asm/arch/delay.h"
++#include "asm/archparam.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/desc.h um/include/asm-um/desc.h
+--- orig/include/asm-um/desc.h Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/desc.h   Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_DESC_H
++#define __UM_DESC_H
++
++#include "asm/arch/desc.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/div64.h um/include/asm-um/div64.h
+--- orig/include/asm-um/div64.h        Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/div64.h  Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef _UM_DIV64_H
++#define _UM_DIV64_H
++
++#include "asm/arch/div64.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/dma.h um/include/asm-um/dma.h
+--- orig/include/asm-um/dma.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/dma.h    Sun Oct 27 16:53:42 2002
+@@ -0,0 +1,10 @@
++#ifndef __UM_DMA_H
++#define __UM_DMA_H
++
++#include "asm/io.h"
++
++extern unsigned long uml_physmem;
++
++#define MAX_DMA_ADDRESS (uml_physmem)
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/elf.h um/include/asm-um/elf.h
+--- orig/include/asm-um/elf.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/elf.h    Sun Dec  8 20:13:07 2002
+@@ -0,0 +1,18 @@
++#ifndef __UM_ELF_H
++#define __UM_ELF_H
++
++#include "asm/archparam.h"
++
++#define ELF_HWCAP (0)
++
++#define SET_PERSONALITY(ex, ibcs2) do ; while(0)
++
++#define ELF_EXEC_PAGESIZE 4096
++
++#define elf_check_arch(x) (1)
++
++#define ELF_CLASS ELFCLASS32
++
++#define USE_ELF_CORE_DUMP
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/errno.h um/include/asm-um/errno.h
+--- orig/include/asm-um/errno.h        Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/errno.h  Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_ERRNO_H
++#define __UM_ERRNO_H
++
++#include "asm/arch/errno.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/fcntl.h um/include/asm-um/fcntl.h
+--- orig/include/asm-um/fcntl.h        Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/fcntl.h  Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_FCNTL_H
++#define __UM_FCNTL_H
++
++#include "asm/arch/fcntl.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/fixmap.h um/include/asm-um/fixmap.h
+--- orig/include/asm-um/fixmap.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/fixmap.h Wed Mar 26 22:01:27 2003
+@@ -0,0 +1,89 @@
++#ifndef __UM_FIXMAP_H
++#define __UM_FIXMAP_H
++
++#include <linux/config.h>
++#include <asm/kmap_types.h>
++
++/*
++ * Here we define all the compile-time 'special' virtual
++ * addresses. The point is to have a constant address at
++ * compile time, but to set the physical address only
++ * in the boot process. We allocate these special addresses
++ * from the end of virtual memory (0xfffff000) backwards.
++ * Also this lets us do fail-safe vmalloc(): we
++ * can guarantee that these special addresses and
++ * vmalloc()-ed addresses never overlap.
++ *
++ * These 'compile-time allocated' memory buffers are
++ * fixed-size 4k pages (or larger if used with an increment
++ * higher than 1). Use fixmap_set(idx, phys) to associate
++ * physical memory with fixmap indices.
++ *
++ * TLB entries of such buffers will not be flushed across
++ * task switches.
++ */
++
++/*
++ * On UP currently we will have no trace of the fixmap mechanism,
++ * no page table allocations, etc. This might change in the
++ * future, say framebuffers for the console driver(s) could be
++ * fix-mapped?
++ */
++enum fixed_addresses {
++#ifdef CONFIG_HIGHMEM
++      FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
++      FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
++#endif
++      __end_of_fixed_addresses
++};
++
++extern void __set_fixmap (enum fixed_addresses idx,
++                        unsigned long phys, pgprot_t flags);
++
++#define set_fixmap(idx, phys) \
++              __set_fixmap(idx, phys, PAGE_KERNEL)
++/*
++ * Some hardware wants to get fixmapped without caching.
++ */
++#define set_fixmap_nocache(idx, phys) \
++              __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
++/*
++ * used by vmalloc.c.
++ *
++ * Leave one empty page between vmalloc'ed areas and
++ * the start of the fixmap, and leave one page empty
++ * at the top of mem.
++ */
++extern unsigned long get_kmem_end(void);
++
++#define FIXADDR_TOP   (get_kmem_end() - 0x2000)
++#define FIXADDR_SIZE  (__end_of_fixed_addresses << PAGE_SHIFT)
++#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
++
++#define __fix_to_virt(x)      (FIXADDR_TOP - ((x) << PAGE_SHIFT))
++
++extern void __this_fixmap_does_not_exist(void);
++
++/*
++ * 'index to address' translation. If anyone tries to use the idx
++ * directly without translation, we catch the bug with a NULL-dereference
++ * kernel oops. Illegal ranges of incoming indices are caught too.
++ */
++static inline unsigned long fix_to_virt(const unsigned int idx)
++{
++      /*
++       * this branch gets completely eliminated after inlining,
++       * except when someone tries to use fixaddr indices in an
++       * illegal way. (such as mixing up address types or using
++       * out-of-range indices).
++       *
++       * If it doesn't get removed, the linker will complain
++	 * loudly with a reasonably clear error message.
++       */
++      if (idx >= __end_of_fixed_addresses)
++              __this_fixmap_does_not_exist();
++
++        return __fix_to_virt(idx);
++}
++
++#endif
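
fix_to_virt() above is pure arithmetic: fixmap slots are handed out downward from
FIXADDR_TOP, one page per index, and an out-of-range index becomes a link-time error via
__this_fixmap_does_not_exist(). The address calculation on its own, with an example
FIXADDR_TOP (in the real header it is derived from get_kmem_end()):

    #include <stdio.h>

    #define PAGE_SHIFT   12
    #define FIXADDR_TOP  0xfffff000UL       /* example value only */

    #define fix_to_virt_example(x) (FIXADDR_TOP - ((unsigned long)(x) << PAGE_SHIFT))

    int main(void)
    {
            int idx;

            for (idx = 0; idx < 3; idx++)
                    printf("fixmap index %d -> virtual address %#lx\n",
                           idx, fix_to_virt_example(idx));
            return 0;
    }
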
+diff -Naur -X ../exclude-files orig/include/asm-um/floppy.h um/include/asm-um/floppy.h
+--- orig/include/asm-um/floppy.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/floppy.h Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_FLOPPY_H
++#define __UM_FLOPPY_H
++
++#include "asm/arch/floppy.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/hardirq.h um/include/asm-um/hardirq.h
+--- orig/include/asm-um/hardirq.h      Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/hardirq.h        Wed Apr 16 13:59:04 2003
+@@ -0,0 +1,6 @@
++#ifndef __UM_HARDIRQ_H
++#define __UM_HARDIRQ_H
++
++#include "asm/arch/hardirq.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/hdreg.h um/include/asm-um/hdreg.h
+--- orig/include/asm-um/hdreg.h        Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/hdreg.h  Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_HDREG_H
++#define __UM_HDREG_H
++
++#include "asm/arch/hdreg.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/highmem.h um/include/asm-um/highmem.h
+--- orig/include/asm-um/highmem.h      Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/highmem.h        Wed Apr 16 13:59:04 2003
+@@ -0,0 +1,12 @@
++#ifndef __UM_HIGHMEM_H
++#define __UM_HIGHMEM_H
++
++#include "asm/page.h"
++#include "asm/fixmap.h"
++#include "asm/arch/highmem.h"
++
++#undef PKMAP_BASE
++
++#define PKMAP_BASE ((FIXADDR_START - LAST_PKMAP * PAGE_SIZE) & PMD_MASK)
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/hw_irq.h um/include/asm-um/hw_irq.h
+--- orig/include/asm-um/hw_irq.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/hw_irq.h Wed Mar 26 22:01:25 2003
+@@ -0,0 +1,10 @@
++#ifndef _ASM_UM_HW_IRQ_H
++#define _ASM_UM_HW_IRQ_H
++
++#include "asm/irq.h"
++#include "asm/archparam.h"
++
++static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
++{}
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/ide.h um/include/asm-um/ide.h
+--- orig/include/asm-um/ide.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/ide.h    Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_IDE_H
++#define __UM_IDE_H
++
++#include "asm/arch/ide.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/init.h um/include/asm-um/init.h
+--- orig/include/asm-um/init.h Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/init.h   Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,11 @@
++#ifndef _UM_INIT_H
++#define _UM_INIT_H
++
++#ifdef notdef
++#define __init
++#define __initdata
++#define __initfunc(__arginit) __arginit
++#define __cacheline_aligned 
++#endif
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/io.h um/include/asm-um/io.h
+--- orig/include/asm-um/io.h   Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/io.h     Sun Oct 27 16:53:42 2002
+@@ -0,0 +1,25 @@
++#ifndef __UM_IO_H
++#define __UM_IO_H
++
++#include "asm/page.h"
++
++#define IO_SPACE_LIMIT 0xdeadbeef /* Sure hope nothing uses this */
++
++static inline int inb(unsigned long i) { return(0); }
++static inline void outb(char c, unsigned long i) { }
++
++/*
++ * Change virtual addresses to physical addresses and vice versa.
++ * These are pretty trivial
++ */
++static inline unsigned long virt_to_phys(volatile void * address)
++{
++      return __pa((void *) address);
++}
++
++static inline void * phys_to_virt(unsigned long address)
++{
++      return __va(address);
++}
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/ioctl.h um/include/asm-um/ioctl.h
+--- orig/include/asm-um/ioctl.h        Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/ioctl.h  Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_IOCTL_H
++#define __UM_IOCTL_H
++
++#include "asm/arch/ioctl.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/ioctls.h um/include/asm-um/ioctls.h
+--- orig/include/asm-um/ioctls.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/ioctls.h Wed Oct 23 21:11:14 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_IOCTLS_H
++#define __UM_IOCTLS_H
++
++#include "asm/arch/ioctls.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/ipc.h um/include/asm-um/ipc.h
+--- orig/include/asm-um/ipc.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/ipc.h    Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_IPC_H
++#define __UM_IPC_H
++
++#include "asm/arch/ipc.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/ipcbuf.h um/include/asm-um/ipcbuf.h
+--- orig/include/asm-um/ipcbuf.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/ipcbuf.h Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_IPCBUF_H
++#define __UM_IPCBUF_H
++
++#include "asm/arch/ipcbuf.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/irq.h um/include/asm-um/irq.h
+--- orig/include/asm-um/irq.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/irq.h    Wed Mar 26 22:01:25 2003
+@@ -0,0 +1,36 @@
++#ifndef __UM_IRQ_H
++#define __UM_IRQ_H
++
++/* The i386 irq.h has a struct task_struct in a prototype without including
++ * sched.h.  This forward declaration kills the resulting warning.
++ */
++struct task_struct;
++
++#include "asm/arch/irq.h"
++#include "asm/ptrace.h"
++
++#undef NR_IRQS
++
++#define TIMER_IRQ             0
++#define UMN_IRQ                       1
++#define CONSOLE_IRQ           2
++#define CONSOLE_WRITE_IRQ     3
++#define UBD_IRQ                       4
++#define UM_ETH_IRQ            5
++#define SSL_IRQ                       6
++#define SSL_WRITE_IRQ         7
++#define ACCEPT_IRQ            8
++#define MCONSOLE_IRQ          9
++#define WINCH_IRQ             10
++#define SIGIO_WRITE_IRQ       11
++#define TELNETD_IRQ           12
++#define XTERM_IRQ             13
++
++#define LAST_IRQ XTERM_IRQ
++#define NR_IRQS (LAST_IRQ + 1)
++
++extern int um_request_irq(unsigned int irq, int fd, int type,
++                        void (*handler)(int, void *, struct pt_regs *),
++                        unsigned long irqflags,  const char * devname,
++                        void *dev_id);
++#endif
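
um_request_irq() is UML's replacement for request_irq(): the interrupt source is a host file descriptor rather than a hardware line. A hedged usage sketch built only on the prototype above; the IRQ_READ constant, the SA_* flags, and the handler are assumptions for illustration and do not come from this header:

    static void console_intr(int irq, void *dev_id, struct pt_regs *regs)
    {
            /* service data that became readable on the host fd for this IRQ */
    }

    static int console_attach(int fd)
    {
            /* IRQ_READ / SA_* are assumed from the usual 2.4 driver idiom */
            return um_request_irq(CONSOLE_IRQ, fd, IRQ_READ, console_intr,
                                  SA_INTERRUPT | SA_SHIRQ, "console", NULL);
    }
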
+diff -Naur -X ../exclude-files orig/include/asm-um/keyboard.h um/include/asm-um/keyboard.h
+--- orig/include/asm-um/keyboard.h     Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/keyboard.h       Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_KEYBOARD_H
++#define __UM_KEYBOARD_H
++
++#include "asm/arch/keyboard.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/kmap_types.h um/include/asm-um/kmap_types.h
+--- orig/include/asm-um/kmap_types.h   Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/kmap_types.h     Thu Feb 27 13:20:14 2003
+@@ -0,0 +1,11 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_KMAP_TYPES_H
++#define __UM_KMAP_TYPES_H
++
++#include "asm/arch/kmap_types.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/linux_logo.h um/include/asm-um/linux_logo.h
+--- orig/include/asm-um/linux_logo.h   Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/linux_logo.h     Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_LINUX_LOGO_H
++#define __UM_LINUX_LOGO_H
++
++#include "asm/arch/linux_logo.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/locks.h um/include/asm-um/locks.h
+--- orig/include/asm-um/locks.h        Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/locks.h  Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_LOCKS_H
++#define __UM_LOCKS_H
++
++#include "asm/arch/locks.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/mca_dma.h um/include/asm-um/mca_dma.h
+--- orig/include/asm-um/mca_dma.h      Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/mca_dma.h        Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef mca___UM_DMA_H
++#define mca___UM_DMA_H
++
++#include "asm/arch/mca_dma.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/mman.h um/include/asm-um/mman.h
+--- orig/include/asm-um/mman.h Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/mman.h   Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_MMAN_H
++#define __UM_MMAN_H
++
++#include "asm/arch/mman.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/mmu.h um/include/asm-um/mmu.h
+--- orig/include/asm-um/mmu.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/mmu.h    Sat Nov  9 12:51:11 2002
+@@ -0,0 +1,22 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __MMU_H
++#define __MMU_H
++
++#include "um_mmu.h"
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/include/asm-um/mmu_context.h um/include/asm-um/mmu_context.h
+--- orig/include/asm-um/mmu_context.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/mmu_context.h    Wed Apr 16 13:59:16 2003
+@@ -0,0 +1,72 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_MMU_CONTEXT_H
++#define __UM_MMU_CONTEXT_H
++
++#include "linux/sched.h"
++#include "choose-mode.h"
++
++#define get_mmu_context(task) do ; while(0)
++#define activate_context(tsk) do ; while(0)
++
++static inline void activate_mm(struct mm_struct *old, struct mm_struct *new)
++{
++}
++
++extern void switch_mm_skas(int mm_fd);
++
++static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, 
++                           struct task_struct *tsk, unsigned cpu)
++{
++      if(prev != next){
++              clear_bit(cpu, &prev->cpu_vm_mask);
++              set_bit(cpu, &next->cpu_vm_mask);
++              if(next != &init_mm)
++                      CHOOSE_MODE((void) 0, 
++                                  switch_mm_skas(next->context.skas.mm_fd));
++      }
++}
++
++static inline void enter_lazy_tlb(struct mm_struct *mm, 
++                                struct task_struct *tsk, unsigned cpu)
++{
++}
++
++extern int init_new_context_skas(struct task_struct *task, 
++                               struct mm_struct *mm);
++
++static inline int init_new_context_tt(struct task_struct *task, 
++                                    struct mm_struct *mm)
++{
++      return(0);
++}
++
++static inline int init_new_context(struct task_struct *task, 
++                                 struct mm_struct *mm)
++{
++      return(CHOOSE_MODE_PROC(init_new_context_tt, init_new_context_skas, 
++                              task, mm));
++}
++
++extern void destroy_context_skas(struct mm_struct *mm);
++
++static inline void destroy_context(struct mm_struct *mm)
++{
++      CHOOSE_MODE((void) 0, destroy_context_skas(mm));
++}
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
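
The CHOOSE_MODE()/CHOOSE_MODE_PROC() dispatchers used above come from choose-mode.h, which is not part of this hunk. Purely as an illustration of the pattern (not the actual header), and assuming a runtime selector named mode_tt, such a dispatcher could look like:

    /* Compile out the unbuilt mode; test at runtime only when both exist. */
    #if defined(CONFIG_MODE_TT) && defined(CONFIG_MODE_SKAS)
    extern int mode_tt;                               /* assumed runtime flag */
    #define CHOOSE_MODE(tt, skas) (mode_tt ? (tt) : (skas))
    #elif defined(CONFIG_MODE_TT)
    #define CHOOSE_MODE(tt, skas) (tt)
    #else
    #define CHOOSE_MODE(tt, skas) (skas)
    #endif

    #define CHOOSE_MODE_PROC(tt_fn, skas_fn, args...) \
            CHOOSE_MODE(tt_fn(args), skas_fn(args))
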
+diff -Naur -X ../exclude-files orig/include/asm-um/module.h um/include/asm-um/module.h
+--- orig/include/asm-um/module.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/module.h Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_MODULE_H
++#define __UM_MODULE_H
++
++#include "asm/arch/module.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/msgbuf.h um/include/asm-um/msgbuf.h
+--- orig/include/asm-um/msgbuf.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/msgbuf.h Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_MSGBUF_H
++#define __UM_MSGBUF_H
++
++#include "asm/arch/msgbuf.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/mtrr.h um/include/asm-um/mtrr.h
+--- orig/include/asm-um/mtrr.h Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/mtrr.h   Thu Mar 27 15:11:56 2003
+@@ -0,0 +1,6 @@
++#ifndef __UM_MTRR_H
++#define __UM_MTRR_H
++
++#include "asm/arch/mtrr.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/namei.h um/include/asm-um/namei.h
+--- orig/include/asm-um/namei.h        Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/namei.h  Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_NAMEI_H
++#define __UM_NAMEI_H
++
++#include "asm/arch/namei.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/page.h um/include/asm-um/page.h
+--- orig/include/asm-um/page.h Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/page.h   Sun Oct 27 16:49:35 2002
+@@ -0,0 +1,53 @@
++#ifndef __UM_PAGE_H
++#define __UM_PAGE_H
++
++struct page;
++
++#include "asm/arch/page.h"
++
++#undef BUG
++#undef PAGE_BUG
++#undef __pa
++#undef __va
++#undef virt_to_page
++#undef VALID_PAGE
++#undef PAGE_OFFSET
++#undef KERNELBASE
++
++extern unsigned long uml_physmem;
++
++#define PAGE_OFFSET (uml_physmem)
++#define KERNELBASE PAGE_OFFSET
++
++#ifndef __ASSEMBLY__
++
++extern void stop(void);
++
++#define BUG() do { \
++      panic("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \
++} while (0)
++
++#define PAGE_BUG(page) do { \
++      BUG(); \
++} while (0)
++
++#endif /* __ASSEMBLY__ */
++
++#define __va_space (8*1024*1024)
++
++extern unsigned long region_pa(void *virt);
++extern void *region_va(unsigned long phys);
++
++#define __pa(virt) region_pa((void *) (virt))
++#define __va(phys) region_va((unsigned long) (phys))
++
++extern struct page *page_mem_map(struct page *page);
++
++extern struct page *pfn_to_page(unsigned long pfn);
++
++#define VALID_PAGE(page) (page_mem_map(page) != NULL)
++
++extern struct page *arch_validate(struct page *page, int mask, int order);
++#define HAVE_ARCH_VALIDATE
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/page_offset.h um/include/asm-um/page_offset.h
+--- orig/include/asm-um/page_offset.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/page_offset.h    Wed Oct 23 21:08:05 2002
+@@ -0,0 +1 @@
++#define PAGE_OFFSET_RAW (uml_physmem)
+diff -Naur -X ../exclude-files orig/include/asm-um/param.h um/include/asm-um/param.h
+--- orig/include/asm-um/param.h        Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/param.h  Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,24 @@
++#ifndef _UM_PARAM_H
++#define _UM_PARAM_H
++
++#ifndef HZ
++#define HZ 52
++#endif
++
++#define EXEC_PAGESIZE   4096
++
++#ifndef NGROUPS
++#define NGROUPS         32
++#endif
++
++#ifndef NOGROUP
++#define NOGROUP         (-1)
++#endif
++
++#define MAXHOSTNAMELEN  64      /* max length of hostname */
++
++#ifdef __KERNEL__
++# define CLOCKS_PER_SEC 100    /* frequency at which times() counts */
++#endif
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/pci.h um/include/asm-um/pci.h
+--- orig/include/asm-um/pci.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/pci.h    Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_PCI_H
++#define __UM_PCI_H
++
++#define PCI_DMA_BUS_IS_PHYS     (1)
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/pgalloc.h um/include/asm-um/pgalloc.h
+--- orig/include/asm-um/pgalloc.h      Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/pgalloc.h        Wed Apr 16 13:59:04 2003
+@@ -0,0 +1,162 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Derived from include/asm-i386/pgalloc.h and include/asm-i386/pgtable.h
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_PGALLOC_H
++#define __UM_PGALLOC_H
++
++#include "linux/config.h"
++#include "linux/mm.h"
++#include "asm/fixmap.h"
++#include "choose-mode.h"
++
++#define pgd_quicklist (current_cpu_data.pgd_quick)
++#define pmd_quicklist (current_cpu_data.pmd_quick)
++#define pte_quicklist (current_cpu_data.pte_quick)
++#define pgtable_cache_size (current_cpu_data.pgtable_cache_sz)
++
++#define pmd_populate(mm, pmd, pte) set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
++
++/*
++ * Allocate and free page tables.
++ */
++
++static inline pgd_t *get_pgd_slow_tt(void)
++{
++      pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
++
++      if (pgd) {
++              memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
++              memcpy(pgd + USER_PTRS_PER_PGD, 
++                     swapper_pg_dir + USER_PTRS_PER_PGD, 
++                     (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
++      }
++      return pgd;
++}
++
++static inline pgd_t *get_pgd_slow_skas(void)
++{
++      pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
++
++      if (pgd)
++              memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
++      return pgd;
++}
++
++static inline pgd_t *get_pgd_slow(void)
++{
++      return(CHOOSE_MODE(get_pgd_slow_tt(), get_pgd_slow_skas()));
++}
++
++static inline pgd_t *get_pgd_fast(void)
++{
++      unsigned long *ret;
++
++      if ((ret = pgd_quicklist) != NULL) {
++              pgd_quicklist = (unsigned long *)(*ret);
++              ret[0] = 0;
++              pgtable_cache_size--;
++      } else
++              ret = (unsigned long *)get_pgd_slow();
++      return (pgd_t *)ret;
++}
++
++static inline void free_pgd_fast(pgd_t *pgd)
++{
++      *(unsigned long *)pgd = (unsigned long) pgd_quicklist;
++      pgd_quicklist = (unsigned long *) pgd;
++      pgtable_cache_size++;
++}
++
++static inline void free_pgd_slow(pgd_t *pgd)
++{
++      free_page((unsigned long)pgd);
++}
++
++static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address)
++{
++      pte_t *pte;
++
++      pte = (pte_t *) __get_free_page(GFP_KERNEL);
++      if (pte)
++              clear_page(pte);
++      return pte;
++}
++
++static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm, unsigned long address)
++{
++      unsigned long *ret;
++
++      if ((ret = (unsigned long *)pte_quicklist) != NULL) {
++              pte_quicklist = (unsigned long *)(*ret);
++              ret[0] = ret[1];
++              pgtable_cache_size--;
++      }
++      return (pte_t *)ret;
++}
++
++static inline void pte_free_fast(pte_t *pte)
++{
++      *(unsigned long *)pte = (unsigned long) pte_quicklist;
++      pte_quicklist = (unsigned long *) pte;
++      pgtable_cache_size++;
++}
++
++static inline void pte_free_slow(pte_t *pte)
++{
++      free_page((unsigned long)pte);
++}
++
++#define pte_free(pte)           pte_free_fast(pte)
++#define pgd_free(pgd)           free_pgd_slow(pgd)
++#define pgd_alloc(mm)           get_pgd_fast()
++
++/*
++ * allocating and freeing a pmd is trivial: the 1-entry pmd is
++ * inside the pgd, so has no extra memory associated with it.
++ */
++
++#define pmd_alloc_one_fast(mm, addr)  ({ BUG(); ((pmd_t *)1); })
++#define pmd_alloc_one(mm, addr)               ({ BUG(); ((pmd_t *)2); })
++#define pmd_free_slow(x)              do { } while (0)
++#define pmd_free_fast(x)              do { } while (0)
++#define pmd_free(x)                   do { } while (0)
++#define pgd_populate(mm, pmd, pte)    BUG()
++
++/*
++ * TLB flushing:
++ *
++ *  - flush_tlb() flushes the current mm struct TLBs
++ *  - flush_tlb_all() flushes all processes' TLBs
++ *  - flush_tlb_mm(mm) flushes the specified mm context's TLBs
++ *  - flush_tlb_page(vma, vmaddr) flushes one page
++ *  - flush_tlb_kernel_vm() flushes the kernel vm area
++ *  - flush_tlb_range(mm, start, end) flushes a range of pages
++ *  - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
++ */
++
++extern void flush_tlb_all(void);
++extern void flush_tlb_mm(struct mm_struct *mm);
++extern void flush_tlb_range(struct mm_struct *mm, unsigned long start, 
++                          unsigned long end);
++extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
++extern void flush_tlb_kernel_vm(void);
++
++static inline void flush_tlb_pgtables(struct mm_struct *mm,
++                                    unsigned long start, unsigned long end)
++{
++}
++
++#endif
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
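
get_pgd_fast()/free_pgd_fast() and their pte counterparts above are the classic 2.4 quicklists: a freed page table is pushed onto a free list by storing the next pointer in its first word, so the fast paths are a pointer pop and push. A self-contained user-space sketch of the same pattern, with illustrative names:

    #include <stdlib.h>

    #define PAGE_BYTES 4096

    static void *quicklist;                          /* head of the free list */

    static void *page_alloc_fast(void)
    {
            void *page = quicklist;

            if (page != NULL) {
                    quicklist = *(void **)page;      /* pop: link lives in word 0  */
                    *(void **)page = NULL;
            } else {
                    page = calloc(1, PAGE_BYTES);    /* slow path: really allocate */
            }
            return page;
    }

    static void page_free_fast(void *page)
    {
            *(void **)page = quicklist;              /* push: reuse word 0 as link */
            quicklist = page;
    }
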
+diff -Naur -X ../exclude-files orig/include/asm-um/pgtable.h um/include/asm-um/pgtable.h
+--- orig/include/asm-um/pgtable.h      Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/pgtable.h        Wed Apr 16 13:59:04 2003
+@@ -0,0 +1,428 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Derived from include/asm-i386/pgtable.h
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_PGTABLE_H
++#define __UM_PGTABLE_H
++
++#include "linux/sched.h"
++#include "asm/processor.h"
++#include "asm/page.h"
++
++extern pgd_t swapper_pg_dir[1024];
++
++#define flush_cache_all() do ; while (0)
++#define flush_cache_mm(mm) do ; while (0)
++#define flush_cache_range(vma, start, end) do ; while (0)
++#define flush_cache_page(vma, vmaddr) do ; while (0)
++#define flush_page_to_ram(page) do ; while (0)
++#define flush_dcache_page(page)       do ; while (0)
++#define flush_icache_range(from, to) do ; while (0)
++#define flush_icache_page(vma,pg) do ; while (0)
++#define flush_icache_user_range(vma,pg,adr,len)       do ; while (0)
++
++extern void __flush_tlb_one(unsigned long addr);
++
++extern void pte_free(pte_t *pte);
++
++extern void pgd_free(pgd_t *pgd);
++
++extern int do_check_pgt_cache(int, int);
++
++extern void *um_virt_to_phys(struct task_struct *task, unsigned long virt,
++                           pte_t *pte_out);
++
++/* zero page used for uninitialized stuff */
++extern unsigned long *empty_zero_page;
++
++#define pgtable_cache_init() do ; while (0)
++
++/* PMD_SHIFT determines the size of the area a second-level page table can map */
++#define PMD_SHIFT     22
++#define PMD_SIZE      (1UL << PMD_SHIFT)
++#define PMD_MASK      (~(PMD_SIZE-1))
++
++/* PGDIR_SHIFT determines what a third-level page table entry can map */
++#define PGDIR_SHIFT   22
++#define PGDIR_SIZE    (1UL << PGDIR_SHIFT)
++#define PGDIR_MASK    (~(PGDIR_SIZE-1))
++
++/*
++ * entries per page directory level: the i386 is two-level, so
++ * we don't really have any PMD directory physically.
++ */
++#define PTRS_PER_PTE  1024
++#define PTRS_PER_PMD  1
++#define PTRS_PER_PGD  1024
++#define USER_PTRS_PER_PGD     (TASK_SIZE/PGDIR_SIZE)
++#define FIRST_USER_PGD_NR       0
++
++#define pte_ERROR(e) \
++        printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
++#define pmd_ERROR(e) \
++        printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
++#define pgd_ERROR(e) \
++        printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
++
++/*
++ * pgd entries used up by user/kernel:
++ */
++
++#define USER_PGD_PTRS (TASK_SIZE >> PGDIR_SHIFT)
++#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
++
++#ifndef __ASSEMBLY__
++/* Just any arbitrary offset to the start of the vmalloc VM area: the
++ * current 8MB value just means that there will be an 8MB "hole" after the
++ * physical memory until the kernel virtual memory starts.  That means that
++ * any out-of-bounds memory accesses will hopefully be caught.
++ * The vmalloc() routines leave a hole of 4kB between each vmalloced
++ * area for the same reason. ;)
++ */
++
++extern unsigned long high_physmem;
++
++#define VMALLOC_OFFSET        (__va_space)
++#define VMALLOC_START (((unsigned long) high_physmem + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))
++#define VMALLOC_VMADDR(x) ((unsigned long)(x))
++
++#if CONFIG_HIGHMEM
++# define VMALLOC_END  (PKMAP_BASE-2*PAGE_SIZE)
++#else
++# define VMALLOC_END  (FIXADDR_START-2*PAGE_SIZE)
++#endif
++
++#define _PAGE_PRESENT 0x001
++#define _PAGE_NEWPAGE 0x002
++#define _PAGE_PROTNONE        0x004   /* If not present */
++#define _PAGE_RW      0x008
++#define _PAGE_USER    0x010
++#define _PAGE_ACCESSED        0x020
++#define _PAGE_DIRTY   0x040
++#define _PAGE_NEWPROT   0x080
++
++#define REGION_MASK   0xf0000000
++#define REGION_SHIFT  28
++
++#define _PAGE_TABLE   (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
++#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
++#define _PAGE_CHG_MASK        (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
++
++#define PAGE_NONE     __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
++#define PAGE_SHARED   __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED)
++#define PAGE_COPY     __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
++#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
++#define PAGE_KERNEL   __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
++#define PAGE_KERNEL_RO        __pgprot(_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED)
++
++/*
++ * The i386 can't do page protection for execute and treats execute permission the same as read.
++ * Also, write permissions imply read permissions. This is the closest we can get.
++ */
++#define __P000        PAGE_NONE
++#define __P001        PAGE_READONLY
++#define __P010        PAGE_COPY
++#define __P011        PAGE_COPY
++#define __P100        PAGE_READONLY
++#define __P101        PAGE_READONLY
++#define __P110        PAGE_COPY
++#define __P111        PAGE_COPY
++
++#define __S000        PAGE_NONE
++#define __S001        PAGE_READONLY
++#define __S010        PAGE_SHARED
++#define __S011        PAGE_SHARED
++#define __S100        PAGE_READONLY
++#define __S101        PAGE_READONLY
++#define __S110        PAGE_SHARED
++#define __S111        PAGE_SHARED
++
++/*
++ * Define this if things work differently on an i386 and an i486:
++ * it will (on an i486) warn about kernel memory accesses that are
++ * done without a 'verify_area(VERIFY_WRITE,..)'
++ */
++#undef TEST_VERIFY_AREA
++
++/* page table for 0-4MB for everybody */
++extern unsigned long pg0[1024];
++
++/*
++ * BAD_PAGETABLE is used when we need a bogus page-table, while
++ * BAD_PAGE is used for a bogus page.
++ *
++ * ZERO_PAGE is a global shared page that is always zero: used
++ * for zero-mapped memory areas etc..
++ */
++extern pte_t __bad_page(void);
++extern pte_t * __bad_pagetable(void);
++
++#define BAD_PAGETABLE __bad_pagetable()
++#define BAD_PAGE __bad_page()
++#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
++
++/* number of bits that fit into a memory pointer */
++#define BITS_PER_PTR                  (8*sizeof(unsigned long))
++
++/* to align the pointer to a pointer address */
++#define PTR_MASK                      (~(sizeof(void*)-1))
++
++/* sizeof(void*)==1<<SIZEOF_PTR_LOG2 */
++/* 64-bit machines, beware!  SRB. */
++#define SIZEOF_PTR_LOG2                       2
++
++/* to find an entry in a page-table */
++#define PAGE_PTR(address) \
++((unsigned long)(address)>>(PAGE_SHIFT-SIZEOF_PTR_LOG2)&PTR_MASK&~PAGE_MASK)
++
++#define pte_none(x)   !(pte_val(x) & ~_PAGE_NEWPAGE)
++#define pte_present(x)        (pte_val(x) & (_PAGE_PRESENT | _PAGE_PROTNONE))
++
++#define pte_clear(xp) do { pte_val(*(xp)) = _PAGE_NEWPAGE; } while (0)
++
++#define phys_region_index(x) (((x) & REGION_MASK) >> REGION_SHIFT)
++#define pte_region_index(x) phys_region_index(pte_val(x))
++
++#define pmd_none(x)   (!(pmd_val(x) & ~_PAGE_NEWPAGE))
++#define       pmd_bad(x)      ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
++#define pmd_present(x)        (pmd_val(x) & _PAGE_PRESENT)
++#define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0)
++
++#define pmd_newpage(x)  (pmd_val(x) & _PAGE_NEWPAGE)
++#define pmd_mkuptodate(x) (pmd_val(x) &= ~_PAGE_NEWPAGE)
++
++/*
++ * The "pgd_xxx()" functions here are trivial for a folded two-level
++ * setup: the pgd is never bad, and a pmd always exists (as it's folded
++ * into the pgd entry)
++ */
++static inline int pgd_none(pgd_t pgd)         { return 0; }
++static inline int pgd_bad(pgd_t pgd)          { return 0; }
++static inline int pgd_present(pgd_t pgd)      { return 1; }
++static inline void pgd_clear(pgd_t * pgdp)    { }
++
++#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
++
++extern struct page *pte_mem_map(pte_t pte);
++extern struct page *phys_mem_map(unsigned long phys);
++extern unsigned long phys_to_pfn(unsigned long p);
++
++#define pte_page(x) pfn_to_page(pte_pfn(x))
++#define pte_address(x) (__va(pte_val(x) & PAGE_MASK))
++#define mk_phys(a, r) ((a) + (r << REGION_SHIFT))
++#define phys_addr(p) ((p) & ~REGION_MASK)
++#define phys_page(p) (phys_mem_map(p) + ((phys_addr(p)) >> PAGE_SHIFT))
++#define virt_to_page(kaddr) \
++      (phys_mem_map(__pa(kaddr)) + (phys_addr(__pa(kaddr)) >> PAGE_SHIFT))
++#define pte_pfn(x) phys_to_pfn(pte_val(x))
++
++static inline pte_t pte_mknewprot(pte_t pte)
++{
++      pte_val(pte) |= _PAGE_NEWPROT;
++      return(pte);
++}
++
++static inline pte_t pte_mknewpage(pte_t pte)
++{
++      pte_val(pte) |= _PAGE_NEWPAGE;
++      return(pte);
++}
++
++static inline void set_pte(pte_t *pteptr, pte_t pteval)
++{
++      /* If it's a swap entry, it needs to be marked _PAGE_NEWPAGE so
++       * fix_range knows to unmap it.  _PAGE_NEWPROT is specific to
++       * mapped pages.
++       */
++      *pteptr = pte_mknewpage(pteval);
++      if(pte_present(*pteptr)) *pteptr = pte_mknewprot(*pteptr);
++}
++
++/*
++ * (pmds are folded into pgds so this doesn't actually get called,
++ * but the define is needed for a generic inline function.)
++ */
++#define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval)
++#define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval)
++
++/*
++ * The following only work if pte_present() is true.
++ * Undefined behaviour if not..
++ */
++static inline int pte_read(pte_t pte)
++{ 
++      return((pte_val(pte) & _PAGE_USER) && 
++             !(pte_val(pte) & _PAGE_PROTNONE));
++}
++
++static inline int pte_exec(pte_t pte){
++      return((pte_val(pte) & _PAGE_USER) &&
++             !(pte_val(pte) & _PAGE_PROTNONE));
++}
++
++static inline int pte_write(pte_t pte)
++{
++      return((pte_val(pte) & _PAGE_RW) &&
++             !(pte_val(pte) & _PAGE_PROTNONE));
++}
++
++static inline int pte_dirty(pte_t pte)        { return pte_val(pte) & _PAGE_DIRTY; }
++static inline int pte_young(pte_t pte)        { return pte_val(pte) & _PAGE_ACCESSED; }
++static inline int pte_newpage(pte_t pte) { return pte_val(pte) & _PAGE_NEWPAGE; }
++static inline int pte_newprot(pte_t pte)
++{ 
++      return(pte_present(pte) && (pte_val(pte) & _PAGE_NEWPROT)); 
++}
++
++static inline pte_t pte_rdprotect(pte_t pte)
++{ 
++      pte_val(pte) &= ~_PAGE_USER; 
++      return(pte_mknewprot(pte));
++}
++
++static inline pte_t pte_exprotect(pte_t pte)
++{ 
++      pte_val(pte) &= ~_PAGE_USER;
++      return(pte_mknewprot(pte));
++}
++
++static inline pte_t pte_mkclean(pte_t pte)
++{
++      pte_val(pte) &= ~_PAGE_DIRTY; 
++      return(pte);
++}
++
++static inline pte_t pte_mkold(pte_t pte)      
++{ 
++      pte_val(pte) &= ~_PAGE_ACCESSED; 
++      return(pte);
++}
++
++static inline pte_t pte_wrprotect(pte_t pte)
++{ 
++      pte_val(pte) &= ~_PAGE_RW; 
++      return(pte_mknewprot(pte)); 
++}
++
++static inline pte_t pte_mkread(pte_t pte)
++{ 
++      pte_val(pte) |= _PAGE_USER; 
++      return(pte_mknewprot(pte)); 
++}
++
++static inline pte_t pte_mkexec(pte_t pte)
++{ 
++      pte_val(pte) |= _PAGE_USER; 
++      return(pte_mknewprot(pte)); 
++}
++
++static inline pte_t pte_mkdirty(pte_t pte)
++{ 
++      pte_val(pte) |= _PAGE_DIRTY; 
++      return(pte);
++}
++
++static inline pte_t pte_mkyoung(pte_t pte)
++{
++      pte_val(pte) |= _PAGE_ACCESSED; 
++      return(pte);
++}
++
++static inline pte_t pte_mkwrite(pte_t pte)    
++{
++      pte_val(pte) |= _PAGE_RW; 
++      return(pte_mknewprot(pte)); 
++}
++
++static inline pte_t pte_mkuptodate(pte_t pte) 
++{
++      pte_val(pte) &= ~_PAGE_NEWPAGE;
++      if(pte_present(pte)) pte_val(pte) &= ~_PAGE_NEWPROT;
++      return(pte); 
++}
++
++extern unsigned long page_to_phys(struct page *page);
++
++/*
++ * Conversion functions: convert a page and protection to a page entry,
++ * and a page entry and page directory to the page they refer to.
++ */
++
++#define mk_pte(page, pgprot) \
++({                                    \
++      pte_t __pte;                    \
++                                        \
++      pte_val(__pte) = page_to_phys(page) + pgprot_val(pgprot);\
++      if(pte_present(__pte)) pte_mknewprot(pte_mknewpage(__pte)); \
++      __pte;                          \
++})
++
++/* This takes a physical page address that is used by the remapping functions */
++#define mk_pte_phys(physpage, pgprot) \
++      pte_mknewpage(mk_pte(phys_page(physpage), pgprot))
++
++static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
++{
++      pte_val(pte) = (pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot);
++      if(pte_present(pte)) pte = pte_mknewpage(pte_mknewprot(pte));
++      return pte; 
++}
++
++#define pmd_page(pmd) ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
++
++/* to find an entry in a page-table-directory. */
++#define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
++#define __pgd_offset(address) pgd_index(address)
++
++/* to find an entry in a page-table-directory */
++#define pgd_offset(mm, address) \
++((mm)->pgd + ((address) >> PGDIR_SHIFT))
++
++/* to find an entry in a kernel page-table-directory */
++#define pgd_offset_k(address) pgd_offset(&init_mm, address)
++
++#define __pmd_offset(address) \
++              (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
++
++/* Find an entry in the second-level page table.. */
++static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address)
++{
++      return (pmd_t *) dir;
++}
++
++/* Find an entry in the third-level page table.. */ 
++#define pte_offset(pmd, address) \
++((pte_t *) (pmd_page(*pmd) + ((address>>10) & ((PTRS_PER_PTE-1)<<2))))
++
++#define update_mmu_cache(vma,address,pte) do ; while (0)
++
++/* Encode and de-code a swap entry */
++#define SWP_TYPE(x)                   (((x).val >> 3) & 0x7f)
++#define SWP_OFFSET(x)                 ((x).val >> 10)
++
++#define SWP_ENTRY(type, offset) \
++      ((swp_entry_t) { ((type) << 3) | ((offset) << 10) })
++#define pte_to_swp_entry(pte) \
++      ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) })
++#define swp_entry_to_pte(x)           ((pte_t) { (x).val })
++
++#define PageSkip(x) (0)
++#define kern_addr_valid(addr) (1)
++
++#include <asm-generic/pgtable.h>
++
++#endif
++
++#endif
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
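
The swap-entry macros near the end of pgtable.h pack a 7-bit type into bits 3..9 and the swap offset from bit 10 up, leaving the low bits clear so a swapped-out pte can never look present. A small worked round trip using those macros (values are arbitrary; swp_entry_t is assumed to be the usual { unsigned long val; } wrapper):

    static void swap_entry_demo(void)
    {
            swp_entry_t e = SWP_ENTRY(5, 0x1234); /* e.val = (5 << 3) | (0x1234 << 10) */
            unsigned long t = SWP_TYPE(e);        /* (e.val >> 3) & 0x7f -> 5          */
            unsigned long o = SWP_OFFSET(e);      /*  e.val >> 10        -> 0x1234     */

            /* bits 0..2 stay clear, so pte_present() is false for such an entry */
            (void) t; (void) o;
    }
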
+diff -Naur -X ../exclude-files orig/include/asm-um/poll.h um/include/asm-um/poll.h
+--- orig/include/asm-um/poll.h Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/poll.h   Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_POLL_H
++#define __UM_POLL_H
++
++#include "asm/arch/poll.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/posix_types.h um/include/asm-um/posix_types.h
+--- orig/include/asm-um/posix_types.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/posix_types.h    Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_POSIX_TYPES_H
++#define __UM_POSIX_TYPES_H
++
++#include "asm/arch/posix_types.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/processor-generic.h um/include/asm-um/processor-generic.h
+--- orig/include/asm-um/processor-generic.h    Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/processor-generic.h      Wed Apr 16 13:59:03 2003
+@@ -0,0 +1,182 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_PROCESSOR_GENERIC_H
++#define __UM_PROCESSOR_GENERIC_H
++
++struct pt_regs;
++
++struct task_struct;
++
++#include "linux/config.h"
++#include "linux/signal.h"
++#include "asm/ptrace.h"
++#include "asm/siginfo.h"
++#include "choose-mode.h"
++
++struct mm_struct;
++
++#define current_text_addr() ((void *) 0)
++
++#define cpu_relax()   do ; while (0)
++
++#ifdef CONFIG_MODE_TT
++struct proc_tt_mode {
++      int extern_pid;
++      int tracing;
++      int switch_pipe[2];
++      int singlestep_syscall;
++      int vm_seq;
++};
++#endif
++
++#ifdef CONFIG_MODE_SKAS
++struct proc_skas_mode {
++      void *switch_buf;
++      void *fork_buf;
++};
++#endif
++
++struct thread_struct {
++      int forking;
++      unsigned long kernel_stack;
++      int nsyscalls;
++      struct pt_regs regs;
++      unsigned long cr2;
++      int err;
++      void *fault_addr;
++      void *fault_catcher;
++      struct task_struct *prev_sched;
++      unsigned long temp_stack;
++      void *exec_buf;
++      struct arch_thread arch;
++      union {
++#ifdef CONFIG_MODE_TT
++              struct proc_tt_mode tt;
++#endif
++#ifdef CONFIG_MODE_SKAS
++              struct proc_skas_mode skas;
++#endif
++      } mode;
++      struct {
++              int op;
++              union {
++                      struct {
++                              int pid;
++                      } fork, exec;
++                      struct {
++                              int (*proc)(void *);
++                              void *arg;
++                      } thread;
++                      struct {
++                              void (*proc)(void *);
++                              void *arg;
++                      } cb;
++              } u;
++      } request;
++};
++
++#define INIT_THREAD \
++{ \
++      .forking                = 0, \
++      .kernel_stack           = 0, \
++      .nsyscalls              = 0, \
++        .regs                 = EMPTY_REGS, \
++      .cr2                    = 0, \
++      .err                    = 0, \
++      .fault_addr             = NULL, \
++      .prev_sched             = NULL, \
++      .temp_stack             = 0, \
++      .exec_buf               = NULL, \
++      .arch                   = INIT_ARCH_THREAD, \
++      .request                = { 0 } \
++}
++
++#define THREAD_SIZE ((1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE)
++
++typedef struct {
++      unsigned long seg;
++} mm_segment_t;
++
++extern struct task_struct *alloc_task_struct(void);
++extern void free_task_struct(struct task_struct *task);
++
++#define get_task_struct(tsk)      atomic_inc(&virt_to_page(tsk)->count)
++
++extern void release_thread(struct task_struct *);
++extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags);
++extern void dump_thread(struct pt_regs *regs, struct user *u);
++
++extern unsigned long thread_saved_pc(struct thread_struct *t);
++
++static inline void mm_copy_segments(struct mm_struct *from_mm, 
++                                  struct mm_struct *new_mm)
++{
++}
++
++static inline void copy_segments(struct task_struct *p, 
++                               struct mm_struct *new_mm)
++{
++}
++
++static inline void release_segments(struct mm_struct *mm)
++{
++}
++
++#define init_task     (init_task_union.task)
++#define init_stack    (init_task_union.stack)
++
++/*
++ * User space process size: 3GB (default).
++ */
++extern unsigned long task_size;
++
++#define TASK_SIZE     (task_size)
++
++/* This decides where the kernel will search for a free chunk of vm
++ * space during mmap's.
++ */
++#define TASK_UNMAPPED_BASE    (0x40000000)
++
++extern void start_thread(struct pt_regs *regs, unsigned long entry, 
++                       unsigned long stack);
++
++struct cpuinfo_um {
++      unsigned long loops_per_jiffy;
++      unsigned long *pgd_quick;
++      unsigned long *pmd_quick;
++      unsigned long *pte_quick;
++      unsigned long pgtable_cache_sz;  
++      int ipi_pipe[2];
++};
++
++extern struct cpuinfo_um boot_cpu_data;
++
++#define my_cpu_data           cpu_data[smp_processor_id()]
++
++#ifdef CONFIG_SMP
++extern struct cpuinfo_um cpu_data[];
++#define current_cpu_data cpu_data[smp_processor_id()]
++#else
++#define cpu_data (&boot_cpu_data)
++#define current_cpu_data boot_cpu_data
++#endif
++
++#define KSTK_EIP(tsk) (PT_REGS_IP(&tsk->thread.regs))
++#define KSTK_ESP(tsk) (PT_REGS_SP(&tsk->thread.regs))
++#define get_wchan(p) (0)
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/include/asm-um/processor-i386.h um/include/asm-um/processor-i386.h
+--- orig/include/asm-um/processor-i386.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/processor-i386.h Wed Apr 16 13:59:03 2003
+@@ -0,0 +1,35 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_PROCESSOR_I386_H
++#define __UM_PROCESSOR_I386_H
++
++extern int cpu_has_xmm;
++extern int cpu_has_cmov;
++
++struct arch_thread {
++      unsigned long debugregs[8];
++      int debugregs_seq;
++};
++
++#define INIT_ARCH_THREAD { .debugregs                 = { [ 0 ... 7 ] = 0 }, \
++                           .debugregs_seq     = 0 }
++
++#include "asm/arch/user.h"
++
++#include "asm/processor-generic.h"
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/include/asm-um/processor-ppc.h um/include/asm-um/processor-ppc.h
+--- orig/include/asm-um/processor-ppc.h        Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/processor-ppc.h  Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,15 @@
++#ifndef __UM_PROCESSOR_PPC_H
++#define __UM_PROCESSOR_PPC_H
++
++#if defined(__ASSEMBLY__)
++
++#define CONFIG_ALL_PPC
++#include "arch/processor.h"
++
++#else
++
++#include "asm/processor-generic.h"
++
++#endif
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/ptrace-generic.h um/include/asm-um/ptrace-generic.h
+--- orig/include/asm-um/ptrace-generic.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/ptrace-generic.h Wed Mar 26 22:01:25 2003
+@@ -0,0 +1,74 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_PTRACE_GENERIC_H
++#define __UM_PTRACE_GENERIC_H
++
++#ifndef __ASSEMBLY__
++
++#include "linux/config.h"
++
++#include "asm/current.h"
++
++#define pt_regs pt_regs_subarch
++#define show_regs show_regs_subarch
++
++#include "asm/arch/ptrace.h"
++
++#undef pt_regs
++#undef show_regs
++#undef user_mode
++#undef instruction_pointer
++
++#include "sysdep/ptrace.h"
++#include "skas_ptrace.h"
++
++struct pt_regs {
++      union uml_pt_regs regs;
++};
++
++#define EMPTY_REGS { regs : EMPTY_UML_PT_REGS }
++
++#define PT_REGS_IP(r) UPT_IP(&(r)->regs)
++#define PT_REGS_SP(r) UPT_SP(&(r)->regs)
++
++#define PT_REG(r, reg) UPT_REG(&(r)->regs, reg)
++#define PT_REGS_SET(r, reg, val) UPT_SET(&(r)->regs, reg, val)
++
++#define PT_REGS_SET_SYSCALL_RETURN(r, res) \
++      UPT_SET_SYSCALL_RETURN(&(r)->regs, res)
++#define PT_REGS_RESTART_SYSCALL(r) UPT_RESTART_SYSCALL(&(r)->regs)
++
++#define PT_REGS_SYSCALL_NR(r) UPT_SYSCALL_NR(&(r)->regs)
++
++#define PT_REGS_SC(r) UPT_SC(&(r)->regs)
++
++struct task_struct;
++
++extern unsigned long getreg(struct task_struct *child, int regno);
++extern int putreg(struct task_struct *child, int regno, unsigned long value);
++extern int get_fpregs(unsigned long buf, struct task_struct *child);
++extern int set_fpregs(unsigned long buf, struct task_struct *child);
++extern int get_fpxregs(unsigned long buf, struct task_struct *child);
++extern int set_fpxregs(unsigned long buf, struct task_struct *tsk);
++
++extern void show_regs(struct pt_regs *regs);
++
++#define INIT_TASK_SIZE ((1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE)
++
++#endif
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/include/asm-um/ptrace-i386.h um/include/asm-um/ptrace-i386.h
+--- orig/include/asm-um/ptrace-i386.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/ptrace-i386.h    Wed Mar 26 22:01:25 2003
+@@ -0,0 +1,46 @@
++/* 
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_PTRACE_I386_H
++#define __UM_PTRACE_I386_H
++
++#include "sysdep/ptrace.h"
++#include "asm/ptrace-generic.h"
++
++#define PT_REGS_EAX(r) UPT_EAX(&(r)->regs)
++#define PT_REGS_EBX(r) UPT_EBX(&(r)->regs)
++#define PT_REGS_ECX(r) UPT_ECX(&(r)->regs)
++#define PT_REGS_EDX(r) UPT_EDX(&(r)->regs)
++#define PT_REGS_ESI(r) UPT_ESI(&(r)->regs)
++#define PT_REGS_EDI(r) UPT_EDI(&(r)->regs)
++#define PT_REGS_EBP(r) UPT_EBP(&(r)->regs)
++
++#define PT_REGS_CS(r) UPT_CS(&(r)->regs)
++#define PT_REGS_SS(r) UPT_SS(&(r)->regs)
++#define PT_REGS_DS(r) UPT_DS(&(r)->regs)
++#define PT_REGS_ES(r) UPT_ES(&(r)->regs)
++#define PT_REGS_FS(r) UPT_FS(&(r)->regs)
++#define PT_REGS_GS(r) UPT_GS(&(r)->regs)
++
++#define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs)
++
++#define PT_REGS_ORIG_SYSCALL(r) PT_REGS_EAX(r)
++#define PT_REGS_SYSCALL_RET(r) PT_REGS_EAX(r)
++#define PT_FIX_EXEC_STACK(sp) do ; while(0)
++
++#define user_mode(r) UPT_IS_USER(&(r)->regs)
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/include/asm-um/resource.h um/include/asm-um/resource.h
+--- orig/include/asm-um/resource.h     Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/resource.h       Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_RESOURCE_H
++#define __UM_RESOURCE_H
++
++#include "asm/arch/resource.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/rwlock.h um/include/asm-um/rwlock.h
+--- orig/include/asm-um/rwlock.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/rwlock.h Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_RWLOCK_H
++#define __UM_RWLOCK_H
++
++#include "asm/arch/rwlock.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/rwsem.h um/include/asm-um/rwsem.h
+--- orig/include/asm-um/rwsem.h        Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/rwsem.h  Wed Apr 16 13:59:03 2003
+@@ -0,0 +1,10 @@
++#ifndef __UM_RWSEM_H__
++#define __UM_RWSEM_H__
++
++#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
++#define __builtin_expect(exp,c) (exp)
++#endif
++
++#include "asm/arch/rwsem.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/scatterlist.h um/include/asm-um/scatterlist.h
+--- orig/include/asm-um/scatterlist.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/scatterlist.h    Thu Feb 27 13:21:49 2003
+@@ -0,0 +1,6 @@
++#ifndef __UM_SCATTERLIST_H
++#define __UM_SCATTERLIST_H
++
++#include "asm/arch/scatterlist.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/segment.h um/include/asm-um/segment.h
+--- orig/include/asm-um/segment.h      Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/segment.h        Fri Nov  1 19:45:34 2002
+@@ -0,0 +1,4 @@
++#ifndef __UM_SEGMENT_H
++#define __UM_SEGMENT_H
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/semaphore.h um/include/asm-um/semaphore.h
+--- orig/include/asm-um/semaphore.h    Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/semaphore.h      Wed Apr 16 13:59:03 2003
+@@ -0,0 +1,6 @@
++#ifndef __UM_SEMAPHORE_H
++#define __UM_SEMAPHORE_H
++
++#include "asm/arch/semaphore.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/sembuf.h um/include/asm-um/sembuf.h
+--- orig/include/asm-um/sembuf.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/sembuf.h Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_SEMBUF_H
++#define __UM_SEMBUF_H
++
++#include "asm/arch/sembuf.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/serial.h um/include/asm-um/serial.h
+--- orig/include/asm-um/serial.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/serial.h Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_SERIAL_H
++#define __UM_SERIAL_H
++
++#include "asm/arch/serial.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/shmbuf.h um/include/asm-um/shmbuf.h
+--- orig/include/asm-um/shmbuf.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/shmbuf.h Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_SHMBUF_H
++#define __UM_SHMBUF_H
++
++#include "asm/arch/shmbuf.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/shmparam.h um/include/asm-um/shmparam.h
+--- orig/include/asm-um/shmparam.h     Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/shmparam.h       Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_SHMPARAM_H
++#define __UM_SHMPARAM_H
++
++#include "asm/arch/shmparam.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/sigcontext-generic.h um/include/asm-um/sigcontext-generic.h
+--- orig/include/asm-um/sigcontext-generic.h   Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/sigcontext-generic.h     Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_SIGCONTEXT_GENERIC_H
++#define __UM_SIGCONTEXT_GENERIC_H
++
++#include "asm/arch/sigcontext.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/sigcontext-i386.h um/include/asm-um/sigcontext-i386.h
+--- orig/include/asm-um/sigcontext-i386.h      Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/sigcontext-i386.h        Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_SIGCONTEXT_I386_H
++#define __UM_SIGCONTEXT_I386_H
++
++#include "asm/sigcontext-generic.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/sigcontext-ppc.h um/include/asm-um/sigcontext-ppc.h
+--- orig/include/asm-um/sigcontext-ppc.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/sigcontext-ppc.h Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,10 @@
++#ifndef __UM_SIGCONTEXT_PPC_H
++#define __UM_SIGCONTEXT_PPC_H
++
++#define pt_regs sys_pt_regs
++
++#include "asm/sigcontext-generic.h"
++
++#undef pt_regs
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/siginfo.h um/include/asm-um/siginfo.h
+--- orig/include/asm-um/siginfo.h      Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/siginfo.h        Wed Mar 26 22:01:25 2003
+@@ -0,0 +1,6 @@
++#ifndef __UM_SIGINFO_H
++#define __UM_SIGINFO_H
++
++#include "asm/arch/siginfo.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/signal.h um/include/asm-um/signal.h
+--- orig/include/asm-um/signal.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/signal.h Wed Mar 26 22:01:25 2003
+@@ -0,0 +1,22 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_SIGNAL_H
++#define __UM_SIGNAL_H
++
++#include "asm/arch/signal.h"
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/include/asm-um/smp.h um/include/asm-um/smp.h
+--- orig/include/asm-um/smp.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/smp.h    Wed Mar 26 22:01:25 2003
+@@ -0,0 +1,19 @@
++#ifndef __UM_SMP_H
++#define __UM_SMP_H
++
++#ifdef CONFIG_SMP
++
++#include "linux/config.h"
++#include "asm/current.h"
++
++#define smp_processor_id() (current->processor)
++#define cpu_logical_map(n) (n)
++#define cpu_number_map(n) (n)
++#define PROC_CHANGE_PENALTY   15 /* Pick a number, any number */
++extern int hard_smp_processor_id(void);
++extern unsigned long cpu_online_map;
++#define NO_PROC_ID -1
++
++#endif
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/smplock.h um/include/asm-um/smplock.h
+--- orig/include/asm-um/smplock.h      Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/smplock.h        Wed Apr 16 13:59:04 2003
+@@ -0,0 +1,6 @@
++#ifndef __UM_SMPLOCK_H
++#define __UM_SMPLOCK_H
++
++#include "asm/arch/smplock.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/socket.h um/include/asm-um/socket.h
+--- orig/include/asm-um/socket.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/socket.h Thu Feb 27 13:20:13 2003
+@@ -0,0 +1,6 @@
++#ifndef __UM_SOCKET_H
++#define __UM_SOCKET_H
++
++#include "asm/arch/socket.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/sockios.h um/include/asm-um/sockios.h
+--- orig/include/asm-um/sockios.h      Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/sockios.h        Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_SOCKIOS_H
++#define __UM_SOCKIOS_H
++
++#include "asm/arch/sockios.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/softirq.h um/include/asm-um/softirq.h
+--- orig/include/asm-um/softirq.h      Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/softirq.h        Wed Apr 16 13:59:04 2003
+@@ -0,0 +1,13 @@
++#ifndef __UM_SOFTIRQ_H
++#define __UM_SOFTIRQ_H
++
++#include "linux/smp.h"
++#include "asm/system.h"
++#include "asm/processor.h"
++
++/* A gratuitous name change */
++#define i386_bh_lock um_bh_lock
++#include "asm/arch/softirq.h"
++#undef i386_bh_lock
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/spinlock.h um/include/asm-um/spinlock.h
+--- orig/include/asm-um/spinlock.h     Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/spinlock.h       Wed Mar 26 22:01:25 2003
+@@ -0,0 +1,10 @@
++#ifndef __UM_SPINLOCK_H
++#define __UM_SPINLOCK_H
++
++#include "linux/config.h"
++
++#ifdef CONFIG_SMP
++#include "asm/arch/spinlock.h"
++#endif
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/stat.h um/include/asm-um/stat.h
+--- orig/include/asm-um/stat.h Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/stat.h   Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_STAT_H
++#define __UM_STAT_H
++
++#include "asm/arch/stat.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/statfs.h um/include/asm-um/statfs.h
+--- orig/include/asm-um/statfs.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/statfs.h Wed Mar 26 22:01:25 2003
+@@ -0,0 +1,6 @@
++#ifndef _UM_STATFS_H
++#define _UM_STATFS_H
++
++#include "asm/arch/statfs.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/string.h um/include/asm-um/string.h
+--- orig/include/asm-um/string.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/string.h Mon Feb 24 22:52:09 2003
+@@ -0,0 +1,7 @@
++#ifndef __UM_STRING_H
++#define __UM_STRING_H
++
++#include "asm/arch/string.h"
++#include "asm/archparam.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/system-generic.h um/include/asm-um/system-generic.h
+--- orig/include/asm-um/system-generic.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/system-generic.h Wed Mar 26 22:01:25 2003
+@@ -0,0 +1,50 @@
++#ifndef __UM_SYSTEM_GENERIC_H
++#define __UM_SYSTEM_GENERIC_H
++
++#include "asm/arch/system.h"
++
++#undef prepare_to_switch
++#undef switch_to
++#undef __save_flags
++#undef save_flags
++#undef __restore_flags
++#undef restore_flags
++#undef __cli
++#undef __sti
++#undef cli
++#undef sti
++#undef local_irq_save
++#undef local_irq_restore
++#undef local_irq_disable
++#undef local_irq_enable
++
++#define prepare_to_switch() do ; while(0)
++
++void *_switch_to(void *prev, void *next);
++
++#define switch_to(prev, next, last) prev = _switch_to(prev, next)
++
++extern int get_signals(void);
++extern int set_signals(int enable);
++extern void block_signals(void);
++extern void unblock_signals(void);
++
++#define local_irq_save(flags) do { (flags) = set_signals(0); } while(0)
++
++#define local_irq_restore(flags) do { set_signals(flags); } while(0)
++
++#define local_irq_enable() unblock_signals()
++#define local_irq_disable() block_signals()
++
++#define __sti() unblock_signals()
++#define sti() unblock_signals()
++#define __cli() block_signals()
++#define cli() block_signals()
++
++#define __save_flags(x) do { (x) = get_signals(); } while(0)
++#define save_flags(x) __save_flags(x)
++
++#define __restore_flags(x) local_irq_restore(x)
++#define restore_flags(x) __restore_flags(x)
++
++#endif
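
Since UML has no real interrupt flag, "disabling interrupts" above means blocking the host signals that deliver them: local_irq_save() records the previous enable state returned by set_signals(0), and local_irq_restore() puts it back. The caller-side pattern is unchanged from any other port; a minimal sketch:

    static void update_counter(unsigned long *counter)
    {
            unsigned long flags;

            local_irq_save(flags);        /* flags = set_signals(0): block delivery  */
            (*counter)++;                 /* no UML "interrupt" can run in between   */
            local_irq_restore(flags);     /* set_signals(flags): previous state back */
    }
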
+diff -Naur -X ../exclude-files orig/include/asm-um/system-i386.h um/include/asm-um/system-i386.h
+--- orig/include/asm-um/system-i386.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/system-i386.h    Wed Mar 26 22:01:25 2003
+@@ -0,0 +1,39 @@
++#ifndef __UM_SYSTEM_I386_H
++#define __UM_SYSTEM_I386_H
++
++#include "asm/system-generic.h"
++
++#define __HAVE_ARCH_CMPXCHG 1
++
++static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
++                                    unsigned long new, int size)
++{
++      unsigned long prev;
++      switch (size) {
++      case 1:
++              __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
++                                   : "=a"(prev)
++                                   : "q"(new), "m"(*__xg(ptr)), "0"(old)
++                                   : "memory");
++              return prev;
++      case 2:
++              __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
++                                   : "=a"(prev)
++                                   : "q"(new), "m"(*__xg(ptr)), "0"(old)
++                                   : "memory");
++              return prev;
++      case 4:
++              __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
++                                   : "=a"(prev)
++                                   : "q"(new), "m"(*__xg(ptr)), "0"(old)
++                                   : "memory");
++              return prev;
++      }
++      return old;
++}
++
++#define cmpxchg(ptr,o,n)\
++      ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
++                                      (unsigned long)(n),sizeof(*(ptr))))
++    
++#endif
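
cmpxchg() above emits the host's locked cmpxchg instruction and hands back the value that was actually in memory, so a caller can tell whether its update won. A minimal compare-and-swap retry loop built on it (the function name is illustrative):

    static inline void atomic_bump(unsigned long *v)
    {
            unsigned long old, seen;

            do {
                    old = *v;
                    seen = cmpxchg(v, old, old + 1);  /* returns prior contents */
            } while (seen != old);                    /* lost the race: retry   */
    }
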
+diff -Naur -X ../exclude-files orig/include/asm-um/system-ppc.h um/include/asm-um/system-ppc.h
+--- orig/include/asm-um/system-ppc.h   Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/system-ppc.h     Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,12 @@
++#ifndef __UM_SYSTEM_PPC_H
++#define __UM_SYSTEM_PPC_H
++
++#define _switch_to _ppc_switch_to
++
++#include "asm/arch/system.h"
++
++#undef _switch_to
++ 
++#include "asm/system-generic.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/termbits.h um/include/asm-um/termbits.h
+--- orig/include/asm-um/termbits.h     Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/termbits.h       Wed Oct 23 21:11:14 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_TERMBITS_H
++#define __UM_TERMBITS_H
++
++#include "asm/arch/termbits.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/termios.h um/include/asm-um/termios.h
+--- orig/include/asm-um/termios.h      Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/termios.h        Thu Feb 27 13:20:13 2003
+@@ -0,0 +1,6 @@
++#ifndef __UM_TERMIOS_H
++#define __UM_TERMIOS_H
++
++#include "asm/arch/termios.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/timex.h um/include/asm-um/timex.h
+--- orig/include/asm-um/timex.h        Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/timex.h  Wed Mar 26 22:01:25 2003
+@@ -0,0 +1,18 @@
++#ifndef __UM_TIMEX_H
++#define __UM_TIMEX_H
++
++#include "linux/time.h"
++
++typedef unsigned long cycles_t;
++
++#define cacheflush_time (0)
++
++static inline cycles_t get_cycles (void)
++{
++      return 0;
++}
++
++#define vxtime_lock()         do ; while (0)
++#define vxtime_unlock()               do ; while (0)
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/tlb.h um/include/asm-um/tlb.h
+--- orig/include/asm-um/tlb.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/tlb.h    Wed Oct 23 21:08:05 2002
+@@ -0,0 +1 @@
++#include <asm-generic/tlb.h>
+diff -Naur -X ../exclude-files orig/include/asm-um/types.h um/include/asm-um/types.h
+--- orig/include/asm-um/types.h        Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/types.h  Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_TYPES_H
++#define __UM_TYPES_H
++
++#include "asm/arch/types.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/uaccess.h um/include/asm-um/uaccess.h
+--- orig/include/asm-um/uaccess.h      Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/uaccess.h        Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,97 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_UACCESS_H
++#define __UM_UACCESS_H
++
++#define VERIFY_READ 0
++#define VERIFY_WRITE 1
++
++/*
++ * The fs value determines whether argument validity checking should be
++ * performed or not.  If get_fs() == USER_DS, checking is performed, with
++ * get_fs() == KERNEL_DS, checking is bypassed.
++ *
++ * For historical reasons, these macros are grossly misnamed.
++ */
++
++#define MAKE_MM_SEG(s)        ((mm_segment_t) { (s) })
++
++#define KERNEL_DS     MAKE_MM_SEG(0xFFFFFFFF)
++#define USER_DS               MAKE_MM_SEG(TASK_SIZE)
++
++#define get_ds()      (KERNEL_DS)
++#define get_fs()      (current->addr_limit)
++#define set_fs(x)     (current->addr_limit = (x))
++
++#define segment_eq(a, b) ((a).seg == (b).seg)
++
++#include "um_uaccess.h"
++
++#define __copy_from_user(to, from, n) copy_from_user(to, from, n)
++
++#define __copy_to_user(to, from, n) copy_to_user(to, from, n)
++
++#define __get_user(x, ptr) \
++({ \
++        const __typeof__(ptr) __private_ptr = ptr; \
++        __typeof__(*(__private_ptr)) __private_val; \
++        int __private_ret = -EFAULT; \
++        (x) = 0; \
++      if (__copy_from_user(&__private_val, (__private_ptr), \
++          sizeof(*(__private_ptr))) == 0) {\
++              (x) = (__typeof__(*(__private_ptr))) __private_val; \
++              __private_ret = 0; \
++      } \
++        __private_ret; \
++}) 
++
++#define get_user(x, ptr) \
++({ \
++        const __typeof__((*ptr)) *private_ptr = (ptr); \
++        (access_ok(VERIFY_READ, private_ptr, sizeof(*private_ptr)) ? \
++       __get_user(x, private_ptr) : ((x) = 0, -EFAULT)); \
++})
++
++#define __put_user(x, ptr) \
++({ \
++        __typeof__(ptr) __private_ptr = ptr; \
++        __typeof__(*(__private_ptr)) __private_val; \
++        int __private_ret = -EFAULT; \
++        __private_val = (__typeof__(*(__private_ptr))) (x); \
++        if (__copy_to_user((__private_ptr), &__private_val, \
++                         sizeof(*(__private_ptr))) == 0) { \
++              __private_ret = 0; \
++      } \
++        __private_ret; \
++})
++
++#define put_user(x, ptr) \
++({ \
++        __typeof__(*(ptr)) *private_ptr = (ptr); \
++        (access_ok(VERIFY_WRITE, private_ptr, sizeof(*private_ptr)) ? \
++       __put_user(x, private_ptr) : -EFAULT); \
++})
++
++#define strlen_user(str) strnlen_user(str, ~0UL >> 1)
++
++struct exception_table_entry
++{
++        unsigned long insn;
++      unsigned long fixup;
++};
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
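The get_user()/put_user() macros above return 0 on success and -EFAULT when access_ok() or the underlying copy fails, matching the native convention. A hypothetical helper showing the pattern (bump_user_counter is an illustrative name):

        /* hypothetical helper: copy an int from user space, bump it, copy it back */
        static int bump_user_counter(int *uptr)
        {
                int val;

                if (get_user(val, uptr))        /* -EFAULT if access_ok()/copy fails */
                        return -EFAULT;
                val++;
                return put_user(val, uptr);     /* 0 on success, -EFAULT on failure */
        }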
+diff -Naur -X ../exclude-files orig/include/asm-um/ucontext.h um/include/asm-um/ucontext.h
+--- orig/include/asm-um/ucontext.h     Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/ucontext.h       Sun Dec  1 13:20:58 2002
+@@ -0,0 +1,6 @@
++#ifndef _ASM_UM_UCONTEXT_H
++#define _ASM_UM_UCONTEXT_H
++
++#include "asm/arch/ucontext.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/unaligned.h um/include/asm-um/unaligned.h
+--- orig/include/asm-um/unaligned.h    Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/unaligned.h      Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_UNALIGNED_H
++#define __UM_UNALIGNED_H
++
++#include "asm/arch/unaligned.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/unistd.h um/include/asm-um/unistd.h
+--- orig/include/asm-um/unistd.h       Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/unistd.h Wed Mar 26 22:01:27 2003
+@@ -0,0 +1,118 @@
++/* 
++ * Copyright (C) 2000, 2001  Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef _UM_UNISTD_H_
++#define _UM_UNISTD_H_
++
++#include "linux/resource.h"
++#include "asm/uaccess.h"
++
++extern long sys_open(const char *filename, int flags, int mode);
++extern long sys_dup(unsigned int fildes);
++extern long sys_close(unsigned int fd);
++extern int um_execve(const char *file, char *const argv[], char *const env[]);
++extern long sys_setsid(void);
++extern long sys_waitpid(pid_t pid, unsigned int * stat_addr, int options);
++extern long sys_wait4(pid_t pid,unsigned int *stat_addr, int options, 
++                    struct rusage *ru);
++extern long sys_mount(char *dev_name, char *dir_name, char *type, 
++                    unsigned long flags, void *data);
++extern long sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, 
++                     struct timeval *tvp);
++extern long sys_lseek(unsigned int fildes, unsigned long offset, int whence);
++extern long sys_read(unsigned int fildes, char *buf, int len);
++extern long sys_write(unsigned int fildes, char *buf, int len);
++
++#ifdef __KERNEL_SYSCALLS__
++
++#define KERNEL_CALL(ret_t, sys, args...)      \
++      mm_segment_t fs = get_fs();             \
++      ret_t ret;                              \
++      set_fs(KERNEL_DS);                      \
++      ret = sys(args);                        \
++      set_fs(fs);                             \
++      return ret;
++
++static inline long open(const char *pathname, int flags, int mode) 
++{
++      KERNEL_CALL(int, sys_open, pathname, flags, mode)
++}
++
++static inline long dup(unsigned int fd)
++{
++      KERNEL_CALL(int, sys_dup, fd);
++}
++
++static inline long close(unsigned int fd)
++{
++      KERNEL_CALL(int, sys_close, fd);
++}
++
++static inline int execve(const char *filename, char *const argv[], 
++                       char *const envp[])
++{
++      KERNEL_CALL(int, um_execve, filename, argv, envp);
++}
++
++static inline long waitpid(pid_t pid, unsigned int *status, int options)
++{
++      KERNEL_CALL(pid_t, sys_wait4, pid, status, options, NULL)
++}
++
++static inline pid_t wait(int *status)
++{
++      KERNEL_CALL(pid_t, sys_wait4, -1, status, 0, NULL)
++}
++
++static inline pid_t setsid(void)
++{
++      KERNEL_CALL(pid_t, sys_setsid)
++}
++
++static inline long lseek(unsigned int fd, off_t offset, unsigned int whence)
++{
++      KERNEL_CALL(long, sys_lseek, fd, offset, whence)
++}
++
++static inline int read(unsigned int fd, char * buf, int len)
++{
++      KERNEL_CALL(int, sys_read, fd, buf, len)
++}
++
++static inline int write(unsigned int fd, char * buf, int len)
++{
++      KERNEL_CALL(int, sys_write, fd, buf, len)
++}
++
++#endif
++
++/* Save the value of __KERNEL_SYSCALLS__, undefine it, include the underlying
++ * arch's unistd.h for the system call numbers, and restore the old 
++ * __KERNEL_SYSCALLS__.
++ */
++
++#ifdef __KERNEL_SYSCALLS__
++#define __SAVE_KERNEL_SYSCALLS__ __KERNEL_SYSCALLS__
++#endif
++
++#undef __KERNEL_SYSCALLS__
++#include "asm/arch/unistd.h"
++
++#ifdef __SAVE_KERNEL_SYSCALLS__
++#define __KERNEL_SYSCALLS__ __SAVE_KERNEL_SYSCALLS__
++#endif
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/include/asm-um/user.h um/include/asm-um/user.h
+--- orig/include/asm-um/user.h Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/user.h   Wed Apr 16 13:59:45 2003
+@@ -0,0 +1,6 @@
++#ifndef __UM_USER_H
++#define __UM_USER_H
++
++#include "asm/arch/user.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/vga.h um/include/asm-um/vga.h
+--- orig/include/asm-um/vga.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/vga.h    Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_VGA_H
++#define __UM_VGA_H
++
++#include "asm/arch/vga.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/asm-um/xor.h um/include/asm-um/xor.h
+--- orig/include/asm-um/xor.h  Wed Dec 31 19:00:00 1969
++++ um/include/asm-um/xor.h    Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,6 @@
++#ifndef __UM_XOR_H
++#define __UM_XOR_H
++
++#include "asm-generic/xor.h"
++
++#endif
+diff -Naur -X ../exclude-files orig/include/linux/blk.h um/include/linux/blk.h
+--- orig/include/linux/blk.h   Sun Sep 15 12:13:19 2002
++++ um/include/linux/blk.h     Wed Apr 16 13:59:04 2003
+@@ -320,6 +320,15 @@
+ #define DEVICE_REQUEST do_ida_request
+ #define DEVICE_NR(device) (MINOR(device) >> 4)
++#elif (MAJOR_NR == UBD_MAJOR)
++
++#define DEVICE_NAME "User-mode block device"
++#define DEVICE_INTR do_ubd
++#define DEVICE_REQUEST do_ubd_request
++#define DEVICE_NR(device) (MINOR(device) >> UBD_SHIFT)
++#define DEVICE_ON(device)
++#define DEVICE_OFF(device)
++
+ #endif /* MAJOR_NR == whatever */
+ /* provide DEVICE_xxx defaults, if not explicitly defined
+diff -Naur -X ../exclude-files orig/include/linux/fs.h um/include/linux/fs.h
+--- orig/include/linux/fs.h    Thu Feb 27 13:04:27 2003
++++ um/include/linux/fs.h      Wed Apr 16 13:59:03 2003
+@@ -318,6 +318,8 @@
+ #include <linux/ncp_fs_i.h>
+ #include <linux/proc_fs_i.h>
+ #include <linux/usbdev_fs_i.h>
++#include <linux/hostfs_fs_i.h>
++#include <linux/hppfs_fs_i.h>
+ #include <linux/jffs2_fs_i.h>
+ #include <linux/cramfs_fs_sb.h>
+@@ -509,7 +511,9 @@
+               struct proc_inode_info          proc_i;
+               struct socket                   socket_i;
+               struct usbdev_inode_info        usbdev_i;
+-              struct jffs2_inode_info         jffs2_i;
++              struct hostfs_inode_info        hostfs_i;
++              struct hppfs_inode_info         hppfs_i;
++              struct jffs2_inode_info         jffs2_i;
+               void                            *generic_ip;
+       } u;
+ };
+diff -Naur -X ../exclude-files orig/include/linux/hostfs_fs_i.h um/include/linux/hostfs_fs_i.h
+--- orig/include/linux/hostfs_fs_i.h   Wed Dec 31 19:00:00 1969
++++ um/include/linux/hostfs_fs_i.h     Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,21 @@
++#ifndef _HOSTFS_FS_I
++#define _HOSTFS_FS_I
++
++struct hostfs_inode_info {
++      char *host_filename;
++      int fd;
++      int mode;
++};
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/include/linux/hppfs_fs_i.h um/include/linux/hppfs_fs_i.h
+--- orig/include/linux/hppfs_fs_i.h    Wed Dec 31 19:00:00 1969
++++ um/include/linux/hppfs_fs_i.h      Wed Oct 23 21:08:05 2002
+@@ -0,0 +1,19 @@
++#ifndef _HPPFS_FS_I
++#define _HPPFS_FS_I
++
++struct hppfs_inode_info {
++      struct dentry *proc_dentry;
++};
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/include/linux/kernel.h um/include/linux/kernel.h
+--- orig/include/linux/kernel.h        Thu Feb 27 13:04:27 2003
++++ um/include/linux/kernel.h  Wed Mar 26 22:01:25 2003
+@@ -49,7 +49,7 @@
+ # define ATTRIB_NORET  __attribute__((noreturn))
+ # define NORET_AND     noreturn,
+-#ifdef __i386__
++#if defined(__i386__) || defined(UM_FASTCALL)
+ #define FASTCALL(x)   x __attribute__((regparm(3)))
+ #else
+ #define FASTCALL(x)   x
+diff -Naur -X ../exclude-files orig/include/linux/kernel_stat.h um/include/linux/kernel_stat.h
+--- orig/include/linux/kernel_stat.h   Thu Feb 27 13:04:27 2003
++++ um/include/linux/kernel_stat.h     Wed Apr 16 13:59:39 2003
+@@ -12,7 +12,7 @@
+  * used by rstatd/perfmeter
+  */
+-#define DK_MAX_MAJOR 16
++#define DK_MAX_MAJOR 99
+ #define DK_MAX_DISK 16
+ struct kernel_stat {
+diff -Naur -X ../exclude-files orig/include/linux/mm.h um/include/linux/mm.h
+--- orig/include/linux/mm.h    Sun Sep 15 12:13:19 2002
++++ um/include/linux/mm.h      Wed Apr 16 13:59:04 2003
+@@ -425,6 +425,14 @@
+ extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist));
+ extern struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order);
++#ifndef HAVE_ARCH_VALIDATE
++static inline struct page *arch_validate(struct page *page, 
++                                       unsigned int gfp_mask, int order)
++{
++        return(page);
++}
++#endif
++
+ static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order)
+ {
+       /*
+@@ -432,7 +440,7 @@
+        */
+       if (order >= MAX_ORDER)
+               return NULL;
+-      return _alloc_pages(gfp_mask, order);
++      return arch_validate(_alloc_pages(gfp_mask, order), gfp_mask, order);
+ }
+ #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
+@@ -492,6 +500,9 @@
+ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
+               int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
++extern long do_mprotect(struct mm_struct *mm, unsigned long start, 
++                      size_t len, unsigned long prot);
++
+ /*
+  * On a two-level page table, this ends up being trivial. Thus the
+  * inlining and the symmetry break with pte_alloc() that does all
+@@ -539,9 +550,10 @@
+ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+-extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+-      unsigned long len, unsigned long prot,
+-      unsigned long flag, unsigned long pgoff);
++extern unsigned long do_mmap_pgoff(struct mm_struct *mm, struct file *file, 
++                                 unsigned long addr, unsigned long len,
++                                 unsigned long prot, unsigned long flag,
++                                 unsigned long pgoff);
+ static inline unsigned long do_mmap(struct file *file, unsigned long addr,
+       unsigned long len, unsigned long prot,
+@@ -551,7 +563,8 @@
+       if ((offset + PAGE_ALIGN(len)) < offset)
+               goto out;
+       if (!(offset & ~PAGE_MASK))
+-              ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
++              ret = do_mmap_pgoff(current->mm, file, addr, len, prot, flag, 
++                                  offset >> PAGE_SHIFT);
+ out:
+       return ret;
+ }
+diff -Naur -X ../exclude-files orig/include/linux/proc_mm.h um/include/linux/proc_mm.h
+--- orig/include/linux/proc_mm.h       Wed Dec 31 19:00:00 1969
++++ um/include/linux/proc_mm.h Wed Apr 16 13:59:47 2003
+@@ -0,0 +1,48 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __PROC_MM_H
++#define __PROC_MM_H
++
++#include "linux/sched.h"
++
++#define MM_MMAP 54
++#define MM_MUNMAP 55
++#define MM_MPROTECT 56
++#define MM_COPY_SEGMENTS 57
++
++struct mm_mmap {
++      unsigned long addr;
++      unsigned long len;
++      unsigned long prot;
++      unsigned long flags;
++      unsigned long fd;
++      unsigned long offset;
++};
++
++struct mm_munmap {
++      unsigned long addr;
++      unsigned long len;      
++};
++
++struct mm_mprotect {
++      unsigned long addr;
++      unsigned long len;
++        unsigned int prot;
++};
++
++struct proc_mm_op {
++      int op;
++      union {
++              struct mm_mmap mmap;
++              struct mm_munmap munmap;
++              struct mm_mprotect mprotect;
++              int copy_segments;
++      } u;
++};
++
++extern struct mm_struct *proc_mm_get_mm(int fd);
++
++#endif
+diff -Naur -X ../exclude-files orig/include/linux/tty.h um/include/linux/tty.h
+--- orig/include/linux/tty.h   Thu Feb 27 13:04:28 2003
++++ um/include/linux/tty.h     Wed Apr 16 13:59:04 2003
+@@ -309,6 +309,9 @@
+       spinlock_t read_lock;
+       /* If the tty has a pending do_SAK, queue it here - akpm */
+       struct tq_struct SAK_tq;
++#ifdef CONFIG_TTY_LOG
++        int log_fd;
++#endif
+ };
+ /* tty magic number */
+@@ -366,6 +369,7 @@
+ extern int specialix_init(void);
+ extern int espserial_init(void);
+ extern int macserial_init(void);
++extern int stdio_init(void);
+ extern int a2232board_init(void);
+ extern int tty_paranoia_check(struct tty_struct *tty, kdev_t device,
+@@ -420,6 +424,8 @@
+ extern int vt_ioctl(struct tty_struct *tty, struct file * file,
+                   unsigned int cmd, unsigned long arg);
++
++extern void stdio_console_init(void);
+ #endif /* __KERNEL__ */
+ #endif
+diff -Naur -X ../exclude-files orig/init/do_mounts.c um/init/do_mounts.c
+--- orig/init/do_mounts.c      Thu Feb 27 13:04:28 2003
++++ um/init/do_mounts.c        Thu Feb 27 13:05:27 2003
+@@ -153,6 +153,22 @@
+       { "pf",         0x2f00 },
+       { "apblock", APBLOCK_MAJOR << 8},
+       { "ddv", DDV_MAJOR << 8},
++      { "ubd0", UBD_MAJOR << 8 | 0 << 4},
++      { "ubda", UBD_MAJOR << 8 | 0 << 4},
++      { "ubd1", UBD_MAJOR << 8 | 1 << 4},
++      { "ubdb", UBD_MAJOR << 8 | 1 << 4},
++      { "ubd2", UBD_MAJOR << 8 | 2 << 4},
++      { "ubdc", UBD_MAJOR << 8 | 2 << 4},
++      { "ubd3", UBD_MAJOR << 8 | 3 << 4},
++      { "ubdd", UBD_MAJOR << 8 | 3 << 4},
++      { "ubd4", UBD_MAJOR << 8 | 4 << 4},
++      { "ubde", UBD_MAJOR << 8 | 4 << 4},
++      { "ubd5", UBD_MAJOR << 8 | 5 << 4},
++      { "ubdf", UBD_MAJOR << 8 | 5 << 4},
++      { "ubd6", UBD_MAJOR << 8 | 6 << 4},
++      { "ubdg", UBD_MAJOR << 8 | 6 << 4},
++      { "ubd7", UBD_MAJOR << 8 | 7 << 4},
++      { "ubdh", UBD_MAJOR << 8 | 7 << 4},
+       { "jsfd",    JSFD_MAJOR << 8},
+ #if defined(CONFIG_ARCH_S390)
+       { "dasda", (DASD_MAJOR << MINORBITS) },
+diff -Naur -X ../exclude-files orig/kernel/panic.c um/kernel/panic.c
+--- orig/kernel/panic.c        Thu Feb 27 13:04:29 2003
++++ um/kernel/panic.c  Thu Feb 27 13:05:27 2003
+@@ -66,7 +66,7 @@
+       smp_send_stop();
+ #endif
+-      notifier_call_chain(&panic_notifier_list, 0, NULL);
++      notifier_call_chain(&panic_notifier_list, 0, buf);
+       if (panic_timeout > 0)
+       {
+diff -Naur -X ../exclude-files orig/mm/Makefile um/mm/Makefile
+--- orig/mm/Makefile   Wed Aug 21 11:47:43 2002
++++ um/mm/Makefile     Fri Nov  8 14:21:36 2002
+@@ -17,5 +17,6 @@
+           shmem.o
+ obj-$(CONFIG_HIGHMEM) += highmem.o
++obj-$(CONFIG_PROC_MM) += proc_mm.o
+ include $(TOPDIR)/Rules.make
+diff -Naur -X ../exclude-files orig/mm/mmap.c um/mm/mmap.c
+--- orig/mm/mmap.c     Thu Feb 27 13:04:29 2003
++++ um/mm/mmap.c       Thu Feb 27 13:05:27 2003
+@@ -390,10 +390,11 @@
+       return 0;
+ }
+-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
+-      unsigned long prot, unsigned long flags, unsigned long pgoff)
++unsigned long do_mmap_pgoff(struct mm_struct *mm, struct file * file, 
++                          unsigned long addr, unsigned long len,
++                          unsigned long prot, unsigned long flags, 
++                          unsigned long pgoff)
+ {
+-      struct mm_struct * mm = current->mm;
+       struct vm_area_struct * vma, * prev;
+       unsigned int vm_flags;
+       int correct_wcount = 0;
+diff -Naur -X ../exclude-files orig/mm/mprotect.c um/mm/mprotect.c
+--- orig/mm/mprotect.c Wed Aug 21 11:47:43 2002
++++ um/mm/mprotect.c   Sun Nov 10 20:24:32 2002
+@@ -264,7 +264,8 @@
+       return 0;
+ }
+-asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot)
++long do_mprotect(struct mm_struct *mm, unsigned long start, size_t len, 
++               unsigned long prot)
+ {
+       unsigned long nstart, end, tmp;
+       struct vm_area_struct * vma, * next, * prev;
+@@ -281,9 +282,9 @@
+       if (end == start)
+               return 0;
+-      down_write(&current->mm->mmap_sem);
++      down_write(&mm->mmap_sem);
+-      vma = find_vma_prev(current->mm, start, &prev);
++      vma = find_vma_prev(mm, start, &prev);
+       error = -ENOMEM;
+       if (!vma || vma->vm_start > start)
+               goto out;
+@@ -332,6 +333,11 @@
+               prev->vm_mm->map_count--;
+       }
+ out:
+-      up_write(&current->mm->mmap_sem);
++      up_write(&mm->mmap_sem);
+       return error;
++}
++
++asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot)
++{
++        return(do_mprotect(current->mm, start, len, prot));
+ }
+diff -Naur -X ../exclude-files orig/mm/proc_mm.c um/mm/proc_mm.c
+--- orig/mm/proc_mm.c  Wed Dec 31 19:00:00 1969
++++ um/mm/proc_mm.c    Tue Nov 19 14:20:26 2002
+@@ -0,0 +1,173 @@
++/* 
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/init.h"
++#include "linux/proc_fs.h"
++#include "linux/proc_mm.h"
++#include "linux/file.h"
++#include "asm/uaccess.h"
++#include "asm/mmu_context.h"
++
++static struct file_operations proc_mm_fops;
++
++struct mm_struct *proc_mm_get_mm(int fd)
++{
++      struct mm_struct *ret = ERR_PTR(-EBADF);
++      struct file *file;
++
++      file = fget(fd);
++      if (!file)
++              goto out;
++
++      ret = ERR_PTR(-EINVAL);
++      if(file->f_op != &proc_mm_fops)
++              goto out_fput;
++
++      ret = file->private_data;
++ out_fput:
++      fput(file);
++ out:
++      return(ret);
++}
++
++extern long do_mmap2(struct mm_struct *mm, unsigned long addr, 
++                   unsigned long len, unsigned long prot, 
++                   unsigned long flags, unsigned long fd,
++                   unsigned long pgoff);
++
++static ssize_t write_proc_mm(struct file *file, const char *buffer,
++                           size_t count, loff_t *ppos)
++{
++      struct mm_struct *mm = file->private_data;
++      struct proc_mm_op req;
++      int n, ret;
++
++      if(count > sizeof(req))
++              return(-EINVAL);
++
++      n = copy_from_user(&req, buffer, count);
++      if(n != 0)
++              return(-EFAULT);
++
++      ret = count;
++      switch(req.op){
++      case MM_MMAP: {
++              struct mm_mmap *map = &req.u.mmap;
++
++              ret = do_mmap2(mm, map->addr, map->len, map->prot, 
++                             map->flags, map->fd, map->offset >> PAGE_SHIFT);
++              if((ret & ~PAGE_MASK) == 0)
++                      ret = count;
++      
++              break;
++      }
++      case MM_MUNMAP: {
++              struct mm_munmap *unmap = &req.u.munmap;
++
++              down_write(&mm->mmap_sem);
++              ret = do_munmap(mm, unmap->addr, unmap->len);
++              up_write(&mm->mmap_sem);
++
++              if(ret == 0)
++                      ret = count;
++              break;
++      }
++      case MM_MPROTECT: {
++              struct mm_mprotect *protect = &req.u.mprotect;
++
++              ret = do_mprotect(mm, protect->addr, protect->len, 
++                                protect->prot);
++              if(ret == 0)
++                      ret = count;
++              break;
++      }
++
++      case MM_COPY_SEGMENTS: {
++              struct mm_struct *from = proc_mm_get_mm(req.u.copy_segments);
++
++              if(IS_ERR(from)){
++                      ret = PTR_ERR(from);
++                      break;
++              }
++
++              mm_copy_segments(from, mm);
++              break;
++      }
++      default:
++              ret = -EINVAL;
++              break;
++      }
++
++      return(ret);
++}
++
++static int open_proc_mm(struct inode *inode, struct file *file)
++{
++      struct mm_struct *mm = mm_alloc();
++      int ret;
++
++      ret = -ENOMEM;
++      if(mm == NULL)
++              goto out_mem;
++
++      ret = init_new_context(current, mm);
++      if(ret)
++              goto out_free;
++
++      spin_lock(&mmlist_lock);
++      list_add(&mm->mmlist, &current->mm->mmlist);
++      mmlist_nr++;
++      spin_unlock(&mmlist_lock);
++
++      file->private_data = mm;
++
++      return(0);
++
++ out_free:
++      mmput(mm);
++ out_mem:
++      return(ret);
++}
++
++static int release_proc_mm(struct inode *inode, struct file *file)
++{
++      struct mm_struct *mm = file->private_data;
++
++      mmput(mm);
++      return(0);
++}
++
++static struct file_operations proc_mm_fops = {
++      .open           = open_proc_mm,
++      .release        = release_proc_mm,
++      .write          = write_proc_mm,
++};
++
++static int make_proc_mm(void)
++{
++      struct proc_dir_entry *ent;
++
++      ent = create_proc_entry("mm", 0222, &proc_root);
++      if(ent == NULL){
++              printk("make_proc_mm : Failed to register /proc/mm\n");
++              return(0);
++      }
++      ent->proc_fops = &proc_mm_fops;
++
++      return(0);
++}
++
++__initcall(make_proc_mm);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/mm/slab.c um/mm/slab.c
+--- orig/mm/slab.c     Thu Feb 27 13:04:29 2003
++++ um/mm/slab.c       Thu Feb 27 13:05:27 2003
+@@ -1946,10 +1946,14 @@
+       name = cachep->name; 
+       {
++        mm_segment_t fs;
+       char tmp; 
++      fs = get_fs();
++      set_fs(KERNEL_DS);
+       if (__get_user(tmp, name)) 
+               name = "broken"; 
+-      }       
++      set_fs(fs);
++      }
+       seq_printf(m, "%-17s %6lu %6lu %6u %4lu %4lu %4u",
+               name, active_objs, num_objs, cachep->objsize,
index fafdf90..228d086 100644 (file)
@@ -1,11 +1,12 @@
 
 
 
- 0 files changed
+ arch/um/kernel/mem.c |   15 +++++++++++++++
+ 1 files changed, 15 insertions(+)
 
---- linux-2.4.18-17.8.0/arch/um/kernel/mem.c~uml_check_get_page        2002-12-06 14:52:30.000000000 -0800
-+++ linux-2.4.18-17.8.0-zab/arch/um/kernel/mem.c       2002-12-06 14:52:30.000000000 -0800
-@@ -529,6 +529,21 @@ struct page *pte_mem_map(pte_t pte)
+--- linux-2.4.20/arch/um/kernel/mem.c~uml_check_get_page       2003-04-08 23:34:50.000000000 -0600
++++ linux-2.4.20-braam/arch/um/kernel/mem.c    2003-04-08 23:34:50.000000000 -0600
+@@ -712,6 +712,21 @@ struct page *pte_mem_map(pte_t pte)
        return(phys_mem_map(pte_val(pte)));
  }
  
index b0c305b..59069f9 100644 (file)
@@ -1,11 +1,12 @@
 
 
 
- 0 files changed
+ arch/um/kernel/mem.c |    8 ++++++--
+ 1 files changed, 6 insertions(+), 2 deletions(-)
 
---- linux-2.4.18-17.8.0/arch/um/kernel/mem.c~uml_no_panic      2002-12-06 14:52:30.000000000 -0800
-+++ linux-2.4.18-17.8.0-zab/arch/um/kernel/mem.c       2002-12-06 14:52:30.000000000 -0800
-@@ -559,7 +559,9 @@ struct mem_region *page_region(struct pa
+--- linux-2.4.20/arch/um/kernel/mem.c~uml_no_panic     2003-04-08 23:34:57.000000000 -0600
++++ linux-2.4.20-braam/arch/um/kernel/mem.c    2003-04-08 23:34:57.000000000 -0600
+@@ -742,7 +742,9 @@ struct mem_region *page_region(struct pa
                        return(region);
                }
        }
@@ -16,7 +17,7 @@
        return(NULL);
  }
  
-@@ -581,7 +583,9 @@ unsigned long region_pa(void *virt)
+@@ -814,7 +816,9 @@ extern unsigned long region_pa(void *vir
                   (addr <= region->start + region->len))
                        return(mk_phys(addr - region->start, i));
        }
diff --git a/lustre/kernel_patches/patches/vanilla-2.4.18.patch b/lustre/kernel_patches/patches/vanilla-2.4.18.patch
deleted file mode 100644 (file)
index 00cc57c..0000000
+++ /dev/null
@@ -1,1672 +0,0 @@
---- lum-pristine/include/linux/lustre_version.h        Wed Dec 31 19:00:00 1969
-+++ lum/include/linux/lustre_version.h Tue Nov 26 07:02:14 2002
-@@ -0,0 +1,1 @@
-+#define LUSTRE_KERNEL_VERSION 5
---- lum-pristine/arch/ia64/mm/init.c   Fri Nov  9 17:26:17 2001
-+++ lum/arch/ia64/mm/init.c    Thu Aug  1 18:07:35 2002
-@@ -37,6 +37,12 @@
- static unsigned long totalram_pages;
-+struct page *check_get_page(unsigned long kaddr)
-+{
-+#warning FIXME: Lustre team, is this solid?
-+      return virt_to_page(kaddr);
-+}
-+
- int
- do_check_pgt_cache (int low, int high)
- {
---- lum-pristine/arch/i386/mm/init.c   Fri Dec 21 12:41:53 2001
-+++ lum/arch/i386/mm/init.c    Thu Aug  1 18:07:35 2002
-@@ -43,6 +43,12 @@
- static unsigned long totalram_pages;
- static unsigned long totalhigh_pages;
-+struct page *check_get_page(unsigned long kaddr)
-+{
-+#warning FIXME: Lustre team, is this solid?
-+      return virt_to_page(kaddr);
-+}
-+
- int do_check_pgt_cache(int low, int high)
- {
-       int freed = 0;
---- lum-pristine/drivers/block/blkpg.c Mon Feb 25 14:37:57 2002
-+++ lum/drivers/block/blkpg.c  Thu Aug  1 18:07:35 2002
-@@ -294,3 +294,38 @@
- }
- EXPORT_SYMBOL(blk_ioctl);
-+
-+#define NUM_DEV_NO_WRITE 16
-+static int dev_no_write[NUM_DEV_NO_WRITE];
-+
-+/*
-+ * Debug code for turning block devices "read-only" (will discard writes
-+ * silently).  This is for filesystem crash/recovery testing.
-+ */
-+void dev_set_rdonly(kdev_t dev, int no_write)
-+{
-+      if (dev) {
-+              printk(KERN_WARNING "Turning device %s read-only\n",
-+                     bdevname(dev));
-+              dev_no_write[no_write] = 0xdead0000 + dev;
-+      }
-+}
-+
-+int dev_check_rdonly(kdev_t dev) {
-+      int i;
-+
-+      for (i = 0; i < NUM_DEV_NO_WRITE; i++) {
-+              if ((dev_no_write[i] & 0xffff0000) == 0xdead0000 &&
-+                  dev == (dev_no_write[i] & 0xffff))
-+                      return 1;
-+      }
-+      return 0;
-+}
-+
-+void dev_clear_rdonly(int no_write) {
-+      dev_no_write[no_write] = 0;
-+}
-+
-+EXPORT_SYMBOL(dev_set_rdonly);
-+EXPORT_SYMBOL(dev_check_rdonly);
-+EXPORT_SYMBOL(dev_clear_rdonly);
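A hypothetical sketch of how crash/recovery test code might drive the read-only switch exported above; the only constraint is that the slot index stays below NUM_DEV_NO_WRITE:

        static void crash_test_device(kdev_t dev)
        {
                int slot = 0;                   /* 0 <= slot < NUM_DEV_NO_WRITE */

                dev_set_rdonly(dev, slot);      /* writes to dev are silently dropped */
                /* ... exercise the filesystem, simulate the "crash" ... */
                if (dev_check_rdonly(dev))
                        printk("device is in no-write mode\n");
                dev_clear_rdonly(slot);         /* writes reach the device again */
        }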
---- lum-pristine/drivers/block/loop.c  Fri Dec 21 12:41:53 2001
-+++ lum/drivers/block/loop.c   Thu Aug  1 18:07:35 2002
-@@ -471,6 +471,11 @@
-       spin_unlock_irq(&lo->lo_lock);
-       if (rw == WRITE) {
-+#ifdef CONFIG_DEV_RDONLY
-+              if (dev_check_rdonly(rbh->b_rdev))
-+                      goto err;
-+#endif
-+
-               if (lo->lo_flags & LO_FLAGS_READ_ONLY)
-                       goto err;
-       } else if (rw == READA) {
---- lum-pristine/drivers/ide/ide-disk.c        Fri Dec 21 12:41:54 2001
-+++ lum/drivers/ide/ide-disk.c Thu Aug  1 18:07:35 2002
-@@ -367,6 +367,12 @@
-  */
- static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block)
- {
-+#ifdef CONFIG_DEV_RDONLY
-+      if (rq->cmd == WRITE && dev_check_rdonly(rq->rq_dev)) {
-+              ide_end_request(1, HWGROUP(drive));
-+              return ide_stopped;
-+      }
-+#endif
-       if (IDE_CONTROL_REG)
-               OUT_BYTE(drive->ctl,IDE_CONTROL_REG);
-       OUT_BYTE(0x00, IDE_FEATURE_REG);
---- lum-pristine/fs/ext3/Makefile      Fri Dec 21 12:41:55 2001
-+++ lum/fs/ext3/Makefile       Thu Aug  1 18:07:35 2002
-@@ -9,6 +9,8 @@
- O_TARGET := ext3.o
-+export-objs :=        super.o
-+
- obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-               ioctl.o namei.o super.o symlink.o
- obj-m    := $(O_TARGET)
---- lum-pristine/fs/ext3/super.c       Mon Feb 25 14:38:08 2002
-+++ lum/fs/ext3/super.c        Thu Aug  1 18:07:35 2002
-@@ -1744,7 +1744,7 @@
-       unregister_filesystem(&ext3_fs_type);
- }
--EXPORT_NO_SYMBOLS;
-+EXPORT_SYMBOL(ext3_bread);
- MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
- MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
---- lum-pristine/fs/jbd/commit.c       Mon Feb 25 14:38:08 2002
-+++ lum/fs/jbd/commit.c        Thu Aug  1 18:07:35 2002
-@@ -475,7 +475,7 @@
-            transaction's t_log_list queue, and metadata buffers are on
-            the t_iobuf_list queue.
--         Wait for the transactions in reverse order.  That way we are
-+         Wait for the buffers in reverse order.  That way we are
-          less likely to be woken up until all IOs have completed, and
-          so we incur less scheduling load.
-       */
-@@ -566,8 +566,10 @@
-       jbd_debug(3, "JBD: commit phase 6\n");
--      if (is_journal_aborted(journal))
-+      if (is_journal_aborted(journal)) {
-+              unlock_journal(journal);
-               goto skip_commit;
-+      }
-       /* Done it all: now write the commit record.  We should have
-        * cleaned up our previous buffers by now, so if we are in abort
-@@ -577,6 +579,7 @@
-       descriptor = journal_get_descriptor_buffer(journal);
-       if (!descriptor) {
-               __journal_abort_hard(journal);
-+              unlock_journal(journal);
-               goto skip_commit;
-       }
-       
-@@ -600,7 +603,6 @@
-               put_bh(bh);             /* One for getblk() */
-               journal_unlock_journal_head(descriptor);
-       }
--      lock_journal(journal);
-       /* End of a transaction!  Finally, we can do checkpoint
-            processing: any buffers committed as a result of this
-@@ -609,6 +611,25 @@
- skip_commit:
-+      /* Call any callbacks that had been registered for handles in this
-+       * transaction.  It is up to the callback to free any allocated
-+       * memory.
-+       */
-+      if (!list_empty(&commit_transaction->t_jcb)) {
-+              struct list_head *p, *n;
-+              int error = is_journal_aborted(journal);
-+
-+              list_for_each_safe(p, n, &commit_transaction->t_jcb) {
-+                      struct journal_callback *jcb;
-+
-+                      jcb = list_entry(p, struct journal_callback, jcb_list);
-+                      list_del(p);
-+                      jcb->jcb_func(jcb, error);
-+              }
-+      }
-+
-+      lock_journal(journal);
-+
-       jbd_debug(3, "JBD: commit phase 7\n");
-       J_ASSERT(commit_transaction->t_sync_datalist == NULL);
---- lum-pristine/fs/jbd/journal.c      Mon Feb 25 14:38:08 2002
-+++ lum/fs/jbd/journal.c       Thu Aug  1 18:07:35 2002
-@@ -58,6 +58,7 @@
- #endif
- EXPORT_SYMBOL(journal_flush);
- EXPORT_SYMBOL(journal_revoke);
-+EXPORT_SYMBOL(journal_callback_set);
- EXPORT_SYMBOL(journal_init_dev);
- EXPORT_SYMBOL(journal_init_inode);
---- lum-pristine/fs/jbd/transaction.c  Mon Feb 25 14:38:08 2002
-+++ lum/fs/jbd/transaction.c   Thu Aug  1 18:07:35 2002
-@@ -57,6 +57,7 @@
-       transaction->t_state = T_RUNNING;
-       transaction->t_tid = journal->j_transaction_sequence++;
-       transaction->t_expires = jiffies + journal->j_commit_interval;
-+      INIT_LIST_HEAD(&transaction->t_jcb);
-       /* Set up the commit timer for the new transaction. */
-       J_ASSERT (!journal->j_commit_timer_active);
-@@ -201,6 +202,20 @@
-       return 0;
- }
-+/* Allocate a new handle.  This should probably be in a slab... */
-+static handle_t *new_handle(int nblocks)
-+{
-+      handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
-+      if (!handle)
-+              return NULL;
-+      memset(handle, 0, sizeof (handle_t));
-+      handle->h_buffer_credits = nblocks;
-+      handle->h_ref = 1;
-+      INIT_LIST_HEAD(&handle->h_jcb);
-+
-+      return handle;
-+}
-+
- /*
-  * Obtain a new handle.  
-  *
-@@ -227,14 +242,11 @@
-               handle->h_ref++;
-               return handle;
-       }
--      
--      handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
-+
-+      handle = new_handle(nblocks);
-       if (!handle)
-               return ERR_PTR(-ENOMEM);
--      memset (handle, 0, sizeof (handle_t));
--      handle->h_buffer_credits = nblocks;
--      handle->h_ref = 1;
-       current->journal_info = handle;
-       err = start_this_handle(journal, handle);
-@@ -333,14 +345,11 @@
-       
-       if (is_journal_aborted(journal))
-               return ERR_PTR(-EIO);
--      
--      handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
-+
-+      handle = new_handle(nblocks);
-       if (!handle)
-               return ERR_PTR(-ENOMEM);
--      memset (handle, 0, sizeof (handle_t));
--      handle->h_buffer_credits = nblocks;
--      handle->h_ref = 1;
-       current->journal_info = handle;
-       err = try_start_this_handle(journal, handle);
-@@ -1328,6 +1337,28 @@
- #endif
- /*
-+ * Register a callback function for this handle.  The function will be
-+ * called when the transaction that this handle is part of has been
-+ * committed to disk with the original callback data struct and the
-+ * error status of the journal as parameters.  There is no guarantee of
-+ * ordering between handles within a single transaction, nor between
-+ * callbacks registered on the same handle.
-+ *
-+ * The caller is responsible for allocating the journal_callback struct.
-+ * This lets the caller attach as much extra data to the callback as
-+ * needed while avoiding multiple allocations.  The caller-allocated
-+ * struct must start with a struct journal_callback at offset 0, with
-+ * the caller-specific data afterwards.
-+ */
-+void journal_callback_set(handle_t *handle,
-+                        void (*func)(struct journal_callback *jcb, int error),
-+                        struct journal_callback *jcb)
-+{
-+      list_add(&jcb->jcb_list, &handle->h_jcb);
-+      jcb->jcb_func = func;
-+}
-+
-+/*
-  * All done for a particular handle.
-  *
-  * There is not much action needed here.  We just return any remaining
-@@ -1383,7 +1415,10 @@
-                       wake_up(&journal->j_wait_transaction_locked);
-       }
--      /* 
-+      /* Move callbacks from the handle to the transaction. */
-+      list_splice(&handle->h_jcb, &transaction->t_jcb);
-+
-+      /*
-        * If the handle is marked SYNC, we need to set another commit
-        * going!  We also want to force a commit if the current
-        * transaction is occupying too much of the log, or if the
---- lum-pristine/include/linux/blkdev.h        Mon Nov 26 08:29:17 2001
-+++ lum/include/linux/blkdev.h Mon Aug 12 11:48:39 2002
-@@ -228,4 +228,8 @@
-       return retval;
- }
-+#define CONFIG_DEV_RDONLY
-+void dev_set_rdonly(kdev_t, int);
-+int dev_check_rdonly(kdev_t);
-+void dev_clear_rdonly(int);
- #endif
---- lum-pristine/include/linux/slab.h  Fri Dec 21 12:42:04 2001
-+++ lum/include/linux/slab.h   Mon Aug 12 11:48:38 2002
-@@ -57,6 +57,7 @@
- extern int kmem_cache_shrink(kmem_cache_t *);
- extern void *kmem_cache_alloc(kmem_cache_t *, int);
- extern void kmem_cache_free(kmem_cache_t *, void *);
-+extern int kmem_cache_validate(kmem_cache_t *cachep, void *objp);
- extern void *kmalloc(size_t, int);
- extern void kfree(const void *);
---- lum-pristine/include/linux/jbd.h   Mon Feb 25 14:38:13 2002
-+++ lum/include/linux/jbd.h    Mon Aug 12 11:50:09 2002
-@@ -249,6 +249,13 @@
-       return bh->b_private;
- }
-+#define HAVE_JOURNAL_CALLBACK_STATUS
-+struct journal_callback {
-+      struct list_head jcb_list;
-+      void (*jcb_func)(struct journal_callback *jcb, int error);
-+      /* user data goes here */
-+};
-+
- struct jbd_revoke_table_s;
- /* The handle_t type represents a single atomic update being performed
-@@ -279,6 +286,12 @@
-          operations */
-       int                     h_err;
-+      /* List of application registered callbacks for this handle.
-+       * The function(s) will be called after the transaction that
-+       * this handle is part of has been committed to disk.
-+       */
-+      struct list_head        h_jcb;
-+
-       /* Flags */
-       unsigned int    h_sync:         1;      /* sync-on-close */
-       unsigned int    h_jdata:        1;      /* force data journaling */
-@@ -398,6 +411,10 @@
-       /* How many handles used this transaction? */
-       int t_handle_count;
-+
-+      /* List of registered callback functions for this transaction.
-+       * Called when the transaction is committed. */
-+      struct list_head        t_jcb;
- };
-@@ -646,6 +663,9 @@
- extern int     journal_try_to_free_buffers(journal_t *, struct page *, int);
- extern int     journal_stop(handle_t *);
- extern int     journal_flush (journal_t *);
-+extern void    journal_callback_set(handle_t *handle,
-+                                    void (*fn)(struct journal_callback *,int),
-+                                    struct journal_callback *jcb);
- extern void    journal_lock_updates (journal_t *);
- extern void    journal_unlock_updates (journal_t *);
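Per the transaction.c comment above, the caller embeds struct journal_callback at offset 0 of its own allocation and frees that allocation from the callback. A hypothetical sketch of registering a commit callback on a running handle (my_commit_cb and my_commit_done are illustrative names, not part of the patch):

        struct my_commit_cb {
                struct journal_callback jcb;    /* must be first (offset 0) */
                struct inode *inode;            /* caller-specific data follows */
        };

        static void my_commit_done(struct journal_callback *jcb, int error)
        {
                struct my_commit_cb *cb = (struct my_commit_cb *)jcb;

                /* the transaction holding the handle has committed;
                 * error carries the journal's abort status */
                kfree(cb);                      /* callback frees its own allocation */
        }

        static void register_commit_cb(handle_t *handle, struct inode *inode)
        {
                struct my_commit_cb *cb = kmalloc(sizeof(*cb), GFP_NOFS);

                if (cb == NULL)
                        return;
                cb->inode = inode;
                journal_callback_set(handle, my_commit_done, &cb->jcb);
        }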
---- lum-pristine/kernel/ksyms.c        Mon Feb 25 14:38:13 2002
-+++ lum/kernel/ksyms.c Thu Aug  1 18:07:35 2002
-@@ -260,6 +260,7 @@
- EXPORT_SYMBOL(set_page_dirty);
- EXPORT_SYMBOL(vfs_readlink);
- EXPORT_SYMBOL(vfs_follow_link);
-+EXPORT_SYMBOL(vfs_follow_link_it);
- EXPORT_SYMBOL(page_readlink);
- EXPORT_SYMBOL(page_follow_link);
- EXPORT_SYMBOL(page_symlink_inode_operations);
-@@ -271,6 +272,12 @@
- EXPORT_SYMBOL(lock_may_write);
- EXPORT_SYMBOL(dcache_readdir);
-+/* lustre */
-+EXPORT_SYMBOL(panic_notifier_list);
-+EXPORT_SYMBOL(pagecache_lock);
-+EXPORT_SYMBOL(do_kern_mount);
-+EXPORT_SYMBOL(kmem_cache_validate);
-+
- /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) */
- EXPORT_SYMBOL(default_llseek);
- EXPORT_SYMBOL(dentry_open);
---- lum-pristine/include/linux/dcache.h        Thu Nov 22 14:46:18 2001
-+++ lum/include/linux/dcache.h Mon Aug 12 00:02:29 2002
-@@ -6,6 +6,34 @@
- #include <asm/atomic.h>
- #include <linux/mount.h>
-+#define IT_OPEN  (1)
-+#define IT_CREAT  (1<<1)
-+#define IT_MKDIR  (1<<2)
-+#define IT_LINK  (1<<3)
-+#define IT_LINK2  (1<<4)
-+#define IT_SYMLINK  (1<<5)
-+#define IT_UNLINK  (1<<6)
-+#define IT_RMDIR  (1<<7)
-+#define IT_RENAME  (1<<8)
-+#define IT_RENAME2  (1<<9)
-+#define IT_READDIR  (1<<10)
-+#define IT_GETATTR  (1<<11)
-+#define IT_SETATTR  (1<<12)
-+#define IT_READLINK  (1<<13)
-+#define IT_MKNOD  (1<<14)
-+#define IT_LOOKUP  (1<<15)
-+
-+struct lookup_intent {
-+      int it_op;
-+      int it_mode;
-+      int it_disposition;
-+      int it_status;
-+      struct iattr *it_iattr;
-+      __u64 it_lock_handle[2];
-+      int it_lock_mode;
-+      void *it_data;
-+};
-+
- /*
-  * linux/include/linux/dcache.h
-  *
-@@ -78,6 +106,7 @@
-       unsigned long d_time;           /* used by d_revalidate */
-       struct dentry_operations  *d_op;
-       struct super_block * d_sb;      /* The root of the dentry tree */
-+      struct lookup_intent *d_it;
-       unsigned long d_vfs_flags;
-       void * d_fsdata;                /* fs-specific data */
-       unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
-@@ -91,6 +119,8 @@
-       int (*d_delete)(struct dentry *);
-       void (*d_release)(struct dentry *);
-       void (*d_iput)(struct dentry *, struct inode *);
-+      int (*d_revalidate2)(struct dentry *, int, struct lookup_intent *);
-+      void (*d_intent_release)(struct dentry *, struct lookup_intent *);
- };
- /* the dentry parameter passed to d_hash and d_compare is the parent
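A filesystem opting into intent-based lookups supplies the two new dentry_operations hooks declared above; a hypothetical skeleton (the myfs_* names are illustrative):

        static int myfs_revalidate2(struct dentry *de, int flags,
                                    struct lookup_intent *it)
        {
                /* revalidate the dentry, taking it->it_op (IT_OPEN, IT_GETATTR, ...)
                 * into account; return nonzero if the dentry is still valid */
                return 1;
        }

        static void myfs_intent_release(struct dentry *de, struct lookup_intent *it)
        {
                /* drop whatever it->it_lock_handle / it->it_lock_mode refer to */
        }

        struct dentry_operations myfs_dentry_ops = {
                .d_revalidate2    = myfs_revalidate2,
                .d_intent_release = myfs_intent_release,
        };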
---- lum-pristine/include/linux/fs.h    Mon Aug 12 11:02:53 2002
-+++ lum/include/linux/fs.h     Mon Aug 12 11:48:38 2002
-@@ -536,6 +536,7 @@
-       /* needed for tty driver, and maybe others */
-       void                    *private_data;
-+      struct lookup_intent    *f_intent;
-       /* preallocated helper kiobuf to speedup O_DIRECT */
-       struct kiobuf           *f_iobuf;
-@@ -779,7 +780,9 @@
- extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
- extern int vfs_rmdir(struct inode *, struct dentry *);
- extern int vfs_unlink(struct inode *, struct dentry *);
--extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
-+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-+              struct inode *new_dir, struct dentry *new_dentry,
-+              struct lookup_intent *it);
- /*
-  * File types
-@@ -840,6 +843,7 @@
- struct inode_operations {
-       int (*create) (struct inode *,struct dentry *,int);
-       struct dentry * (*lookup) (struct inode *,struct dentry *);
-+      struct dentry * (*lookup2) (struct inode *,struct dentry *, struct lookup_intent *);
-       int (*link) (struct dentry *,struct inode *,struct dentry *);
-       int (*unlink) (struct inode *,struct dentry *);
-       int (*symlink) (struct inode *,struct dentry *,const char *);
-@@ -850,6 +854,8 @@
-                       struct inode *, struct dentry *);
-       int (*readlink) (struct dentry *, char *,int);
-       int (*follow_link) (struct dentry *, struct nameidata *);
-+      int (*follow_link2) (struct dentry *, struct nameidata *,
-+                             struct lookup_intent *it);
-       void (*truncate) (struct inode *);
-       int (*permission) (struct inode *, int);
-       int (*revalidate) (struct dentry *);
-@@ -986,7 +990,7 @@
- extern struct vfsmount *kern_mount(struct file_system_type *);
- extern int may_umount(struct vfsmount *);
- extern long do_mount(char *, char *, char *, unsigned long, void *);
--
-+struct vfsmount *do_kern_mount(char *type, int flags, char *name, void *data);
- #define kern_umount mntput
- extern int vfs_statfs(struct super_block *, struct statfs *);
-@@ -1307,6 +1311,7 @@
- extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
- extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *));
-+extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it));
- extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *));
- extern int FASTCALL(path_walk(const char *, struct nameidata *));
- extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
-@@ -1317,6 +1322,8 @@
- extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
- #define user_path_walk(name,nd)        __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd)
- #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd)
-+#define user_path_walk_it(name,nd,it)  __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it)
-+#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it)
- extern void iput(struct inode *);
- extern void force_delete(struct inode *);
-@@ -1422,6 +1428,8 @@
- extern int vfs_readlink(struct dentry *, char *, int, const char *);
- extern int vfs_follow_link(struct nameidata *, const char *);
-+extern int vfs_follow_link_it(struct nameidata *, const char *,
-+                            struct lookup_intent *it);
- extern int page_readlink(struct dentry *, char *, int);
- extern int page_follow_link(struct dentry *, struct nameidata *);
- extern struct inode_operations page_symlink_inode_operations;
---- lum-pristine/fs/dcache.c   Mon Feb 25 14:38:08 2002
-+++ lum/fs/dcache.c    Thu Aug  1 18:07:35 2002
-@@ -617,6 +617,7 @@
-       dentry->d_op = NULL;
-       dentry->d_fsdata = NULL;
-       dentry->d_mounted = 0;
-+      dentry->d_it = NULL;
-       INIT_LIST_HEAD(&dentry->d_hash);
-       INIT_LIST_HEAD(&dentry->d_lru);
-       INIT_LIST_HEAD(&dentry->d_subdirs);
---- lum-pristine/fs/nfsd/vfs.c Fri Dec 21 12:41:55 2001
-+++ lum/fs/nfsd/vfs.c  Thu Aug  1 18:07:35 2002
-@@ -1285,7 +1285,7 @@
-                       err = nfserr_perm;
-       } else
- #endif
--      err = vfs_rename(fdir, odentry, tdir, ndentry);
-+      err = vfs_rename(fdir, odentry, tdir, ndentry, NULL);
-       if (!err && EX_ISSYNC(tfhp->fh_export)) {
-               nfsd_sync_dir(tdentry);
-               nfsd_sync_dir(fdentry);
---- lum-pristine/fs/namei.c    Mon Feb 25 14:38:09 2002
-+++ lum/fs/namei.c     Mon Aug 12 11:47:56 2002
-@@ -94,6 +94,12 @@
-  * XEmacs seems to be relying on it...
-  */
-+void intent_release(struct dentry *de, struct lookup_intent *it)
-+{
-+      if (it && de->d_op && de->d_op->d_intent_release)
-+              de->d_op->d_intent_release(de, it);
-+}
-+
- /* In order to reduce some races, while at the same time doing additional
-  * checking and hopefully speeding things up, we copy filenames to the
-  * kernel data space before using them..
-@@ -260,10 +268,19 @@
-  * Internal lookup() using the new generic dcache.
-  * SMP-safe
-  */
--static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags)
-+static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name,
-+                                  int flags, struct lookup_intent *it)
- {
-       struct dentry * dentry = d_lookup(parent, name);
-+      if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) {
-+              if (!dentry->d_op->d_revalidate2(dentry, flags, it) &&
-+                  !d_invalidate(dentry)) {
-+                      dput(dentry);
-+                      dentry = NULL;
-+              }
-+              return dentry;
-+      } else
-       if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
-               if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) {
-                       dput(dentry);
-@@ -281,7 +298,8 @@
-  * make sure that nobody added the entry to the dcache in the meantime..
-  * SMP-safe
-  */
--static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags)
-+static struct dentry *real_lookup(struct dentry *parent, struct qstr *name,
-+                                int flags, struct lookup_intent *it)
- {
-       struct dentry * result;
-       struct inode *dir = parent->d_inode;
-@@ -300,6 +318,9 @@
-               result = ERR_PTR(-ENOMEM);
-               if (dentry) {
-                       lock_kernel();
-+                      if (dir->i_op->lookup2)
-+                              result = dir->i_op->lookup2(dir, dentry, it);
-+                      else
-                       result = dir->i_op->lookup(dir, dentry);
-                       unlock_kernel();
-                       if (result)
-@@ -321,6 +342,12 @@
-                       dput(result);
-                       result = ERR_PTR(-ENOENT);
-               }
-+      } else if (result->d_op && result->d_op->d_revalidate2) {
-+              if (!result->d_op->d_revalidate2(result, flags, it) &&
-+                  !d_invalidate(result)) {
-+                      dput(result);
-+                      result = ERR_PTR(-ENOENT);
-+              }
-       }
-       return result;
- }
-@@ -334,7 +361,8 @@
-  * Without that kind of total limit, nasty chains of consecutive
-  * symlinks can cause almost arbitrarily long lookups. 
-  */
--static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd)
-+static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd,
-+                               struct lookup_intent *it)
- {
-       int err;
-       if (current->link_count >= max_recursive_link)
-@@ -348,10 +376,14 @@
-       current->link_count++;
-       current->total_link_count++;
-       UPDATE_ATIME(dentry->d_inode);
--      err = dentry->d_inode->i_op->follow_link(dentry, nd);
-+      if (dentry->d_inode->i_op->follow_link2)
-+              err = dentry->d_inode->i_op->follow_link2(dentry, nd, it);
-+      else
-+              err = dentry->d_inode->i_op->follow_link(dentry, nd);
-       current->link_count--;
-       return err;
- loop:
-+      intent_release(dentry, it);
-       path_release(nd);
-       return -ELOOP;
- }
-@@ -445,7 +472,8 @@
-  *
-  * We expect 'base' to be positive and a directory.
-  */
--int link_path_walk(const char * name, struct nameidata *nd)
-+int link_path_walk_it(const char *name, struct nameidata *nd,
-+                    struct lookup_intent *it)
- {
-       struct dentry *dentry;
-       struct inode *inode;
-@@ -518,9 +546,9 @@
-                               break;
-               }
-               /* This does the actual lookups.. */
--              dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE);
-+              dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL);
-               if (!dentry) {
--                      dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE);
-+                      dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL);
-                       err = PTR_ERR(dentry);
-                       if (IS_ERR(dentry))
-                               break;
-@@ -537,8 +570,8 @@
-               if (!inode->i_op)
-                       goto out_dput;
--              if (inode->i_op->follow_link) {
--                      err = do_follow_link(dentry, nd);
-+              if (inode->i_op->follow_link || inode->i_op->follow_link2) {
-+                      err = do_follow_link(dentry, nd, NULL);
-                       dput(dentry);
-                       if (err)
-                               goto return_err;
-@@ -554,7 +582,7 @@
-                       nd->dentry = dentry;
-               }
-               err = -ENOTDIR; 
--              if (!inode->i_op->lookup)
-+              if (!inode->i_op->lookup && !inode->i_op->lookup2)
-                       break;
-               continue;
-               /* here ends the main loop */
-@@ -581,9 +609,9 @@
-                       if (err < 0)
-                               break;
-               }
--              dentry = cached_lookup(nd->dentry, &this, 0);
-+              dentry = cached_lookup(nd->dentry, &this, 0, it);
-               if (!dentry) {
--                      dentry = real_lookup(nd->dentry, &this, 0);
-+                      dentry = real_lookup(nd->dentry, &this, 0, it);
-                       err = PTR_ERR(dentry);
-                       if (IS_ERR(dentry))
-                               break;
-@@ -591,9 +625,9 @@
-               while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry))
-                       ;
-               inode = dentry->d_inode;
--              if ((lookup_flags & LOOKUP_FOLLOW)
--                  && inode && inode->i_op && inode->i_op->follow_link) {
--                      err = do_follow_link(dentry, nd);
-+              if ((lookup_flags & LOOKUP_FOLLOW) && inode && inode->i_op &&
-+                  (inode->i_op->follow_link || inode->i_op->follow_link2)) {
-+                      err = do_follow_link(dentry, nd, it);
-                       dput(dentry);
-                       if (err)
-                               goto return_err;
-@@ -607,7 +635,8 @@
-                       goto no_inode;
-               if (lookup_flags & LOOKUP_DIRECTORY) {
-                       err = -ENOTDIR; 
--                      if (!inode->i_op || !inode->i_op->lookup)
-+                      if (!inode->i_op ||
-+                          (!inode->i_op->lookup && !inode->i_op->lookup2))
-                               break;
-               }
-               goto return_base;
-@@ -630,12 +660,23 @@
-       return err;
- }
-+int link_path_walk(const char * name, struct nameidata *nd)
-+{
-+      return link_path_walk_it(name, nd, NULL);
-+}
-+
-+int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it)
-+{
-+      current->total_link_count = 0;
-+      return link_path_walk_it(name, nd, it);
-+}
-+
- int path_walk(const char * name, struct nameidata *nd)
- {
-       current->total_link_count = 0;
--      return link_path_walk(name, nd);
-+      return link_path_walk_it(name, nd, NULL);
- }
- /* SMP-safe */
- /* returns 1 if everything is done */
- static int __emul_lookup_dentry(const char *name, struct nameidata *nd)
-@@ -742,7 +786,8 @@
-  * needs parent already locked. Doesn't follow mounts.
-  * SMP-safe.
-  */
--struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
-+struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base,
-+                             struct lookup_intent *it)
- {
-       struct dentry * dentry;
-       struct inode *inode;
-@@ -765,13 +810,16 @@
-                       goto out;
-       }
--      dentry = cached_lookup(base, name, 0);
-+      dentry = cached_lookup(base, name, 0, it);
-       if (!dentry) {
-               struct dentry *new = d_alloc(base, name);
-               dentry = ERR_PTR(-ENOMEM);
-               if (!new)
-                       goto out;
-               lock_kernel();
-+              if (inode->i_op->lookup2)
-+                      dentry = inode->i_op->lookup2(inode, new, it);
-+              else
-               dentry = inode->i_op->lookup(inode, new);
-               unlock_kernel();
-               if (!dentry)
-@@ -783,6 +831,12 @@
-       return dentry;
- }
-+struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
-+{
-+      return lookup_hash_it(name, base, NULL);
-+}
-+
-+
- /* SMP-safe */
- struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
- {
-@@ -804,7 +858,7 @@
-       }
-       this.hash = end_name_hash(hash);
--      return lookup_hash(&this, base);
-+      return lookup_hash_it(&this, base, NULL);
- access:
-       return ERR_PTR(-EACCES);
- }
-@@ -836,6 +890,23 @@
-       return err;
- }
-+int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd,
-+                 struct lookup_intent *it)
-+{
-+      char *tmp;
-+      int err;
-+
-+      tmp = getname(name);
-+      err = PTR_ERR(tmp);
-+      if (!IS_ERR(tmp)) {
-+              err = 0;
-+              if (path_init(tmp, flags, nd))
-+                      err = path_walk_it(tmp, nd, it);
-+              putname(tmp);
-+      }
-+      return err;
-+}
-+
- /*
-  * It's inline, so penalty for filesystems that don't use sticky bit is
-  * minimal.
-@@ -970,7 +1041,8 @@
-  * for symlinks (where the permissions are checked later).
-  * SMP-safe
-  */
--int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
-+int open_namei_it(const char *pathname, int flag, int mode,
-+                struct nameidata *nd, struct lookup_intent *it)
- {
-       int acc_mode, error = 0;
-       struct inode *inode;
-@@ -985,7 +1057,7 @@
-        */
-       if (!(flag & O_CREAT)) {
-               if (path_init(pathname, lookup_flags(flag), nd))
--                      error = path_walk(pathname, nd);
-+                      error = path_walk_it(pathname, nd, it);
-               if (error)
-                       return error;
-               dentry = nd->dentry;
-@@ -994,6 +1067,10 @@
-       /*
-        * Create - we need to know the parent.
-        */
-+      if (it) {
-+              it->it_mode = mode;
-+              it->it_op |= IT_CREAT;
-+      }
-       if (path_init(pathname, LOOKUP_PARENT, nd))
-               error = path_walk(pathname, nd);
-       if (error)
-@@ -1011,7 +1089,7 @@
-       dir = nd->dentry;
-       down(&dir->d_inode->i_sem);
--      dentry = lookup_hash(&nd->last, nd->dentry);
-+      dentry = lookup_hash_it(&nd->last, nd->dentry, it);
- do_last:
-       error = PTR_ERR(dentry);
-@@ -1020,6 +1098,7 @@
-               goto exit;
-       }
-+      it->it_mode = mode;
-       /* Negative dentry, just create the file */
-       if (!dentry->d_inode) {
-               error = vfs_create(dir->d_inode, dentry,
-@@ -1053,7 +1134,8 @@
-       error = -ENOENT;
-       if (!dentry->d_inode)
-               goto exit_dput;
--      if (dentry->d_inode->i_op && dentry->d_inode->i_op->follow_link)
-+      if (dentry->d_inode->i_op && (dentry->d_inode->i_op->follow_link ||
-+                                    dentry->d_inode->i_op->follow_link2))
-               goto do_link;
-       dput(nd->dentry);
-@@ -1139,8 +1219,10 @@
-       return 0;
- exit_dput:
-+      intent_release(dentry, it);
-       dput(dentry);
- exit:
-+      intent_release(nd->dentry, it);
-       path_release(nd);
-       return error;
-@@ -1160,7 +1242,12 @@
-        * are done. Procfs-like symlinks just set LAST_BIND.
-        */
-       UPDATE_ATIME(dentry->d_inode);
--      error = dentry->d_inode->i_op->follow_link(dentry, nd);
-+      if (dentry->d_inode->i_op->follow_link2)
-+              error = dentry->d_inode->i_op->follow_link2(dentry, nd, it);
-+      else
-+              error = dentry->d_inode->i_op->follow_link(dentry, nd);
-+      if (error)
-+              intent_release(dentry, it);
-       dput(dentry);
-       if (error)
-               return error;
-@@ -1181,13 +1265,20 @@
-       }
-       dir = nd->dentry;
-       down(&dir->d_inode->i_sem);
--      dentry = lookup_hash(&nd->last, nd->dentry);
-+      dentry = lookup_hash_it(&nd->last, nd->dentry, it);
-       putname(nd->last.name);
-       goto do_last;
- }
-+int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd)
-+{
-+      return open_namei_it(pathname, flag, mode, nd, NULL);
-+}
-+
-+
- /* SMP-safe */
--static struct dentry *lookup_create(struct nameidata *nd, int is_dir)
-+static struct dentry *lookup_create(struct nameidata *nd, int is_dir,
-+                                  struct lookup_intent *it)
- {
-       struct dentry *dentry;
-@@ -1195,7 +1286,7 @@
-       dentry = ERR_PTR(-EEXIST);
-       if (nd->last_type != LAST_NORM)
-               goto fail;
--      dentry = lookup_hash(&nd->last, nd->dentry);
-+      dentry = lookup_hash_it(&nd->last, nd->dentry, it);
-       if (IS_ERR(dentry))
-               goto fail;
-       if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
-@@ -1241,6 +1332,7 @@
-       char * tmp;
-       struct dentry * dentry;
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_MKNOD, .it_mode = mode };
-       if (S_ISDIR(mode))
-               return -EPERM;
-@@ -1252,7 +1344,7 @@
-               error = path_walk(tmp, &nd);
-       if (error)
-               goto out;
--      dentry = lookup_create(&nd, 0);
-+      dentry = lookup_create(&nd, 0, &it);
-       error = PTR_ERR(dentry);
-       mode &= ~current->fs->umask;
-@@ -1270,6 +1363,7 @@
-               default:
-                       error = -EINVAL;
-               }
-+              intent_release(dentry, &it);
-               dput(dentry);
-       }
-       up(&nd.dentry->d_inode->i_sem);
-@@ -1310,6 +1404,7 @@
- {
-       int error = 0;
-       char * tmp;
-+      struct lookup_intent it = { .it_op = IT_MKDIR, .it_mode = mode };
-       tmp = getname(pathname);
-       error = PTR_ERR(tmp);
-@@ -1321,11 +1416,12 @@
-                       error = path_walk(tmp, &nd);
-               if (error)
-                       goto out;
--              dentry = lookup_create(&nd, 1);
-+              dentry = lookup_create(&nd, 1, &it);
-               error = PTR_ERR(dentry);
-               if (!IS_ERR(dentry)) {
-                       error = vfs_mkdir(nd.dentry->d_inode, dentry,
-                                         mode & ~current->fs->umask);
-+                      intent_release(dentry, &it);
-                       dput(dentry);
-               }
-               up(&nd.dentry->d_inode->i_sem);
-@@ -1407,6 +1504,7 @@
-       char * name;
-       struct dentry *dentry;
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_RMDIR };
-       name = getname(pathname);
-       if(IS_ERR(name))
-@@ -1429,10 +1527,11 @@
-                       goto exit1;
-       }
-       down(&nd.dentry->d_inode->i_sem);
--      dentry = lookup_hash(&nd.last, nd.dentry);
-+      dentry = lookup_hash_it(&nd.last, nd.dentry, &it);
-       error = PTR_ERR(dentry);
-       if (!IS_ERR(dentry)) {
-               error = vfs_rmdir(nd.dentry->d_inode, dentry);
-+              intent_release(dentry, &it);
-               dput(dentry);
-       }
-       up(&nd.dentry->d_inode->i_sem);
-@@ -1476,6 +1576,7 @@
-       char * name;
-       struct dentry *dentry;
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_UNLINK };
-       name = getname(pathname);
-       if(IS_ERR(name))
-@@ -1489,14 +1590,15 @@
-       if (nd.last_type != LAST_NORM)
-               goto exit1;
-       down(&nd.dentry->d_inode->i_sem);
--      dentry = lookup_hash(&nd.last, nd.dentry);
-+      dentry = lookup_hash_it(&nd.last, nd.dentry, &it);
-       error = PTR_ERR(dentry);
-       if (!IS_ERR(dentry)) {
-               /* Why not before? Because we want correct error value */
-               if (nd.last.name[nd.last.len])
-                       goto slashes;
-               error = vfs_unlink(nd.dentry->d_inode, dentry);
-       exit2:
-+              intent_release(dentry, &it);
-               dput(dentry);
-       }
-       up(&nd.dentry->d_inode->i_sem);
-@@ -1543,6 +1646,7 @@
-       int error = 0;
-       char * from;
-       char * to;
-+      struct lookup_intent it = { .it_op = IT_SYMLINK };
-       from = getname(oldname);
-       if(IS_ERR(from))
-@@ -1557,10 +1661,12 @@
-                       error = path_walk(to, &nd);
-               if (error)
-                       goto out;
--              dentry = lookup_create(&nd, 0);
-+              it.it_data = from;
-+              dentry = lookup_create(&nd, 0, &it);
-               error = PTR_ERR(dentry);
-               if (!IS_ERR(dentry)) {
-                       error = vfs_symlink(nd.dentry->d_inode, dentry, from);
-+                      intent_release(dentry, &it);
-                       dput(dentry);
-               }
-               up(&nd.dentry->d_inode->i_sem);
-@@ -1626,6 +1732,7 @@
-       int error;
-       char * from;
-       char * to;
-+      struct lookup_intent it = { .it_op = IT_LINK };
-       from = getname(oldname);
-       if(IS_ERR(from))
-@@ -1639,7 +1745,7 @@
-               error = 0;
-               if (path_init(from, LOOKUP_POSITIVE, &old_nd))
--                      error = path_walk(from, &old_nd);
-+                      error = path_walk_it(from, &old_nd, &it);
-               if (error)
-                       goto exit;
-               if (path_init(to, LOOKUP_PARENT, &nd))
-@@ -1648,10 +1755,12 @@
-               error = -EXDEV;
-               if (old_nd.mnt != nd.mnt)
-                       goto out_release;
--              new_dentry = lookup_create(&nd, 0);
-+              it.it_op = IT_LINK2;
-+              new_dentry = lookup_create(&nd, 0, &it);
-               error = PTR_ERR(new_dentry);
-               if (!IS_ERR(new_dentry)) {
-                       error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
-+                      intent_release(new_dentry, &it);
-                       dput(new_dentry);
-               }
-               up(&nd.dentry->d_inode->i_sem);
-@@ -1694,7 +1803,8 @@
-  *       locking].
-  */
- int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
--             struct inode *new_dir, struct dentry *new_dentry)
-+                 struct inode *new_dir, struct dentry *new_dentry,
-+                 struct lookup_intent *it)
- {
-       int error;
-       struct inode *target;
-@@ -1754,6 +1864,7 @@
-               error = -EBUSY;
-       else 
-               error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-+      intent_release(new_dentry, it);
-       if (target) {
-               if (!error)
-                       target->i_flags |= S_DEAD;
-@@ -1775,7 +1887,8 @@
- }
- int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
--             struct inode *new_dir, struct dentry *new_dentry)
-+                   struct inode *new_dir, struct dentry *new_dentry,
-+                   struct lookup_intent *it)
- {
-       int error;
-@@ -1806,6 +1919,7 @@
-               error = -EBUSY;
-       else
-               error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-+      intent_release(new_dentry, it);
-       double_up(&old_dir->i_zombie, &new_dir->i_zombie);
-       if (error)
-               return error;
-@@ -1817,13 +1932,14 @@
- }
- int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
--             struct inode *new_dir, struct dentry *new_dentry)
-+             struct inode *new_dir, struct dentry *new_dentry,
-+             struct lookup_intent *it)
- {
-       int error;
-       if (S_ISDIR(old_dentry->d_inode->i_mode))
--              error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
-+              error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry,it);
-       else
--              error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
-+              error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,it);
-       if (!error) {
-               if (old_dir == new_dir)
-                       inode_dir_notify(old_dir, DN_RENAME);
-@@ -1840,6 +1956,7 @@
-       int error = 0;
-       struct dentry * old_dir, * new_dir;
-       struct dentry * old_dentry, *new_dentry;
-+      struct lookup_intent it = { .it_op = IT_RENAME };
-       struct nameidata oldnd, newnd;
-       if (path_init(oldname, LOOKUP_PARENT, &oldnd))
-@@ -1868,7 +1985,7 @@
-       double_lock(new_dir, old_dir);
--      old_dentry = lookup_hash(&oldnd.last, old_dir);
-+      old_dentry = lookup_hash_it(&oldnd.last, old_dir, &it);
-       error = PTR_ERR(old_dentry);
-       if (IS_ERR(old_dentry))
-               goto exit3;
-@@ -1884,18 +2003,21 @@
-               if (newnd.last.name[newnd.last.len])
-                       goto exit4;
-       }
--      new_dentry = lookup_hash(&newnd.last, new_dir);
-+      it.it_op = IT_RENAME2;
-+      new_dentry = lookup_hash_it(&newnd.last, new_dir, &it);
-       error = PTR_ERR(new_dentry);
-       if (IS_ERR(new_dentry))
-               goto exit4;
-       lock_kernel();
-       error = vfs_rename(old_dir->d_inode, old_dentry,
--                                 new_dir->d_inode, new_dentry);
-+                                 new_dir->d_inode, new_dentry, &it);
-       unlock_kernel();
-+      intent_release(new_dentry, &it);
-       dput(new_dentry);
- exit4:
-+      intent_release(old_dentry, &it);
-       dput(old_dentry);
- exit3:
-       double_up(&new_dir->d_inode->i_sem, &old_dir->d_inode->i_sem);
-@@ -1965,7 +2094,8 @@
- }
- static inline int
--__vfs_follow_link(struct nameidata *nd, const char *link)
-+__vfs_follow_link(struct nameidata *nd, const char *link,
-+                struct lookup_intent *it)
- {
-       int res = 0;
-       char *name;
-@@ -1978,7 +2108,7 @@
-                       /* weird __emul_prefix() stuff did it */
-                       goto out;
-       }
--      res = link_path_walk(link, nd);
-+      res = link_path_walk_it(link, nd, it);
- out:
-       if (current->link_count || res || nd->last_type!=LAST_NORM)
-               return res;
-@@ -2000,7 +2130,13 @@
- int vfs_follow_link(struct nameidata *nd, const char *link)
- {
--      return __vfs_follow_link(nd, link);
-+      return __vfs_follow_link(nd, link, NULL);
-+}
-+
-+int vfs_follow_link_it(struct nameidata *nd, const char *link,
-+                     struct lookup_intent *it)
-+{
-+      return __vfs_follow_link(nd, link, it);
- }
- /* get the link contents into pagecache */
-@@ -2042,7 +2178,7 @@
- {
-       struct page *page = NULL;
-       char *s = page_getlink(dentry, &page);
--      int res = __vfs_follow_link(nd, s);
-+      int res = __vfs_follow_link(nd, s, NULL);
-       if (page) {
-               kunmap(page);
-               page_cache_release(page);
---- lum-pristine/fs/open.c     Fri Oct 12 16:48:42 2001
-+++ lum/fs/open.c      Sun Aug 11 15:26:29 2002
-@@ -19,6 +19,9 @@
- #include <asm/uaccess.h>
- #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
-+extern int path_walk_it(const char *name, struct nameidata *nd,
-+                      struct lookup_intent *it);
-+extern void intent_release(struct dentry *de, struct lookup_intent *it);
- int vfs_statfs(struct super_block *sb, struct statfs *buf)
- {
-@@ -94,12 +97,13 @@
-       struct nameidata nd;
-       struct inode * inode;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
-       error = -EINVAL;
-       if (length < 0) /* sorry, but loff_t says... */
-               goto out;
--      error = user_path_walk(path, &nd);
-+      error = user_path_walk_it(path, &nd, &it);
-       if (error)
-               goto out;
-       inode = nd.dentry->d_inode;
-@@ -144,6 +149,7 @@
-       put_write_access(inode);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -235,8 +241,9 @@
-       struct nameidata nd;
-       struct inode * inode;
-       struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (error)
-               goto out;
-       inode = nd.dentry->d_inode;
-@@ -262,6 +270,7 @@
-       }
-       error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -279,8 +288,9 @@
-       struct nameidata nd;
-       struct inode * inode;
-       struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (error)
-               goto out;
-@@ -306,6 +317,7 @@
-       }
-       error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -322,6 +334,7 @@
-       int old_fsuid, old_fsgid;
-       kernel_cap_t old_cap;
-       int res;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       if (mode & ~S_IRWXO)    /* where's F_OK, X_OK, W_OK, R_OK? */
-               return -EINVAL;
-@@ -339,13 +352,14 @@
-       else
-               current->cap_effective = current->cap_permitted;
--      res = user_path_walk(filename, &nd);
-+      res = user_path_walk_it(filename, &nd, &it);
-       if (!res) {
-               res = permission(nd.dentry->d_inode, mode);
-               /* SuS v2 requires we report a read only fs too */
-               if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode)
-                  && !special_file(nd.dentry->d_inode->i_mode))
-                       res = -EROFS;
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-@@ -361,6 +375,7 @@
-       int error;
-       struct nameidata nd;
-       char *name;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       name = getname(filename);
-       error = PTR_ERR(name);
-@@ -369,7 +384,7 @@
-       error = 0;
-       if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd))
--              error = path_walk(name, &nd);
-+              error = path_walk_it(name, &nd, &it);
-       putname(name);
-       if (error)
-               goto out;
-@@ -381,6 +397,7 @@
-       set_fs_pwd(current->fs, nd.mnt, nd.dentry);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -421,6 +438,7 @@
-       int error;
-       struct nameidata nd;
-       char *name;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       name = getname(filename);
-       error = PTR_ERR(name);
-@@ -429,7 +447,7 @@
-       path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW |
-                     LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
--      error = path_walk(name, &nd);   
-+      error = path_walk_it(name, &nd, &it);
-       putname(name);
-       if (error)
-               goto out;
-@@ -446,6 +465,7 @@
-       set_fs_altroot();
-       error = 0;
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -490,8 +510,9 @@
-       struct inode * inode;
-       int error;
-       struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (error)
-               goto out;
-       inode = nd.dentry->d_inode;
-@@ -511,6 +532,7 @@
-       error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -580,10 +602,12 @@
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (!error) {
-               error = chown_common(nd.dentry, user, group);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -593,10 +618,12 @@
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk_link(filename, &nd);
-+      error = user_path_walk_link_it(filename, &nd, &it);
-       if (!error) {
-               error = chown_common(nd.dentry, user, group);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -630,10 +658,16 @@
-  * for the internal routines (ie open_namei()/follow_link() etc). 00 is
-  * used by symlinks.
-  */
-+extern int open_namei_it(const char *filename, int namei_flags, int mode,
-+                       struct nameidata *nd, struct lookup_intent *it);
-+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
-+                          int flags, struct lookup_intent *it);
-+
- struct file *filp_open(const char * filename, int flags, int mode)
- {
-       int namei_flags, error;
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_OPEN };
-       namei_flags = flags;
-       if ((namei_flags+1) & O_ACCMODE)
-@@ -641,14 +675,15 @@
-       if (namei_flags & O_TRUNC)
-               namei_flags |= 2;
--      error = open_namei(filename, namei_flags, mode, &nd);
--      if (!error)
--              return dentry_open(nd.dentry, nd.mnt, flags);
-+      error = open_namei_it(filename, namei_flags, mode, &nd, &it);
-+      if (error)
-+              return ERR_PTR(error);
--      return ERR_PTR(error);
-+      return dentry_open_it(nd.dentry, nd.mnt, flags, &it);
- }
--struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
-+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
-+                          int flags, struct lookup_intent *it)
- {
-       struct file * f;
-       struct inode *inode;
-@@ -691,6 +726,7 @@
-       }
-       f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
-+      intent_release(dentry, it);
-       return f;
- cleanup_all:
-@@ -705,11 +741,17 @@
- cleanup_file:
-       put_filp(f);
- cleanup_dentry:
-+      intent_release(dentry, it);
-       dput(dentry);
-       mntput(mnt);
-       return ERR_PTR(error);
- }
-+struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
-+{
-+      return dentry_open_it(dentry, mnt, flags, NULL);
-+}
-+
- /*
-  * Find an empty file descriptor entry, and mark it busy.
-  */
---- lum-pristine/fs/stat.c     Thu Sep 13 19:04:43 2001
-+++ lum/fs/stat.c      Mon Aug 12 00:04:39 2002
-@@ -13,6 +13,7 @@
- #include <asm/uaccess.h>
-+extern void intent_release(struct dentry *de, struct lookup_intent *it);
- /*
-  * Revalidate the inode. This is required for proper NFS attribute caching.
-  */
-@@ -135,13 +135,15 @@
- asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf)
- {
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       int error;
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_old_stat(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -151,13 +153,15 @@
- asmlinkage long sys_newstat(char * filename, struct stat * statbuf)
- {
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       int error;
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_new_stat(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -172,13 +176,15 @@
- asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf)
- {
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       int error;
--      error = user_path_walk_link(filename, &nd);
-+      error = user_path_walk_link_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_old_stat(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -189,13 +195,15 @@
- asmlinkage long sys_newlstat(char * filename, struct stat * statbuf)
- {
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       int error;
--      error = user_path_walk_link(filename, &nd);
-+      error = user_path_walk_link_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_new_stat(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -247,20 +255,21 @@
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_READLINK };
-       if (bufsiz <= 0)
-               return -EINVAL;
--      error = user_path_walk_link(path, &nd);
-+      error = user_path_walk_link_it(path, &nd, &it);
-       if (!error) {
-               struct inode * inode = nd.dentry->d_inode;
--
-               error = -EINVAL;
-               if (inode->i_op && inode->i_op->readlink &&
-                   !(error = do_revalidate(nd.dentry))) {
-                       UPDATE_ATIME(inode);
-                       error = inode->i_op->readlink(nd.dentry, buf, bufsiz);
-               }
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -333,12 +342,14 @@
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_new_stat64(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -348,12 +359,14 @@
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
--      error = user_path_walk_link(filename, &nd);
-+      error = user_path_walk_link_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_new_stat64(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
---- lum-pristine/mm/slab.c     Fri Dec 21 12:42:05 2001
-+++ lum/mm/slab.c      Thu Aug  1 18:07:35 2002
-@@ -1187,6 +1187,59 @@
-  * Called with the cache-lock held.
-  */
-+extern struct page *check_get_page(unsigned long kaddr);
-+struct page *page_mem_map(struct page *page);
-+static int kmem_check_cache_obj (kmem_cache_t * cachep,
-+                               slab_t *slabp, void * objp)
-+{
-+      int i;
-+      unsigned int objnr;
-+
-+#if DEBUG
-+      if (cachep->flags & SLAB_RED_ZONE) {
-+              objp -= BYTES_PER_WORD;
-+              if ( *(unsigned long *)objp != RED_MAGIC2)
-+                      /* Either write before start, or a double free. */
-+                      return 0;
-+              if (*(unsigned long *)(objp+cachep->objsize -
-+                              BYTES_PER_WORD) != RED_MAGIC2)
-+                      /* Either write past end, or a double free. */
-+                      return 0;
-+      }
-+#endif
-+
-+      objnr = (objp-slabp->s_mem)/cachep->objsize;
-+      if (objnr >= cachep->num)
-+              return 0;
-+      if (objp != slabp->s_mem + objnr*cachep->objsize)
-+              return 0;
-+
-+      /* Check slab's freelist to see if this obj is there. */
-+      for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
-+              if (i == objnr)
-+                      return 0;
-+      }
-+      return 1;
-+}
-+
-+
-+int kmem_cache_validate(kmem_cache_t *cachep, void *objp)
-+{
-+      struct page *page = check_get_page((unsigned long)objp);
-+
-+      if (!VALID_PAGE(page))
-+              return 0;
-+
-+      if (!PageSlab(page))
-+              return 0;
-+
-+      /* XXX check for freed slab objects ? */
-+      if (!kmem_check_cache_obj(cachep, GET_PAGE_SLAB(page), objp))
-+              return 0;
-+
-+      return (cachep == GET_PAGE_CACHE(page));
-+}
-+
- #if DEBUG
- static int kmem_extra_free_checks (kmem_cache_t * cachep,
-                       slab_t *slabp, void * objp)
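Annotation, not part of the deleted patch above: kmem_cache_validate() added to mm/slab.c is a debugging helper that reports whether a pointer refers to a currently allocated object of the given slab cache. A minimal caller-side sketch, assuming only the declaration this patch set adds to <linux/slab.h> (the names check_obj/cachep/obj are invented for illustration):

#include <linux/kernel.h>
#include <linux/slab.h>

/* Declared in <linux/slab.h> by this patch set; repeated here for clarity. */
extern int kmem_cache_validate(kmem_cache_t *cachep, void *objp);

/* Hypothetical caller: warn if "obj" is not a live object of "cachep". */
static void check_obj(kmem_cache_t *cachep, void *obj)
{
        if (!kmem_cache_validate(cachep, obj))
                printk(KERN_ERR "bad or freed slab object %p\n", obj);
}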
diff --git a/lustre/kernel_patches/patches/vanilla-2.4.19.patch b/lustre/kernel_patches/patches/vanilla-2.4.19.patch
deleted file mode 100644 (file)
index 4ed5bb9..0000000
+++ /dev/null
@@ -1,1576 +0,0 @@
-
-
-
- arch/i386/mm/init.c            |    6 
- arch/ia64/mm/init.c            |    6 
- drivers/block/blkpg.c          |   35 ++++
- drivers/block/loop.c           |    5 
- drivers/ide/ide-disk.c         |    6 
- fs/dcache.c                    |    1 
- fs/ext3/Makefile               |    2 
- fs/ext3/super.c                |    2 
- fs/namei.c                     |  296 ++++++++++++++++++++++++++++++++++-------
- fs/nfsd/vfs.c                  |    2 
- fs/open.c                      |   63 ++++++--
- fs/stat.c                      |   30 +++-
- include/linux/blkdev.h         |    4 
- include/linux/dcache.h         |   31 ++++
- include/linux/fs.h             |   23 +++
- include/linux/lustre_version.h |    1 
- include/linux/slab.h           |    1 
- kernel/ksyms.c                 |    7 
- mm/slab.c                      |   53 +++++++
- 19 files changed, 501 insertions(+), 73 deletions(-)
-
---- /dev/null  Fri Aug 30 17:31:37 2002
-+++ linux-2.4.19-root/include/linux/lustre_version.h   Sun Jan 19 19:54:00 2003
-@@ -0,0 +1 @@
-+#define LUSTRE_KERNEL_VERSION 7
---- linux-2.4.19/arch/ia64/mm/init.c~vanilla-2.4.19    Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/arch/ia64/mm/init.c      Sun Jan 19 19:46:42 2003
-@@ -37,6 +37,12 @@ unsigned long MAX_DMA_ADDRESS = PAGE_OFF
- static unsigned long totalram_pages;
-+struct page *check_get_page(unsigned long kaddr)
-+{
-+#warning FIXME: Lustre team, is this solid?
-+      return virt_to_page(kaddr);
-+}
-+
- int
- do_check_pgt_cache (int low, int high)
- {
---- linux-2.4.19/arch/i386/mm/init.c~vanilla-2.4.19    Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/arch/i386/mm/init.c      Sun Jan 19 19:46:42 2003
-@@ -43,6 +43,12 @@ unsigned long highstart_pfn, highend_pfn
- static unsigned long totalram_pages;
- static unsigned long totalhigh_pages;
-+struct page *check_get_page(unsigned long kaddr)
-+{
-+#warning FIXME: Lustre team, is this solid?
-+      return virt_to_page(kaddr);
-+}
-+
- int do_check_pgt_cache(int low, int high)
- {
-       int freed = 0;
---- linux-2.4.19/drivers/block/blkpg.c~vanilla-2.4.19  Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/drivers/block/blkpg.c    Sun Jan 19 19:46:42 2003
-@@ -296,3 +296,38 @@ int blk_ioctl(kdev_t dev, unsigned int c
- }
- EXPORT_SYMBOL(blk_ioctl);
-+
-+#define NUM_DEV_NO_WRITE 16
-+static int dev_no_write[NUM_DEV_NO_WRITE];
-+
-+/*
-+ * Debug code for turning block devices "read-only" (will discard writes
-+ * silently).  This is for filesystem crash/recovery testing.
-+ */
-+void dev_set_rdonly(kdev_t dev, int no_write)
-+{
-+      if (dev) {
-+              printk(KERN_WARNING "Turning device %s read-only\n",
-+                     bdevname(dev));
-+              dev_no_write[no_write] = 0xdead0000 + dev;
-+      }
-+}
-+
-+int dev_check_rdonly(kdev_t dev) {
-+      int i;
-+
-+      for (i = 0; i < NUM_DEV_NO_WRITE; i++) {
-+              if ((dev_no_write[i] & 0xffff0000) == 0xdead0000 &&
-+                  dev == (dev_no_write[i] & 0xffff))
-+                      return 1;
-+      }
-+      return 0;
-+}
-+
-+void dev_clear_rdonly(int no_write) {
-+      dev_no_write[no_write] = 0;
-+}
-+
-+EXPORT_SYMBOL(dev_set_rdonly);
-+EXPORT_SYMBOL(dev_check_rdonly);
-+EXPORT_SYMBOL(dev_clear_rdonly);
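Editorial sketch, not patch content: the three helpers exported here implement the "silently discard writes" debug switch described in the comment above, and the loop.c and ide-disk.c hunks below add the matching driver-side dev_check_rdonly() test. A crash/recovery test module might drive the switch roughly as follows (the device number MKDEV(8, 0) and the function name are invented for illustration):

#include <linux/kernel.h>
#include <linux/kdev_t.h>
#include <linux/blkdev.h>

/* Hypothetical test helper; MKDEV(8, 0) is only an example device. */
static void simulate_crash_window(void)
{
        kdev_t dev = MKDEV(8, 0);

        dev_set_rdonly(dev, 0);         /* slot 0: writes silently dropped */
        /* ... exercise the filesystem while writes are being discarded ... */
        if (dev_check_rdonly(dev))
                printk(KERN_INFO "device is in no-write mode\n");
        dev_clear_rdonly(0);            /* restore normal write behaviour */
}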
---- linux-2.4.19/drivers/block/loop.c~vanilla-2.4.19   Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/drivers/block/loop.c     Sun Jan 19 19:46:42 2003
-@@ -474,6 +474,11 @@ static int loop_make_request(request_que
-       spin_unlock_irq(&lo->lo_lock);
-       if (rw == WRITE) {
-+#ifdef CONFIG_DEV_RDONLY
-+              if (dev_check_rdonly(rbh->b_rdev))
-+                      goto err;
-+#endif
-+
-               if (lo->lo_flags & LO_FLAGS_READ_ONLY)
-                       goto err;
-       } else if (rw == READA) {
---- linux-2.4.19/drivers/ide/ide-disk.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/drivers/ide/ide-disk.c   Sun Jan 19 19:46:42 2003
-@@ -551,6 +551,12 @@ static ide_startstop_t lba_48_rw_disk (i
-  */
- static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block)
- {
-+#ifdef CONFIG_DEV_RDONLY
-+      if (rq->cmd == WRITE && dev_check_rdonly(rq->rq_dev)) {
-+              ide_end_request(1, HWGROUP(drive));
-+              return ide_stopped;
-+      }
-+#endif
-       if (IDE_CONTROL_REG)
-               OUT_BYTE(drive->ctl,IDE_CONTROL_REG);
---- linux-2.4.19/fs/ext3/Makefile~vanilla-2.4.19       Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/fs/ext3/Makefile Sun Jan 19 19:46:42 2003
-@@ -9,6 +9,8 @@
- O_TARGET := ext3.o
-+export-objs :=        super.o
-+
- obj-y    := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-               ioctl.o namei.o super.o symlink.o
- obj-m    := $(O_TARGET)
---- linux-2.4.19/fs/ext3/super.c~vanilla-2.4.19        Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/fs/ext3/super.c  Sun Jan 19 19:46:42 2003
-@@ -1744,7 +1744,7 @@ static void __exit exit_ext3_fs(void)
-       unregister_filesystem(&ext3_fs_type);
- }
--EXPORT_NO_SYMBOLS;
-+EXPORT_SYMBOL(ext3_bread);
- MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
- MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
---- linux-2.4.19/include/linux/blkdev.h~vanilla-2.4.19 Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/include/linux/blkdev.h   Sun Jan 19 21:05:55 2003
-@@ -240,4 +240,8 @@ static inline unsigned int block_size(kd
-       return retval;
- }
-+#define CONFIG_DEV_RDONLY
-+void dev_set_rdonly(kdev_t, int);
-+int dev_check_rdonly(kdev_t);
-+void dev_clear_rdonly(int);
- #endif
---- linux-2.4.19/include/linux/slab.h~vanilla-2.4.19   Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/include/linux/slab.h     Sun Jan 19 21:05:52 2003
-@@ -57,6 +57,7 @@ extern int kmem_cache_destroy(kmem_cache
- extern int kmem_cache_shrink(kmem_cache_t *);
- extern void *kmem_cache_alloc(kmem_cache_t *, int);
- extern void kmem_cache_free(kmem_cache_t *, void *);
-+extern int kmem_cache_validate(kmem_cache_t *cachep, void *objp);
- extern void *kmalloc(size_t, int);
- extern void kfree(const void *);
---- linux-2.4.19/kernel/ksyms.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/kernel/ksyms.c   Sun Jan 19 19:46:42 2003
-@@ -264,6 +264,7 @@ EXPORT_SYMBOL(read_cache_page);
- EXPORT_SYMBOL(set_page_dirty);
- EXPORT_SYMBOL(vfs_readlink);
- EXPORT_SYMBOL(vfs_follow_link);
-+EXPORT_SYMBOL(vfs_follow_link_it);
- EXPORT_SYMBOL(page_readlink);
- EXPORT_SYMBOL(page_follow_link);
- EXPORT_SYMBOL(page_symlink_inode_operations);
-@@ -280,6 +281,12 @@ EXPORT_SYMBOL(dcache_dir_fsync);
- EXPORT_SYMBOL(dcache_readdir);
- EXPORT_SYMBOL(dcache_dir_ops);
-+/* lustre */
-+EXPORT_SYMBOL(panic_notifier_list);
-+EXPORT_SYMBOL(pagecache_lock_cacheline);
-+EXPORT_SYMBOL(do_kern_mount);
-+EXPORT_SYMBOL(kmem_cache_validate);
-+
- /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) */
- EXPORT_SYMBOL(default_llseek);
- EXPORT_SYMBOL(dentry_open);
---- linux-2.4.19/include/linux/dcache.h~vanilla-2.4.19 Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/include/linux/dcache.h   Sun Jan 19 19:46:42 2003
-@@ -6,6 +6,34 @@
- #include <asm/atomic.h>
- #include <linux/mount.h>
-+#define IT_OPEN  (1)
-+#define IT_CREAT  (1<<1)
-+#define IT_MKDIR  (1<<2)
-+#define IT_LINK  (1<<3)
-+#define IT_LINK2  (1<<4)
-+#define IT_SYMLINK  (1<<5)
-+#define IT_UNLINK  (1<<6)
-+#define IT_RMDIR  (1<<7)
-+#define IT_RENAME  (1<<8)
-+#define IT_RENAME2  (1<<9)
-+#define IT_READDIR  (1<<10)
-+#define IT_GETATTR  (1<<11)
-+#define IT_SETATTR  (1<<12)
-+#define IT_READLINK  (1<<13)
-+#define IT_MKNOD  (1<<14)
-+#define IT_LOOKUP  (1<<15)
-+
-+struct lookup_intent {
-+      int it_op;
-+      int it_mode;
-+      int it_disposition;
-+      int it_status;
-+      struct iattr *it_iattr;
-+      __u64 it_lock_handle[2];
-+      int it_lock_mode;
-+      void *it_data;
-+};
-+
- /*
-  * linux/include/linux/dcache.h
-  *
-@@ -78,6 +106,7 @@ struct dentry {
-       unsigned long d_time;           /* used by d_revalidate */
-       struct dentry_operations  *d_op;
-       struct super_block * d_sb;      /* The root of the dentry tree */
-+      struct lookup_intent *d_it;
-       unsigned long d_vfs_flags;
-       void * d_fsdata;                /* fs-specific data */
-       unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
-@@ -90,6 +119,8 @@ struct dentry_operations {
-       int (*d_delete)(struct dentry *);
-       void (*d_release)(struct dentry *);
-       void (*d_iput)(struct dentry *, struct inode *);
-+      int (*d_revalidate2)(struct dentry *, int, struct lookup_intent *);
-+      void (*d_intent_release)(struct dentry *, struct lookup_intent *);
- };
- /* the dentry parameter passed to d_hash and d_compare is the parent
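Editorial sketch, not patch content: d_intent_release is the hook a filesystem supplies so that intent_release() in fs/namei.c can hand back whatever state an intent-aware lookup pinned (for Lustre, e.g., a lock handle carried in it_lock_handle). A hypothetical, minimal implementation, with invented "myfs" names, might look like:

#include <linux/dcache.h>

/* Hypothetical hook.  intent_release() in fs/namei.c calls this once the
 * caller has finished with the dentry, so the filesystem can drop whatever
 * the intent-aware lookup pinned. */
static void myfs_intent_release(struct dentry *de, struct lookup_intent *it)
{
        it->it_lock_mode = 0;           /* illustrative: nothing held now */
}

static struct dentry_operations myfs_dentry_ops = {
        .d_intent_release = myfs_intent_release,
};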
---- linux-2.4.19/include/linux/fs.h~vanilla-2.4.19     Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/include/linux/fs.h       Sun Jan 19 21:05:40 2003
-@@ -541,6 +541,7 @@ struct file {
-       /* needed for tty driver, and maybe others */
-       void                    *private_data;
-+      struct lookup_intent    *f_intent;
-       /* preallocated helper kiobuf to speedup O_DIRECT */
-       struct kiobuf           *f_iobuf;
-@@ -792,7 +793,9 @@ extern int vfs_symlink(struct inode *, s
- extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
- extern int vfs_rmdir(struct inode *, struct dentry *);
- extern int vfs_unlink(struct inode *, struct dentry *);
--extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
-+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-+              struct inode *new_dir, struct dentry *new_dentry,
-+              struct lookup_intent *it);
- /*
-  * File types
-@@ -853,16 +856,28 @@ struct file_operations {
- struct inode_operations {
-       int (*create) (struct inode *,struct dentry *,int);
-       struct dentry * (*lookup) (struct inode *,struct dentry *);
-+      struct dentry * (*lookup2) (struct inode *,struct dentry *, struct lookup_intent *);
-       int (*link) (struct dentry *,struct inode *,struct dentry *);
-+      int (*link2) (struct inode *,struct inode *, const char *, int);
-       int (*unlink) (struct inode *,struct dentry *);
-+      int (*unlink2) (struct inode *, char *, int);
-       int (*symlink) (struct inode *,struct dentry *,const char *);
-+      int (*symlink2) (struct inode *,const char *, int, const char *);
-       int (*mkdir) (struct inode *,struct dentry *,int);
-+      int (*mkdir2) (struct inode *,char *, int,int);
-       int (*rmdir) (struct inode *,struct dentry *);
-+      int (*rmdir2) (struct inode *, char *, int);
-       int (*mknod) (struct inode *,struct dentry *,int,int);
-+      int (*mknod2) (struct inode *,char *, int,int,int);
-       int (*rename) (struct inode *, struct dentry *,
-                       struct inode *, struct dentry *);
-+      int (*rename2) (struct inode *, struct inode *, 
-+                      char *oldname, int oldlen, 
-+                      char *newname, int newlen);
-       int (*readlink) (struct dentry *, char *,int);
-       int (*follow_link) (struct dentry *, struct nameidata *);
-+      int (*follow_link2) (struct dentry *, struct nameidata *,
-+                             struct lookup_intent *it);
-       void (*truncate) (struct inode *);
-       int (*permission) (struct inode *, int);
-       int (*revalidate) (struct dentry *);
-@@ -999,6 +1014,7 @@ extern int unregister_filesystem(struct 
- extern struct vfsmount *kern_mount(struct file_system_type *);
- extern int may_umount(struct vfsmount *);
- extern long do_mount(char *, char *, char *, unsigned long, void *);
-+struct vfsmount *do_kern_mount(const char *fstype, int flags, char *name, void *data);
- extern void umount_tree(struct vfsmount *);
- #define kern_umount mntput
-@@ -1329,6 +1345,7 @@ typedef int (*read_actor_t)(read_descrip
- extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
- extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *));
-+extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it));
- extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *));
- extern int FASTCALL(path_walk(const char *, struct nameidata *));
- extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
-@@ -1339,6 +1356,8 @@ extern struct dentry * lookup_one_len(co
- extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
- #define user_path_walk(name,nd)        __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd)
- #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd)
-+#define user_path_walk_it(name,nd,it)  __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it)
-+#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it)
- extern void iput(struct inode *);
- extern void force_delete(struct inode *);
-@@ -1448,6 +1467,8 @@ extern struct file_operations generic_ro
- extern int vfs_readlink(struct dentry *, char *, int, const char *);
- extern int vfs_follow_link(struct nameidata *, const char *);
-+extern int vfs_follow_link_it(struct nameidata *, const char *,
-+                            struct lookup_intent *it);
- extern int page_readlink(struct dentry *, char *, int);
- extern int page_follow_link(struct dentry *, struct nameidata *);
- extern struct inode_operations page_symlink_inode_operations;
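Editorial sketch, not patch content: with user_path_walk_it() and intent_release() declared as above, the caller-side pattern this patch applies throughout fs/open.c and fs/stat.c (e.g. sys_newstat) reduces to the following; example_getattr is an invented name and "filename" is a user-space path, as in the syscalls themselves.

#include <linux/fs.h>
#include <linux/dcache.h>

extern void intent_release(struct dentry *de, struct lookup_intent *it);

/* Hypothetical example mirroring the sys_*stat pattern added by the patch. */
static int example_getattr(char *filename)
{
        struct lookup_intent it = { .it_op = IT_GETATTR };
        struct nameidata nd;
        int error;

        error = user_path_walk_it(filename, &nd, &it);
        if (!error) {
                /* ... inspect nd.dentry->d_inode here ... */
                intent_release(nd.dentry, &it); /* fs releases what it pinned */
                path_release(&nd);
        }
        return error;
}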
---- linux-2.4.19/fs/dcache.c~vanilla-2.4.19    Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/fs/dcache.c      Sun Jan 19 19:46:42 2003
-@@ -616,6 +616,7 @@ struct dentry * d_alloc(struct dentry * 
-       dentry->d_op = NULL;
-       dentry->d_fsdata = NULL;
-       dentry->d_mounted = 0;
-+      dentry->d_it = NULL;
-       INIT_LIST_HEAD(&dentry->d_hash);
-       INIT_LIST_HEAD(&dentry->d_lru);
-       INIT_LIST_HEAD(&dentry->d_subdirs);
---- linux-2.4.19/fs/nfsd/vfs.c~vanilla-2.4.19  Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/fs/nfsd/vfs.c    Sun Jan 19 19:46:42 2003
-@@ -1295,7 +1295,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru
-                       err = nfserr_perm;
-       } else
- #endif
--      err = vfs_rename(fdir, odentry, tdir, ndentry);
-+      err = vfs_rename(fdir, odentry, tdir, ndentry, NULL);
-       if (!err && EX_ISSYNC(tfhp->fh_export)) {
-               nfsd_sync_dir(tdentry);
-               nfsd_sync_dir(fdentry);
---- linux-2.4.19/fs/namei.c~vanilla-2.4.19     Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/fs/namei.c       Sun Jan 19 19:46:42 2003
-@@ -94,6 +94,12 @@
-  * XEmacs seems to be relying on it...
-  */
-+void intent_release(struct dentry *de, struct lookup_intent *it)
-+{
-+      if (it && de->d_op && de->d_op->d_intent_release)
-+              de->d_op->d_intent_release(de, it);
-+}
-+
- /* In order to reduce some races, while at the same time doing additional
-  * checking and hopefully speeding things up, we copy filenames to the
-  * kernel data space before using them..
-@@ -260,10 +266,19 @@ void path_release(struct nameidata *nd)
-  * Internal lookup() using the new generic dcache.
-  * SMP-safe
-  */
--static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags)
-+static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name,
-+                                  int flags, struct lookup_intent *it)
- {
-       struct dentry * dentry = d_lookup(parent, name);
-+      if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) {
-+              if (!dentry->d_op->d_revalidate2(dentry, flags, it) &&
-+                  !d_invalidate(dentry)) {
-+                      dput(dentry);
-+                      dentry = NULL;
-+              }
-+              return dentry;
-+      } else
-       if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
-               if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) {
-                       dput(dentry);
-@@ -281,7 +296,8 @@ static struct dentry * cached_lookup(str
-  * make sure that nobody added the entry to the dcache in the meantime..
-  * SMP-safe
-  */
--static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags)
-+static struct dentry *real_lookup(struct dentry *parent, struct qstr *name,
-+                                int flags, struct lookup_intent *it)
- {
-       struct dentry * result;
-       struct inode *dir = parent->d_inode;
-@@ -300,6 +316,9 @@ static struct dentry * real_lookup(struc
-               result = ERR_PTR(-ENOMEM);
-               if (dentry) {
-                       lock_kernel();
-+                      if (dir->i_op->lookup2)
-+                              result = dir->i_op->lookup2(dir, dentry, it);
-+                      else
-                       result = dir->i_op->lookup(dir, dentry);
-                       unlock_kernel();
-                       if (result)
-@@ -321,6 +340,12 @@ static struct dentry * real_lookup(struc
-                       dput(result);
-                       result = ERR_PTR(-ENOENT);
-               }
-+      } else if (result->d_op && result->d_op->d_revalidate2) {
-+              if (!result->d_op->d_revalidate2(result, flags, it) &&
-+                  !d_invalidate(result)) {
-+                      dput(result);
-+                      result = ERR_PTR(-ENOENT);
-+              }
-       }
-       return result;
- }
-@@ -332,7 +357,8 @@ static struct dentry * real_lookup(struc
-  * Without that kind of total limit, nasty chains of consecutive
-  * symlinks can cause almost arbitrarily long lookups. 
-  */
--static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd)
-+static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd,
-+                               struct lookup_intent *it)
- {
-       int err;
-       if (current->link_count >= 5)
-@@ -346,10 +372,14 @@ static inline int do_follow_link(struct 
-       current->link_count++;
-       current->total_link_count++;
-       UPDATE_ATIME(dentry->d_inode);
--      err = dentry->d_inode->i_op->follow_link(dentry, nd);
-+      if (dentry->d_inode->i_op->follow_link2)
-+              err = dentry->d_inode->i_op->follow_link2(dentry, nd, it);
-+      else
-+              err = dentry->d_inode->i_op->follow_link(dentry, nd);
-       current->link_count--;
-       return err;
- loop:
-+      intent_release(dentry, it);
-       path_release(nd);
-       return -ELOOP;
- }
-@@ -447,7 +477,8 @@ static inline void follow_dotdot(struct 
-  *
-  * We expect 'base' to be positive and a directory.
-  */
--int link_path_walk(const char * name, struct nameidata *nd)
-+int link_path_walk_it(const char *name, struct nameidata *nd,
-+                    struct lookup_intent *it)
- {
-       struct dentry *dentry;
-       struct inode *inode;
-@@ -520,9 +551,9 @@ int link_path_walk(const char * name, st
-                               break;
-               }
-               /* This does the actual lookups.. */
--              dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE);
-+              dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL);
-               if (!dentry) {
--                      dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE);
-+                      dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL);
-                       err = PTR_ERR(dentry);
-                       if (IS_ERR(dentry))
-                               break;
-@@ -539,8 +570,8 @@ int link_path_walk(const char * name, st
-               if (!inode->i_op)
-                       goto out_dput;
--              if (inode->i_op->follow_link) {
--                      err = do_follow_link(dentry, nd);
-+              if (inode->i_op->follow_link || inode->i_op->follow_link2) {
-+                      err = do_follow_link(dentry, nd, NULL);
-                       dput(dentry);
-                       if (err)
-                               goto return_err;
-@@ -556,7 +587,7 @@ int link_path_walk(const char * name, st
-                       nd->dentry = dentry;
-               }
-               err = -ENOTDIR; 
--              if (!inode->i_op->lookup)
-+              if (!inode->i_op->lookup && !inode->i_op->lookup2)
-                       break;
-               continue;
-               /* here ends the main loop */
-@@ -583,9 +614,9 @@ last_component:
-                       if (err < 0)
-                               break;
-               }
--              dentry = cached_lookup(nd->dentry, &this, 0);
-+              dentry = cached_lookup(nd->dentry, &this, 0, it);
-               if (!dentry) {
--                      dentry = real_lookup(nd->dentry, &this, 0);
-+                      dentry = real_lookup(nd->dentry, &this, 0, it);
-                       err = PTR_ERR(dentry);
-                       if (IS_ERR(dentry))
-                               break;
-@@ -593,9 +624,9 @@ last_component:
-               while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry))
-                       ;
-               inode = dentry->d_inode;
--              if ((lookup_flags & LOOKUP_FOLLOW)
--                  && inode && inode->i_op && inode->i_op->follow_link) {
--                      err = do_follow_link(dentry, nd);
-+              if ((lookup_flags & LOOKUP_FOLLOW) && inode && inode->i_op &&
-+                  (inode->i_op->follow_link || inode->i_op->follow_link2)) {
-+                      err = do_follow_link(dentry, nd, it);
-                       dput(dentry);
-                       if (err)
-                               goto return_err;
-@@ -609,7 +640,8 @@ last_component:
-                       goto no_inode;
-               if (lookup_flags & LOOKUP_DIRECTORY) {
-                       err = -ENOTDIR; 
--                      if (!inode->i_op || !inode->i_op->lookup)
-+                      if (!inode->i_op ||
-+                          (!inode->i_op->lookup && !inode->i_op->lookup2))
-                               break;
-               }
-               goto return_base;
-@@ -646,15 +678,28 @@ out_dput:
-               dput(dentry);
-               break;
-       }
-+      if (err)
-+              intent_release(nd->dentry, it);
-       path_release(nd);
- return_err:
-       return err;
- }
-+int link_path_walk(const char * name, struct nameidata *nd)
-+{
-+      return link_path_walk_it(name, nd, NULL);
-+}
-+
-+int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it)
-+{
-+      current->total_link_count = 0;
-+      return link_path_walk_it(name, nd, it);
-+}
-+
- int path_walk(const char * name, struct nameidata *nd)
- {
-       current->total_link_count = 0;
--      return link_path_walk(name, nd);
-+      return link_path_walk_it(name, nd, NULL);
- }
- /* SMP-safe */
-@@ -757,7 +802,8 @@ int path_init(const char *name, unsigned
-  * needs parent already locked. Doesn't follow mounts.
-  * SMP-safe.
-  */
--struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
-+struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base,
-+                             struct lookup_intent *it)
- {
-       struct dentry * dentry;
-       struct inode *inode;
-@@ -780,13 +826,16 @@ struct dentry * lookup_hash(struct qstr 
-                       goto out;
-       }
--      dentry = cached_lookup(base, name, 0);
-+      dentry = cached_lookup(base, name, 0, it);
-       if (!dentry) {
-               struct dentry *new = d_alloc(base, name);
-               dentry = ERR_PTR(-ENOMEM);
-               if (!new)
-                       goto out;
-               lock_kernel();
-+              if (inode->i_op->lookup2)
-+                      dentry = inode->i_op->lookup2(inode, new, it);
-+              else
-               dentry = inode->i_op->lookup(inode, new);
-               unlock_kernel();
-               if (!dentry)
-@@ -798,6 +847,12 @@ out:
-       return dentry;
- }
-+struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
-+{
-+      return lookup_hash_it(name, base, NULL);
-+}
-+
-+
- /* SMP-safe */
- struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
- {
-@@ -819,7 +874,7 @@ struct dentry * lookup_one_len(const cha
-       }
-       this.hash = end_name_hash(hash);
--      return lookup_hash(&this, base);
-+      return lookup_hash_it(&this, base, NULL);
- access:
-       return ERR_PTR(-EACCES);
- }
-@@ -851,6 +906,23 @@ int __user_walk(const char *name, unsign
-       return err;
- }
-+int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd,
-+                 struct lookup_intent *it)
-+{
-+      char *tmp;
-+      int err;
-+
-+      tmp = getname(name);
-+      err = PTR_ERR(tmp);
-+      if (!IS_ERR(tmp)) {
-+              err = 0;
-+              if (path_init(tmp, flags, nd))
-+                      err = path_walk_it(tmp, nd, it);
-+              putname(tmp);
-+      }
-+      return err;
-+}
-+
- /*
-  * It's inline, so penalty for filesystems that don't use sticky bit is
-  * minimal.
-@@ -987,7 +1059,8 @@ exit_lock:
-  * for symlinks (where the permissions are checked later).
-  * SMP-safe
-  */
--int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
-+int open_namei_it(const char *pathname, int flag, int mode,
-+                struct nameidata *nd, struct lookup_intent *it)
- {
-       int acc_mode, error = 0;
-       struct inode *inode;
-@@ -1002,7 +1075,7 @@ int open_namei(const char * pathname, in
-        */
-       if (!(flag & O_CREAT)) {
-               if (path_init(pathname, lookup_flags(flag), nd))
--                      error = path_walk(pathname, nd);
-+                      error = path_walk_it(pathname, nd, it);
-               if (error)
-                       return error;
-               dentry = nd->dentry;
-@@ -1012,6 +1085,10 @@ int open_namei(const char * pathname, in
-       /*
-        * Create - we need to know the parent.
-        */
-+      if (it) {
-+              it->it_mode = mode;
-+              it->it_op |= IT_CREAT;
-+      }
-       if (path_init(pathname, LOOKUP_PARENT, nd))
-               error = path_walk(pathname, nd);
-       if (error)
-@@ -1028,7 +1105,7 @@ int open_namei(const char * pathname, in
-       dir = nd->dentry;
-       down(&dir->d_inode->i_sem);
--      dentry = lookup_hash(&nd->last, nd->dentry);
-+      dentry = lookup_hash_it(&nd->last, nd->dentry, it);
- do_last:
-       error = PTR_ERR(dentry);
-@@ -1037,6 +1114,7 @@ do_last:
-               goto exit;
-       }
-+      it->it_mode = mode;
-       /* Negative dentry, just create the file */
-       if (!dentry->d_inode) {
-               error = vfs_create(dir->d_inode, dentry,
-@@ -1070,7 +1148,8 @@ do_last:
-       error = -ENOENT;
-       if (!dentry->d_inode)
-               goto exit_dput;
--      if (dentry->d_inode->i_op && dentry->d_inode->i_op->follow_link)
-+      if (dentry->d_inode->i_op && (dentry->d_inode->i_op->follow_link ||
-+                                    dentry->d_inode->i_op->follow_link2))
-               goto do_link;
-       dput(nd->dentry);
-@@ -1156,8 +1235,10 @@ ok:
-       return 0;
- exit_dput:
-+      intent_release(dentry, it);
-       dput(dentry);
- exit:
-+      intent_release(nd->dentry, it);
-       path_release(nd);
-       return error;
-@@ -1176,7 +1257,12 @@ do_link:
-        * are done. Procfs-like symlinks just set LAST_BIND.
-        */
-       UPDATE_ATIME(dentry->d_inode);
--      error = dentry->d_inode->i_op->follow_link(dentry, nd);
-+      if (dentry->d_inode->i_op->follow_link2)
-+              error = dentry->d_inode->i_op->follow_link2(dentry, nd, it);
-+      else
-+              error = dentry->d_inode->i_op->follow_link(dentry, nd);
-+      if (error)
-+              intent_release(dentry, it);
-       dput(dentry);
-       if (error)
-               return error;
-@@ -1198,13 +1284,20 @@ do_link:
-       }
-       dir = nd->dentry;
-       down(&dir->d_inode->i_sem);
--      dentry = lookup_hash(&nd->last, nd->dentry);
-+      dentry = lookup_hash_it(&nd->last, nd->dentry, it);
-       putname(nd->last.name);
-       goto do_last;
- }
-+int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd)
-+{
-+      return open_namei_it(pathname, flag, mode, nd, NULL);
-+}
-+
-+
- /* SMP-safe */
--static struct dentry *lookup_create(struct nameidata *nd, int is_dir)
-+static struct dentry *lookup_create(struct nameidata *nd, int is_dir,
-+                                  struct lookup_intent *it)
- {
-       struct dentry *dentry;
-@@ -1212,7 +1305,7 @@ static struct dentry *lookup_create(stru
-       dentry = ERR_PTR(-EEXIST);
-       if (nd->last_type != LAST_NORM)
-               goto fail;
--      dentry = lookup_hash(&nd->last, nd->dentry);
-+      dentry = lookup_hash_it(&nd->last, nd->dentry, it);
-       if (IS_ERR(dentry))
-               goto fail;
-       if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
-@@ -1258,6 +1351,7 @@ asmlinkage long sys_mknod(const char * f
-       char * tmp;
-       struct dentry * dentry;
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_MKNOD, .it_mode = mode };
-       if (S_ISDIR(mode))
-               return -EPERM;
-@@ -1269,7 +1363,19 @@ asmlinkage long sys_mknod(const char * f
-               error = path_walk(tmp, &nd);
-       if (error)
-               goto out;
--      dentry = lookup_create(&nd, 0);
-+
-+      if (nd.dentry->d_inode->i_op->mknod2) {
-+              struct inode_operations *op = nd.dentry->d_inode->i_op;
-+              error = op->mknod2(nd.dentry->d_inode, 
-+                                 nd.last.name, 
-+                                 nd.last.len,
-+                                 mode, dev);
-+              /* the file system want to use normal vfs path now */
-+              if (error != -EOPNOTSUPP)
-+                      goto out2;
-+      }
-+      
-+      dentry = lookup_create(&nd, 0, &it);
-       error = PTR_ERR(dentry);
-       mode &= ~current->fs->umask;
-@@ -1287,9 +1393,11 @@ asmlinkage long sys_mknod(const char * f
-               default:
-                       error = -EINVAL;
-               }
-+              intent_release(dentry, &it);
-               dput(dentry);
-       }
-       up(&nd.dentry->d_inode->i_sem);
-+ out2:
-       path_release(&nd);
- out:
-       putname(tmp);
-@@ -1327,6 +1435,7 @@ asmlinkage long sys_mkdir(const char * p
- {
-       int error = 0;
-       char * tmp;
-+      struct lookup_intent it = { .it_op = IT_MKDIR, .it_mode = mode };
-       tmp = getname(pathname);
-       error = PTR_ERR(tmp);
-@@ -1338,14 +1447,26 @@ asmlinkage long sys_mkdir(const char * p
-                       error = path_walk(tmp, &nd);
-               if (error)
-                       goto out;
--              dentry = lookup_create(&nd, 1);
-+              if (nd.dentry->d_inode->i_op->mkdir2) {
-+                      struct inode_operations *op = nd.dentry->d_inode->i_op;
-+                      error = op->mkdir2(nd.dentry->d_inode, 
-+                                         nd.last.name, 
-+                                         nd.last.len,
-+                                         mode);
-+                      /* the file system want to use normal vfs path now */
-+                      if (error != -EOPNOTSUPP)
-+                              goto out2;
-+              }
-+              dentry = lookup_create(&nd, 1, &it);
-               error = PTR_ERR(dentry);
-               if (!IS_ERR(dentry)) {
-                       error = vfs_mkdir(nd.dentry->d_inode, dentry,
-                                         mode & ~current->fs->umask);
-+                      intent_release(dentry, &it);
-                       dput(dentry);
-               }
-               up(&nd.dentry->d_inode->i_sem);
-+out2:
-               path_release(&nd);
- out:
-               putname(tmp);
-@@ -1426,6 +1547,7 @@ asmlinkage long sys_rmdir(const char * p
-       char * name;
-       struct dentry *dentry;
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_RMDIR };
-       name = getname(pathname);
-       if(IS_ERR(name))
-@@ -1447,11 +1569,21 @@ asmlinkage long sys_rmdir(const char * p
-                       error = -EBUSY;
-                       goto exit1;
-       }
-+      if (nd.dentry->d_inode->i_op->rmdir2) {
-+              struct inode_operations *op = nd.dentry->d_inode->i_op;
-+              error = op->rmdir2(nd.dentry->d_inode, 
-+                                 nd.last.name, 
-+                                 nd.last.len);
-+              /* the file system want to use normal vfs path now */
-+              if (error != -EOPNOTSUPP)
-+                      goto exit1;
-+      }
-       down(&nd.dentry->d_inode->i_sem);
--      dentry = lookup_hash(&nd.last, nd.dentry);
-+      dentry = lookup_hash_it(&nd.last, nd.dentry, &it);
-       error = PTR_ERR(dentry);
-       if (!IS_ERR(dentry)) {
-               error = vfs_rmdir(nd.dentry->d_inode, dentry);
-+              intent_release(dentry, &it);
-               dput(dentry);
-       }
-       up(&nd.dentry->d_inode->i_sem);
-@@ -1495,6 +1627,7 @@ asmlinkage long sys_unlink(const char * 
-       char * name;
-       struct dentry *dentry;
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_UNLINK };
-       name = getname(pathname);
-       if(IS_ERR(name))
-@@ -1507,8 +1640,17 @@ asmlinkage long sys_unlink(const char * 
-       error = -EISDIR;
-       if (nd.last_type != LAST_NORM)
-               goto exit1;
-+      if (nd.dentry->d_inode->i_op->unlink2) {
-+              struct inode_operations *op = nd.dentry->d_inode->i_op;
-+              error = op->unlink2(nd.dentry->d_inode, 
-+                                  nd.last.name, 
-+                                  nd.last.len);
-+              /* the file system want to use normal vfs path now */
-+              if (error != -EOPNOTSUPP)
-+                      goto exit1;
-+      }
-       down(&nd.dentry->d_inode->i_sem);
--      dentry = lookup_hash(&nd.last, nd.dentry);
-+      dentry = lookup_hash_it(&nd.last, nd.dentry, &it);
-       error = PTR_ERR(dentry);
-       if (!IS_ERR(dentry)) {
-               /* Why not before? Because we want correct error value */
-@@ -1516,6 +1658,7 @@ asmlinkage long sys_unlink(const char * 
-                       goto slashes;
-               error = vfs_unlink(nd.dentry->d_inode, dentry);
-       exit2:
-+              intent_release(dentry, &it);
-               dput(dentry);
-       }
-       up(&nd.dentry->d_inode->i_sem);
-@@ -1562,6 +1705,7 @@ asmlinkage long sys_symlink(const char *
-       int error = 0;
-       char * from;
-       char * to;
-+      struct lookup_intent it = { .it_op = IT_SYMLINK };
-       from = getname(oldname);
-       if(IS_ERR(from))
-@@ -1576,15 +1720,28 @@ asmlinkage long sys_symlink(const char *
-                       error = path_walk(to, &nd);
-               if (error)
-                       goto out;
--              dentry = lookup_create(&nd, 0);
-+              if (nd.dentry->d_inode->i_op->symlink2) {
-+                      struct inode_operations *op = nd.dentry->d_inode->i_op;
-+                      error = op->symlink2(nd.dentry->d_inode, 
-+                                           nd.last.name, 
-+                                           nd.last.len,
-+                                           from);
-+                      /* the file system want to use normal vfs path now */
-+                      if (error != -EOPNOTSUPP)
-+                              goto out2;
-+              }
-+              it.it_data = from;
-+              dentry = lookup_create(&nd, 0, &it);
-               error = PTR_ERR(dentry);
-               if (!IS_ERR(dentry)) {
-                       error = vfs_symlink(nd.dentry->d_inode, dentry, from);
-+                      intent_release(dentry, &it);
-                       dput(dentry);
-               }
-               up(&nd.dentry->d_inode->i_sem);
-+      out2:
-               path_release(&nd);
--out:
-+      out:
-               putname(to);
-       }
-       putname(from);
-@@ -1645,6 +1802,7 @@ asmlinkage long sys_link(const char * ol
-       int error;
-       char * from;
-       char * to;
-+      struct lookup_intent it = { .it_op = IT_LINK };
-       from = getname(oldname);
-       if(IS_ERR(from))
-@@ -1657,7 +1815,7 @@ asmlinkage long sys_link(const char * ol
-               error = 0;
-               if (path_init(from, LOOKUP_POSITIVE, &old_nd))
--                      error = path_walk(from, &old_nd);
-+                      error = path_walk_it(from, &old_nd, &it);
-               if (error)
-                       goto exit;
-               if (path_init(to, LOOKUP_PARENT, &nd))
-@@ -1667,10 +1825,22 @@ asmlinkage long sys_link(const char * ol
-               error = -EXDEV;
-               if (old_nd.mnt != nd.mnt)
-                       goto out_release;
--              new_dentry = lookup_create(&nd, 0);
-+              if (nd.dentry->d_inode->i_op->link2) {
-+                      struct inode_operations *op = nd.dentry->d_inode->i_op;
-+                      error = op->link2(old_nd.dentry->d_inode, 
-+                                        nd.dentry->d_inode, 
-+                                        nd.last.name, 
-+                                        nd.last.len);
-+                      /* the file system want to use normal vfs path now */
-+                      if (error != -EOPNOTSUPP)
-+                              goto out_release;
-+              }
-+              it.it_op = IT_LINK2;
-+              new_dentry = lookup_create(&nd, 0, &it);
-               error = PTR_ERR(new_dentry);
-               if (!IS_ERR(new_dentry)) {
-                       error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
-+                      intent_release(new_dentry, &it);
-                       dput(new_dentry);
-               }
-               up(&nd.dentry->d_inode->i_sem);
-@@ -1713,7 +1883,8 @@ exit:
-  *       locking].
-  */
- int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
--             struct inode *new_dir, struct dentry *new_dentry)
-+                 struct inode *new_dir, struct dentry *new_dentry,
-+                 struct lookup_intent *it)
- {
-       int error;
-       struct inode *target;
-@@ -1771,6 +1942,7 @@ int vfs_rename_dir(struct inode *old_dir
-               error = -EBUSY;
-       else 
-               error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-+      intent_release(new_dentry, it);
-       if (target) {
-               if (!error)
-                       target->i_flags |= S_DEAD;
-@@ -1792,7 +1964,8 @@ out_unlock:
- }
- int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
--             struct inode *new_dir, struct dentry *new_dentry)
-+                   struct inode *new_dir, struct dentry *new_dentry,
-+                   struct lookup_intent *it)
- {
-       int error;
-@@ -1823,6 +1996,7 @@ int vfs_rename_other(struct inode *old_d
-               error = -EBUSY;
-       else
-               error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-+      intent_release(new_dentry, it);
-       double_up(&old_dir->i_zombie, &new_dir->i_zombie);
-       if (error)
-               return error;
-@@ -1834,13 +2008,14 @@ int vfs_rename_other(struct inode *old_d
- }
- int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
--             struct inode *new_dir, struct dentry *new_dentry)
-+             struct inode *new_dir, struct dentry *new_dentry,
-+             struct lookup_intent *it)
- {
-       int error;
-       if (S_ISDIR(old_dentry->d_inode->i_mode))
--              error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
-+              error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry,it);
-       else
--              error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
-+              error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,it);
-       if (!error) {
-               if (old_dir == new_dir)
-                       inode_dir_notify(old_dir, DN_RENAME);
-@@ -1857,6 +2032,7 @@ static inline int do_rename(const char *
-       int error = 0;
-       struct dentry * old_dir, * new_dir;
-       struct dentry * old_dentry, *new_dentry;
-+      struct lookup_intent it = { .it_op = IT_RENAME };
-       struct nameidata oldnd, newnd;
-       if (path_init(oldname, LOOKUP_PARENT, &oldnd))
-@@ -1883,9 +2059,23 @@ static inline int do_rename(const char *
-       if (newnd.last_type != LAST_NORM)
-               goto exit2;
-+      if (old_dir->d_inode->i_op->rename2) {
-+              lock_kernel();
-+              error = old_dir->d_inode->i_op->rename2(old_dir->d_inode, 
-+                                              new_dir->d_inode,
-+                                              oldnd.last.name, 
-+                                              oldnd.last.len,
-+                                              newnd.last.name,
-+                                              newnd.last.len);
-+              unlock_kernel();
-+              /* the file system want to use normal vfs path now */
-+              if (error != -EOPNOTSUPP)
-+                      goto exit2;
-+      }
-+
-       double_lock(new_dir, old_dir);
--      old_dentry = lookup_hash(&oldnd.last, old_dir);
-+      old_dentry = lookup_hash_it(&oldnd.last, old_dir, &it);
-       error = PTR_ERR(old_dentry);
-       if (IS_ERR(old_dentry))
-               goto exit3;
-@@ -1901,18 +2091,21 @@ static inline int do_rename(const char *
-               if (newnd.last.name[newnd.last.len])
-                       goto exit4;
-       }
--      new_dentry = lookup_hash(&newnd.last, new_dir);
-+      it.it_op = IT_RENAME2;
-+      new_dentry = lookup_hash_it(&newnd.last, new_dir, &it);
-       error = PTR_ERR(new_dentry);
-       if (IS_ERR(new_dentry))
-               goto exit4;
-       lock_kernel();
-       error = vfs_rename(old_dir->d_inode, old_dentry,
--                                 new_dir->d_inode, new_dentry);
-+                                 new_dir->d_inode, new_dentry, &it);
-       unlock_kernel();
-+      intent_release(new_dentry, &it);
-       dput(new_dentry);
- exit4:
-+      intent_release(old_dentry, &it);
-       dput(old_dentry);
- exit3:
-       double_up(&new_dir->d_inode->i_sem, &old_dir->d_inode->i_sem);
-@@ -1961,7 +2154,8 @@ out:
- }
- static inline int
--__vfs_follow_link(struct nameidata *nd, const char *link)
-+__vfs_follow_link(struct nameidata *nd, const char *link,
-+                struct lookup_intent *it)
- {
-       int res = 0;
-       char *name;
-@@ -1974,7 +2168,7 @@ __vfs_follow_link(struct nameidata *nd, 
-                       /* weird __emul_prefix() stuff did it */
-                       goto out;
-       }
--      res = link_path_walk(link, nd);
-+      res = link_path_walk_it(link, nd, it);
- out:
-       if (current->link_count || res || nd->last_type!=LAST_NORM)
-               return res;
-@@ -1996,7 +2190,13 @@ fail:
- int vfs_follow_link(struct nameidata *nd, const char *link)
- {
--      return __vfs_follow_link(nd, link);
-+      return __vfs_follow_link(nd, link, NULL);
-+}
-+
-+int vfs_follow_link_it(struct nameidata *nd, const char *link,
-+                     struct lookup_intent *it)
-+{
-+      return __vfs_follow_link(nd, link, it);
- }
- /* get the link contents into pagecache */
-@@ -2038,7 +2238,7 @@ int page_follow_link(struct dentry *dent
- {
-       struct page *page = NULL;
-       char *s = page_getlink(dentry, &page);
--      int res = __vfs_follow_link(nd, s);
-+      int res = __vfs_follow_link(nd, s, NULL);
-       if (page) {
-               kunmap(page);
-               page_cache_release(page);
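
Two idioms recur throughout the fs/namei.c hunks above: each existing entry point (path_walk, lookup_hash, open_namei, vfs_follow_link) becomes a thin wrapper that forwards a NULL intent to a new *_it() variant, and each namespace-changing syscall first tries a new inode operation (mknod2, mkdir2, rmdir2, unlink2, symlink2, link2, rename2) and falls back to the normal VFS path when it returns -EOPNOTSUPP. The following is only a minimal user-space sketch of those two idioms; all names in it (do_mkdir_it, fs_mkdir2, dir_ops and so on) are illustrative, not kernel code.

#include <errno.h>
#include <stdio.h>
#include <string.h>

struct lookup_intent { int it_op; int it_mode; };

struct dir_ops {
        int (*mkdir)(const char *name, int mode);
        int (*mkdir2)(const char *name, int len, int mode); /* may be NULL */
};

static int generic_mkdir(const char *name, int mode)
{
        printf("generic path: mkdir %s mode %o\n", name, (unsigned)mode);
        return 0;
}

static int fs_mkdir2(const char *name, int len, int mode)
{
        (void)name; (void)len; (void)mode;
        return -EOPNOTSUPP;     /* this fs wants the normal VFS path */
}

/* new entry point: carries an intent */
static int do_mkdir_it(const struct dir_ops *op, const char *name, int mode,
                       struct lookup_intent *it)
{
        if (op->mkdir2) {
                int err = op->mkdir2(name, (int)strlen(name), mode);

                if (err != -EOPNOTSUPP)
                        return err;     /* handled (or really failed) by the fs */
                /* -EOPNOTSUPP: fall through to the generic path */
        }
        if (it)
                it->it_mode = mode;     /* record the intended mode for the lookup */
        return op->mkdir(name, mode);
}

/* old entry point: unchanged signature, forwards a NULL intent */
static int do_mkdir(const struct dir_ops *op, const char *name, int mode)
{
        return do_mkdir_it(op, name, mode, NULL);
}

int main(void)
{
        struct dir_ops op = { .mkdir = generic_mkdir, .mkdir2 = fs_mkdir2 };
        struct lookup_intent it = { .it_op = 1 /* stands in for IT_MKDIR */ };

        do_mkdir_it(&op, "with-intent", 0755, &it);
        do_mkdir(&op, "without-intent", 0755);
        return 0;
}

The point of the fallback is that an ordinary filesystem simply leaves the *2 method unset (or returns -EOPNOTSUPP) and never notices the change, while an intent-aware filesystem can do the whole operation in one step.
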
---- linux-2.4.19/fs/open.c~vanilla-2.4.19      Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/fs/open.c        Sun Jan 19 19:46:42 2003
-@@ -19,6 +19,9 @@
- #include <asm/uaccess.h>
- #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
-+extern int path_walk_it(const char *name, struct nameidata *nd,
-+                      struct lookup_intent *it);
-+extern void intent_release(struct dentry *de, struct lookup_intent *it);
- int vfs_statfs(struct super_block *sb, struct statfs *buf)
- {
-@@ -118,12 +121,13 @@ static inline long do_sys_truncate(const
-       struct nameidata nd;
-       struct inode * inode;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
-       error = -EINVAL;
-       if (length < 0) /* sorry, but loff_t says... */
-               goto out;
--      error = user_path_walk(path, &nd);
-+      error = user_path_walk_it(path, &nd, &it);
-       if (error)
-               goto out;
-       inode = nd.dentry->d_inode;
-@@ -168,6 +172,7 @@ static inline long do_sys_truncate(const
-       put_write_access(inode);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -259,8 +264,9 @@ asmlinkage long sys_utime(char * filenam
-       struct nameidata nd;
-       struct inode * inode;
-       struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (error)
-               goto out;
-       inode = nd.dentry->d_inode;
-@@ -286,6 +292,7 @@ asmlinkage long sys_utime(char * filenam
-       }
-       error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -303,8 +310,9 @@ asmlinkage long sys_utimes(char * filena
-       struct nameidata nd;
-       struct inode * inode;
-       struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (error)
-               goto out;
-@@ -331,6 +339,7 @@ asmlinkage long sys_utimes(char * filena
-       }
-       error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -347,6 +356,7 @@ asmlinkage long sys_access(const char * 
-       int old_fsuid, old_fsgid;
-       kernel_cap_t old_cap;
-       int res;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       if (mode & ~S_IRWXO)    /* where's F_OK, X_OK, W_OK, R_OK? */
-               return -EINVAL;
-@@ -364,13 +374,14 @@ asmlinkage long sys_access(const char * 
-       else
-               current->cap_effective = current->cap_permitted;
--      res = user_path_walk(filename, &nd);
-+      res = user_path_walk_it(filename, &nd, &it);
-       if (!res) {
-               res = permission(nd.dentry->d_inode, mode);
-               /* SuS v2 requires we report a read only fs too */
-               if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode)
-                  && !special_file(nd.dentry->d_inode->i_mode))
-                       res = -EROFS;
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-@@ -386,6 +397,7 @@ asmlinkage long sys_chdir(const char * f
-       int error;
-       struct nameidata nd;
-       char *name;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       name = getname(filename);
-       error = PTR_ERR(name);
-@@ -394,7 +406,7 @@ asmlinkage long sys_chdir(const char * f
-       error = 0;
-       if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd))
--              error = path_walk(name, &nd);
-+              error = path_walk_it(name, &nd, &it);
-       putname(name);
-       if (error)
-               goto out;
-@@ -406,6 +418,7 @@ asmlinkage long sys_chdir(const char * f
-       set_fs_pwd(current->fs, nd.mnt, nd.dentry);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -446,6 +459,7 @@ asmlinkage long sys_chroot(const char * 
-       int error;
-       struct nameidata nd;
-       char *name;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       name = getname(filename);
-       error = PTR_ERR(name);
-@@ -454,7 +468,7 @@ asmlinkage long sys_chroot(const char * 
-       path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW |
-                     LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
--      error = path_walk(name, &nd);   
-+      error = path_walk_it(name, &nd, &it);
-       putname(name);
-       if (error)
-               goto out;
-@@ -471,6 +485,7 @@ asmlinkage long sys_chroot(const char * 
-       set_fs_altroot();
-       error = 0;
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -515,8 +530,9 @@ asmlinkage long sys_chmod(const char * f
-       struct inode * inode;
-       int error;
-       struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (error)
-               goto out;
-       inode = nd.dentry->d_inode;
-@@ -536,6 +552,7 @@ asmlinkage long sys_chmod(const char * f
-       error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -605,10 +622,12 @@ asmlinkage long sys_chown(const char * f
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (!error) {
-               error = chown_common(nd.dentry, user, group);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -618,10 +637,12 @@ asmlinkage long sys_lchown(const char * 
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
--      error = user_path_walk_link(filename, &nd);
-+      error = user_path_walk_link_it(filename, &nd, &it);
-       if (!error) {
-               error = chown_common(nd.dentry, user, group);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -655,10 +676,16 @@ asmlinkage long sys_fchown(unsigned int 
-  * for the internal routines (ie open_namei()/follow_link() etc). 00 is
-  * used by symlinks.
-  */
-+extern int open_namei_it(const char *filename, int namei_flags, int mode,
-+                       struct nameidata *nd, struct lookup_intent *it);
-+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
-+                          int flags, struct lookup_intent *it);
-+
- struct file *filp_open(const char * filename, int flags, int mode)
- {
-       int namei_flags, error;
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_OPEN };
-       namei_flags = flags;
-       if ((namei_flags+1) & O_ACCMODE)
-@@ -666,14 +693,15 @@ struct file *filp_open(const char * file
-       if (namei_flags & O_TRUNC)
-               namei_flags |= 2;
--      error = open_namei(filename, namei_flags, mode, &nd);
--      if (!error)
--              return dentry_open(nd.dentry, nd.mnt, flags);
-+      error = open_namei_it(filename, namei_flags, mode, &nd, &it);
-+      if (error)
-+              return ERR_PTR(error);
--      return ERR_PTR(error);
-+      return dentry_open_it(nd.dentry, nd.mnt, flags, &it);
- }
--struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
-+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
-+                          int flags, struct lookup_intent *it)
- {
-       struct file * f;
-       struct inode *inode;
-@@ -716,6 +744,7 @@ struct file *dentry_open(struct dentry *
-       }
-       f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
-+      intent_release(dentry, it);
-       return f;
- cleanup_all:
-@@ -730,11 +759,17 @@ cleanup_all:
- cleanup_file:
-       put_filp(f);
- cleanup_dentry:
-+      intent_release(dentry, it);
-       dput(dentry);
-       mntput(mnt);
-       return ERR_PTR(error);
- }
-+struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
-+{
-+      return dentry_open_it(dentry, mnt, flags, NULL);
-+}
-+
- /*
-  * Find an empty file descriptor entry, and mark it busy.
-  */
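
The fs/open.c hunks above rework filp_open() so that the intent built for the open travels through open_namei_it() into dentry_open_it(), which releases it once the struct file is set up, and on each failure path before the dentry and mount references are dropped; the old dentry_open() becomes a wrapper passing a NULL intent. A rough user-space sketch of that release-exactly-once discipline follows; file_open_it(), file_open() and intent_release() here are stand-ins, not the kernel API.

#include <errno.h>
#include <stdio.h>

struct lookup_intent { int it_op; int it_flags; };

static void intent_release(struct lookup_intent *it)
{
        if (it)
                printf("intent_release(op=%d)\n", it->it_op);
}

/* stand-in for dentry_open_it(): consumes the intent on every exit path */
static FILE *file_open_it(const char *path, const char *mode,
                          struct lookup_intent *it)
{
        FILE *f = fopen(path, mode);

        if (!f) {
                intent_release(it);     /* error path still releases */
                return NULL;
        }
        intent_release(it);             /* success: released once the file is set up */
        return f;
}

/* stand-in for the old dentry_open(): same behaviour, NULL intent */
static FILE *file_open(const char *path, const char *mode)
{
        return file_open_it(path, mode, NULL);
}

int main(void)
{
        struct lookup_intent it = { .it_op = 1 /* stands in for IT_OPEN */ };
        FILE *f = file_open_it("/etc/hostname", "r", &it);

        if (f)
                fclose(f);
        file_open("/nonexistent", "r"); /* error path, no intent to release */
        return 0;
}
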
---- linux-2.4.19/fs/stat.c~vanilla-2.4.19      Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/fs/stat.c        Sun Jan 19 19:46:42 2003
-@@ -13,6 +13,7 @@
- #include <asm/uaccess.h>
-+extern void intent_release(struct dentry *de, struct lookup_intent *it);
- /*
-  * Revalidate the inode. This is required for proper NFS attribute caching.
-  */
-@@ -135,13 +136,15 @@ static int cp_new_stat(struct inode * in
- asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf)
- {
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       int error;
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_old_stat(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -151,13 +154,15 @@ asmlinkage long sys_stat(char * filename
- asmlinkage long sys_newstat(char * filename, struct stat * statbuf)
- {
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       int error;
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_new_stat(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -172,13 +177,15 @@ asmlinkage long sys_newstat(char * filen
- asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf)
- {
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       int error;
--      error = user_path_walk_link(filename, &nd);
-+      error = user_path_walk_link_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_old_stat(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -189,13 +196,15 @@ asmlinkage long sys_lstat(char * filenam
- asmlinkage long sys_newlstat(char * filename, struct stat * statbuf)
- {
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       int error;
--      error = user_path_walk_link(filename, &nd);
-+      error = user_path_walk_link_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_new_stat(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -247,20 +256,21 @@ asmlinkage long sys_readlink(const char 
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_READLINK };
-       if (bufsiz <= 0)
-               return -EINVAL;
--      error = user_path_walk_link(path, &nd);
-+      error = user_path_walk_link_it(path, &nd, &it);
-       if (!error) {
-               struct inode * inode = nd.dentry->d_inode;
--
-               error = -EINVAL;
-               if (inode->i_op && inode->i_op->readlink &&
-                   !(error = do_revalidate(nd.dentry))) {
-                       UPDATE_ATIME(inode);
-                       error = inode->i_op->readlink(nd.dentry, buf, bufsiz);
-               }
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -333,12 +343,14 @@ asmlinkage long sys_stat64(char * filena
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_new_stat64(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -348,12 +360,14 @@ asmlinkage long sys_lstat64(char * filen
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
--      error = user_path_walk_link(filename, &nd);
-+      error = user_path_walk_link_it(filename, &nd, &it);
-       if (!error) {
-               error = do_revalidate(nd.dentry);
-               if (!error)
-                       error = cp_new_stat64(nd.dentry->d_inode, statbuf);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
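
In fs/stat.c above, every stat-family syscall now tags its path walk with an intent whose it_op names the operation (IT_GETATTR for the stat variants, IT_READLINK for readlink), mirroring the IT_SETATTR and IT_OPEN tags used in fs/open.c; fs/namei.c additionally ORs bits together, as in it->it_op |= IT_CREAT for an O_CREAT open, so the flags are single bits and one intent can describe a compound operation. A small sketch of that usage follows; only IT_LOOKUP and IT_UNLINK have values visible in the patch (1<<4 and 1<<5), the other bit values below are assumed.

#include <stdio.h>

#define IT_OPEN     (1 << 0)    /* assumed value */
#define IT_CREAT    (1 << 1)    /* assumed value */
#define IT_GETATTR  (1 << 2)    /* assumed value */
#define IT_SETATTR  (1 << 3)    /* assumed value */
#define IT_LOOKUP   (1 << 4)    /* value from the patch */
#define IT_UNLINK   (1 << 5)    /* value from the patch */

struct lookup_intent { int it_op; int it_mode; };

static void describe(const struct lookup_intent *it)
{
        printf("it_op=0x%x:%s%s%s\n", it->it_op,
               (it->it_op & IT_OPEN)    ? " open"    : "",
               (it->it_op & IT_CREAT)   ? " creat"   : "",
               (it->it_op & IT_GETATTR) ? " getattr" : "");
}

int main(void)
{
        /* sys_open()-style: start as a plain open... */
        struct lookup_intent it = { .it_op = IT_OPEN };

        describe(&it);
        /* ...and, as in open_namei_it() with O_CREAT, OR in the create bit */
        it.it_op |= IT_CREAT;
        it.it_mode = 0644;
        describe(&it);
        return 0;
}
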
---- linux-2.4.19/mm/slab.c~vanilla-2.4.19      Sun Jan 19 19:46:42 2003
-+++ linux-2.4.19-root/mm/slab.c        Sun Jan 19 19:46:42 2003
-@@ -1207,6 +1207,59 @@ failed:
-  * Called with the cache-lock held.
-  */
-+extern struct page *check_get_page(unsigned long kaddr);
-+struct page *page_mem_map(struct page *page);
-+static int kmem_check_cache_obj (kmem_cache_t * cachep,
-+                               slab_t *slabp, void * objp)
-+{
-+      int i;
-+      unsigned int objnr;
-+
-+#if DEBUG
-+      if (cachep->flags & SLAB_RED_ZONE) {
-+              objp -= BYTES_PER_WORD;
-+              if ( *(unsigned long *)objp != RED_MAGIC2)
-+                      /* Either write before start, or a double free. */
-+                      return 0;
-+              if (*(unsigned long *)(objp+cachep->objsize -
-+                              BYTES_PER_WORD) != RED_MAGIC2)
-+                      /* Either write past end, or a double free. */
-+                      return 0;
-+      }
-+#endif
-+
-+      objnr = (objp-slabp->s_mem)/cachep->objsize;
-+      if (objnr >= cachep->num)
-+              return 0;
-+      if (objp != slabp->s_mem + objnr*cachep->objsize)
-+              return 0;
-+
-+      /* Check slab's freelist to see if this obj is there. */
-+      for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
-+              if (i == objnr)
-+                      return 0;
-+      }
-+      return 1;
-+}
-+
-+
-+int kmem_cache_validate(kmem_cache_t *cachep, void *objp)
-+{
-+      struct page *page = check_get_page((unsigned long)objp);
-+
-+      if (!VALID_PAGE(page))
-+              return 0;
-+
-+      if (!PageSlab(page))
-+              return 0;
-+
-+      /* XXX check for freed slab objects ? */
-+      if (!kmem_check_cache_obj(cachep, GET_PAGE_SLAB(page), objp))
-+              return 0;
-+
-+      return (cachep == GET_PAGE_CACHE(page));
-+}
-+
- #if DEBUG
- static int kmem_extra_free_checks (kmem_cache_t * cachep,
-                       slab_t *slabp, void * objp)
-
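
The mm/slab.c addition above, kmem_cache_validate(), answers "is this pointer a live object of this cache?" by checking the page, the cache, the object offset, and the slab's free list. The same three object-level checks can be illustrated with an ordinary fixed-size pool in user space; the pool below is only a stand-in for the slab allocator, not a copy of it.

#include <stdio.h>

#define OBJ_SIZE  32
#define NUM_OBJS  8
#define FREE_END  (-1)

struct pool {
        char mem[OBJ_SIZE * NUM_OBJS];
        int  bufctl[NUM_OBJS];  /* free-list links, FREE_END terminates */
        int  free;              /* index of the first free object */
};

static int pool_validate(const struct pool *p, const void *objp)
{
        long off = (const char *)objp - p->mem;
        int objnr, i;

        if (off < 0 || off >= (long)sizeof(p->mem))
                return 0;                       /* outside the object area */
        objnr = (int)(off / OBJ_SIZE);
        if ((const char *)objp != p->mem + objnr * OBJ_SIZE)
                return 0;                       /* not on an object boundary */
        for (i = p->free; i != FREE_END; i = p->bufctl[i])
                if (i == objnr)
                        return 0;               /* currently on the free list */
        return 1;
}

int main(void)
{
        struct pool p;
        int i;

        /* all objects free initially: 0 -> 1 -> ... -> 7 -> FREE_END */
        for (i = 0; i < NUM_OBJS; i++)
                p.bufctl[i] = (i + 1 < NUM_OBJS) ? i + 1 : FREE_END;
        p.free = 0;

        /* "allocate" object 3 by unlinking it from the free list */
        p.bufctl[2] = p.bufctl[3];

        printf("obj 3 valid: %d\n", pool_validate(&p, p.mem + 3 * OBJ_SIZE));
        printf("obj 4 valid: %d\n", pool_validate(&p, p.mem + 4 * OBJ_SIZE));
        printf("misaligned : %d\n", pool_validate(&p, p.mem + 3 * OBJ_SIZE + 1));
        return 0;
}
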
-_
index 5c1f090..141b5d4 100644 (file)
@@ -1,16 +1,17 @@
  fs/dcache.c            |   20 ++
- fs/exec.c              |   18 +-
- fs/namei.c             |  338 ++++++++++++++++++++++++++++++++++++++++---------
+ fs/exec.c              |   19 +-
+ fs/namei.c             |  378 +++++++++++++++++++++++++++++++++++++++++--------
  fs/nfsd/vfs.c          |    2 
- fs/open.c              |  120 +++++++++++++++--
+ fs/open.c              |  120 +++++++++++++--
+ fs/proc/base.c         |    1 
  fs/stat.c              |    8 -
- include/linux/dcache.h |   28 ++++
- include/linux/fs.h     |   27 +++
+ include/linux/dcache.h |   31 ++++
+ include/linux/fs.h     |   28 +++
  kernel/ksyms.c         |    1 
- 9 files changed, 478 insertions(+), 84 deletions(-)
+ 10 files changed, 522 insertions(+), 85 deletions(-)
 
---- linux-2.4.18-18.8.0-l12/fs/dcache.c~vfs_intent-2.4.18-18   Wed Feb 26 16:54:17 2003
-+++ linux-2.4.18-18.8.0-l12-phil/fs/dcache.c   Wed Feb 26 17:31:36 2003
+--- linux-2.4.18-61chaos/fs/dcache.c~vfs_intent-2.4.18-18      Sun Jun  1 21:55:14 2003
++++ linux-2.4.18-61chaos-root/fs/dcache.c      Sun Jun  1 21:59:04 2003
 @@ -186,6 +186,13 @@ int d_invalidate(struct dentry * dentry)
                spin_unlock(&dcache_lock);
                return 0;
@@ -56,8 +57,8 @@
  }
  
  #define do_switch(x,y) do { \
---- linux-2.4.18-18.8.0-l12/fs/namei.c~vfs_intent-2.4.18-18    Wed Feb 26 16:54:17 2003
-+++ linux-2.4.18-18.8.0-l12-phil/fs/namei.c    Wed Feb 26 16:54:17 2003
+--- linux-2.4.18-61chaos/fs/namei.c~vfs_intent-2.4.18-18       Sun Jun  1 21:55:14 2003
++++ linux-2.4.18-61chaos-root/fs/namei.c       Sun Jun  1 23:14:49 2003
 @@ -94,6 +94,13 @@
   * XEmacs seems to be relying on it...
   */
  {
        int err;
        if (current->link_count >= max_recursive_link)
-@@ -348,10 +377,14 @@ static inline int do_follow_link(struct 
+@@ -348,10 +377,21 @@ static inline int do_follow_link(struct 
        current->link_count++;
        current->total_link_count++;
        UPDATE_ATIME(dentry->d_inode);
 -      err = dentry->d_inode->i_op->follow_link(dentry, nd);
++      nd->it = it;
 +      if (dentry->d_inode->i_op->follow_link2)
 +              err = dentry->d_inode->i_op->follow_link2(dentry, nd, it);
 +      else
 +              err = dentry->d_inode->i_op->follow_link(dentry, nd);
++      if (!err && it != NULL && !(it->it_int_flags & IT_FL_FOLLOWED)) {
++              /* vfs_follow_link was never called */
++              intent_release(dentry, it);
++              path_release(nd);
++              err = -ENOLINK;
++      }
        current->link_count--;
        return err;
  loop:
        path_release(nd);
        return -ELOOP;
  }
-@@ -381,15 +414,26 @@ int follow_up(struct vfsmount **mnt, str
+@@ -381,15 +421,26 @@ int follow_up(struct vfsmount **mnt, str
        return __follow_up(mnt, dentry);
  }
  
                dput(*dentry);
                mntput(mounted->mnt_parent);
                *dentry = dget(mounted->mnt_root);
-@@ -401,7 +445,7 @@ static inline int __follow_down(struct v
+@@ -401,7 +452,7 @@ static inline int __follow_down(struct v
  
  int follow_down(struct vfsmount **mnt, struct dentry **dentry)
  {
  }
   
  static inline void follow_dotdot(struct nameidata *nd)
-@@ -437,7 +481,7 @@ static inline void follow_dotdot(struct 
+@@ -437,7 +488,7 @@ static inline void follow_dotdot(struct 
                mntput(nd->mnt);
                nd->mnt = parent;
        }
                ;
  }
  
-@@ -449,7 +493,8 @@ static inline void follow_dotdot(struct 
+@@ -449,7 +500,8 @@ static inline void follow_dotdot(struct 
   *
   * We expect 'base' to be positive and a directory.
   */
  {
        struct dentry *dentry;
        struct inode *inode;
-@@ -526,18 +571,18 @@ int link_path_walk(const char * name, st
+@@ -526,18 +578,18 @@ int link_path_walk(const char * name, st
                                break;
                }
                /* This does the actual lookups.. */
                        ;
  
                err = -ENOENT;
-@@ -548,8 +593,8 @@ int link_path_walk(const char * name, st
+@@ -548,8 +600,8 @@ int link_path_walk(const char * name, st
                if (!inode->i_op)
                        goto out_dput;
  
                        dput(dentry);
                        if (err)
                                goto return_err;
-@@ -565,7 +610,7 @@ int link_path_walk(const char * name, st
+@@ -565,7 +617,7 @@ int link_path_walk(const char * name, st
                        nd->dentry = dentry;
                }
                err = -ENOTDIR; 
                        break;
                continue;
                /* here ends the main loop */
-@@ -592,22 +637,23 @@ last_component:
+@@ -592,22 +644,23 @@ last_component:
                        if (err < 0)
                                break;
                }
                        dput(dentry);
                        if (err)
                                goto return_err;
-@@ -621,7 +667,8 @@ last_component:
+@@ -621,7 +674,8 @@ last_component:
                        goto no_inode;
                if (lookup_flags & LOOKUP_DIRECTORY) {
                        err = -ENOTDIR; 
                                break;
                }
                goto return_base;
-@@ -658,15 +705,28 @@ out_dput:
+@@ -645,7 +699,24 @@ return_reval:
+                * Check the cached dentry for staleness.
+                */
+               dentry = nd->dentry;
+-              if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
++        revalidate_again:
++              if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) {
++                      err = -ESTALE;
++                      if (!dentry->d_op->d_revalidate2(dentry, 0, it)) {
++                                struct dentry *new;
++                                err = permission(dentry->d_parent->d_inode,
++                                                 MAY_EXEC);
++                                if (err)
++                                        break;
++                                new = real_lookup(dentry->d_parent,
++                                                  &dentry->d_name, 0, NULL);
++                              d_invalidate(dentry);
++                                dput(dentry);
++                                dentry = new;
++                                goto revalidate_again;
++                      }
++              } 
++                else if (dentry && dentry->d_op && dentry->d_op->d_revalidate){
+                       err = -ESTALE;
+                       if (!dentry->d_op->d_revalidate(dentry, 0)) {
+                               d_invalidate(dentry);
+@@ -658,15 +729,28 @@ out_dput:
                dput(dentry);
                break;
        }
  }
  
  /* SMP-safe */
-@@ -751,6 +811,17 @@ walk_init_root(const char *name, struct 
+@@ -751,6 +835,17 @@ walk_init_root(const char *name, struct 
  }
  
  /* SMP-safe */
  int path_lookup(const char *path, unsigned flags, struct nameidata *nd)
  {
        int error = 0;
-@@ -779,7 +850,8 @@ int path_init(const char *name, unsigned
+@@ -765,6 +860,7 @@ int path_init(const char *name, unsigned
+ {
+       nd->last_type = LAST_ROOT; /* if there are only slashes... */
+       nd->flags = flags;
++      nd->it = NULL;
+       if (*name=='/')
+               return walk_init_root(name,nd);
+       read_lock(&current->fs->lock);
+@@ -779,7 +875,8 @@ int path_init(const char *name, unsigned
   * needs parent already locked. Doesn't follow mounts.
   * SMP-safe.
   */
  {
        struct dentry * dentry;
        struct inode *inode;
-@@ -802,13 +874,16 @@ struct dentry * lookup_hash(struct qstr 
+@@ -802,13 +899,16 @@ struct dentry * lookup_hash(struct qstr 
                        goto out;
        }
  
                dentry = inode->i_op->lookup(inode, new);
                unlock_kernel();
                if (!dentry)
-@@ -820,6 +895,12 @@ out:
+@@ -820,6 +920,12 @@ out:
        return dentry;
  }
  
  /* SMP-safe */
  struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
  {
-@@ -841,7 +922,7 @@ struct dentry * lookup_one_len(const cha
+@@ -841,7 +947,7 @@ struct dentry * lookup_one_len(const cha
        }
        this.hash = end_name_hash(hash);
  
  access:
        return ERR_PTR(-EACCES);
  }
-@@ -872,6 +953,23 @@ int __user_walk(const char *name, unsign
+@@ -872,6 +978,23 @@ int __user_walk(const char *name, unsign
        return err;
  }
  
  /*
   * It's inline, so penalty for filesystems that don't use sticky bit is
   * minimal.
-@@ -1045,14 +1143,17 @@ int may_open(struct nameidata *nd, int a
+@@ -1045,14 +1168,17 @@ int may_open(struct nameidata *nd, int a
          return get_lease(inode, flag);
  }
  
        struct nameidata nd;
 +      struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = open_flags };
        int count = 0;
-       if ((flag+1) & O_ACCMODE)
-@@ -1066,7 +1167,7 @@ struct file *filp_open(const char * path
+       
+       if (!capable(CAP_SYS_ADMIN))
+@@ -1069,7 +1195,7 @@ struct file *filp_open(const char * path
         * The simplest case - just a plain lookup.
         */
        if (!(flag & O_CREAT)) {
                if (error)
                        return ERR_PTR(error);
                dentry = nd.dentry;
-@@ -1076,6 +1177,8 @@ struct file *filp_open(const char * path
+@@ -1079,6 +1205,8 @@ struct file *filp_open(const char * path
        /*
         * Create - we need to know the parent.
         */
        error = path_lookup(pathname, LOOKUP_PARENT, &nd);
        if (error)
                return ERR_PTR(error);
-@@ -1091,7 +1194,7 @@ struct file *filp_open(const char * path
+@@ -1094,7 +1222,7 @@ struct file *filp_open(const char * path
  
        dir = nd.dentry;
        down(&dir->d_inode->i_sem);
  
  do_last:
        error = PTR_ERR(dentry);
-@@ -1100,6 +1203,7 @@ do_last:
+@@ -1103,6 +1231,7 @@ do_last:
                goto exit;
        }
  
        /* Negative dentry, just create the file */
        if (!dentry->d_inode) {
                error = vfs_create(dir->d_inode, dentry,
-@@ -1129,12 +1233,13 @@ do_last:
+@@ -1132,12 +1261,13 @@ do_last:
                error = -ELOOP;
                if (flag & O_NOFOLLOW)
                        goto exit_dput;
                goto do_link;
  
        dput(nd.dentry);
-@@ -1149,11 +1254,13 @@ ok:
+@@ -1152,11 +1282,13 @@ ok:
        if (!S_ISREG(nd.dentry->d_inode->i_mode))
                open_flags &= ~O_TRUNC;
  
        path_release(&nd);
        return ERR_PTR(error);
  
-@@ -1172,10 +1279,15 @@ do_link:
+@@ -1175,10 +1307,22 @@ do_link:
         * are done. Procfs-like symlinks just set LAST_BIND.
         */
        UPDATE_ATIME(dentry->d_inode);
 -      error = dentry->d_inode->i_op->follow_link(dentry, &nd);
++      nd.it = &it;
 +      if (dentry->d_inode->i_op->follow_link2)
 +              error = dentry->d_inode->i_op->follow_link2(dentry, &nd, &it);
 +      else
 +              error = dentry->d_inode->i_op->follow_link(dentry, &nd);
-+      if (error)
++      if (error) {
++              intent_release(dentry, &it);
++      } else if (!(it.it_int_flags & IT_FL_FOLLOWED)) {
++              /* vfs_follow_link was never called */
 +              intent_release(dentry, &it);
++              path_release(&nd);
++              error = -ENOLINK;
++      }
        dput(dentry);
        if (error)
 -              return error;
        if (nd.last_type == LAST_BIND) {
                dentry = nd.dentry;
                goto ok;
-@@ -1194,13 +1306,15 @@ do_link:
+@@ -1197,13 +1341,15 @@ do_link:
        }
        dir = nd.dentry;
        down(&dir->d_inode->i_sem);
  {
        struct dentry *dentry;
  
-@@ -1208,7 +1322,7 @@ static struct dentry *lookup_create(stru
+@@ -1211,7 +1357,7 @@ static struct dentry *lookup_create(stru
        dentry = ERR_PTR(-EEXIST);
        if (nd->last_type != LAST_NORM)
                goto fail;
        if (IS_ERR(dentry))
                goto fail;
        if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
-@@ -1264,7 +1378,19 @@ asmlinkage long sys_mknod(const char * f
+@@ -1267,7 +1413,19 @@ asmlinkage long sys_mknod(const char * f
        error = path_lookup(tmp, LOOKUP_PARENT, &nd);
        if (error)
                goto out;
        error = PTR_ERR(dentry);
  
        mode &= ~current->fs->umask;
-@@ -1285,6 +1411,7 @@ asmlinkage long sys_mknod(const char * f
+@@ -1288,6 +1446,7 @@ asmlinkage long sys_mknod(const char * f
                dput(dentry);
        }
        up(&nd.dentry->d_inode->i_sem);
        path_release(&nd);
  out:
        putname(tmp);
-@@ -1332,7 +1459,17 @@ asmlinkage long sys_mkdir(const char * p
+@@ -1335,7 +1494,17 @@ asmlinkage long sys_mkdir(const char * p
                error = path_lookup(tmp, LOOKUP_PARENT, &nd);
                if (error)
                        goto out;
                error = PTR_ERR(dentry);
                if (!IS_ERR(dentry)) {
                        error = vfs_mkdir(nd.dentry->d_inode, dentry,
-@@ -1340,6 +1477,7 @@ asmlinkage long sys_mkdir(const char * p
+@@ -1343,6 +1512,7 @@ asmlinkage long sys_mkdir(const char * p
                        dput(dentry);
                }
                up(&nd.dentry->d_inode->i_sem);
                path_release(&nd);
  out:
                putname(tmp);
-@@ -1440,8 +1578,33 @@ asmlinkage long sys_rmdir(const char * p
+@@ -1443,8 +1613,33 @@ asmlinkage long sys_rmdir(const char * p
                        error = -EBUSY;
                        goto exit1;
        }
        error = PTR_ERR(dentry);
        if (!IS_ERR(dentry)) {
                error = vfs_rmdir(nd.dentry->d_inode, dentry);
-@@ -1499,8 +1662,17 @@ asmlinkage long sys_unlink(const char * 
+@@ -1502,8 +1697,17 @@ asmlinkage long sys_unlink(const char * 
        error = -EISDIR;
        if (nd.last_type != LAST_NORM)
                goto exit1;
        error = PTR_ERR(dentry);
        if (!IS_ERR(dentry)) {
                /* Why not before? Because we want correct error value */
-@@ -1567,15 +1739,26 @@ asmlinkage long sys_symlink(const char *
+@@ -1570,15 +1774,26 @@ asmlinkage long sys_symlink(const char *
                error = path_lookup(to, LOOKUP_PARENT, &nd);
                if (error)
                        goto out;
                putname(to);
        }
        putname(from);
-@@ -1642,7 +1825,7 @@ asmlinkage long sys_link(const char * ol
+@@ -1645,7 +1860,7 @@ asmlinkage long sys_link(const char * ol
                struct dentry *new_dentry;
                struct nameidata nd, old_nd;
  
                if (error)
                        goto exit;
                error = path_lookup(to, LOOKUP_PARENT, &nd);
-@@ -1651,7 +1834,17 @@ asmlinkage long sys_link(const char * ol
+@@ -1654,7 +1869,17 @@ asmlinkage long sys_link(const char * ol
                error = -EXDEV;
                if (old_nd.mnt != nd.mnt)
                        goto out_release;
                error = PTR_ERR(new_dentry);
                if (!IS_ERR(new_dentry)) {
                        error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
-@@ -1695,7 +1888,8 @@ exit:
+@@ -1698,7 +1923,8 @@ exit:
   *       locking].
   */
  int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
  {
        int error;
        struct inode *target;
-@@ -1753,6 +1947,7 @@ int vfs_rename_dir(struct inode *old_dir
+@@ -1756,6 +1982,7 @@ int vfs_rename_dir(struct inode *old_dir
                error = -EBUSY;
        else 
                error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
        if (target) {
                if (!error)
                        target->i_flags |= S_DEAD;
-@@ -1774,7 +1969,8 @@ out_unlock:
+@@ -1777,7 +2004,8 @@ out_unlock:
  }
  
  int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
  {
        int error;
  
-@@ -1805,6 +2001,7 @@ int vfs_rename_other(struct inode *old_d
+@@ -1808,6 +2036,7 @@ int vfs_rename_other(struct inode *old_d
                error = -EBUSY;
        else
                error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
        double_up(&old_dir->i_zombie, &new_dir->i_zombie);
        if (error)
                return error;
-@@ -1816,13 +2013,14 @@ int vfs_rename_other(struct inode *old_d
+@@ -1819,13 +2048,14 @@ int vfs_rename_other(struct inode *old_d
  }
  
  int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (!error) {
                if (old_dir == new_dir)
                        inode_dir_notify(old_dir, DN_RENAME);
-@@ -1864,7 +2062,7 @@ static inline int do_rename(const char *
+@@ -1867,7 +2097,7 @@ static inline int do_rename(const char *
  
        double_lock(new_dir, old_dir);
  
        error = PTR_ERR(old_dentry);
        if (IS_ERR(old_dentry))
                goto exit3;
-@@ -1880,16 +2078,37 @@ static inline int do_rename(const char *
+@@ -1883,16 +2113,37 @@ static inline int do_rename(const char *
                if (newnd.last.name[newnd.last.len])
                        goto exit4;
        }
        dput(new_dentry);
  exit4:
        dput(old_dentry);
-@@ -1940,7 +2159,8 @@ out:
+@@ -1943,12 +2194,19 @@ out:
  }
  
  static inline int
  {
        int res = 0;
        char *name;
-@@ -1953,7 +2173,7 @@ __vfs_follow_link(struct nameidata *nd, 
+       if (IS_ERR(link))
+               goto fail;
++      if (it == NULL)
++              it = nd->it;
++      else if (it != nd->it)
++              printk("it != nd->it: tell phil@clusterfs.com\n");
++      if (it != NULL)
++              it->it_int_flags |= IT_FL_FOLLOWED;
+       if (*link == '/') {
+               path_release(nd);
+@@ -1956,7 +2214,7 @@ __vfs_follow_link(struct nameidata *nd, 
                        /* weird __emul_prefix() stuff did it */
                        goto out;
        }
  out:
        if (current->link_count || res || nd->last_type!=LAST_NORM)
                return res;
-@@ -1975,7 +2195,13 @@ fail:
+@@ -1978,7 +2236,13 @@ fail:
  
  int vfs_follow_link(struct nameidata *nd, const char *link)
  {
  }
  
  /* get the link contents into pagecache */
-@@ -2017,7 +2243,7 @@ int page_follow_link(struct dentry *dent
+@@ -2020,7 +2284,7 @@ int page_follow_link(struct dentry *dent
  {
        struct page *page = NULL;
        char *s = page_getlink(dentry, &page);
        if (page) {
                kunmap(page);
                page_cache_release(page);
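
The new version of the patch above threads the intent through struct nameidata (nd->it) and adds an IT_FL_FOLLOWED handshake: __vfs_follow_link() marks the intent as followed, and after calling a filesystem's follow_link/follow_link2 hook the callers in do_follow_link() and filp_open() treat a missing mark as "the generic helper was never used", release the intent and the path, and fail with -ENOLINK. A compact sketch of that handshake follows, with illustrative stand-in functions rather than the kernel ones.

#include <errno.h>
#include <stdio.h>

#define IT_FL_FOLLOWED (1 << 1)         /* value from the patch */

struct lookup_intent { int it_op; int it_int_flags; };

/* stand-in for vfs_follow_link(): the only legitimate way to finish a link */
static int generic_follow(struct lookup_intent *it)
{
        if (it)
                it->it_int_flags |= IT_FL_FOLLOWED;
        return 0;
}

/* a well-behaved ->follow_link() that goes through the generic helper */
static int good_follow_link(struct lookup_intent *it)
{
        return generic_follow(it);
}

/* a hook that resolves the link some other way and skips the helper */
static int odd_follow_link(struct lookup_intent *it)
{
        (void)it;
        return 0;
}

static int do_follow(int (*hook)(struct lookup_intent *),
                     struct lookup_intent *it)
{
        int err = hook(it);

        if (!err && it && !(it->it_int_flags & IT_FL_FOLLOWED)) {
                /* generic helper was never called: drop the intent, fail */
                printf("intent released, returning -ENOLINK\n");
                return -ENOLINK;
        }
        return err;
}

int main(void)
{
        struct lookup_intent a = { 0, 0 }, b = { 0, 0 };

        printf("good hook -> %d\n", do_follow(good_follow_link, &a));
        printf("odd hook  -> %d\n", do_follow(odd_follow_link, &b));
        return 0;
}
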
---- linux-2.4.18-18.8.0-l12/fs/nfsd/vfs.c~vfs_intent-2.4.18-18 Wed Feb 26 16:54:17 2003
-+++ linux-2.4.18-18.8.0-l12-phil/fs/nfsd/vfs.c Wed Feb 26 16:54:17 2003
+--- linux-2.4.18-61chaos/fs/nfsd/vfs.c~vfs_intent-2.4.18-18    Sun Jun  1 21:55:14 2003
++++ linux-2.4.18-61chaos-root/fs/nfsd/vfs.c    Sun Jun  1 21:59:04 2003
 @@ -1298,7 +1298,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru
                        err = nfserr_perm;
        } else
        unlock_kernel();
        if (!err && EX_ISSYNC(tfhp->fh_export)) {
                nfsd_sync_dir(tdentry);
---- linux-2.4.18-18.8.0-l12/fs/open.c~vfs_intent-2.4.18-18     Wed Feb 26 16:54:17 2003
-+++ linux-2.4.18-18.8.0-l12-phil/fs/open.c     Wed Feb 26 16:54:17 2003
+--- linux-2.4.18-61chaos/fs/open.c~vfs_intent-2.4.18-18        Sun Jun  1 21:55:14 2003
++++ linux-2.4.18-61chaos-root/fs/open.c        Sun Jun  1 21:59:04 2003
 @@ -19,6 +19,8 @@
  #include <asm/uaccess.h>
  
  /*
   * Find an empty file descriptor entry, and mark it busy.
   */
---- linux-2.4.18-18.8.0-l12/fs/stat.c~vfs_intent-2.4.18-18     Wed Feb 26 16:54:17 2003
-+++ linux-2.4.18-18.8.0-l12-phil/fs/stat.c     Wed Feb 26 16:54:17 2003
+--- linux-2.4.18-61chaos/fs/stat.c~vfs_intent-2.4.18-18        Sun Jun  1 21:55:14 2003
++++ linux-2.4.18-61chaos-root/fs/stat.c        Sun Jun  1 21:59:04 2003
 @@ -104,10 +104,12 @@ int vfs_stat(char *name, struct kstat *s
  {
        struct nameidata nd;
                path_release(&nd);
        }
        return error;
---- linux-2.4.18-18.8.0-l12/fs/exec.c~vfs_intent-2.4.18-18     Wed Feb 26 16:54:17 2003
-+++ linux-2.4.18-18.8.0-l12-phil/fs/exec.c     Wed Feb 26 16:54:17 2003
-@@ -103,13 +103,18 @@ static inline void put_binfmt(struct lin
+--- linux-2.4.18-61chaos/fs/exec.c~vfs_intent-2.4.18-18        Sun Jun  1 21:55:14 2003
++++ linux-2.4.18-61chaos-root/fs/exec.c        Sun Jun  1 21:59:04 2003
+@@ -112,13 +112,18 @@ static inline void put_binfmt(struct lin
   *
   * Also note that we take the address to load from from the file itself.
   */
        if (error)
                goto out;
  
-@@ -121,7 +126,8 @@ asmlinkage long sys_uselib(const char * 
+@@ -130,7 +135,8 @@ asmlinkage long sys_uselib(const char * 
        if (error)
                goto exit;
  
        error = PTR_ERR(file);
        if (IS_ERR(file))
                goto out;
-@@ -350,8 +356,9 @@ struct file *open_exec(const char *name)
+@@ -359,8 +365,9 @@ struct file *open_exec(const char *name)
        struct inode *inode;
        struct file *file;
        int err = 0;
        file = ERR_PTR(err);
        if (!err) {
                inode = nd.dentry->d_inode;
-@@ -363,7 +370,8 @@ struct file *open_exec(const char *name)
+@@ -372,8 +379,9 @@ struct file *open_exec(const char *name)
                                err = -EACCES;
                        file = ERR_PTR(err);
                        if (!err) {
 -                              file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
 +                              file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
-+                              intent_release(nd.dentry, &it);
                                if (!IS_ERR(file)) {
++                                        intent_release(nd.dentry, &it);
                                        err = deny_write_access(file);
                                        if (err) {
-@@ -976,7 +984,7 @@ int do_coredump(long signr, struct pt_re
+                                               fput(file);
+@@ -384,6 +392,7 @@ out:
+                               return file;
+                       }
+               }
++                intent_release(nd.dentry, &it);
+               path_release(&nd);
+       }
+       goto out;
+@@ -1104,7 +1113,7 @@ int do_coredump(long signr, struct pt_re
                goto close_fail;
        if (!file->f_op->write)
                goto close_fail;
                goto close_fail;
  
        retval = binfmt->core_dump(signr, regs, file);
---- linux-2.4.18-18.8.0-l12/include/linux/dcache.h~vfs_intent-2.4.18-18        Wed Feb 26 16:54:17 2003
-+++ linux-2.4.18-18.8.0-l12-phil/include/linux/dcache.h        Wed Feb 26 17:01:30 2003
-@@ -6,6 +6,25 @@
+--- linux-2.4.18-61chaos/include/linux/dcache.h~vfs_intent-2.4.18-18   Sun Jun  1 21:55:14 2003
++++ linux-2.4.18-61chaos-root/include/linux/dcache.h   Sun Jun  1 22:02:31 2003
+@@ -6,6 +6,28 @@
  #include <asm/atomic.h>
  #include <linux/mount.h>
  
 +#define IT_LOOKUP   (1<<4)
 +#define IT_UNLINK   (1<<5)
 +
++#define IT_FL_LOCKED   (1)
++#define IT_FL_FOLLOWED (1<<1) /* set by vfs_follow_link */
++
 +struct lookup_intent {
 +      int it_op;
 +      int it_mode;
 +      int it_flags;
 +      int it_disposition;
 +      int it_status;
-+      struct iattr *it_iattr;
++      int it_int_flags;
 +      __u64 it_lock_handle[2];
 +      int it_lock_mode;
 +      void *it_data;
  /*
   * linux/include/linux/dcache.h
   *
-@@ -78,6 +97,7 @@ struct dentry {
+@@ -78,6 +100,7 @@ struct dentry {
        unsigned long d_time;           /* used by d_revalidate */
        struct dentry_operations  *d_op;
        struct super_block * d_sb;      /* The root of the dentry tree */
        unsigned long d_vfs_flags;
        void * d_fsdata;                /* fs-specific data */
        void * d_extra_attributes;      /* TUX-specific data */
-@@ -91,8 +111,15 @@ struct dentry_operations {
+@@ -91,8 +114,15 @@ struct dentry_operations {
        int (*d_delete)(struct dentry *);
        void (*d_release)(struct dentry *);
        void (*d_iput)(struct dentry *, struct inode *);
  /* the dentry parameter passed to d_hash and d_compare is the parent
   * directory of the entries to be compared. It is used in case these
   * functions need any directory specific information for determining
-@@ -124,6 +151,7 @@ d_iput:            no              no              yes
+@@ -124,6 +154,7 @@ d_iput:            no              no              yes
                                         * s_nfsd_free_path semaphore will be down
                                         */
  #define DCACHE_REFERENCED     0x0008  /* Recently used, don't discard. */
  
  extern spinlock_t dcache_lock;
  
---- linux-2.4.18-18.8.0-l12/include/linux/fs.h~vfs_intent-2.4.18-18    Wed Feb 26 16:54:17 2003
-+++ linux-2.4.18-18.8.0-l12-phil/include/linux/fs.h    Wed Feb 26 17:31:42 2003
-@@ -338,6 +338,8 @@ extern void set_bh_page(struct buffer_he
+--- linux-2.4.18-61chaos/include/linux/fs.h~vfs_intent-2.4.18-18       Sun Jun  1 21:59:03 2003
++++ linux-2.4.18-61chaos-root/include/linux/fs.h       Sun Jun  1 22:01:46 2003
+@@ -339,6 +339,8 @@ extern void set_bh_page(struct buffer_he
  #define ATTR_MTIME_SET        256
  #define ATTR_FORCE    512     /* Not a change, but a change it */
  #define ATTR_ATTR_FLAG        1024
  
  /*
   * This is the Inode Attributes structure, used for notify_change().  It
-@@ -576,6 +578,7 @@ struct file {
+@@ -578,6 +580,7 @@ struct file {
  
        /* needed for tty driver, and maybe others */
        void                    *private_data;
  
        /* preallocated helper kiobuf to speedup O_DIRECT */
        struct kiobuf           *f_iobuf;
-@@ -836,7 +839,9 @@ extern int vfs_symlink(struct inode *, s
+@@ -707,6 +710,7 @@ struct nameidata {
+       struct qstr last;
+       unsigned int flags;
+       int last_type;
++      struct lookup_intent *it;
+ };
+ #define DQUOT_USR_ENABLED     0x01            /* User diskquotas enabled */
+@@ -840,7 +844,9 @@ extern int vfs_symlink(struct inode *, s
  extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
  extern int vfs_rmdir(struct inode *, struct dentry *);
  extern int vfs_unlink(struct inode *, struct dentry *);
  
  /*
   * File types
-@@ -897,20 +902,33 @@ struct file_operations {
+@@ -901,20 +907,33 @@ struct file_operations {
  struct inode_operations {
        int (*create) (struct inode *,struct dentry *,int);
        struct dentry * (*lookup) (struct inode *,struct dentry *);
        int (*getattr) (struct dentry *, struct iattr *);
  };
  
-@@ -1112,7 +1130,7 @@ static inline int get_lease(struct inode
+@@ -1119,7 +1138,7 @@ static inline int get_lease(struct inode
  
  asmlinkage long sys_open(const char *, int, int);
  asmlinkage long sys_close(unsigned int);      /* yes, it's really unsigned */
  
  extern struct file *filp_open(const char *, int, int);
  extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
-@@ -1381,6 +1399,7 @@ typedef int (*read_actor_t)(read_descrip
+@@ -1388,6 +1407,7 @@ typedef int (*read_actor_t)(read_descrip
  extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
  
  extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *));
  extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *));
  extern int FASTCALL(path_walk(const char *, struct nameidata *));
  extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *));
-@@ -1392,6 +1411,8 @@ extern struct dentry * lookup_one_len(co
+@@ -1399,6 +1419,8 @@ extern struct dentry * lookup_one_len(co
  extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
  #define user_path_walk(name,nd)        __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd)
  #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd)
  
  extern void inode_init_once(struct inode *);
  extern void iput(struct inode *);
-@@ -1492,6 +1513,8 @@ extern struct file_operations generic_ro
+@@ -1499,6 +1521,8 @@ extern struct file_operations generic_ro
  
  extern int vfs_readlink(struct dentry *, char *, int, const char *);
  extern int vfs_follow_link(struct nameidata *, const char *);
  extern int page_readlink(struct dentry *, char *, int);
  extern int page_follow_link(struct dentry *, struct nameidata *);
  extern struct inode_operations page_symlink_inode_operations;
---- linux-2.4.18-18.8.0-l12/kernel/ksyms.c~vfs_intent-2.4.18-18        Wed Feb 26 16:54:17 2003
-+++ linux-2.4.18-18.8.0-l12-phil/kernel/ksyms.c        Wed Feb 26 16:54:17 2003
-@@ -293,6 +293,7 @@ EXPORT_SYMBOL(read_cache_page);
+--- linux-2.4.18-61chaos/kernel/ksyms.c~vfs_intent-2.4.18-18   Sun Jun  1 21:59:03 2003
++++ linux-2.4.18-61chaos-root/kernel/ksyms.c   Sun Jun  1 21:59:04 2003
+@@ -294,6 +294,7 @@ EXPORT_SYMBOL(read_cache_page);
  EXPORT_SYMBOL(set_page_dirty);
  EXPORT_SYMBOL(vfs_readlink);
  EXPORT_SYMBOL(vfs_follow_link);
  EXPORT_SYMBOL(page_readlink);
  EXPORT_SYMBOL(page_follow_link);
  EXPORT_SYMBOL(page_symlink_inode_operations);
+
+_
+--- linux/fs/proc/base.c.old   Sat Jun  7 00:55:09 2003
++++ linux/fs/proc/base.c       Sat Jun  7 00:55:33 2003
+@@ -465,6 +465,9 @@
+       error = inode->u.proc_i.op.proc_get_link(inode, &nd->dentry, &nd->mnt);
+       nd->last_type = LAST_BIND;
++
++        if (nd->it != NULL)
++                nd->it->it_int_flags |= IT_FL_FOLLOWED;
+ out:
+       return error;
+ }
@@ -1,22 +1,14 @@
- fs/dcache.c            |    8 +
- fs/namei.c             |  287 ++++++++++++++++++++++++++++++++++++++++---------
- fs/nfsd/vfs.c          |    2 
- fs/open.c              |   53 +++++++--
- fs/stat.c              |    9 +
- include/linux/dcache.h |   25 ++++
- include/linux/fs.h     |   22 +++
- kernel/ksyms.c         |    1 
- 8 files changed, 344 insertions(+), 63 deletions(-)
+ 0 files changed
 
---- linux-2.4.18-18.8.0-l7/fs/dcache.c~vfs_intent-2.4.18-18    Mon Jan 20 08:28:00 2003
-+++ linux-2.4.18-18.8.0-l7-root/fs/dcache.c    Mon Jan 20 08:54:54 2003
-@@ -186,6 +188,13 @@ int d_invalidate(struct dentry * dentry)
+--- linux-2.4.20-rh/fs/dcache.c~vfs_intent-2.4.20-rh   2003-04-11 14:04:58.000000000 +0800
++++ linux-2.4.20-rh-root/fs/dcache.c   2003-06-09 23:18:07.000000000 +0800
+@@ -186,6 +186,13 @@ int d_invalidate(struct dentry * dentry)
                spin_unlock(&dcache_lock);
                return 0;
        }
 +
 +      /* network invalidation by Lustre */
-+      if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { 
++      if (dentry->d_flags & DCACHE_LUSTRE_INVALID) {
 +              spin_unlock(&dcache_lock);
 +              return 0;
 +      }
        /*
         * Check whether to do a partial shrink_dcache
         * to get rid of unused child entries.
-@@ -645,6 +654,7 @@ struct dentry * d_alloc(struct dentry * 
+@@ -624,6 +631,7 @@ struct dentry * d_alloc(struct dentry * 
        dentry->d_fsdata = NULL;
        dentry->d_extra_attributes = NULL;
        dentry->d_mounted = 0;
-+      dentry->d_it = NULL;
++      dentry->d_it = NULL;
+       dentry->d_cookie = NULL;
        INIT_LIST_HEAD(&dentry->d_hash);
        INIT_LIST_HEAD(&dentry->d_lru);
-       INIT_LIST_HEAD(&dentry->d_subdirs);
---- linux-2.4.18-18.8.0-l7/fs/namei.c~vfs_intent-2.4.18-18     Mon Jan 20 12:25:10 2003
-+++ linux-2.4.18-18.8.0-l7-root/fs/namei.c     Wed Jan 22 22:53:28 2003
-@@ -94,6 +97,13 @@
+@@ -839,13 +847,19 @@ void d_delete(struct dentry * dentry)
+  * Adds a dentry to the hash according to its name.
+  */
+  
+-void d_rehash(struct dentry * entry)
++void __d_rehash(struct dentry * entry, int lock)
+ {
+       struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash);
+       if (!list_empty(&entry->d_hash)) BUG();
+-      spin_lock(&dcache_lock);
++      if (lock) spin_lock(&dcache_lock);
+       list_add(&entry->d_hash, list);
+-      spin_unlock(&dcache_lock);
++      if (lock) spin_unlock(&dcache_lock);
++}
++EXPORT_SYMBOL(__d_rehash);
++
++void d_rehash(struct dentry * entry)
++{
++      __d_rehash(entry, 1);
+ }
+ #define do_switch(x,y) do { \
+--- linux-2.4.20-rh/fs/namei.c~vfs_intent-2.4.20-rh    2003-04-11 14:04:57.000000000 +0800
++++ linux-2.4.20-rh-root/fs/namei.c    2003-06-09 23:18:07.000000000 +0800
+@@ -94,6 +94,13 @@
   * XEmacs seems to be relying on it...
   */
  
@@ -48,7 +63,7 @@
  /* In order to reduce some races, while at the same time doing additional
   * checking and hopefully speeding things up, we copy filenames to the
   * kernel data space before using them..
-@@ -260,10 +271,19 @@ void path_release(struct nameidata *nd)
+@@ -260,10 +267,19 @@ void path_release(struct nameidata *nd)
   * Internal lookup() using the new generic dcache.
   * SMP-safe
   */
@@ -69,7 +84,7 @@
        if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
                if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) {
                        dput(dentry);
-@@ -281,7 +301,8 @@ static struct dentry * cached_lookup(str
+@@ -281,11 +297,14 @@ static struct dentry * cached_lookup(str
   * make sure that nobody added the entry to the dcache in the meantime..
   * SMP-safe
   */
  {
        struct dentry * result;
        struct inode *dir = parent->d_inode;
-@@ -300,6 +321,9 @@ static struct dentry * real_lookup(struc
++again:
++
+       down(&dir->i_sem);
+       /*
+        * First re-do the cached lookup just in case it was created
+@@ -300,6 +319,9 @@ static struct dentry * real_lookup(struc
                result = ERR_PTR(-ENOMEM);
                if (dentry) {
                        lock_kernel();
                        result = dir->i_op->lookup(dir, dentry);
                        unlock_kernel();
                        if (result)
-@@ -321,6 +345,12 @@ static struct dentry * real_lookup(struc
+@@ -321,6 +343,12 @@ static struct dentry * real_lookup(struc
                        dput(result);
                        result = ERR_PTR(-ENOENT);
                }
 +              if (!result->d_op->d_revalidate2(result, flags, it) &&
 +                  !d_invalidate(result)) {
 +                      dput(result);
-+                      result = ERR_PTR(-ENOENT);
++                      goto again;
 +              }
        }
        return result;
  }
-@@ -334,7 +364,8 @@ int max_recursive_link = 5;
+@@ -334,7 +362,8 @@ int max_recursive_link = 5;
   * Without that kind of total limit, nasty chains of consecutive
   * symlinks can cause almost arbitrarily long lookups. 
   */
 -static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd)
-+static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, 
++static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd,
 +                               struct lookup_intent *it)
  {
        int err;
        if (current->link_count >= max_recursive_link)
-@@ -348,10 +379,14 @@ static inline int do_follow_link(struct 
+@@ -348,10 +377,21 @@ static inline int do_follow_link(struct 
        current->link_count++;
        current->total_link_count++;
        UPDATE_ATIME(dentry->d_inode);
 -      err = dentry->d_inode->i_op->follow_link(dentry, nd);
++      nd->it = it;
 +      if (dentry->d_inode->i_op->follow_link2)
 +              err = dentry->d_inode->i_op->follow_link2(dentry, nd, it);
-+      else 
++      else
 +              err = dentry->d_inode->i_op->follow_link(dentry, nd);
++      if (!err && it != NULL && !(it->it_int_flags & IT_FL_FOLLOWED)) {
++              /* vfs_follow_link was never called */
++              intent_release(dentry, it);
++              path_release(nd);
++              err = -ENOLINK;
++      }
        current->link_count--;
        return err;
  loop:
        path_release(nd);
        return -ELOOP;
  }
-@@ -449,7 +484,8 @@ static inline void follow_dotdot(struct 
+@@ -381,15 +421,26 @@ int follow_up(struct vfsmount **mnt, str
+       return __follow_up(mnt, dentry);
+ }
+-static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry)
++static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry,
++                              struct lookup_intent *it)
+ {
+       struct vfsmount *mounted;
+       spin_lock(&dcache_lock);
+       mounted = lookup_mnt(*mnt, *dentry);
+       if (mounted) {
++              int opc = 0, mode = 0;
+               *mnt = mntget(mounted);
+               spin_unlock(&dcache_lock);
++              if (it) {
++                      opc = it->it_op;
++                      mode = it->it_mode;
++              }
++              intent_release(*dentry, it);
++              if (it) {
++                      it->it_op = opc;
++                      it->it_mode = mode;
++              }
+               dput(*dentry);
+               mntput(mounted->mnt_parent);
+               *dentry = dget(mounted->mnt_root);
+@@ -401,7 +452,7 @@ static inline int __follow_down(struct v
+ int follow_down(struct vfsmount **mnt, struct dentry **dentry)
+ {
+-      return __follow_down(mnt,dentry);
++      return __follow_down(mnt,dentry,NULL);
+ }
+  
+ static inline void follow_dotdot(struct nameidata *nd)
+@@ -437,7 +488,7 @@ static inline void follow_dotdot(struct 
+               mntput(nd->mnt);
+               nd->mnt = parent;
+       }
+-      while (d_mountpoint(nd->dentry) && __follow_down(&nd->mnt, &nd->dentry))
++      while (d_mountpoint(nd->dentry) && __follow_down(&nd->mnt, &nd->dentry, NULL))
+               ;
+ }
+@@ -449,7 +500,8 @@ static inline void follow_dotdot(struct 
   *
   * We expect 'base' to be positive and a directory.
   */
  {
        struct dentry *dentry;
        struct inode *inode;
-@@ -526,12 +562,12 @@ int link_path_walk(const char * name, st
+@@ -526,18 +578,18 @@ int link_path_walk(const char * name, st
                                break;
                }
                /* This does the actual lookups.. */
 -              dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE);
-+              dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL);
++              dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL);
                if (!dentry) {
                        err = -EWOULDBLOCKIO;
                        if (atomic)
                                break;
 -                      dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE);
-+                      dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL);
++                      dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL);
                        err = PTR_ERR(dentry);
                        if (IS_ERR(dentry))
                                break;
-@@ -548,8 +584,8 @@ int link_path_walk(const char * name, st
+               }
+               /* Check mountpoints.. */
+-              while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry))
++              while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry, NULL))
+                       ;
+               err = -ENOENT;
+@@ -548,8 +600,8 @@ int link_path_walk(const char * name, st
                if (!inode->i_op)
                        goto out_dput;
  
                        dput(dentry);
                        if (err)
                                goto return_err;
-@@ -565,7 +601,7 @@ int link_path_walk(const char * name, st
+@@ -565,7 +617,7 @@ int link_path_walk(const char * name, st
                        nd->dentry = dentry;
                }
                err = -ENOTDIR; 
                        break;
                continue;
                /* here ends the main loop */
-@@ -592,12 +628,12 @@ last_component:
+@@ -592,22 +644,23 @@ last_component:
                        if (err < 0)
                                break;
                }
 -              dentry = cached_lookup(nd->dentry, &this, 0);
-+              dentry = cached_lookup(nd->dentry, &this, 0, it);
++              dentry = cached_lookup(nd->dentry, &this, 0, it);
                if (!dentry) {
                        err = -EWOULDBLOCKIO;
                        if (atomic)
                                break;
 -                      dentry = real_lookup(nd->dentry, &this, 0);
-+                      dentry = real_lookup(nd->dentry, &this, 0, it);
++                      dentry = real_lookup(nd->dentry, &this, 0, it);
                        err = PTR_ERR(dentry);
                        if (IS_ERR(dentry))
                                break;
-@@ -606,8 +642,9 @@ last_component:
+               }
+-              while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry))
++              while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry, it))
                        ;
                inode = dentry->d_inode;
                if ((lookup_flags & LOOKUP_FOLLOW)
 -                  && inode && inode->i_op && inode->i_op->follow_link) {
 -                      err = do_follow_link(dentry, nd);
-+                  && inode && inode->i_op && 
-+                  (inode->i_op->follow_link || inode->i_op->follow_link2)) {
-+                      err = do_follow_link(dentry, nd, it);
++                  && inode && inode->i_op &&
++                  (inode->i_op->follow_link || inode->i_op->follow_link2)) {
++                      err = do_follow_link(dentry, nd, it);
                        dput(dentry);
                        if (err)
                                goto return_err;
-@@ -621,7 +659,8 @@ last_component:
+@@ -621,7 +674,8 @@ last_component:
                        goto no_inode;
                if (lookup_flags & LOOKUP_DIRECTORY) {
                        err = -ENOTDIR; 
 -                      if (!inode->i_op || !inode->i_op->lookup)
-+                      if (!inode->i_op || (!inode->i_op->lookup &&
-+                                           !inode->i_op->lookup2))
++                      if (!inode->i_op ||
++                          (!inode->i_op->lookup && !inode->i_op->lookup2))
                                break;
                }
                goto return_base;
-@@ -658,15 +697,28 @@ out_dput:
+@@ -645,6 +699,23 @@ return_reval:
+                * Check the cached dentry for staleness.
+                */
+               dentry = nd->dentry;
++        revalidate_again:
++              if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) {
++                      err = -ESTALE;
++                      if (!dentry->d_op->d_revalidate2(dentry, 0, it)) {
++                              struct dentry *new;
++                              err = permission(dentry->d_parent->d_inode,
++                                               MAY_EXEC);
++                              if (err)
++                                      break;
++                              new = real_lookup(dentry->d_parent,
++                                                &dentry->d_name, 0, NULL);
++                              d_invalidate(dentry);
++                              dput(dentry);
++                              dentry = new;
++                              goto revalidate_again;
++                      }
++              } else
+               if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
+                       err = -ESTALE;
+                       if (!dentry->d_op->d_revalidate(dentry, 0)) {
+@@ -658,15 +729,28 @@ out_dput:
                dput(dentry);
                break;
        }
  }
  
  /* SMP-safe */
-@@ -751,6 +803,17 @@ walk_init_root(const char *name, struct 
+@@ -751,6 +835,17 @@ walk_init_root(const char *name, struct 
  }
  
  /* SMP-safe */
  int path_lookup(const char *path, unsigned flags, struct nameidata *nd)
  {
        int error = 0;
-@@ -779,7 +842,8 @@ int path_init(const char *name, unsigned
+@@ -765,6 +860,7 @@ int path_init(const char *name, unsigned
+ {
+       nd->last_type = LAST_ROOT; /* if there are only slashes... */
+       nd->flags = flags;
++      nd->it = NULL;
+       if (*name=='/')
+               return walk_init_root(name,nd);
+       read_lock(&current->fs->lock);
+@@ -779,7 +875,8 @@ int path_init(const char *name, unsigned
   * needs parent already locked. Doesn't follow mounts.
   * SMP-safe.
   */
  {
        struct dentry * dentry;
        struct inode *inode;
-@@ -802,13 +866,16 @@ struct dentry * lookup_hash(struct qstr 
+@@ -802,13 +899,16 @@ struct dentry * lookup_hash(struct qstr 
                        goto out;
        }
  
                dentry = inode->i_op->lookup(inode, new);
                unlock_kernel();
                if (!dentry)
-@@ -820,6 +887,12 @@ out:
+@@ -820,6 +920,12 @@ out:
        return dentry;
  }
  
  /* SMP-safe */
  struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
  {
-@@ -841,7 +914,7 @@ struct dentry * lookup_one_len(const cha
+@@ -841,7 +947,7 @@ struct dentry * lookup_one_len(const cha
        }
        this.hash = end_name_hash(hash);
  
  access:
        return ERR_PTR(-EACCES);
  }
-@@ -872,6 +945,23 @@ int __user_walk(const char *name, unsign
+@@ -872,6 +978,23 @@ int __user_walk(const char *name, unsign
        return err;
  }
  
  /*
   * It's inline, so penalty for filesystems that don't use sticky bit is
   * minimal.
-@@ -1045,14 +1135,17 @@ int may_open(struct nameidata *nd, int a
-         return get_lease(inode, flag);
- }
-+extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
-+                          int flags, struct lookup_intent *it);
-+
- struct file *filp_open(const char * pathname, int open_flags, int mode)
+@@ -1010,7 +1133,8 @@ exit_lock:
+  * for symlinks (where the permissions are checked later).
+  * SMP-safe
+  */
+-int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
++int open_namei_it(const char *pathname, int flag, int mode,
++                struct nameidata *nd, struct lookup_intent *it)
  {
        int acc_mode, error = 0;
--      struct inode *inode;
-       struct dentry *dentry;
-       struct dentry *dir;
-       int flag = open_flags;
-       struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = open_flags };
-       int count = 0;
-       if ((flag+1) & O_ACCMODE)
-@@ -1066,7 +1159,7 @@ struct file *filp_open(const char * path
+       struct inode *inode;
+@@ -1024,7 +1148,7 @@ int open_namei(const char * pathname, in
         * The simplest case - just a plain lookup.
         */
        if (!(flag & O_CREAT)) {
--              error = path_lookup(pathname, lookup_flags(flag), &nd);
-+              error = path_lookup_it(pathname, lookup_flags(flag), &nd, &it);
+-              error = path_lookup(pathname, lookup_flags(flag), nd);
++              error = path_lookup_it(pathname, lookup_flags(flag), nd, it);
                if (error)
-                       return ERR_PTR(error);
-               dentry = nd.dentry;
-@@ -1076,6 +1169,8 @@ struct file *filp_open(const char * path
+                       return error;
+               dentry = nd->dentry;
+@@ -1034,6 +1158,10 @@ int open_namei(const char * pathname, in
        /*
         * Create - we need to know the parent.
         */
-+      it.it_mode = mode;
-+      it.it_op |= IT_CREAT;
-       error = path_lookup(pathname, LOOKUP_PARENT, &nd);
++      if (it) {
++              it->it_mode = mode;
++              it->it_op |= IT_CREAT;
++      }
+       error = path_lookup(pathname, LOOKUP_PARENT, nd);
        if (error)
-               return ERR_PTR(error);
-@@ -1091,7 +1186,7 @@ struct file *filp_open(const char * path
+               return error;
+@@ -1049,7 +1177,7 @@ int open_namei(const char * pathname, in
  
-       dir = nd.dentry;
+       dir = nd->dentry;
        down(&dir->d_inode->i_sem);
--      dentry = lookup_hash(&nd.last, nd.dentry);
-+      dentry = lookup_hash_it(&nd.last, nd.dentry, &it);
+-      dentry = lookup_hash(&nd->last, nd->dentry);
++      dentry = lookup_hash_it(&nd->last, nd->dentry, it);
  
  do_last:
        error = PTR_ERR(dentry);
-@@ -1100,6 +1195,7 @@ do_last:
+@@ -1058,6 +1186,7 @@ do_last:
                goto exit;
        }
  
-+      it.it_mode = mode;
++      it->it_mode = mode;
        /* Negative dentry, just create the file */
        if (!dentry->d_inode) {
                error = vfs_create(dir->d_inode, dentry,
-@@ -1134,7 +1230,8 @@ do_last:
+@@ -1086,12 +1215,13 @@ do_last:
+               error = -ELOOP;
+               if (flag & O_NOFOLLOW)
+                       goto exit_dput;
+-              while (__follow_down(&nd->mnt,&dentry) && d_mountpoint(dentry));
++              while (__follow_down(&nd->mnt,&dentry,it) && d_mountpoint(dentry));
+       }
        error = -ENOENT;
        if (!dentry->d_inode)
                goto exit_dput;
 -      if (dentry->d_inode->i_op && dentry->d_inode->i_op->follow_link)
-+      if (dentry->d_inode->i_op && (dentry->d_inode->i_op->follow_link || 
++      if (dentry->d_inode->i_op && (dentry->d_inode->i_op->follow_link ||
 +                                    dentry->d_inode->i_op->follow_link2))
                goto do_link;
  
-       dput(nd.dentry);
-@@ -1149,11 +1246,13 @@ ok:
-       if (!S_ISREG(nd.dentry->d_inode->i_mode))
-               open_flags &= ~O_TRUNC;
--        return dentry_open(nd.dentry, nd.mnt, open_flags);
-+      return dentry_open_it(nd.dentry, nd.mnt, open_flags, &it);
+       dput(nd->dentry);
+@@ -1165,7 +1295,7 @@ ok:
+               if (!error) {
+                       DQUOT_INIT(inode);
+                       
+-                      error = do_truncate(dentry, 0);
++                      error = do_truncate(dentry, 0, 1);
+               }
+               put_write_access(inode);
+               if (error)
+@@ -1177,8 +1307,10 @@ ok:
+       return 0;
  
  exit_dput:
-+      intent_release(dentry, &it);
++      intent_release(dentry, it);
        dput(dentry);
  exit:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
-       return ERR_PTR(error);
++      intent_release(nd->dentry, it);
+       path_release(nd);
+       return error;
  
-@@ -1172,7 +1271,12 @@ do_link:
+@@ -1197,7 +1329,19 @@ do_link:
         * are done. Procfs-like symlinks just set LAST_BIND.
         */
        UPDATE_ATIME(dentry->d_inode);
--      error = dentry->d_inode->i_op->follow_link(dentry, &nd);
-+      if (dentry->d_inode->i_op->follow_link2) 
-+              error = dentry->d_inode->i_op->follow_link2(dentry, &nd, &it);
-+      else 
-+              error = dentry->d_inode->i_op->follow_link(dentry, &nd);
-+      if (error)
-+              intent_release(dentry, &it);
+-      error = dentry->d_inode->i_op->follow_link(dentry, nd);
++      nd->it = it;
++      if (dentry->d_inode->i_op->follow_link2)
++              error = dentry->d_inode->i_op->follow_link2(dentry, nd, it);
++      else
++              error = dentry->d_inode->i_op->follow_link(dentry, nd);
++      if (error) {
++              intent_release(dentry, it);
++      } else if (it != NULL && !(it->it_int_flags & IT_FL_FOLLOWED)) {
++              /* vfs_follow_link was never called */
++              intent_release(dentry, it);
++              path_release(nd);
++              error = -ENOLINK;
++      }
        dput(dentry);
        if (error)
                return error;
-@@ -1194,13 +1298,15 @@ do_link:
+@@ -1219,13 +1363,20 @@ do_link:
        }
-       dir = nd.dentry;
+       dir = nd->dentry;
        down(&dir->d_inode->i_sem);
--      dentry = lookup_hash(&nd.last, nd.dentry);
-+      dentry = lookup_hash_it(&nd.last, nd.dentry, &it);
-       putname(nd.last.name);
+-      dentry = lookup_hash(&nd->last, nd->dentry);
++      dentry = lookup_hash_it(&nd->last, nd->dentry, it);
+       putname(nd->last.name);
        goto do_last;
  }
  
++int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd)
++{
++      return open_namei_it(pathname, flag, mode, nd, NULL);
++}
++
 +
  /* SMP-safe */
 -static struct dentry *lookup_create(struct nameidata *nd, int is_dir)
  {
        struct dentry *dentry;
  
-@@ -1208,7 +1314,7 @@ static struct dentry *lookup_create(stru
+@@ -1233,7 +1384,7 @@ static struct dentry *lookup_create(stru
        dentry = ERR_PTR(-EEXIST);
        if (nd->last_type != LAST_NORM)
                goto fail;
        if (IS_ERR(dentry))
                goto fail;
        if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
-@@ -1264,7 +1370,19 @@ asmlinkage long sys_mknod(const char * f
+@@ -1289,7 +1440,19 @@ asmlinkage long sys_mknod(const char * f
        error = path_lookup(tmp, LOOKUP_PARENT, &nd);
        if (error)
                goto out;
 +
 +      if (nd.dentry->d_inode->i_op->mknod2) {
 +              struct inode_operations *op = nd.dentry->d_inode->i_op;
-+              error = op->mknod2(nd.dentry->d_inode, 
-+                                 nd.last.name, 
++              error = op->mknod2(nd.dentry->d_inode,
++                                 nd.last.name,
 +                                 nd.last.len,
 +                                 mode, dev);
-+              /* the file system want to use normal vfs path now */
++              /* the file system wants to use normal vfs path now */
 +              if (error != -EOPNOTSUPP)
 +                      goto out2;
 +      }
        error = PTR_ERR(dentry);
  
        mode &= ~current->fs->umask;
-@@ -1285,6 +1403,7 @@ asmlinkage long sys_mknod(const char * f
+@@ -1310,6 +1473,7 @@ asmlinkage long sys_mknod(const char * f
                dput(dentry);
        }
        up(&nd.dentry->d_inode->i_sem);
-+ out2:
++out2:
        path_release(&nd);
  out:
        putname(tmp);
-@@ -1332,7 +1451,17 @@ asmlinkage long sys_mkdir(const char * p
+@@ -1357,7 +1521,17 @@ asmlinkage long sys_mkdir(const char * p
                error = path_lookup(tmp, LOOKUP_PARENT, &nd);
                if (error)
                        goto out;
 -              dentry = lookup_create(&nd, 1);
-+              if (nd.dentry->d_inode->i_op->mkdir2) {
-+                      struct inode_operations *op = nd.dentry->d_inode->i_op;
-+                      error = op->mkdir2(nd.dentry->d_inode, 
-+                                         nd.last.name, 
-+                                         nd.last.len,
-+                                         mode);
-+                      /* the file system want to use normal vfs path now */
-+                      if (error != -EOPNOTSUPP)
-+                              goto out2;
-+              }
++              if (nd.dentry->d_inode->i_op->mkdir2) {
++                      struct inode_operations *op = nd.dentry->d_inode->i_op;
++                      error = op->mkdir2(nd.dentry->d_inode,
++                                         nd.last.name,
++                                         nd.last.len,
++                                         mode);
++                      /* the file system wants to use normal vfs path now */
++                      if (error != -EOPNOTSUPP)
++                              goto out2;
++              }
 +              dentry = lookup_create(&nd, 1, NULL);
                error = PTR_ERR(dentry);
                if (!IS_ERR(dentry)) {
                        error = vfs_mkdir(nd.dentry->d_inode, dentry,
-@@ -1340,6 +1469,7 @@ asmlinkage long sys_mkdir(const char * p
+@@ -1365,6 +1539,7 @@ asmlinkage long sys_mkdir(const char * p
                        dput(dentry);
                }
                up(&nd.dentry->d_inode->i_sem);
                path_release(&nd);
  out:
                putname(tmp);
-@@ -1440,8 +1570,17 @@ asmlinkage long sys_rmdir(const char * p
+@@ -1465,8 +1640,33 @@ asmlinkage long sys_rmdir(const char * p
                        error = -EBUSY;
                        goto exit1;
        }
 +      if (nd.dentry->d_inode->i_op->rmdir2) {
 +              struct inode_operations *op = nd.dentry->d_inode->i_op;
-+              error = op->rmdir2(nd.dentry->d_inode, 
-+                                 nd.last.name, 
++              struct dentry *last;
++
++              down(&nd.dentry->d_inode->i_sem);
++              last = lookup_hash_it(&nd.last, nd.dentry, NULL);
++              up(&nd.dentry->d_inode->i_sem);
++              if (IS_ERR(last)) {
++                      error = PTR_ERR(last);
++                      goto exit1;
++              }
++              if (d_mountpoint(last)) {
++                      dput(last);
++                      error = -EBUSY;
++                      goto exit1;
++              }
++              dput(last);
++
++              error = op->rmdir2(nd.dentry->d_inode,
++                                 nd.last.name,
 +                                 nd.last.len);
-+              /* the file system want to use normal vfs path now */
++              /* the file system wants to use normal vfs path now */
 +              if (error != -EOPNOTSUPP)
 +                      goto exit1;
 +      }
        error = PTR_ERR(dentry);
        if (!IS_ERR(dentry)) {
                error = vfs_rmdir(nd.dentry->d_inode, dentry);
-@@ -1499,8 +1638,17 @@ asmlinkage long sys_unlink(const char * 
+@@ -1524,8 +1724,17 @@ asmlinkage long sys_unlink(const char * 
        error = -EISDIR;
        if (nd.last_type != LAST_NORM)
                goto exit1;
 +      if (nd.dentry->d_inode->i_op->unlink2) {
 +              struct inode_operations *op = nd.dentry->d_inode->i_op;
-+              error = op->unlink2(nd.dentry->d_inode, 
-+                                  nd.last.name, 
++              error = op->unlink2(nd.dentry->d_inode,
++                                  nd.last.name,
 +                                  nd.last.len);
-+              /* the file system want to use normal vfs path now */
++              /* the file system wants to use normal vfs path now */
 +              if (error != -EOPNOTSUPP)
 +                      goto exit1;
 +      }
        error = PTR_ERR(dentry);
        if (!IS_ERR(dentry)) {
                /* Why not before? Because we want correct error value */
-@@ -1567,15 +1715,26 @@ asmlinkage long sys_symlink(const char *
+@@ -1592,15 +1801,26 @@ asmlinkage long sys_symlink(const char *
                error = path_lookup(to, LOOKUP_PARENT, &nd);
                if (error)
                        goto out;
 -              dentry = lookup_create(&nd, 0);
 +              if (nd.dentry->d_inode->i_op->symlink2) {
 +                      struct inode_operations *op = nd.dentry->d_inode->i_op;
-+                      error = op->symlink2(nd.dentry->d_inode, 
-+                                           nd.last.name, 
++                      error = op->symlink2(nd.dentry->d_inode,
++                                           nd.last.name,
 +                                           nd.last.len,
 +                                           from);
-+                      /* the file system want to use normal vfs path now */
++                      /* the file system wants to use normal vfs path now */
 +                      if (error != -EOPNOTSUPP)
 +                              goto out2;
 +              }
                putname(to);
        }
        putname(from);
-@@ -1642,7 +1801,7 @@ asmlinkage long sys_link(const char * ol
-               struct dentry *new_dentry;
-               struct nameidata nd, old_nd;
--              error = __user_walk(oldname, LOOKUP_POSITIVE, &old_nd);
-+              error = __user_walk_it(oldname, LOOKUP_POSITIVE, &old_nd, NULL);
-               if (error)
-                       goto exit;
-               error = path_lookup(to, LOOKUP_PARENT, &nd);
-@@ -1651,7 +1810,17 @@ asmlinkage long sys_link(const char * ol
+@@ -1676,7 +1896,17 @@ asmlinkage long sys_link(const char * ol
                error = -EXDEV;
                if (old_nd.mnt != nd.mnt)
                        goto out_release;
 -              new_dentry = lookup_create(&nd, 0);
 +              if (nd.dentry->d_inode->i_op->link2) {
 +                      struct inode_operations *op = nd.dentry->d_inode->i_op;
-+                      error = op->link2(old_nd.dentry->d_inode, 
-+                                        nd.dentry->d_inode, 
-+                                        nd.last.name, 
++                      error = op->link2(old_nd.dentry->d_inode,
++                                        nd.dentry->d_inode,
++                                        nd.last.name,
 +                                        nd.last.len);
-+                      /* the file system want to use normal vfs path now */
++                      /* the file system wants to use normal vfs path now */
 +                      if (error != -EOPNOTSUPP)
 +                              goto out_release;
 +              }
                error = PTR_ERR(new_dentry);
                if (!IS_ERR(new_dentry)) {
                        error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
-@@ -1695,7 +1864,8 @@ exit:
+@@ -1720,7 +1950,8 @@ exit:
   *       locking].
   */
  int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
  {
        int error;
        struct inode *target;
-@@ -1753,6 +1923,7 @@ int vfs_rename_dir(struct inode *old_dir
+@@ -1778,6 +2009,7 @@ int vfs_rename_dir(struct inode *old_dir
                error = -EBUSY;
        else 
                error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
        if (target) {
                if (!error)
                        target->i_flags |= S_DEAD;
-@@ -1774,7 +1945,8 @@ out_unlock:
+@@ -1799,7 +2031,8 @@ out_unlock:
  }
  
  int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
  {
        int error;
  
-@@ -1805,6 +1977,7 @@ int vfs_rename_other(struct inode *old_d
+@@ -1830,6 +2063,7 @@ int vfs_rename_other(struct inode *old_d
                error = -EBUSY;
        else
                error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
        double_up(&old_dir->i_zombie, &new_dir->i_zombie);
        if (error)
                return error;
-@@ -1816,13 +1989,14 @@ int vfs_rename_other(struct inode *old_d
+@@ -1841,13 +2075,14 @@ int vfs_rename_other(struct inode *old_d
  }
  
  int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (!error) {
                if (old_dir == new_dir)
                        inode_dir_notify(old_dir, DN_RENAME);
-@@ -1862,9 +2036,23 @@ static inline int do_rename(const char *
-       if (newnd.last_type != LAST_NORM)
-               goto exit2;
+@@ -1889,7 +2124,7 @@ static inline int do_rename(const char *
  
-+      if (old_dir->d_inode->i_op->rename2) {
-+              lock_kernel();
-+              error = old_dir->d_inode->i_op->rename2(old_dir->d_inode, 
-+                                                      new_dir->d_inode,
-+                                                      oldnd.last.name, 
-+                                                      oldnd.last.len,
-+                                                      newnd.last.name,
-+                                                      newnd.last.len);
-+              unlock_kernel();
-+              /* the file system want to use normal vfs path now */
-+              if (error != -EOPNOTSUPP)
-+                      goto exit2;
-+      }
-+
        double_lock(new_dir, old_dir);
  
 -      old_dentry = lookup_hash(&oldnd.last, old_dir);
        error = PTR_ERR(old_dentry);
        if (IS_ERR(old_dentry))
                goto exit3;
-@@ -1880,14 +2068,14 @@ static inline int do_rename(const char *
+@@ -1905,16 +2140,37 @@ static inline int do_rename(const char *
                if (newnd.last.name[newnd.last.len])
                        goto exit4;
        }
        if (IS_ERR(new_dentry))
                goto exit4;
  
++      if (old_dir->d_inode->i_op->rename2) {
++              lock_kernel();
++              /* don't rename a mount point; the MDS will take care
++               * of the remaining sanity checks */
++              if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) {
++                      error = -EBUSY;
++                      goto exit5;
++              }
++
++              error = old_dir->d_inode->i_op->rename2(old_dir->d_inode,
++                                                      new_dir->d_inode,
++                                                      oldnd.last.name,
++                                                      oldnd.last.len,
++                                                      newnd.last.name,
++                                                      newnd.last.len);
++              unlock_kernel();
++              /* the file system wants to use normal vfs path now */
++              if (error != -EOPNOTSUPP)
++                      goto exit5;
++      }
++
        lock_kernel();
        error = vfs_rename(old_dir->d_inode, old_dentry,
 -                                 new_dir->d_inode, new_dentry);
 +                                 new_dir->d_inode, new_dentry, NULL);
        unlock_kernel();
+-
++exit5:
        dput(new_dentry);
-@@ -1940,7 +2127,8 @@ out:
+ exit4:
+       dput(old_dentry);
+@@ -1965,20 +2221,28 @@ out:
  }
  
  static inline int
 -__vfs_follow_link(struct nameidata *nd, const char *link)
-+__vfs_follow_link(struct nameidata *nd, const char *link, 
++__vfs_follow_link(struct nameidata *nd, const char *link,
 +                struct lookup_intent *it)
  {
        int res = 0;
        char *name;
-@@ -1953,7 +2141,7 @@ __vfs_follow_link(struct nameidata *nd, 
+       if (IS_ERR(link))
+               goto fail;
++      if (it == NULL)
++              it = nd->it;
++      else if (it != nd->it)
++              printk("it != nd->it: tell phil@clusterfs.com\n");
++      if (it != NULL)
++              it->it_int_flags |= IT_FL_FOLLOWED;
++
+       if (*link == '/') {
+               path_release(nd);
+               if (!walk_init_root(link, nd))
                        /* weird __emul_prefix() stuff did it */
                        goto out;
        }
  out:
        if (current->link_count || res || nd->last_type!=LAST_NORM)
                return res;
-@@ -1975,7 +2163,13 @@ fail:
+@@ -2002,7 +2266,13 @@ fail:
  
  int vfs_follow_link(struct nameidata *nd, const char *link)
  {
 +      return __vfs_follow_link(nd, link, NULL);
 +}
 +
-+int vfs_follow_link_it(struct nameidata *nd, const char *link, 
++int vfs_follow_link_it(struct nameidata *nd, const char *link,
 +                     struct lookup_intent *it)
 +{
 +      return __vfs_follow_link(nd, link, it);
  }
  
  /* get the link contents into pagecache */
-@@ -2017,7 +2211,7 @@ int page_follow_link(struct dentry *dent
+@@ -2044,7 +2314,7 @@ int page_follow_link(struct dentry *dent
  {
        struct page *page = NULL;
        char *s = page_getlink(dentry, &page);
        if (page) {
                kunmap(page);
                page_cache_release(page);
---- linux-2.4.18-18.8.0-l7/fs/nfsd/vfs.c~vfs_intent-2.4.18-18  Mon Jan 20 12:25:10 2003
-+++ linux-2.4.18-18.8.0-l7-root/fs/nfsd/vfs.c  Mon Jan 20 12:25:10 2003
-@@ -1298,7 +1298,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru
+--- linux-2.4.20-rh/fs/nfsd/vfs.c~vfs_intent-2.4.20-rh 2003-04-11 14:04:48.000000000 +0800
++++ linux-2.4.20-rh-root/fs/nfsd/vfs.c 2003-06-09 23:18:07.000000000 +0800
+@@ -1293,7 +1293,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru
                        err = nfserr_perm;
        } else
  #endif
 -      err = vfs_rename(fdir, odentry, tdir, ndentry);
 +      err = vfs_rename(fdir, odentry, tdir, ndentry, NULL);
-       unlock_kernel();
        if (!err && EX_ISSYNC(tfhp->fh_export)) {
                nfsd_sync_dir(tdentry);
---- linux-2.4.18-18.8.0-l7/fs/open.c~vfs_intent-2.4.18-18      Mon Jan 20 12:25:10 2003
-+++ linux-2.4.18-18.8.0-l7-root/fs/open.c      Wed Jan 22 10:39:31 2003
-@@ -19,6 +19,9 @@
+               nfsd_sync_dir(fdentry);
+--- linux-2.4.20-rh/fs/open.c~vfs_intent-2.4.20-rh     2003-04-11 14:04:57.000000000 +0800
++++ linux-2.4.20-rh-root/fs/open.c     2003-06-09 23:18:07.000000000 +0800
+@@ -19,6 +19,8 @@
  #include <asm/uaccess.h>
  
  #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
 +extern int path_walk_it(const char *name, struct nameidata *nd,
 +                      struct lookup_intent *it);
-+extern void intent_release(struct dentry *de, struct lookup_intent *it);
  
  int vfs_statfs(struct super_block *sb, struct statfs *buf)
  {
-@@ -118,12 +121,13 @@ static inline long do_sys_truncate(const
+@@ -95,9 +97,10 @@ void fd_install(unsigned int fd, struct 
+       write_unlock(&files->file_lock);
+ }
+-int do_truncate(struct dentry *dentry, loff_t length)
++int do_truncate(struct dentry *dentry, loff_t length, int called_from_open)
+ {
+       struct inode *inode = dentry->d_inode;
++      struct inode_operations *op = dentry->d_inode->i_op;
+       int error;
+       struct iattr newattrs;
+@@ -108,7 +111,14 @@ int do_truncate(struct dentry *dentry, l
+       down(&inode->i_sem);
+       newattrs.ia_size = length;
+       newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
+-      error = notify_change(dentry, &newattrs);
++      if (called_from_open)
++              newattrs.ia_valid |= ATTR_FROM_OPEN;
++      if (op->setattr_raw) {
++              newattrs.ia_valid |= ATTR_RAW;
++              newattrs.ia_ctime = CURRENT_TIME;
++              error = op->setattr_raw(inode, &newattrs);
++      } else 
++              error = notify_change(dentry, &newattrs);
+       up(&inode->i_sem);
+       return error;
+ }
+@@ -118,12 +128,13 @@ static inline long do_sys_truncate(const
        struct nameidata nd;
        struct inode * inode;
        int error;
-+      struct lookup_intent it = { .it_op = IT_TRUNC };
++      struct lookup_intent it = { .it_op = IT_GETATTR };
  
        error = -EINVAL;
        if (length < 0) /* sorry, but loff_t says... */
        if (error)
                goto out;
        inode = nd.dentry->d_inode;
-@@ -168,6 +172,7 @@ static inline long do_sys_truncate(const
+@@ -163,11 +174,13 @@ static inline long do_sys_truncate(const
+       error = locks_verify_truncate(inode, NULL, length);
+       if (!error) {
+               DQUOT_INIT(inode);
+-              error = do_truncate(nd.dentry, length);
++              intent_release(nd.dentry, &it);
++              error = do_truncate(nd.dentry, length, 0);
+       }
        put_write_access(inode);
  
  dput_and_out:
        path_release(&nd);
  out:
        return error;
-@@ -259,8 +264,9 @@ asmlinkage long sys_utime(char * filenam
-       struct nameidata nd;
+@@ -215,7 +228,7 @@ static inline long do_sys_ftruncate(unsi
+       error = locks_verify_truncate(inode, file, length);
+       if (!error)
+-              error = do_truncate(dentry, length);
++              error = do_truncate(dentry, length, 0);
+ out_putf:
+       fput(file);
+ out:
+@@ -260,11 +273,13 @@ asmlinkage long sys_utime(char * filenam
        struct inode * inode;
        struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
  
 -      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
++      error = user_path_walk_it(filename, &nd, NULL);
        if (error)
                goto out;
        inode = nd.dentry->d_inode;
-@@ -286,6 +292,7 @@ asmlinkage long sys_utime(char * filenam
++      /* this is safe without a Lustre lock because it only depends
++       * on the super block */
+       error = -EROFS;
+       if (IS_RDONLY(inode))
+               goto dput_and_out;
+@@ -279,11 +294,29 @@ asmlinkage long sys_utime(char * filenam
+                       goto dput_and_out;
+               newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
+-      } else {
++      }
++
++      if (inode->i_op->setattr_raw) {
++              struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++              newattrs.ia_valid |= ATTR_RAW;
++              error = op->setattr_raw(inode, &newattrs);
++              /* the file system wants to use normal vfs path now */
++              if (error != -EOPNOTSUPP)
++                      goto dput_and_out;
++      }
++
++      error = -EROFS;
++      if (IS_RDONLY(inode))
++              goto dput_and_out;
++
++      error = -EPERM;
++      if (!times) {
+               if (current->fsuid != inode->i_uid &&
+                   (error = permission(inode,MAY_WRITE)) != 0)
+                       goto dput_and_out;
        }
++
        error = notify_change(nd.dentry, &newattrs);
  dput_and_out:
-+      intent_release(nd.dentry, &it);
        path_release(&nd);
- out:
-       return error;
-@@ -303,8 +310,9 @@ asmlinkage long sys_utimes(char * filena
-       struct nameidata nd;
+@@ -304,12 +337,14 @@ asmlinkage long sys_utimes(char * filena
        struct inode * inode;
        struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
  
 -      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
++      error = user_path_walk_it(filename, &nd, NULL);
  
        if (error)
                goto out;
-@@ -331,6 +339,7 @@ asmlinkage long sys_utimes(char * filena
-       }
-       error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -347,6 +356,7 @@ asmlinkage long sys_access(const char * 
+       inode = nd.dentry->d_inode;
++      /* this is safe without a Lustre lock because it only depends
++       * on the super block */
+       error = -EROFS;
+       if (IS_RDONLY(inode))
+               goto dput_and_out;
+@@ -324,7 +359,20 @@ asmlinkage long sys_utimes(char * filena
+               newattrs.ia_atime = times[0].tv_sec;
+               newattrs.ia_mtime = times[1].tv_sec;
+               newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
+-      } else {
++      }
++
++      if (inode->i_op->setattr_raw) {
++              struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++              newattrs.ia_valid |= ATTR_RAW;
++              error = op->setattr_raw(inode, &newattrs);
++              /* the file system wants to use normal vfs path now */
++              if (error != -EOPNOTSUPP)
++                      goto dput_and_out;
++      }
++
++      error = -EPERM;
++      if (!utimes) {
+               if (current->fsuid != inode->i_uid &&
+                   (error = permission(inode,MAY_WRITE)) != 0)
+                       goto dput_and_out;
+@@ -347,6 +395,7 @@ asmlinkage long sys_access(const char * 
        int old_fsuid, old_fsgid;
        kernel_cap_t old_cap;
        int res;
  
        if (mode & ~S_IRWXO)    /* where's F_OK, X_OK, W_OK, R_OK? */
                return -EINVAL;
-@@ -364,13 +374,14 @@ asmlinkage long sys_access(const char * 
+@@ -364,13 +413,14 @@ asmlinkage long sys_access(const char * 
        else
                current->cap_effective = current->cap_permitted;
  
                path_release(&nd);
        }
  
-@@ -385,8 +396,11 @@ asmlinkage long sys_chdir(const char * f
+@@ -385,8 +435,9 @@ asmlinkage long sys_chdir(const char * f
  {
        int error;
        struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
++      struct lookup_intent it = { .it_op = IT_GETATTR };
  
 -      error = __user_walk(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd);
-+      error = __user_walk_it(filename,
-+                             LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,
-+                             &nd, &it);
++      error = __user_walk_it(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd, &it);
        if (error)
                goto out;
  
-@@ -397,6 +411,7 @@ asmlinkage long sys_chdir(const char * f
+@@ -397,6 +448,7 @@ asmlinkage long sys_chdir(const char * f
        set_fs_pwd(current->fs, nd.mnt, nd.dentry);
  
  dput_and_out:
        path_release(&nd);
  out:
        return error;
-@@ -436,9 +451,10 @@ asmlinkage long sys_chroot(const char * 
+@@ -436,9 +488,10 @@ asmlinkage long sys_chroot(const char * 
  {
        int error;
        struct nameidata nd;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
++      struct lookup_intent it = { .it_op = IT_GETATTR };
  
 -      error = __user_walk(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW |
 -                    LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
 +      error = __user_walk_it(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW |
-+                             LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it);
++                    LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it);
        if (error)
                goto out;
  
-@@ -454,6 +470,7 @@ asmlinkage long sys_chroot(const char * 
+@@ -454,6 +507,7 @@ asmlinkage long sys_chroot(const char * 
        set_fs_altroot();
        error = 0;
  dput_and_out:
        path_release(&nd);
  out:
        return error;
-@@ -498,8 +515,9 @@ asmlinkage long sys_chmod(const char * f
-       struct inode * inode;
-       int error;
-       struct iattr newattrs;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
+@@ -508,6 +562,18 @@ asmlinkage long sys_chmod(const char * f
+       if (IS_RDONLY(inode))
+               goto dput_and_out;
  
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (error)
++      if (inode->i_op->setattr_raw) {
++              struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++              newattrs.ia_mode = mode;
++              newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
++              newattrs.ia_valid |= ATTR_RAW;
++              error = op->setattr_raw(inode, &newattrs);
++              /* the file system wants to use normal vfs path now */
++              if (error != -EOPNOTSUPP)
++                      goto dput_and_out;
++      }
++
+       error = -EPERM;
+       if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+               goto dput_and_out;
+@@ -538,6 +604,20 @@ static int chown_common(struct dentry * 
+       error = -EROFS;
+       if (IS_RDONLY(inode))
                goto out;
-       inode = nd.dentry->d_inode;
-@@ -519,6 +537,7 @@ asmlinkage long sys_chmod(const char * f
-       error = notify_change(nd.dentry, &newattrs);
- dput_and_out:
-+      intent_release(nd.dentry, &it);
-       path_release(&nd);
- out:
-       return error;
-@@ -588,10 +607,12 @@ asmlinkage long sys_chown(const char * f
++
++      if (inode->i_op->setattr_raw) {
++              struct inode_operations *op = dentry->d_inode->i_op;
++
++              newattrs.ia_uid = user;
++              newattrs.ia_gid = group;
++              newattrs.ia_valid = ATTR_UID | ATTR_GID;
++              newattrs.ia_valid |= ATTR_RAW;
++              error = op->setattr_raw(inode, &newattrs);
++              /* the file system wants to use normal vfs path now */
++              if (error != -EOPNOTSUPP)
++                      return error;
++      }
++
+       error = -EPERM;
+       if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+               goto out;
+@@ -642,6 +722,7 @@ struct file *filp_open(const char * file
  {
+       int namei_flags, error;
        struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
++      struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = flags };
+       
+       flags &= ~O_DIRECT;
  
--      error = user_path_walk(filename, &nd);
-+      error = user_path_walk_it(filename, &nd, &it);
-       if (!error) {
-               error = chown_common(nd.dentry, user, group);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -601,10 +622,12 @@ asmlinkage long sys_lchown(const char * 
- {
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_SETATTR };
+@@ -651,14 +732,15 @@ struct file *filp_open(const char * file
+       if (namei_flags & O_TRUNC)
+               namei_flags |= 2;
  
--      error = user_path_walk_link(filename, &nd);
-+      error = user_path_walk_link_it(filename, &nd, &it);
-       if (!error) {
-               error = chown_common(nd.dentry, user, group);
-+              intent_release(nd.dentry, &it);
-               path_release(&nd);
-       }
-       return error;
-@@ -628,7 +651,8 @@ extern ssize_t do_readahead(struct file 
- /* for files over a certains size it doesn't pay to do readahead on open */
- #define READAHEAD_CUTOFF 48000
+-      error = open_namei(filename, namei_flags, mode, &nd);
+-      if (!error)
+-              return dentry_open(nd.dentry, nd.mnt, flags);
++      error = open_namei_it(filename, namei_flags, mode, &nd, &it);
++      if (error)
++              return ERR_PTR(error);
+-      return ERR_PTR(error);
++      return dentry_open_it(nd.dentry, nd.mnt, flags, &it);
+ }
  
 -struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
 +struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
  {
        struct file * f;
        struct inode *inode;
-@@ -693,6 +717,7 @@ struct file *dentry_open(struct dentry *
-               do_readahead(f, 0, (48 * 1024) >> PAGE_SHIFT);
-       
+@@ -701,6 +783,7 @@ struct file *dentry_open(struct dentry *
+       }
+       f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
  
 +      intent_release(dentry, it);
        return f;
  
  cleanup_all:
-@@ -707,11 +732,17 @@ cleanup_all:
+@@ -715,11 +798,17 @@ cleanup_all:
  cleanup_file:
        put_filp(f);
  cleanup_dentry:
  /*
   * Find an empty file descriptor entry, and mark it busy.
   */
---- linux-2.4.18-18.8.0-l7/fs/stat.c~vfs_intent-2.4.18-18      Mon Jan 20 12:25:10 2003
-+++ linux-2.4.18-18.8.0-l7-root/fs/stat.c      Mon Jan 20 12:25:10 2003
-@@ -13,6 +13,7 @@
- #include <asm/uaccess.h>
-+extern void intent_release(struct dentry *de, struct lookup_intent *it);
- /*
-  * Revalidate the inode. This is required for proper NFS attribute caching.
-  */
-@@ -104,10 +105,12 @@ int vfs_stat(char *name, struct kstat *s
+--- linux-2.4.20-rh/fs/stat.c~vfs_intent-2.4.20-rh     2003-04-11 14:05:08.000000000 +0800
++++ linux-2.4.20-rh-root/fs/stat.c     2003-06-09 23:18:07.000000000 +0800
+@@ -110,11 +110,13 @@ static int do_getattr(struct vfsmount *m
+ int vfs_stat(char *name, struct kstat *stat)
  {
        struct nameidata nd;
++      struct lookup_intent it = { .it_op = IT_GETATTR };
        int error;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
  
 -      error = user_path_walk(name, &nd);
-+      error = user_path_walk_it(name, &nd, &it);
++      error = user_path_walk_it(name, &nd, &it);
        if (!error) {
                error = do_getattr(nd.mnt, nd.dentry, stat);
-+              intent_release(nd.dentry, &it);
++              intent_release(nd.dentry, &it);
                path_release(&nd);
        }
        return error;
-@@ -117,10 +120,12 @@ int vfs_lstat(char *name, struct kstat *
+@@ -123,11 +125,13 @@ int vfs_stat(char *name, struct kstat *s
+ int vfs_lstat(char *name, struct kstat *stat)
  {
        struct nameidata nd;
++      struct lookup_intent it = { .it_op = IT_GETATTR };
        int error;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
  
 -      error = user_path_walk_link(name, &nd);
-+      error = user_path_walk_link_it(name, &nd, &it);
++      error = user_path_walk_link_it(name, &nd, &it);
        if (!error) {
                error = do_getattr(nd.mnt, nd.dentry, stat);
-+              intent_release(nd.dentry, &it);
++              intent_release(nd.dentry, &it);
                path_release(&nd);
        }
        return error;
---- linux-2.4.18-18.8.0-l7/include/linux/dcache.h~vfs_intent-2.4.18-18 Mon Jan 20 12:25:10 2003
-+++ linux-2.4.18-18.8.0-l7-root/include/linux/dcache.h Wed Jan 22 19:38:12 2003
-@@ -6,6 +6,27 @@
- #include <asm/atomic.h>
+--- linux-2.4.20-rh/include/linux/dcache.h~vfs_intent-2.4.20-rh        2003-04-12 15:46:39.000000000 +0800
++++ linux-2.4.20-rh-root/include/linux/dcache.h        2003-06-09 23:18:07.000000000 +0800
+@@ -7,6 +7,28 @@
  #include <linux/mount.h>
+ #include <linux/kernel.h>
  
 +#define IT_OPEN     (1)
 +#define IT_CREAT    (1<<1)
 +#define IT_READDIR  (1<<2)
 +#define IT_GETATTR  (1<<3)
-+#define IT_SETATTR  (1<<4)
-+#define IT_TRUNC    (1<<5)
-+#define IT_READLINK (1<<6)
-+#define IT_LOOKUP   (1<<7)
++#define IT_LOOKUP   (1<<4)
++#define IT_UNLINK   (1<<5)
++
++#define IT_FL_LOCKED   (1)
++#define IT_FL_FOLLOWED (1<<1) /* set by vfs_follow_link */
 +
 +struct lookup_intent {
 +      int it_op;
 +      int it_flags;
 +      int it_disposition;
 +      int it_status;
-+      struct iattr *it_iattr;
++      int it_int_flags;
 +      __u64 it_lock_handle[2];
 +      int it_lock_mode;
 +      void *it_data;
  /*
   * linux/include/linux/dcache.h
   *
-@@ -78,6 +99,7 @@ struct dentry {
+@@ -82,6 +104,7 @@ struct dentry {
        unsigned long d_time;           /* used by d_revalidate */
        struct dentry_operations  *d_op;
        struct super_block * d_sb;      /* The root of the dentry tree */
        unsigned long d_vfs_flags;
        void * d_fsdata;                /* fs-specific data */
        void * d_extra_attributes;      /* TUX-specific data */
-@@ -91,6 +113,8 @@ struct dentry_operations {
+@@ -96,8 +119,15 @@ struct dentry_operations {
        int (*d_delete)(struct dentry *);
        void (*d_release)(struct dentry *);
        void (*d_iput)(struct dentry *, struct inode *);
 +      void (*d_intent_release)(struct dentry *, struct lookup_intent *);
  };
  
++/* defined in fs/namei.c */
++extern void intent_release(struct dentry *de, struct lookup_intent *it);
++/* defined in fs/dcache.c */
++extern void __d_rehash(struct dentry * entry, int lock);
++
  /* the dentry parameter passed to d_hash and d_compare is the parent
-@@ -124,6 +148,7 @@ d_iput:            no              no              yes
+  * directory of the entries to be compared. It is used in case these
+  * functions need any directory specific information for determining
+@@ -129,6 +159,7 @@ d_iput:            no              no              yes
                                         * s_nfsd_free_path semaphore will be down
                                         */
  #define DCACHE_REFERENCED     0x0008  /* Recently used, don't discard. */
  
  extern spinlock_t dcache_lock;
  
---- linux-2.4.18-18.8.0-l7/include/linux/fs.h~vfs_intent-2.4.18-18     Mon Jan 20 12:25:10 2003
-+++ linux-2.4.18-18.8.0-l7-root/include/linux/fs.h     Wed Jan 22 22:46:13 2003
-@@ -576,6 +576,7 @@ struct file {
+--- linux-2.4.20-rh/include/linux/fs.h~vfs_intent-2.4.20-rh    2003-05-30 02:07:39.000000000 +0800
++++ linux-2.4.20-rh-root/include/linux/fs.h    2003-06-09 23:18:07.000000000 +0800
+@@ -337,6 +337,8 @@ extern void set_bh_page(struct buffer_he
+ #define ATTR_MTIME_SET        256
+ #define ATTR_FORCE    512     /* Not a change, but a change it */
+ #define ATTR_ATTR_FLAG        1024
++#define ATTR_RAW      2048    /* file system, not vfs will massage attrs */
++#define ATTR_FROM_OPEN        4096    /* called from open path, ie O_TRUNC */
+ /*
+  * This is the Inode Attributes structure, used for notify_change().  It
+@@ -574,6 +576,7 @@ struct file {
  
        /* needed for tty driver, and maybe others */
        void                    *private_data;
  
        /* preallocated helper kiobuf to speedup O_DIRECT */
        struct kiobuf           *f_iobuf;
-@@ -836,7 +837,9 @@ extern int vfs_symlink(struct inode *, s
+@@ -701,6 +704,7 @@ struct nameidata {
+       struct qstr last;
+       unsigned int flags;
+       int last_type;
++      struct lookup_intent *it;
+ };
+ /*
+@@ -821,7 +825,9 @@ extern int vfs_symlink(struct inode *, s
  extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
  extern int vfs_rmdir(struct inode *, struct dentry *);
  extern int vfs_unlink(struct inode *, struct dentry *);
  
  /*
   * File types
-@@ -897,16 +900,28 @@ struct file_operations {
+@@ -882,20 +888,33 @@ struct file_operations {
  struct inode_operations {
        int (*create) (struct inode *,struct dentry *,int);
        struct dentry * (*lookup) (struct inode *,struct dentry *);
 +      int (*mknod2) (struct inode *, const char *, int,int,int);
        int (*rename) (struct inode *, struct dentry *,
                        struct inode *, struct dentry *);
-+      int (*rename2) (struct inode *, struct inode *, 
-+                      const char *oldname, int oldlen, 
++      int (*rename2) (struct inode *, struct inode *,
++                      const char *oldname, int oldlen,
 +                      const char *newname, int newlen);
        int (*readlink) (struct dentry *, char *,int);
        int (*follow_link) (struct dentry *, struct nameidata *);
-+      int (*follow_link2) (struct dentry *, struct nameidata *, 
++      int (*follow_link2) (struct dentry *, struct nameidata *,
 +                           struct lookup_intent *it);
        void (*truncate) (struct inode *);
        int (*permission) (struct inode *, int);
        int (*revalidate) (struct dentry *);
-@@ -1381,6 +1396,7 @@ typedef int (*read_actor_t)(read_descrip
+       int (*setattr) (struct dentry *, struct iattr *);
++      int (*setattr_raw) (struct inode *, struct iattr *);
+       int (*getattr) (struct dentry *, struct iattr *);
+       int (*setxattr) (struct dentry *, const char *, void *, size_t, int);
+       ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+@@ -1091,10 +1110,14 @@ static inline int get_lease(struct inode
+ asmlinkage long sys_open(const char *, int, int);
+ asmlinkage long sys_close(unsigned int);      /* yes, it's really unsigned */
+-extern int do_truncate(struct dentry *, loff_t start);
++extern int do_truncate(struct dentry *, loff_t start, int called_from_open);
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
++extern int open_namei_it(const char *filename, int namei_flags, int mode,
++                       struct nameidata *nd, struct lookup_intent *it);
++extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
++                          int flags, struct lookup_intent *it);
+ extern int filp_close(struct file *, fl_owner_t id);
+ extern char * getname(const char *);
+@@ -1385,6 +1408,7 @@ typedef int (*read_actor_t)(read_descrip
  extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
  
  extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *));
  extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *));
  extern int FASTCALL(path_walk(const char *, struct nameidata *));
  extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *));
-@@ -1392,6 +1408,8 @@ extern struct dentry * lookup_one_len(co
+@@ -1396,6 +1420,8 @@ extern struct dentry * lookup_one_len(co
  extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
  #define user_path_walk(name,nd)        __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd)
  #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd)
  
  extern void inode_init_once(struct inode *);
  extern void iput(struct inode *);
-@@ -1492,6 +1510,8 @@ extern struct file_operations generic_ro
+@@ -1495,6 +1521,8 @@ extern struct file_operations generic_ro
  
  extern int vfs_readlink(struct dentry *, char *, int, const char *);
  extern int vfs_follow_link(struct nameidata *, const char *);
-+extern int vfs_follow_link_it(struct nameidata *, const char *, 
++extern int vfs_follow_link_it(struct nameidata *, const char *,
 +                            struct lookup_intent *it);
  extern int page_readlink(struct dentry *, char *, int);
  extern int page_follow_link(struct dentry *, struct nameidata *);
  extern struct inode_operations page_symlink_inode_operations;
---- linux-2.4.18-18.8.0-l7/kernel/ksyms.c~vfs_intent-2.4.18-18 Mon Jan 20 12:25:10 2003
-+++ linux-2.4.18-18.8.0-l7-root/kernel/ksyms.c Mon Jan 20 12:25:10 2003
-@@ -293,6 +293,7 @@ EXPORT_SYMBOL(read_cache_page);
+--- linux-2.4.20-rh/kernel/ksyms.c~vfs_intent-2.4.20-rh        2003-05-30 02:07:42.000000000 +0800
++++ linux-2.4.20-rh-root/kernel/ksyms.c        2003-06-09 23:18:07.000000000 +0800
+@@ -298,6 +298,7 @@ EXPORT_SYMBOL(read_cache_page);
  EXPORT_SYMBOL(set_page_dirty);
  EXPORT_SYMBOL(vfs_readlink);
  EXPORT_SYMBOL(vfs_follow_link);
  EXPORT_SYMBOL(page_readlink);
  EXPORT_SYMBOL(page_follow_link);
  EXPORT_SYMBOL(page_symlink_inode_operations);
+--- linux-2.4.20-rh/fs/exec.c~vfs_intent-2.4.20-rh     2003-04-13 10:07:02.000000000 +0800
++++ linux-2.4.20-rh-root/fs/exec.c     2003-06-09 23:18:07.000000000 +0800
+@@ -114,8 +114,9 @@ asmlinkage long sys_uselib(const char * 
+       struct file * file;
+       struct nameidata nd;
+       int error;
+-
+-      error = user_path_walk(library, &nd);
++      struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY };
++
++      error = user_path_walk_it(library, &nd, &it);
+       if (error)
+               goto out;
+@@ -127,7 +128,8 @@ asmlinkage long sys_uselib(const char * 
+       if (error)
+               goto exit;
+-      file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++      file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);        
++      intent_release(nd.dentry, &it);
+       error = PTR_ERR(file);
+       if (IS_ERR(file))
+               goto out;
+@@ -382,8 +384,9 @@ struct file *open_exec(const char *name)
+       struct inode *inode;
+       struct file *file;
+       int err = 0;
+-
+-      err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd);
++      struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY };
++
++      err = path_lookup_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it);
+       file = ERR_PTR(err);
+       if (!err) {
+               inode = nd.dentry->d_inode;
+@@ -395,7 +398,7 @@ struct file *open_exec(const char *name)
+                               err = -EACCES;
+                       file = ERR_PTR(err);
+                       if (!err) {
+-                              file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++                                file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
+                               if (!IS_ERR(file)) {
+                                       err = deny_write_access(file);
+                                       if (err) {
+@@ -404,6 +407,7 @@ struct file *open_exec(const char *name)
+                                       }
+                               }
+ out:
++                              intent_release(nd.dentry, &it);
+                               return file;
+                       }
+               }
+@@ -1283,7 +1287,7 @@ int do_coredump(long signr, int exit_cod
+               goto close_fail;
+       if (!file->f_op->write)
+               goto close_fail;
+-      if (do_truncate(file->f_dentry, 0) != 0)
++      if (do_truncate(file->f_dentry, 0, 0) != 0)
+               goto close_fail;
+       retval = binfmt->core_dump(signr, regs, file);
+--- linux-2.4.20-rh/fs/proc/base.c~vfs_intent-2.4.20-rh        2003-06-09 23:16:51.000000000 +0800
++++ linux-2.4.20-rh-root/fs/proc/base.c        2003-06-09 23:18:52.000000000 +0800
+@@ -464,6 +464,9 @@ static int proc_pid_follow_link(struct d
+       error = inode->u.proc_i.op.proc_get_link(inode, &nd->dentry, &nd->mnt);
+       nd->last_type = LAST_BIND;
++
++        if (nd->it != NULL)
++                nd->it->it_int_flags |= IT_FL_FOLLOWED;
+ out:
+       return error;
+ }
 
 _
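
The hunks above (fs/stat.c, dcache.h, fs.h, ksyms.c, fs/exec.c and fs/proc/base.c against 2.4.20-rh) all follow one pattern: a struct lookup_intent is built on the caller's stack, passed through the new *_it() lookup helpers, and released with intent_release() before path_release(). The sketch below is a standalone userspace model of that contract with simplified stand-in types; the body given for intent_release() is a guess based on the d_intent_release hook (only the declaration appears in these hunks), and demo_release()/main() are purely illustrative.

    /* Standalone model (plain userspace C, not kernel code) of the intent
     * contract introduced above.  Struct fields mirror the dcache.h hunk;
     * the intent_release() body is an assumption, demo_release() is not
     * part of the patch. */
    #include <stdio.h>

    #define IT_OPEN     (1)
    #define IT_GETATTR  (1<<3)

    struct lookup_intent {
            int it_op;
            int it_flags;
    };

    struct dentry;

    struct dentry_operations {
            /* hook added to struct dentry_operations by the patch */
            void (*d_intent_release)(struct dentry *, struct lookup_intent *);
    };

    struct dentry {
            const char *d_name;
            struct dentry_operations *d_op;
    };

    /* plausible shape for the helper declared in dcache.h and defined in
     * fs/namei.c: forward to the filesystem hook when one is registered */
    static void intent_release(struct dentry *de, struct lookup_intent *it)
    {
            if (it && de->d_op && de->d_op->d_intent_release)
                    de->d_op->d_intent_release(de, it);
    }

    static void demo_release(struct dentry *de, struct lookup_intent *it)
    {
            printf("releasing intent op=%#x on %s\n", it->it_op, de->d_name);
    }

    int main(void)
    {
            struct dentry_operations ops = { .d_intent_release = demo_release };
            struct dentry de = { .d_name = "file", .d_op = &ops };
            /* callers such as vfs_stat() declare the intent on the stack ... */
            struct lookup_intent it = { .it_op = IT_GETATTR };

            /* ... run the *_it() lookup and the actual operation here ... */

            /* ... then release the intent before dropping the path */
            intent_release(&de, &it);
            return 0;
    }
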
@@ -1,17 +1,79 @@
- fs/dcache.c            |    8 
- fs/namei.c             |  335 +++++++++++++++++----
+ fs/dcache.c            |   20 ++
+ fs/exec.c              |   15 +
+ fs/namei.c             |  378 ++++++++++++++++++++++++++++++++++++++++++-------
  fs/nfsd/vfs.c          |    2 
- fs/open.c              |  142 +++++++-
- fs/stat.c              |   24 +
- include/linux/dcache.h |   26 +
- include/linux/fs.h     |   27 +
+ fs/open.c              |  126 ++++++++++++++--
+ fs/proc/base.c         |    3 
+ fs/stat.c              |   24 ++-
+ include/linux/dcache.h |   31 ++++
+ include/linux/fs.h     |   32 +++-
  kernel/ksyms.c         |    1 
- fs/exec.c                   |   18 -
- 9 files changed, 487 insertions(+), 96 deletions(-)
+ 10 files changed, 543 insertions(+), 89 deletions(-)
+
+--- linux-2.4.20-l18/fs/exec.c~vfs_intent-2.4.20-vanilla       Thu Nov 28 18:53:15 2002
++++ linux-2.4.20-l18-phil/fs/exec.c    Wed May 28 01:39:18 2003
+@@ -107,8 +107,9 @@ asmlinkage long sys_uselib(const char * 
+       struct file * file;
+       struct nameidata nd;
+       int error;
++      struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY };
+-      error = user_path_walk(library, &nd);
++      error = user_path_walk_it(library, &nd, &it);
+       if (error)
+               goto out;
+@@ -120,7 +121,8 @@ asmlinkage long sys_uselib(const char * 
+       if (error)
+               goto exit;
+-      file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++      file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
++      intent_release(nd.dentry, &it);
+       error = PTR_ERR(file);
+       if (IS_ERR(file))
+               goto out;
+@@ -363,8 +365,9 @@ struct file *open_exec(const char *name)
+       struct inode *inode;
+       struct file *file;
+       int err = 0;
++      struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY };
+-      err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd);
++      err = path_lookup_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it);
+       file = ERR_PTR(err);
+       if (!err) {
+               inode = nd.dentry->d_inode;
+@@ -376,7 +379,8 @@ struct file *open_exec(const char *name)
+                               err = -EACCES;
+                       file = ERR_PTR(err);
+                       if (!err) {
+-                              file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++                              file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
++                                intent_release(nd.dentry, &it);
+                               if (!IS_ERR(file)) {
+                                       err = deny_write_access(file);
+                                       if (err) {
+@@ -388,6 +392,7 @@ out:
+                               return file;
+                       }
+               }
++                intent_release(nd.dentry, &it);
+               path_release(&nd);
+       }
+       goto out;
+@@ -989,7 +994,7 @@ int do_coredump(long signr, struct pt_re
+               goto close_fail;
+       if (!file->f_op->write)
+               goto close_fail;
+-      if (do_truncate(file->f_dentry, 0) != 0)
++      if (do_truncate(file->f_dentry, 0, 0) != 0)
+               goto close_fail;
  
---- linux-2.4.19-hp2_pnnl4/fs/dcache.c~vfs_intent_hp   Sun Jan 19 19:04:47 2003
-+++ linux-2.4.19-hp2_pnnl4-root/fs/dcache.c    Sun Jan 19 19:04:47 2003
-@@ -186,6 +188,13 @@ int d_invalidate(struct dentry * dentry)
+       retval = binfmt->core_dump(signr, regs, file);
+--- linux-2.4.20-l18/fs/dcache.c~vfs_intent-2.4.20-vanilla     Thu Nov 28 18:53:15 2002
++++ linux-2.4.20-l18-phil/fs/dcache.c  Wed May 28 01:39:18 2003
+@@ -181,6 +181,13 @@ int d_invalidate(struct dentry * dentry)
                spin_unlock(&dcache_lock);
                return 0;
        }
@@ -25,7 +87,7 @@
        /*
         * Check whether to do a partial shrink_dcache
         * to get rid of unused child entries.
-@@ -616,6 +618,7 @@ struct dentry * d_alloc(struct dentry * 
+@@ -616,6 +623,7 @@ struct dentry * d_alloc(struct dentry * 
        dentry->d_op = NULL;
        dentry->d_fsdata = NULL;
        dentry->d_mounted = 0;
@@ -33,7 +95,7 @@
        INIT_LIST_HEAD(&dentry->d_hash);
        INIT_LIST_HEAD(&dentry->d_lru);
        INIT_LIST_HEAD(&dentry->d_subdirs);
-@@ -859,13 +867,19 @@ void d_delete(struct dentry * dentry)
+@@ -830,13 +838,19 @@ void d_delete(struct dentry * dentry)
   * Adds a dentry to the hash according to its name.
   */
   
  }
  
  #define do_switch(x,y) do { \
---- linux-2.4.19-hp2_pnnl4/fs/namei.c~vfs_intent_hp    Sun Jan 19 19:04:47 2003
-+++ linux-2.4.19-hp2_pnnl4-root/fs/namei.c     Sun Jan 19 19:35:55 2003
-@@ -94,6 +97,13 @@
+--- linux-2.4.20-l18/fs/namei.c~vfs_intent-2.4.20-vanilla      Thu Nov 28 18:53:15 2002
++++ linux-2.4.20-l18-phil/fs/namei.c   Sun Jun  1 23:41:35 2003
+@@ -94,6 +94,13 @@
   * XEmacs seems to be relying on it...
   */
  
  /* In order to reduce some races, while at the same time doing additional
   * checking and hopefully speeding things up, we copy filenames to the
   * kernel data space before using them..
-@@ -260,10 +271,19 @@ void path_release(struct nameidata *nd)
+@@ -260,10 +267,19 @@ void path_release(struct nameidata *nd)
   * Internal lookup() using the new generic dcache.
   * SMP-safe
   */
        if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
                if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) {
                        dput(dentry);
-@@ -281,11 +301,14 @@ static struct dentry * cached_lookup(str
+@@ -281,11 +297,14 @@ static struct dentry * cached_lookup(str
   * make sure that nobody added the entry to the dcache in the meantime..
   * SMP-safe
   */
        down(&dir->i_sem);
        /*
         * First re-do the cached lookup just in case it was created
-@@ -300,6 +321,9 @@ static struct dentry * real_lookup(struc
+@@ -300,6 +319,9 @@ static struct dentry * real_lookup(struc
                result = ERR_PTR(-ENOMEM);
                if (dentry) {
                        lock_kernel();
                        result = dir->i_op->lookup(dir, dentry);
                        unlock_kernel();
                        if (result)
-@@ -321,6 +345,12 @@ static struct dentry * real_lookup(struc
+@@ -321,6 +343,12 @@ static struct dentry * real_lookup(struc
                        dput(result);
                        result = ERR_PTR(-ENOENT);
                }
        }
        return result;
  }
-@@ -332,7 +362,8 @@ static struct dentry * real_lookup(struc
+@@ -332,7 +360,8 @@ static struct dentry * real_lookup(struc
   * Without that kind of total limit, nasty chains of consecutive
   * symlinks can cause almost arbitrarily long lookups. 
   */
  {
        int err;
        if (current->link_count >= 5)
-@@ -346,10 +377,14 @@ static inline int do_follow_link(struct 
+@@ -346,10 +375,21 @@ static inline int do_follow_link(struct 
        current->link_count++;
        current->total_link_count++;
        UPDATE_ATIME(dentry->d_inode);
 -      err = dentry->d_inode->i_op->follow_link(dentry, nd);
++        nd->it = it;
 +      if (dentry->d_inode->i_op->follow_link2)
 +              err = dentry->d_inode->i_op->follow_link2(dentry, nd, it);
-+      else
++        else
 +              err = dentry->d_inode->i_op->follow_link(dentry, nd);
++        if (!err && it != NULL && !(it->it_int_flags & IT_FL_FOLLOWED)) {
++                /* vfs_follow_link was never called */
++              intent_release(dentry, it);
++                path_release(nd);
++                err = -ENOLINK;
++        }
        current->link_count--;
        return err;
  loop:
        path_release(nd);
        return -ELOOP;
  }
-@@ -381,15 +416,26 @@ int follow_up(struct vfsmount **mnt, str
+@@ -379,15 +419,26 @@ int follow_up(struct vfsmount **mnt, str
        return __follow_up(mnt, dentry);
  }
  
                dput(*dentry);
                mntput(mounted->mnt_parent);
                *dentry = dget(mounted->mnt_root);
-@@ -401,7 +447,7 @@ static inline int __follow_down(struct v
+@@ -399,7 +450,7 @@ static inline int __follow_down(struct v
  
  int follow_down(struct vfsmount **mnt, struct dentry **dentry)
  {
  }
   
  static inline void follow_dotdot(struct nameidata *nd)
-@@ -437,7 +483,7 @@ static inline void follow_dotdot(struct 
+@@ -435,7 +486,7 @@ static inline void follow_dotdot(struct 
                mntput(nd->mnt);
                nd->mnt = parent;
        }
                ;
  }
  
-@@ -447,7 +482,8 @@ static inline void follow_dotdot(struct 
+@@ -447,7 +498,8 @@ static inline void follow_dotdot(struct 
   *
   * We expect 'base' to be positive and a directory.
   */
  {
        struct dentry *dentry;
        struct inode *inode;
-@@ -520,15 +556,15 @@ int link_path_walk(const char * name, st
+@@ -520,15 +572,15 @@ int link_path_walk(const char * name, st
                                break;
                }
                /* This does the actual lookups.. */
                        ;
  
                err = -ENOENT;
-@@ -539,8 +575,8 @@ int link_path_walk(const char * name, st
+@@ -539,8 +591,8 @@ int link_path_walk(const char * name, st
                if (!inode->i_op)
                        goto out_dput;
  
                        dput(dentry);
                        if (err)
                                goto return_err;
-@@ -556,7 +592,7 @@ int link_path_walk(const char * name, st
+@@ -556,7 +608,7 @@ int link_path_walk(const char * name, st
                        nd->dentry = dentry;
                }
                err = -ENOTDIR; 
                        break;
                continue;
                /* here ends the main loop */
-@@ -583,19 +619,20 @@ last_component:
+@@ -583,19 +635,20 @@ last_component:
                        if (err < 0)
                                break;
                }
                        dput(dentry);
                        if (err)
                                goto return_err;
-@@ -609,7 +647,8 @@ last_component:
+@@ -609,7 +662,8 @@ last_component:
                        goto no_inode;
                if (lookup_flags & LOOKUP_DIRECTORY) {
                        err = -ENOTDIR; 
                                break;
                }
                goto return_base;
-@@ -646,15 +685,28 @@ out_dput:
+@@ -633,6 +687,23 @@ return_reval:
+                * Check the cached dentry for staleness.
+                */
+               dentry = nd->dentry;
++        revalidate_again:
++              if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) {
++                      err = -ESTALE;
++                      if (!dentry->d_op->d_revalidate2(dentry, 0, it)) {
++                                struct dentry *new;
++                                err = permission(dentry->d_parent->d_inode, 
++                                                 MAY_EXEC);
++                                if (err)
++                                        break;
++                                new = real_lookup(dentry->d_parent,
++                                                  &dentry->d_name, 0, NULL);
++                              d_invalidate(dentry);
++                                dput(dentry);
++                                dentry = new;
++                                goto revalidate_again;
++                        }
++              } else
+               if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
+                       err = -ESTALE;
+                       if (!dentry->d_op->d_revalidate(dentry, 0)) {
+@@ -646,15 +717,28 @@ out_dput:
                dput(dentry);
                break;
        }
  }
  
  /* SMP-safe */
-@@ -757,7 +809,8 @@ int path_init(const char *name, unsigned
+@@ -739,6 +823,17 @@ walk_init_root(const char *name, struct 
+ }
+ /* SMP-safe */
++int path_lookup_it(const char *path, unsigned flags, struct nameidata *nd,
++                 struct lookup_intent *it)
++{
++      int error = 0;
++      if (path_init(path, flags, nd))
++              error = path_walk_it(path, nd, it);
++      return error;
++}
++
++
++/* SMP-safe */
+ int path_lookup(const char *path, unsigned flags, struct nameidata *nd)
+ {
+       int error = 0;
+@@ -753,6 +848,7 @@ int path_init(const char *name, unsigned
+ {
+       nd->last_type = LAST_ROOT; /* if there are only slashes... */
+       nd->flags = flags;
++        nd->it = NULL;
+       if (*name=='/')
+               return walk_init_root(name,nd);
+       read_lock(&current->fs->lock);
+@@ -767,7 +863,8 @@ int path_init(const char *name, unsigned
   * needs parent already locked. Doesn't follow mounts.
   * SMP-safe.
   */
  {
        struct dentry * dentry;
        struct inode *inode;
-@@ -780,13 +833,16 @@ struct dentry * lookup_hash(struct qstr 
+@@ -790,13 +887,16 @@ struct dentry * lookup_hash(struct qstr 
                        goto out;
        }
  
                dentry = inode->i_op->lookup(inode, new);
                unlock_kernel();
                if (!dentry)
-@@ -798,6 +854,12 @@ out:
+@@ -808,6 +908,12 @@ out:
        return dentry;
  }
  
  /* SMP-safe */
  struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
  {
-@@ -819,7 +881,7 @@ struct dentry * lookup_one_len(const cha
+@@ -829,7 +935,7 @@ struct dentry * lookup_one_len(const cha
        }
        this.hash = end_name_hash(hash);
  
  access:
        return ERR_PTR(-EACCES);
  }
-@@ -851,6 +913,23 @@ int __user_walk(const char *name, unsign
+@@ -860,6 +966,23 @@ int __user_walk(const char *name, unsign
        return err;
  }
  
  /*
   * It's inline, so penalty for filesystems that don't use sticky bit is
   * minimal.
-@@ -987,7 +1066,8 @@ exit_lock:
+@@ -996,7 +1119,8 @@ exit_lock:
   * for symlinks (where the permissions are checked later).
   * SMP-safe
   */
  {
        int acc_mode, error = 0;
        struct inode *inode;
-@@ -1002,7 +1082,7 @@ int open_namei(const char * pathname, in
+@@ -1010,7 +1134,7 @@ int open_namei(const char * pathname, in
+        * The simplest case - just a plain lookup.
         */
        if (!(flag & O_CREAT)) {
-               if (path_init(pathname, lookup_flags(flag), nd))
--                      error = path_walk(pathname, nd);
-+                      error = path_walk_it(pathname, nd, it);
+-              error = path_lookup(pathname, lookup_flags(flag), nd);
++              error = path_lookup_it(pathname, lookup_flags(flag), nd, it);
                if (error)
                        return error;
                dentry = nd->dentry;
-@@ -1012,6 +1092,10 @@ int open_namei(const char * pathname, in
+@@ -1020,6 +1144,10 @@ int open_namei(const char * pathname, in
        /*
         * Create - we need to know the parent.
         */
-+      if (it) {
-+              it->it_mode = mode;
-+              it->it_op |= IT_CREAT;
-+      }
-       if (path_init(pathname, LOOKUP_PARENT, nd))
-               error = path_walk(pathname, nd);
++      if (it) {
++              it->it_mode = mode;
++              it->it_op |= IT_CREAT;
++      }
+       error = path_lookup(pathname, LOOKUP_PARENT, nd);
        if (error)
-@@ -1028,7 +1112,7 @@ int open_namei(const char * pathname, in
+               return error;
+@@ -1035,7 +1163,7 @@ int open_namei(const char * pathname, in
  
        dir = nd->dentry;
        down(&dir->d_inode->i_sem);
  
  do_last:
        error = PTR_ERR(dentry);
-@@ -1037,6 +1121,7 @@ do_last:
+@@ -1044,6 +1172,7 @@ do_last:
                goto exit;
        }
  
 +      it->it_mode = mode;
        /* Negative dentry, just create the file */
        if (!dentry->d_inode) {
-               if (!IS_POSIXACL(dir->d_inode))
-@@ -1066,12 +1151,13 @@ do_last:
+               error = vfs_create(dir->d_inode, dentry,
+@@ -1072,12 +1201,13 @@ do_last:
                error = -ELOOP;
                if (flag & O_NOFOLLOW)
                        goto exit_dput;
                goto do_link;
  
        dput(nd->dentry);
-@@ -1145,7 +1231,7 @@ do_last:
+@@ -1151,7 +1281,7 @@ ok:
                if (!error) {
                        DQUOT_INIT(inode);
                        
                }
                put_write_access(inode);
                if (error)
-@@ -1157,8 +1243,10 @@ ok:
+@@ -1163,8 +1293,10 @@ ok:
        return 0;
  
  exit_dput:
        path_release(nd);
        return error;
  
-@@ -1177,7 +1265,12 @@ do_link:
+@@ -1183,7 +1315,19 @@ do_link:
         * are done. Procfs-like symlinks just set LAST_BIND.
         */
        UPDATE_ATIME(dentry->d_inode);
 -      error = dentry->d_inode->i_op->follow_link(dentry, nd);
++        nd->it = it;
 +      if (dentry->d_inode->i_op->follow_link2)
 +              error = dentry->d_inode->i_op->follow_link2(dentry, nd, it);
 +      else
 +              error = dentry->d_inode->i_op->follow_link(dentry, nd);
-+      if (error)
++      if (error) {
++              intent_release(dentry, it);
++        } else if (it != NULL && !(it->it_int_flags & IT_FL_FOLLOWED)) {
++                /* vfs_follow_link was never called */
 +              intent_release(dentry, it);
++                path_release(nd);
++                error = -ENOLINK;
++        }
        dput(dentry);
        if (error)
                return error;
-@@ -1199,13 +1292,20 @@ do_link:
+@@ -1205,13 +1349,20 @@ do_link:
        }
        dir = nd->dentry;
        down(&dir->d_inode->i_sem);
  {
        struct dentry *dentry;
  
-@@ -1213,7 +1313,7 @@ static struct dentry *lookup_create(stru
+@@ -1219,7 +1370,7 @@ static struct dentry *lookup_create(stru
        dentry = ERR_PTR(-EEXIST);
        if (nd->last_type != LAST_NORM)
                goto fail;
        if (IS_ERR(dentry))
                goto fail;
        if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
-@@ -1270,7 +1371,19 @@ asmlinkage long sys_mknod(const char * f
-               error = path_walk(tmp, &nd);
+@@ -1275,7 +1426,19 @@ asmlinkage long sys_mknod(const char * f
+       error = path_lookup(tmp, LOOKUP_PARENT, &nd);
        if (error)
                goto out;
 -      dentry = lookup_create(&nd, 0);
 +      dentry = lookup_create(&nd, 0, NULL);
        error = PTR_ERR(dentry);
  
-       if (!IS_POSIXACL(nd.dentry->d_inode))
-@@ -1289,6 +1402,7 @@ asmlinkage long sys_mknod(const char * f
+       mode &= ~current->fs->umask;
+@@ -1296,6 +1459,7 @@ asmlinkage long sys_mknod(const char * f
                dput(dentry);
        }
        up(&nd.dentry->d_inode->i_sem);
        path_release(&nd);
  out:
        putname(tmp);
-@@ -1340,15 +1456,25 @@ asmlinkage long sys_mkdir(const char * p
-                       error = path_walk(tmp, &nd);
+@@ -1343,7 +1507,17 @@ asmlinkage long sys_mkdir(const char * p
+               error = path_lookup(tmp, LOOKUP_PARENT, &nd);
                if (error)
                        goto out;
 -              dentry = lookup_create(&nd, 1);
-+              if (nd.dentry->d_inode->i_op->mkdir2) {
-+                      struct inode_operations *op = nd.dentry->d_inode->i_op;
-+                      error = op->mkdir2(nd.dentry->d_inode,
-+                                         nd.last.name,
-+                                         nd.last.len,
-+                                         mode);
-+                      /* the file system wants to use normal vfs path now */
-+                      if (error != -EOPNOTSUPP)
-+                              goto out2;
-+              }
++              if (nd.dentry->d_inode->i_op->mkdir2) {
++                      struct inode_operations *op = nd.dentry->d_inode->i_op;
++                      error = op->mkdir2(nd.dentry->d_inode,
++                                         nd.last.name,
++                                         nd.last.len,
++                                         mode);
++                      /* the file system wants to use normal vfs path now */
++                      if (error != -EOPNOTSUPP)
++                              goto out2;
++              }
 +              dentry = lookup_create(&nd, 1, NULL);
                error = PTR_ERR(dentry);
                if (!IS_ERR(dentry)) {
--                      if (!IS_POSIXACL(nd.dentry->d_inode))
--                              mode &= ~current->fs->umask;
--                      error = vfs_mkdir(nd.dentry->d_inode, dentry, mode);
-+                      error = vfs_mkdir(nd.dentry->d_inode, dentry,
-+                                        mode & ~current->fs->umask);
+                       error = vfs_mkdir(nd.dentry->d_inode, dentry,
+@@ -1351,6 +1525,7 @@ asmlinkage long sys_mkdir(const char * p
                        dput(dentry);
                }
                up(&nd.dentry->d_inode->i_sem);
                path_release(&nd);
  out:
                putname(tmp);
-@@ -1450,8 +1578,33 @@ asmlinkage long sys_rmdir(const char * p
+@@ -1451,8 +1626,33 @@ asmlinkage long sys_rmdir(const char * p
                        error = -EBUSY;
                        goto exit1;
        }
        error = PTR_ERR(dentry);
        if (!IS_ERR(dentry)) {
                error = vfs_rmdir(nd.dentry->d_inode, dentry);
-@@ -1510,8 +1649,17 @@ asmlinkage long sys_unlink(const char * 
+@@ -1510,8 +1710,17 @@ asmlinkage long sys_unlink(const char * 
        error = -EISDIR;
        if (nd.last_type != LAST_NORM)
                goto exit1;
        error = PTR_ERR(dentry);
        if (!IS_ERR(dentry)) {
                /* Why not before? Because we want correct error value */
-@@ -1579,15 +1729,26 @@ asmlinkage long sys_symlink(const char *
-                       error = path_walk(to, &nd);
+@@ -1578,15 +1787,26 @@ asmlinkage long sys_symlink(const char *
+               error = path_lookup(to, LOOKUP_PARENT, &nd);
                if (error)
                        goto out;
 -              dentry = lookup_create(&nd, 0);
                putname(to);
        }
        putname(from);
-@@ -1660,7 +1824,7 @@ asmlinkage long sys_link(const char * ol
-               error = 0;
-               if (path_init(from, LOOKUP_POSITIVE, &old_nd))
--                      error = path_walk(from, &old_nd);
-+                      error = path_walk_it(from, &old_nd, NULL);
-               if (error)
-                       goto exit;
-               if (path_init(to, LOOKUP_PARENT, &nd))
-@@ -1670,7 +1834,17 @@ asmlinkage long sys_link(const char * ol
+@@ -1662,7 +1882,17 @@ asmlinkage long sys_link(const char * ol
                error = -EXDEV;
                if (old_nd.mnt != nd.mnt)
                        goto out_release;
                error = PTR_ERR(new_dentry);
                if (!IS_ERR(new_dentry)) {
                        error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
-@@ -1716,7 +1892,8 @@ exit:
+@@ -1706,7 +1936,8 @@ exit:
   *       locking].
   */
  int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
  {
        int error;
        struct inode *target;
-@@ -1753,6 +1923,7 @@ int vfs_rename_dir(struct inode *old_dir
+@@ -1764,6 +1995,7 @@ int vfs_rename_dir(struct inode *old_dir
                error = -EBUSY;
        else 
                error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
        if (target) {
                if (!error)
                        target->i_flags |= S_DEAD;
-@@ -1795,7 +1973,8 @@ out_unlock:
+@@ -1785,7 +2017,8 @@ out_unlock:
  }
  
  int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
  {
        int error;
  
-@@ -1826,6 +2005,7 @@ int vfs_rename_other(struct inode *old_d
+@@ -1816,6 +2049,7 @@ int vfs_rename_other(struct inode *old_d
                error = -EBUSY;
        else
                error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
        double_up(&old_dir->i_zombie, &new_dir->i_zombie);
        if (error)
                return error;
-@@ -1837,13 +2017,14 @@ int vfs_rename_other(struct inode *old_d
+@@ -1827,13 +2061,14 @@ int vfs_rename_other(struct inode *old_d
  }
  
  int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (!error) {
                if (old_dir == new_dir)
                        inode_dir_notify(old_dir, DN_RENAME);
-@@ -1888,7 +2070,7 @@ static inline int do_rename(const char *
+@@ -1875,7 +2110,7 @@ static inline int do_rename(const char *
  
        double_lock(new_dir, old_dir);
  
        error = PTR_ERR(old_dentry);
        if (IS_ERR(old_dentry))
                goto exit3;
-@@ -1904,16 +2086,37 @@ static inline int do_rename(const char *
+@@ -1891,16 +2126,37 @@ static inline int do_rename(const char *
                if (newnd.last.name[newnd.last.len])
                        goto exit4;
        }
        dput(new_dentry);
  exit4:
        dput(old_dentry);
-@@ -1964,7 +2163,8 @@ out:
+@@ -1951,20 +2207,28 @@ out:
  }
  
  static inline int
  {
        int res = 0;
        char *name;
-@@ -1977,7 +2177,7 @@ __vfs_follow_link(struct nameidata *nd, 
+       if (IS_ERR(link))
+               goto fail;
++        if (it == NULL)
++                it = nd->it;
++        else if (it != nd->it)
++                printk("it != nd->it: tell phil@clusterfs.com\n");
++        if (it != NULL)
++                it->it_int_flags |= IT_FL_FOLLOWED;
++
+       if (*link == '/') {
+               path_release(nd);
+               if (!walk_init_root(link, nd))
                        /* weird __emul_prefix() stuff did it */
                        goto out;
        }
  out:
        if (current->link_count || res || nd->last_type!=LAST_NORM)
                return res;
-@@ -1999,7 +2199,13 @@ fail:
+@@ -1986,7 +2250,13 @@ fail:
  
  int vfs_follow_link(struct nameidata *nd, const char *link)
  {
  }
  
  /* get the link contents into pagecache */
-@@ -2041,7 +2247,7 @@ int page_follow_link(struct dentry *dent
+@@ -2028,7 +2298,7 @@ int page_follow_link(struct dentry *dent
  {
        struct page *page = NULL;
        char *s = page_getlink(dentry, &page);
        if (page) {
                kunmap(page);
                page_cache_release(page);
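
Several of the fs/namei.c hunks above guard symlink traversal: do_follow_link() and the do_link path in open_namei() stash the intent in nd->it, expect vfs_follow_link()/vfs_follow_link_it() to set IT_FL_FOLLOWED, and treat a follow_link method that never got there as a failure, releasing the intent and the path and returning -ENOLINK. The standalone sketch below models just that check; good_follow_link2() and the trimmed argument lists are illustrative, not the kernel signatures.

    /* Standalone model (simplified signatures) of the IT_FL_FOLLOWED check
     * added above.  The fake follow method and reduced types are not from
     * the patch. */
    #include <errno.h>
    #include <stdio.h>

    #define IT_OPEN        (1)
    #define IT_FL_FOLLOWED (1<<1)  /* set by vfs_follow_link, per the dcache.h hunk */

    struct lookup_intent {
            int it_op;
            int it_int_flags;
    };

    struct nameidata {
            struct lookup_intent *it;
    };

    typedef int (*follow_link2_t)(struct nameidata *, struct lookup_intent *);

    /* stand-in for a well-behaved ->follow_link2() that ends up in
     * vfs_follow_link_it(), which sets IT_FL_FOLLOWED on the intent */
    static int good_follow_link2(struct nameidata *nd, struct lookup_intent *it)
    {
            (void)nd;
            it->it_int_flags |= IT_FL_FOLLOWED;
            return 0;
    }

    static int do_follow_link(follow_link2_t follow, struct nameidata *nd,
                              struct lookup_intent *it)
    {
            int err;

            nd->it = it;
            err = follow(nd, it);
            if (!err && it && !(it->it_int_flags & IT_FL_FOLLOWED)) {
                    /* vfs_follow_link was never called: the real code also
                     * releases the intent and the path at this point */
                    return -ENOLINK;
            }
            return err;
    }

    int main(void)
    {
            struct lookup_intent it = { .it_op = IT_OPEN };
            struct nameidata nd = { .it = NULL };

            printf("follow_link result: %d\n",
                   do_follow_link(good_follow_link2, &nd, &it));
            return 0;
    }
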
---- linux-2.4.19-hp2_pnnl4/fs/nfsd/vfs.c~vfs_intent_hp Sun Jan 19 19:04:47 2003
-+++ linux-2.4.19-hp2_pnnl4-root/fs/nfsd/vfs.c  Sun Jan 19 19:37:57 2003
-@@ -1295,7 +1295,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru
+--- linux-2.4.20-l18/fs/nfsd/vfs.c~vfs_intent-2.4.20-vanilla   Thu Nov 28 18:53:15 2002
++++ linux-2.4.20-l18-phil/fs/nfsd/vfs.c        Wed May 28 01:39:18 2003
+@@ -1291,7 +1291,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru
                        err = nfserr_perm;
        } else
  #endif
        if (!err && EX_ISSYNC(tfhp->fh_export)) {
                nfsd_sync_dir(tdentry);
                nfsd_sync_dir(fdentry);
---- linux-2.4.19-hp2_pnnl4/fs/open.c~vfs_intent_hp     Sun Jan 19 19:04:47 2003
-+++ linux-2.4.19-hp2_pnnl4-root/fs/open.c      Sun Jan 19 19:41:00 2003
+--- linux-2.4.20-l18/fs/open.c~vfs_intent-2.4.20-vanilla       Thu Nov 28 18:53:15 2002
++++ linux-2.4.20-l18-phil/fs/open.c    Wed May 28 01:39:18 2003
 @@ -19,6 +19,8 @@
  #include <asm/uaccess.h>
  
        up(&inode->i_sem);
        return error;
  }
-@@ -118,12 +121,13 @@ static inline long do_sys_truncate(const
+@@ -118,12 +128,13 @@ static inline long do_sys_truncate(const
        struct nameidata nd;
        struct inode * inode;
        int error;
        if (error)
                goto out;
        inode = nd.dentry->d_inode;
-@@ -163,11 +167,13 @@ static inline long do_sys_truncate(const
+@@ -163,11 +174,13 @@ static inline long do_sys_truncate(const
        error = locks_verify_truncate(inode, NULL, length);
        if (!error) {
                DQUOT_INIT(inode);
                if (current->fsuid != inode->i_uid &&
                    (error = permission(inode,MAY_WRITE)) != 0)
                        goto dput_and_out;
-@@ -347,6 +356,7 @@ asmlinkage long sys_access(const char * 
+@@ -347,6 +395,7 @@ asmlinkage long sys_access(const char * 
        int old_fsuid, old_fsgid;
        kernel_cap_t old_cap;
        int res;
  
        if (mode & ~S_IRWXO)    /* where's F_OK, X_OK, W_OK, R_OK? */
                return -EINVAL;
-@@ -364,13 +374,14 @@ asmlinkage long sys_access(const char * 
+@@ -364,13 +413,14 @@ asmlinkage long sys_access(const char * 
        else
                current->cap_effective = current->cap_permitted;
  
                path_release(&nd);
        }
  
-@@ -386,6 +397,7 @@ asmlinkage long sys_chdir(const char * f
+@@ -385,8 +435,9 @@ asmlinkage long sys_chdir(const char * f
+ {
        int error;
        struct nameidata nd;
-       char *name;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
-       name = getname(filename);
-       error = PTR_ERR(name);
-@@ -394,7 +406,7 @@ asmlinkage long sys_chdir(const char * f
++      struct lookup_intent it = { .it_op = IT_GETATTR };
  
-       error = 0;
-       if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd))
--              error = path_walk(name, &nd);
-+              error = path_walk_it(name, &nd, &it);
-       putname(name);
+-      error = __user_walk(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd);
++      error = __user_walk_it(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd, &it);
        if (error)
                goto out;
-@@ -406,6 +418,7 @@ asmlinkage long sys_chdir(const char * f
+@@ -397,6 +448,7 @@ asmlinkage long sys_chdir(const char * f
        set_fs_pwd(current->fs, nd.mnt, nd.dentry);
  
  dput_and_out:
        path_release(&nd);
  out:
        return error;
-@@ -446,6 +459,7 @@ asmlinkage long sys_chroot(const char * 
+@@ -436,9 +488,10 @@ asmlinkage long sys_chroot(const char * 
+ {
        int error;
        struct nameidata nd;
-       char *name;
-+      struct lookup_intent it = { .it_op = IT_GETATTR };
++      struct lookup_intent it = { .it_op = IT_GETATTR };
  
-       name = getname(filename);
-       error = PTR_ERR(name);
-@@ -454,7 +468,7 @@ asmlinkage long sys_chroot(const char * 
-       path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW |
-                     LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
--      error = path_walk(name, &nd);   
-+      error = path_walk_it(name, &nd, &it);
-       putname(name);
+-      error = __user_walk(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW |
+-                    LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
++      error = __user_walk_it(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW |
++                    LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it);
        if (error)
                goto out;
-@@ -471,6 +485,7 @@ asmlinkage long sys_chroot(const char * 
+@@ -454,6 +507,7 @@ asmlinkage long sys_chroot(const char * 
        set_fs_altroot();
        error = 0;
  dput_and_out:
        path_release(&nd);
  out:
        return error;
-@@ -508,6 +564,18 @@ asmlinkage long sys_chmod(const char * f
+@@ -508,6 +562,18 @@ asmlinkage long sys_chmod(const char * f
        if (IS_RDONLY(inode))
                goto dput_and_out;
  
        error = -EPERM;
        if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
                goto dput_and_out;
-@@ -538,6 +606,20 @@ static int chown_common(struct dentry * 
+@@ -538,6 +604,20 @@ static int chown_common(struct dentry * 
        error = -EROFS;
        if (IS_RDONLY(inode))
                goto out;
        error = -EPERM;
        if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
                goto out;
-@@ -655,10 +676,16 @@ asmlinkage long sys_fchown(unsigned int 
+@@ -638,10 +718,12 @@ asmlinkage long sys_fchown(unsigned int 
   * for the internal routines (ie open_namei()/follow_link() etc). 00 is
   * used by symlinks.
   */
-+extern int open_namei_it(const char *filename, int namei_flags, int mode,
-+                       struct nameidata *nd, struct lookup_intent *it);
-+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
-+                          int flags, struct lookup_intent *it);
 +
  struct file *filp_open(const char * filename, int flags, int mode)
  {
  
        namei_flags = flags;
        if ((namei_flags+1) & O_ACCMODE)
-@@ -666,14 +693,15 @@ struct file *filp_open(const char * file
+@@ -649,14 +731,15 @@ struct file *filp_open(const char * file
        if (namei_flags & O_TRUNC)
                namei_flags |= 2;
  
  {
        struct file * f;
        struct inode *inode;
-@@ -716,6 +744,7 @@ struct file *dentry_open(struct dentry *
+@@ -699,6 +782,7 @@ struct file *dentry_open(struct dentry *
        }
        f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
  
        return f;
  
  cleanup_all:
-@@ -730,11 +759,17 @@ cleanup_all:
+@@ -713,11 +797,17 @@ cleanup_all:
  cleanup_file:
        put_filp(f);
  cleanup_dentry:
  /*
   * Find an empty file descriptor entry, and mark it busy.
   */
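
The fs/open.c hunks above give sys_chmod() and chown_common() an early exit through the new ->setattr_raw() inode operation: the attributes are flagged ATTR_RAW and handed to the filesystem first, and only an -EOPNOTSUPP return falls back to the ordinary notify_change() route. The sketch below is a standalone model of that dispatch with simplified types; vfs_chmod() and demo_setattr_raw() are invented names for illustration.

    /* Standalone model (simplified types, not kernel code) of the
     * setattr_raw dispatch added to sys_chmod()/chown_common() above. */
    #include <errno.h>
    #include <stdio.h>

    #define ATTR_MODE   1          /* standard 2.4 fs.h values */
    #define ATTR_CTIME  64
    #define ATTR_RAW    2048       /* added by the fs.h hunk above */

    struct iattr {
            unsigned int ia_valid;
            int ia_mode;
    };

    struct inode;

    struct inode_operations {
            int (*setattr_raw)(struct inode *, struct iattr *);
    };

    struct inode {
            struct inode_operations *i_op;
    };

    static int vfs_chmod(struct inode *inode, int mode)
    {
            struct iattr newattrs = {
                    .ia_mode  = mode,
                    .ia_valid = ATTR_MODE | ATTR_CTIME | ATTR_RAW,
            };

            if (inode->i_op && inode->i_op->setattr_raw) {
                    int error = inode->i_op->setattr_raw(inode, &newattrs);
                    /* the file system wants the normal vfs path only on -EOPNOTSUPP */
                    if (error != -EOPNOTSUPP)
                            return error;
            }
            /* ... otherwise fall through to the usual notify_change() handling ... */
            return 0;
    }

    static int demo_setattr_raw(struct inode *inode, struct iattr *attrs)
    {
            (void)inode;
            printf("raw setattr: ia_valid=%#x ia_mode=%#o\n",
                   attrs->ia_valid, attrs->ia_mode);
            return 0;
    }

    int main(void)
    {
            struct inode_operations ops = { .setattr_raw = demo_setattr_raw };
            struct inode inode = { .i_op = &ops };

            return vfs_chmod(&inode, 0644);
    }

Using -EOPNOTSUPP as the "not mine" signal lets filesystems without a setattr_raw handler keep the stock VFS behaviour unchanged.
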
---- linux-2.4.19-hp2_pnnl4/fs/stat.c~vfs_intent_hp     Sun Jan 19 19:04:47 2003
-+++ linux-2.4.19-hp2_pnnl4-root/fs/stat.c      Sun Jan 19 19:44:51 2003
-@@ -135,13 +136,15 @@ static int cp_new_stat(struct inode * in
+--- linux-2.4.20-l18/fs/stat.c~vfs_intent-2.4.20-vanilla       Thu Sep 13 19:04:43 2001
++++ linux-2.4.20-l18-phil/fs/stat.c    Wed May 28 01:39:18 2003
+@@ -135,13 +135,15 @@ static int cp_new_stat(struct inode * in
  asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf)
  {
        struct nameidata nd;
                path_release(&nd);
        }
        return error;
-@@ -151,13 +154,15 @@ asmlinkage long sys_stat(char * filename
+@@ -151,13 +153,15 @@ asmlinkage long sys_stat(char * filename
  asmlinkage long sys_newstat(char * filename, struct stat * statbuf)
  {
        struct nameidata nd;
                path_release(&nd);
        }
        return error;
-@@ -172,13 +177,15 @@ asmlinkage long sys_newstat(char * filen
+@@ -172,13 +176,15 @@ asmlinkage long sys_newstat(char * filen
  asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf)
  {
        struct nameidata nd;
                path_release(&nd);
        }
        return error;
-@@ -189,13 +196,15 @@ asmlinkage long sys_lstat(char * filenam
+@@ -189,13 +195,15 @@ asmlinkage long sys_lstat(char * filenam
  asmlinkage long sys_newlstat(char * filename, struct stat * statbuf)
  {
        struct nameidata nd;
                path_release(&nd);
        }
        return error;
-@@ -333,12 +344,14 @@ asmlinkage long sys_stat64(char * filena
+@@ -333,12 +341,14 @@ asmlinkage long sys_stat64(char * filena
  {
        struct nameidata nd;
        int error;
                path_release(&nd);
        }
        return error;
-@@ -348,12 +361,14 @@ asmlinkage long sys_lstat64(char * filen
+@@ -348,12 +358,14 @@ asmlinkage long sys_lstat64(char * filen
  {
        struct nameidata nd;
        int error;
                path_release(&nd);
        }
        return error;
---- linux-2.4.19-hp2_pnnl4/fs/exec.c~vfs_intent_hp     Sun Feb  9 01:14:52 2003
-+++ linux-2.4.19-hp2_pnnl4-root/fs/exec.c      Sun Feb  9 01:29:49 2003
-@@ -103,13 +104,18 @@ static inline void put_binfmt(struct lin
-  *
-  * Also note that we take the address to load from from the file itself.
-  */
-+extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
-+                          int flags, struct lookup_intent *it);
-+int path_lookup_it(const char *path, unsigned flags, struct nameidata *nd,
-+                 struct lookup_intent *it);
- asmlinkage long sys_uselib(const char * library)
- {
-       struct file * file;
-       struct nameidata nd;
-       int error;
-+      struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY };
--      error = user_path_walk(library, &nd);
-+      error = user_path_walk_it(library, &nd, &it);
-       if (error)
-               goto out;
-@@ -121,7 +127,8 @@ asmlinkage long sys_uselib(const char * 
-       if (error)
-               goto exit;
--      file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
-+      file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
-+      intent_release(nd.dentry, &it);
-       error = PTR_ERR(file);
-       if (IS_ERR(file))
-               goto out;
-@@ -350,9 +350,10 @@ struct file *open_exec(const char *name)
-       struct inode *inode;
-       struct file *file;
-       int err = 0;
-+      struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY };
+--- linux-2.4.20-l18/fs/proc/base.c~vfs_intent-2.4.20-vanilla  Wed Jun  4 22:53:14 2003
++++ linux-2.4.20-l18-phil/fs/proc/base.c       Wed Jun  4 22:50:35 2003
+@@ -464,6 +464,9 @@ static int proc_pid_follow_link(struct d
  
-       if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd))
--              err = path_walk(name, &nd);
-+              err = path_walk_it(name, &nd, &it);
-       file = ERR_PTR(err);
-       if (!err) {
-               inode = nd.dentry->d_inode;
-@@ -363,7 +369,8 @@ struct file *open_exec(const char *name)
-                               err = -EACCES;
-                       file = ERR_PTR(err);
-                       if (!err) {
--                              file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
-+                              file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
-+                              intent_release(nd.dentry, &it);
-                               if (!IS_ERR(file)) {
-                                       err = deny_write_access(file);
-                                       if (err) {
-@@ -976,7 +986,7 @@ int do_coredump(long signr, struct pt_re
-               goto close_fail;
-       if (!file->f_op->write)
-               goto close_fail;
--      if (do_truncate(file->f_dentry, 0) != 0)
-+      if (do_truncate(file->f_dentry, 0, 0) != 0)
-               goto close_fail;
-       retval = binfmt->core_dump(signr, regs, file);
---- linux-2.4.19-hp2_pnnl4/include/linux/dcache.h~vfs_intent_hp        Sun Jan 19 19:04:47 2003
-+++ linux-2.4.19-hp2_pnnl4-root/include/linux/dcache.h Sun Jan 19 19:04:48 2003
-@@ -6,6 +6,25 @@
- #include <asm/atomic.h>
+       error = inode->u.proc_i.op.proc_get_link(inode, &nd->dentry, &nd->mnt);
+       nd->last_type = LAST_BIND;
++
++        if (nd->it != NULL)
++                nd->it->it_int_flags |= IT_FL_FOLLOWED;
+ out:
+       return error;
+ }
+--- linux-2.4.20-l18/include/linux/dcache.h~vfs_intent-2.4.20-vanilla  Thu Nov 28 18:53:15 2002
++++ linux-2.4.20-l18-phil/include/linux/dcache.h       Sun Jun  1 22:35:10 2003
+@@ -7,6 +7,28 @@
  #include <linux/mount.h>
+ #include <linux/kernel.h>
  
 +#define IT_OPEN     (1)
 +#define IT_CREAT    (1<<1)
 +#define IT_LOOKUP   (1<<4)
 +#define IT_UNLINK   (1<<5)
 +
++#define IT_FL_LOCKED   (1)
++#define IT_FL_FOLLOWED (1<<1) /* set by vfs_follow_link */
++
 +struct lookup_intent {
 +      int it_op;
 +      int it_mode;
 +      int it_flags;
 +      int it_disposition;
 +      int it_status;
-+      struct iattr *it_iattr;
++      int it_int_flags;
 +      __u64 it_lock_handle[2];
 +      int it_lock_mode;
 +      void *it_data;
  /*
   * linux/include/linux/dcache.h
   *
-@@ -78,6 +106,7 @@ struct dentry {
+@@ -79,6 +101,7 @@ struct dentry {
        unsigned long d_time;           /* used by d_revalidate */
        struct dentry_operations  *d_op;
        struct super_block * d_sb;      /* The root of the dentry tree */
        unsigned long d_vfs_flags;
        void * d_fsdata;                /* fs-specific data */
        unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
-@@ -90,8 +119,15 @@ struct dentry_operations {
+@@ -91,8 +114,15 @@ struct dentry_operations {
        int (*d_delete)(struct dentry *);
        void (*d_release)(struct dentry *);
        void (*d_iput)(struct dentry *, struct inode *);
  /* the dentry parameter passed to d_hash and d_compare is the parent
   * directory of the entries to be compared. It is used in case these
   * functions need any directory specific information for determining
-@@ -124,6 +149,7 @@ d_iput:            no              no              yes
+@@ -124,6 +154,7 @@ d_iput:            no              no              yes
                                         * s_nfsd_free_path semaphore will be down
                                         */
  #define DCACHE_REFERENCED     0x0008  /* Recently used, don't discard. */
  
  extern spinlock_t dcache_lock;
  
---- linux-2.4.19-hp2_pnnl4/include/linux/fs.h~vfs_intent_hp    Sun Jan 19 19:04:47 2003
-+++ linux-2.4.19-hp2_pnnl4-root/include/linux/fs.h     Sun Jan 19 19:04:48 2003
+--- linux-2.4.20-l18/include/linux/fs.h~vfs_intent-2.4.20-vanilla      Wed May 28 01:39:17 2003
++++ linux-2.4.20-l18-phil/include/linux/fs.h   Sun Jun  1 22:07:11 2003
 @@ -338,6 +338,8 @@ extern void set_bh_page(struct buffer_he
  #define ATTR_MTIME_SET        256
  #define ATTR_FORCE    512     /* Not a change, but a change it */
  
  /*
   * This is the Inode Attributes structure, used for notify_change().  It
-@@ -575,6 +575,7 @@ struct file {
+@@ -542,6 +544,7 @@ struct file {
  
        /* needed for tty driver, and maybe others */
        void                    *private_data;
  
        /* preallocated helper kiobuf to speedup O_DIRECT */
        struct kiobuf           *f_iobuf;
-@@ -815,7 +816,9 @@ extern int vfs_symlink(struct inode *, s
+@@ -661,6 +664,7 @@ struct nameidata {
+       struct qstr last;
+       unsigned int flags;
+       int last_type;
++        struct lookup_intent *it;
+ };
+ #define DQUOT_USR_ENABLED     0x01            /* User diskquotas enabled */
+@@ -794,7 +798,9 @@ extern int vfs_symlink(struct inode *, s
  extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
  extern int vfs_rmdir(struct inode *, struct dentry *);
  extern int vfs_unlink(struct inode *, struct dentry *);
  
  /*
   * File types
-@@ -876,20 +879,33 @@ struct file_operations {
+@@ -855,20 +861,33 @@ struct file_operations {
  struct inode_operations {
        int (*create) (struct inode *,struct dentry *,int);
        struct dentry * (*lookup) (struct inode *,struct dentry *);
        int (*getattr) (struct dentry *, struct iattr *);
        int (*setxattr) (struct dentry *, const char *, void *, size_t, int);
        ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
-@@ -1112,7 +1130,7 @@ static inline int get_lease(struct inode
+@@ -1070,10 +1089,14 @@ static inline int get_lease(struct inode
  
  asmlinkage long sys_open(const char *, int, int);
  asmlinkage long sys_close(unsigned int);      /* yes, it's really unsigned */
  
  extern struct file *filp_open(const char *, int, int);
  extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
-@@ -1354,6 +1369,7 @@ typedef int (*read_actor_t)(read_descrip
++extern int open_namei_it(const char *filename, int namei_flags, int mode,
++                       struct nameidata *nd, struct lookup_intent *it);
++extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
++                          int flags, struct lookup_intent *it);
+ extern int filp_close(struct file *, fl_owner_t id);
+ extern char * getname(const char *);
+@@ -1335,6 +1358,7 @@ typedef int (*read_actor_t)(read_descrip
  extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
  
  extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *));
 +extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it));
  extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *));
  extern int FASTCALL(path_walk(const char *, struct nameidata *));
- extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
-@@ -1364,6 +1380,8 @@ extern struct dentry * lookup_one_len(co
+ extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *));
+@@ -1346,6 +1370,8 @@ extern struct dentry * lookup_one_len(co
  extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
  #define user_path_walk(name,nd)        __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd)
  #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd)
 +#define user_path_walk_it(name,nd,it)  __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it)
 +#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it)
  
- extern void inode_init_once(struct inode *);
  extern void iput(struct inode *);
-@@ -1499,6 +1517,8 @@ extern struct file_operations generic_ro
+ extern void force_delete(struct inode *);
+@@ -1455,6 +1481,8 @@ extern struct file_operations generic_ro
  
  extern int vfs_readlink(struct dentry *, char *, int, const char *);
  extern int vfs_follow_link(struct nameidata *, const char *);
  extern int page_readlink(struct dentry *, char *, int);
  extern int page_follow_link(struct dentry *, struct nameidata *);
  extern struct inode_operations page_symlink_inode_operations;
---- linux-2.4.19-hp2_pnnl4/kernel/ksyms.c~vfs_intent_hp        Sun Jan 19 19:04:47 2003
-+++ linux-2.4.19-hp2_pnnl4-root/kernel/ksyms.c Sun Jan 19 19:04:48 2003
-@@ -293,6 +293,7 @@ EXPORT_SYMBOL(read_cache_page);
+--- linux-2.4.20-l18/kernel/ksyms.c~vfs_intent-2.4.20-vanilla  Wed May 28 01:39:18 2003
++++ linux-2.4.20-l18-phil/kernel/ksyms.c       Wed May 28 01:39:18 2003
+@@ -269,6 +269,7 @@ EXPORT_SYMBOL(read_cache_page);
  EXPORT_SYMBOL(set_page_dirty);
  EXPORT_SYMBOL(vfs_readlink);
  EXPORT_SYMBOL(vfs_follow_link);
  EXPORT_SYMBOL(page_readlink);
  EXPORT_SYMBOL(page_follow_link);
  EXPORT_SYMBOL(page_symlink_inode_operations);
+
+_
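The hunks above plumb a lookup_intent through struct nameidata so a filesystem can learn, at lookup time, what the caller ultimately wants (open, create, unlink) and can record progress flags such as IT_FL_FOLLOWED when a symlink is traversed. Below is a minimal, user-space sketch of that flag handling; the trimmed-down struct, lookup_demo() and main() are illustrative assumptions, not code from this patch — only the constants mirror the ones added to include/linux/dcache.h above.

/*
 * Sketch only: models how a consumer of the intent added above might
 * inspect it.  lookup_demo() is a hypothetical stand-in for the patched
 * VFS paths (open_namei_it, vfs_follow_link, __user_walk_it).
 */
#include <stdio.h>

#define IT_OPEN     (1)
#define IT_CREAT    (1 << 1)
#define IT_LOOKUP   (1 << 4)
#define IT_UNLINK   (1 << 5)

#define IT_FL_LOCKED   (1)
#define IT_FL_FOLLOWED (1 << 1)   /* set by vfs_follow_link */

struct lookup_intent {
        int it_op;                /* what the caller ultimately wants */
        int it_int_flags;         /* internal bookkeeping, e.g. IT_FL_FOLLOWED */
};

/* A path walker would pass the same intent down every component. */
static void lookup_demo(const struct lookup_intent *it)
{
        if (it == NULL) {
                printf("plain lookup, no intent\n");
                return;
        }
        if (it->it_op & (IT_OPEN | IT_CREAT))
                printf("lookup carries open/create intent (0x%x)\n",
                       (unsigned)it->it_op);
        if (it->it_int_flags & IT_FL_FOLLOWED)
                printf("symlink already followed for this intent\n");
}

int main(void)
{
        struct lookup_intent it = { .it_op = IT_OPEN | IT_CREAT,
                                    .it_int_flags = IT_FL_FOLLOWED };
        lookup_demo(&it);
        lookup_demo(NULL);
        return 0;
}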
diff --git a/lustre/kernel_patches/pc/dev_read_only_2.4.20.pc b/lustre/kernel_patches/pc/dev_read_only_2.4.20.pc
new file mode 100644 (file)
index 0000000..4760ad1
--- /dev/null
@@ -0,0 +1,3 @@
+drivers/block/blkpg.c
+drivers/block/loop.c
+drivers/ide/ide-disk.c
diff --git a/lustre/kernel_patches/pc/dev_read_only_hp_2.4.20.pc b/lustre/kernel_patches/pc/dev_read_only_hp_2.4.20.pc
new file mode 100644 (file)
index 0000000..4760ad1
--- /dev/null
@@ -0,0 +1,3 @@
+drivers/block/blkpg.c
+drivers/block/loop.c
+drivers/ide/ide-disk.c
diff --git a/lustre/kernel_patches/pc/dsp.pc b/lustre/kernel_patches/pc/dsp.pc
new file mode 100644 (file)
index 0000000..fdbf418
--- /dev/null
@@ -0,0 +1,6 @@
+kernel/bootimg.c
+kernel/bootimg_pic.c
+include/asm-i386/apic.h
+include/linux/crash.h
+arch/i386/kernel/crash.c
+arch/i386/kernel/nmi.c
diff --git a/lustre/kernel_patches/pc/export-truncate-2.5.63.pc b/lustre/kernel_patches/pc/export-truncate-2.5.63.pc
new file mode 100644 (file)
index 0000000..3f61c00
--- /dev/null
@@ -0,0 +1,2 @@
+include/linux/mm.h
+mm/truncate.c
diff --git a/lustre/kernel_patches/pc/export-truncate.pc b/lustre/kernel_patches/pc/export-truncate.pc
new file mode 100644 (file)
index 0000000..bd58c82
--- /dev/null
@@ -0,0 +1,2 @@
+include/linux/mm.h
+mm/filemap.c
diff --git a/lustre/kernel_patches/pc/exports_2.4.20.pc b/lustre/kernel_patches/pc/exports_2.4.20.pc
new file mode 100644 (file)
index 0000000..6472a11
--- /dev/null
@@ -0,0 +1,4 @@
+fs/ext3/Makefile
+fs/ext3/super.c
+include/linux/fs.h
+kernel/ksyms.c
diff --git a/lustre/kernel_patches/pc/exports_hp_2.4.20.pc b/lustre/kernel_patches/pc/exports_hp_2.4.20.pc
new file mode 100644 (file)
index 0000000..6472a11
--- /dev/null
@@ -0,0 +1,4 @@
+fs/ext3/Makefile
+fs/ext3/super.c
+include/linux/fs.h
+kernel/ksyms.c
diff --git a/lustre/kernel_patches/pc/ext-2.4-patch-1-chaos.pc b/lustre/kernel_patches/pc/ext-2.4-patch-1-chaos.pc
new file mode 100644 (file)
index 0000000..634b944
--- /dev/null
@@ -0,0 +1,11 @@
+fs/ext3/Makefile
+fs/ext3/dir.c
+fs/ext3/file.c
+fs/ext3/hash.c
+fs/ext3/namei.c
+fs/ext3/super.c
+include/linux/ext3_fs.h
+include/linux/ext3_fs_sb.h
+include/linux/ext3_jbd.h
+include/linux/rbtree.h
+lib/rbtree.c
diff --git a/lustre/kernel_patches/pc/ext-2.4-patch-1.pc b/lustre/kernel_patches/pc/ext-2.4-patch-1.pc
new file mode 100644 (file)
index 0000000..634b944
--- /dev/null
@@ -0,0 +1,11 @@
+fs/ext3/Makefile
+fs/ext3/dir.c
+fs/ext3/file.c
+fs/ext3/hash.c
+fs/ext3/namei.c
+fs/ext3/super.c
+include/linux/ext3_fs.h
+include/linux/ext3_fs_sb.h
+include/linux/ext3_jbd.h
+include/linux/rbtree.h
+lib/rbtree.c
diff --git a/lustre/kernel_patches/pc/ext-2.4-patch-2.pc b/lustre/kernel_patches/pc/ext-2.4-patch-2.pc
new file mode 100644 (file)
index 0000000..9b16759
--- /dev/null
@@ -0,0 +1 @@
+fs/ext3/namei.c
diff --git a/lustre/kernel_patches/pc/ext-2.4-patch-3.pc b/lustre/kernel_patches/pc/ext-2.4-patch-3.pc
new file mode 100644 (file)
index 0000000..65d4845
--- /dev/null
@@ -0,0 +1,3 @@
+fs/ext3/dir.c
+fs/ext3/namei.c
+include/linux/ext3_fs.h
diff --git a/lustre/kernel_patches/pc/ext-2.4-patch-4.pc b/lustre/kernel_patches/pc/ext-2.4-patch-4.pc
new file mode 100644 (file)
index 0000000..9b16759
--- /dev/null
@@ -0,0 +1 @@
+fs/ext3/namei.c
diff --git a/lustre/kernel_patches/pc/ext3-2.4-ino_t.pc b/lustre/kernel_patches/pc/ext3-2.4-ino_t.pc
new file mode 100644 (file)
index 0000000..4cef979
--- /dev/null
@@ -0,0 +1,3 @@
+fs/ext3/ialloc.c
+fs/ext3/namei.c
+include/linux/ext3_fs.h
diff --git a/lustre/kernel_patches/pc/ext3-2.4.18-fixes.pc b/lustre/kernel_patches/pc/ext3-2.4.18-fixes.pc
new file mode 100644 (file)
index 0000000..0822c5e
--- /dev/null
@@ -0,0 +1,7 @@
+fs/ext3/balloc.c
+fs/ext3/file.c
+fs/ext3/fsync.c
+fs/ext3/ialloc.c
+fs/ext3/inode.c
+fs/ext3/namei.c
+fs/ext3/super.c
diff --git a/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro.pc b/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro.pc
new file mode 100644 (file)
index 0000000..cd21583
--- /dev/null
@@ -0,0 +1,10 @@
+fs/ext3/balloc.c
+fs/ext3/dir.c
+fs/ext3/ialloc.c
+fs/ext3/inode.c
+fs/ext3/ioctl.c
+fs/ext3/namei.c
+fs/ext3/super.c
+fs/ext3/symlink.c
+include/linux/ext3_fs.h
+include/linux/ext3_jbd.h
diff --git a/lustre/kernel_patches/pc/ext3-2.4.20-fixes.pc b/lustre/kernel_patches/pc/ext3-2.4.20-fixes.pc
new file mode 100644 (file)
index 0000000..441ced8
--- /dev/null
@@ -0,0 +1 @@
+fs/ext3/balloc.c
diff --git a/lustre/kernel_patches/pc/ext3-2.5-noread.pc b/lustre/kernel_patches/pc/ext3-2.5-noread.pc
new file mode 100644 (file)
index 0000000..9c3cea8
--- /dev/null
@@ -0,0 +1,3 @@
+fs/ext3/ialloc.c
+fs/ext3/inode.c
+include/linux/ext3_fs.h
diff --git a/lustre/kernel_patches/pc/ext3-2.5.63.pc b/lustre/kernel_patches/pc/ext3-2.5.63.pc
new file mode 100644 (file)
index 0000000..b1e5de5
--- /dev/null
@@ -0,0 +1,4 @@
+fs/ext3/xattr.c
+fs/ext3/inode.c
+fs/ext3/super.c
+fs/ext3/xattr.h
diff --git a/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18.pc b/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18.pc
new file mode 100644 (file)
index 0000000..5770132
--- /dev/null
@@ -0,0 +1,3 @@
+fs/ext3/super.c
+include/linux/ext3_fs.h
+include/linux/ext3_fs_sb.h
diff --git a/lustre/kernel_patches/pc/ext3-delete_thread-2.4.20.pc b/lustre/kernel_patches/pc/ext3-delete_thread-2.4.20.pc
new file mode 100644 (file)
index 0000000..5770132
--- /dev/null
@@ -0,0 +1,3 @@
+fs/ext3/super.c
+include/linux/ext3_fs.h
+include/linux/ext3_fs_sb.h
diff --git a/lustre/kernel_patches/pc/ext3-largefile.pc b/lustre/kernel_patches/pc/ext3-largefile.pc
new file mode 100644 (file)
index 0000000..76d683f
--- /dev/null
@@ -0,0 +1 @@
+fs/ext3/inode.c
diff --git a/lustre/kernel_patches/pc/ext3-noread-2.4.20.pc b/lustre/kernel_patches/pc/ext3-noread-2.4.20.pc
new file mode 100644 (file)
index 0000000..9c3cea8
--- /dev/null
@@ -0,0 +1,3 @@
+fs/ext3/ialloc.c
+fs/ext3/inode.c
+include/linux/ext3_fs.h
diff --git a/lustre/kernel_patches/pc/ext3-orphan_lock.pc b/lustre/kernel_patches/pc/ext3-orphan_lock.pc
new file mode 100644 (file)
index 0000000..98aebb0
--- /dev/null
@@ -0,0 +1,3 @@
+fs/ext3/namei.c
+fs/ext3/super.c
+include/linux/ext3_fs_sb.h
diff --git a/lustre/kernel_patches/pc/ext3-san-2.4.20.pc b/lustre/kernel_patches/pc/ext3-san-2.4.20.pc
new file mode 100644 (file)
index 0000000..9ed5141
--- /dev/null
@@ -0,0 +1,2 @@
+fs/ext3/inode.c
+fs/ext3/ext3-exports.c
diff --git a/lustre/kernel_patches/pc/ext3-truncate_blocks-chaos.patch.pc b/lustre/kernel_patches/pc/ext3-truncate_blocks-chaos.patch.pc
new file mode 100644 (file)
index 0000000..76d683f
--- /dev/null
@@ -0,0 +1 @@
+fs/ext3/inode.c
diff --git a/lustre/kernel_patches/pc/ext3-truncate_blocks.pc b/lustre/kernel_patches/pc/ext3-truncate_blocks.pc
new file mode 100644 (file)
index 0000000..76d683f
--- /dev/null
@@ -0,0 +1 @@
+fs/ext3/inode.c
diff --git a/lustre/kernel_patches/pc/ext3-unmount_sync.pc b/lustre/kernel_patches/pc/ext3-unmount_sync.pc
new file mode 100644 (file)
index 0000000..08795de
--- /dev/null
@@ -0,0 +1 @@
+fs/ext3/super.c
diff --git a/lustre/kernel_patches/pc/ext3-use-after-free.pc b/lustre/kernel_patches/pc/ext3-use-after-free.pc
new file mode 100644 (file)
index 0000000..daf8787
--- /dev/null
@@ -0,0 +1 @@
+./fs/ext3/namei.c
diff --git a/lustre/kernel_patches/pc/ext3_orphan_lock-2.4.20-rh.pc b/lustre/kernel_patches/pc/ext3_orphan_lock-2.4.20-rh.pc
new file mode 100644 (file)
index 0000000..98aebb0
--- /dev/null
@@ -0,0 +1,3 @@
+fs/ext3/namei.c
+fs/ext3/super.c
+include/linux/ext3_fs_sb.h
diff --git a/lustre/kernel_patches/pc/extN-2.4.18-ino_sb_fixup.pc b/lustre/kernel_patches/pc/extN-2.4.18-ino_sb_fixup.pc
new file mode 100644 (file)
index 0000000..7191405
--- /dev/null
@@ -0,0 +1 @@
+include/linux/ext3_fs.h
diff --git a/lustre/kernel_patches/pc/extN-delete_thread.pc b/lustre/kernel_patches/pc/extN-delete_thread.pc
new file mode 100644 (file)
index 0000000..bc81732
--- /dev/null
@@ -0,0 +1,3 @@
+include/linux/ext3_fs.h
+include/linux/ext3_fs_sb.h
+fs/ext3/super.c
diff --git a/lustre/kernel_patches/pc/extN-iget-debug.pc b/lustre/kernel_patches/pc/extN-iget-debug.pc
new file mode 100644 (file)
index 0000000..e9fe01e
--- /dev/null
@@ -0,0 +1,2 @@
+fs/ext3/namei.c
+fs/ext3/inode.c
diff --git a/lustre/kernel_patches/pc/extN-misc-fixup.pc b/lustre/kernel_patches/pc/extN-misc-fixup.pc
new file mode 100644 (file)
index 0000000..08795de
--- /dev/null
@@ -0,0 +1 @@
+fs/ext3/super.c
diff --git a/lustre/kernel_patches/pc/extN-noread.pc b/lustre/kernel_patches/pc/extN-noread.pc
new file mode 100644 (file)
index 0000000..9c3cea8
--- /dev/null
@@ -0,0 +1,3 @@
+fs/ext3/ialloc.c
+fs/ext3/inode.c
+include/linux/ext3_fs.h
diff --git a/lustre/kernel_patches/pc/extN-san.pc b/lustre/kernel_patches/pc/extN-san.pc
new file mode 100644 (file)
index 0000000..231df0e
--- /dev/null
@@ -0,0 +1,2 @@
+fs/ext3/inode.c
+fs/ext3/super.c
diff --git a/lustre/kernel_patches/pc/extN-wantedi.pc b/lustre/kernel_patches/pc/extN-wantedi.pc
new file mode 100644 (file)
index 0000000..31901ee
--- /dev/null
@@ -0,0 +1,4 @@
+fs/ext3/namei.c
+fs/ext3/ialloc.c
+fs/ext3/ioctl.c
+include/linux/ext3_fs.h
diff --git a/lustre/kernel_patches/pc/htree-ext3-2.4.18.pc b/lustre/kernel_patches/pc/htree-ext3-2.4.18.pc
new file mode 100644 (file)
index 0000000..6499778
--- /dev/null
@@ -0,0 +1,4 @@
+fs/ext3/super.c
+fs/ext3/namei.c
+include/linux/ext3_fs.h
+include/linux/ext3_jbd.h
diff --git a/lustre/kernel_patches/pc/invalidate_show-2.4.20-rh.pc b/lustre/kernel_patches/pc/invalidate_show-2.4.20-rh.pc
new file mode 100644 (file)
index 0000000..1d4ed77
--- /dev/null
@@ -0,0 +1,4 @@
+fs/inode.c
+fs/super.c
+include/linux/fs.h
+fs/smbfs/inode.c
index 1f565ab..1d4ed77 100644 (file)
@@ -1,5 +1,4 @@
 fs/inode.c
-fs/block_dev.c
-fs/devfs/base.c
 fs/super.c
 include/linux/fs.h
+fs/smbfs/inode.c
diff --git a/lustre/kernel_patches/pc/iod-rmap-exports-2.4.20.pc b/lustre/kernel_patches/pc/iod-rmap-exports-2.4.20.pc
new file mode 100644 (file)
index 0000000..07288b0
--- /dev/null
@@ -0,0 +1,5 @@
+fs/inode.c
+fs/Makefile
+mm/vmscan.c
+mm/Makefile
+mm/page_alloc.c
index 1218f55..07288b0 100644 (file)
@@ -1,6 +1,5 @@
 fs/inode.c
 fs/Makefile
-mm/filemap.c
 mm/vmscan.c
 mm/Makefile
 mm/page_alloc.c
diff --git a/lustre/kernel_patches/pc/iod-stock-24-exports.pc b/lustre/kernel_patches/pc/iod-stock-24-exports.pc
new file mode 100644 (file)
index 0000000..e4eceee
--- /dev/null
@@ -0,0 +1,3 @@
+fs/inode.c
+fs/Makefile
+mm/page_alloc.c
diff --git a/lustre/kernel_patches/pc/iod-stock-24-exports_hp.pc b/lustre/kernel_patches/pc/iod-stock-24-exports_hp.pc
new file mode 100644 (file)
index 0000000..e4eceee
--- /dev/null
@@ -0,0 +1,3 @@
+fs/inode.c
+fs/Makefile
+mm/page_alloc.c
diff --git a/lustre/kernel_patches/pc/iopen-2.4.18.pc b/lustre/kernel_patches/pc/iopen-2.4.18.pc
new file mode 100644 (file)
index 0000000..b40b1f3
--- /dev/null
@@ -0,0 +1,8 @@
+Documentation/filesystems/ext2.txt
+fs/ext3/Makefile
+fs/ext3/inode.c
+fs/ext3/iopen.c
+fs/ext3/iopen.h
+fs/ext3/namei.c
+fs/ext3/super.c
+include/linux/ext3_fs.h
diff --git a/lustre/kernel_patches/pc/iopen-2.4.20.pc b/lustre/kernel_patches/pc/iopen-2.4.20.pc
new file mode 100644 (file)
index 0000000..b40b1f3
--- /dev/null
@@ -0,0 +1,8 @@
+Documentation/filesystems/ext2.txt
+fs/ext3/Makefile
+fs/ext3/inode.c
+fs/ext3/iopen.c
+fs/ext3/iopen.h
+fs/ext3/namei.c
+fs/ext3/super.c
+include/linux/ext3_fs.h
diff --git a/lustre/kernel_patches/pc/kmem_cache_validate_2.4.20-rh.pc b/lustre/kernel_patches/pc/kmem_cache_validate_2.4.20-rh.pc
new file mode 100644 (file)
index 0000000..a0a6297
--- /dev/null
@@ -0,0 +1,5 @@
+arch/i386/mm/init.c
+arch/ia64/mm/init.c
+include/linux/slab.h
+kernel/ksyms.c
+mm/slab.c
diff --git a/lustre/kernel_patches/pc/kmem_cache_validate_2.4.20.pc b/lustre/kernel_patches/pc/kmem_cache_validate_2.4.20.pc
new file mode 100644 (file)
index 0000000..bdba884
--- /dev/null
@@ -0,0 +1,5 @@
+arch/ia64/mm/init.c
+include/linux/slab.h
+kernel/ksyms.c
+mm/slab.c
+arch/i386/mm/init.c
index a0a6297..bdba884 100644 (file)
@@ -1,5 +1,5 @@
-arch/i386/mm/init.c
 arch/ia64/mm/init.c
 include/linux/slab.h
 kernel/ksyms.c
 mm/slab.c
+arch/i386/mm/init.c
diff --git a/lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26.pc b/lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26.pc
new file mode 100644 (file)
index 0000000..b647d5a
--- /dev/null
@@ -0,0 +1,10 @@
+fs/ext3/ialloc.c
+fs/ext3/inode.c
+fs/ext3/namei.c
+fs/ext3/super.c
+fs/ext3/xattr.c
+include/linux/ext3_fs.h
+include/linux/ext3_jbd.h
+include/linux/ext3_xattr.h
+include/linux/xattr.h
+fs/ext3/Makefile
diff --git a/lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54-chaos.pc b/lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54-chaos.pc
new file mode 100644 (file)
index 0000000..dbf35cb
--- /dev/null
@@ -0,0 +1,62 @@
+Documentation/Configure.help
+arch/alpha/defconfig
+arch/alpha/kernel/entry.S
+arch/arm/defconfig
+arch/arm/kernel/calls.S
+arch/i386/defconfig
+arch/ia64/defconfig
+arch/m68k/defconfig
+arch/mips/defconfig
+arch/mips64/defconfig
+arch/ppc/defconfig
+arch/ppc64/kernel/misc.S
+arch/s390/defconfig
+arch/s390/kernel/entry.S
+arch/s390x/defconfig
+arch/s390x/kernel/entry.S
+arch/s390x/kernel/wrapper32.S
+arch/sparc/defconfig
+arch/sparc/kernel/systbls.S
+arch/sparc64/defconfig
+arch/sparc64/kernel/systbls.S
+fs/Config.in
+fs/Makefile
+fs/ext2/Makefile
+fs/ext2/file.c
+fs/ext2/ialloc.c
+fs/ext2/inode.c
+fs/ext2/namei.c
+fs/ext2/super.c
+fs/ext2/symlink.c
+fs/ext2/xattr.c
+fs/ext2/xattr_user.c
+fs/ext3/Makefile
+fs/ext3/file.c
+fs/ext3/ialloc.c
+fs/ext3/inode.c
+fs/ext3/namei.c
+fs/ext3/super.c
+fs/ext3/symlink.c
+fs/ext3/xattr.c
+fs/ext3/xattr_user.c
+fs/jfs/jfs_xattr.h
+fs/jfs/xattr.c
+fs/mbcache.c
+include/asm-arm/unistd.h
+include/asm-ppc64/unistd.h
+include/asm-s390/unistd.h
+include/asm-s390x/unistd.h
+include/asm-sparc/unistd.h
+include/asm-sparc64/unistd.h
+include/linux/cache_def.h
+include/linux/errno.h
+include/linux/ext2_fs.h
+include/linux/ext2_xattr.h
+include/linux/ext3_fs.h
+include/linux/ext3_jbd.h
+include/linux/ext3_xattr.h
+include/linux/fs.h
+include/linux/mbcache.h
+kernel/ksyms.c
+mm/vmscan.c
+fs/ext3/ext3-exports.c
diff --git a/lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54-hp.pc b/lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54-hp.pc
new file mode 100644 (file)
index 0000000..1e8cf75
--- /dev/null
@@ -0,0 +1,62 @@
+Documentation/Configure.help
+arch/alpha/defconfig
+arch/alpha/kernel/entry.S
+arch/arm/defconfig
+arch/arm/kernel/calls.S
+arch/i386/defconfig
+arch/ia64/defconfig
+arch/m68k/defconfig
+arch/mips/defconfig
+arch/mips64/defconfig
+arch/ppc/defconfig
+arch/ppc64/kernel/misc.S
+arch/s390/defconfig
+arch/s390/kernel/entry.S
+arch/s390x/defconfig
+arch/s390x/kernel/entry.S
+arch/s390x/kernel/wrapper32.S
+arch/sparc/defconfig
+arch/sparc/kernel/systbls.S
+arch/sparc64/defconfig
+arch/sparc64/kernel/systbls.S
+fs/Config.in
+fs/Makefile
+fs/ext2/Makefile
+fs/ext2/file.c
+fs/ext2/ialloc.c
+fs/ext2/inode.c
+fs/ext2/namei.c
+fs/ext2/super.c
+fs/ext2/symlink.c
+fs/ext2/xattr.c
+fs/ext2/xattr_user.c
+fs/ext3/Makefile
+fs/ext3/file.c
+fs/ext3/ialloc.c
+fs/ext3/inode.c
+fs/ext3/namei.c
+fs/ext3/super.c
+fs/ext3/symlink.c
+fs/ext3/xattr.c
+fs/ext3/xattr_user.c
+fs/ext3/ext3-exports.c
+fs/jfs/jfs_xattr.h
+fs/jfs/xattr.c
+fs/mbcache.c
+include/asm-arm/unistd.h
+include/asm-ppc64/unistd.h
+include/asm-s390/unistd.h
+include/asm-s390x/unistd.h
+include/asm-sparc/unistd.h
+include/asm-sparc64/unistd.h
+include/linux/cache_def.h
+include/linux/errno.h
+include/linux/ext2_fs.h
+include/linux/ext2_xattr.h
+include/linux/ext3_fs.h
+include/linux/ext3_jbd.h
+include/linux/ext3_xattr.h
+include/linux/fs.h
+include/linux/mbcache.h
+kernel/ksyms.c
+mm/vmscan.c
diff --git a/lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54.pc b/lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54.pc
new file mode 100644 (file)
index 0000000..2de1b2c
--- /dev/null
@@ -0,0 +1,64 @@
+Documentation/Configure.help
+arch/alpha/defconfig
+arch/alpha/kernel/entry.S
+arch/arm/defconfig
+arch/arm/kernel/calls.S
+arch/i386/defconfig
+arch/ia64/defconfig
+arch/ia64/kernel/entry.S
+arch/m68k/defconfig
+arch/mips/defconfig
+arch/mips64/defconfig
+arch/ppc/defconfig
+arch/ppc64/kernel/misc.S
+arch/s390/defconfig
+arch/s390/kernel/entry.S
+arch/s390x/defconfig
+arch/s390x/kernel/entry.S
+arch/s390x/kernel/wrapper32.S
+arch/sparc/defconfig
+arch/sparc/kernel/systbls.S
+arch/sparc64/defconfig
+arch/sparc64/kernel/systbls.S
+fs/Config.in
+fs/Makefile
+fs/ext2/Makefile
+fs/ext2/file.c
+fs/ext2/ialloc.c
+fs/ext2/inode.c
+fs/ext2/namei.c
+fs/ext2/super.c
+fs/ext2/symlink.c
+fs/ext2/xattr.c
+fs/ext2/xattr_user.c
+fs/ext3/Makefile
+fs/ext3/file.c
+fs/ext3/ialloc.c
+fs/ext3/inode.c
+fs/ext3/namei.c
+fs/ext3/super.c
+fs/ext3/symlink.c
+fs/ext3/xattr.c
+fs/ext3/xattr_user.c
+fs/jfs/jfs_xattr.h
+fs/jfs/xattr.c
+fs/mbcache.c
+include/asm-arm/unistd.h
+include/asm-ia64/unistd.h
+include/asm-ppc64/unistd.h
+include/asm-s390/unistd.h
+include/asm-s390x/unistd.h
+include/asm-sparc/unistd.h
+include/asm-sparc64/unistd.h
+include/linux/cache_def.h
+include/linux/errno.h
+include/linux/ext2_fs.h
+include/linux/ext2_xattr.h
+include/linux/ext3_fs.h
+include/linux/ext3_jbd.h
+include/linux/ext3_xattr.h
+include/linux/fs.h
+include/linux/mbcache.h
+kernel/ksyms.c
+mm/vmscan.c
+fs/ext3/ext3-exports.c
diff --git a/lustre/kernel_patches/pc/lustre-2.5.63.pc b/lustre/kernel_patches/pc/lustre-2.5.63.pc
new file mode 100644 (file)
index 0000000..daeea17
--- /dev/null
@@ -0,0 +1,12 @@
+arch/um/kernel/mem.c
+fs/namei.c
+fs/nfsd/vfs.c
+fs/sysfs/inode.c
+include/linux/dcache.h
+include/linux/fs.h
+include/linux/namei.h
+include/linux/slab.h
+kernel/ksyms.c
+mm/slab.c
+net/unix/af_unix.c
+fs/dcache.c
diff --git a/lustre/kernel_patches/pc/mcore-2.4.20-8.pc b/lustre/kernel_patches/pc/mcore-2.4.20-8.pc
new file mode 100644 (file)
index 0000000..b290f60
--- /dev/null
@@ -0,0 +1,34 @@
+Makefile
+Documentation/Configure.help
+arch/i386/config.in
+arch/i386/vmlinux.lds
+arch/i386/boot/setup.S
+arch/i386/kernel/Makefile
+arch/i386/kernel/crash.c
+arch/i386/kernel/nmi.c
+arch/i386/kernel/process.c
+arch/i386/kernel/setup.c
+arch/i386/kernel/smp.c
+arch/i386/kernel/traps.c
+drivers/char/misc.c
+drivers/char/sysrq.c
+include/asm-i386/bootimg.h
+include/asm-i386/crash.h
+include/linux/bootimg.h
+include/linux/crash.h
+include/linux/mm.h
+include/linux/reboot.h
+include/linux/sysctl.h
+init/main.c
+kernel/Makefile
+kernel/bootimg.c
+kernel/bootimg_pic.c
+kernel/crash.c
+kernel/module.c
+kernel/panic.c
+kernel/sysctl.c
+lib/Config.in
+mm/memory.c
+mm/page_alloc.c
+arch/i386//boot/compressed/head.S
+arch/i386//kernel/head.S
diff --git a/lustre/kernel_patches/pc/patch-2.4.18-hp1_pnnl18.2.8qsnet.pc b/lustre/kernel_patches/pc/patch-2.4.18-hp1_pnnl18.2.8qsnet.pc
deleted file mode 100644 (file)
index 44d4abf..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-./include/linux/lustre_version.h
-./arch/ia64/mm/init.c
-./arch/i386/mm/init.c
-./drivers/block/blkpg.c
-./drivers/block/loop.c
-./drivers/ide/ide-disk.c
-./fs/ext3/Makefile
-./fs/ext3/super.c
-./fs/jbd/commit.c
-./fs/jbd/journal.c
-./fs/jbd/transaction.c
-./include/linux/blkdev.h
-./include/linux/slab.h
-./include/linux/jbd.h
-./kernel/ksyms.c
-./include/linux/dcache.h
-./include/linux/fs.h
-./fs/dcache.c
-./fs/nfsd/vfs.c
-./fs/namei.c
-./fs/open.c
-./fs/stat.c
-./mm/slab.c
diff --git a/lustre/kernel_patches/pc/tcp-zero-copy.pc b/lustre/kernel_patches/pc/tcp-zero-copy.pc
new file mode 100644 (file)
index 0000000..02877c0
--- /dev/null
@@ -0,0 +1,5 @@
+include/linux/skbuff.h
+include/net/tcp.h
+net/netsyms.c
+net/core/skbuff.c
+net/ipv4/tcp.c
diff --git a/lustre/kernel_patches/pc/uml-patch-2.4.20-4.pc b/lustre/kernel_patches/pc/uml-patch-2.4.20-4.pc
new file mode 100644 (file)
index 0000000..887e3fa
--- /dev/null
@@ -0,0 +1,394 @@
+CREDITS
+Documentation/Configure.help
+MAINTAINERS
+Makefile
+arch/um/Makefile
+arch/um/Makefile-i386
+arch/um/Makefile-ia64
+arch/um/Makefile-os-Linux
+arch/um/Makefile-ppc
+arch/um/Makefile-skas
+arch/um/Makefile-tt
+arch/um/common.ld.in
+arch/um/config.in
+arch/um/config.release
+arch/um/config_block.in
+arch/um/config_char.in
+arch/um/config_net.in
+arch/um/config_scsi.in
+arch/um/defconfig
+arch/um/drivers/Makefile
+arch/um/drivers/chan_kern.c
+arch/um/drivers/chan_user.c
+arch/um/drivers/daemon.h
+arch/um/drivers/daemon_kern.c
+arch/um/drivers/daemon_user.c
+arch/um/drivers/fd.c
+arch/um/drivers/harddog_kern.c
+arch/um/drivers/harddog_user.c
+arch/um/drivers/hostaudio_kern.c
+arch/um/drivers/hostaudio_user.c
+arch/um/drivers/line.c
+arch/um/drivers/mcast.h
+arch/um/drivers/mcast_kern.c
+arch/um/drivers/mcast_user.c
+arch/um/drivers/mconsole_kern.c
+arch/um/drivers/mconsole_user.c
+arch/um/drivers/mmapper_kern.c
+arch/um/drivers/net_kern.c
+arch/um/drivers/net_user.c
+arch/um/drivers/null.c
+arch/um/drivers/pcap_kern.c
+arch/um/drivers/pcap_user.c
+arch/um/drivers/pcap_user.h
+arch/um/drivers/port.h
+arch/um/drivers/port_kern.c
+arch/um/drivers/port_user.c
+arch/um/drivers/pty.c
+arch/um/drivers/slip.h
+arch/um/drivers/slip_kern.c
+arch/um/drivers/slip_proto.h
+arch/um/drivers/slip_user.c
+arch/um/drivers/slirp.h
+arch/um/drivers/slirp_kern.c
+arch/um/drivers/slirp_user.c
+arch/um/drivers/ssl.c
+arch/um/drivers/ssl.h
+arch/um/drivers/stdio_console.c
+arch/um/drivers/stdio_console.h
+arch/um/drivers/tty.c
+arch/um/drivers/ubd_kern.c
+arch/um/drivers/ubd_user.c
+arch/um/drivers/xterm.c
+arch/um/drivers/xterm.h
+arch/um/drivers/xterm_kern.c
+arch/um/dyn_link.ld.in
+arch/um/fs/Makefile
+arch/um/fs/hostfs/Makefile
+arch/um/fs/hostfs/hostfs.h
+arch/um/fs/hostfs/hostfs_kern.c
+arch/um/fs/hostfs/hostfs_user.c
+arch/um/fs/hppfs/Makefile
+arch/um/fs/hppfs/hppfs_kern.c
+arch/um/include/2_5compat.h
+arch/um/include/Makefile
+arch/um/include/chan_kern.h
+arch/um/include/chan_user.h
+arch/um/include/choose-mode.h
+arch/um/include/frame.h
+arch/um/include/frame_kern.h
+arch/um/include/frame_user.h
+arch/um/include/helper.h
+arch/um/include/hostaudio.h
+arch/um/include/init.h
+arch/um/include/initrd.h
+arch/um/include/irq_user.h
+arch/um/include/kern.h
+arch/um/include/kern_util.h
+arch/um/include/line.h
+arch/um/include/mconsole.h
+arch/um/include/mconsole_kern.h
+arch/um/include/mem.h
+arch/um/include/mem_user.h
+arch/um/include/mode.h
+arch/um/include/mode_kern.h
+arch/um/include/net_kern.h
+arch/um/include/net_user.h
+arch/um/include/os.h
+arch/um/include/process.h
+arch/um/include/ptrace_user.h
+arch/um/include/sigcontext.h
+arch/um/include/sigio.h
+arch/um/include/signal_kern.h
+arch/um/include/signal_user.h
+arch/um/include/skas_ptrace.h
+arch/um/include/syscall_user.h
+arch/um/include/sysdep-i386/checksum.h
+arch/um/include/sysdep-i386/frame.h
+arch/um/include/sysdep-i386/frame_kern.h
+arch/um/include/sysdep-i386/frame_user.h
+arch/um/include/sysdep-i386/ptrace.h
+arch/um/include/sysdep-i386/ptrace_user.h
+arch/um/include/sysdep-i386/sigcontext.h
+arch/um/include/sysdep-i386/syscalls.h
+arch/um/include/sysdep-ia64/ptrace.h
+arch/um/include/sysdep-ia64/sigcontext.h
+arch/um/include/sysdep-ia64/syscalls.h
+arch/um/include/sysdep-ppc/ptrace.h
+arch/um/include/sysdep-ppc/sigcontext.h
+arch/um/include/sysdep-ppc/syscalls.h
+arch/um/include/sysrq.h
+arch/um/include/tempfile.h
+arch/um/include/time_user.h
+arch/um/include/tlb.h
+arch/um/include/ubd_user.h
+arch/um/include/um_mmu.h
+arch/um/include/um_uaccess.h
+arch/um/include/umid.h
+arch/um/include/uml_uaccess.h
+arch/um/include/umn.h
+arch/um/include/user.h
+arch/um/include/user_util.h
+arch/um/kernel/Makefile
+arch/um/kernel/checksum.c
+arch/um/kernel/config.c.in
+arch/um/kernel/exec_kern.c
+arch/um/kernel/exitcode.c
+arch/um/kernel/frame.c
+arch/um/kernel/frame_kern.c
+arch/um/kernel/gmon_syms.c
+arch/um/kernel/gprof_syms.c
+arch/um/kernel/helper.c
+arch/um/kernel/init_task.c
+arch/um/kernel/initrd_kern.c
+arch/um/kernel/initrd_user.c
+arch/um/kernel/irq.c
+arch/um/kernel/irq_user.c
+arch/um/kernel/ksyms.c
+arch/um/kernel/mem.c
+arch/um/kernel/mem_user.c
+arch/um/kernel/mprot.h
+arch/um/kernel/process.c
+arch/um/kernel/process_kern.c
+arch/um/kernel/ptrace.c
+arch/um/kernel/reboot.c
+arch/um/kernel/resource.c
+arch/um/kernel/sigio_kern.c
+arch/um/kernel/sigio_user.c
+arch/um/kernel/signal_kern.c
+arch/um/kernel/signal_user.c
+arch/um/kernel/skas/Makefile
+arch/um/kernel/skas/exec_kern.c
+arch/um/kernel/skas/exec_user.c
+arch/um/kernel/skas/include/mmu.h
+arch/um/kernel/skas/include/mode.h
+arch/um/kernel/skas/include/mode_kern.h
+arch/um/kernel/skas/include/proc_mm.h
+arch/um/kernel/skas/include/ptrace-skas.h
+arch/um/kernel/skas/include/skas.h
+arch/um/kernel/skas/include/uaccess.h
+arch/um/kernel/skas/mem.c
+arch/um/kernel/skas/mem_user.c
+arch/um/kernel/skas/mmu.c
+arch/um/kernel/skas/process.c
+arch/um/kernel/skas/process_kern.c
+arch/um/kernel/skas/sys-i386/Makefile
+arch/um/kernel/skas/sys-i386/sigcontext.c
+arch/um/kernel/skas/syscall_kern.c
+arch/um/kernel/skas/syscall_user.c
+arch/um/kernel/skas/time.c
+arch/um/kernel/skas/tlb.c
+arch/um/kernel/skas/trap_user.c
+arch/um/kernel/skas/util/Makefile
+arch/um/kernel/skas/util/mk_ptregs.c
+arch/um/kernel/smp.c
+arch/um/kernel/sys_call_table.c
+arch/um/kernel/syscall_kern.c
+arch/um/kernel/syscall_user.c
+arch/um/kernel/sysrq.c
+arch/um/kernel/tempfile.c
+arch/um/kernel/time.c
+arch/um/kernel/time_kern.c
+arch/um/kernel/tlb.c
+arch/um/kernel/trap_kern.c
+arch/um/kernel/trap_user.c
+arch/um/kernel/tt/Makefile
+arch/um/kernel/tt/exec_kern.c
+arch/um/kernel/tt/exec_user.c
+arch/um/kernel/tt/gdb.c
+arch/um/kernel/tt/gdb_kern.c
+arch/um/kernel/tt/include/debug.h
+arch/um/kernel/tt/include/mmu.h
+arch/um/kernel/tt/include/mode.h
+arch/um/kernel/tt/include/mode_kern.h
+arch/um/kernel/tt/include/ptrace-tt.h
+arch/um/kernel/tt/include/tt.h
+arch/um/kernel/tt/include/uaccess.h
+arch/um/kernel/tt/ksyms.c
+arch/um/kernel/tt/mem.c
+arch/um/kernel/tt/mem_user.c
+arch/um/kernel/tt/process_kern.c
+arch/um/kernel/tt/ptproxy/Makefile
+arch/um/kernel/tt/ptproxy/proxy.c
+arch/um/kernel/tt/ptproxy/ptproxy.h
+arch/um/kernel/tt/ptproxy/ptrace.c
+arch/um/kernel/tt/ptproxy/sysdep.c
+arch/um/kernel/tt/ptproxy/sysdep.h
+arch/um/kernel/tt/ptproxy/wait.c
+arch/um/kernel/tt/ptproxy/wait.h
+arch/um/kernel/tt/sys-i386/Makefile
+arch/um/kernel/tt/sys-i386/sigcontext.c
+arch/um/kernel/tt/syscall_kern.c
+arch/um/kernel/tt/syscall_user.c
+arch/um/kernel/tt/time.c
+arch/um/kernel/tt/tlb.c
+arch/um/kernel/tt/tracer.c
+arch/um/kernel/tt/trap_user.c
+arch/um/kernel/tt/uaccess_user.c
+arch/um/kernel/tt/unmap.c
+arch/um/kernel/tty_log.c
+arch/um/kernel/uaccess_user.c
+arch/um/kernel/um_arch.c
+arch/um/kernel/umid.c
+arch/um/kernel/user_syms.c
+arch/um/kernel/user_util.c
+arch/um/link.ld.in
+arch/um/main.c
+arch/um/os-Linux/Makefile
+arch/um/os-Linux/drivers/Makefile
+arch/um/os-Linux/drivers/etap.h
+arch/um/os-Linux/drivers/ethertap_kern.c
+arch/um/os-Linux/drivers/ethertap_user.c
+arch/um/os-Linux/drivers/tuntap.h
+arch/um/os-Linux/drivers/tuntap_kern.c
+arch/um/os-Linux/drivers/tuntap_user.c
+arch/um/os-Linux/file.c
+arch/um/os-Linux/include/file.h
+arch/um/os-Linux/process.c
+arch/um/os-Linux/tty.c
+arch/um/sys-i386/Makefile
+arch/um/sys-i386/bugs.c
+arch/um/sys-i386/checksum.S
+arch/um/sys-i386/fault.c
+arch/um/sys-i386/ksyms.c
+arch/um/sys-i386/ldt.c
+arch/um/sys-i386/ptrace.c
+arch/um/sys-i386/ptrace_user.c
+arch/um/sys-i386/sigcontext.c
+arch/um/sys-i386/syscalls.c
+arch/um/sys-i386/sysrq.c
+arch/um/sys-i386/util/Makefile
+arch/um/sys-i386/util/mk_sc.c
+arch/um/sys-i386/util/mk_thread_kern.c
+arch/um/sys-i386/util/mk_thread_user.c
+arch/um/sys-ia64/Makefile
+arch/um/sys-ppc/Makefile
+arch/um/sys-ppc/misc.S
+arch/um/sys-ppc/miscthings.c
+arch/um/sys-ppc/ptrace.c
+arch/um/sys-ppc/ptrace_user.c
+arch/um/sys-ppc/sigcontext.c
+arch/um/sys-ppc/sysrq.c
+arch/um/util/Makefile
+arch/um/util/mk_constants_kern.c
+arch/um/util/mk_constants_user.c
+arch/um/util/mk_task_kern.c
+arch/um/util/mk_task_user.c
+drivers/char/Makefile
+drivers/char/tty_io.c
+drivers/net/setup.c
+include/asm-i386/hardirq.h
+include/asm-um/a.out.h
+include/asm-um/arch-signal-i386.h
+include/asm-um/archparam-i386.h
+include/asm-um/archparam-ppc.h
+include/asm-um/atomic.h
+include/asm-um/bitops.h
+include/asm-um/boot.h
+include/asm-um/bugs.h
+include/asm-um/byteorder.h
+include/asm-um/cache.h
+include/asm-um/checksum.h
+include/asm-um/cobalt.h
+include/asm-um/current.h
+include/asm-um/delay.h
+include/asm-um/desc.h
+include/asm-um/div64.h
+include/asm-um/dma.h
+include/asm-um/elf.h
+include/asm-um/errno.h
+include/asm-um/fcntl.h
+include/asm-um/fixmap.h
+include/asm-um/floppy.h
+include/asm-um/hardirq.h
+include/asm-um/hdreg.h
+include/asm-um/highmem.h
+include/asm-um/hw_irq.h
+include/asm-um/ide.h
+include/asm-um/init.h
+include/asm-um/io.h
+include/asm-um/ioctl.h
+include/asm-um/ioctls.h
+include/asm-um/ipc.h
+include/asm-um/ipcbuf.h
+include/asm-um/irq.h
+include/asm-um/keyboard.h
+include/asm-um/kmap_types.h
+include/asm-um/linux_logo.h
+include/asm-um/locks.h
+include/asm-um/mca_dma.h
+include/asm-um/mman.h
+include/asm-um/mmu.h
+include/asm-um/mmu_context.h
+include/asm-um/module.h
+include/asm-um/msgbuf.h
+include/asm-um/mtrr.h
+include/asm-um/namei.h
+include/asm-um/page.h
+include/asm-um/page_offset.h
+include/asm-um/param.h
+include/asm-um/pci.h
+include/asm-um/pgalloc.h
+include/asm-um/pgtable.h
+include/asm-um/poll.h
+include/asm-um/posix_types.h
+include/asm-um/processor-generic.h
+include/asm-um/processor-i386.h
+include/asm-um/processor-ppc.h
+include/asm-um/ptrace-generic.h
+include/asm-um/ptrace-i386.h
+include/asm-um/resource.h
+include/asm-um/rwlock.h
+include/asm-um/rwsem.h
+include/asm-um/scatterlist.h
+include/asm-um/segment.h
+include/asm-um/semaphore.h
+include/asm-um/sembuf.h
+include/asm-um/serial.h
+include/asm-um/shmbuf.h
+include/asm-um/shmparam.h
+include/asm-um/sigcontext-generic.h
+include/asm-um/sigcontext-i386.h
+include/asm-um/sigcontext-ppc.h
+include/asm-um/siginfo.h
+include/asm-um/signal.h
+include/asm-um/smp.h
+include/asm-um/smplock.h
+include/asm-um/socket.h
+include/asm-um/sockios.h
+include/asm-um/softirq.h
+include/asm-um/spinlock.h
+include/asm-um/stat.h
+include/asm-um/statfs.h
+include/asm-um/string.h
+include/asm-um/system-generic.h
+include/asm-um/system-i386.h
+include/asm-um/system-ppc.h
+include/asm-um/termbits.h
+include/asm-um/termios.h
+include/asm-um/timex.h
+include/asm-um/tlb.h
+include/asm-um/types.h
+include/asm-um/uaccess.h
+include/asm-um/ucontext.h
+include/asm-um/unaligned.h
+include/asm-um/unistd.h
+include/asm-um/user.h
+include/asm-um/vga.h
+include/asm-um/xor.h
+include/linux/blk.h
+include/linux/fs.h
+include/linux/hostfs_fs_i.h
+include/linux/hppfs_fs_i.h
+include/linux/kernel.h
+include/linux/kernel_stat.h
+include/linux/mm.h
+include/linux/proc_mm.h
+include/linux/tty.h
+init/do_mounts.c
+kernel/panic.c
+mm/Makefile
+mm/mmap.c
+mm/mprotect.c
+mm/proc_mm.c
+mm/slab.c
diff --git a/lustre/kernel_patches/pc/vanilla-2.4.18.pc b/lustre/kernel_patches/pc/vanilla-2.4.18.pc
deleted file mode 100644 (file)
index c1ed719..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-include/linux/lustre_version.h
-arch/ia64/mm/init.c
-arch/i386/mm/init.c
-drivers/block/blkpg.c
-drivers/block/loop.c
-drivers/ide/ide-disk.c
-fs/ext3/Makefile
-fs/ext3/super.c
-fs/jbd/commit.c
-fs/jbd/journal.c
-fs/jbd/transaction.c
-include/linux/blkdev.h
-include/linux/slab.h
-include/linux/jbd.h
-kernel/ksyms.c
-include/linux/dcache.h
-include/linux/fs.h
-fs/dcache.c
-fs/nfsd/vfs.c
-fs/namei.c
-fs/open.c
-fs/stat.c
-mm/slab.c
diff --git a/lustre/kernel_patches/pc/vanilla-2.4.19.pc b/lustre/kernel_patches/pc/vanilla-2.4.19.pc
deleted file mode 100644 (file)
index bb5c390..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-include/linux/lustre_version.h
-arch/ia64/mm/init.c
-arch/i386/mm/init.c
-drivers/block/blkpg.c
-drivers/block/loop.c
-drivers/ide/ide-disk.c
-fs/ext3/Makefile
-fs/ext3/super.c
-include/linux/blkdev.h
-include/linux/slab.h
-kernel/ksyms.c
-include/linux/dcache.h
-include/linux/fs.h
-fs/dcache.c
-fs/nfsd/vfs.c
-fs/namei.c
-fs/open.c
-fs/stat.c
-mm/slab.c
index dd2b1c8..8801aa7 100644 (file)
@@ -2,6 +2,7 @@ fs/dcache.c
 fs/namei.c
 fs/nfsd/vfs.c
 fs/open.c
+fs/proc/base.c
 fs/stat.c
 fs/exec.c
 include/linux/dcache.h
similarity index 82%
rename from lustre/kernel_patches/pc/vfs_intent.pc
rename to lustre/kernel_patches/pc/vfs_intent-2.4.20-rh.pc
index 881576c..fbe6ff1 100644 (file)
@@ -6,3 +6,5 @@ fs/stat.c
 include/linux/dcache.h
 include/linux/fs.h
 kernel/ksyms.c
+fs/exec.c
+fs/proc/base.c
@@ -1,8 +1,10 @@
+fs/exec.c
 fs/dcache.c
 fs/namei.c
 fs/nfsd/vfs.c
 fs/open.c
 fs/stat.c
+fs/proc/base.c
 include/linux/dcache.h
 include/linux/fs.h
 kernel/ksyms.c
index f512132..7d688db 100755 (executable)
@@ -67,7 +67,7 @@ REVINO=`(cd $TREE ; stat $REVERSE | awk '($3 == "Inode:") {print $4}')`
 
 [ $ABSINO != $REVINO ] && die "inodes differ, my reverse path is bad?"
 
-echo export PATCHSCRIPTS=$REVERSE
+echo export PATCHSCRIPTS_LIBDIR=$REVERSE
 
 cd $TREE
 ln -sf $REVERSE/series/$SERIES series
index 4b63598..be1c68e 100755 (executable)
@@ -5,8 +5,6 @@
        echo "Check your install, or go to the right directory"
        exit 1
 }
-
-
 do_apply()
 {
        FILES=$(cat $P/pc/$PATCH_NAME.pc)
@@ -70,7 +68,7 @@ apatch()
                echo "$PATCH_NAME" is already applied
                exit 1
        fi
-
+       
        if [ $opt_force != 0 ]
        then
                echo FORCING PATCH
@@ -78,6 +76,7 @@ apatch()
 
        if [ $opt_force != 0 ] || can_apply $P/patches/"$PATCH_NAME".patch
        then
+               check_pc_match $P/patches/"$PATCH_NAME".patch $P/pc/"$PATCH_NAME".pc 
                do_apply $P/patches/"$PATCH_NAME".patch
                add_to_db "$PATCH_NAME"
                echo applied $PATCH_NAME
diff --git a/lustre/kernel_patches/scripts/cat-series b/lustre/kernel_patches/scripts/cat-series
new file mode 100755 (executable)
index 0000000..c38b1a8
--- /dev/null
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+. patchfns 2>/dev/null ||
+. /usr/lib/patch-scripts/patchfns 2>/dev/null ||
+. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null ||
+{
+       echo "Impossible to find my library 'patchfns'."
+       echo "Check your install, or go to the right directory"
+       exit 1
+}
+
+if [ $# -eq 0 ]
+then
+       cat_series
+else
+       __cat_series $1
+fi
index 8768b29..60ab7e9 100755 (executable)
@@ -23,21 +23,23 @@ fi
 
 need_file_there applied-patches
 CURRENT=$(mktemp /tmp/cmbd-XXXXXXXX)
+APPLY_FILE=$(mktemp /tmp/cmbd-XXXXXXXX)
 for FILE in `cat applied-patches`
 do
-       NEXT=$(mktemp /tmp/cmbd-XXXXXXXX)
-       if [ -f $P/patches/$FILE ] 
+       if [ -f $P/pc/$FILE.pc ]
        then
-               combinediff $CURRENT $P/patches/$FILE > $NEXT
-       elif [ -f $P/patches/$FILE.patch ]
+               cat $P/pc/$FILE.pc >> $CURRENT  
+       elif [ -f $P/pc/$FILE ]
        then
-               combinediff $CURRENT $P/patches/$FILE.patch > $NEXT
-       elif [ -f $FILE ]
-       then
-               combinediff $CURRENT $FILE > $NEXT
-       fi
-       rm $CURRENT
-       CURRENT=$NEXT
+               cat $P/pc/$FILE >> $CURRENT     
+       fi      
+done
+cat $CURRENT | sort -u > $APPLY_FILE
+echo > $1
+for FILE in `cat $APPLY_FILE`
+do
+       diff -uNp $FILE~orig $FILE >> $1 
 done
+rm -rf $APPLY_FILE 
+rm -rf $CURRENT
 
-mv $NEXT "$1"
diff --git a/lustre/kernel_patches/scripts/forkpatch b/lustre/kernel_patches/scripts/forkpatch
new file mode 100755 (executable)
index 0000000..cef297c
--- /dev/null
@@ -0,0 +1,76 @@
+#!/bin/sh
+
+#
+# Fork the next patch in the series
+#
+
+. patchfns >/dev/null || . /usr/lib/patch-scripts/patchfns >/dev/null || { \
+       echo "Impossible to find my library 'patchfns'."
+       echo "Check your install, or go to the right directory"
+       exit 1
+}
+
+usage()
+{
+       echo "Usage: forkpatch <newname>"
+       exit 1
+}
+
+if [ $# -ne 1 ]
+then
+       usage
+fi
+
+NEW=$1
+BASE=`stripit $NEW`
+SERIES=series
+
+if [ ! -e $SERIES ]
+then
+       echo 'File "series" not found'
+       exit 1
+fi
+
+if [ -f $P/$BASE.patch ] ; then 
+        echo "Patch $NEW already exists as a file"
+        exit 1
+fi
+
+if  grep $BASE $SERIES >& /dev/null ; then 
+        echo "Patch $NEW already exists in series"
+        exit 1
+fi
+
+TMPSERIES=$(mktemp /tmp/series-XXXXXXXX)
+top=$(toppatch)
+if [ x"$top" == x ]
+then
+       todo=$(head -1 $SERIES)
+else
+       last_in_series=$(stripit $(tail -1 $SERIES))
+       if [ $last_in_series == $top ]
+       then
+               echo "Series fully applied.  Ends at $top"
+               exit 0
+       fi
+       todo=$(grep -C1 "^$top\.patch" $SERIES | tail -1)
+       if [ x$todo = x ]
+       then
+               todo=$(head -1 $SERIES)
+       fi
+fi
+
+basetodo=`stripit $todo`
+
+sed "s/$todo/$BASE.patch/" < $SERIES > $TMPSERIES
+cat $TMPSERIES > $SERIES
+rm -f $TMPSERIES
+cp -f $P/patches/$todo $P/patches/$BASE.patch
+cp -f $P/pc/$basetodo.pc $P/pc/$BASE.pc
+if [ -f $P/txt/$basetodo.txt ]; then 
+     cp -f $P/txt/$basetodo.txt $P/txt/$BASE.txt
+else 
+     echo "Warning: no documentation for $BASE"
+fi
+
+echo "Cloned $todo to $BASE"
diff --git a/lustre/kernel_patches/scripts/join-patch b/lustre/kernel_patches/scripts/join-patch
new file mode 100755 (executable)
index 0000000..065ea73
--- /dev/null
@@ -0,0 +1,28 @@
+#!/bin/sh
+
+usage()
+{
+       echo "Usage: join-patch patchname"
+       exit 1
+}
+
+if [ $# -ne 1 ]
+then
+       usage
+fi
+
+PATCHNAME=$(stripit $1)
+
+if ! can_apply $PATCHNAME
+then
+       echo Patch $PATCHNAME does not apply
+       exit 1
+fi
+
+pcpatch $PATCHNAME
+for i in $(cat $P/pc/$PATCHNAME.pc)
+do
+       fpatch $i
+done
+
+patch -p1 -i "$P/patches/$PATCHNAME.patch" -f
index b6cc468..78e494b 100644 (file)
@@ -11,9 +11,9 @@ DB=applied-patches
 # Otherwise use "."
 #
 
-if [ x$PATCHSCRIPTS != x ]
+if [ x$PATCHSCRIPTS_LIBDIR != x ]
 then
-       P=$PATCHSCRIPTS
+       P=$PATCHSCRIPTS_LIBDIR
 elif [ -d ./patch-scripts ]
 then
        P=./patch-scripts
@@ -69,7 +69,20 @@ is_applied()
                return 1
        fi
 }
-
+check_pc_match()
+{
+       if [ -f /usr/bin/lsdiff ]; then
+               tmpfile=$(mktemp /tmp/p_XXXXXX) || exit 1
+               lsdiff --strip=1 $1 > $tmpfile 
+               diff $2 $tmpfile > /dev/null
+               if [ $? != 0 ]; then
+                       echo " $1 does not match $2 "
+                       echo " $2 will be changed to match $1"
+                       cat $tmpfile > $P/pc/$PATCH_NAME.pc
+               fi
+               rm -rf $tmpfile
+       fi
+} 
 can_apply()
 {
        if patch -p1 --dry-run -i "$1" -f
@@ -166,19 +179,23 @@ copy_file_to_bup()
        file=$1
        patch=$2
        bup="$file"~"$patch"
+       orig="$file"~"orig"
+       src_dir=`pwd`
 
        if [ -e $bup ]
        then
                echo "Cannot install file $file in patch $patch: backup $bup exists"
                exit 1
        fi
-
        if [ -e $file ]
        then
-               cp $file "$file"~"$patch"
+               cp -p $file "$file"~"$patch"
        else
                echo "file $file appears to be newly added"
        fi
+       if [ ! -L "$orig" ]; then
+               ln -s "$src_dir/$bup" $orig
+       fi      
 }
 
 install_file_in_patch()
index 792cb9b..70055d6 100755 (executable)
@@ -64,7 +64,9 @@ do
        then
                if [ $STOP_AT == $(toppatch) ]
                then
+                        sum-series applied-patch
                        exit 0
                fi
        fi
 done
+sum-series applied-patch
index 018716d..6702e63 100755 (executable)
@@ -78,7 +78,9 @@ do
        then
                if [ $STOP_AT == $(toppatch) ]
                then
+                        sum-series applied-patch
                        exit 0
                fi
        fi
 done
+sum-series applied-patch
index 88f3caf..3195a57 100755 (executable)
@@ -28,4 +28,5 @@ fi
 
 TOP_PATCH=$(top_patch)
 mpatch $* $(top_patch)
+sum-series applied-patch
 echo "Refreshed $TOP_PATCH"
index 42e1533..5a8da38 100755 (executable)
@@ -6,15 +6,34 @@
        exit 1
 }
 
+# do_remove()
+# {
+#      if patch -R -p1 -s -i $P/patches/"$1".patch
+#      then
+#              true
+#      else
+#              echo SOMETHING WENT WRONG
+#              exit 1
+#      fi
+# }
+
 do_remove()
 {
-       if patch -R -p1 -s -i $P/patches/"$1".patch
-       then
-               true
-       else
-               echo SOMETHING WENT WRONG
-               exit 1
-       fi
+       FILES=$(cat $P/pc/$1.pc)
+       for file in $FILES ; do
+           base_dir=`pwd`      
+           if [ -L "$file"~"orig" ]; then
+               if [ `readlink "$file"~"orig"` = "$base_dir/""$file"~"$1" ]; then
+                   rm -rf "$file"~"orig"
+               fi
+           fi 
+           if [ -f "$file"~"$1" ]; then
+               mv -f "$file"~"$1" "$file"
+            else
+               rm -f "$file"
+            fi
+        done
+        true
 }
 
 kill_old_ones()
@@ -40,18 +59,20 @@ fi
 PATCH_NAME=$(stripit $1)
 
 warn_top_current
-
 if is_applied "$PATCH_NAME"
 then
-       if can_remove "$PATCH_NAME"
-       then
+#      if can_remove "$PATCH_NAME"
+#      then
+               if [ ! -f $P/pc/$PATCH_NAME.pc ]; then
+                       exit 1
+               fi
                do_remove "$PATCH_NAME"
                kill_old_ones "$PATCH_NAME"
                remove_from_db "$PATCH_NAME"
-       else
-               echo "$PATCH_NAME" does not remove cleanly
-               exit 1
-       fi
+#      else
+#              echo "$PATCH_NAME" does not remove cleanly
+#              exit 1
+#      fi
 else
        echo "$PATCH_NAME" is not applied
        exit 1
diff --git a/lustre/kernel_patches/scripts/sum-series b/lustre/kernel_patches/scripts/sum-series
new file mode 100755 (executable)
index 0000000..5b628fb
--- /dev/null
@@ -0,0 +1,41 @@
+#!/bin/sh
+
+#
+# Make superpatch from current series using combinediff.
+#
+
+. patchfns >/dev/null || . /usr/lib/patch-scripts/patchfns >/dev/null || { \
+       echo "Impossible to find my library 'patchfns'."
+       echo "Check your install, or go to the right directory"
+       exit 1
+}
+
+usage()
+{
+       echo "Usage: sum-series output-file"
+       exit 1
+}
+
+if [ $# -ne 1 ] 
+then
+       usage
+fi
+
+need_file_there applied-patches
+CURRENT=$(mktemp /tmp/cmbd-XXXXXXXX)
+for FILE in $(cat applied-patches)
+do
+#    echo "Adding patch $FILE...."
+       if [ -f $P/patches/$FILE ] 
+       then
+               cat  $P/patches/$FILE >> $CURRENT
+       elif [ -f $P/patches/$FILE.patch ]
+       then
+               cat $P/patches/$FILE.patch >> $CURRENT
+       elif [ -f $FILE ]
+       then
+               cat $FILE >> $CURRENT
+       fi
+done
+
+mv $CURRENT "$1"
diff --git a/lustre/kernel_patches/scripts/trypatch b/lustre/kernel_patches/scripts/trypatch
new file mode 100755 (executable)
index 0000000..2e3cd15
--- /dev/null
@@ -0,0 +1,72 @@
+#!/bin/sh
+
+#
+# Fork the next patch in the series
+#
+
+. patchfns >/dev/null || . /usr/lib/patch-scripts/patchfns >/dev/null || { \
+       echo "Impossible to find my library 'patchfns'."
+       echo "Check your install, or go to the right directory"
+       exit 1
+}
+
+usage()
+{
+       echo "Usage: trypatch <newname>"
+       exit 1
+}
+
+if [ $# -ne 1 ]
+then
+       usage
+fi
+
+NEW=$1
+BASE=`stripit $NEW`
+SERIES=series
+
+if [ ! -e $SERIES ]
+then
+       echo 'File "series" not found'
+       exit 1
+fi
+
+if  grep $BASE $SERIES >& /dev/null  ; then 
+        echo "Patch $NEW already exists in series"
+        exit 1
+fi
+
+if [ ! -f $P/patches/$BASE.patch ] ; then 
+        echo "Patch $NEW doesn't exist as a file"
+        exit 1
+fi
+
+TMPSERIES=$(mktemp /tmp/series-XXXXXXXX)
+top=$(toppatch)
+if [ x"$top" == x ]
+then
+       todo=$(head -1 $SERIES)
+else
+       last_in_series=$(stripit $(tail -1 $SERIES))
+       if [ $last_in_series == $top ]
+       then
+               echo "Series fully applied.  Ends at $top"
+               exit 0
+       fi
+       todo=$(grep -C1 "^$top\.patch" $SERIES | tail -1)
+       if [ x$todo = x ]
+       then
+               todo=$(head -1 $SERIES)
+       fi
+fi
+
+if  patch -p1 -i $P/patches/$BASE.patch ; then 
+    patch -R -p1 -i $P/patches/$BASE.patch
+
+    basetodo=$(basename $todo)
+    sed "s/$todo/$BASE/" < $SERIES > $TMPSERIES
+    mv -f $TMPSERIES $SERIES
+    echo "Replaced $todo with $BASE"
+else 
+    echo "Failed to replace $todo with $BASE"
+fi
diff --git a/lustre/kernel_patches/scripts/unused-patches b/lustre/kernel_patches/scripts/unused-patches
new file mode 100755 (executable)
index 0000000..2f3a70a
--- /dev/null
@@ -0,0 +1,39 @@
+#!/bin/sh
+
+#
+# List unused patches
+#
+
+. patchfns >/dev/null || . /usr/lib/patch-scripts/patchfns >/dev/null || { \
+       echo "Impossible to find my library 'patchfns'."
+       echo "Check your install, or go to the right directory"
+       exit 1
+}
+
+usage()
+{
+       echo "Usage: unused-patches"
+       exit 1
+}
+
+if [ $# -ne 0 ] 
+then
+       usage
+fi
+
+for FILE in $(ls $P/patches)
+do
+        BASE=`stripit $FILE`
+#       echo checking $BASE in $P/patches
+       if  grep $FILE $P/series/*  >&  /dev/null ; then 
+                true
+#                echo $FILE found in $P/series
+        else 
+            if [ $BASE != CVS ]; then
+                echo patches/$FILE
+                echo txt/$BASE.txt
+                echo pc/$BASE.pc
+            fi
+       fi
+done
+
index 913ae18..00ae7fd 100644 (file)
@@ -5,3 +5,16 @@ lustre_version.patch
 vfs_intent-2.4.18-18.patch
 invalidate_show.patch
 iod-rmap-exports.patch
+export-truncate.patch
+htree-ext3-2.4.18.patch
+linux-2.4.18ea-0.8.26.patch
+ext3-2.4-ino_t.patch
+ext3-2.4.18-ino_sb_macro.patch
+ext3-orphan_lock.patch
+ext3-delete_thread-2.4.18.patch
+extN-misc-fixup.patch
+extN-noread.patch
+extN-wantedi.patch
+extN-san.patch
+extN-2.4.18-ino_sb_fixup.patch
+iopen-2.4.18.patch
diff --git a/lustre/kernel_patches/series/hp-pnnl b/lustre/kernel_patches/series/hp-pnnl
deleted file mode 100644 (file)
index bf276fb..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-dev_read_only_hp.patch
-exports_hp.patch
-kmem_cache_validate_hp.patch
-jbd-transno-cb.patch
-lustre_version.patch
-vfs_intent_hp.patch
-invalidate_show.patch
-iod-stock-24-exports_hp.patch
diff --git a/lustre/kernel_patches/series/hp-pnnl-2.4.20 b/lustre/kernel_patches/series/hp-pnnl-2.4.20
new file mode 100644 (file)
index 0000000..b951209
--- /dev/null
@@ -0,0 +1,25 @@
+dev_read_only_hp_2.4.20.patch
+exports_2.4.20-rh-hp.patch
+kmem_cache_validate_hp.patch
+lustre_version.patch
+vfs_intent-2.4.20-vanilla.patch
+invalidate_show.patch
+export-truncate.patch
+iod-stock-24-exports_hp.patch
+ext-2.4-patch-1.patch
+ext-2.4-patch-2.patch
+ext-2.4-patch-3.patch
+ext-2.4-patch-4.patch
+linux-2.4.20-xattr-0.8.54-hp.patch
+ext3-2.4.20-fixes.patch
+ext3-2.4-ino_t.patch
+ext3-largefile.patch
+ext3-truncate_blocks.patch
+ext3-use-after-free.patch
+ext3-orphan_lock.patch
+ext3-delete_thread-2.4.20.patch
+ext3-noread-2.4.20.patch
+extN-wantedi.patch
+ext3-san-2.4.20.patch
+iopen-2.4.20.patch
+tcp-zero-copy.patch
index 51a833f..df7f536 100644 (file)
@@ -7,4 +7,18 @@ uml_no_panic.patch
 vfs_intent-2.4.18-18.patch
 uml_compile_fixes.patch
 invalidate_show.patch
+export-truncate.patch
 iod-rmap-exports.patch
+htree-ext3-2.4.18.patch
+linux-2.4.18ea-0.8.26.patch
+ext3-2.4-ino_t.patch
+ext3-2.4.18-ino_sb_macro.patch
+ext3-orphan_lock.patch
+ext3-delete_thread-2.4.18.patch
+extN-misc-fixup.patch
+extN-noread.patch
+extN-wantedi.patch
+extN-san.patch
+extN-2.4.18-ino_sb_fixup.patch
+iopen-2.4.18.patch
+tcp-zero-copy.patch
diff --git a/lustre/kernel_patches/series/rh-2.4.20 b/lustre/kernel_patches/series/rh-2.4.20
new file mode 100644 (file)
index 0000000..a97c37c
--- /dev/null
@@ -0,0 +1,23 @@
+mcore-2.4.20-8.patch
+dsp.patch
+dev_read_only_2.4.20-rh.patch
+exports_2.4.20-rh-hp.patch
+kmem_cache_validate_2.4.20-rh.patch
+lustre_version.patch
+vfs_intent-2.4.20-rh.patch
+invalidate_show-2.4.20-rh.patch
+iod-rmap-exports-2.4.20.patch
+export-truncate.patch
+ext-2.4-patch-1-chaos.patch
+ext-2.4-patch-2.patch
+ext-2.4-patch-3.patch
+ext-2.4-patch-4.patch
+linux-2.4.20-xattr-0.8.54-chaos.patch
+ext3-2.4.20-fixes.patch
+ext3_orphan_lock-2.4.20-rh.patch
+ext3-delete_thread-2.4.20.patch
+ext3-noread-2.4.20.patch
+extN-wantedi.patch
+ext3-san-2.4.20.patch
+iopen-2.4.20.patch
+tcp-zero-copy.patch
diff --git a/lustre/kernel_patches/series/vanilla-2.4.18 b/lustre/kernel_patches/series/vanilla-2.4.18
deleted file mode 100644 (file)
index 5d2ab68..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-vanilla-2.4.18
-invalidate_show.patch
diff --git a/lustre/kernel_patches/series/vanilla-2.4.19 b/lustre/kernel_patches/series/vanilla-2.4.19
deleted file mode 100644 (file)
index 37cb65e..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-vanilla-2.4.19.patch
-jbd-transno-cb.patch
-invalidate_show.patch
diff --git a/lustre/kernel_patches/series/vanilla-2.4.20 b/lustre/kernel_patches/series/vanilla-2.4.20
new file mode 100644 (file)
index 0000000..e56cac6
--- /dev/null
@@ -0,0 +1,29 @@
+uml-patch-2.4.20-4.patch
+dev_read_only_2.4.20.patch
+exports_2.4.20.patch
+kmem_cache_validate_2.4.20.patch
+lustre_version.patch
+vfs_intent-2.4.20-vanilla.patch
+invalidate_show.patch
+export-truncate.patch
+iod-stock-24-exports.patch
+uml_check_get_page.patch
+uml_no_panic.patch
+ext-2.4-patch-1.patch
+ext-2.4-patch-2.patch
+ext-2.4-patch-3.patch
+ext-2.4-patch-4.patch
+linux-2.4.20-xattr-0.8.54.patch
+ext3-2.4.20-fixes.patch
+ext3-2.4-ino_t.patch
+ext3-largefile.patch
+ext3-truncate_blocks.patch
+ext3-unmount_sync.patch
+ext3-use-after-free.patch
+ext3-orphan_lock.patch
+ext3-noread-2.4.20.patch
+ext3-delete_thread-2.4.20.patch
+extN-wantedi.patch
+ext3-san-2.4.20.patch
+iopen-2.4.20.patch
+tcp-zero-copy.patch
diff --git a/lustre/kernel_patches/txt/ext3-2.4.20-fixes.txt b/lustre/kernel_patches/txt/ext3-2.4.20-fixes.txt
new file mode 100644 (file)
index 0000000..b890cbd
--- /dev/null
@@ -0,0 +1,3 @@
+DESC
+Fix for block allocation errors if block bitmap or inode block list is corrupt.
+EDESC
diff --git a/lustre/kernel_patches/txt/vfs_intent.txt b/lustre/kernel_patches/txt/vfs_intent.txt
deleted file mode 100644 (file)
index 010cdb7..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-DESC
-(undescribed patch)
-EDESC
index b7af3d9..2ef001d 100644 (file)
@@ -1,10 +1,13 @@
-series/chaos
-   chaos-39
-series/rh-2.4.18-18
-   redhat 2.4.18-18
-series/hp-pnnl              ** Note: functionally equivalent to 2.4.19 
-   linux-2.4.18-hp2_pnnl2
-series/vanilla-2.4.19       ** Not officially supported
-   linux-2.4.19
-series/lin-2.5.44
-   uml-2.5.44
+SERIES               MNEMONIC                  COMMENT
+
+hp-pnnl-2.4.20       linux-2.4.20-hp4_pnnl1    same as vanilla but no uml
+vanilla-2.4.20       linux-2.4.20              patch includes uml
+chaos-2.4.20         linux-chaos-2.4.20        same as rh-2.4.20-8
+rh-2.4.20            linux-rh-2.4.20-8         same as chaos-2.4.20
+rh-2.4.18-18         linux-rh-2.4.18-18        same as chaos but includes uml
+chaos                linux-chaos-2.4.18        same as rh-2.4.18-18 but no uml
+
+REVIEW:
+
+vanilla-2.5          linux-2.5.63
+hp-pnnl              linux-2.4.19-hp2_pnnl6
index d0c4199..1ceb276 100644 (file)
@@ -5,7 +5,7 @@
 
 DEFS= 
 
-LDLMSOURCES= l_lock.c ldlm_lock.c ldlm_resource.c  \
+LDLMSOURCES= l_lock.c ldlm_lock.c ldlm_resource.c ldlm_lib.c \
 ldlm_extent.c ldlm_request.c ldlm_lockd.c
 
 if LIBLUSTRE
@@ -20,5 +20,3 @@ ldlm_SOURCES = $(LDLMSOURCES)
 endif
 
 include $(top_srcdir)/Rules
-
-
index 9b10854..f6a9f5e 100644 (file)
 #include <linux/obd_support.h>
 #include <linux/lustre_lib.h>
 
-/* This function will be called to judge if the granted queue of another child
- * (read: another extent) is conflicting and needs its granted queue walked to
- * issue callbacks.
- *
- * This helps to find conflicts between read and write locks on overlapping
- * extents. */
+/* This function will be called to judge if one extent overlaps with another */
 int ldlm_extent_compat(struct ldlm_lock *a, struct ldlm_lock *b)
 {
-        if (MAX(a->l_extent.start, b->l_extent.start) <=
-            MIN(a->l_extent.end, b->l_extent.end))
+        if ((a->l_extent.start <= b->l_extent.end) &&
+            (a->l_extent.end >=  b->l_extent.start))
                 RETURN(0);
 
         RETURN(1);
@@ -48,7 +43,7 @@ int ldlm_extent_compat(struct ldlm_lock *a, struct ldlm_lock *b)
 /* The purpose of this function is to return:
  * - the maximum extent
  * - containing the requested extent
- * - and not overlapping existing extents outside the requested one
+ * - and not overlapping existing conflicting extents outside the requested one
  *
  * An alternative policy is to not shrink the new extent when conflicts exist.
  *
@@ -62,21 +57,33 @@ static void policy_internal(struct list_head *queue, struct ldlm_extent *req_ex,
                 struct ldlm_lock *lock;
                 lock = list_entry(tmp, struct ldlm_lock, l_res_link);
 
-                if (lock->l_extent.end < req_ex->start) {
-                        new_ex->start = MIN(lock->l_extent.end, new_ex->start);
-                } else {
-                        if (lock->l_extent.start < req_ex->start &&
-                            !lockmode_compat(lock->l_req_mode, mode))
-                                /* Policy: minimize conflict overlap */
+                /* if lock doesn't overlap new_ex, skip it. */
+                if (lock->l_extent.end < new_ex->start ||
+                    lock->l_extent.start > new_ex->end)
+                        continue;
+
+                /* Locks are compatible, overlap doesn't matter */
+                if (lockmode_compat(lock->l_req_mode, mode))
+                        continue;
+
+                if (lock->l_extent.start < req_ex->start) {
+                        if (lock->l_extent.end == ~0) {
                                 new_ex->start = req_ex->start;
+                                new_ex->end = req_ex->end;
+                                return;
+                        }
+                        new_ex->start = MIN(lock->l_extent.end + 1,
+                                            req_ex->start);
                 }
-                if (lock->l_extent.start > req_ex->end) {
-                        new_ex->end = MAX(lock->l_extent.start, new_ex->end);
-                } else {
-                        if (lock->l_extent.end > req_ex->end &&
-                            !lockmode_compat(lock->l_req_mode, mode))
-                                /* Policy: minimize conflict overlap */
+
+                if (lock->l_extent.end > req_ex->end) {
+                        if (lock->l_extent.start == 0) {
+                                new_ex->start = req_ex->start;
                                 new_ex->end = req_ex->end;
+                                return;
+                        }
+                        new_ex->end = MAX(lock->l_extent.start - 1,
+                                          req_ex->end);
                 }
         }
 }
@@ -104,8 +111,9 @@ int ldlm_extent_policy(struct ldlm_namespace *ns, struct ldlm_lock **lockp,
 
         memcpy(&lock->l_extent, &new_ex, sizeof(new_ex));
 
-        LDLM_DEBUG(lock, "new extent "LPU64" -> "LPU64, new_ex.start,
-                   new_ex.end);
+        LDLM_DEBUG(lock, "requested extent ["LPU64"->"LPU64"], new extent ["
+                   LPU64"->"LPU64"]",
+                   req_ex->start, req_ex->end, new_ex.start, new_ex.end);
 
         if (new_ex.end != req_ex->end || new_ex.start != req_ex->start)
                 return ELDLM_LOCK_CHANGED;
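
The ldlm_extent.c hunks above replace the MAX()/MIN() comparison in ldlm_extent_compat() with a direct overlap test (the function returns 0 when the two extents overlap and 1 when they are disjoint), and rework policy_internal() so the granted extent is only shrunk away from conflicting locks. A minimal standalone sketch of that overlap test, using a simplified stand-in struct rather than the Lustre ldlm_extent/ldlm_lock types (all names below are illustrative only):

/* Illustrative sketch only: simplified stand-ins for LDLM extents,
 * not the Lustre data structures. */
#include <stdio.h>
#include <stdint.h>

struct ext { uint64_t start, end; };    /* inclusive range, like ldlm_extent */

/* Two inclusive ranges overlap iff each starts at or before the other ends;
 * the patched ldlm_extent_compat() returns the inverse (0 on overlap). */
static int ext_overlap(const struct ext *a, const struct ext *b)
{
        return a->start <= b->end && a->end >= b->start;
}

int main(void)
{
        struct ext a = { 0, 4095 }, b = { 4096, 8191 }, c = { 1024, 6000 };

        printf("a,b overlap: %d\n", ext_overlap(&a, &b));      /* 0 */
        printf("a,c overlap: %d\n", ext_overlap(&a, &c));      /* 1 */
        printf("b,c overlap: %d\n", ext_overlap(&b, &c));      /* 1 */
        return 0;
}

For inclusive [start, end] ranges this is exactly the "neither extent ends before the other begins" condition that the new compat test encodes.
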
diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h
new file mode 100644 (file)
index 0000000..b8bfdac
--- /dev/null
@@ -0,0 +1 @@
+int ldlm_cancel_lru(struct ldlm_namespace *ns);
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c
new file mode 100644 (file)
index 0000000..735e383
--- /dev/null
@@ -0,0 +1,883 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define EXPORT_SYMTAB
+#define DEBUG_SUBSYSTEM S_LDLM
+
+#ifdef __KERNEL__
+# include <linux/module.h>
+#else
+# include <liblustre.h>
+#endif
+#include <linux/obd_ost.h>
+#include <linux/lustre_dlm.h>
+#include <linux/lustre_mds.h>
+#include <linux/lustre_net.h>
+
+int client_import_connect(struct lustre_handle *dlm_handle, 
+                          struct obd_device *obd,
+                          struct obd_uuid *cluuid)
+{
+        struct client_obd *cli = &obd->u.cli;
+        struct obd_import *imp = cli->cl_import;
+        struct obd_export *exp;
+        struct ptlrpc_request *request;
+        /* XXX maybe this is a good time to create a connect struct? */
+        int rc, size[] = {sizeof(imp->imp_target_uuid),
+                          sizeof(obd->obd_uuid),
+                          sizeof(*dlm_handle)};
+        char *tmp[] = {imp->imp_target_uuid.uuid,
+                       obd->obd_uuid.uuid,
+                       (char *)dlm_handle};
+        int rq_opc = (obd->obd_type->typ_ops->o_brw) ? OST_CONNECT :MDS_CONNECT;
+        int msg_flags;
+
+        ENTRY;
+        down(&cli->cl_sem);
+        rc = class_connect(dlm_handle, obd, cluuid);
+        if (rc)
+                GOTO(out_sem, rc);
+
+        cli->cl_conn_count++;
+        if (cli->cl_conn_count > 1)
+                GOTO(out_sem, rc);
+
+        if (obd->obd_namespace != NULL)
+                CERROR("already have namespace!\n");
+        obd->obd_namespace = ldlm_namespace_new(obd->obd_name,
+                                                LDLM_NAMESPACE_CLIENT);
+        if (obd->obd_namespace == NULL)
+                GOTO(out_disco, rc = -ENOMEM);
+
+        request = ptlrpc_prep_req(imp, rq_opc, 3, size, tmp);
+        if (!request)
+                GOTO(out_ldlm, rc = -ENOMEM);
+
+        request->rq_level = LUSTRE_CONN_NEW;
+        request->rq_replen = lustre_msg_size(0, NULL);
+
+        imp->imp_dlm_handle = *dlm_handle;
+
+        imp->imp_level = LUSTRE_CONN_CON;
+        rc = ptlrpc_queue_wait(request);
+        if (rc) {
+                class_disconnect(dlm_handle, 0);
+                GOTO(out_req, rc);
+        }
+
+        exp = class_conn2export(dlm_handle);
+        exp->exp_connection = ptlrpc_connection_addref(request->rq_connection);
+        class_export_put(exp);
+
+        msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
+        if (rq_opc == MDS_CONNECT || msg_flags & MSG_CONNECT_REPLAYABLE) {
+                imp->imp_replayable = 1;
+                CDEBUG(D_HA, "connected to replayable target: %s\n",
+                       imp->imp_target_uuid.uuid);
+        }
+        imp->imp_level = LUSTRE_CONN_FULL;
+        imp->imp_remote_handle = request->rq_repmsg->handle;
+        CDEBUG(D_HA, "local import: %p, remote handle: "LPX64"\n", imp,
+               imp->imp_remote_handle.cookie);
+
+        EXIT;
+out_req:
+        ptlrpc_req_finished(request);
+        if (rc) {
+out_ldlm:
+                ldlm_namespace_free(obd->obd_namespace);
+                obd->obd_namespace = NULL;
+out_disco:
+                cli->cl_conn_count--;
+                class_disconnect(dlm_handle, 0);
+        }
+out_sem:
+        up(&cli->cl_sem);
+        return rc;
+}
+
+int client_import_disconnect(struct lustre_handle *dlm_handle, int failover)
+{
+        struct obd_device *obd = class_conn2obd(dlm_handle);
+        struct client_obd *cli = &obd->u.cli;
+        struct obd_import *imp = cli->cl_import;
+        struct ptlrpc_request *request = NULL;
+        int rc = 0, err, rq_opc;
+        ENTRY;
+
+        if (!obd) {
+                CERROR("invalid connection for disconnect: cookie "LPX64"\n",
+                       dlm_handle ? dlm_handle->cookie : -1UL);
+                RETURN(-EINVAL);
+        }
+
+        rq_opc = obd->obd_type->typ_ops->o_brw ? OST_DISCONNECT:MDS_DISCONNECT;
+        down(&cli->cl_sem);
+        if (!cli->cl_conn_count) {
+                CERROR("disconnecting disconnected device (%s)\n",
+                       obd->obd_name);
+                GOTO(out_sem, rc = -EINVAL);
+        }
+
+        cli->cl_conn_count--;
+        if (cli->cl_conn_count)
+                GOTO(out_no_disconnect, rc = 0);
+
+        if (obd->obd_namespace != NULL) {
+                /* obd_no_recov == local only */
+                ldlm_cli_cancel_unused(obd->obd_namespace, NULL,
+                                       obd->obd_no_recov, NULL);
+                ldlm_namespace_free(obd->obd_namespace);
+                obd->obd_namespace = NULL;
+        }
+
+        /* Yeah, obd_no_recov also (mainly) means "forced shutdown". */
+        if (obd->obd_no_recov) {
+                ptlrpc_abort_inflight(imp);
+        } else {
+                request = ptlrpc_prep_req(imp, rq_opc, 0, NULL, NULL);
+                if (!request)
+                        GOTO(out_req, rc = -ENOMEM);
+
+                request->rq_replen = lustre_msg_size(0, NULL);
+
+                /* Process disconnects even if we're waiting for recovery. */
+                request->rq_level = LUSTRE_CONN_RECOVD;
+
+                rc = ptlrpc_queue_wait(request);
+                if (rc)
+                        GOTO(out_req, rc);
+        }
+        EXIT;
+ out_req:
+        if (request)
+                ptlrpc_req_finished(request);
+ out_no_disconnect:
+        err = class_disconnect(dlm_handle, 0);
+        if (!rc && err)
+                rc = err;
+ out_sem:
+        up(&cli->cl_sem);
+        RETURN(rc);
+}
+
+/* --------------------------------------------------------------------------
+ * from old lib/target.c
+ * -------------------------------------------------------------------------- */
+
+int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp,
+                            struct obd_uuid *cluuid)
+{
+        if (exp->exp_connection) {
+                struct lustre_handle *hdl;
+                hdl = &exp->exp_ldlm_data.led_import->imp_remote_handle;
+                /* Might be a re-connect after a partition. */
+                if (!memcmp(&conn->cookie, &hdl->cookie, sizeof conn->cookie)) {
+                        CERROR("%s reconnecting\n", cluuid->uuid);
+                        conn->cookie = exp->exp_handle.h_cookie;
+                        RETURN(EALREADY);
+                } else {
+                        CERROR("%s reconnecting from %s, "
+                               "handle mismatch (ours "LPX64", theirs "
+                               LPX64")\n", cluuid->uuid,
+                               exp->exp_connection->c_remote_uuid.uuid,
+                               hdl->cookie, conn->cookie);
+                        /* XXX disconnect them here? */
+                        memset(conn, 0, sizeof *conn);
+                        /* This is a little scary, but right now we build this
+                         * file separately into each server module, so I won't
+                         * go _immediately_ to hell.
+                         */
+                        RETURN(-EALREADY);
+                }
+        }
+
+        conn->cookie = exp->exp_handle.h_cookie;
+        CDEBUG(D_INFO, "existing export for UUID '%s' at %p\n",
+               cluuid->uuid, exp);
+        CDEBUG(D_IOCTL,"connect: cookie "LPX64"\n", conn->cookie);
+        RETURN(0);
+}
+
+int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
+{
+        struct obd_device *target;
+        struct obd_export *export = NULL;
+        struct obd_import *dlmimp;
+        struct lustre_handle conn;
+        struct obd_uuid tgtuuid;
+        struct obd_uuid cluuid;
+        struct obd_uuid remote_uuid;
+        struct list_head *p;
+        char *str, *tmp;
+        int rc, i, abort_recovery;
+        ENTRY;
+
+        LASSERT_REQSWAB (req, 0);
+        str = lustre_msg_string (req->rq_reqmsg, 0, sizeof (tgtuuid.uuid) - 1);
+        if (str == NULL) {
+                CERROR("bad target UUID for connect\n");
+                GOTO(out, rc = -EINVAL);
+        }
+        obd_str2uuid (&tgtuuid, str);
+
+        LASSERT_REQSWAB (req, 1);
+        str = lustre_msg_string (req->rq_reqmsg, 1, sizeof (cluuid.uuid) - 1);
+        if (str == NULL) {
+                CERROR("bad client UUID for connect\n");
+                GOTO(out, rc = -EINVAL);
+        }
+        obd_str2uuid (&cluuid, str);
+
+        i = class_uuid2dev(&tgtuuid);
+        if (i == -1) {
+                CERROR("UUID '%s' not found for connect\n", tgtuuid.uuid);
+                GOTO(out, rc = -ENODEV);
+        }
+
+        target = &obd_dev[i];
+        if (!target || target->obd_stopping || !target->obd_set_up) {
+                CERROR("UUID '%s' is not available for connect\n", str);
+                GOTO(out, rc = -ENODEV);
+        }
+
+        /* XXX extract a nettype and format accordingly */
+        snprintf(remote_uuid.uuid, sizeof remote_uuid,
+                 "NET_"LPX64"_UUID", req->rq_peer.peer_nid);
+
+        spin_lock_bh(&target->obd_processing_task_lock);
+        abort_recovery = target->obd_abort_recovery;
+        spin_unlock_bh(&target->obd_processing_task_lock);
+        if (abort_recovery)
+                target_abort_recovery(target);
+
+        tmp = lustre_msg_buf(req->rq_reqmsg, 2, sizeof conn);
+        if (tmp == NULL)
+                GOTO(out, rc = -EPROTO);
+
+        memcpy(&conn, tmp, sizeof conn);
+
+        rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg);
+        if (rc)
+                GOTO(out, rc);
+
+        /* lctl gets a backstage, all-access pass. */
+        if (obd_uuid_equals(&cluuid, &lctl_fake_uuid))
+                goto dont_check_exports;
+
+        spin_lock(&target->obd_dev_lock);
+        list_for_each(p, &target->obd_exports) {
+                export = list_entry(p, struct obd_export, exp_obd_chain);
+                if (obd_uuid_equals(&cluuid, &export->exp_client_uuid)) {
+                        spin_unlock(&target->obd_dev_lock);
+                        LASSERT(export->exp_obd == target);
+
+                        rc = target_handle_reconnect(&conn, export, &cluuid);
+                        break;
+                }
+                export = NULL;
+        }
+        /* If we found an export, we already unlocked. */
+        if (!export)
+                spin_unlock(&target->obd_dev_lock);
+
+        /* Tell the client if we're in recovery. */
+        /* If this is the first client, start the recovery timer */
+        if (target->obd_recovering) {
+                lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING);
+                target_start_recovery_timer(target, handler);
+        }
+
+        /* Tell the client if we support replayable requests */
+        if (target->obd_replayable)
+                lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_REPLAYABLE);
+
+        if (export == NULL) {
+                if (target->obd_recovering) {
+                        CERROR("denying connection for new client %s: "
+                               "in recovery\n", cluuid.uuid);
+                        rc = -EBUSY;
+                } else {
+ dont_check_exports:
+                        rc = obd_connect(&conn, target, &cluuid);
+                }
+        }
+
+        /* If all else goes well, this is our RPC return code. */
+        req->rq_status = 0;
+
+        if (rc && rc != EALREADY)
+                GOTO(out, rc);
+
+        req->rq_repmsg->handle = conn;
+
+        /* If the client and the server are the same node, we will already
+         * have an export that really points to the client's DLM export,
+         * because we have a shared handles table.
+         *
+         * XXX this will go away when shaver stops sending the "connect" handle
+         * in the real "remote handle" field of the request --phik 24 Apr 2003
+         */
+        if (req->rq_export != NULL)
+                class_export_put(req->rq_export);
+
+        /* ownership of this export ref transfers to the request */
+        export = req->rq_export = class_conn2export(&conn);
+        LASSERT(export != NULL);
+
+        if (req->rq_connection != NULL)
+                ptlrpc_put_connection(req->rq_connection);
+        if (export->exp_connection != NULL)
+                ptlrpc_put_connection(export->exp_connection);
+        export->exp_connection = ptlrpc_get_connection(&req->rq_peer,
+                                                       &remote_uuid);
+        req->rq_connection = ptlrpc_connection_addref(export->exp_connection);
+
+        if (rc == EALREADY) {
+                /* We indicate the reconnection in a flag, not an error code. */
+                lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT);
+                GOTO(out, rc = 0);
+        }
+
+        memcpy(&conn, lustre_msg_buf(req->rq_reqmsg, 2, sizeof conn),
+               sizeof conn);
+
+        if (export->exp_ldlm_data.led_import != NULL)
+                class_destroy_import(export->exp_ldlm_data.led_import);
+        dlmimp = export->exp_ldlm_data.led_import = class_new_import();
+        dlmimp->imp_connection = ptlrpc_connection_addref(req->rq_connection);
+        dlmimp->imp_client = &export->exp_obd->obd_ldlm_client;
+        dlmimp->imp_remote_handle = conn;
+        dlmimp->imp_obd = target;
+        dlmimp->imp_dlm_fake = 1;
+        dlmimp->imp_level = LUSTRE_CONN_FULL;
+        class_import_put(dlmimp);
+out:
+        if (rc)
+                req->rq_status = rc;
+        RETURN(rc);
+}
+
+int target_handle_disconnect(struct ptlrpc_request *req)
+{
+        struct lustre_handle *conn = &req->rq_reqmsg->handle;
+        struct obd_import *dlmimp;
+        int rc;
+        ENTRY;
+
+        rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg);
+        if (rc)
+                RETURN(rc);
+
+        req->rq_status = obd_disconnect(conn, 0);
+
+        dlmimp = req->rq_export->exp_ldlm_data.led_import;
+        class_destroy_import(dlmimp);
+
+        class_export_put(req->rq_export);
+        req->rq_export = NULL;
+        RETURN(0);
+}
+
+/*
+ * Recovery functions
+ */
+
+void target_cancel_recovery_timer(struct obd_device *obd)
+{
+        del_timer(&obd->obd_recovery_timer);
+}
+
+static void abort_delayed_replies(struct obd_device *obd)
+{
+        struct ptlrpc_request *req;
+        struct list_head *tmp, *n;
+        list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) {
+                req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                DEBUG_REQ(D_ERROR, req, "aborted:");
+                req->rq_status = -ENOTCONN;
+                req->rq_type = PTL_RPC_MSG_ERR;
+                ptlrpc_reply(req);
+                list_del(&req->rq_list);
+                OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
+                OBD_FREE(req, sizeof *req);
+        }
+}
+
+static void abort_recovery_queue(struct obd_device *obd)
+{
+        struct ptlrpc_request *req;
+        struct list_head *tmp, *n;
+        int rc;
+
+        list_for_each_safe(tmp, n, &obd->obd_recovery_queue) {
+                req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                DEBUG_REQ(D_ERROR, req, "aborted:");
+                req->rq_status = -ENOTCONN;
+                req->rq_type = PTL_RPC_MSG_ERR;
+                rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen,
+                                     &req->rq_repmsg);
+                if (rc == 0) {
+                        ptlrpc_reply(req);
+                } else {
+                        DEBUG_REQ(D_ERROR, req,
+                                  "packing failed for abort-reply; skipping");
+                }
+                list_del(&req->rq_list);
+                class_export_put(req->rq_export);
+                OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
+                OBD_FREE(req, sizeof *req);
+        }
+}
+
+void target_abort_recovery(void *data)
+{
+        struct obd_device *obd = data;
+
+        CERROR("disconnecting clients and aborting recovery\n");
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        if (!obd->obd_recovering) {
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+                EXIT;
+                return;
+        }
+
+        obd->obd_recovering = obd->obd_abort_recovery = 0;
+        obd->obd_recoverable_clients = 0;
+        wake_up(&obd->obd_next_transno_waitq);
+        target_cancel_recovery_timer(obd);
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+        class_disconnect_exports(obd, 0);
+        abort_delayed_replies(obd);
+        abort_recovery_queue(obd);
+}
+
+static void target_recovery_expired(unsigned long castmeharder)
+{
+        struct obd_device *obd = (struct obd_device *)castmeharder;
+        CERROR("recovery timed out, aborting\n");
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        obd->obd_abort_recovery = 1;
+        wake_up(&obd->obd_next_transno_waitq);
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+}
+
+static void reset_recovery_timer(struct obd_device *obd)
+{
+        int recovering;
+        spin_lock(&obd->obd_dev_lock);
+        recovering = obd->obd_recovering;
+        spin_unlock(&obd->obd_dev_lock);
+
+        if (!recovering)
+                return;
+        CDEBUG(D_ERROR, "timer will expire in %ld seconds\n",
+               OBD_RECOVERY_TIMEOUT / HZ);
+        mod_timer(&obd->obd_recovery_timer, jiffies + OBD_RECOVERY_TIMEOUT);
+}
+
+
+/* Only start it the first time called */
+void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler)
+{
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        if (obd->obd_recovery_handler) {
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+                return;
+        }
+        CERROR("%s: starting recovery timer\n", obd->obd_name);
+        obd->obd_recovery_handler = handler;
+        obd->obd_recovery_timer.function = target_recovery_expired;
+        obd->obd_recovery_timer.data = (unsigned long)obd;
+        init_timer(&obd->obd_recovery_timer);
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+
+        reset_recovery_timer(obd);
+}
+
+static int check_for_next_transno(struct obd_device *obd)
+{
+        struct ptlrpc_request *req;
+        int wake_up;
+
+        req = list_entry(obd->obd_recovery_queue.next,
+                         struct ptlrpc_request, rq_list);
+        LASSERT(req->rq_reqmsg->transno >= obd->obd_next_recovery_transno);
+
+        wake_up = req->rq_reqmsg->transno == obd->obd_next_recovery_transno ||
+                (obd->obd_recovering) == 0;
+        CDEBUG(D_HA, "check_for_next_transno: "LPD64" vs "LPD64", %d == %d\n",
+               req->rq_reqmsg->transno, obd->obd_next_recovery_transno,
+               obd->obd_recovering, wake_up);
+        return wake_up;
+}
+
+static void process_recovery_queue(struct obd_device *obd)
+{
+        struct ptlrpc_request *req;
+        int abort_recovery = 0;
+        struct l_wait_info lwi = { 0 };
+        ENTRY;
+
+        for (;;) {
+                spin_lock_bh(&obd->obd_processing_task_lock);
+                LASSERT(obd->obd_processing_task == current->pid);
+                req = list_entry(obd->obd_recovery_queue.next,
+                                 struct ptlrpc_request, rq_list);
+
+                if (req->rq_reqmsg->transno != obd->obd_next_recovery_transno) {
+                        spin_unlock_bh(&obd->obd_processing_task_lock);
+                        CDEBUG(D_HA, "Waiting for transno "LPD64" (1st is "
+                               LPD64")\n",
+                               obd->obd_next_recovery_transno,
+                               req->rq_reqmsg->transno);
+                        l_wait_event(obd->obd_next_transno_waitq,
+                                     check_for_next_transno(obd), &lwi);
+                        spin_lock_bh(&obd->obd_processing_task_lock);
+                        abort_recovery = obd->obd_abort_recovery;
+                        spin_unlock_bh(&obd->obd_processing_task_lock);
+                        if (abort_recovery) {
+                                target_abort_recovery(obd);
+                                return;
+                        }
+                        continue;
+                }
+                list_del_init(&req->rq_list);
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+
+                DEBUG_REQ(D_ERROR, req, "processing: ");
+                (void)obd->obd_recovery_handler(req);
+                reset_recovery_timer(obd);
+#warning FIXME: mds_fsync_super(mds->mds_sb);
+                class_export_put(req->rq_export);
+                OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
+                OBD_FREE(req, sizeof *req);
+                spin_lock_bh(&obd->obd_processing_task_lock);
+                obd->obd_next_recovery_transno++;
+                if (list_empty(&obd->obd_recovery_queue)) {
+                        obd->obd_processing_task = 0;
+                        spin_unlock_bh(&obd->obd_processing_task_lock);
+                        break;
+                }
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+        }
+        EXIT;
+}
+
+int target_queue_recovery_request(struct ptlrpc_request *req,
+                                  struct obd_device *obd)
+{
+        struct list_head *tmp;
+        int inserted = 0;
+        __u64 transno = req->rq_reqmsg->transno;
+        struct ptlrpc_request *saved_req;
+        struct lustre_msg *reqmsg;
+
+        /* CAVEAT EMPTOR: The incoming request message has been swabbed
+         * (i.e. buflens etc are in my own byte order), but type-dependent
+         * buffers (eg mds_body, ost_body etc) have NOT been swabbed. */
+
+        if (!transno) {
+                INIT_LIST_HEAD(&req->rq_list);
+                DEBUG_REQ(D_HA, req, "not queueing");
+                return 1;
+        }
+
+        /* XXX If I were a real man, these LBUGs would be sane cleanups. */
+        /* XXX just like the request-dup code in queue_final_reply */
+        OBD_ALLOC(saved_req, sizeof *saved_req);
+        if (!saved_req)
+                LBUG();
+        OBD_ALLOC(reqmsg, req->rq_reqlen);
+        if (!reqmsg)
+                LBUG();
+
+        spin_lock_bh(&obd->obd_processing_task_lock);
+
+        /* If we're processing the queue, we don't want to queue this
+         * message.
+         *
+         * Also, if this request has a transno less than the one we're waiting
+         * for, we should process it now.  It could (and currently always will)
+         * be an open request for a descriptor that was opened some time ago.
+         */
+        if (obd->obd_processing_task == current->pid ||
+            transno < obd->obd_next_recovery_transno) {
+                /* Processing the queue right now, don't re-add. */
+                LASSERT(list_empty(&req->rq_list));
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+                OBD_FREE(reqmsg, req->rq_reqlen);
+                OBD_FREE(saved_req, sizeof *saved_req);
+                return 1;
+        }
+
+        memcpy(saved_req, req, sizeof *req);
+        memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
+        req = saved_req;
+        req->rq_reqmsg = reqmsg;
+        class_export_get(req->rq_export);
+        INIT_LIST_HEAD(&req->rq_list);
+
+        /* XXX O(n^2) */
+        list_for_each(tmp, &obd->obd_recovery_queue) {
+                struct ptlrpc_request *reqiter =
+                        list_entry(tmp, struct ptlrpc_request, rq_list);
+
+                if (reqiter->rq_reqmsg->transno > transno) {
+                        list_add_tail(&req->rq_list, &reqiter->rq_list);
+                        inserted = 1;
+                        break;
+                }
+        }
+
+        if (!inserted) {
+                list_add_tail(&req->rq_list, &obd->obd_recovery_queue);
+        }
+
+        if (obd->obd_processing_task != 0) {
+                /* Someone else is processing this queue, we'll leave it to
+                 * them.
+                 */
+                if (transno == obd->obd_next_recovery_transno)
+                        wake_up(&obd->obd_next_transno_waitq);
+                spin_unlock_bh(&obd->obd_processing_task_lock);
+                return 0;
+        }
+
+        /* Nobody is processing, and we know there's (at least) one to process
+         * now, so we'll do the honours.
+         */
+        obd->obd_processing_task = current->pid;
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+
+        process_recovery_queue(obd);
+        return 0;
+}
+
+struct obd_device * target_req2obd(struct ptlrpc_request *req)
+{
+        return req->rq_export->exp_obd;
+}
+
+int target_queue_final_reply(struct ptlrpc_request *req, int rc)
+{
+        struct obd_device *obd = target_req2obd(req);
+        struct ptlrpc_request *saved_req;
+        struct lustre_msg *reqmsg;
+        int recovery_done = 0;
+
+        if (rc) {
+                /* Just like ptlrpc_error, but without the sending. */
+                lustre_pack_msg(0, NULL, NULL, &req->rq_replen,
+                                &req->rq_repmsg);
+                req->rq_type = PTL_RPC_MSG_ERR;
+        }
+
+        LASSERT(list_empty(&req->rq_list));
+        /* XXX just like the request-dup code in queue_recovery_request */
+        OBD_ALLOC(saved_req, sizeof *saved_req);
+        if (!saved_req)
+                LBUG();
+        OBD_ALLOC(reqmsg, req->rq_reqlen);
+        if (!reqmsg)
+                LBUG();
+        memcpy(saved_req, req, sizeof *saved_req);
+        memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen);
+        req = saved_req;
+        req->rq_reqmsg = reqmsg;
+        list_add(&req->rq_list, &obd->obd_delayed_reply_queue);
+
+        spin_lock_bh(&obd->obd_processing_task_lock);
+        --obd->obd_recoverable_clients;
+        recovery_done = (obd->obd_recoverable_clients == 0);
+        spin_unlock_bh(&obd->obd_processing_task_lock);
+
+        if (recovery_done) {
+                struct list_head *tmp, *n;
+                ldlm_reprocess_all_ns(req->rq_export->exp_obd->obd_namespace);
+                CDEBUG(D_ERROR,
+                       "%s: all clients recovered, sending delayed replies\n",
+                       obd->obd_name);
+                obd->obd_recovering = 0;
+                list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) {
+                        req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                        DEBUG_REQ(D_ERROR, req, "delayed:");
+                        ptlrpc_reply(req);
+                        list_del(&req->rq_list);
+                        OBD_FREE(req->rq_reqmsg, req->rq_reqlen);
+                        OBD_FREE(req, sizeof *req);
+                }
+                target_cancel_recovery_timer(obd);
+        } else {
+                CERROR("%s: %d recoverable clients remain\n",
+                       obd->obd_name, obd->obd_recoverable_clients);
+        }
+
+        return 1;
+}
+
+static void ptlrpc_abort_reply (struct ptlrpc_request *req)
+{
+        /* On return, we must be sure that the ACK callback has either
+         * happened or will not happen.  Note that the SENT callback will
+         * happen come what may since we successfully posted the PUT. */
+        int rc;
+        struct l_wait_info lwi;
+        unsigned long flags;
+
+ again:
+        /* serialise with ACK callback */
+        spin_lock_irqsave (&req->rq_lock, flags);
+        if (!req->rq_want_ack) {
+                spin_unlock_irqrestore (&req->rq_lock, flags);
+                /* The ACK callback has happened already.  Although the
+                 * SENT callback might still be outstanding (yes really) we
+                 * don't care; this is just like normal completion. */
+                return;
+        }
+        spin_unlock_irqrestore (&req->rq_lock, flags);
+
+        /* Have a bash at unlinking the MD.  This will fail until the SENT
+         * callback has happened since the MD is busy from the PUT.  If the
+         * ACK still hasn't arrived after then, a successful unlink will
+         * ensure the ACK callback never happens. */
+        rc = PtlMDUnlink (req->rq_reply_md_h);
+        switch (rc) {
+        default:
+                LBUG ();
+        case PTL_OK:
+                /* SENT callback happened; ACK callback preempted */
+                LASSERT (req->rq_want_ack);
+                spin_lock_irqsave (&req->rq_lock, flags);
+                req->rq_want_ack = 0;
+                spin_unlock_irqrestore (&req->rq_lock, flags);
+                return;
+        case PTL_INV_MD:
+                return;
+        case PTL_MD_INUSE:
+                /* Still sending or ACK callback in progress: wait until
+                 * either callback has completed and try again.
+                 * Actually we can't wait for the SENT callback because
+                 * there's no state the SENT callback can touch that will
+                 * allow it to communicate with us!  So we just wait here
+                 * for a short time, effectively polling for the SENT
+                 * callback by calling PtlMDUnlink() again, to see if it
+                 * has finished.  Note that if the ACK does arrive, its
+                 * callback wakes us in short order. --eeb */
+                lwi = LWI_TIMEOUT (HZ/4, NULL, NULL);
+                rc = l_wait_event(req->rq_wait_for_rep, !req->rq_want_ack,
+                                  &lwi);
+                CDEBUG (D_HA, "Retrying req %p: %d\n", req, rc);
+                 * NB go back and test rq_want_ack with locking, to ensure that
+                 * if the ACK callback happened, it has completed and stopped
+                 * referencing this req. */
+                goto again;
+        }
+}
+
+void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
+{
+        int i;
+        int netrc;
+        unsigned long flags;
+        struct ptlrpc_req_ack_lock *ack_lock;
+        struct l_wait_info lwi = { 0 };
+        wait_queue_t commit_wait;
+        struct obd_device *obd =
+                req->rq_export ? req->rq_export->exp_obd : NULL;
+        struct obd_export *exp =
+                (req->rq_export && req->rq_ack_locks[0].mode) ?
+                req->rq_export : NULL;
+
+        if (exp) {
+                exp->exp_outstanding_reply = req;
+                spin_lock_irqsave (&req->rq_lock, flags);
+                req->rq_want_ack = 1;
+                spin_unlock_irqrestore (&req->rq_lock, flags);
+        }
+
+        if (!OBD_FAIL_CHECK(fail_id | OBD_FAIL_ONCE)) {
+                if (rc) {
+                        DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc);
+                        netrc = ptlrpc_error(req);
+                } else {
+                        DEBUG_REQ(D_NET, req, "sending reply");
+                        netrc = ptlrpc_reply(req);
+                }
+        } else {
+                obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED;
+                DEBUG_REQ(D_ERROR, req, "dropping reply");
+                if (!exp && req->rq_repmsg) {
+                        OBD_FREE(req->rq_repmsg, req->rq_replen);
+                        req->rq_repmsg = NULL;
+                }
+                init_waitqueue_head(&req->rq_wait_for_rep);
+                netrc = 0;
+        }
+
+        /* a failed send simulates the callbacks */
+        LASSERT(netrc == 0 || req->rq_want_ack == 0);
+        if (exp == NULL) {
+                LASSERT(req->rq_want_ack == 0);
+                return;
+        }
+        LASSERT(obd != NULL);
+
+        init_waitqueue_entry(&commit_wait, current);
+        add_wait_queue(&obd->obd_commit_waitq, &commit_wait);
+        rc = l_wait_event(req->rq_wait_for_rep,
+                          !req->rq_want_ack || req->rq_resent ||
+                          req->rq_transno <= obd->obd_last_committed, &lwi);
+        remove_wait_queue(&obd->obd_commit_waitq, &commit_wait);
+
+        spin_lock_irqsave (&req->rq_lock, flags);
+        /* If we got here because the ACK callback ran, this acts as a
+         * barrier to ensure the callback completed the wakeup. */
+        spin_unlock_irqrestore (&req->rq_lock, flags);
+
+        /* If we committed the transno already, then we might wake up before
+         * the ack arrives.  We need to stop waiting for the ack before we can
+         * reuse this request structure.  We are guaranteed by this point that
+         * this cannot abort the sending of the actual reply.*/
+        ptlrpc_abort_reply(req);
+
+        if (req->rq_resent) {
+                DEBUG_REQ(D_HA, req, "resent: not cancelling locks");
+                return;
+        }
+
+        LASSERT(rc == 0);
+        DEBUG_REQ(D_HA, req, "cancelling locks for %s",
+                  req->rq_want_ack ? "commit" : "ack");
+
+        exp->exp_outstanding_reply = NULL;
+
+        for (ack_lock = req->rq_ack_locks, i = 0; i < 4; i++, ack_lock++) {
+                if (!ack_lock->mode)
+                        break;
+                ldlm_lock_decref(&ack_lock->lock, ack_lock->mode);
+        }
+}
+
+int target_handle_ping(struct ptlrpc_request *req)
+{
+        return lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg);
+}
index 81cc428..62272fa 100644 (file)
 #define DEBUG_SUBSYSTEM S_LDLM
 
 #ifdef __KERNEL__
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/lustre_dlm.h>
-#include <linux/lustre_mds.h>
+# include <linux/slab.h>
+# include <linux/module.h>
+# include <linux/lustre_dlm.h>
+# include <linux/lustre_mds.h>
 #else
-#include <liblustre.h>
-#include <linux/kp30.h>
+# include <liblustre.h>
+# include <linux/kp30.h>
 #endif
 
 #include <linux/obd_class.h>
+#include "ldlm_internal.h"
 
 //struct lustre_lock ldlm_everything_lock;
 
@@ -154,7 +155,7 @@ void ldlm_lock_put(struct ldlm_lock *lock)
                 if (lock->l_parent)
                         LDLM_LOCK_PUT(lock->l_parent);
 
-                PORTAL_SLAB_FREE(lock, ldlm_lock_slab, sizeof(*lock));
+                OBD_SLAB_FREE(lock, ldlm_lock_slab, sizeof(*lock));
                 l_unlock(&ns->ns_lock);
         }
 
@@ -248,7 +249,7 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_lock *parent,
         if (resource == NULL)
                 LBUG();
 
-        PORTAL_SLAB_ALLOC(lock, ldlm_lock_slab, sizeof(*lock));
+        OBD_SLAB_ALLOC(lock, ldlm_lock_slab, SLAB_KERNEL, sizeof(*lock));
         if (lock == NULL)
                 RETURN(NULL);
 
@@ -318,7 +319,6 @@ int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock,
 
 void ldlm_lock2handle(struct ldlm_lock *lock, struct lustre_handle *lockh)
 {
-        POISON(&lockh->addr, 0x69, sizeof(lockh->addr));
         lockh->cookie = lock->l_handle.h_cookie;
 }
 
@@ -447,10 +447,6 @@ void ldlm_lock_addref_internal(struct ldlm_lock *lock, __u32 mode)
         LDLM_DEBUG(lock, "ldlm_lock_addref(%s)", ldlm_lockname[mode]);
 }
 
-/* Args: unlocked lock */
-int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
-                                    struct ldlm_res_id, int flags);
-
 void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode)
 {
         struct ldlm_namespace *ns;
@@ -484,17 +480,14 @@ void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode)
                                "warning\n");
 
                 LDLM_DEBUG(lock, "final decref done on cbpending lock");
-
-                if (lock->l_blocking_ast == NULL) {
-                        /* The lock wasn't even fully formed; just destroy it */
-                        ldlm_lock_destroy(lock);
-                }
                 l_unlock(&ns->ns_lock);
 
                 /* FIXME: need a real 'desc' here */
                 if (lock->l_blocking_ast != NULL)
                         lock->l_blocking_ast(lock, NULL, lock->l_data,
                                              LDLM_CB_BLOCKING);
+                else
+                        LDLM_DEBUG(lock, "No blocking AST?");
         } else if (ns->ns_client && !lock->l_readers && !lock->l_writers) {
                 /* If this is a client-side namespace and this was the last
                  * reference, put it on the LRU. */
@@ -533,8 +526,8 @@ void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode)
         LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]);
         l_lock(&lock->l_resource->lr_namespace->ns_lock);
         lock->l_flags |= LDLM_FL_CBPENDING;
-        ldlm_lock_decref_internal(lock, mode);
         l_unlock(&lock->l_resource->lr_namespace->ns_lock);
+        ldlm_lock_decref_internal(lock, mode);
         LDLM_LOCK_PUT(lock);
 }
 
@@ -630,7 +623,17 @@ static struct ldlm_lock *search_queue(struct list_head *queue, ldlm_mode_t mode,
                 if (lock == old_lock)
                         break;
 
-                if (lock->l_flags & LDLM_FL_CBPENDING)
+                /* llite sometimes wants to match locks that will be
+                 * canceled when their users drop, but we allow it to match
+                 * if it passes in CBPENDING and the lock still has users.
+                 * This is generally only going to be used by children
+                 * whose parents already hold a lock so forward progress
+                 * can still happen. */
+                if (lock->l_flags & LDLM_FL_CBPENDING &&
+                    !(flags & LDLM_FL_CBPENDING))
+                        continue;
+                if (lock->l_flags & LDLM_FL_CBPENDING &&
+                    lock->l_readers == 0 && lock->l_writers == 0)
                         continue;
 
                 if (lock->l_req_mode != mode)
@@ -666,6 +669,9 @@ static struct ldlm_lock *search_queue(struct list_head *queue, ldlm_mode_t mode,
  *     server (ie, connh is NULL)
  * If 'flags' contains LDLM_FL_BLOCK_GRANTED, then only locks on the granted
  *     list will be considered
+ * If 'flags' contains LDLM_FL_CBPENDING, then locks that have been marked
+ *     to be canceled can still be matched as long as they still have reader
+ *     or writer references
  *
  * Returns 1 if it finds an already-existing lock that is compatible; in this
  * case, lockh is filled in with a addref()ed lock
@@ -710,14 +716,15 @@ int ldlm_lock_match(struct ldlm_namespace *ns, int flags,
                 GOTO(out, rc = 1);
 
         EXIT;
-      out:
+ out:
         ldlm_resource_putref(res);
         l_unlock(&ns->ns_lock);
 
         if (lock) {
                 ldlm_lock2handle(lock, lockh);
                 if (lock->l_completion_ast)
-                        lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC, NULL);
+                        lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC,
+                                               NULL);
         }
         if (rc)
                 LDLM_DEBUG(lock, "matched");
@@ -734,7 +741,9 @@ int ldlm_lock_match(struct ldlm_namespace *ns, int flags,
 struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns,
                                    struct lustre_handle *parent_lock_handle,
                                    struct ldlm_res_id res_id, __u32 type,
-                                   ldlm_mode_t mode, void *data, void *cp_data)
+                                   ldlm_mode_t mode,
+                                   ldlm_blocking_callback blocking,
+                                   void *data)
 {
         struct ldlm_resource *res, *parent_res = NULL;
         struct ldlm_lock *lock, *parent_lock = NULL;
@@ -760,7 +769,7 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns,
 
         lock->l_req_mode = mode;
         lock->l_data = data;
-        lock->l_cp_data = cp_data;
+        lock->l_blocking_ast = blocking;
 
         RETURN(lock);
 }
@@ -769,8 +778,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
                                struct ldlm_lock **lockp,
                                void *cookie, int cookie_len,
                                int *flags,
-                               ldlm_completion_callback completion,
-                               ldlm_blocking_callback blocking)
+                               ldlm_completion_callback completion)
 {
         struct ldlm_resource *res;
         struct ldlm_lock *lock = *lockp;
@@ -779,7 +787,6 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
         ENTRY;
 
         res = lock->l_resource;
-        lock->l_blocking_ast = blocking;
 
         if (res->lr_type == LDLM_EXTENT)
                 memcpy(&lock->l_extent, cookie, sizeof(lock->l_extent));
@@ -867,12 +874,6 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns,
                 *flags |= LDLM_FL_BLOCK_GRANTED;
                 GOTO(out, ELDLM_OK);
         }
-
-        if (lock->l_granted_cb != NULL && lock->l_data != NULL) {
-                /* We just -know- */
-                struct ptlrpc_request *req = lock->l_data;
-                lock->l_granted_cb(lock, req->rq_repmsg, 0);
-        }
         ldlm_grant_lock(lock, NULL, 0);
         EXIT;
       out:
@@ -994,11 +995,14 @@ void ldlm_cancel_callback(struct ldlm_lock *lock)
         l_lock(&lock->l_resource->lr_namespace->ns_lock);
         if (!(lock->l_flags & LDLM_FL_CANCEL)) {
                 lock->l_flags |= LDLM_FL_CANCEL;
-                if (lock->l_blocking_ast)
+                if (lock->l_blocking_ast) {
+                        l_unlock(&lock->l_resource->lr_namespace->ns_lock);
                         lock->l_blocking_ast(lock, NULL, lock->l_data,
                                              LDLM_CB_CANCELING);
-                else
+                        return;
+                } else {
                         LDLM_DEBUG(lock, "no blocking ast");
+                }
         }
         l_unlock(&lock->l_resource->lr_namespace->ns_lock);
 }
@@ -1023,7 +1027,7 @@ void ldlm_lock_cancel(struct ldlm_lock *lock)
                 LBUG();
         }
 
-        ldlm_cancel_callback(lock);
+        ldlm_cancel_callback(lock); /* XXX FIXME bug 1030 */
 
         ldlm_resource_unlink_lock(lock);
         ldlm_lock_destroy(lock);
@@ -1031,7 +1035,7 @@ void ldlm_lock_cancel(struct ldlm_lock *lock)
         EXIT;
 }
 
-int ldlm_lock_set_data(struct lustre_handle *lockh, void *data, void *cp_data)
+int ldlm_lock_set_data(struct lustre_handle *lockh, void *data)
 {
         struct ldlm_lock *lock = ldlm_handle2lock(lockh);
         ENTRY;
@@ -1040,7 +1044,6 @@ int ldlm_lock_set_data(struct lustre_handle *lockh, void *data, void *cp_data)
                 RETURN(-EINVAL);
 
         lock->l_data = data;
-        lock->l_cp_data = cp_data;
 
         LDLM_LOCK_PUT(lock);
 
@@ -1118,6 +1121,7 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
 void ldlm_lock_dump(int level, struct ldlm_lock *lock)
 {
         char ver[128];
+        struct obd_device *obd;
 
         if (!((portal_debug | D_ERROR) & level))
                 return;
@@ -1136,13 +1140,21 @@ void ldlm_lock_dump(int level, struct ldlm_lock *lock)
 
         CDEBUG(level, "  -- Lock dump: %p (%s) (rc: %d)\n", lock, ver,
                atomic_read(&lock->l_refc));
-        if (lock->l_export && lock->l_export->exp_connection)
+        obd = class_conn2obd(lock->l_connh);
+        if (lock->l_export && lock->l_export->exp_connection) {
                 CDEBUG(level, "  Node: NID "LPX64" on %s (rhandle: "LPX64")\n",
                        lock->l_export->exp_connection->c_peer.peer_nid,
                        lock->l_export->exp_connection->c_peer.peer_ni->pni_name,
                        lock->l_remote_handle.cookie);
-        else
+        } else if (obd == NULL) {
                 CDEBUG(level, "  Node: local\n");
+        } else {
+                struct obd_import *imp = obd->u.cli.cl_import;
+                CDEBUG(level, "  Node: NID "LPX64" on %s (rhandle: "LPX64")\n",
+                       imp->imp_connection->c_peer.peer_nid,
+                       imp->imp_connection->c_peer.peer_ni->pni_name,
+                       lock->l_remote_handle.cookie);
+        }
         CDEBUG(level, "  Parent: %p\n", lock->l_parent);
         CDEBUG(level, "  Resource: %p ("LPD64")\n", lock->l_resource,
                lock->l_resource->lr_name.name[0]);
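
The search_queue() change above makes a lock with LDLM_FL_CBPENDING set matchable only when the caller passes LDLM_FL_CBPENDING in 'flags' and the lock still holds reader or writer references; otherwise such locks are skipped. The same two skip conditions pulled out as a small predicate, with a toy lock struct and a made-up flag value rather than the real LDLM definitions:

/* Illustrative sketch only: the CBPENDING skip logic from search_queue(),
 * with a hypothetical flag value and a toy lock struct. */
#include <stdio.h>

#define FL_CBPENDING 0x1        /* made-up stand-in for LDLM_FL_CBPENDING */

struct toy_lock {
        unsigned int flags;
        int          readers;
        int          writers;
};

/* Returns 1 if the lock should be skipped during matching. */
static int skip_for_cbpending(const struct toy_lock *lock,
                              unsigned int match_flags)
{
        /* CBPENDING lock, but the caller did not ask to match such locks. */
        if ((lock->flags & FL_CBPENDING) && !(match_flags & FL_CBPENDING))
                return 1;
        /* CBPENDING lock with no remaining users: cancellation is imminent. */
        if ((lock->flags & FL_CBPENDING) &&
            lock->readers == 0 && lock->writers == 0)
                return 1;
        return 0;
}

int main(void)
{
        struct toy_lock busy = { FL_CBPENDING, 1, 0 };
        struct toy_lock idle = { FL_CBPENDING, 0, 0 };

        printf("%d\n", skip_for_cbpending(&busy, FL_CBPENDING));  /* 0: matchable */
        printf("%d\n", skip_for_cbpending(&busy, 0));             /* 1: skipped */
        printf("%d\n", skip_for_cbpending(&idle, FL_CBPENDING));  /* 1: skipped */
        return 0;
}
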
index dafcb6e..9d2857e 100644 (file)
 # include <linux/module.h>
 # include <linux/slab.h>
 # include <linux/init.h>
+# include <linux/wait.h>
 #else
 # include <liblustre.h>
 #endif
 
 #include <linux/lustre_dlm.h>
 #include <linux/obd_class.h>
-
 extern kmem_cache_t *ldlm_resource_slab;
 extern kmem_cache_t *ldlm_lock_slab;
 extern struct lustre_lock ldlm_handle_lock;
@@ -42,6 +42,10 @@ extern struct list_head ldlm_namespace_list;
 extern int (*mds_reint_p)(int offset, struct ptlrpc_request *req);
 extern int (*mds_getattr_name_p)(int offset, struct ptlrpc_request *req);
 
+static int ldlm_already_setup = 0;
+
+#ifdef __KERNEL__
+
 inline unsigned long round_timeout(unsigned long timeout)
 {
         return ((timeout / HZ) + 1) * HZ;
@@ -51,23 +55,103 @@ inline unsigned long round_timeout(unsigned long timeout)
 static struct list_head waiting_locks_list;
 static spinlock_t waiting_locks_spinlock;
 static struct timer_list waiting_locks_timer;
-static int ldlm_already_setup = 0;
+
+static struct expired_lock_thread {
+        wait_queue_head_t         elt_waitq;
+        int                       elt_state;
+        struct list_head          elt_expired_locks;
+        spinlock_t                elt_lock;
+} expired_lock_thread;
+
+#define ELT_STOPPED   0
+#define ELT_READY     1
+#define ELT_TERMINATE 2
+
+static inline int have_expired_locks(void)
+{
+        int need_to_run;
+
+        spin_lock_bh(&expired_lock_thread.elt_lock);
+        need_to_run = !list_empty(&expired_lock_thread.elt_expired_locks);
+        spin_unlock_bh(&expired_lock_thread.elt_lock);
+
+        RETURN(need_to_run);
+}
+
+static int expired_lock_main(void *arg)
+{
+        struct list_head *expired = &expired_lock_thread.elt_expired_locks;
+        struct l_wait_info lwi = { 0 };
+        unsigned long flags;
+
+        ENTRY;
+        lock_kernel();
+        kportal_daemonize("ldlm_elt");
+        
+        SIGNAL_MASK_LOCK(current, flags);
+        sigfillset(&current->blocked);
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
+        
+        unlock_kernel();
+        
+        expired_lock_thread.elt_state = ELT_READY;
+        wake_up(&expired_lock_thread.elt_waitq);
+        
+        while (1) {
+                l_wait_event(expired_lock_thread.elt_waitq,
+                             have_expired_locks() ||
+                             expired_lock_thread.elt_state == ELT_TERMINATE,
+                             &lwi);
+
+                spin_lock_bh(&expired_lock_thread.elt_lock);
+                while (!list_empty(expired)) {
+                        struct ldlm_lock *lock = list_entry(expired->next,
+                                                            struct ldlm_lock,
+                                                            l_pending_chain);
+                        spin_unlock_bh(&expired_lock_thread.elt_lock);
+                        
+                        ptlrpc_fail_export(lock->l_export);
+
+                        spin_lock_bh(&expired_lock_thread.elt_lock);
+                }
+                spin_unlock_bh(&expired_lock_thread.elt_lock);
+
+                if (expired_lock_thread.elt_state == ELT_TERMINATE)
+                        break;
+        }
+
+        expired_lock_thread.elt_state = ELT_STOPPED;
+        wake_up(&expired_lock_thread.elt_waitq);
+        RETURN(0);
+}
 
 static void waiting_locks_callback(unsigned long unused)
 {
-        struct list_head *liter, *n;
+        struct ldlm_lock *lock;
 
         spin_lock_bh(&waiting_locks_spinlock);
-        list_for_each_safe(liter, n, &waiting_locks_list) {
-                struct ldlm_lock *l = list_entry(liter, struct ldlm_lock,
-                                                 l_pending_chain);
-                if (l->l_callback_timeout > jiffies)
+        while (!list_empty(&waiting_locks_list)) {
+                lock = list_entry(waiting_locks_list.next, struct ldlm_lock,
+                                  l_pending_chain);
+
+                if (lock->l_callback_timeout > jiffies)
                         break;
-                CERROR("lock timer expired, lock %p\n", l);
-                LDLM_DEBUG(l, "timer expired, recovering exp %p on conn %p",
-                           l->l_export, l->l_export->exp_connection);
-                recovd_conn_fail(l->l_export->exp_connection);
+
+                LDLM_ERROR(lock, "lock callback timer expired: evicting client "
+                           "%s@%s nid "LPU64,
+                           lock->l_export->exp_client_uuid.uuid,
+                           lock->l_export->exp_connection->c_remote_uuid.uuid,
+                           lock->l_export->exp_connection->c_peer.peer_nid);
+
+                spin_lock_bh(&expired_lock_thread.elt_lock);
+                list_del(&lock->l_pending_chain);
+                list_add(&lock->l_pending_chain,
+                         &expired_lock_thread.elt_expired_locks);
+                spin_unlock_bh(&expired_lock_thread.elt_lock);
+                wake_up(&expired_lock_thread.elt_waitq);
         }
+
         spin_unlock_bh(&waiting_locks_spinlock);
 }
 
@@ -80,8 +164,8 @@ static void waiting_locks_callback(unsigned long unused)
 static int ldlm_add_waiting_lock(struct ldlm_lock *lock)
 {
         unsigned long timeout_rounded;
-        ENTRY;
 
+        LDLM_DEBUG(lock, "adding to wait list");
         LASSERT(list_empty(&lock->l_pending_chain));
 
         spin_lock_bh(&waiting_locks_spinlock);
@@ -95,7 +179,9 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock)
         }
         list_add_tail(&lock->l_pending_chain, &waiting_locks_list); /* FIFO */
         spin_unlock_bh(&waiting_locks_spinlock);
-        RETURN(1);
+        /* We drop this ref when we get removed from the list. */
+        class_export_get(lock->l_export);
+        return 1;
 }
 
 /*
@@ -107,13 +193,18 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock)
 {
         struct list_head *list_next;
 
-        ENTRY;
+        if (lock->l_export == NULL) {
+                /* We don't have a "waiting locks list" on clients. */
+                LDLM_DEBUG(lock, "client lock: no-op");
+                return 0;
+        }
 
         spin_lock_bh(&waiting_locks_spinlock);
 
         if (list_empty(&lock->l_pending_chain)) {
                 spin_unlock_bh(&waiting_locks_spinlock);
-                RETURN(0);
+                LDLM_DEBUG(lock, "wasn't waiting");
+                return 0;
         }
 
         list_next = lock->l_pending_chain.next;
@@ -132,13 +223,39 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock)
         }
         list_del_init(&lock->l_pending_chain);
         spin_unlock_bh(&waiting_locks_spinlock);
+        /* We got this ref when we were added to the list. */
+        class_export_put(lock->l_export);
+        LDLM_DEBUG(lock, "removed");
+        return 1;
+}
+
+#else /* !__KERNEL__ */
+
+static int ldlm_add_waiting_lock(struct ldlm_lock *lock)
+{
         RETURN(1);
 }
 
-static inline void ldlm_failed_ast(struct ldlm_lock *lock)
+int ldlm_del_waiting_lock(struct ldlm_lock *lock)
 {
-        /* XXX diagnostic */
-        recovd_conn_fail(lock->l_export->exp_connection);
+        RETURN(0);
+}
+
+#endif /* __KERNEL__ */
+
+static inline void ldlm_failed_ast(struct ldlm_lock *lock, int rc,
+                                   char *ast_type)
+{
+        CERROR("%s AST failed (%d) for res "LPU64"/"LPU64
+               ", mode %s: evicting client %s@%s NID "LPU64"\n",
+               ast_type, rc,
+               lock->l_resource->lr_name.name[0],
+               lock->l_resource->lr_name.name[1],
+               ldlm_lockname[lock->l_granted_mode],
+               lock->l_export->exp_client_uuid.uuid,
+               lock->l_export->exp_connection->c_remote_uuid.uuid,
+               lock->l_export->exp_connection->c_peer.peer_nid);
+        ptlrpc_fail_export(lock->l_export);
 }
 
 int ldlm_server_blocking_ast(struct ldlm_lock *lock,
@@ -171,12 +288,19 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock,
                 RETURN(0);
         }
 
-        req = ptlrpc_prep_req(&lock->l_export->exp_ldlm_data.led_import,
+#if 0
+        if (LTIME_S(CURRENT_TIME) - lock->l_export->exp_last_request_time > 30){
+                ldlm_failed_ast(lock, -ETIMEDOUT, "Not-attempted blocking");
+                RETURN(-ETIMEDOUT);
+        }
+#endif
+
+        req = ptlrpc_prep_req(lock->l_export->exp_ldlm_data.led_import,
                               LDLM_BL_CALLBACK, 1, &size, NULL);
         if (!req)
                 RETURN(-ENOMEM);
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
         memcpy(&body->lock_handle1, &lock->l_remote_handle,
                sizeof(body->lock_handle1));
         memcpy(&body->lock_desc, desc, sizeof(*desc));
@@ -188,14 +312,28 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock,
         l_unlock(&lock->l_resource->lr_namespace->ns_lock);
 
         req->rq_level = LUSTRE_CONN_RECOVD;
-        req->rq_timeout = 2;
+        req->rq_timeout = 2; /* 2 second timeout for initial AST reply */
         rc = ptlrpc_queue_wait(req);
         if (rc == -ETIMEDOUT || rc == -EINTR) {
                 ldlm_del_waiting_lock(lock);
-                ldlm_failed_ast(lock);
+                ldlm_failed_ast(lock, rc, "blocking");
         } else if (rc) {
-                CERROR("client returned %d from blocking AST for lock %p\n",
-                       req->rq_status, lock);
+                if (rc == -EINVAL)
+                        CDEBUG(D_DLMTRACE, "client (nid "LPU64") returned %d "
+                               "from blocking AST for lock %p--normal race\n",
+                               req->rq_connection->c_peer.peer_nid,
+                               req->rq_repmsg->status, lock);
+                else if (rc == -ENOTCONN)
+                        CDEBUG(D_DLMTRACE, "client (nid "LPU64") returned %d "
+                               "from blocking AST for lock %p--this client was "
+                               "probably rebooted while it held a lock, nothing"
+                               " serious\n",req->rq_connection->c_peer.peer_nid,
+                               req->rq_repmsg->status, lock);
+                else
+                        CDEBUG(D_ERROR, "client (nid "LPU64") returned %d "
+                               "from blocking AST for lock %p\n",
+                               req->rq_connection->c_peer.peer_nid,
+                               req->rq_repmsg->status, lock);
                 LDLM_DEBUG(lock, "client returned error %d from blocking AST",
                            req->rq_status);
                 ldlm_lock_cancel(lock);
@@ -221,12 +359,12 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
                 RETURN(-EINVAL);
         }
 
-        req = ptlrpc_prep_req(&lock->l_export->exp_ldlm_data.led_import,
+        req = ptlrpc_prep_req(lock->l_export->exp_ldlm_data.led_import,
                               LDLM_CP_CALLBACK, 1, &size, NULL);
         if (!req)
                 RETURN(-ENOMEM);
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
         memcpy(&body->lock_handle1, &lock->l_remote_handle,
                sizeof(body->lock_handle1));
         body->lock_flags = flags;
@@ -236,11 +374,11 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
         req->rq_replen = lustre_msg_size(0, NULL);
 
         req->rq_level = LUSTRE_CONN_RECOVD;
-        req->rq_timeout = 2;
+        req->rq_timeout = 2; /* 2 second timeout for initial AST reply */
         rc = ptlrpc_queue_wait(req);
         if (rc == -ETIMEDOUT || rc == -EINTR) {
                 ldlm_del_waiting_lock(lock);
-                ldlm_failed_ast(lock);
+                ldlm_failed_ast(lock, rc, "completion");
         } else if (rc) {
                 CERROR("client returned %d from completion AST for lock %p\n",
                        req->rq_status, lock);
@@ -272,7 +410,13 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req,
 
         LDLM_DEBUG_NOLOCK("server-side enqueue handler START");
 
-        dlm_req = lustre_msg_buf(req->rq_reqmsg, 0);
+        dlm_req = lustre_swab_reqbuf (req, 0, sizeof (*dlm_req),
+                                      lustre_swab_ldlm_request);
+        if (dlm_req == NULL) {
+                CERROR ("Can't unpack dlm_req\n");
+                RETURN (-EFAULT);
+        }
+        
         flags = dlm_req->lock_flags;
         if (dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN &&
             (flags & LDLM_FL_HAS_INTENT)) {
@@ -298,7 +442,8 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req,
                                 &dlm_req->lock_handle2,
                                 dlm_req->lock_desc.l_resource.lr_name,
                                 dlm_req->lock_desc.l_resource.lr_type,
-                                dlm_req->lock_desc.l_req_mode, NULL, 0);
+                                dlm_req->lock_desc.l_req_mode,
+                                blocking_callback, NULL);
         if (!lock)
                 GOTO(out, err = -ENOMEM);
 
@@ -314,11 +459,11 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req,
         l_unlock(&lock->l_resource->lr_namespace->ns_lock);
 
         err = ldlm_lock_enqueue(obddev->obd_namespace, &lock, cookie, cookielen,
-                                &flags, completion_callback, blocking_callback);
+                                &flags, completion_callback);
         if (err)
                 GOTO(out, err);
 
-        dlm_rep = lustre_msg_buf(req->rq_repmsg, 0);
+        dlm_rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*dlm_rep));
         dlm_rep->lock_flags = flags;
 
         ldlm_lock2handle(lock, &dlm_rep->lock_handle);
@@ -358,13 +503,19 @@ int ldlm_handle_convert(struct ptlrpc_request *req)
         int rc, size = sizeof(*dlm_rep);
         ENTRY;
 
+        dlm_req = lustre_swab_reqbuf (req, 0, sizeof (*dlm_req),
+                                      lustre_swab_ldlm_request);
+        if (dlm_req == NULL) {
+                CERROR ("Can't unpack dlm_req\n");
+                RETURN (-EFAULT);
+        }
+        
         rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
         if (rc) {
                 CERROR("out of memory\n");
                 RETURN(-ENOMEM);
         }
-        dlm_req = lustre_msg_buf(req->rq_reqmsg, 0);
-        dlm_rep = lustre_msg_buf(req->rq_repmsg, 0);
+        dlm_rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*dlm_rep));
         dlm_rep->lock_flags = dlm_req->lock_flags;
 
         lock = ldlm_handle2lock(&dlm_req->lock_handle1);
@@ -396,21 +547,24 @@ int ldlm_handle_cancel(struct ptlrpc_request *req)
         int rc;
         ENTRY;
 
+        dlm_req = lustre_swab_reqbuf (req, 0, sizeof (*dlm_req),
+                                      lustre_swab_ldlm_request);
+        if (dlm_req == NULL) {
+                CERROR("bad request buffer for cancel\n");
+                RETURN(-EFAULT);
+        }
+
         rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg);
         if (rc) {
                 CERROR("out of memory\n");
                 RETURN(-ENOMEM);
         }
-        dlm_req = lustre_msg_buf(req->rq_reqmsg, 0);
-        if (!dlm_req) {
-                CERROR("bad request buffer for cancel\n");
-                RETURN(-EINVAL);
-        }
 
         lock = ldlm_handle2lock(&dlm_req->lock_handle1);
         if (!lock) {
-                CERROR("received cancel for unknown lock cookie "LPX64"\n",
-                       dlm_req->lock_handle1.cookie);
+                CERROR("received cancel for unknown lock cookie "LPX64
+                       " from nid "LPU64"\n", dlm_req->lock_handle1.cookie,
+                       req->rq_connection->c_peer.peer_nid);
                 LDLM_DEBUG_NOLOCK("server-side cancel handler stale lock "
                                   "(cookie "LPU64")",
                                   dlm_req->lock_handle1.cookie);
@@ -423,7 +577,7 @@ int ldlm_handle_cancel(struct ptlrpc_request *req)
                 req->rq_status = 0;
         }
 
-        if (ptlrpc_reply(req->rq_svc, req) != 0)
+        if (ptlrpc_reply(req) != 0)
                 LBUG();
 
         if (lock) {
@@ -443,32 +597,28 @@ static void ldlm_handle_bl_callback(struct ptlrpc_request *req,
         int do_ast;
         ENTRY;
 
-        /* Try to narrow down this damn iozone bug */
-        if (lock->l_resource == NULL)
-                CERROR("lock %p resource NULL\n", lock);
-        if (lock->l_resource->lr_type != LDLM_EXTENT)
-                if (lock->l_resource->lr_namespace != ns)
-                        CERROR("lock %p namespace %p != passed ns %p\n", lock,
-                               lock->l_resource->lr_namespace, ns);
+        l_lock(&ns->ns_lock);
         LDLM_DEBUG(lock, "client blocking AST callback handler START");
 
-        l_lock(&ns->ns_lock);
         lock->l_flags |= LDLM_FL_CBPENDING;
         do_ast = (!lock->l_readers && !lock->l_writers);
-        l_unlock(&ns->ns_lock);
 
         if (do_ast) {
                 LDLM_DEBUG(lock, "already unused, calling "
                            "callback (%p)", lock->l_blocking_ast);
-                if (lock->l_blocking_ast != NULL)
+                if (lock->l_blocking_ast != NULL) {
+                        l_unlock(&ns->ns_lock);
                         lock->l_blocking_ast(lock, &dlm_req->lock_desc,
                                              lock->l_data, LDLM_CB_BLOCKING);
+                        l_lock(&ns->ns_lock);
+                }
         } else {
                 LDLM_DEBUG(lock, "Lock still has references, will be"
                            " cancelled later");
         }
 
         LDLM_DEBUG(lock, "client blocking callback handler END");
+        l_unlock(&ns->ns_lock);
         LDLM_LOCK_PUT(lock);
         EXIT;
 }
@@ -481,9 +631,8 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
         LIST_HEAD(ast_list);
         ENTRY;
 
-        LDLM_DEBUG(lock, "client completion callback handler START");
-
         l_lock(&ns->ns_lock);
+        LDLM_DEBUG(lock, "client completion callback handler START");
 
         /* If we receive the completion AST before the actual enqueue returned,
          * then we might need to switch lock modes, resources, or extents. */
@@ -491,9 +640,22 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
                 lock->l_req_mode = dlm_req->lock_desc.l_granted_mode;
                 LDLM_DEBUG(lock, "completion AST, new lock mode");
         }
-        if (lock->l_resource->lr_type == LDLM_EXTENT)
+        if (lock->l_resource->lr_type == LDLM_EXTENT) {
                 memcpy(&lock->l_extent, &dlm_req->lock_desc.l_extent,
                        sizeof(lock->l_extent));
+
+                if ((lock->l_extent.end & ~PAGE_MASK) != ~PAGE_MASK) {
+                        /* XXX Old versions of BA OST code have a fencepost bug
+                         * which will cause them to grant a lock that's one
+                         * byte too large.  This can be safely removed after BA
+                         * ships their next release -phik (02 Apr 2003) */
+                        lock->l_extent.end--;
+                } else if ((lock->l_extent.start & ~PAGE_MASK) ==
+                           ~PAGE_MASK) {
+                        lock->l_extent.start++;
+                }
+        }
+
         ldlm_resource_unlink_lock(lock);
         if (memcmp(&dlm_req->lock_desc.l_resource.lr_name,
                    &lock->l_resource->lr_name,
@@ -505,8 +667,8 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req,
         lock->l_resource->lr_tmp = &ast_list;
         ldlm_grant_lock(lock, req, sizeof(*req));
         lock->l_resource->lr_tmp = NULL;
-        l_unlock(&ns->ns_lock);
         LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work");
+        l_unlock(&ns->ns_lock);
         LDLM_LOCK_PUT(lock);
 
         ldlm_run_ast_work(&ast_list);
@@ -523,7 +685,7 @@ static int ldlm_callback_reply(struct ptlrpc_request *req, int rc)
                              &req->rq_repmsg);
         if (rc)
                 return rc;
-        return ptlrpc_reply(req->rq_svc, req);
+        return ptlrpc_reply(req);
 }
 
 static int ldlm_callback_handler(struct ptlrpc_request *req)
@@ -531,26 +693,29 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
         struct ldlm_namespace *ns;
         struct ldlm_request *dlm_req;
         struct ldlm_lock *lock;
-        int rc;
         ENTRY;
 
-        rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen);
-        if (rc) {
-                CERROR("Invalid request: %d\n", rc);
-                RETURN(rc);
-        }
+        /* Requests arrive in sender's byte order.  The ptlrpc service
+         * handler has already checked and, if necessary, byte-swapped the
+         * incoming request message body, but I am responsible for the
+         * message buffers. */
 
         if (req->rq_export == NULL) {
                 struct ldlm_request *dlm_req;
 
-                CERROR("operation %d with bad export (ptl req %d/rep %d)\n",
-                       req->rq_reqmsg->opc, req->rq_request_portal,
-                       req->rq_reply_portal);
-                CERROR("--> export addr: "LPX64", cookie: "LPX64"\n",
-                       req->rq_reqmsg->addr, req->rq_reqmsg->cookie);
-                dlm_req = lustre_msg_buf(req->rq_reqmsg, 0);
-                CERROR("--> lock addr: "LPX64", cookie: "LPX64"\n",
-                       dlm_req->lock_handle1.addr,dlm_req->lock_handle1.cookie);
+                CDEBUG(D_RPCTRACE, "operation %d from nid "LPU64" with bad "
+                       "export cookie "LPX64" (ptl req %d/rep %d); this is "
+                       "normal if this node rebooted with a lock held\n",
+                       req->rq_reqmsg->opc, req->rq_connection->c_peer.peer_nid,
+                       req->rq_reqmsg->handle.cookie,
+                       req->rq_request_portal, req->rq_reply_portal);
+
+                dlm_req = lustre_swab_reqbuf(req, 0, sizeof (*dlm_req),
+                                             lustre_swab_ldlm_request);
+                if (dlm_req != NULL)
+                        CDEBUG(D_RPCTRACE, "--> lock cookie: "LPX64"\n",
+                               dlm_req->lock_handle1.cookie);
+
                 ldlm_callback_reply(req, -ENOTCONN);
                 RETURN(0);
         }
@@ -560,7 +725,7 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
         } else if (req->rq_reqmsg->opc == LDLM_CP_CALLBACK) {
                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_CP_CALLBACK, 0);
         } else {
-                ldlm_callback_reply(req, -EIO);
+                ldlm_callback_reply(req, -EPROTO);
                 RETURN(0);
         }
 
@@ -569,7 +734,14 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
         ns = req->rq_export->exp_obd->obd_namespace;
         LASSERT(ns != NULL);
 
-        dlm_req = lustre_msg_buf(req->rq_reqmsg, 0);
+        dlm_req = lustre_swab_reqbuf (req, 0, sizeof (*dlm_req),
+                                      lustre_swab_ldlm_request);
+        if (dlm_req == NULL) {
+                CERROR ("can't unpack dlm_req\n");
+                ldlm_callback_reply (req, -EPROTO);
+                RETURN (0);
+        }
+        
         lock = ldlm_handle2lock_ns(ns, &dlm_req->lock_handle1);
         if (!lock) {
                 CDEBUG(D_INODE, "callback on lock "LPX64" - lock disappeared\n",
@@ -592,6 +764,8 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
                 CDEBUG(D_INODE, "completion ast\n");
                 ldlm_handle_cp_callback(req, ns, dlm_req, lock);
                 break;
+        default:
+                LBUG();                         /* checked above */
         }
 
         RETURN(0);
@@ -602,27 +776,28 @@ static int ldlm_cancel_handler(struct ptlrpc_request *req)
         int rc;
         ENTRY;
 
-        rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen);
-        if (rc) {
-                CERROR("lustre_ldlm: Invalid request: %d\n", rc);
-                RETURN(rc);
-        }
+        /* Requests arrive in sender's byte order.  The ptlrpc service
+         * handler has already checked and, if necessary, byte-swapped the
+         * incoming request message body, but I am responsible for the
+         * message buffers. */
 
         if (req->rq_export == NULL) {
                 struct ldlm_request *dlm_req;
                 CERROR("operation %d with bad export (ptl req %d/rep %d)\n",
                        req->rq_reqmsg->opc, req->rq_request_portal,
                        req->rq_reply_portal);
-                CERROR("--> export addr: "LPX64", cookie: "LPX64"\n",
-                       req->rq_reqmsg->addr, req->rq_reqmsg->cookie);
-                dlm_req = lustre_msg_buf(req->rq_reqmsg, 0);
-                ldlm_lock_dump_handle(D_ERROR, &dlm_req->lock_handle1);
+                CERROR("--> export cookie: "LPX64"\n",
+                       req->rq_reqmsg->handle.cookie);
+                dlm_req = lustre_swab_reqbuf(req, 0, sizeof (*dlm_req),
+                                             lustre_swab_ldlm_request);
+                if (dlm_req != NULL)
+                        ldlm_lock_dump_handle(D_ERROR, &dlm_req->lock_handle1);
                 RETURN(-ENOTCONN);
         }
 
         switch (req->rq_reqmsg->opc) {
 
-        /* XXX FIXME move this back to mds/handler.c, bug 625069 */
+        /* XXX FIXME move this back to mds/handler.c, bug 249 */
         case LDLM_CANCEL:
                 CDEBUG(D_INODE, "cancel\n");
                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_CANCEL, 0);
@@ -696,11 +871,18 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf)
                 RETURN(rc);
 
 #ifdef __KERNEL__
+        inter_module_register("ldlm_cli_cancel_unused", THIS_MODULE,
+                              ldlm_cli_cancel_unused);
+        inter_module_register("ldlm_namespace_cleanup", THIS_MODULE,
+                              ldlm_namespace_cleanup);
+        inter_module_register("ldlm_replay_locks", THIS_MODULE,
+                              ldlm_replay_locks);
+
         ldlm->ldlm_cb_service =
                 ptlrpc_init_svc(LDLM_NEVENTS, LDLM_NBUFS, LDLM_BUFSIZE,
                                 LDLM_MAXREQSIZE, LDLM_CB_REQUEST_PORTAL,
                                 LDLM_CB_REPLY_PORTAL,
-                                ldlm_callback_handler, "ldlm_cbd");
+                                ldlm_callback_handler, "ldlm_cbd", obddev);
 
         if (!ldlm->ldlm_cb_service) {
                 CERROR("failed to start service\n");
@@ -711,7 +893,7 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf)
                 ptlrpc_init_svc(LDLM_NEVENTS, LDLM_NBUFS, LDLM_BUFSIZE,
                                 LDLM_MAXREQSIZE, LDLM_CANCEL_REQUEST_PORTAL,
                                 LDLM_CANCEL_REPLY_PORTAL,
-                                ldlm_cancel_handler, "ldlm_canceld");
+                                ldlm_cancel_handler, "ldlm_canceld", obddev);
 
         if (!ldlm->ldlm_cancel_service) {
                 CERROR("failed to start service\n");
@@ -741,12 +923,26 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf)
                 }
         }
 
-#endif
+        INIT_LIST_HEAD(&expired_lock_thread.elt_expired_locks);
+        spin_lock_init(&expired_lock_thread.elt_lock);
+        expired_lock_thread.elt_state = ELT_STOPPED;
+        init_waitqueue_head(&expired_lock_thread.elt_waitq);
+
+        rc = kernel_thread(expired_lock_main, NULL, CLONE_VM | CLONE_FS);
+        if (rc < 0) {
+                CERROR("Cannot start ldlm expired-lock thread: %d\n", rc);
+                GOTO(out_thread, rc);
+        }
+
+        wait_event(expired_lock_thread.elt_waitq,
+                   expired_lock_thread.elt_state == ELT_READY);
+
         INIT_LIST_HEAD(&waiting_locks_list);
         spin_lock_init(&waiting_locks_spinlock);
         waiting_locks_timer.function = waiting_locks_callback;
         waiting_locks_timer.data = 0;
         init_timer(&waiting_locks_timer);
+#endif
 
         ldlm_already_setup = 1;
 
@@ -765,30 +961,49 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf)
         return rc;
 }
 
-static int ldlm_cleanup(struct obd_device *obddev)
+static int ldlm_cleanup(struct obd_device *obddev, int force, int failover)
 {
         struct ldlm_obd *ldlm = &obddev->u.ldlm;
         ENTRY;
 
         if (!list_empty(&ldlm_namespace_list)) {
                 CERROR("ldlm still has namespaces; clean these up first.\n");
+                ldlm_dump_all_namespaces();
                 RETURN(-EBUSY);
         }
 
 #ifdef __KERNEL__
+        if (force) {
+                ptlrpc_put_ldlm_hooks();
+        } else if (ptlrpc_ldlm_hooks_referenced()) {
+                CERROR("Some connections weren't cleaned up; run lconf with "
+                       "--force to forcibly unload.\n");
+                ptlrpc_dump_connections();
+                RETURN(-EBUSY);
+        }
+
         ptlrpc_stop_all_threads(ldlm->ldlm_cb_service);
         ptlrpc_unregister_service(ldlm->ldlm_cb_service);
         ptlrpc_stop_all_threads(ldlm->ldlm_cancel_service);
         ptlrpc_unregister_service(ldlm->ldlm_cancel_service);
         ldlm_proc_cleanup(obddev);
+
+        expired_lock_thread.elt_state = ELT_TERMINATE;
+        wake_up(&expired_lock_thread.elt_waitq);
+        wait_event(expired_lock_thread.elt_waitq,
+                   expired_lock_thread.elt_state == ELT_STOPPED);
+
+        inter_module_unregister("ldlm_namespace_cleanup");
+        inter_module_unregister("ldlm_cli_cancel_unused");
+        inter_module_unregister("ldlm_replay_locks");
 #endif
+
         ldlm_already_setup = 0;
         RETURN(0);
 }
 
 static int ldlm_connect(struct lustre_handle *conn, struct obd_device *src,
-                        struct obd_uuid *cluuid, struct recovd_obd *recovd,
-                        ptlrpc_recovery_cb_t recover)
+                        struct obd_uuid *cluuid)
 {
         return class_connect(conn, src, cluuid);
 }
@@ -896,6 +1111,18 @@ EXPORT_SYMBOL(ldlm_namespace_dump);
 EXPORT_SYMBOL(l_lock);
 EXPORT_SYMBOL(l_unlock);
 
+/* ldlm_lib.c */
+EXPORT_SYMBOL(client_import_connect);
+EXPORT_SYMBOL(client_import_disconnect);
+EXPORT_SYMBOL(target_abort_recovery);
+EXPORT_SYMBOL(target_handle_connect);
+EXPORT_SYMBOL(target_cancel_recovery_timer);
+EXPORT_SYMBOL(target_send_reply);
+EXPORT_SYMBOL(target_queue_recovery_request);
+EXPORT_SYMBOL(target_handle_ping);
+EXPORT_SYMBOL(target_handle_disconnect);
+EXPORT_SYMBOL(target_queue_final_reply);
+
 #ifdef __KERNEL__
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
 MODULE_DESCRIPTION("Lustre Lock Management Module v0.1");
index d64a402..e5d9c24 100644 (file)
 #include <linux/obd_class.h>
 #include <linux/obd.h>
 
-static int interrupted_completion_wait(void *data)
+static void interrupted_completion_wait(void *data)
 {
-        RETURN(1);
 }
 
+struct lock_wait_data {
+        struct ldlm_lock *lwd_lock;
+        int               lwd_generation;
+};
+
 int ldlm_expired_completion_wait(void *data)
 {
-        struct ldlm_lock *lock = data;
-        struct ptlrpc_connection *conn;
-        struct obd_device *obd;
+        struct lock_wait_data *lwd = data;
+        struct ldlm_lock *lock = lwd->lwd_lock;
+        struct obd_device *obd = class_conn2obd(lock->l_connh);
 
-        if (!lock)
-                CERROR("NULL lock\n");
-        else if (!lock->l_connh)
-                CERROR("lock %p has NULL connh\n", lock);
-        else if (!(obd = class_conn2obd(lock->l_connh)))
-                CERROR("lock %p has NULL obd\n", lock);
-        else if (!(conn = obd->u.cli.cl_import.imp_connection))
-                CERROR("lock %p has NULL connection\n", lock);
-        else {
-                LDLM_DEBUG(lock, "timed out waiting for completion");
-                CERROR("lock %p timed out from %s\n", lock,
-                       conn->c_remote_uuid.uuid);
-                ldlm_lock_dump(D_ERROR, lock);
-                class_signal_connection_failure(conn);
+        if (obd == NULL) {
+                LDLM_ERROR(lock, "lock timed out; mot entering recovery in "
+                           "server code, just going back to sleep");
+        } else {
+                struct obd_import *imp = obd->u.cli.cl_import;
+                ptlrpc_fail_import(imp, lwd->lwd_generation);
+                LDLM_ERROR(lock, "lock timed out, entering recovery for %s@%s",
+                           imp->imp_target_uuid.uuid,
+                           imp->imp_connection->c_remote_uuid.uuid);
         }
+        
         RETURN(0);
 }
 
 int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data)
 {
-        struct l_wait_info lwi =
-                LWI_TIMEOUT_INTR(obd_timeout * HZ, ldlm_expired_completion_wait,
-                                 interrupted_completion_wait, lock);
+        struct lock_wait_data lwd;
+        unsigned long irqflags;
+        struct obd_device *obd;
+        struct obd_import *imp = NULL;
         int rc = 0;
+        struct l_wait_info lwi;
+
+        obd = class_conn2obd(lock->l_connh);
+
+        /* if this is a local lock, then there is no import */
+        if (obd != NULL)
+                imp = obd->u.cli.cl_import;
+
+        lwd.lwd_lock = lock;
+
+        lwi = LWI_TIMEOUT_INTR(obd_timeout * HZ, ldlm_expired_completion_wait,
+                               interrupted_completion_wait, &lwd);
         ENTRY;
 
         if (flags == LDLM_FL_WAIT_NOREPROC)
@@ -84,6 +97,12 @@ int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data)
         ldlm_reprocess_all(lock->l_resource);
 
  noreproc:
+        if (imp != NULL) {
+                spin_lock_irqsave(&imp->imp_lock, irqflags);
+                lwd.lwd_generation = imp->imp_generation;
+                spin_unlock_irqrestore(&imp->imp_lock, irqflags);
+        }
+
         /* Go to sleep until the lock is granted or cancelled. */
         rc = l_wait_event(lock->l_waitq,
                           ((lock->l_req_mode == lock->l_granted_mode) ||
@@ -114,7 +133,6 @@ static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
                                   ldlm_completion_callback completion,
                                   ldlm_blocking_callback blocking,
                                   void *data,
-                                  void *cp_data,
                                   struct lustre_handle *lockh)
 {
         struct ldlm_lock *lock;
@@ -127,7 +145,7 @@ static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
         }
 
         lock = ldlm_lock_create(ns, parent_lockh, res_id, type, mode,
-                                data, cp_data);
+                                blocking, data);
         if (!lock)
                 GOTO(out_nolock, err = -ENOMEM);
         LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created");
@@ -136,8 +154,8 @@ static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns,
         ldlm_lock2handle(lock, lockh);
         lock->l_flags |= LDLM_FL_LOCAL;
 
-        err = ldlm_lock_enqueue(ns, &lock, cookie, cookielen, flags, completion,
-                                blocking);
+        err = ldlm_lock_enqueue(ns, &lock, cookie, cookielen, flags,
+                                completion);
         if (err != ELDLM_OK)
                 GOTO(out, err);
 
@@ -172,7 +190,6 @@ int ldlm_cli_enqueue(struct lustre_handle *connh,
                      ldlm_completion_callback completion,
                      ldlm_blocking_callback blocking,
                      void *data,
-                     void *cp_data,
                      struct lustre_handle *lockh)
 {
         struct ldlm_lock *lock;
@@ -188,7 +205,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh,
                 rc = ldlm_cli_enqueue_local(ns, parent_lock_handle, res_id,
                                             type, cookie, cookielen, mode,
                                             flags, completion, blocking, data,
-                                            cp_data, lockh);
+                                            lockh);
                 RETURN(rc);
         }
 
@@ -200,7 +217,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh,
                 LASSERT(connh == lock->l_connh);
         } else {
                 lock = ldlm_lock_create(ns, parent_lock_handle, res_id, type,
-                                        mode, data, cp_data);
+                                        mode, blocking, data);
                 if (lock == NULL)
                         GOTO(out_nolock, rc = -ENOMEM);
                 /* ugh.  I set this early (instead of waiting for _enqueue)
@@ -227,7 +244,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh,
                 LBUG();
 
         /* Dump lock data into the request buffer */
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
         ldlm_lock2desc(lock, &body->lock_desc);
         body->lock_flags = *flags;
 
@@ -243,6 +260,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh,
         }
         lock->l_connh = connh;
         lock->l_export = NULL;
+        lock->l_blocking_ast = blocking;
 
         LDLM_DEBUG(lock, "sending request");
         rc = ptlrpc_queue_wait(req);
@@ -253,26 +271,54 @@ int ldlm_cli_enqueue(struct lustre_handle *connh,
                            rc == ELDLM_LOCK_ABORTED ? "ABORTED" : "FAILED");
                 /* Set a flag to prevent us from sending a CANCEL (bug 407) */
                 l_lock(&ns->ns_lock);
-                lock->l_flags |= LDLM_FL_CANCELING;
+                lock->l_flags |= LDLM_FL_LOCAL_ONLY;
                 l_unlock(&ns->ns_lock);
 
                 ldlm_lock_decref_and_cancel(lockh, mode);
+
+                if (rc == ELDLM_LOCK_ABORTED) {
+                        /* caller expects reply buffer 0 to have been swabbed */
+                        reply = lustre_swab_repbuf(req, 0, sizeof (*reply),
+                                                   lustre_swab_ldlm_reply);
+                        if (reply == NULL) {
+                                CERROR ("Can't unpack ldlm_reply\n");
+                                GOTO (out_req, rc = -EPROTO);
+                        }
+                }
                 GOTO(out_req, rc);
         }
 
-        reply = lustre_msg_buf(req->rq_repmsg, 0);
+        reply = lustre_swab_repbuf(req, 0, sizeof (*reply),
+                                   lustre_swab_ldlm_reply);
+        if (reply == NULL) {
+                CERROR ("Can't unpack ldlm_reply\n");
+                GOTO (out_req, rc = -EPROTO);
+        }
+        
         memcpy(&lock->l_remote_handle, &reply->lock_handle,
                sizeof(lock->l_remote_handle));
         *flags = reply->lock_flags;
 
-        CDEBUG(D_INFO, "local: %p, remote: %p, flags: %d\n", lock,
-               (void *)(unsigned long)reply->lock_handle.addr, *flags);
+        CDEBUG(D_INFO, "local: %p, remote cookie: "LPX64", flags: %d\n", lock,
+               reply->lock_handle.cookie, *flags);
         if (type == LDLM_EXTENT) {
                 CDEBUG(D_INFO, "requested extent: "LPU64" -> "LPU64", got "
                        "extent "LPU64" -> "LPU64"\n",
                        body->lock_desc.l_extent.start,
                        body->lock_desc.l_extent.end,
                        reply->lock_extent.start, reply->lock_extent.end);
+
+                if ((reply->lock_extent.end & ~PAGE_MASK) != ~PAGE_MASK) {
+                        /* XXX Old versions of BA OST code have a fencepost bug
+                         * which will cause them to grant a lock that's one
+                         * byte too large.  This can be safely removed after BA
+                         * ships their next release -phik (02 Apr 2003) */
+                        reply->lock_extent.end--;
+                } else if ((reply->lock_extent.start & ~PAGE_MASK) ==
+                           ~PAGE_MASK) {
+                        reply->lock_extent.start++;
+                }
+
                 cookie = &reply->lock_extent; /* FIXME bug 267 */
                 cookielen = sizeof(reply->lock_extent);
         }
@@ -310,7 +356,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh,
                 l_lock(&ns->ns_lock);
                 lock->l_completion_ast = NULL;
                 rc = ldlm_lock_enqueue(ns, &lock, cookie, cookielen, flags,
-                                       completion, blocking);
+                                       completion);
                 l_unlock(&ns->ns_lock);
                 if (lock->l_completion_ast)
                         lock->l_completion_ast(lock, *flags, NULL);
@@ -339,7 +385,6 @@ int ldlm_match_or_enqueue(struct lustre_handle *connh,
                           ldlm_completion_callback completion,
                           ldlm_blocking_callback blocking,
                           void *data,
-                          void *cp_data,
                           struct lustre_handle *lockh)
 {
         int rc;
@@ -357,7 +402,7 @@ int ldlm_match_or_enqueue(struct lustre_handle *connh,
                 rc = ldlm_cli_enqueue(connh, req, ns, parent_lock_handle,
                                       res_id, type, cookie, cookielen, mode,
                                       flags, completion, blocking, data,
-                                      cp_data, lockh);
+                                      lockh);
                 if (rc != ELDLM_OK)
                         CERROR("ldlm_cli_enqueue: err: %d\n", rc);
                 RETURN(rc);
@@ -373,7 +418,7 @@ int ldlm_cli_replay_enqueue(struct ldlm_lock *lock)
         ldlm_lock2handle(lock, &lockh);
         return ldlm_cli_enqueue(lock->l_connh, NULL, NULL, NULL, junk,
                                 lock->l_resource->lr_type, NULL, 0, -1, &flags,
-                                NULL, NULL, NULL, 0, &lockh);
+                                NULL, NULL, NULL, &lockh);
 }
 
 static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode,
@@ -425,7 +470,7 @@ int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, int *flags)
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
         memcpy(&body->lock_handle1, &lock->l_remote_handle,
                sizeof(body->lock_handle1));
 
@@ -439,7 +484,13 @@ int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, int *flags)
         if (rc != ELDLM_OK)
                 GOTO(out, rc);
 
-        reply = lustre_msg_buf(req->rq_repmsg, 0);
+        reply = lustre_swab_repbuf(req, 0, sizeof (*reply),
+                                   lustre_swab_ldlm_reply);
+        if (reply == NULL) {
+                CERROR ("Can't unpack ldlm_reply\n");
+                GOTO (out, rc = -EPROTO);
+        }
+        
         res = ldlm_lock_convert(lock, new_mode, &reply->lock_flags);
         if (res != NULL)
                 ldlm_reprocess_all(res);
@@ -469,23 +520,30 @@ int ldlm_cli_cancel(struct lustre_handle *lockh)
 
         if (lock->l_connh) {
                 int local_only;
+                struct obd_import *imp;
 
                 LDLM_DEBUG(lock, "client-side cancel");
                 /* Set this flag to prevent others from getting new references*/
                 l_lock(&lock->l_resource->lr_namespace->ns_lock);
                 lock->l_flags |= LDLM_FL_CBPENDING;
-                ldlm_cancel_callback(lock);
                 local_only = (lock->l_flags & LDLM_FL_LOCAL_ONLY);
                 l_unlock(&lock->l_resource->lr_namespace->ns_lock);
+                ldlm_cancel_callback(lock);
 
                 if (local_only) {
                         CDEBUG(D_INFO, "not sending request (at caller's "
-                               "instruction\n");
+                               "instruction)\n");
+                        goto local_cancel;
+                }
+
+                imp = class_conn2cliimp(lock->l_connh);
+                if (imp == NULL || imp->imp_invalid) {
+                        CDEBUG(D_HA, "skipping cancel on invalid import %p\n",
+                               imp);
                         goto local_cancel;
                 }
 
-                req = ptlrpc_prep_req(class_conn2cliimp(lock->l_connh),
-                                      LDLM_CANCEL, 1, &size, NULL);
+                req = ptlrpc_prep_req(imp, LDLM_CANCEL, 1, &size, NULL);
                 if (!req)
                         GOTO(out, rc = -ENOMEM);
 
@@ -493,21 +551,23 @@ int ldlm_cli_cancel(struct lustre_handle *lockh)
                 req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
                 req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
 
-                body = lustre_msg_buf(req->rq_reqmsg, 0);
+                body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
                 memcpy(&body->lock_handle1, &lock->l_remote_handle,
                        sizeof(body->lock_handle1));
 
                 req->rq_replen = lustre_msg_size(0, NULL);
 
                 rc = ptlrpc_queue_wait(req);
-                ptlrpc_req_finished(req);
-                if (rc == ESTALE) {
-                        CERROR("client/server out of sync\n");
-                        LBUG();
-                }
-                if (rc != ELDLM_OK)
+
+                if (rc == ESTALE)
+                        CERROR("client/server (nid "LPU64") out of sync--not "
+                               "fatal\n",
+                               req->rq_import->imp_connection->c_peer.peer_nid);
+                else if (rc != ELDLM_OK)
                         CERROR("Got rc %d from cancel RPC: canceling "
                                "anyway\n", rc);
+
+                ptlrpc_req_finished(req);
         local_cancel:
                 ldlm_lock_cancel(lock);
         } else {
@@ -585,8 +645,9 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns)
         RETURN(rc);
 }
 
-int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
-                                    struct ldlm_res_id res_id, int flags)
+static int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
+                                           struct ldlm_res_id res_id, int flags,
+                                           void *opaque)
 {
         struct ldlm_resource *res;
         struct list_head *tmp, *next, list = LIST_HEAD_INIT(list);
@@ -605,8 +666,17 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
                 struct ldlm_lock *lock;
                 lock = list_entry(tmp, struct ldlm_lock, l_res_link);
 
-                if (lock->l_readers || lock->l_writers)
-                        continue;
+                if (lock->l_readers || lock->l_writers) {
+                        if (flags & LDLM_FL_WARN) {
+                                LDLM_ERROR(lock, "lock in use");
+                                LBUG();
+                        }
+                }
+                if (opaque != NULL && lock->l_data != opaque) {
+                        LDLM_ERROR(lock, "data %p doesn't match opaque %p",
+                                   lock->l_data, opaque);
+                        LBUG();
+                }
 
                 /* See CBPENDING comment in ldlm_cancel_lru */
                 lock->l_flags |= LDLM_FL_CBPENDING;
@@ -653,9 +723,10 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns,
  *
  * If flags & LDLM_FL_LOCAL_ONLY, throw the locks away without trying
  * to notify the server.
- * If flags & LDLM_FL_NO_CALLBACK, don't run the cancel callback. */
+ * If flags & LDLM_FL_NO_CALLBACK, don't run the cancel callback.
+ * If flags & LDLM_FL_WARN, print a warning if some locks are still in use. */
 int ldlm_cli_cancel_unused(struct ldlm_namespace *ns,
-                           struct ldlm_res_id *res_id, int flags)
+                           struct ldlm_res_id *res_id, int flags, void *opaque)
 {
         int i;
         ENTRY;
@@ -664,7 +735,8 @@ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns,
                 RETURN(ELDLM_OK);
 
         if (res_id)
-                RETURN(ldlm_cli_cancel_unused_resource(ns, *res_id, flags));
+                RETURN(ldlm_cli_cancel_unused_resource(ns, *res_id, flags,
+                                                       opaque));
 
         l_lock(&ns->ns_lock);
         for (i = 0; i < RES_HASH_SIZE; i++) {
@@ -676,7 +748,7 @@ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns,
                         ldlm_resource_getref(res);
 
                         rc = ldlm_cli_cancel_unused_resource(ns, res->lr_name,
-                                                             flags);
+                                                             flags, opaque);
 
                         if (rc)
                                 CERROR("cancel_unused_res ("LPU64"): %d\n",
@@ -827,7 +899,7 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
         /* We're part of recovery, so don't wait for it. */
         req->rq_level = LUSTRE_CONN_RECOVD;
         
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
         ldlm_lock2desc(lock, &body->lock_desc);
         body->lock_flags = flags;
 
@@ -839,8 +911,14 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
         rc = ptlrpc_queue_wait(req);
         if (rc != ELDLM_OK)
                 GOTO(out, rc);
-
-        reply = lustre_msg_buf(req->rq_repmsg, 0);
+        
+        reply = lustre_swab_repbuf(req, 0, sizeof (*reply),
+                                   lustre_swab_ldlm_reply);
+        if (reply == NULL) {
+                CERROR("Can't unpack ldlm_reply\n");
+                GOTO (out, rc = -EPROTO);
+        }
+        
         memcpy(&lock->l_remote_handle, &reply->lock_handle,
                sizeof(lock->l_remote_handle));
         LDLM_DEBUG(lock, "replayed lock:");
index 0f9f4e2..84fdecc 100644 (file)
@@ -103,7 +103,7 @@ void ldlm_proc_namespace(struct ldlm_namespace *ns)
 #endif
 #undef MAX_STRING_SIZE
 
-#define LDLM_MAX_UNUSED 20
+#define LDLM_MAX_UNUSED 100
 struct ldlm_namespace *ldlm_namespace_new(char *name, __u32 client)
 {
         struct ldlm_namespace *ns = NULL;
@@ -280,13 +280,6 @@ int ldlm_namespace_free(struct ldlm_namespace *ns)
         return ELDLM_OK;
 }
 
-int ldlm_client_free(struct obd_export *exp)
-{
-        struct ldlm_export_data *led = &exp->exp_ldlm_data;
-        ptlrpc_cleanup_client(&led->led_import);
-        RETURN(0);
-}
-
 static __u32 ldlm_hash_fn(struct ldlm_resource *parent, struct ldlm_res_id name)
 {
         __u32 hash = 0;
@@ -304,7 +297,7 @@ static struct ldlm_resource *ldlm_resource_new(void)
 {
         struct ldlm_resource *res;
 
-        res = kmem_cache_alloc(ldlm_resource_slab, SLAB_KERNEL);
+        OBD_SLAB_ALLOC(res, ldlm_resource_slab, SLAB_KERNEL, sizeof *res);
         if (res == NULL) {
                 LBUG();
                 return NULL;
@@ -461,8 +454,7 @@ int ldlm_resource_putref(struct ldlm_resource *res)
                 list_del_init(&res->lr_hash);
                 list_del_init(&res->lr_childof);
 
-                POISON(res, 0x5a, sizeof(*res));
-                kmem_cache_free(ldlm_resource_slab, res);
+                OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res);
                 l_unlock(&ns->ns_lock);
 
                 spin_lock(&ns->ns_counter_lock);
diff --git a/lustre/lib/Makefile.am b/lustre/lib/Makefile.am
deleted file mode 100644 (file)
index 1bcc388..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-EXTRA_DIST = mds_updates.c obd_pack.c  simple.c
-EXTRA_DIST += client.c target.c
-
-include $(top_srcdir)/Rules
diff --git a/lustre/lib/client.c b/lustre/lib/client.c
deleted file mode 100644 (file)
index ae490d9..0000000
+++ /dev/null
@@ -1,406 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
- *   Author: Peter J. Braam <braam@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
- *   Author: Mike Shaver <shaver@clusterfs.com>
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Client-common OBD method implementations and utility functions.
- */
-
-#define EXPORT_SYMTAB
-#define DEBUG_SUBSYSTEM S_OST /* XXX WRONG */
-
-#ifdef __KERNEL__
-#include <linux/module.h>
-#else 
-#include <liblustre.h>
-#endif
-
-#include <linux/obd.h>
-#include <linux/obd_ost.h>
-#include <linux/lustre_net.h>
-#include <linux/lustre_dlm.h>
-
-struct client_obd *client_conn2cli(struct lustre_handle *conn)
-{
-        struct obd_export *export = class_conn2export(conn);
-        if (!export)
-                LBUG();
-        return &export->exp_obd->u.cli;
-}
-
-struct obd_device *client_tgtuuid2obd(struct obd_uuid *tgtuuid)
-{
-        int i;
-
-        for (i = 0; i < MAX_OBD_DEVICES; i++) {
-                struct obd_device *obd = &obd_dev[i];
-                if ((strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0) ||
-                    (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0)) {
-                        struct client_obd *cli = &obd->u.cli;
-                        if (strncmp(tgtuuid->uuid, cli->cl_target_uuid.uuid,
-                                    sizeof(cli->cl_target_uuid.uuid)) == 0)
-                                return obd;
-                }
-        }
-
-        return NULL;
-}
-
-int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
-{
-        struct obd_ioctl_data* data = buf;
-        int rq_portal, rp_portal;
-        char *name;
-        struct client_obd *cli = &obddev->u.cli;
-        struct obd_import *imp = &cli->cl_import;
-        struct obd_uuid server_uuid;
-        ENTRY;
-
-        if (obddev->obd_type->typ_ops->o_brw) {
-                rq_portal = OST_REQUEST_PORTAL;
-                rp_portal = OSC_REPLY_PORTAL;
-                name = "osc";
-        } else {
-                rq_portal = MDS_REQUEST_PORTAL;
-                rp_portal = MDC_REPLY_PORTAL;
-                name = "mdc";
-        }
-
-        if (data->ioc_inllen1 < 1) {
-                CERROR("requires a TARGET UUID\n");
-                RETURN(-EINVAL);
-        }
-
-        if (data->ioc_inllen1 > 37) {
-                CERROR("client UUID must be less than 38 characters\n");
-                RETURN(-EINVAL);
-        }
-
-        if (data->ioc_inllen2 < 1) {
-                CERROR("setup requires a SERVER UUID\n");
-                RETURN(-EINVAL);
-        }
-
-        if (data->ioc_inllen2 > 37) {
-                CERROR("target UUID must be less than 38 characters\n");
-                RETURN(-EINVAL);
-        }
-
-        sema_init(&cli->cl_sem, 1);
-        cli->cl_conn_count = 0;
-        memcpy(cli->cl_target_uuid.uuid, data->ioc_inlbuf1, data->ioc_inllen1);
-        memcpy(server_uuid.uuid, data->ioc_inlbuf2, MIN(data->ioc_inllen2,
-                                                   sizeof(server_uuid)));
-
-        imp->imp_connection = ptlrpc_uuid_to_connection(&server_uuid);
-        if (!imp->imp_connection)
-                RETURN(-ENOENT);
-
-        INIT_LIST_HEAD(&imp->imp_replay_list);
-        INIT_LIST_HEAD(&imp->imp_sending_list);
-        INIT_LIST_HEAD(&imp->imp_delayed_list);
-        spin_lock_init(&imp->imp_lock);
-
-        ptlrpc_init_client(rq_portal, rp_portal, name,
-                           &obddev->obd_ldlm_client);
-        imp->imp_client = &obddev->obd_ldlm_client;
-        imp->imp_obd = obddev;
-
-        cli->cl_max_mds_easize = sizeof(struct lov_mds_md);
-#if !defined(__KERNEL__) || (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        cli->cl_sandev = 0;
-#else
-        cli->cl_sandev.value = 0;
-#endif
-
-        RETURN(0);
-}
-
-#ifdef __KERNEL__
-/* convert a pathname into a kdev_t */
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-static kdev_t path2dev(char *path)
-{
-        struct dentry *dentry;
-        struct nameidata nd;
-        kdev_t dev = 0;
-
-        if (!path_init(path, LOOKUP_FOLLOW, &nd))
-                return 0;
-
-        if (path_walk(path, &nd))
-                return 0;
-
-        dentry = nd.dentry;
-        if (dentry->d_inode && !is_bad_inode(dentry->d_inode) &&
-            S_ISBLK(dentry->d_inode->i_mode))
-                dev = dentry->d_inode->i_rdev;
-        path_release(&nd);
-
-        return dev;
-}
-#else
-static int path2dev(char *path)
-{
-        struct dentry *dentry;
-        struct nameidata nd;
-        int dev = 0;
-
-        if (!path_init(path, LOOKUP_FOLLOW, &nd))
-                return 0;
-
-        if (path_walk(path, &nd))
-                return 0;
-
-        dentry = nd.dentry;
-        if (dentry->d_inode && !is_bad_inode(dentry->d_inode) &&
-            S_ISBLK(dentry->d_inode->i_mode))
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-                dev = dentry->d_inode->i_rdev;
-#else
-                dev = dentry->d_inode->i_rdev.value;
-#endif
-        path_release(&nd);
-
-        return dev;
-}
-#endif
-
-int client_sanobd_setup(struct obd_device *obddev, obd_count len, void *buf)
-{
-        struct obd_ioctl_data* data = buf;
-        struct client_obd *cli = &obddev->u.cli;
-        struct obd_import *imp = &cli->cl_import;
-        struct obd_uuid server_uuid;
-        ENTRY;
-
-        if (data->ioc_inllen1 < 1) {
-                CERROR("requires a TARGET UUID\n");
-                RETURN(-EINVAL);
-        }
-
-        if (data->ioc_inllen1 > 37) {
-                CERROR("client UUID must be less than 38 characters\n");
-                RETURN(-EINVAL);
-        }
-
-        if (data->ioc_inllen2 < 1) {
-                CERROR("setup requires a SERVER UUID\n");
-                RETURN(-EINVAL);
-        }
-
-        if (data->ioc_inllen2 > 37) {
-                CERROR("target UUID must be less than 38 characters\n");
-                RETURN(-EINVAL);
-        }
-
-        if (data->ioc_inllen3 < 1) {
-                CERROR("setup requires a SAN device pathname\n");
-                RETURN(-EINVAL);
-        }
-
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        cli->cl_sandev = path2dev(data->ioc_inlbuf3);
-        if (!cli->cl_sandev) {
-                CERROR("%s seems not a valid SAN device\n", data->ioc_inlbuf3);
-                RETURN(-EINVAL);
-        }
-#else
-        cli->cl_sandev.value = path2dev(data->ioc_inlbuf3);
-        if (!cli->cl_sandev.value) {
-                CERROR("%s seems not a valid SAN device\n", data->ioc_inlbuf3);
-                RETURN(-EINVAL);
-        }
-#endif
-
-        sema_init(&cli->cl_sem, 1);
-        cli->cl_conn_count = 0;
-        memcpy(cli->cl_target_uuid.uuid, data->ioc_inlbuf1, data->ioc_inllen1);
-        memcpy(server_uuid.uuid, data->ioc_inlbuf2, MIN(data->ioc_inllen2,
-                                                   sizeof(server_uuid)));
-
-        imp->imp_connection = ptlrpc_uuid_to_connection(&server_uuid);
-        if (!imp->imp_connection)
-                RETURN(-ENOENT);
-        
-        INIT_LIST_HEAD(&imp->imp_replay_list);
-        INIT_LIST_HEAD(&imp->imp_sending_list);
-        INIT_LIST_HEAD(&imp->imp_delayed_list);
-        spin_lock_init(&imp->imp_lock);
-
-        ptlrpc_init_client(OST_REQUEST_PORTAL, OSC_REPLY_PORTAL,
-                           "sanosc", &obddev->obd_ldlm_client);
-        imp->imp_client = &obddev->obd_ldlm_client;
-        imp->imp_obd = obddev;
-
-        cli->cl_max_mds_easize = sizeof(struct lov_mds_md);
-
-        RETURN(0);
-}
-#endif
-
-int client_obd_cleanup(struct obd_device * obddev)
-{
-        struct client_obd *obd = &obddev->u.cli;
-
-        ptlrpc_cleanup_client(&obd->cl_import);
-        ptlrpc_put_connection(obd->cl_import.imp_connection);
-
-        return 0;
-}
-
-int client_obd_connect(struct lustre_handle *conn, struct obd_device *obd,
-                       struct obd_uuid *cluuid, struct recovd_obd *recovd,
-                       ptlrpc_recovery_cb_t recover)
-{
-        struct client_obd *cli = &obd->u.cli;
-        struct ptlrpc_request *request;
-        int rc, size[] = {sizeof(cli->cl_target_uuid),
-                          sizeof(obd->obd_uuid) };
-        char *tmp[] = {cli->cl_target_uuid.uuid, obd->obd_uuid.uuid};
-        int rq_opc = (obd->obd_type->typ_ops->o_brw) ? OST_CONNECT :MDS_CONNECT;
-        struct ptlrpc_connection *c;
-        struct obd_import *imp = &cli->cl_import;
-        int msg_flags;
-
-        ENTRY;
-        down(&cli->cl_sem);
-        rc = class_connect(conn, obd, cluuid);
-        if (rc)
-                GOTO(out_sem, rc);
-
-        cli->cl_conn_count++;
-        if (cli->cl_conn_count > 1)
-                GOTO(out_sem, rc);
-
-        if (obd->obd_namespace != NULL)
-                CERROR("already have namespace!\n");
-        obd->obd_namespace = ldlm_namespace_new(obd->obd_name,
-                                                LDLM_NAMESPACE_CLIENT);
-        if (obd->obd_namespace == NULL)
-                GOTO(out_disco, rc = -ENOMEM);
-
-        INIT_LIST_HEAD(&imp->imp_chain);
-        imp->imp_max_transno = 0;
-        imp->imp_peer_committed_transno = 0;
-
-        request = ptlrpc_prep_req(&cli->cl_import, rq_opc, 2, size, tmp);
-        if (!request)
-                GOTO(out_ldlm, rc = -ENOMEM);
-
-        request->rq_level = LUSTRE_CONN_NEW;
-        request->rq_replen = lustre_msg_size(0, NULL);
-        request->rq_reqmsg->addr = conn->addr;
-        request->rq_reqmsg->cookie = conn->cookie;
-        c = class_conn2export(conn)->exp_connection =
-                ptlrpc_connection_addref(request->rq_connection);
-        list_add(&imp->imp_chain, &c->c_imports);
-        recovd_conn_manage(c, recovd, recover);
-
-        imp->imp_level = LUSTRE_CONN_CON;
-        rc = ptlrpc_queue_wait(request);
-        if (rc)
-                GOTO(out_req, rc);
-
-        msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
-        if (rq_opc == MDS_CONNECT || msg_flags & MSG_CONNECT_REPLAYABLE) {
-                imp->imp_flags |= IMP_REPLAYABLE;
-                CDEBUG(D_HA, "connected to replayable target: %s\n", cli->cl_target_uuid.uuid);
-        }
-        imp->imp_level = LUSTRE_CONN_FULL;
-        imp->imp_handle.addr = request->rq_repmsg->addr;
-        imp->imp_handle.cookie = request->rq_repmsg->cookie;
-
-        EXIT;
-out_req:
-        ptlrpc_req_finished(request);
-        if (rc) {
-out_ldlm:
-                ldlm_namespace_free(obd->obd_namespace);
-                obd->obd_namespace = NULL;
-out_disco:
-                cli->cl_conn_count--;
-                class_disconnect(conn);
-        }
-out_sem:
-        up(&cli->cl_sem);
-        return rc;
-}
-
-int client_obd_disconnect(struct lustre_handle *conn)
-{
-        struct obd_device *obd = class_conn2obd(conn);
-        struct client_obd *cli = &obd->u.cli;
-        int rq_opc;
-        struct ptlrpc_request *request = NULL;
-        int rc, err;
-        ENTRY;
-
-        if (!obd) {
-                CERROR("invalid connection for disconnect: addr "LPX64
-                       ", cookie "LPX64"\n", conn ? conn->addr : -1UL,
-                       conn ? conn->cookie : -1UL);
-                RETURN(-EINVAL);
-        }
-
-        rq_opc = obd->obd_type->typ_ops->o_brw ? OST_DISCONNECT:MDS_DISCONNECT;
-        down(&cli->cl_sem);
-        if (!cli->cl_conn_count) {
-                CERROR("disconnecting disconnected device (%s)\n",
-                       obd->obd_name);
-                GOTO(out_sem, rc = -EINVAL);
-        }
-
-        cli->cl_conn_count--;
-        if (cli->cl_conn_count)
-                GOTO(out_no_disconnect, rc = 0);
-
-        if (obd->obd_namespace != NULL) {
-                ldlm_cli_cancel_unused(obd->obd_namespace, NULL, 0);
-                ldlm_namespace_free(obd->obd_namespace);
-                obd->obd_namespace = NULL;
-        }
-        request = ptlrpc_prep_req(&cli->cl_import, rq_opc, 0, NULL, NULL);
-        if (!request)
-                GOTO(out_req, rc = -ENOMEM);
-
-        request->rq_replen = lustre_msg_size(0, NULL);
-
-        /* Process disconnects even if we're waiting for recovery. */
-        request->rq_level = LUSTRE_CONN_RECOVD;
-
-        rc = ptlrpc_queue_wait(request);
-        if (rc)
-                GOTO(out_req, rc);
-
-        EXIT;
- out_req:
-        if (request)
-                ptlrpc_req_finished(request);
-        list_del_init(&cli->cl_import.imp_chain);
- out_no_disconnect:
-        err = class_disconnect(conn);
-        if (!rc && err)
-                rc = err;
- out_sem:
-        up(&cli->cl_sem);
-        RETURN(rc);
-}
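The connect/disconnect pair being removed here keeps a per-device connection count: only the first connect creates the LDLM namespace and sends the connect RPC, and only the last disconnect cancels cached locks and tears the state down. Below is a minimal, self-contained sketch of that reference-counting pattern; the struct and function names are illustrative stand-ins, not Lustre symbols.

#include <stdio.h>
#include <stdlib.h>

struct client_state {
        int   conn_count;
        char *namespace;        /* stands in for the client LDLM namespace */
};

static int client_connect(struct client_state *cs)
{
        if (cs->conn_count++ == 0) {
                /* first user of this device: set up shared state */
                cs->namespace = malloc(64);
                if (cs->namespace == NULL) {
                        cs->conn_count--;
                        return -1;
                }
        }
        return 0;
}

static int client_disconnect(struct client_state *cs)
{
        if (cs->conn_count == 0)
                return -1;              /* disconnecting a disconnected device */
        if (--cs->conn_count > 0)
                return 0;               /* other users remain; keep state */
        /* last user: tear down shared state */
        free(cs->namespace);
        cs->namespace = NULL;
        return 0;
}

int main(void)
{
        struct client_state cs = { 0, NULL };

        client_connect(&cs);
        client_connect(&cs);
        client_disconnect(&cs);         /* namespace kept, count drops to 1 */
        client_disconnect(&cs);         /* namespace freed, count drops to 0 */
        printf("final conn_count = %d\n", cs.conn_count);
        return 0;
}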
diff --git a/lustre/lib/mds_updates.c b/lustre/lib/mds_updates.c
deleted file mode 100644 (file)
index aa666ad..0000000
+++ /dev/null
@@ -1,604 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Lustre Lite Update Records
- *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/version.h>
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-#include <linux/locks.h>   // for wait_on_buffer
-#else
-#include <linux/buffer_head.h>   // for wait_on_buffer
-#endif
-#include <linux/unistd.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-
-#include <linux/fs.h>
-#include <linux/stat.h>
-#include <asm/uaccess.h>
-#include <linux/slab.h>
-#include <asm/segment.h>
-
-#define DEBUG_SUBSYSTEM S_MDS
-
-#include <linux/obd_support.h>
-#include <linux/lustre_lib.h>
-#include <linux/lustre_mds.h>
-#include <linux/lustre_lite.h>
-
-void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode)
-{
-        fid->id = HTON__u64(inode->i_ino);
-        fid->generation = HTON__u32(inode->i_generation);
-        fid->f_type = HTON__u32(S_IFMT & inode->i_mode);
-}
-
-void mds_pack_inode2body(struct mds_body *b, struct inode *inode)
-{
-        b->valid = OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME |
-                OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
-                OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLTYPE | OBD_MD_FLMODE |
-                OBD_MD_FLNLINK | OBD_MD_FLGENER;
-
-        /* The MDS file size isn't authoritative for regular files, so don't
-         * even pretend. */
-        if (S_ISREG(inode->i_mode))
-                b->valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
-
-        b->ino = HTON__u32(inode->i_ino);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        b->atime = HTON__u32(inode->i_atime);
-        b->mtime = HTON__u32(inode->i_mtime);
-        b->ctime = HTON__u32(inode->i_ctime);
-#else
-        b->atime = HTON__u32(inode->i_atime.tv_sec);
-        b->mtime = HTON__u32(inode->i_mtime.tv_sec);
-        b->ctime = HTON__u32(inode->i_ctime.tv_sec);
-#endif
-        b->mode = HTON__u32(inode->i_mode);
-        b->size = HTON__u64(inode->i_size);
-        b->blocks = HTON__u64(inode->i_blocks);
-        b->uid = HTON__u32(inode->i_uid);
-        b->gid = HTON__u32(inode->i_gid);
-        b->flags = HTON__u32(inode->i_flags);
-        b->rdev = HTON__u32(b->rdev);
-        b->nlink = HTON__u32(inode->i_nlink);
-        b->generation = HTON__u32(inode->i_generation);
-        b->suppgid = HTON__u32(-1);
-}
-
-
-void mds_pack_fid(struct ll_fid *fid)
-{
-        fid->id = HTON__u64(fid->id);
-        fid->generation = HTON__u32(fid->generation);
-        fid->f_type = HTON__u32(fid->f_type);
-}
-
-static void mds_pack_body(struct mds_body *b)
-{
-        if (b == NULL)
-                LBUG();
-
-        b->fsuid = HTON__u32(current->fsuid);
-        b->fsgid = HTON__u32(current->fsgid);
-        b->capability = HTON__u32(current->cap_effective);
-
-        mds_pack_fid(&b->fid1);
-        mds_pack_fid(&b->fid2);
-        b->size = HTON__u64(b->size);
-        b->ino = HTON__u32(b->ino);
-        b->valid = HTON__u32(b->valid);
-        b->mode = HTON__u32(b->mode);
-        b->uid = HTON__u32(b->uid);
-        b->gid = HTON__u32(b->gid);
-        b->mtime = HTON__u32(b->mtime);
-        b->ctime = HTON__u32(b->ctime);
-        b->atime = HTON__u32(b->atime);
-        b->flags = HTON__u32(b->flags);
-        b->rdev = HTON__u32(b->rdev);
-        b->nlink = HTON__u32(b->nlink);
-        b->generation = HTON__u32(b->generation);
-        b->suppgid = HTON__u32(b->suppgid);
-}
-
-void mds_getattr_pack(struct ptlrpc_request *req, int valid, int offset,
-                      int flags,
-                      struct inode *inode, const char *name, int namelen)
-{
-        struct mds_body *b;
-        b = lustre_msg_buf(req->rq_reqmsg, offset);
-
-        b->fsuid = HTON__u32(current->fsuid);
-        b->fsgid = HTON__u32(current->fsgid);
-        b->capability = HTON__u32(current->cap_effective);
-        b->valid = HTON__u32(valid);
-        b->flags = HTON__u32(flags);
-        if (in_group_p(inode->i_gid))
-                b->suppgid = HTON__u32(inode->i_gid);
-        else
-                b->suppgid = HTON__u32(-1);
-
-        ll_inode2fid(&b->fid1, inode);
-        if (name) {
-                char *tmp;
-                tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1);
-                LOGL0(name, namelen, tmp);
-        }
-}
-
-void mds_readdir_pack(struct ptlrpc_request *req, __u64 offset,
-                      obd_id ino, int type, __u64 xid)
-{
-        struct mds_body *b;
-
-        b = lustre_msg_buf(req->rq_reqmsg, 0);
-        b->fsuid = HTON__u32(current->fsuid);
-        b->fsgid = HTON__u32(current->fsgid);
-        b->capability = HTON__u32(current->cap_effective);
-        b->fid1.id = HTON__u64(ino);
-        b->fid1.f_type = HTON__u32(type);
-        b->size = HTON__u64(offset);
-        b->suppgid = HTON__u32(-1);
-        b->blocks = HTON__u64(xid);
-}
-
-
-void mds_pack_req_body(struct ptlrpc_request *req)
-{
-        struct mds_body *b = lustre_msg_buf(req->rq_reqmsg, 0);
-        mds_pack_body(b);
-}
-
-void mds_pack_rep_body(struct ptlrpc_request *req)
-{
-        struct mds_body *b = lustre_msg_buf(req->rq_repmsg, 0);
-        mds_pack_body(b);
-}
-
-
-/* packing of MDS records */
-void mds_create_pack(struct ptlrpc_request *req, int offset, struct inode *dir,
-                     __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time,
-                     const char *name, int namelen,
-                     const void *data, int datalen)
-{
-        struct mds_rec_create *rec;
-        char *tmp;
-        rec = lustre_msg_buf(req->rq_reqmsg, offset);
-
-        rec->cr_opcode = HTON__u32(REINT_CREATE);
-        rec->cr_fsuid = HTON__u32(current->fsuid);
-        rec->cr_fsgid = HTON__u32(current->fsgid);
-        rec->cr_cap = HTON__u32(current->cap_effective);
-        ll_inode2fid(&rec->cr_fid, dir);
-        memset(&rec->cr_replayfid, 0, sizeof(rec->cr_replayfid));
-        rec->cr_mode = HTON__u32(mode);
-        rec->cr_rdev = HTON__u64(rdev);
-        rec->cr_uid = HTON__u32(uid);
-        rec->cr_gid = HTON__u32(gid);
-        rec->cr_time = HTON__u64(time);
-        if (in_group_p(dir->i_gid))
-                rec->cr_suppgid = HTON__u32(dir->i_gid);
-        else
-                rec->cr_suppgid = HTON__u32(-1);
-
-        tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1);
-        LOGL0(name, namelen, tmp);
-
-        if (data) {
-                tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2);
-                LOGL0(data, datalen, tmp);
-        }
-}
-/* packing of MDS records */
-void mds_open_pack(struct ptlrpc_request *req, int offset, struct inode *dir,
-                     __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time,
-                     __u32 flags,
-                     const char *name, int namelen,
-                     const void *data, int datalen)
-{
-        struct mds_rec_create *rec;
-        char *tmp;
-        rec = lustre_msg_buf(req->rq_reqmsg, offset);
-
-        /* XXX do something about time, uid, gid */
-        rec->cr_opcode = HTON__u32(REINT_OPEN);
-        rec->cr_fsuid = HTON__u32(current->fsuid);
-        rec->cr_fsgid = HTON__u32(current->fsgid);
-        rec->cr_cap = HTON__u32(current->cap_effective);
-        ll_inode2fid(&rec->cr_fid, dir);
-        memset(&rec->cr_replayfid, 0, sizeof(rec->cr_replayfid));
-        rec->cr_mode = HTON__u32(mode);
-        rec->cr_flags = HTON__u32(flags);
-        rec->cr_rdev = HTON__u64(rdev);
-        rec->cr_uid = HTON__u32(uid);
-        rec->cr_gid = HTON__u32(gid);
-        rec->cr_time = HTON__u64(time);
-        if (in_group_p(dir->i_gid))
-                rec->cr_suppgid = HTON__u32(dir->i_gid);
-        else
-                rec->cr_suppgid = HTON__u32(-1);
-
-        tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1);
-        LOGL0(name, namelen, tmp);
-
-        if (data) {
-                tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2);
-                LOGL0(data, datalen, tmp);
-        }
-}
-
-void mds_setattr_pack(struct ptlrpc_request *req,
-                      struct inode *inode, struct iattr *iattr,
-                      void *ea, int ealen)
-{
-        struct mds_rec_setattr *rec = lustre_msg_buf(req->rq_reqmsg, 0);
-
-        rec->sa_opcode = HTON__u32(REINT_SETATTR);
-        rec->sa_fsuid = HTON__u32(current->fsuid);
-        rec->sa_fsgid = HTON__u32(current->fsgid);
-        rec->sa_cap = HTON__u32(current->cap_effective);
-        ll_inode2fid(&rec->sa_fid, inode);
-
-        if (iattr) {
-                rec->sa_valid = HTON__u32(iattr->ia_valid);
-                rec->sa_mode = HTON__u32(iattr->ia_mode);
-                rec->sa_uid = HTON__u32(iattr->ia_uid);
-                rec->sa_gid = HTON__u32(iattr->ia_gid);
-                rec->sa_size = HTON__u64(iattr->ia_size);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-                rec->sa_atime = HTON__u64(iattr->ia_atime);
-                rec->sa_mtime = HTON__u64(iattr->ia_mtime);
-                rec->sa_ctime = HTON__u64(iattr->ia_ctime);
-#else
-                rec->sa_atime = HTON__u64(iattr->ia_atime.tv_sec);
-                rec->sa_mtime = HTON__u64(iattr->ia_mtime.tv_sec);
-                rec->sa_ctime = HTON__u64(iattr->ia_ctime.tv_sec);
-#endif
-                rec->sa_attr_flags = HTON__u32(iattr->ia_attr_flags);
-
-                if ((iattr->ia_valid & ATTR_GID) && in_group_p(iattr->ia_gid))
-                        rec->sa_suppgid = HTON__u32(iattr->ia_gid);
-                else if ((iattr->ia_valid & ATTR_MODE) &&
-                         in_group_p(inode->i_gid))
-                        rec->sa_suppgid = HTON__u32(inode->i_gid);
-                else
-                        rec->sa_suppgid = HTON__u32(-1);
-        }
-
-        if (ealen)
-                memcpy(lustre_msg_buf(req->rq_reqmsg, 1), ea, ealen);
-}
-
-void mds_unlink_pack(struct ptlrpc_request *req, int offset,
-                     struct inode *inode, struct inode *child, __u32 mode,
-                     const char *name, int namelen)
-{
-        struct mds_rec_unlink *rec;
-        char *tmp;
-
-        rec = lustre_msg_buf(req->rq_reqmsg, offset);
-
-        rec->ul_opcode = HTON__u32(REINT_UNLINK);
-        rec->ul_fsuid = HTON__u32(current->fsuid);
-        rec->ul_fsgid = HTON__u32(current->fsgid);
-        rec->ul_cap = HTON__u32(current->cap_effective);
-        rec->ul_mode = HTON__u32(mode);
-        if (in_group_p(inode->i_gid))
-                rec->ul_suppgid = HTON__u32(inode->i_gid);
-        else
-                rec->ul_suppgid = HTON__u32(-1);
-        ll_inode2fid(&rec->ul_fid1, inode);
-        if (child)
-                ll_inode2fid(&rec->ul_fid2, child);
-
-        tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1);
-        LOGL0(name, namelen, tmp);
-}
-
-void mds_link_pack(struct ptlrpc_request *req, int offset,
-                   struct inode *inode, struct inode *dir,
-                   const char *name, int namelen)
-{
-        struct mds_rec_link *rec;
-        char *tmp;
-
-        rec = lustre_msg_buf(req->rq_reqmsg, offset);
-
-        rec->lk_opcode = HTON__u32(REINT_LINK);
-        rec->lk_fsuid = HTON__u32(current->fsuid);
-        rec->lk_fsgid = HTON__u32(current->fsgid);
-        rec->lk_cap = HTON__u32(current->cap_effective);
-        if (in_group_p(dir->i_gid))
-                rec->lk_suppgid = HTON__u32(dir->i_gid);
-        else
-                rec->lk_suppgid = HTON__u32(-1);
-        ll_inode2fid(&rec->lk_fid1, inode);
-        ll_inode2fid(&rec->lk_fid2, dir);
-
-        tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1);
-        LOGL0(name, namelen, tmp);
-}
-
-void mds_rename_pack(struct ptlrpc_request *req, int offset,
-                     struct inode *srcdir, struct inode *tgtdir,
-                     const char *old, int oldlen, const char *new, int newlen)
-{
-        struct mds_rec_rename *rec;
-        char *tmp;
-
-        rec = lustre_msg_buf(req->rq_reqmsg, offset);
-
-        /* XXX do something about time, uid, gid */
-        rec->rn_opcode = HTON__u32(REINT_RENAME);
-        rec->rn_fsuid = HTON__u32(current->fsuid);
-        rec->rn_fsgid = HTON__u32(current->fsgid);
-        rec->rn_cap = HTON__u32(current->cap_effective);
-        if (in_group_p(srcdir->i_gid))
-                rec->rn_suppgid1 = HTON__u32(srcdir->i_gid);
-        else
-                rec->rn_suppgid1 = HTON__u32(-1);
-        if (in_group_p(tgtdir->i_gid))
-                rec->rn_suppgid2 = HTON__u32(tgtdir->i_gid);
-        else
-                rec->rn_suppgid2 = HTON__u32(-1);
-        ll_inode2fid(&rec->rn_fid1, srcdir);
-        ll_inode2fid(&rec->rn_fid2, tgtdir);
-
-        tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1);
-        LOGL0(old, oldlen, tmp);
-
-        if (new) {
-                tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2);
-                LOGL0(new, newlen, tmp);
-        }
-}
-
-/* unpacking */
-void mds_unpack_fid(struct ll_fid *fid)
-{
-        fid->id = NTOH__u64(fid->id);
-        fid->generation = NTOH__u32(fid->generation);
-        fid->f_type = NTOH__u32(fid->f_type);
-}
-
-void mds_unpack_body(struct mds_body *b)
-{
-        if (b == NULL)
-                LBUG();
-
-        mds_unpack_fid(&b->fid1);
-        mds_unpack_fid(&b->fid2);
-        b->size = NTOH__u64(b->size);
-        b->blocks = NTOH__u64(b->blocks);
-        b->valid = NTOH__u32(b->valid);
-        b->fsuid = NTOH__u32(b->fsuid);
-        b->fsgid = NTOH__u32(b->fsgid);
-        b->capability = NTOH__u32(b->capability);
-        b->ino = NTOH__u32(b->ino);
-        b->mode = NTOH__u32(b->mode);
-        b->uid = NTOH__u32(b->uid);
-        b->gid = NTOH__u32(b->gid);
-        b->mtime = NTOH__u32(b->mtime);
-        b->ctime = NTOH__u32(b->ctime);
-        b->atime = NTOH__u32(b->atime);
-        b->flags = NTOH__u32(b->flags);
-        b->rdev = NTOH__u32(b->rdev);
-        b->nlink = NTOH__u32(b->nlink);
-        b->generation = NTOH__u32(b->generation);
-        b->suppgid = NTOH__u32(b->suppgid);
-}
-
-static int mds_setattr_unpack(struct ptlrpc_request *req, int offset,
-                              struct mds_update_record *r)
-{
-        struct iattr *attr = &r->ur_iattr;
-        struct mds_rec_setattr *rec = lustre_msg_buf(req->rq_reqmsg, offset);
-        ENTRY;
-
-        if (req->rq_reqmsg->bufcount < offset + 1 ||
-            req->rq_reqmsg->buflens[offset] != sizeof(*rec))
-                RETURN(-EFAULT);
-
-        r->ur_fsuid = NTOH__u32(rec->sa_fsuid);
-        r->ur_fsgid = NTOH__u32(rec->sa_fsgid);
-        r->ur_cap = NTOH__u32(rec->sa_cap);
-        r->ur_suppgid1 = NTOH__u32(rec->sa_suppgid);
-        r->ur_suppgid2 = NTOH__u32(-1);
-        r->ur_fid1 = &rec->sa_fid;
-        attr->ia_valid = NTOH__u32(rec->sa_valid);
-        attr->ia_mode = NTOH__u32(rec->sa_mode);
-        attr->ia_uid = NTOH__u32(rec->sa_uid);
-        attr->ia_gid = NTOH__u32(rec->sa_gid);
-        attr->ia_size = NTOH__u64(rec->sa_size);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        attr->ia_atime = NTOH__u64(rec->sa_atime);
-        attr->ia_mtime = NTOH__u64(rec->sa_mtime);
-        attr->ia_ctime = NTOH__u64(rec->sa_ctime);
-#else
-        attr->ia_atime.tv_sec = NTOH__u64(rec->sa_atime);
-        attr->ia_mtime.tv_sec = NTOH__u64(rec->sa_mtime);
-        attr->ia_ctime.tv_sec = NTOH__u64(rec->sa_ctime);
-#endif
-        attr->ia_attr_flags = NTOH__u32(rec->sa_attr_flags);
-
-        if (req->rq_reqmsg->bufcount == offset + 2) {
-                r->ur_namelen = req->rq_reqmsg->buflens[offset + 1];
-                r->ur_name = lustre_msg_buf(req->rq_reqmsg, offset + 1);
-        } else {
-                r->ur_namelen = 0;
-        }
-
-        RETURN(0);
-}
-
-static int mds_create_unpack(struct ptlrpc_request *req, int offset,
-                             struct mds_update_record *r)
-{
-        struct mds_rec_create *rec = lustre_msg_buf(req->rq_reqmsg, offset);
-        ENTRY;
-
-        if (req->rq_reqmsg->bufcount < offset + 2 ||
-            req->rq_reqmsg->buflens[offset] != sizeof(*rec))
-                RETURN(-EFAULT);
-
-        r->ur_fsuid = NTOH__u32(rec->cr_fsuid);
-        r->ur_fsgid = NTOH__u32(rec->cr_fsgid);
-        r->ur_cap = NTOH__u32(rec->cr_cap);
-        r->ur_fid1 = &rec->cr_fid;
-        r->ur_fid2 = &rec->cr_replayfid;
-        r->ur_mode = NTOH__u32(rec->cr_mode);
-        r->ur_rdev = NTOH__u64(rec->cr_rdev);
-        r->ur_uid = NTOH__u32(rec->cr_uid);
-        r->ur_gid = NTOH__u32(rec->cr_gid);
-        r->ur_time = NTOH__u64(rec->cr_time);
-        r->ur_flags = NTOH__u32(rec->cr_flags);
-        r->ur_suppgid1 = NTOH__u32(rec->cr_suppgid);
-        r->ur_suppgid2 = NTOH__u32(-1);
-
-        r->ur_name = lustre_msg_buf(req->rq_reqmsg, offset + 1);
-        r->ur_namelen = req->rq_reqmsg->buflens[offset + 1];
-
-        if (req->rq_reqmsg->bufcount == offset + 3) {
-                r->ur_tgt = lustre_msg_buf(req->rq_reqmsg, offset + 2);
-                r->ur_tgtlen = req->rq_reqmsg->buflens[offset + 2];
-        } else {
-                r->ur_tgt = NULL;
-                r->ur_tgtlen = 0;
-        }
-        RETURN(0);
-}
-
-static int mds_link_unpack(struct ptlrpc_request *req, int offset,
-                           struct mds_update_record *r)
-{
-        struct mds_rec_link *rec = lustre_msg_buf(req->rq_reqmsg, offset);
-        ENTRY;
-
-        if (req->rq_reqmsg->bufcount != offset + 2 ||
-            req->rq_reqmsg->buflens[offset] != sizeof(*rec))
-                RETURN(-EFAULT);
-
-        r->ur_fsuid = NTOH__u32(rec->lk_fsuid);
-        r->ur_fsgid = NTOH__u32(rec->lk_fsgid);
-        r->ur_cap = NTOH__u32(rec->lk_cap);
-        r->ur_suppgid1 = NTOH__u32(rec->lk_suppgid);
-        r->ur_suppgid2 = NTOH__u32(-1);
-        r->ur_fid1 = &rec->lk_fid1;
-        r->ur_fid2 = &rec->lk_fid2;
-
-        r->ur_name = lustre_msg_buf(req->rq_reqmsg, offset + 1);
-        r->ur_namelen = req->rq_reqmsg->buflens[offset + 1];
-        RETURN(0);
-}
-
-static int mds_unlink_unpack(struct ptlrpc_request *req, int offset,
-                             struct mds_update_record *r)
-{
-        struct mds_rec_unlink *rec = lustre_msg_buf(req->rq_reqmsg, offset);
-        ENTRY;
-
-        if (req->rq_reqmsg->bufcount != offset + 2 ||
-            req->rq_reqmsg->buflens[offset] != sizeof(*rec))
-                RETURN(-EFAULT);
-
-        r->ur_fsuid = NTOH__u32(rec->ul_fsuid);
-        r->ur_fsgid = NTOH__u32(rec->ul_fsgid);
-        r->ur_cap = NTOH__u32(rec->ul_cap);
-        r->ur_mode = NTOH__u32(rec->ul_mode);
-        r->ur_suppgid1 = NTOH__u32(rec->ul_suppgid);
-        r->ur_suppgid2 = NTOH__u32(-1);
-        r->ur_fid1 = &rec->ul_fid1;
-        r->ur_fid2 = &rec->ul_fid2;
-
-        r->ur_name = lustre_msg_buf(req->rq_reqmsg, offset + 1);
-        r->ur_namelen = req->rq_reqmsg->buflens[offset + 1];
-        RETURN(0);
-}
-
-static int mds_rename_unpack(struct ptlrpc_request *req, int offset,
-                             struct mds_update_record *r)
-{
-        struct mds_rec_rename *rec = lustre_msg_buf(req->rq_reqmsg, offset);
-        ENTRY;
-
-        if (req->rq_reqmsg->bufcount != offset + 3 ||
-            req->rq_reqmsg->buflens[offset] != sizeof(*rec))
-                RETURN(-EFAULT);
-
-        r->ur_fsuid = NTOH__u32(rec->rn_fsuid);
-        r->ur_fsgid = NTOH__u32(rec->rn_fsgid);
-        r->ur_cap = NTOH__u32(rec->rn_cap);
-        r->ur_suppgid1 = NTOH__u32(rec->rn_suppgid1);
-        r->ur_suppgid2 = NTOH__u32(rec->rn_suppgid2);
-        r->ur_fid1 = &rec->rn_fid1;
-        r->ur_fid2 = &rec->rn_fid2;
-
-        r->ur_name = lustre_msg_buf(req->rq_reqmsg, offset + 1);
-        r->ur_namelen = req->rq_reqmsg->buflens[offset + 1];
-
-        r->ur_tgt = lustre_msg_buf(req->rq_reqmsg, offset + 2);
-        r->ur_tgtlen = req->rq_reqmsg->buflens[offset + 2];
-        RETURN(0);
-}
-
-typedef int (*update_unpacker)(struct ptlrpc_request *req, int offset,
-                               struct mds_update_record *r);
-
-static update_unpacker mds_unpackers[REINT_MAX + 1] = {
-        [REINT_SETATTR] mds_setattr_unpack,
-        [REINT_CREATE] mds_create_unpack,
-        [REINT_LINK] mds_link_unpack,
-        [REINT_UNLINK] mds_unlink_unpack,
-        [REINT_RENAME] mds_rename_unpack,
-        [REINT_OPEN] mds_create_unpack,
-};
-
-int mds_update_unpack(struct ptlrpc_request *req, int offset,
-                      struct mds_update_record *rec)
-{
-        __u32 *opcode = lustre_msg_buf(req->rq_reqmsg, offset);
-        int rc, realop;
-        ENTRY;
-
-        if (!opcode || req->rq_reqmsg->buflens[offset] < sizeof(*opcode))
-                RETURN(-EFAULT);
-
-        realop = rec->ur_opcode = NTOH__u32(*opcode);
-        realop &= REINT_OPCODE_MASK;
-
-        if (realop < 0 || realop > REINT_MAX) {
-                LBUG();
-                RETURN(-EFAULT);
-        }
-
-        rc = mds_unpackers[realop](req, offset, rec);
-        RETURN(rc);
-}
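All of the pack/unpack helpers in this deleted file follow one convention: each fixed-width field is converted to wire byte order with HTON__u32/HTON__u64 before sending and converted back with the matching NTOH__* on receipt. The following self-contained sketch shows that round trip using the standard <arpa/inet.h> conversions; the struct and the swap64 helper are illustrative only, not the Lustre wire format.

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

/* Illustrative record; the real mds_body carries many more fields. */
struct wire_rec {
        uint32_t ino;
        uint32_t mode;
        uint64_t size;
};

/* 64-bit host/network conversion built from htonl(); it is its own inverse. */
static uint64_t swap64(uint64_t v)
{
        if (htonl(1) == 1)      /* big-endian host: already network order */
                return v;
        return ((uint64_t)htonl((uint32_t)v) << 32) | htonl((uint32_t)(v >> 32));
}

static void rec_pack(struct wire_rec *r)        /* host -> wire, in place */
{
        r->ino  = htonl(r->ino);
        r->mode = htonl(r->mode);
        r->size = swap64(r->size);
}

static void rec_unpack(struct wire_rec *r)      /* wire -> host, in place */
{
        r->ino  = ntohl(r->ino);
        r->mode = ntohl(r->mode);
        r->size = swap64(r->size);
}

int main(void)
{
        struct wire_rec r = { 42, 0100644, 1ULL << 40 };

        rec_pack(&r);
        rec_unpack(&r);
        printf("ino=%u mode=%o size=%llu\n",
               r.ino, r.mode, (unsigned long long)r.size);
        return 0;
}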
diff --git a/lustre/lib/obd_pack.c b/lustre/lib/obd_pack.c
deleted file mode 100644 (file)
index c76ff32..0000000
+++ /dev/null
@@ -1,64 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * (Un)packing of OST requests
- *
- */
-
-#define DEBUG_SUBSYSTEM S_OST
-#ifndef __KERNEL__
-#include <liblustre.h>
-#endif
-
-#include <linux/obd_ost.h>
-#include <linux/lustre_net.h>
-
-void ost_pack_ioo(struct obd_ioobj *ioo, struct lov_stripe_md *lsm, int bufcnt)
-{
-        ioo->ioo_id = HTON__u64(lsm->lsm_object_id);
-        ioo->ioo_gr = HTON__u64(0);
-        ioo->ioo_type = HTON__u32(S_IFREG);
-        ioo->ioo_bufcnt = HTON__u32(bufcnt);
-}
-
-void ost_unpack_ioo(struct obd_ioobj *dst, struct obd_ioobj *src)
-{
-        dst->ioo_id = NTOH__u64(src->ioo_id);
-        dst->ioo_gr = NTOH__u64(src->ioo_gr);
-        dst->ioo_type = NTOH__u32(src->ioo_type);
-        dst->ioo_bufcnt = NTOH__u32(src->ioo_bufcnt);
-}
-
-void ost_pack_niobuf(struct niobuf_remote *nb, __u64 offset, __u32 len,
-                     __u32 flags, __u32 xid)
-{
-        nb->offset = HTON__u64(offset);
-        nb->len = HTON__u32(len);
-        nb->xid = HTON__u32(xid);
-        nb->flags = HTON__u32(flags);
-}
-
-void ost_unpack_niobuf(struct niobuf_remote *dst, struct niobuf_remote *src)
-{
-        dst->offset = NTOH__u64(src->offset);
-        dst->len = NTOH__u32(src->len);
-        dst->xid = NTOH__u32(src->xid);
-        dst->flags = NTOH__u32(src->flags);
-}
diff --git a/lustre/lib/target.c b/lustre/lib/target.c
deleted file mode 100644 (file)
index 82f1164..0000000
+++ /dev/null
@@ -1,524 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
- *   Author: Peter J. Braam <braam@clusterfs.com>
- *   Author: Phil Schwan <phil@clusterfs.com>
- *   Author: Mike Shaver <shaver@clusterfs.com>
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Target-common OBD method implementations and utility functions.
- */
-
-#define EXPORT_SYMTAB
-#define DEBUG_SUBSYSTEM S_OST /* XXX WRONG */
-
-#include <linux/module.h>
-#include <linux/obd_ost.h>
-#include <linux/lustre_net.h>
-#include <linux/lustre_dlm.h>
-
-int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp,
-                            struct obd_uuid *cluuid)
-{
-        if (exp->exp_connection) {
-                struct lustre_handle *hdl;
-                hdl = &exp->exp_ldlm_data.led_import.imp_handle;
-                /* Might be a re-connect after a partition. */
-                if (!memcmp(conn, hdl, sizeof *conn)) {
-                        CERROR("%s reconnecting\n", cluuid->uuid);
-                        conn->addr = (__u64) (unsigned long)exp;
-                        conn->cookie = exp->exp_cookie;
-                        RETURN(EALREADY);
-                } else {
-                        CERROR("%s reconnecting from %s, "
-                               "handle mismatch (ours "LPX64"/"LPX64", "
-                               "theirs "LPX64"/"LPX64")\n", cluuid->uuid,
-                               exp->exp_connection->c_remote_uuid.uuid,
-                               hdl->addr,
-                               hdl->cookie, conn->addr, conn->cookie);
-                        /* XXX disconnect them here? */
-                        memset(conn, 0, sizeof *conn);
-                        /* This is a little scary, but right now we build this
-                         * file separately into each server module, so I won't
-                         * go _immediately_ to hell.
-                         */
-                        RETURN(-EALREADY);
-                }
-        }
-
-        conn->addr = (__u64) (unsigned long)exp;
-        conn->cookie = exp->exp_cookie;
-        CDEBUG(D_INFO, "existing export for UUID '%s' at %p\n", cluuid->uuid, exp);
-        CDEBUG(D_IOCTL,"connect: addr %Lx cookie %Lx\n",
-               (long long)conn->addr, (long long)conn->cookie);
-        RETURN(0);
-}
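target_handle_reconnect above distinguishes a benign reconnect (the presented handle matches the stored one, signalled with a positive EALREADY) from a stale or mismatched handle (rejected with -EALREADY). A compact sketch of that comparison follows, using illustrative types rather than the real lustre_handle/obd_export.

#include <stdio.h>
#include <string.h>
#include <errno.h>

struct conn_handle {
        unsigned long long addr;
        unsigned long long cookie;
};

/* A matching handle is a benign reconnect (positive EALREADY); a
 * mismatch is rejected with -EALREADY, mirroring the logic above. */
static int check_reconnect(const struct conn_handle *presented,
                           const struct conn_handle *stored)
{
        if (memcmp(presented, stored, sizeof(*presented)) == 0)
                return EALREADY;
        return -EALREADY;
}

int main(void)
{
        struct conn_handle stored = { 0x1000, 0xdeadbeef };
        struct conn_handle same   = stored;
        struct conn_handle other  = { 0x2000, 0xcafe };

        printf("same handle:  %d\n", check_reconnect(&same, &stored));
        printf("other handle: %d\n", check_reconnect(&other, &stored));
        return 0;
}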
-
-
-int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
-{
-        struct obd_device *target;
-        struct obd_export *export = NULL;
-        struct obd_import *dlmimp;
-        struct lustre_handle conn;
-        struct obd_uuid tgtuuid;
-        struct obd_uuid cluuid;
-        struct list_head *p;
-        int rc, i;
-        ENTRY;
-
-        if (req->rq_reqmsg->buflens[0] > 37) {
-                CERROR("bad target UUID for connect\n");
-                GOTO(out, rc = -EINVAL);
-        }
-        obd_str2uuid(&tgtuuid, lustre_msg_buf(req->rq_reqmsg, 0));
-
-        if (req->rq_reqmsg->buflens[1] > 37) {
-                CERROR("bad client UUID for connect\n");
-                GOTO(out, rc = -EINVAL);
-        }
-        obd_str2uuid(&cluuid, lustre_msg_buf(req->rq_reqmsg, 1));
-
-        i = class_uuid2dev(&tgtuuid);
-        if (i == -1) {
-                CERROR("UUID '%s' not found for connect\n", tgtuuid.uuid);
-                GOTO(out, rc = -ENODEV);
-        }
-
-        target = &obd_dev[i];
-        if (!target)
-                GOTO(out, rc = -ENODEV);
-
-        spin_lock_bh(&target->obd_processing_task_lock);
-        if (target->obd_flags & OBD_ABORT_RECOVERY)
-                target_abort_recovery(target);
-        spin_unlock_bh(&target->obd_processing_task_lock);
-
-        conn.addr = req->rq_reqmsg->addr;
-        conn.cookie = req->rq_reqmsg->cookie;
-
-        rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg);
-        if (rc)
-                GOTO(out, rc);
-
-        /* lctl gets a backstage, all-access pass. */
-        if (!strcmp(cluuid.uuid, "OBD_CLASS_UUID"))
-                goto dont_check_exports;
-
-        spin_lock(&target->obd_dev_lock);
-        list_for_each(p, &target->obd_exports) {
-                export = list_entry(p, struct obd_export, exp_obd_chain);
-                if (!memcmp(&cluuid, &export->exp_client_uuid,
-                            sizeof(export->exp_client_uuid))) {
-                        spin_unlock(&target->obd_dev_lock);
-                        LASSERT(export->exp_obd == target);
-
-                        rc = target_handle_reconnect(&conn, export, &cluuid);
-                        break;
-                }
-                export = NULL;
-        }
-        /* If we found an export, we already unlocked. */
-        if (!export)
-                spin_unlock(&target->obd_dev_lock);
-
-        /* Tell the client if we're in recovery. */
-        /* If this is the first client, start the recovery timer */
-        if (target->obd_flags & OBD_RECOVERING) {
-                lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING);
-                target_start_recovery_timer(target, handler);
-        }
-
-        /* Tell the client if we support replayable requests */
-        if (target->obd_flags & OBD_REPLAYABLE)
-                lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_REPLAYABLE);
-
-        if (!export) {
-                if (target->obd_flags & OBD_RECOVERING) {
-                        CERROR("denying connection for new client %s: "
-                               "in recovery\n", cluuid.uuid);
-                        rc = -EBUSY;
-                } else {
- dont_check_exports:
-                        rc = obd_connect(&conn, target, &cluuid, ptlrpc_recovd,
-                                         target_revoke_connection);
-                }
-        }
-
-        /* If all else goes well, this is our RPC return code. */
-        req->rq_status = 0;
-
-        if (rc && rc != EALREADY)
-                GOTO(out, rc);
-
-        req->rq_repmsg->addr = conn.addr;
-        req->rq_repmsg->cookie = conn.cookie;
-
-        export = class_conn2export(&conn);
-        LASSERT(export);
-
-        req->rq_export = export;
-        export->exp_connection = ptlrpc_get_connection(&req->rq_peer, &cluuid);
-        if (req->rq_connection != NULL)
-                ptlrpc_put_connection(req->rq_connection);
-        req->rq_connection = ptlrpc_connection_addref(export->exp_connection);
-
-        if (rc == EALREADY) {
-                /* We indicate the reconnection in a flag, not an error code. */
-                lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT);
-                GOTO(out, rc = 0);
-        }
-
-        spin_lock(&export->exp_connection->c_lock);
-        list_add(&export->exp_conn_chain, &export->exp_connection->c_exports);
-        spin_unlock(&export->exp_connection->c_lock);
-        recovd_conn_manage(export->exp_connection, ptlrpc_recovd,
-                           target_revoke_connection);
-
-        dlmimp = &export->exp_ldlm_data.led_import;
-        dlmimp->imp_connection = req->rq_connection;
-        dlmimp->imp_client = &export->exp_obd->obd_ldlm_client;
-        dlmimp->imp_handle.addr = req->rq_reqmsg->addr;
-        dlmimp->imp_handle.cookie = req->rq_reqmsg->cookie;
-        dlmimp->imp_obd = target;
-        dlmimp->imp_recover = NULL;
-        INIT_LIST_HEAD(&dlmimp->imp_replay_list);
-        INIT_LIST_HEAD(&dlmimp->imp_sending_list);
-        INIT_LIST_HEAD(&dlmimp->imp_delayed_list);
-        spin_lock_init(&dlmimp->imp_lock);
-        dlmimp->imp_level = LUSTRE_CONN_FULL;
-out:
-        if (rc)
-                req->rq_status = rc;
-        RETURN(rc);
-}
-
-int target_handle_disconnect(struct ptlrpc_request *req)
-{
-        struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
-        int rc;
-        ENTRY;
-
-        rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg);
-        if (rc)
-                RETURN(rc);
-
-        req->rq_status = obd_disconnect(conn);
-        req->rq_export = NULL;
-        RETURN(0);
-}
-
-static int target_disconnect_client(struct ptlrpc_connection *conn)
-{
-        struct list_head *expiter, *n;
-        struct lustre_handle hdl;
-        struct obd_export *exp;
-        int rc;
-        ENTRY;
-
-        list_for_each_safe(expiter, n, &conn->c_exports) {
-                exp = list_entry(expiter, struct obd_export, exp_conn_chain);
-
-                CDEBUG(D_HA, "disconnecting export %p/%s\n",
-                       exp, exp->exp_client_uuid.uuid);
-                hdl.addr = (__u64)(unsigned long)exp;
-                hdl.cookie = exp->exp_cookie;
-                rc = obd_disconnect(&hdl);
-                if (rc)
-                        CERROR("disconnecting export %p failed: %d\n", exp, rc);
-        }
-
-        /* XXX spank the connection (it's frozen in _RECOVD for now!) */
-        RETURN(0);
-}
-
-static int target_fence_failed_connection(struct ptlrpc_connection *conn)
-{
-        ENTRY;
-
-        conn->c_recovd_data.rd_phase = RD_PREPARED;
-
-        RETURN(0);
-}
-
-int target_revoke_connection(struct recovd_data *rd, int phase)
-{
-        struct ptlrpc_connection *conn = class_rd2conn(rd);
-
-        LASSERT(conn);
-        ENTRY;
-
-        switch (phase) {
-            case PTLRPC_RECOVD_PHASE_PREPARE:
-                RETURN(target_fence_failed_connection(conn));
-            case PTLRPC_RECOVD_PHASE_RECOVER:
-                RETURN(target_disconnect_client(conn));
-            case PTLRPC_RECOVD_PHASE_FAILURE:
-                LBUG();
-                RETURN(0);
-        }
-
-        LBUG();
-        RETURN(-ENOSYS);
-}
-
-/*
- * Recovery functions 
- */
-
-static void abort_delayed_replies(struct obd_device *obd)
-{
-        struct ptlrpc_request *req;
-        struct list_head *tmp, *n;
-        list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) {
-                req = list_entry(tmp, struct ptlrpc_request, rq_list);
-                DEBUG_REQ(D_ERROR, req, "aborted:");
-                req->rq_status = -ENOTCONN;
-                req->rq_type = PTL_RPC_MSG_ERR;
-                ptlrpc_reply(req->rq_svc, req);
-                list_del(&req->rq_list);
-                OBD_FREE(req, sizeof *req);
-        }
-}
-
-void target_abort_recovery(void *data)
-{
-        struct obd_device *obd = data;
-        CERROR("disconnecting clients and aborting recovery\n");
-        obd->obd_recoverable_clients = 0;
-        obd->obd_flags &= ~(OBD_RECOVERING | OBD_ABORT_RECOVERY);
-        abort_delayed_replies(obd);
-        spin_unlock_bh(&obd->obd_processing_task_lock);
-        class_disconnect_all(obd);
-        spin_lock_bh(&obd->obd_processing_task_lock);
-}
-
-static void target_recovery_expired(unsigned long castmeharder)
-{
-        struct obd_device *obd = (struct obd_device *)castmeharder;
-        CERROR("recovery timed out, aborting\n");
-        spin_lock_bh(&obd->obd_processing_task_lock);
-        obd->obd_flags |= OBD_ABORT_RECOVERY;
-        wake_up(&obd->obd_next_transno_waitq);
-        spin_unlock_bh(&obd->obd_processing_task_lock);
-}
-
-static void reset_recovery_timer(struct obd_device *obd)
-{
-        CDEBUG(D_ERROR, "timer will expire in %ld seconds\n",
-               OBD_RECOVERY_TIMEOUT / HZ);
-        mod_timer(&obd->obd_recovery_timer, jiffies + OBD_RECOVERY_TIMEOUT);
-}
-
-
-/* Only start it the first time called */
-void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler)
-{
-        spin_lock_bh(&obd->obd_processing_task_lock);
-        if (obd->obd_recovery_handler) {
-                spin_unlock_bh(&obd->obd_processing_task_lock);
-                return;
-        }
-        CERROR("%s: starting recovery timer\n", obd->obd_name);
-        obd->obd_recovery_handler = handler;
-        obd->obd_recovery_timer.function = target_recovery_expired;
-        obd->obd_recovery_timer.data = (unsigned long)obd;
-        init_timer(&obd->obd_recovery_timer);
-        spin_unlock_bh(&obd->obd_processing_task_lock);
-
-        reset_recovery_timer(obd);
-}
-
-static void cancel_recovery_timer(struct obd_device *obd)
-{
-        del_timer(&obd->obd_recovery_timer);
-}
-
-static int check_for_next_transno(struct obd_device *obd)
-{
-        struct ptlrpc_request *req;
-        req = list_entry(obd->obd_recovery_queue.next,
-                         struct ptlrpc_request, rq_list);
-        LASSERT(req->rq_reqmsg->transno >= obd->obd_next_recovery_transno);
-
-        return req->rq_reqmsg->transno == obd->obd_next_recovery_transno ||
-                (obd->obd_flags & OBD_RECOVERING) == 0;
-}
-
-static void process_recovery_queue(struct obd_device *obd)
-{
-        struct ptlrpc_request *req;
-        int aborted = 0;
-        ENTRY;
-
-        for (;;) {
-                spin_lock_bh(&obd->obd_processing_task_lock);
-                LASSERT(obd->obd_processing_task == current->pid);
-                req = list_entry(obd->obd_recovery_queue.next,
-                                 struct ptlrpc_request, rq_list);
-
-                if (req->rq_reqmsg->transno != obd->obd_next_recovery_transno) {
-                        struct l_wait_info lwi = { 0 };
-                        spin_unlock_bh(&obd->obd_processing_task_lock);
-                        CDEBUG(D_HA, "Waiting for transno "LPD64" (1st is "
-                               LPD64")\n",
-                               obd->obd_next_recovery_transno,
-                               req->rq_reqmsg->transno);
-                        l_wait_event(obd->obd_next_transno_waitq,
-                                     check_for_next_transno(obd), &lwi);
-                        spin_lock_bh(&obd->obd_processing_task_lock);
-                        if (obd->obd_flags & OBD_ABORT_RECOVERY) {
-                                target_abort_recovery(obd);
-                                aborted = 1;
-                        }
-                        spin_unlock_bh(&obd->obd_processing_task_lock);
-                        if (aborted)
-                                return;
-                        continue;
-                }
-                list_del_init(&req->rq_list);
-                spin_unlock_bh(&obd->obd_processing_task_lock);
-
-                DEBUG_REQ(D_ERROR, req, "processing: ");
-                (void)obd->obd_recovery_handler(req);
-                reset_recovery_timer(obd);
-#warning FIXME: mds_fsync_super(mds->mds_sb);
-                OBD_FREE(req, sizeof *req);
-                spin_lock_bh(&obd->obd_processing_task_lock);
-                obd->obd_next_recovery_transno++;
-                if (list_empty(&obd->obd_recovery_queue)) {
-                        obd->obd_processing_task = 0;
-                        spin_unlock_bh(&obd->obd_processing_task_lock);
-                        break;
-                }
-                spin_unlock_bh(&obd->obd_processing_task_lock);
-        }
-        EXIT;
-}
-
-int target_queue_recovery_request(struct ptlrpc_request *req,
-                                  struct obd_device *obd)
-{
-        struct list_head *tmp;
-        int inserted = 0;
-        __u64 transno = req->rq_reqmsg->transno;
-        struct ptlrpc_request *saved_req;
-
-        if (!transno) {
-                INIT_LIST_HEAD(&req->rq_list);
-                DEBUG_REQ(D_HA, req, "not queueing");
-                return 1;
-        }
-
-        spin_lock_bh(&obd->obd_processing_task_lock);
-
-        if (obd->obd_processing_task == current->pid) {
-                /* Processing the queue right now, don't re-add. */
-                LASSERT(list_empty(&req->rq_list));
-                spin_unlock_bh(&obd->obd_processing_task_lock);
-                return 1;
-        }
-
-        OBD_ALLOC(saved_req, sizeof *saved_req);
-        if (!saved_req)
-                LBUG();
-        memcpy(saved_req, req, sizeof *req);
-        req = saved_req;
-        INIT_LIST_HEAD(&req->rq_list);
-
-        /* XXX O(n^2) */
-        list_for_each(tmp, &obd->obd_recovery_queue) {
-                struct ptlrpc_request *reqiter =
-                        list_entry(tmp, struct ptlrpc_request, rq_list);
-
-                if (reqiter->rq_reqmsg->transno > transno) {
-                        list_add_tail(&req->rq_list, &reqiter->rq_list);
-                        inserted = 1;
-                        break;
-                }
-        }
-
-        if (!inserted) {
-                list_add_tail(&req->rq_list, &obd->obd_recovery_queue);
-        }
-
-        if (obd->obd_processing_task != 0) {
-                /* Someone else is processing this queue, we'll leave it to
-                 * them.
-                 */
-                if (transno == obd->obd_next_recovery_transno)
-                        wake_up(&obd->obd_next_transno_waitq);
-                spin_unlock_bh(&obd->obd_processing_task_lock);
-                return 0;
-        }
-
-        /* Nobody is processing, and we know there's (at least) one to process
-         * now, so we'll do the honours.
-         */
-        obd->obd_processing_task = current->pid;
-        spin_unlock_bh(&obd->obd_processing_task_lock);
-
-        process_recovery_queue(obd);
-        return 0;
-}
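target_queue_recovery_request keeps the recovery queue ordered by transaction number with the linear scan flagged by the "XXX O(n^2)" comment: each request is inserted in front of the first queued request with a larger transno. A self-contained sketch of that ordered insert on a plain singly linked list (illustrative types, not ptlrpc_request):

#include <stdio.h>
#include <stdlib.h>

struct rec_req {
        unsigned long long transno;
        struct rec_req    *next;
};

/* Insert 'req' so the list stays sorted by ascending transno: walk past
 * every entry whose transno is <= the new one, then splice in. */
static void queue_insert(struct rec_req **head, struct rec_req *req)
{
        struct rec_req **p = head;

        while (*p != NULL && (*p)->transno <= req->transno)
                p = &(*p)->next;
        req->next = *p;
        *p = req;
}

int main(void)
{
        unsigned long long transnos[] = { 5, 2, 9, 7 };
        struct rec_req *head = NULL, *r;
        size_t i;

        for (i = 0; i < sizeof(transnos) / sizeof(transnos[0]); i++) {
                r = calloc(1, sizeof(*r));
                if (r == NULL)
                        return 1;
                r->transno = transnos[i];
                queue_insert(&head, r);
        }
        for (r = head; r != NULL; r = r->next)
                printf("%llu ", r->transno);
        printf("\n");                   /* prints: 2 5 7 9 */
        while (head != NULL) {
                r = head->next;
                free(head);
                head = r;
        }
        return 0;
}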
-
-struct obd_device * target_req2obd(struct ptlrpc_request *req)
-{
-        return req->rq_export->exp_obd;
-}
-
-int target_queue_final_reply(struct ptlrpc_request *req, int rc)
-{
-        struct obd_device *obd = target_req2obd(req);
-        struct ptlrpc_request *saved_req;
-
-        spin_lock_bh(&obd->obd_processing_task_lock);
-        if (rc) {
-                /* Just like ptlrpc_error, but without the sending. */
-                lustre_pack_msg(0, NULL, NULL, &req->rq_replen,
-                                &req->rq_repmsg);
-                req->rq_type = PTL_RPC_MSG_ERR;
-        }
-
-        LASSERT(list_empty(&req->rq_list));
-        OBD_ALLOC(saved_req, sizeof *saved_req);
-        memcpy(saved_req, req, sizeof *saved_req);
-        req = saved_req;
-        list_add(&req->rq_list, &obd->obd_delayed_reply_queue);
-        if (--obd->obd_recoverable_clients == 0) {
-                struct list_head *tmp, *n;
-                ldlm_reprocess_all_ns(req->rq_export->exp_obd->obd_namespace);
-                CDEBUG(D_ERROR,
-                       "all clients recovered, sending delayed replies\n");
-                obd->obd_flags &= ~OBD_RECOVERING;
-                list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) {
-                        req = list_entry(tmp, struct ptlrpc_request, rq_list);
-                        DEBUG_REQ(D_ERROR, req, "delayed:");
-                        ptlrpc_reply(req->rq_svc, req);
-                        list_del(&req->rq_list);
-                        OBD_FREE(req, sizeof *req);
-                }
-                cancel_recovery_timer(obd);
-        } else {
-                CERROR("%d recoverable clients remain\n",
-                       obd->obd_recoverable_clients);
-        }
-
-        spin_unlock_bh(&obd->obd_processing_task_lock);
-        return 1;
-}
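target_queue_final_reply parks each recovering client's final reply on a delayed queue and only sends the whole batch once obd_recoverable_clients drops to zero. Here is a minimal sketch of that countdown-and-flush pattern, with purely illustrative types and a fixed-size array standing in for the reply list.

#include <stdio.h>

#define MAX_DELAYED 8

struct recovery_state {
        int remaining;                   /* clients still expected back */
        int delayed[MAX_DELAYED];        /* stands in for queued replies */
        int ndelayed;
};

/* Park one final reply; once the last expected client has checked in,
 * flush the whole delayed batch at once. */
static void queue_final_reply(struct recovery_state *rs, int reply)
{
        int i;

        if (rs->ndelayed < MAX_DELAYED)
                rs->delayed[rs->ndelayed++] = reply;

        if (--rs->remaining > 0) {
                printf("%d recoverable clients remain\n", rs->remaining);
                return;
        }

        printf("all clients recovered, sending delayed replies\n");
        for (i = 0; i < rs->ndelayed; i++)
                printf("  sent reply %d\n", rs->delayed[i]);
        rs->ndelayed = 0;
}

int main(void)
{
        struct recovery_state rs = { 3, { 0 }, 0 };

        queue_final_reply(&rs, 101);
        queue_final_reply(&rs, 102);
        queue_final_reply(&rs, 103);
        return 0;
}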
index 665295e..6648aa8 100644 (file)
@@ -1,18 +1,30 @@
 # Administration utilities Makefile
 DEFS=
 
-CFLAGS:=-g -O2 -I$(top_srcdir)/utils -I$(PORTALS)/include  -I$(srcdir)/../include -Wall -L$(PORTALSLIB)
+CFLAGS:=-g -I$(top_srcdir)/utils -I$(top_srcdir)/portals/include  -I$(srcdir)/../include -I$(top_srcdir)/../libsysio/include -Wall -L../portals/utils
 
 KFLAGS:=
-CPPFLAGS = $(HAVE_EFENCE)
+CPPFLAGS = $(HAVE_EFENCE) -D_LARGEFILE64_SOURCE=1
 LIBS = $(LIBEFENCE)
-LLIBS= ../lov/liblov.a ../obdecho/libobdecho.a ../osc/libosc.a ../ldlm/libldlm.a  ../ptlrpc/libptlrpc.a ../obdclass/liblustreclass.a
+LLIBS= ./libllite.a ../lov/liblov.a ../obdecho/libobdecho.a ../osc/libosc.a ../ldlm/libldlm.a  ../ptlrpc/libptlrpc.a ../obdclass/liblustreclass.a ../mdc/libmdc.a
+
+lib_LIBRARIES = libllite.a
+libllite_a_SOURCES = llite_lib.c super.c file.c rw.c
+
+bin_PROGRAMS = libtest lltest
 
 libtest_LDADD := $(LIBREADLINE)  $(LLIBS) \
-                 $(PORTALS)/user/procbridge/libprocbridge.a  $(PORTALS)/user/tcpnal/libtcpnal.a \
-                $(PORTALS)/user/util/libtcpnalutil.a $(PORTALS)/api/libptlapi.a \
-                 $(PORTALS)/lib/libptllib.a -lptlctl -lpthread 
-bin_PROGRAMS = libtest
+                 ../portals/unals/libtcpnal.a \
+                 ../portals/portals/libportals.a\
+                 -lptlctl -lpthread 
 libtest_SOURCES = libtest.c
 
+lltest_LDADD := $(LIBREADLINE)  $(LLIBS) \
+                ../../libsysio/src/libsysio.a ../../libsysio/dev/stdfd/libsysio_stdfd.a \
+                -lc \
+                ../portals/unals/libtcpnal.a ../portals/portals/libportals.a \
+               -lptlctl -lpthread
+lltest_SOURCES = lltest.c
+
 include $(top_srcdir)/Rules
+
diff --git a/lustre/liblustre/file.c b/lustre/liblustre/file.c
new file mode 100644 (file)
index 0000000..8344af5
--- /dev/null
@@ -0,0 +1,553 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Lustre Light Super operations
+ *
+ *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <stdlib.h>
+#include <string.h>
+#include <error.h>
+#include <assert.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+
+#include <sysio.h>
+#include <fs.h>
+#include <mount.h>
+#include <inode.h>
+#include <file.h>
+
+#include "llite_lib.h"
+
+void llu_prepare_mdc_op_data(struct mdc_op_data *data,
+                             struct inode *i1,
+                             struct inode *i2,
+                             const char *name,
+                             int namelen,
+                             int mode)
+{
+        struct llu_inode_info *lli1, *lli2;
+
+        LASSERT(i1);
+
+        lli1 = llu_i2info(i1);
+        data->ino1 = lli1->lli_st_ino;
+        data->gen1 = lli1->lli_st_generation;
+        data->typ1 = lli1->lli_st_mode & S_IFMT;
+        data->gid1 = lli1->lli_st_gid;
+
+        if (i2) {
+                lli2 = llu_i2info(i2);
+                data->ino2 = lli2->lli_st_ino;
+                data->gen2 = lli2->lli_st_generation;
+                data->typ2 = lli2->lli_st_mode & S_IFMT;
+                data->gid2 = lli2->lli_st_gid;
+        } else
+                data->ino2 = 0;
+
+        data->name = name;
+        data->namelen = namelen;
+        data->mode = mode;
+}
+
+static struct inode *llu_create_node(struct inode *dir, const char *name,
+                                     int namelen, const void *data, int datalen,
+                                     int mode, __u64 extra,
+                                     struct lookup_intent *it)
+{
+        struct inode *inode;
+        struct ptlrpc_request *request = NULL;
+        struct mds_body *body;
+        time_t time = 123456;//time(NULL);
+        struct llu_sb_info *sbi = llu_i2sbi(dir);
+
+        if (it && it->it_disposition) {
+                LBUG();
+#if 0
+                ll_invalidate_inode_pages(dir);
+#endif
+                request = it->it_data;
+                body = lustre_msg_buf(request->rq_repmsg, 1, sizeof(*body));
+        } else {
+                struct mdc_op_data op_data;
+                struct llu_inode_info *lli_dir = llu_i2info(dir);
+                int gid = current->fsgid;
+                int rc;
+
+                if (lli_dir->lli_st_mode & S_ISGID) {
+                        gid = lli_dir->lli_st_gid;
+                        if (S_ISDIR(mode))
+                                mode |= S_ISGID;
+                }
+
+                llu_prepare_mdc_op_data(&op_data, dir, NULL, name, namelen, 0);
+                rc = mdc_create(&sbi->ll_mdc_conn, &op_data,
+                                data, datalen, mode, current->fsuid, gid,
+                                time, extra, &request);
+                if (rc) {
+                        inode = (struct inode*)rc;
+                        goto out;
+                }
+                body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body));
+        }
+
+        inode = llu_new_inode(dir->i_fs, body->ino, body->mode);
+        if (!inode) {
+                /* FIXME more cleanup needed? */
+                goto out;
+        }
+
+        llu_update_inode(inode, body, NULL);
+
+        if (it && it->it_disposition) {
+                /* We asked for a lock on the directory, but were
+                 * granted a lock on the inode.  Since we finally have
+                 * an inode pointer, stuff it in the lock. */
+#if 0
+                ll_mdc_lock_set_inode((struct lustre_handle *)it->it_lock_handle,
+                                      inode);
+#endif
+        }
+
+ out:
+        ptlrpc_req_finished(request);
+        return inode;
+}
+
+int llu_create(struct inode *dir, struct pnode_base *pnode, int mode)
+{
+        struct inode *inode;
+#if 0
+        int rc = 0;
+
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu,intent=%s\n",
+               dentry->d_name.name, dir->i_ino, LL_IT2STR(dentry->d_it));
+
+        it = dentry->d_it;
+
+        rc = ll_it_open_error(IT_OPEN_CREATE, it);
+        if (rc) {
+                LL_GET_INTENT(dentry, it);
+                ptlrpc_req_finished(it->it_data);
+                RETURN(rc);
+        }
+#endif
+        inode = llu_create_node(dir, pnode->pb_name.name, pnode->pb_name.len,
+                                NULL, 0, mode, 0, NULL);
+
+        if (IS_ERR(inode))
+                RETURN(PTR_ERR(inode));
+
+        pnode->pb_ino = inode;
+
+        return 0;
+}
+
+static int llu_create_obj(struct lustre_handle *conn, struct inode *inode,
+                          struct lov_stripe_md *lsm)
+{
+        struct ptlrpc_request *req = NULL;
+        struct llu_inode_info *lli = llu_i2info(inode);
+        struct lov_mds_md *lmm = NULL;
+        struct obdo *oa;
+        struct iattr iattr;
+        struct mdc_op_data op_data;
+        int rc, err, lmm_size = 0;
+        ENTRY;
+
+        oa = obdo_alloc();
+        if (!oa)
+                RETURN(-ENOMEM);
+
+        oa->o_mode = S_IFREG | 0600;
+        oa->o_id = lli->lli_st_ino;
+        /* Keep these 0 for now, because chown/chgrp does not change the
+         * ownership on the OST, and we don't want to allow BA OST NFS
+         * users to access these objects by mistake.
+         */
+        oa->o_uid = 0;
+        oa->o_gid = 0;
+        oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE |
+                OBD_MD_FLUID | OBD_MD_FLGID;
+
+        rc = obd_create(conn, oa, &lsm, NULL);
+        if (rc) {
+                CERROR("error creating objects for inode %lu: rc = %d\n",
+                       lli->lli_st_ino, rc);
+                if (rc > 0) {
+                        CERROR("obd_create returned invalid rc %d\n", rc);
+                        rc = -EIO;
+                }
+                GOTO(out_oa, rc);
+        }
+
+        LASSERT(lsm && lsm->lsm_object_id);
+        rc = obd_packmd(conn, &lmm, lsm);
+        if (rc < 0)
+                GOTO(out_destroy, rc);
+
+        lmm_size = rc;
+
+        /* Save the stripe MD with this file on the MDS */
+        memset(&iattr, 0, sizeof(iattr));
+        iattr.ia_valid = ATTR_FROM_OPEN;
+
+        llu_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
+
+        rc = mdc_setattr(&llu_i2sbi(inode)->ll_mdc_conn, &op_data,
+                         &iattr, lmm, lmm_size, &req);
+        ptlrpc_req_finished(req);
+
+        obd_free_diskmd(conn, &lmm);
+
+        /* If we couldn't complete mdc_open() and store the stripe MD on the
+         * MDS, we need to destroy the objects now or they will be leaked.
+         */
+        if (rc) {
+                CERROR("error: storing stripe MD for %lu: rc %d\n",
+                       lli->lli_st_ino, rc);
+                GOTO(out_destroy, rc);
+        }
+        lli->lli_smd = lsm;
+
+        EXIT;
+out_oa:
+        obdo_free(oa);
+        return rc;
+
+out_destroy:
+        obdo_from_inode(oa, inode, OBD_MD_FLTYPE);
+        oa->o_id = lsm->lsm_object_id;
+        oa->o_valid |= OBD_MD_FLID;
+        err = obd_destroy(conn, oa, lsm, NULL);
+        obd_free_memmd(conn, &lsm);
+        if (err) {
+                CERROR("error uncreating inode %lu objects: rc %d\n",
+                       lli->lli_st_ino, err);
+        }
+        goto out_oa;
+}
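llu_create_obj follows a create-then-record sequence: the data objects are created on the OST first, and the stripe metadata is then stored on the MDS; if that second step fails, the freshly created objects are destroyed so they are not leaked. Below is a self-contained sketch of that compensating-cleanup pattern; every function in it is an illustrative stand-in, not part of the obd/mdc API.

#include <stdio.h>

/* Illustrative stand-ins for "create data objects" and "record stripe
 * metadata"; record_metadata() is made to fail to show the cleanup path. */
static int  create_data_object(long *id)  { *id = 7; return 0; }
static int  record_metadata(long id)      { (void)id; return -1; }
static void destroy_data_object(long id)  { printf("destroyed object %ld\n", id); }

static int create_file_objects(void)
{
        long id;
        int rc;

        rc = create_data_object(&id);
        if (rc != 0)
                return rc;

        rc = record_metadata(id);
        if (rc != 0) {
                /* compensate: step two failed, so undo step one to
                 * avoid leaking the object on the data server */
                destroy_data_object(id);
                return rc;
        }
        return 0;
}

int main(void)
{
        printf("rc = %d\n", create_file_objects());
        return 0;
}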
+
+/* FIXME currently no "it" passed in */
+static int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it)
+{
+        struct ll_file_data *fd;
+#if 0
+        struct ptlrpc_request *req = it->it_data;
+        struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1);
+        ENTRY;
+#endif
+        LASSERT(!lli->lli_file_data);
+
+        fd = malloc(sizeof(struct ll_file_data));
+        /* We can't handle this well without reorganizing ll_file_open and
+         * ll_mdc_close, so don't even try right now. */
+        LASSERT(fd != NULL);
+
+        memset(fd, 0, sizeof(*fd));
+#if 0
+        memcpy(&fd->fd_mds_och.och_fh, &body->handle, sizeof(body->handle));
+        fd->fd_mds_och.och_req = it->it_data;
+#endif
+        lli->lli_file_data = fd;
+
+        RETURN(0);
+}
+
+static int llu_osc_open(struct lustre_handle *conn, struct inode *inode,
+                        struct lov_stripe_md *lsm)
+{
+        struct ll_file_data *fd = llu_i2info(inode)->lli_file_data;
+        struct obdo *oa;
+        int rc;
+        ENTRY;
+
+        oa = obdo_alloc();
+        if (!oa)
+                RETURN(-ENOMEM);
+        oa->o_id = lsm->lsm_object_id;
+        oa->o_mode = S_IFREG;
+        oa->o_valid = (OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLBLOCKS |
+                       OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+        rc = obd_open(conn, oa, lsm, NULL, &fd->fd_ost_och);
+        if (rc)
+                GOTO(out, rc);
+
+//        file->f_flags &= ~O_LOV_DELAY_CREATE;
+        obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS | OBD_MD_FLMTIME |
+                      OBD_MD_FLCTIME);
+
+        EXIT;
+out:
+        obdo_free(oa);
+        return rc;
+}
+
+static int llu_file_open(struct inode *inode)
+{
+#if 0
+        struct llu_sb_info *sbi = llu_i2sbi(inode);
+#endif
+        struct llu_inode_info *lli = llu_i2info(inode);
+        struct lustre_handle *conn = llu_i2obdconn(inode);
+        struct lookup_intent *it;
+        struct lov_stripe_md *lsm;
+        int rc = 0;
+
+#if 0
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", inode->i_ino);
+        LL_GET_INTENT(file->f_dentry, it);
+        rc = ll_it_open_error(IT_OPEN_OPEN, it);
+        if (rc)
+                RETURN(rc);
+#endif
+        rc = llu_local_open(lli, it);
+        if (rc)
+                LBUG();
+#if 0
+        mdc_set_open_replay_data(&((struct ll_file_data *)
+                                 file->private_data)->fd_mds_och);
+#endif
+        lsm = lli->lli_smd;
+        if (lsm == NULL) {
+#if 0
+                if (file->f_flags & O_LOV_DELAY_CREATE) {
+                        CDEBUG(D_INODE, "delaying object creation\n");
+                        RETURN(0);
+                }
+#endif
+                if (!lli->lli_smd) {
+                        rc = llu_create_obj(conn, inode, NULL);
+                        if (rc)
+                                GOTO(out_close, rc);
+                } else {
+                        CERROR("warning: stripe already set on ino %lu\n",
+                               lli->lli_st_ino);
+                }
+                lsm = lli->lli_smd;
+        }
+
+        rc = llu_osc_open(conn, inode, lsm);
+        if (rc)
+                GOTO(out_close, rc);
+        RETURN(0);
+
+ out_close:
+//        ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
+        return rc;
+}
+
+int llu_iop_open(struct pnode *pnode, int flags, mode_t mode)
+{
+        struct inode *dir = pnode->p_parent->p_base->pb_ino;
+        int rc;
+        /* FIXME later we must add the ldlm here */
+
+        LASSERT(dir);
+
+        /* libsysio forgot to guarantee mode is valid XXX */
+        mode |= S_IFREG;
+
+        if (!pnode->p_base->pb_ino) {
+                rc = llu_create(dir, pnode->p_base, mode);
+                if (rc)
+                        return rc;
+        }
+
+        LASSERT(pnode->p_base->pb_ino);
+        return llu_file_open(pnode->p_base->pb_ino);
+}
+
+
+static int llu_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode)
+{
+        struct llu_inode_info *lli = llu_i2info(inode);
+        struct ll_file_data *fd = lli->lli_file_data;
+        struct ptlrpc_request *req = NULL;
+        unsigned long flags;
+        struct obd_import *imp;
+        int rc = 0;     /* only assigned inside the #if 0 block below */
+
+        /* FIXME add following code later FIXME */
+#if 0
+        /* Complete the open request and remove it from replay list */
+        rc = mdc_close(&ll_i2sbi(inode)->ll_mdc_conn, lli->lli_st_ino,
+                       inode->i_mode, &fd->fd_mds_och.och_fh, &req);
+        if (rc)
+                CERROR("inode %lu close failed: rc = %d\n",
+                                lli->lli_st_ino, rc);
+
+        imp = fd->fd_mds_och.och_req->rq_import;
+        LASSERT(imp != NULL);
+        spin_lock_irqsave(&imp->imp_lock, flags);
+
+        DEBUG_REQ(D_HA, fd->fd_mds_och.och_req, "matched open req %p", 
+                 fd->fd_mds_och.och_req);
+
+        /* We held on to the request for replay until we saw a close for that
+         * file.  Now that we've closed it, it gets replayed on the basis of
+         * its transno only. */
+        spin_lock (&fd->fd_mds_och.och_req->rq_lock);
+        fd->fd_mds_och.och_req->rq_replay = 0;
+        spin_unlock (&fd->fd_mds_och.och_req->rq_lock);
+
+        if (fd->fd_mds_och.och_req->rq_transno) {
+                /* This open created a file, so it needs replay as a
+                 * normal transaction now.  Our reference to it is now
+                 * effectively owned by the imp_replay_list, and it'll
+                 * be committed just like other transno-having
+                 * requests from here on out. */
+
+                /* We now retain this close request, so that it is
+                 * replayed if the open is replayed.  We duplicate the
+                 * transno, so that we get freed at the right time,
+                 * and rely on the difference in xid to keep
+                 * everything ordered correctly.
+                 *
+                 * But! If this close was already given a transno
+                 * (because it caused real unlinking of an
+                 * open-unlinked file, for example), then we'll be ordered on
+                 * the basis of that and we don't need to do anything
+                 * magical here. */
+                if (!req->rq_transno) {
+                        req->rq_transno = fd->fd_mds_och.och_req->rq_transno;
+                        ptlrpc_retain_replayable_request(req, imp);
+                }
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+                /* Should we free_committed now? we always free before
+                 * replay, so it's probably a wash.  We could check to
+                 * see if the fd_req should already be committed, in
+                 * which case we can avoid the whole retain_replayable
+                 * dance. */
+        } else {
+                /* No transno means that we can just drop our ref. */
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
+        }
+        ptlrpc_req_finished(fd->fd_mds_och.och_req);
+
+        /* Do this after the fd_req->rq_transno check, because we don't want
+         * to bounce off zero references. */
+        ptlrpc_req_finished(req);
+        fd->fd_mds_och.och_fh.cookie = DEAD_HANDLE_MAGIC;
+#endif
+        lli->lli_file_data = NULL;
+        free(fd);
+
+        RETURN(-abs(rc));
+}
+
+static int llu_file_release(struct inode *inode)
+{
+        struct llu_sb_info *sbi = llu_i2sbi(inode);
+        struct llu_inode_info *lli = llu_i2info(inode);
+        struct lov_stripe_md *lsm = lli->lli_smd;
+        struct ll_file_data *fd;
+        struct obdo oa;
+        int rc = 0, rc2;
+
+        fd = lli->lli_file_data;
+        if (!fd) /* no process opened the file after an mcreate */
+                RETURN(rc = 0);
+
+        /* we might not be able to get a valid handle on this file
+         * again, so we really want to flush our write cache. */
+        if (S_ISREG(inode->i_mode) && lsm) {
+                memset(&oa, 0, sizeof(oa));
+                oa.o_id = lsm->lsm_object_id;
+                oa.o_mode = S_IFREG;
+                oa.o_valid = OBD_MD_FLTYPE | OBD_MD_FLID;
+                
+                memcpy(&oa.o_inline, &fd->fd_ost_och, FD_OSTDATA_SIZE);
+                oa.o_valid |= OBD_MD_FLHANDLE;
+
+                rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL);
+                if (rc)
+                        CERROR("inode %lu object close failed: rc = "
+                               "%d\n", lli->lli_st_ino, rc);
+       }
+
+        rc2 = llu_mdc_close(&sbi->ll_mdc_conn, inode);
+        if (rc2 && !rc)
+                rc = rc2;
+
+        RETURN(rc);
+}
+
+int llu_iop_close(struct inode *inode)
+{
+        return llu_file_release(inode);
+}
+
+int llu_iop_ipreadv(struct inode *ino,
+                    struct io_arguments *ioargs,
+                    struct ioctx **ioctxp)
+{
+        struct ioctx *ioctx;
+
+        if (!ioargs->ioarg_iovlen)
+                return 0;
+        if (ioargs->ioarg_iovlen < 0)
+                return -EINVAL;
+
+        ioctx = _sysio_ioctx_new(ino, ioargs);
+        if (!ioctx)
+                return -ENOMEM;
+
+        ioctx->ioctx_cc = llu_file_read(ino,
+                                        ioctx->ioctx_iovec,
+                                        ioctx->ioctx_iovlen,
+                                        ioctx->ioctx_offset);
+        if (ioctx->ioctx_cc < 0)
+                ioctx->ioctx_errno = ioctx->ioctx_cc;
+
+        *ioctxp = ioctx;
+        return 0;
+}
+
+int llu_iop_ipwritev(struct inode *ino,
+                     struct io_arguments *ioargs,
+                     struct ioctx **ioctxp)
+{
+        struct ioctx *ioctx;
+
+        if (!ioargs->ioarg_iovlen)
+                return 0;
+        if (ioargs->ioarg_iovlen < 0)
+                return -EINVAL;
+
+        ioctx = _sysio_ioctx_new(ino, ioargs);
+        if (!ioctx)
+                return -ENOMEM;
+
+        ioctx->ioctx_cc = llu_file_write(ino,
+                                         ioctx->ioctx_iovec,
+                                         ioctx->ioctx_iovlen,
+                                         ioctx->ioctx_offset);
+        if (ioctx->ioctx_cc < 0)
+                ioctx->ioctx_errno = ioctx->ioctx_cc;
+
+        *ioctxp = ioctx;
+        return 0;
+}
+
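The two handlers above follow libsysio's asynchronous I/O contract in a purely synchronous way: the ioctx they hand back is already complete when they return. A minimal caller sketch, using only what this patch defines; the wrapper name example_readv is made up:

static ssize_t example_readv(struct inode *ino, struct io_arguments *ioargs)
{
        struct ioctx *ioctx = NULL;
        int rc;

        rc = llu_iop_ipreadv(ino, ioargs, &ioctx);
        if (rc || ioctx == NULL)
                return rc;              /* zero-length request, -EINVAL or -ENOMEM */

        /* llu_iop_iodone() (in rw.c below) always reports completion,
         * because the read already happened inside llu_iop_ipreadv(). */
        while (!llu_iop_iodone(ioctx))
                ;

        return ioctx->ioctx_cc;         /* byte count, or the negative errno */
}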
index c344198..1d523a6 100644 (file)
 #include <linux/obd_class.h>
 #include <portals/procbridge.h>
 
+struct ldlm_namespace;
+struct ldlm_res_id;
+struct obd_import;
+
+extern int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, int flags);
+extern int ldlm_namespace_cleanup(struct ldlm_namespace *ns, int local_only);
+extern int ldlm_replay_locks(struct obd_import *imp);
+
+void *inter_module_get(char *arg)
+{
+        if (!strcmp(arg, "tcpnal_ni"))
+                return &tcpnal_ni;
+        else if (!strcmp(arg, "ldlm_cli_cancel_unused"))
+                return ldlm_cli_cancel_unused;
+        else if (!strcmp(arg, "ldlm_namespace_cleanup"))
+                return ldlm_namespace_cleanup;
+        else if (!strcmp(arg, "ldlm_replay_locks"))
+                return ldlm_replay_locks;
+        else
+                return NULL;
+}
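inter_module_get() here stands in for the kernel's inter-module symbol table: a fixed set of names is resolved to functions that were linked in directly. A hedged sketch of a caller; the cast and the surrounding variables (rc, imp) are illustrative only:

{
        int (*replay)(struct obd_import *);

        replay = (int (*)(struct obd_import *))inter_module_get("ldlm_replay_locks");
        if (replay != NULL)
                rc = replay(imp);       /* same call kernel code would make */
        else
                rc = -ENOENT;           /* name not in the table above */
}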
+
 ptl_handle_ni_t         tcpnal_ni;
 
 struct pingcli_args {
@@ -27,7 +49,7 @@ struct task_struct *current;
 struct obd_class_user_state ocus;
 
 /* portals interfaces */
-inline const ptl_handle_ni_t *
+ptl_handle_ni_t *
 kportal_get_ni (int nal)
 {
         return &tcpnal_ni;
@@ -101,10 +123,10 @@ int main(int argc, char **argv)
         init_lib_portals(args);
         ptlrpc_init();
         ldlm_init();
+        mdc_init();
+        lov_init();
         osc_init();
         echo_client_init();
-        /* XXX  need mdc_getlovinfo before lov_init can work.. */
-        //        lov_init();
 
        parse_dump("/tmp/DUMP_FILE", lib_ioctl);
 
diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c
new file mode 100644 (file)
index 0000000..b11de88
--- /dev/null
@@ -0,0 +1,226 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Lustre Light Super operations
+ *
+ *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <stdlib.h>
+#include <string.h>
+#include <error.h>
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+
+#include <sysio.h>
+#include <fs.h>
+#include <mount.h>
+#include <inode.h>
+#include <file.h>
+
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+
+#include <portals/api-support.h> /* needed for ptlctl.h */
+#include <portals/ptlctl.h>    /* needed for parse_dump */
+
+#include "llite_lib.h"
+
+
+ptl_handle_ni_t         tcpnal_ni;
+struct task_struct *current;
+struct obd_class_user_state ocus;
+
+/* portals interfaces */
+ptl_handle_ni_t *
+kportal_get_ni (int nal)
+{
+        return &tcpnal_ni;
+}
+
+inline void
+kportal_put_ni (int nal)
+{
+        return;
+}
+
+struct ldlm_namespace;
+struct ldlm_res_id;
+struct obd_import;
+
+extern int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, int flags);
+extern int ldlm_namespace_cleanup(struct ldlm_namespace *ns, int local_only);
+extern int ldlm_replay_locks(struct obd_import *imp);
+
+void *inter_module_get(char *arg)
+{
+        if (!strcmp(arg, "tcpnal_ni"))
+                return &tcpnal_ni;
+        else if (!strcmp(arg, "ldlm_cli_cancel_unused"))
+                return ldlm_cli_cancel_unused;
+        else if (!strcmp(arg, "ldlm_namespace_cleanup"))
+                return ldlm_namespace_cleanup;
+        else if (!strcmp(arg, "ldlm_replay_locks"))
+                return ldlm_replay_locks;
+        else
+                return NULL;
+}
+
+void init_current(char *comm)
+{ 
+        current = malloc(sizeof(*current));
+        current->fs = malloc(sizeof(*current->fs));
+        current->fs->umask = umask(0777);
+        umask(current->fs->umask);
+        strncpy(current->comm, comm, sizeof(current->comm));
+        current->pid = getpid();
+        current->fsuid = 0;
+        current->fsgid = 0;
+        current->cap_effective = 0;
+        memset(&current->pending, 0, sizeof(current->pending));
+}
+
+ptl_nid_t tcpnal_mynid;
+
+int init_lib_portals()
+{
+        int rc;
+
+        PtlInit();
+        rc = PtlNIInit(procbridge_interface, 0, 0, 0, &tcpnal_ni);
+        if (rc != 0) {
+                CERROR("ksocknal: PtlNIInit failed: error %d\n", rc);
+                PtlFini();
+                RETURN (rc);
+        }
+        PtlNIDebug(tcpnal_ni, ~0);
+        return rc;
+}
+
+extern int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, unsigned long arg);
+
+struct mount_option_s mount_option = {NULL, NULL};
+
+/* FIXME simple arg parser FIXME */
+void parse_mount_options(void *arg)
+{
+        char *buf = NULL;
+        struct obd_ioctl_data *data;
+        char *ptr, *comma, *eq, **tgt, *v;
+        int len;
+
+        if (obd_ioctl_getdata(&buf, &len, arg)) {
+                CERROR("OBD ioctl: data error\n");
+                return;
+        }
+        data = (struct obd_ioctl_data *)buf;
+        ptr = data->ioc_inlbuf1;
+        printf("mount option: %s\n", ptr);
+
+        while (ptr) {
+                eq = strchr(ptr, '=');
+                if (!eq)
+                        return;
+
+                *eq = 0;
+                if (!strcmp("osc", ptr))
+                        tgt = &mount_option.osc_uuid;
+                else if (!strcmp("mdc", ptr))
+                        tgt = &mount_option.mdc_uuid;
+                else {
+                        printf("Unknown mount option %s\n", ptr);
+                        return;
+                }
+
+                v = eq + 1;
+                comma = strchr(v, ',');
+                if (comma) {
+                        *comma = 0;
+                        ptr = comma + 1;
+                } else
+                        ptr = NULL;
+
+                *tgt = malloc(strlen(v)+1);
+                strcpy(*tgt, v);
+        }
+
+        if (buf)
+                obd_ioctl_freedata(buf, len);
+}
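The option string arriving in ioc_inlbuf1 is a comma-separated list of name=value pairs; a worked example follows, with made-up UUID values:

/* Example input:
 *     ioc_inlbuf1 = "mdc=MDC_client1_UUID,osc=OSC_client1_UUID"
 * After the loop:
 *     mount_option.mdc_uuid = "MDC_client1_UUID"
 *     mount_option.osc_uuid = "OSC_client1_UUID"
 * A pair without '=' or with an unknown name stops the parse early. */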
+
+int lib_ioctl(int dev_id, int opc, void * ptr)
+{
+        int rc;
+
+       if (dev_id == OBD_DEV_ID) {
+                struct obd_ioctl_data *ioc = ptr;
+
+                if (opc == OBD_IOC_MOUNTOPT) {
+                        parse_mount_options(ptr);
+                        return 0;
+                }
+
+               rc = class_handle_ioctl(&ocus, opc, (unsigned long)ptr);
+
+               /* you _may_ need to call obd_ioctl_unpack or some
+                  other verification function if you want to use ioc
+                  directly here */
+               printf ("processing ioctl cmd: %x buf len: %d, rc %d\n", 
+                       opc,  ioc->ioc_len, rc);
+
+                if (rc)
+                        return rc;
+       }
+       return (0);
+}
+
+int lllib_init(char *arg)
+{
+       tcpnal_mynid = ntohl(inet_addr(arg));
+        INIT_LIST_HEAD(&ocus.ocus_conns);
+
+        init_current("dummy");
+        if (init_obdclass() ||
+            init_lib_portals() ||
+            ptlrpc_init() ||
+            ldlm_init() ||
+            mdc_init() ||
+            lov_init() ||
+            osc_init())
+                return -1;
+
+       if (parse_dump("/tmp/DUMP_FILE", lib_ioctl))
+                return -1;
+
+        return _sysio_fssw_register("llite", &llu_fssw_ops);
+}
+
+/* FIXME */
+void generate_random_uuid(unsigned char uuid_out[16])
+{
+        int *arr = (int*)uuid_out;
+        int i;
+
+        /* uuid_out has decayed to a pointer here, so sizeof(uuid_out)
+         * would be wrong; use the known 16-byte UUID size instead */
+        for (i = 0; i < 16 / sizeof(int); i++)
+                arr[i] = rand();
+}
+
diff --git a/lustre/liblustre/llite_lib.h b/lustre/liblustre/llite_lib.h
new file mode 100644 (file)
index 0000000..ce2e23b
--- /dev/null
@@ -0,0 +1,135 @@
+#ifndef __LLU_H_
+#define __LLU_H_
+
+#include <liblustre.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <portals/procbridge.h>
+#include <linux/lustre_lite.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+struct ll_file_data {
+        struct obd_client_handle fd_mds_och;
+        struct obd_client_handle fd_ost_och;
+        __u32 fd_flags;
+};
+
+struct llu_sb_info
+{
+        struct obd_uuid         ll_sb_uuid;
+        struct lustre_handle    ll_mdc_conn;
+        struct lustre_handle    ll_osc_conn;
+        obd_id                  ll_rootino;
+        int                     ll_flags;
+        struct list_head        ll_conn_chain;
+};
+
+struct llu_inode_info {
+       struct llu_sb_info      *lli_sbi;
+       struct ll_fid           lli_fid;
+        struct lov_stripe_md   *lli_smd;
+        char                   *lli_symlink_name;
+        /*struct semaphore      lli_open_sem;*/
+        unsigned long          lli_flags;
+        struct list_head       lli_read_extents;
+
+       /* in libsysio we have no way to attach private data to the file
+        * object, so store it here */
+       struct ll_file_data     *lli_file_data;
+
+       /* stat FIXME not 64 bit clean */
+       dev_t                   lli_st_dev;
+       ino_t                   lli_st_ino;
+       mode_t                  lli_st_mode;
+       nlink_t                 lli_st_nlink;
+       uid_t                   lli_st_uid;
+       gid_t                   lli_st_gid;
+       dev_t                   lli_st_rdev;
+       loff_t                  lli_st_size;
+       unsigned int            lli_st_blksize;
+       unsigned int            lli_st_blocks;
+       time_t                  lli_st_atime;
+       time_t                  lli_st_mtime;
+       time_t                  lli_st_ctime;
+
+       /* not for stat, change it later */
+       int                     lli_st_flags;
+       unsigned long           lli_st_generation;
+};
+
+static inline struct llu_sb_info *llu_fs2sbi(struct filesys *fs)
+{
+       return (struct llu_sb_info*)(fs->fs_private);
+}
+
+static inline struct llu_inode_info *llu_i2info(struct inode *inode)
+{
+       return (struct llu_inode_info*)(inode->i_private);
+}
+
+static inline struct llu_sb_info *llu_i2sbi(struct inode *inode)
+{
+        return llu_i2info(inode)->lli_sbi;
+}
+
+static inline struct client_obd *sbi2mdc(struct llu_sb_info *sbi)
+{
+       struct obd_device *obd = class_conn2obd(&sbi->ll_mdc_conn);
+       if (obd == NULL)
+               LBUG();
+       return &obd->u.cli;
+}
+
+static inline struct lustre_handle *llu_i2obdconn(struct inode *inode)
+{
+        return &(llu_i2info(inode)->lli_sbi->ll_osc_conn);
+}
+
+
+struct mount_option_s
+{
+       char *mdc_uuid;
+       char *osc_uuid;
+};
+
+/* llite_lib.c */
+void generate_random_uuid(unsigned char uuid_out[16]);
+
+extern struct mount_option_s mount_option;
+
+/* super.c */
+void llu_update_inode(struct inode *inode, struct mds_body *body,
+                      struct lov_stripe_md *lmm);
+void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid);
+void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid);
+struct inode* llu_new_inode(struct filesys *fs, ino_t ino, mode_t mode);
+
+extern struct fssw_ops llu_fssw_ops;
+
+/* file.c */
+void llu_prepare_mdc_op_data(struct mdc_op_data *data,
+                             struct inode *i1,
+                             struct inode *i2,
+                             const char *name,
+                             int namelen,
+                             int mode);
+int llu_create(struct inode *dir, struct pnode_base *pnode, int mode);
+int llu_iop_open(struct pnode *pnode, int flags, mode_t mode);
+int llu_iop_close(struct inode *inode);
+int llu_iop_ipreadv(struct inode *ino,
+                    struct io_arguments *ioargs,
+                    struct ioctx **ioctxp);
+int llu_iop_ipwritev(struct inode *ino,
+                     struct io_arguments *ioargs,
+                     struct ioctx **ioctxp);
+
+/* rw.c */
+int llu_iop_iodone(struct ioctx *ioctxp __IS_UNUSED);
+ssize_t llu_file_write(struct inode *inode, const struct iovec *iovec,
+                      size_t iovlen, loff_t pos);
+ssize_t llu_file_read(struct inode *inode, const struct iovec *iovec,
+                       size_t iovlen, loff_t pos);
+
+#endif
diff --git a/lustre/liblustre/lltest.c b/lustre/liblustre/lltest.c
new file mode 100644 (file)
index 0000000..acdc47e
--- /dev/null
@@ -0,0 +1,159 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Lustre Light user test program
+ *
+ *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define _BSD_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/queue.h>
+#include <sys/statvfs.h>
+
+#include <sysio.h>
+#include <mount.h>
+
+
+int do_stat(const char *name)
+{
+       struct stat stat;
+
+       if (lstat(name, &stat)) {
+               perror("failed to stat: ");
+               return -1;
+       }
+       printf("******* stat '%s' ********\n", name);
+       printf("ino:\t\t%lu\n",stat.st_ino);
+       printf("mode:\t\t%o\n",stat.st_mode);
+       printf("nlink:\t\t%d\n",stat.st_nlink);
+        printf("uid/gid:\t%d/%d\n", stat.st_uid, stat.st_gid);
+        printf("size:\t\t%ld\n", stat.st_size);
+        printf("blksize:\t%ld\n", stat.st_blksize);
+        printf("block count:\t%ld\n", stat.st_blocks);
+       printf("atime:\t\t%lu\n",stat.st_atime);
+       printf("mtime:\t\t%lu\n",stat.st_mtime);
+       printf("ctime:\t\t%lu\n",stat.st_ctime);
+       printf("******* end stat ********\n");
+
+       return 0;
+}
+/*
+ * Get stats of file and file system.
+ *
+ * Usage: test_stats [-a] [-r <root-path>] [-m <root-driver>] [<path> ...]
+ */
+
+extern int lllib_init(char *arg);
+
+char   *root_driver = "llite";
+char   *root_path = "/";
+unsigned mntflgs = 0;
+struct mount root_mount;
+
+extern int portal_debug;
+extern int portal_subsystem_debug;
+
+char* files[] = {"/dir1", "/dir1/file1", "/dir1/file2", "/dir1/dir2", "/dir1/dir2/file3"};
+
+int
+main(int argc, char * const argv[])
+{
+       struct stat statbuf;
+       int rc, err, i, fd, written, readed;
+       char pgbuf[4096], readbuf[4096];
+       int npages;
+
+       if (_sysio_init() != 0) {
+               perror("init sysio");
+               exit(1);
+       }
+       err = lllib_init(argv[1]);
+       if (err) {
+               perror("init llite driver");
+               exit(1);
+       }       
+
+       err = _sysio_mount_root(root_path, root_driver, mntflgs, NULL);
+       if (err) {
+               errno = -err;
+               perror(root_driver);
+               exit(1);
+       }
+#if 0
+       for (i=0; i< sizeof(files)/sizeof(char*); i++) {
+               printf("******** stat %s *********\n", files[i]);
+               /* XXX ugly, only for testing */
+               err = fixme_lstat(files[i], &statbuf);
+               if (err)
+                       perror(root_driver);
+               printf("******** end stat %s: %d*********\n", files[i], err);
+       }
+#endif
+#if 0
+       portal_debug = 0;
+       portal_subsystem_debug = 0;
+       npages = 10;
+
+       fd = open("/newfile01", O_RDWR|O_CREAT|O_TRUNC, 00664);
+       printf("***************** open return %d ****************\n", fd);
+
+       printf("***************** begin write pages ****************\n");
+       for (i = 0; i < npages; i++ ) {
+               memset(pgbuf, ('A'+ i%10), 4096);
+               written = write(fd, pgbuf, 4096);
+               printf(">>> page %d: %d bytes written\n", i, written);
+       }
+
+       printf("***************** begin read pages ****************\n");
+       lseek(fd, 0, SEEK_SET);
+
+       for (i = 0; i < npages; i++ ) {
+               memset(readbuf, '8', 4096);
+               readed = read(fd, readbuf, 4096);
+               readbuf[10] = 0;
+               printf("<<< page %d: %d bytes (%s)\n", i, readed, readbuf);
+       }
+        close(fd);
+#endif
+
+#if 1
+        //rc = chown("/newfile01", 10, 20);
+        rc = chmod("/newfile01", 0777);
+        printf("-------------- chmod return %d -----------\n", rc);
+        do_stat("/newfile01");
+#endif
+
+       printf("sysio is about to shut down\n");
+       /*
+        * Clean up.
+        */
+       _sysio_shutdown();
+
+       printf("completed successfully\n");
+       return 0;
+}
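A usage sketch for the test program above, with a made-up address; the argument becomes the local TCP NAL NID in lllib_init(), and a configuration dump is expected at /tmp/DUMP_FILE before "llite" is mounted at "/":

/*
 *     ./lltest 192.168.1.10
 *
 * Most of the stat/write/read exercises in main() are compiled out with
 * #if 0; only the chmod + do_stat("/newfile01") path is active as committed.
 */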
diff --git a/lustre/liblustre/rw.c b/lustre/liblustre/rw.c
new file mode 100644 (file)
index 0000000..847b1d0
--- /dev/null
@@ -0,0 +1,519 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Lustre Light Super operations
+ *
+ *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <stdlib.h>
+#include <string.h>
+#include <error.h>
+#include <assert.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+
+#include <sysio.h>
+#include <fs.h>
+#include <mount.h>
+#include <inode.h>
+#include <file.h>
+
+#include "llite_lib.h"
+
+int llu_iop_iodone(struct ioctx *ioctxp __IS_UNUSED)
+{
+        return 1;
+}
+
+/*
+ * this grabs a lock and manually implements behaviour that makes it look
+ * like the OST is returning the file size with each lock acquisition
+ */
+int llu_extent_lock(struct ll_file_data *fd, struct inode *inode,
+                   struct lov_stripe_md *lsm,
+                   int mode, struct ldlm_extent *extent,
+                   struct lustre_handle *lockh)
+{
+#if 0
+        struct ll_inode_info *lli = ll_i2info(inode);
+        int rc;
+        ENTRY;
+
+        rc = ll_extent_lock_no_validate(fd, inode, lsm, mode, extent, lockh);
+        if (rc != ELDLM_OK)
+                RETURN(rc);
+
+        /* Always do a getattr for the first caller to come out of lock
+         * acquisition; the DID_GETATTR flag and semaphore serialize this
+         * initial race.  We used to make a decision based on whether the
+         * lock was matched or acquired, but the matcher could win the
+         * waking race with the first issuer, so that was no good.
+         */
+        if (test_bit(LLI_F_DID_GETATTR, &lli->lli_flags))
+                RETURN(ELDLM_OK);
+
+        down(&lli->lli_getattr_sem);
+
+        if (!test_bit(LLI_F_DID_GETATTR, &lli->lli_flags)) {
+                rc = ll_inode_getattr(inode, lsm, fd ? &fd->fd_ost_och : NULL);
+                if (rc == 0) {
+                        set_bit(LLI_F_DID_GETATTR, &lli->lli_flags);
+                } else {
+                        /* XXX can this fail? */
+                        ll_extent_unlock(fd, inode, lsm, mode, lockh);
+                }
+        }
+
+        up(&lli->lli_getattr_sem);
+        RETURN(rc);
+#else
+        return ELDLM_OK;
+#endif
+}
+
+int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
+                struct lov_stripe_md *lsm, int mode,
+                struct lustre_handle *lockh)
+{
+#if 0
+        struct ll_sb_info *sbi = ll_i2sbi(inode);
+        int rc;
+        ENTRY;
+
+        /* XXX phil: can we do this?  won't it screw the file size up? */
+        if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
+            (sbi->ll_flags & LL_SBI_NOLCK))
+                RETURN(0);
+
+        rc = obd_cancel(&sbi->ll_osc_conn, lsm, mode, lockh);
+
+        RETURN(rc);
+#else
+        return 0;
+#endif
+}
+
+static int llu_brw(int cmd, struct inode *inode, struct page *page, int flags)
+{
+        struct llu_inode_info *lli = llu_i2info(inode);
+        struct lov_stripe_md *lsm = lli->lli_smd;
+        struct brw_page pg;
+        int rc;
+        ENTRY;
+
+        pg.pg = page;
+        pg.off = ((obd_off)page->index) << PAGE_SHIFT;
+
+        /* FIXME FIXME FIXME FIXME FIXME FIXME FIXME FIXME FIXME */
+#if 0
+        if (cmd == OBD_BRW_WRITE && (pg.off + PAGE_SIZE > lli->lli_st_size))
+                pg.count = lli->lli_st_size % PAGE_SIZE;
+        else
+#endif
+                pg.count = PAGE_SIZE;
+
+        CDEBUG(D_PAGE, "%s %d bytes ino %lu at "LPU64"/"LPX64"\n",
+               cmd & OBD_BRW_WRITE ? "write" : "read", pg.count, lli->lli_st_ino,
+               pg.off, pg.off);
+        if (pg.count == 0) {
+                LBUG();
+        }
+
+        pg.flag = flags;
+
+        rc = obd_brw(cmd, llu_i2obdconn(inode), lsm, 1, &pg, set, NULL);
+        if (rc) {
+                CERROR("error from obd_brw: rc = %d\n", rc);
+        }
+
+        RETURN(rc);
+}
+
+static int llu_prepare_write(struct inode *inode, struct page *page,
+                             unsigned from, unsigned to)
+{
+        struct llu_inode_info *lli = llu_i2info(inode);
+        obd_off offset = ((obd_off)page->index) << PAGE_SHIFT;
+        int rc = 0;
+        ENTRY;
+
+#if 0
+        if (!PageLocked(page))
+                LBUG();
+
+        if (PageUptodate(page))
+                RETURN(0);
+
+        //POISON(addr + from, 0xca, to - from);
+#endif
+        /* We're completely overwriting an existing page, so _don't_ set it up
+         * to date until commit_write */
+        if (from == 0 && to == PAGE_SIZE)
+                RETURN(0);
+
+        /* If we are writing to a new page, there is no need to read old data.
+         * The extent locking and getattr procedures in ll_file_write have
+         * guaranteed that i_size is stable enough for our zeroing needs */
+        if (lli->lli_st_size <= offset) {
+                memset(kmap(page), 0, PAGE_SIZE);
+                kunmap(page);
+                GOTO(prepare_done, rc = 0);
+        }
+
+        rc = llu_brw(OBD_BRW_READ, inode, page, 0);
+
+        EXIT;
+
+ prepare_done:
+        return rc;
+}
+
+static int llu_commit_write(struct inode *inode, struct page *page,
+                            unsigned from, unsigned to)
+{
+        struct llu_inode_info *lli = llu_i2info(inode);
+        loff_t size;
+        int rc;
+        ENTRY;
+#if 0
+        LASSERT(inode == file->f_dentry->d_inode);
+        LASSERT(PageLocked(page));
+
+        CDEBUG(D_INODE, "inode %p is writing page %p from %d to %d at %lu\n",
+               inode, page, from, to, page->index);
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu,from=%d,to=%d\n",
+               inode->i_ino, from, to);
+        /* to match full page case in prepare_write */
+        SetPageUptodate(page);
+        /* mark the page dirty, put it on mapping->dirty,
+         * mark the inode PAGES_DIRTY, put it on sb->dirty */
+        set_page_dirty(page);
+#endif
+        rc = llu_brw(OBD_BRW_WRITE, inode, page, 0);
+        if (rc)
+                return rc;
+
+        /* this is matched by a hack in obdo_to_inode at the moment */
+        size = (((obd_off)page->index) << PAGE_SHIFT) + to;
+        if (size > lli->lli_st_size)
+                lli->lli_st_size = size;
+
+        RETURN(0);
+} /* llu_commit_write */
+
+ssize_t
+llu_generic_file_write(struct inode *inode, const char *buf,
+                       size_t count, loff_t pos)
+{
+       struct page     *page;
+       ssize_t         written;
+       long            status = 0;
+       int             err;
+       unsigned        bytes;
+
+       if ((ssize_t) count < 0)
+               return -EINVAL;
+#if 0
+       down(&inode->i_sem);
+#endif
+       if (pos < 0)
+                return -EINVAL;
+
+       written = 0;
+
+#if 0
+       remove_suid(inode);
+       update_inode_times(inode);
+#endif
+       do {
+               unsigned long index, offset;
+               char *kaddr;
+
+               /*
+                * Try to find the page in the cache. If it isn't there,
+                * allocate a free page.
+                */
+               offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+               index = pos >> PAGE_CACHE_SHIFT;
+               bytes = PAGE_CACHE_SIZE - offset;
+               if (bytes > count) {
+                       bytes = count;
+               }
+
+               status = -ENOMEM;       /* we'll assign it later anyway */
+               page = __grab_cache_page(index);
+               if (!page)
+                       break;
+
+               kaddr = kmap(page);
+               status = llu_prepare_write(inode, page, offset, offset+bytes);
+               if (status)
+                       goto sync_failure;
+
+               memcpy(kaddr+offset, buf, bytes);
+
+               status = llu_commit_write(inode, page, offset, offset+bytes);
+               if (!status)
+                       status = bytes;
+
+               if (status >= 0) {
+                       written += status;
+                       count -= status;
+                       pos += status;
+                       buf += status;
+               }
+unlock:
+               kunmap(page);
+               page_cache_release(page);
+
+               if (status < 0)
+                       break;
+       } while (count);
+done:
+       err = written ? written : status;
+
+#if 0
+       up(&inode->i_sem);
+#endif
+       return err;
+
+       status = -EFAULT;
+       goto unlock;
+
+sync_failure:
+       /*
+        * If blocksize < pagesize, prepare_write() may have instantiated a
+        * few blocks outside i_size.  Trim these off again.
+        */
+       kunmap(page);
+       page_cache_release(page);
+       goto done;
+}
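A worked example of the per-page arithmetic in the loop above, assuming PAGE_CACHE_SIZE is 4096; the figures are illustrative:

/*
 * write of count = 6000 bytes at pos = 5000:
 *   pass 1: index = 5000 >> 12 = 1, offset = 5000 & 4095 = 904,
 *           bytes = 4096 - 904 = 3192  (fills page 1 to its end)
 *   pass 2: index = 2, offset = 0, bytes = min(4096, 6000 - 3192) = 2808
 * Each pass prepares, copies into, and commits exactly one cached page.
 */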
+
+ssize_t llu_file_write(struct inode *inode, const struct iovec *iovec,
+                       size_t iovlen, loff_t pos)
+{
+        struct llu_inode_info *lli = llu_i2info(inode);
+        struct ll_file_data *fd = lli->lli_file_data; /* XXX not ready, don't use it yet */
+        struct lustre_handle lockh = { 0 };
+        struct lov_stripe_md *lsm = lli->lli_smd;
+        struct ldlm_extent extent;
+        ldlm_error_t err;
+        ssize_t retval = 0;
+        ENTRY;
+
+        /* XXX consider other types later */
+        if (!S_ISREG(lli->lli_st_mode))
+                LBUG();
+#if 0
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu,size="LPSZ",offset=%Ld\n",
+               inode->i_ino, count, *ppos);
+
+        /*
+         * sleep doing some writeback work of this mount's dirty data
+         * if the VM thinks we're low on memory.. other dirtying code
+         * paths should think about doing this, too, but they should be
+         * careful not to hold locked pages while they do so.  like
+         * ll_prepare_write.  *cough*
+         */
+        ll_check_dirty(inode->i_sb);
+#endif
+        while (iovlen--) {
+                const char *buf = iovec[iovlen].iov_base;
+                size_t count = iovec[iovlen].iov_len;
+
+                /* POSIX, but surprised the VFS doesn't check this already */
+                if (count == 0)
+                        continue;
+
+#if 0
+                if (!S_ISBLK(lli->lli_st_mode) && file->f_flags & O_APPEND) {
+                        extent.start = 0;
+                        extent.end = OBD_OBJECT_EOF;
+                } else  {
+                        extent.start = *ppos;
+                        extent.end = *ppos + count - 1;
+                }
+#else
+                extent.start = pos;
+                extent.end = pos + count - 1;
+#endif
+
+                err = llu_extent_lock(fd, inode, lsm, LCK_PW, &extent, &lockh);
+                if (err != ELDLM_OK)
+                        RETURN(-ENOLCK);
+
+#if 0
+                if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
+                        *ppos = inode->i_size;
+
+                CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
+                       inode->i_ino, count, *ppos);
+#endif
+                retval += llu_generic_file_write(inode, buf, count, pos);
+        }
+
+        /* XXX errors? */
+        ll_extent_unlock(fd, inode, lsm, LCK_PW, &lockh);
+        return(retval);
+}
+
+static void llu_update_atime(struct inode *inode)
+{
+#if 0
+        struct llu_inode_info *lli = llu_i2info(inode);
+
+#ifdef USE_ATIME
+        struct iattr attr;
+
+        attr.ia_atime = LTIME_S(CURRENT_TIME);
+        attr.ia_valid = ATTR_ATIME;
+
+        if (lli->lli_st_atime == attr.ia_atime) return;
+        if (IS_RDONLY(inode)) return;
+        if (IS_NOATIME(inode)) return;
+
+        /* ll_inode_setattr() sets inode->i_atime from attr.ia_atime */
+        llu_inode_setattr(inode, &attr, 0);
+#else
+        /* update atime, but don't explicitly write it out for just this change */
+        inode->i_atime = CURRENT_TIME;
+#endif
+#endif
+}
+
+static ssize_t llu_generic_file_read(struct inode *inode, char *buf,
+                                    size_t count, loff_t pos)
+{
+        struct llu_inode_info *lli = llu_i2info(inode);
+       unsigned long index, offset;
+       int error = 0;
+        size_t readed = 0;
+
+       index = pos >> PAGE_CACHE_SHIFT;
+       offset = pos & ~PAGE_CACHE_MASK;
+
+       do {
+               struct page *page;
+               unsigned long end_index, nr;
+
+               end_index = lli->lli_st_size >> PAGE_CACHE_SHIFT;
+
+               if (index > end_index)
+                       break;
+               nr = PAGE_CACHE_SIZE;
+               if (index == end_index) {
+                       nr = lli->lli_st_size & ~PAGE_CACHE_MASK;
+                       if (nr <= offset)
+                               break;
+               }
+
+               nr = nr - offset;
+                if (nr > count)
+                        nr = count;
+
+                page = grab_cache_page(index);
+                if (!page) {
+                        error = -ENOMEM;
+                        break;
+                }
+
+                error = llu_brw(OBD_BRW_READ, inode, page, 0);
+               if (error) {
+                       page_cache_release(page);
+                        break;
+               }
+
+                memcpy(buf, kmap(page)+offset, nr);
+                kunmap(page);
+                buf += nr;      /* advance the destination buffer between pages */
+               offset += nr;
+               index += offset >> PAGE_CACHE_SHIFT;
+               offset &= ~PAGE_CACHE_MASK;
+                readed += nr;
+                count -= nr;
+
+               page_cache_release(page);
+       } while (count);
+
+        if (error)
+                return error;
+        return readed;
+}
+
+ssize_t llu_file_read(struct inode *inode, const struct iovec *iovec,
+                       size_t iovlen, loff_t pos)
+{
+        struct llu_inode_info *lli = llu_i2info(inode);
+        struct ll_file_data *fd = lli->lli_file_data;
+        struct lov_stripe_md *lsm = lli->lli_smd;
+        struct lustre_handle lockh = { 0 };
+#if 0
+        struct ll_read_extent rextent;
+#else
+        struct ldlm_extent extent;
+#endif
+        ldlm_error_t err;
+        ssize_t retval = 0;
+        ENTRY;
+
+        while (iovlen--) {
+                char *buf = iovec[iovlen].iov_base;
+                size_t count = iovec[iovlen].iov_len;
+
+                /* "If nbyte is 0, read() will return 0 and have no other results."
+                 *                      -- Single Unix Spec */
+                if (count == 0)
+                        RETURN(0);
+
+#if 0
+                rextent.re_extent.start = pos;
+                rextent.re_extent.end = pos + count - 1;
+#else
+                extent.start = pos;
+                extent.end = pos + count - 1;
+#endif
+                err = llu_extent_lock(fd, inode, lsm, LCK_PR, &extent, &lockh);
+                if (err != ELDLM_OK)
+                        RETURN(-ENOLCK);
+#if 0
+                rextent.re_task = current;
+                spin_lock(&lli->lli_read_extent_lock);
+                list_add(&rextent.re_lli_item, &lli->lli_read_extents);
+                spin_unlock(&lli->lli_read_extent_lock);
+#endif
+                CDEBUG(D_INFO, "Reading inode %lu, "LPSZ" bytes, offset %Ld\n",
+                       lli->lli_st_ino, count, pos);
+                retval = llu_generic_file_read(inode, buf, count, pos);
+#if 0
+                spin_lock(&lli->lli_read_extent_lock);
+                list_del(&rextent.re_lli_item);
+                spin_unlock(&lli->lli_read_extent_lock);
+#endif
+        }
+
+        if (retval > 0)
+                llu_update_atime(inode);
+
+        /* XXX errors? */
+        ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
+        RETURN(retval);
+}
+
diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c
new file mode 100644 (file)
index 0000000..27ac231
--- /dev/null
@@ -0,0 +1,781 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Lustre Light Super operations
+ *
+ *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <stdlib.h>
+#include <string.h>
+#include <error.h>
+#include <assert.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+
+#include <sysio.h>
+#include <fs.h>
+#include <mount.h>
+#include <inode.h>
+#include <file.h>
+
+#include "llite_lib.h"
+
+static void llu_fsop_gone(struct filesys *fs)
+{
+        /* FIXME */
+}
+
+static struct inode_ops llu_inode_ops;
+
+void llu_update_inode(struct inode *inode, struct mds_body *body,
+                      struct lov_stripe_md *lsm)
+{
+        struct llu_inode_info *lli = llu_i2info(inode);
+
+        LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
+        if (lsm != NULL) {
+                if (lli->lli_smd == NULL)                        
+                        lli->lli_smd = lsm;
+                else
+                        LASSERT (!memcmp (lli->lli_smd, lsm,
+                                          sizeof (*lsm)));
+        }
+
+        if (body->valid & OBD_MD_FLID)
+                lli->lli_st_ino = body->ino;
+        if (body->valid & OBD_MD_FLATIME)
+                LTIME_S(lli->lli_st_atime) = body->atime;
+        if (body->valid & OBD_MD_FLMTIME)
+                LTIME_S(lli->lli_st_mtime) = body->mtime;
+        if (body->valid & OBD_MD_FLCTIME)
+                LTIME_S(lli->lli_st_ctime) = body->ctime;
+        if (body->valid & OBD_MD_FLMODE)
+                lli->lli_st_mode = (lli->lli_st_mode & S_IFMT)|(body->mode & ~S_IFMT);
+        if (body->valid & OBD_MD_FLTYPE)
+                lli->lli_st_mode = (lli->lli_st_mode & ~S_IFMT)|(body->mode & S_IFMT);
+        if (body->valid & OBD_MD_FLUID)
+                lli->lli_st_uid = body->uid;
+        if (body->valid & OBD_MD_FLGID)
+                lli->lli_st_gid = body->gid;
+        if (body->valid & OBD_MD_FLFLAGS)
+                lli->lli_st_flags = body->flags;
+        if (body->valid & OBD_MD_FLNLINK)
+                lli->lli_st_nlink = body->nlink;
+        if (body->valid & OBD_MD_FLGENER)
+                lli->lli_st_generation = body->generation;
+        if (body->valid & OBD_MD_FLRDEV)
+                lli->lli_st_rdev = body->rdev;
+        if (body->valid & OBD_MD_FLSIZE)
+                lli->lli_st_size = body->size;
+        if (body->valid & OBD_MD_FLBLOCKS)
+                lli->lli_st_blocks = body->blocks;
+
+        /* fillin fid */
+        if (body->valid & OBD_MD_FLID)
+                lli->lli_fid.id = body->ino;
+        if (body->valid & OBD_MD_FLGENER)
+                lli->lli_fid.generation = body->generation;
+        if (body->valid & OBD_MD_FLTYPE)
+                lli->lli_fid.f_type = body->mode & S_IFMT;
+}
+
+void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid)
+{
+        struct llu_inode_info *lli = llu_i2info(dst);
+
+        valid &= src->o_valid;
+
+        if (valid & OBD_MD_FLATIME)
+                LTIME_S(lli->lli_st_atime) = src->o_atime;
+        if (valid & OBD_MD_FLMTIME)
+                LTIME_S(lli->lli_st_mtime) = src->o_mtime;
+        if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(lli->lli_st_ctime))
+                LTIME_S(lli->lli_st_ctime) = src->o_ctime;
+        if (valid & OBD_MD_FLSIZE)
+                lli->lli_st_size = src->o_size;
+        if (valid & OBD_MD_FLBLOCKS) /* allocation of space */
+                lli->lli_st_blocks = src->o_blocks;
+        if (valid & OBD_MD_FLBLKSZ)
+                lli->lli_st_blksize = src->o_blksize;
+        if (valid & OBD_MD_FLTYPE)
+                lli->lli_st_mode = (lli->lli_st_mode & ~S_IFMT) | (src->o_mode & S_IFMT);
+        if (valid & OBD_MD_FLMODE)
+                lli->lli_st_mode = (lli->lli_st_mode & S_IFMT) | (src->o_mode & ~S_IFMT);
+        if (valid & OBD_MD_FLUID)
+                lli->lli_st_uid = src->o_uid;
+        if (valid & OBD_MD_FLGID)
+                lli->lli_st_gid = src->o_gid;
+        if (valid & OBD_MD_FLFLAGS)
+                lli->lli_st_flags = src->o_flags;
+        if (valid & OBD_MD_FLNLINK)
+                lli->lli_st_nlink = src->o_nlink;
+        if (valid & OBD_MD_FLGENER)
+                lli->lli_st_generation = src->o_generation;
+        if (valid & OBD_MD_FLRDEV)
+                lli->lli_st_rdev = src->o_rdev;
+}
+
+void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid)
+{
+        struct llu_inode_info *lli = llu_i2info(src);
+
+        if (valid & OBD_MD_FLATIME)
+                dst->o_atime = LTIME_S(lli->lli_st_atime);
+        if (valid & OBD_MD_FLMTIME)
+                dst->o_mtime = LTIME_S(lli->lli_st_mtime);
+        if (valid & OBD_MD_FLCTIME)
+                dst->o_ctime = LTIME_S(lli->lli_st_ctime);
+        if (valid & OBD_MD_FLSIZE)
+                dst->o_size = lli->lli_st_size;
+        if (valid & OBD_MD_FLBLOCKS)   /* allocation of space */
+                dst->o_blocks = lli->lli_st_blocks;
+        if (valid & OBD_MD_FLBLKSZ)
+                dst->o_blksize = lli->lli_st_blksize;
+        if (valid & OBD_MD_FLTYPE)
+                dst->o_mode = (dst->o_mode & ~S_IFMT) | (lli->lli_st_mode & S_IFMT);
+        if (valid & OBD_MD_FLMODE)
+                dst->o_mode = (dst->o_mode & S_IFMT) | (lli->lli_st_mode & ~S_IFMT);
+        if (valid & OBD_MD_FLUID)
+                dst->o_uid = lli->lli_st_uid;
+        if (valid & OBD_MD_FLGID)
+                dst->o_gid = lli->lli_st_gid;
+        if (valid & OBD_MD_FLFLAGS)
+                dst->o_flags = lli->lli_st_flags;
+        if (valid & OBD_MD_FLNLINK)
+                dst->o_nlink = lli->lli_st_nlink;
+        if (valid & OBD_MD_FLGENER)
+                dst->o_generation = lli->lli_st_generation;
+        if (valid & OBD_MD_FLRDEV)
+                dst->o_rdev = (__u32)(lli->lli_st_rdev);
+
+        dst->o_valid |= (valid & ~OBD_MD_FLID);
+}
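In both helpers above the OBD_MD_FL* bits in "valid" act as a field mask for the copy. A small calling sketch; the variable names are illustrative:

{
        struct obdo oa;

        memset(&oa, 0, sizeof(oa));
        /* copy only type and timestamps out of the inode */
        obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMTIME | OBD_MD_FLCTIME);
        /* o_valid now carries those bits; OBD_MD_FLID is deliberately left
         * out, so the caller sets oa.o_id and OBD_MD_FLID itself, as
         * llu_create_obj() does on its out_destroy path. */
        oa.o_id = lsm->lsm_object_id;
        oa.o_valid |= OBD_MD_FLID;
}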
+
+int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm,
+                      char *ostdata)
+{
+        struct llu_sb_info *sbi = llu_i2sbi(inode);
+        struct obdo oa;
+        int rc;
+        ENTRY;
+
+        LASSERT(lsm);
+        LASSERT(sbi);
+
+        memset(&oa, 0, sizeof oa);
+        oa.o_id = lsm->lsm_object_id;
+        oa.o_mode = S_IFREG;
+        oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
+                OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
+
+        if (ostdata != NULL) {
+                memcpy(&oa.o_inline, ostdata, FD_OSTDATA_SIZE);
+                oa.o_valid |= OBD_MD_FLHANDLE;
+        }
+
+        rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm);
+        if (rc)
+                RETURN(rc);
+
+        obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
+                           OBD_MD_FLMTIME | OBD_MD_FLCTIME);
+
+        RETURN(0);
+}
+
+struct inode* llu_new_inode(struct filesys *fs, ino_t ino, mode_t mode)
+{
+       struct inode *inode;
+        struct llu_inode_info *lli;
+
+        OBD_ALLOC(lli, sizeof(*lli));
+        if (!lli)
+                return NULL;
+
+        /* initialize lli here */
+        lli->lli_sbi = llu_fs2sbi(fs);
+        lli->lli_smd = NULL;
+        lli->lli_symlink_name = NULL;
+        lli->lli_flags = 0;
+        INIT_LIST_HEAD(&lli->lli_read_extents);
+        lli->lli_file_data = NULL;
+
+        /* could file_identifier be 0 ? FIXME */
+       inode = _sysio_i_new(fs, ino, NULL,
+#ifndef AUTOMOUNT_FILE_NAME
+                            mode & S_IFMT,
+#else
+                            mode,      /* all of the bits! */
+#endif
+                             0,
+                            &llu_inode_ops, lli);
+
+       if (!inode)
+               OBD_FREE(lli, sizeof(*lli));
+
+        return inode;
+}
+
+static int llu_iop_lookup(struct pnode *pnode,
+                          struct inode **inop,
+                          struct intent *intnt __IS_UNUSED,
+                          const char *path __IS_UNUSED)
+{
+        struct pnode_base *pb_dir = pnode->p_parent->p_base;
+        struct ptlrpc_request *request = NULL;
+        struct llu_sb_info *sbi = llu_i2sbi(pb_dir->pb_ino);
+        struct ll_fid *fid = &llu_i2info(pb_dir->pb_ino)->lli_fid;
+        struct qstr *name = &pnode->p_base->pb_name;
+        struct mds_body *body;
+        unsigned long valid;
+        char *pname;
+        int rc, easize;
+        struct ll_read_inode2_cookie lic = {.lic_body = NULL, .lic_lsm = NULL};
+
+        /* the mount root inode has no name, so don't call the
+         * remote MDS in this case.  But probably we need to revalidate
+         * it here? FIXME */
+        if (pnode->p_mount->mnt_root == pnode) {
+                struct inode *i = pnode->p_base->pb_ino;
+                I_REF(i);
+                *inop = i;
+                return 0;
+        }
+
+        if (!name->len)
+                return -EINVAL;
+
+        /* mdc_getattr_name requires a NUL-terminated name */
+        OBD_ALLOC(pname, name->len + 1);
+        if (!pname)
+                return -ENOMEM;
+        memcpy(pname, name->name, name->len);
+        pname[name->len] = 0;
+
+        valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE;
+
+        /* FIXME before getattr_name, we don't know whether the inode
+         * we are looking up is regular or not, so here we blindly ask
+         * the server to send back EA data as well */
+        easize = obd_size_diskmd(&sbi->ll_osc_conn, NULL);
+        valid |= OBD_MD_FLEASIZE;
+
+        rc = mdc_getattr_name(&sbi->ll_mdc_conn, fid,
+                              pname, name->len + 1,
+                              valid, easize, &request);
+        if (rc < 0) {
+                CERROR("mdc_getattr_name: %d\n", rc);
+                rc = -ENOENT;
+                goto out;
+        }
+        body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body));
+
+        *inop = llu_new_inode(pnode->p_mount->mnt_fs, body->ino, body->mode);
+        if (!*inop) {   /* check the new inode, not the out pointer */
+                rc = -ENOMEM;
+                goto out;
+        }
+
+        lic.lic_body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*lic.lic_body));
+        LASSERT (lic.lic_body != NULL);
+        LASSERT_REPSWABBED (request, 0);
+
+        if (S_ISREG(lic.lic_body->mode) &&
+            lic.lic_body->valid & OBD_MD_FLEASIZE) {
+                struct lov_mds_md    *lmm;
+                int                   lmm_size;
+                int                   rc;
+                
+                lmm_size = lic.lic_body->eadatasize;
+                if (lmm_size == 0) {
+                        CERROR ("OBD_MD_FLEASIZE set but eadatasize 0\n");
+                        RETURN (-EPROTO);
+                }
+                lmm = lustre_msg_buf(request->rq_repmsg, 0 + 1, lmm_size);
+                LASSERT(lmm != NULL);
+                LASSERT_REPSWABBED (request, 0 + 1);
+
+                rc = obd_unpackmd (&sbi->ll_osc_conn, 
+                                   &lic.lic_lsm, lmm, lmm_size);
+                if (rc < 0) {
+                        CERROR ("Error %d unpacking eadata\n", rc);
+                        RETURN (rc);
+                }
+                LASSERT (rc >= sizeof (*lic.lic_lsm));
+
+        } else {
+                lic.lic_lsm = NULL;
+        }
+
+        llu_update_inode(*inop, body, lic.lic_lsm);
+
+        if (llu_i2info(*inop)->lli_smd) {
+                rc = llu_inode_getattr(*inop, llu_i2info(*inop)->lli_smd, NULL);
+                if (rc)
+                        _sysio_i_gone(*inop);
+        }
+
+out:
+        ptlrpc_req_finished(request);
+        OBD_FREE(pname, name->len + 1);
+
+        return rc;
+}
+
+static int llu_iop_getattr(struct pnode *pno,
+                           struct inode *ino,
+                           struct intnl_stat *b)
+{
+        struct llu_inode_info *lli = llu_i2info(ino);
+
+        b->st_dev = lli->lli_st_dev;
+        b->st_ino = lli->lli_st_ino;
+        b->st_mode = lli->lli_st_mode;
+        b->st_nlink = lli->lli_st_nlink;
+        b->st_uid = lli->lli_st_uid;
+        b->st_gid = lli->lli_st_gid;
+        b->st_rdev = lli->lli_st_rdev;
+        b->st_size = lli->lli_st_size;
+        b->st_blksize = lli->lli_st_blksize;
+        b->st_blocks = lli->lli_st_blocks;
+        b->st_atime = lli->lli_st_atime;
+        b->st_mtime = lli->lli_st_mtime;
+        b->st_ctime = lli->lli_st_ctime;
+
+        return 0;
+}
+
+int llu_mdc_cancel_unused(struct lustre_handle *conn,
+                          struct llu_inode_info *lli,
+                          int flags)
+{
+        struct ldlm_res_id res_id =
+                { .name = {lli->lli_st_ino, lli->lli_st_generation} };
+        struct obd_device *obddev = class_conn2obd(conn);
+        ENTRY;
+        RETURN(ldlm_cli_cancel_unused(obddev->obd_namespace, &res_id, flags));
+}
+
+static void llu_clear_inode(struct inode *inode)
+{
+        struct llu_sb_info *sbi = llu_i2sbi(inode);
+        struct llu_inode_info *lli = llu_i2info(inode);
+        int rc;
+        ENTRY;
+
+        CDEBUG(D_INODE, "clear inode: %lu\n", lli->lli_st_ino);
+        rc = llu_mdc_cancel_unused(&sbi->ll_mdc_conn, lli,
+                                   LDLM_FL_NO_CALLBACK);
+        if (rc < 0) {
+                CERROR("ll_mdc_cancel_unused: %d\n", rc);
+                /* XXX FIXME do something dramatic */
+        }
+
+        if (lli->lli_smd) {
+                rc = obd_cancel_unused(&sbi->ll_osc_conn, lli->lli_smd, 0);
+                if (rc < 0) {
+                        CERROR("obd_cancel_unused: %d\n", rc);
+                        /* XXX FIXME do something dramatic */
+                }
+        }
+
+        if (lli->lli_smd)
+                obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd);
+
+        if (lli->lli_symlink_name) {
+                OBD_FREE(lli->lli_symlink_name,
+                         strlen(lli->lli_symlink_name) + 1);
+                lli->lli_symlink_name = NULL;
+        }
+
+        EXIT;
+}
+
+void llu_iop_gone(struct inode *inode)
+{
+        struct llu_inode_info *lli = llu_i2info(inode);
+
+        llu_clear_inode(inode);
+
+        OBD_FREE(lli, sizeof(*lli));
+}
+
+static int llu_setattr_raw(struct inode *inode, struct iattr *attr)
+{
+        struct ptlrpc_request *request = NULL;
+        struct llu_sb_info *sbi = llu_i2sbi(inode);
+        struct llu_inode_info *lli = llu_i2info(inode);
+        struct mdc_op_data op_data;
+        int err = 0;
+        ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", lli->lli_st_ino);
+
+        /* if a truncate is needed, it must be done first */
+        if (attr->ia_valid & ATTR_SIZE) {
+                printf("************* don't support truncate now !!!!!!!!\n");
+                LBUG();
+        }
+
+        /* Don't send size changes to MDS to avoid "fast EA" problems, and
+         * also avoid a pointless RPC (we get the file size from the OST anyway).
+         */
+        attr->ia_valid &= ~ATTR_SIZE;
+        if (!attr->ia_valid)
+                RETURN(0);
+
+        llu_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
+
+        err = mdc_setattr(&sbi->ll_mdc_conn, &op_data,
+                          attr, NULL, 0, &request);
+        if (err)
+                CERROR("mdc_setattr fails: err = %d\n", err);
+
+        ptlrpc_req_finished(request);
+
+        if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_MTIME_SET) {
+                struct lov_stripe_md *lsm = lli->lli_smd;
+                struct obdo oa;
+                int err2;
+
+                CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n",
+                       lli->lli_st_ino, attr->ia_mtime);
+                oa.o_id = lsm->lsm_object_id;
+                oa.o_mode = S_IFREG;
+                oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMTIME;
+                oa.o_mtime = attr->ia_mtime;
+                err2 = obd_setattr(&sbi->ll_osc_conn, &oa, lsm, NULL);
+                if (err2) {
+                        CERROR("obd_setattr fails: rc=%d\n", err);
+                        if (!err)
+                                err = err2;
+                }
+        }
+        RETURN(err);
+}
+
+/* FIXME here we simply act as a thin layer gluing this to
+ * llu_setattr_raw(), which is copied from the kernel llite code
+ */
+static int llu_iop_setattr(struct pnode *pno,
+                           struct inode *ino,
+                           unsigned mask,
+                           struct intnl_stat *stbuf)
+{
+        struct iattr iattr;
+
+        memset(&iattr, 0, sizeof(iattr));
+
+        if (mask & SETATTR_MODE) {
+                iattr.ia_mode = stbuf->st_mode;
+                iattr.ia_valid |= ATTR_MODE;
+        }
+        if (mask & SETATTR_MTIME) {
+                iattr.ia_mtime = stbuf->st_mtime;
+                iattr.ia_valid |= ATTR_MTIME;
+        }
+        if (mask & SETATTR_ATIME) {
+                iattr.ia_atime = stbuf->st_atime;
+                iattr.ia_valid |= ATTR_ATIME;
+        }
+        if (mask & SETATTR_UID) {
+                iattr.ia_uid = stbuf->st_uid;
+                iattr.ia_valid |= ATTR_UID;
+        }
+        if (mask & SETATTR_GID) {
+                iattr.ia_gid = stbuf->st_gid;
+                iattr.ia_valid |= ATTR_GID;
+        }
+        if (mask & SETATTR_LEN) {
+                iattr.ia_size = stbuf->st_size; /* FIXME signed expansion problem */
+                iattr.ia_valid |= ATTR_SIZE;
+        }
+
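+        /* ATTR_RAW appears to tell the MDS-side setattr path to apply
+         * the attributes directly rather than going through the local
+         * VFS checks (assumption based on the kernel llite usage) */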
+        iattr.ia_valid |= ATTR_RAW;
+        /* FIXME FIXME FIXME FIXME FIXME FIXME FIXME
+         * without ATTR_FROM_OPEN, mds_reint_setattr will call
+         * mds_fid2locked_dentry() and deadlock at the completion_ast call.
+         * Here we work around it by avoiding any locking.
+         * FIXME FIXME FIXME FIXME FIXME FIXME FIXME
+         */
+        iattr.ia_valid |= ATTR_FROM_OPEN;
+
+        return llu_setattr_raw(ino, &iattr);
+}
+
+
+static int llu_mkdir2(struct inode *dir, const char *name, int len, int mode)
+{
+        struct ptlrpc_request *request = NULL;
+        time_t curtime = CURRENT_TIME;
+        struct llu_sb_info *sbi = llu_i2sbi(dir);
+        struct llu_inode_info *lli = llu_i2info(dir);
+        struct mdc_op_data op_data;
+        int err = -EMLINK;
+        ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu\n",
+               name, lli->lli_st_ino);
+
+        /* FIXME check this later */
+#if 0 
+        if (dir->i_nlink >= EXT2_LINK_MAX)
+                RETURN(err);
+        mode = (mode & (S_IRWXUGO|S_ISVTX) & ~current->fs->umask) | S_IFDIR;
+#endif
+        mode |= S_IFDIR;
+        llu_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0);
+        err = mdc_create(&sbi->ll_mdc_conn, &op_data, NULL, 0, mode,
+                         current->fsuid, current->fsgid,
+                         curtime, 0, &request);
+        ptlrpc_req_finished(request);
+        RETURN(err);
+}
+
+static int llu_iop_mkdir(struct pnode *pno, mode_t mode)
+{
+        struct inode *dir = pno->p_base->pb_parent->pb_ino;
+        struct qstr *qstr = &pno->p_base->pb_name;
+        int rc;
+
+        LASSERT(dir);
+
+        rc = llu_mkdir2(dir, qstr->name, qstr->len, mode);
+
+        return rc;
+}
+
+#ifndef S_IRWXUGO
+#define S_IRWXUGO       (S_IRWXU|S_IRWXG|S_IRWXO)
+#endif
+
+static int llu_symlink2(struct inode *dir, const char *name, int len,
+                        const char *tgt)
+{
+        struct ptlrpc_request *request = NULL;
+        time_t curtime = CURRENT_TIME;
+        struct llu_sb_info *sbi = llu_i2sbi(dir);
+        struct llu_inode_info *lli = llu_i2info(dir);
+        struct mdc_op_data op_data;
+        int err = -EMLINK;
+        ENTRY;
+
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu,target=%s\n",
+               name, lli->lli_st_ino, tgt);
+
+#if 0
+        if (dir->i_nlink >= EXT2_LINK_MAX)
+                RETURN(err);
+#endif
+        llu_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0);
+        err = mdc_create(&sbi->ll_mdc_conn, &op_data,
+                         tgt, strlen(tgt) + 1, S_IFLNK | S_IRWXUGO,
+                         current->fsuid, current->fsgid, curtime, 0, &request);
+        ptlrpc_req_finished(request);
+        RETURN(err);
+}
+
+static int llu_iop_symlink(struct pnode *pno, const char *data)
+{
+        struct inode *dir = pno->p_base->pb_parent->pb_ino;
+        struct qstr *qstr = &pno->p_base->pb_name;
+        int rc;
+        
+        LASSERT(dir);
+
+        rc = llu_symlink2(dir, qstr->name, qstr->len, data);
+
+        return rc;
+}
+
+struct filesys_ops llu_filesys_ops =
+{
+        fsop_gone: llu_fsop_gone,
+};
+
+
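+/* entries left NULL below are operations liblustre does not implement yet */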
+static struct inode_ops llu_inode_ops = {
+        inop_lookup:    llu_iop_lookup,
+        inop_getattr:   llu_iop_getattr,
+        inop_setattr:   llu_iop_setattr,
+        inop_getdirentries:     NULL,
+        inop_mkdir:     llu_iop_mkdir,
+        inop_rmdir:     NULL,
+        inop_symlink:   llu_iop_symlink,
+        inop_readlink:  NULL,
+        inop_open:      llu_iop_open,
+        inop_close:     llu_iop_close,
+        inop_unlink:    NULL,
+        inop_ipreadv:   llu_iop_ipreadv,
+        inop_ipwritev:  llu_iop_ipwritev,
+        inop_iodone:    llu_iop_iodone,
+        inop_fcntl:     NULL,
+        inop_sync:      NULL,
+        inop_datasync:  NULL,
+        inop_ioctl:     NULL,
+        inop_mknod:     NULL,
+        inop_statvfs:   NULL,
+        inop_gone:      llu_iop_gone,
+};
+
+
+static int
+llu_fsswop_mount(const char *source,
+                 unsigned flags,
+                 const void *data __IS_UNUSED,
+                 struct pnode *tocover,
+                 struct mount **mntp)
+{
+        struct filesys *fs;
+        struct inode *root;
+        struct pnode_base *rootpb;
+        static struct qstr noname = { NULL, 0, 0 };
+        struct ll_fid rootfid;
+
+        struct llu_sb_info *sbi;
+        struct ptlrpc_connection *mdc_conn;
+        struct ptlrpc_request *request = NULL;
+        struct mds_body *root_body;
+        struct obd_uuid param_uuid;
+        class_uuid_t uuid;
+        struct obd_device *obd;
+        char *osc = mount_option.osc_uuid;
+        char *mdc = mount_option.mdc_uuid;
+        int err = -EINVAL;
+
+        ENTRY;
+
+        OBD_ALLOC(sbi, sizeof(*sbi));
+        if (!sbi)
+                RETURN(-ENOMEM);
+
+        INIT_LIST_HEAD(&sbi->ll_conn_chain);
+        generate_random_uuid(uuid);
+        class_uuid_unparse(uuid, &sbi->ll_sb_uuid);
+
+        fs = _sysio_fs_new(&llu_filesys_ops, flags, sbi);
+        if (!fs) {
+                err = -ENOMEM;
+                goto out_free;
+        }
+
+        strncpy(param_uuid.uuid, mdc, sizeof(param_uuid.uuid));
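+        /* note: strncpy() will not NUL-terminate if the name fills the
+         * buffer; the uuid strings are assumed to fit in param_uuid.uuid */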
+        obd = class_uuid2obd(&param_uuid);
+        if (!obd) {
+                CERROR("MDC %s: not setup or attached\n", mdc);
+                err = -EINVAL;
+                goto out_free;
+        }
+
+        /* setup mdc */
+        /* FIXME need recover stuff */
+        err = obd_connect(&sbi->ll_mdc_conn, obd, &sbi->ll_sb_uuid);
+        if (err) {
+                CERROR("cannot connect to %s: rc = %d\n", mdc, err);
+                goto out_free;
+        }
+
+        mdc_conn = sbi2mdc(sbi)->cl_import->imp_connection;
+
+        /* setup osc */
+        strncpy(param_uuid.uuid, osc, sizeof(param_uuid.uuid));
+        obd = class_uuid2obd(&param_uuid);
+        if (!obd) {
+                CERROR("OSC %s: not setup or attached\n", osc);
+                err = -EINVAL;
+                goto out_mdc;
+        }
+
+        err = obd_connect(&sbi->ll_osc_conn, obd, &sbi->ll_sb_uuid);
+        if (err) {
+                CERROR("cannot connect to %s: rc = %d\n", osc, err);
+                goto out_mdc;
+        }
+
+        err = mdc_getstatus(&sbi->ll_mdc_conn, &rootfid);
+        if (err) {
+                CERROR("cannot mds_connect: rc = %d\n", err);
+                goto out_osc;
+        }
+        CDEBUG(D_SUPER, "rootfid "LPU64"\n", rootfid.id);
+        sbi->ll_rootino = rootfid.id;
+
+/* XXX do we need this??
+        memset(&osfs, 0, sizeof(osfs));
+        rc = obd_statfs(&sbi->ll_mdc_conn, &osfs);
+*/
+        /* fetch attr of root inode */
+        err = mdc_getattr(&sbi->ll_mdc_conn, &rootfid,
+                          OBD_MD_FLNOTOBD|OBD_MD_FLBLOCKS, 0, &request);
+        if (err) {
+                CERROR("mdc_getattr failed for root: rc = %d\n", err);
+                goto out_request;
+        }
+
+        root_body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*root_body));
+        LASSERT(sbi->ll_rootino != 0);
+
+        root = llu_new_inode(fs, root_body->ino, root_body->mode);
+        if (!root) {
+                err = -ENOMEM;
+                goto out_request;
+        }
+
+        llu_update_inode(root, root_body, NULL);
+
+        /*
+         * Generate base path-node for root.
+         */
+        rootpb = _sysio_pb_new(&noname, NULL, root);
+        if (!rootpb) {
+                err = -ENOMEM;
+                goto out_inode;
+        }
+
+        err = _sysio_do_mount(fs, rootpb, flags, NULL, mntp);
+        if (err) {
+                _sysio_pb_gone(rootpb);
+                goto out_inode;
+        }
+
+        ptlrpc_req_finished(request);
+        request = NULL;
+
+        printf("************************************************\n");
+        printf("*          Mount successfully!!!!!!!           *\n");
+        printf("************************************************\n");
+
+        return 0;
+
+out_inode:
+        _sysio_i_gone(root);
+out_request:
+        ptlrpc_req_finished(request);
+out_osc:
+        obd_disconnect(&sbi->ll_osc_conn);
+out_mdc:
+        obd_disconnect(&sbi->ll_mdc_conn);
+out_free:
+        OBD_FREE(sbi, sizeof(*sbi));
+        return err;
+}
+
+struct fssw_ops llu_fssw_ops = {
+        llu_fsswop_mount
+};
+
index 309088b..ddb9657 100644 (file)
@@ -11,6 +11,6 @@ EXTRA_PROGRAMS = llite
 
 llite_SOURCES = dcache.c commit_callback.c super.c rw.c iod.c super25.c
 llite_SOURCES += file.c dir.c sysctl.c symlink.c
-llite_SOURCES += recover.c namei.c lproc_llite.c
+llite_SOURCES += namei.c lproc_llite.c
 
 include $(top_srcdir)/Rules
index f8b7e70..ee49bb8 100644 (file)
@@ -34,6 +34,7 @@
 
 #include <linux/lustre_lite.h>
 #include <linux/lustre_lib.h>
+#include <linux/lustre_compat25.h>
 
 static int ll_commitcbd_check_event(struct ll_sb_info *sbi)
 {
@@ -57,26 +58,17 @@ static int ll_commitcbd_main(void *arg)
         ENTRY;
 
         lock_kernel();
-        daemonize();
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        spin_lock_irqsave(&current->sigmask_lock, flags);
-        sigfillset(&current->blocked);
-        our_recalc_sigpending(current);
-        spin_unlock_irqrestore(&current->sigmask_lock, flags);
-#else
+        kportal_daemonize("lustre_commitcbd");
+
+        SIGNAL_MASK_LOCK(current, flags);
         sigfillset(&current->blocked);
-        our_recalc_sigpending(current);
-#endif
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
 
-        sprintf(current->comm, "lustre_commitcbd");
         unlock_kernel();
 
         /* Record that the  thread is running */
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        sbi->ll_commitcbd_waketime = CURRENT_TIME;
-#else
-        sbi->ll_commitcbd_waketime = CURRENT_TIME.tv_sec;
-#endif
+        sbi->ll_commitcbd_waketime = LTIME_S(CURRENT_TIME);
         sbi->ll_commitcbd_timeout = 10 * HZ;
         sbi->ll_commitcbd_thread = current;
         sbi->ll_commitcbd_flags =  LL_COMMITCBD_RUNNING;
index 41c68d9..0c9fcf7 100644 (file)
@@ -39,6 +39,16 @@ void ll_release(struct dentry *de)
         EXIT;
 }
 
+int ll_delete(struct dentry *de)
+{
+        if (de->d_it != 0) {
+                CERROR("%s put dentry %p+%p with d_it %p\n", current->comm,
+                       de, de->d_fsdata, de->d_it);
+                LBUG();
+        }
+        return 0;
+}
+
 void ll_set_dd(struct dentry *de)
 {
         ENTRY;
@@ -61,8 +71,6 @@ void ll_intent_release(struct dentry *de, struct lookup_intent *it)
         struct lustre_handle *handle;
         ENTRY;
 
-        LASSERT(ll_d2d(de) != NULL);
-
         if (it->it_lock_mode) {
                 handle = (struct lustre_handle *)it->it_lock_handle;
                 ldlm_lock_decref(handle, it->it_lock_mode);
@@ -80,8 +88,9 @@ void ll_intent_release(struct dentry *de, struct lookup_intent *it)
 
         if (de->d_it == it)
                 LL_GET_INTENT(de, it);
-        else 
-                CERROR("STRANGE intent release: %p %p\n", de->d_it, it);
+        else
+                CDEBUG(D_INODE, "STRANGE intent release: %p %p\n",
+                       de->d_it, it);
 
         EXIT;
 }
@@ -89,25 +98,66 @@ void ll_intent_release(struct dentry *de, struct lookup_intent *it)
 extern struct dentry *ll_find_alias(struct inode *, struct dentry *);
 
 static int revalidate2_finish(int flag, struct ptlrpc_request *request,
-                              struct dentry **de, struct lookup_intent *it,
-                              int offset, obd_id ino)
+                              struct inode *parent, struct dentry **de,
+                              struct lookup_intent *it, int offset, obd_id ino)
 {
-        struct mds_body *body;
-        struct lov_mds_md *lmm = NULL;
-        int rc = 0; 
+        struct ll_sb_info     *sbi = ll_i2sbi(parent);
+        struct mds_body       *body;
+        struct lov_stripe_md  *lsm = NULL;
+        struct lov_mds_md     *lmm;
+        int                    lmmsize;
+        int                    rc = 0;
         ENTRY;
 
-        if (!(flag & LL_LOOKUP_NEGATIVE)) {
-                body = lustre_msg_buf(request->rq_repmsg, offset);
-                if (body->valid & OBD_MD_FLEASIZE)
-                        lmm = lustre_msg_buf(request->rq_repmsg, offset + 1);
-                ll_update_inode((*de)->d_inode, body, lmm);
-                mdc_lock_set_inode((struct lustre_handle *)it->it_lock_handle,
-                                   (*de)->d_inode);
-        } else 
-                rc = -ENOENT;
-
-        ptlrpc_req_finished(request);
+        /* NB 1 request reference will be taken away by ll_intent_lock()
+         * when I return */
+
+        if ((flag & LL_LOOKUP_NEGATIVE) != 0)
+                GOTO (out, rc = -ENOENT);
+
+        /* We only get called if the mdc_enqueue() called from
+         * ll_intent_lock() was successful.  Therefore the mds_body is
+         * present and correct, and the eadata is present (but still
+         * opaque, so only obd_unpackmd() can check the size) */
+        body = lustre_msg_buf(request->rq_repmsg, offset, sizeof (*body));
+        LASSERT (body != NULL);
+        LASSERT_REPSWABBED (request, offset);
+
+        if (body->valid & OBD_MD_FLEASIZE) {
+                /* Only bother with this if the inode's LSM is not set? */
+
+                if (body->eadatasize == 0) {
+                        CERROR ("OBD_MD_FLEASIZE set, but eadatasize 0\n");
+                        GOTO (out, rc = -EPROTO);
+                }
+                lmmsize = body->eadatasize;
+                lmm = lustre_msg_buf (request->rq_repmsg, offset + 1, lmmsize);
+                LASSERT (lmm != NULL);
+                LASSERT_REPSWABBED (request, offset + 1);
+
+                rc = obd_unpackmd (&sbi->ll_osc_conn,
+                                   &lsm, lmm, lmmsize);
+                if (rc < 0) {
+                        CERROR ("Error %d unpacking eadata\n", rc);
+                        LBUG();
+                        /* XXX don't know if I should do this... */
+                        GOTO (out, rc);
+                        /* or skip the ll_update_inode but still do
+                         * mdc_lock_set_inode() */
+                }
+                LASSERT (rc >= sizeof (*lsm));
+                rc = 0;
+        }
+
+        ll_update_inode((*de)->d_inode, body, lsm);
+
+        if (lsm != NULL &&
+            ll_i2info((*de)->d_inode)->lli_smd != lsm)
+                obd_free_memmd (&sbi->ll_osc_conn, &lsm);
+
+        ll_mdc_lock_set_inode((struct lustre_handle *)it->it_lock_handle,
+                              (*de)->d_inode);
+ out:
         RETURN(rc);
 }
 
@@ -146,6 +196,8 @@ int ll_revalidate2(struct dentry *de, int flags, struct lookup_intent *it)
 {
         int rc;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,intent=%s\n", de->d_name.name,
+               LL_IT2STR(it));
 
         /* We don't want to cache negative dentries, so return 0 immediately.
          * We believe that this is safe, that negative dentries cannot be
@@ -221,4 +273,5 @@ struct dentry_operations ll_d_ops = {
         .d_revalidate2 = ll_revalidate2,
         .d_intent_release = ll_intent_release,
         .d_release = ll_release,
+        .d_delete = ll_delete,
 };
index 21192aa..8759598 100644 (file)
@@ -58,7 +58,7 @@ typedef struct ext2_dir_entry_2 ext2_dirent;
 static int ll_dir_prepare_write(struct file *file, struct page *page,
                                 unsigned from, unsigned to)
 {
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:\n");
         return 0;
 }
 
@@ -67,17 +67,18 @@ static int ll_dir_readpage(struct file *file, struct page *page)
 {
         struct inode *inode = page->mapping->host;
         struct ll_sb_info *sbi = ll_i2sbi(inode);
-        char *buf;
         __u64 offset;
         int rc = 0;
         struct ptlrpc_request *request;
         struct lustre_handle lockh;
         struct mds_body *body;
         struct lookup_intent it = { .it_op = IT_READDIR };
+        struct mdc_op_data data;
 
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+               inode->i_generation, inode);
         if ((inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT <= page->index){
                 /* XXX why do we need this exactly, and why do we think that
                  *     an all-zero directory page is useful?
@@ -89,8 +90,11 @@ static int ll_dir_readpage(struct file *file, struct page *page)
                 GOTO(readpage_out, rc);
         }
 
-        rc = mdc_enqueue(&sbi->ll_mdc_conn, LDLM_PLAIN, &it, LCK_PR, inode,
-                         NULL, &lockh, NULL, 0, inode, sizeof(*inode));
+        ll_prepare_mdc_op_data(&data, inode, NULL, NULL, 0, 0);
+
+        rc = mdc_enqueue(&sbi->ll_mdc_conn, LDLM_PLAIN, &it, LCK_PR,
+                         &data, &lockh, NULL, 0,
+                         ldlm_completion_ast, ll_mdc_blocking_ast, inode);
         request = (struct ptlrpc_request *)it.it_data;
         if (request)
                 ptlrpc_req_finished(request);
@@ -107,16 +111,14 @@ static int ll_dir_readpage(struct file *file, struct page *page)
         }
 
         offset = page->index << PAGE_SHIFT;
-        buf = kmap(page);
         rc = mdc_readpage(&sbi->ll_mdc_conn, inode->i_ino,
-                          S_IFDIR, offset, buf, &request);
-        kunmap(page);
+                          S_IFDIR, offset, page, &request);
         if (!rc) {
-                body = lustre_msg_buf(request->rq_repmsg, 0);
-                if (!body)
-                        rc = -EINVAL;
-                else
-                        inode->i_size = body->size;
+                body = lustre_msg_buf(request->rq_repmsg, 0, sizeof (*body));
+                LASSERT (body != NULL);         /* checked by mdc_readpage() */
+                LASSERT_REPSWABBED (request, 0); /* swabbed by mdc_readpage() */
+                
+                inode->i_size = body->size;
         }
         ptlrpc_req_finished(request);
         EXIT;
@@ -398,7 +400,8 @@ int ll_readdir(struct file * filp, void * dirent, filldir_t filldir)
         int need_revalidate = (filp->f_version != inode->i_version);
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+               inode->i_generation, inode);
         if (pos > inode->i_size - EXT2_DIR_REC_LEN(1))
                 GOTO(done, 0);
 
@@ -764,15 +767,17 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
         struct ll_sb_info *sbi = ll_i2sbi(inode);
         struct obd_ioctl_data *data;
         ENTRY;
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%u\n", inode->i_ino,
+               inode->i_generation, inode, cmd);
 
         switch(cmd) {
         case IOC_MDC_LOOKUP: {
                 struct ptlrpc_request *request = NULL;
+                struct ll_fid fid;
                 char *buf = NULL;
+                struct mds_body *body;
                 char *filename;
                 int namelen, rc, err, len = 0;
-                int ea_size = 0; // obd_size_wiremd(&sbi->ll_osc_conn, NULL);
                 unsigned long valid;
 
                 rc = obd_ioctl_getdata(&buf, &len, (void *)arg);
@@ -789,29 +794,32 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
                 }
 
                 valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE;
-                rc = mdc_getattr_name(&sbi->ll_mdc_conn, inode, filename,
-                                      namelen, valid, ea_size, &request);
+                ll_inode2fid(&fid, inode);
+                rc = mdc_getattr_name(&sbi->ll_mdc_conn, &fid,
+                                      filename, namelen, valid, 0, &request);
                 if (rc < 0) {
                         CERROR("mdc_getattr_name: %d\n", rc);
                         GOTO(out, rc);
-                } else {
-                        struct mds_body *body;
-                        body = lustre_msg_buf(request->rq_repmsg, 0);
-                        /* surely there's a better way -phik */
-                        data->ioc_obdo1.o_mode = body->mode;
-                        data->ioc_obdo1.o_uid = body->uid;
-                        data->ioc_obdo1.o_gid = body->gid;
                 }
 
+                body = lustre_msg_buf(request->rq_repmsg, 0, sizeof (*body));
+                LASSERT (body != NULL);         /* checked by mdc_getattr_name() */
+                LASSERT_REPSWABBED (request, 0); /* swabbed by mdc_getattr_name() */
+                
+                /* surely there's a better way -phik */
+                data->ioc_obdo1.o_mode = body->mode;
+                data->ioc_obdo1.o_uid = body->uid;
+                data->ioc_obdo1.o_gid = body->gid;
+
+                ptlrpc_req_finished(request);
+
                 err = copy_to_user((void *)arg, buf, len);
                 if (err)
-                        GOTO(out_req, rc = -EFAULT);
+                        GOTO(out, rc = -EFAULT);
 
                 EXIT;
-        out_req:
-                ptlrpc_req_finished(request);
         out:
-                OBD_FREE(buf, len);
+                obd_ioctl_freedata(buf, len);
                 return rc;
         }
         default:
@@ -820,8 +828,21 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
         }
 }
 
+int ll_dir_open(struct inode *inode, struct file *file)
+{
+        return ll_file_open(inode, file);
+}
+
+int ll_dir_release(struct inode *inode, struct file *file)
+{
+        return ll_file_release(inode, file);
+}
+
 struct file_operations ll_dir_operations = {
+        open: ll_dir_open,
+        release: ll_dir_release,
         read: generic_read_dir,
         readdir: ll_readdir,
         ioctl: ll_dir_ioctl
 };
+
index 4c16e1c..3429b28 100644 (file)
  */
 
 #define DEBUG_SUBSYSTEM S_LLITE
-
 #include <linux/lustre_dlm.h>
 #include <linux/lustre_lite.h>
 #include <linux/obd_lov.h>      /* for lov_mds_md_size() in lov_setstripe() */
 #include <linux/random.h>
+#include <linux/pagemap.h>
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#include <linux/lustre_compat25.h>
+#endif
 
 int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc);
 extern int ll_setattr(struct dentry *de, struct iattr *attr);
@@ -44,22 +47,25 @@ static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode,
 
         /* Complete the open request and remove it from replay list */
         rc = mdc_close(&ll_i2sbi(inode)->ll_mdc_conn, inode->i_ino,
-                       inode->i_mode, &fd->fd_mdshandle, &req);
+                       inode->i_mode, &fd->fd_mds_och.och_fh, &req);
         if (rc)
                 CERROR("inode %lu close failed: rc = %d\n", inode->i_ino, rc);
 
-        imp = fd->fd_req->rq_import;
+        imp = fd->fd_mds_och.och_req->rq_import;
         LASSERT(imp != NULL);
         spin_lock_irqsave(&imp->imp_lock, flags);
 
-        DEBUG_REQ(D_HA, fd->fd_req, "matched open req %p", fd->fd_req);
+        DEBUG_REQ(D_HA, fd->fd_mds_och.och_req, "matched open req %p",
+                  fd->fd_mds_och.och_req);
 
         /* We held on to the request for replay until we saw a close for that
          * file.  Now that we've closed it, it gets replayed on the basis of
          * its transno only. */
-        fd->fd_req->rq_flags &= ~PTL_RPC_FL_REPLAY;
+        spin_lock (&fd->fd_mds_och.och_req->rq_lock);
+        fd->fd_mds_och.och_req->rq_replay = 0;
+        spin_unlock (&fd->fd_mds_och.och_req->rq_lock);
 
-        if (fd->fd_req->rq_transno) {
+        if (fd->fd_mds_och.och_req->rq_transno) {
                 /* This open created a file, so it needs replay as a
                  * normal transaction now.  Our reference to it now
                  * effectively owned by the imp_replay_list, and it'll
@@ -78,7 +84,7 @@ static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode,
                  * the basis of that and we don't need to do anything
                  * magical here. */
                 if (!req->rq_transno) {
-                        req->rq_transno = fd->fd_req->rq_transno;
+                        req->rq_transno = fd->fd_mds_och.och_req->rq_transno;
                         ptlrpc_retain_replayable_request(req, imp);
                 }
                 spin_unlock_irqrestore(&imp->imp_lock, flags);
@@ -92,14 +98,14 @@ static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode,
                 /* No transno means that we can just drop our ref. */
                 spin_unlock_irqrestore(&imp->imp_lock, flags);
         }
-        ptlrpc_req_finished(fd->fd_req);
+        ptlrpc_req_finished(fd->fd_mds_och.och_req);
 
         /* Do this after the fd_req->rq_transno check, because we don't want
          * to bounce off zero references. */
         ptlrpc_req_finished(req);
-        fd->fd_mdshandle.cookie = DEAD_HANDLE_MAGIC;
+        fd->fd_mds_och.och_fh.cookie = DEAD_HANDLE_MAGIC;
         file->private_data = NULL;
-        kmem_cache_free(ll_file_data_slab, fd);
+        OBD_SLAB_FREE(fd, ll_file_data_slab, sizeof *fd);
 
         RETURN(-abs(rc));
 }
@@ -109,7 +115,7 @@ static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode,
  * rarely check close errors and even if an error is returned they will not
  * re-try the close call.
  */
-static int ll_file_release(struct inode *inode, struct file *file)
+int ll_file_release(struct inode *inode, struct file *file)
 {
         struct ll_file_data *fd;
         struct obdo oa;
@@ -119,6 +125,12 @@ static int ll_file_release(struct inode *inode, struct file *file)
         int rc = 0, rc2;
 
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+               inode->i_generation, inode);
+
+        /* don't do anything for / */
+        if (inode->i_sb->s_root == file->f_dentry)
+                RETURN(0);
 
         fd = (struct ll_file_data *)file->private_data;
         if (!fd) /* no process opened the file after an mcreate */
@@ -126,22 +138,24 @@ static int ll_file_release(struct inode *inode, struct file *file)
 
         /* we might not be able to get a valid handle on this file
          * again so we really want to flush our write cache.. */
-        filemap_fdatasync(inode->i_mapping);
-        filemap_fdatawait(inode->i_mapping);
+        if (S_ISREG(inode->i_mode)) {
+                filemap_fdatasync(inode->i_mapping);
+                filemap_fdatawait(inode->i_mapping);
 
-        if (lsm != NULL) {
-                memset(&oa, 0, sizeof(oa));
-                oa.o_id = lsm->lsm_object_id;
-                oa.o_mode = S_IFREG;
-                oa.o_valid = OBD_MD_FLTYPE | OBD_MD_FLID;
+                if (lsm != NULL) {
+                        memset(&oa, 0, sizeof(oa));
+                        oa.o_id = lsm->lsm_object_id;
+                        oa.o_mode = S_IFREG;
+                        oa.o_valid = OBD_MD_FLTYPE | OBD_MD_FLID;
 
-                memcpy(&oa.o_inline, fd->fd_ostdata, FD_OSTDATA_SIZE);
-                oa.o_valid |= OBD_MD_FLHANDLE;
+                        memcpy(&oa.o_inline, &fd->fd_ost_och, FD_OSTDATA_SIZE);
+                        oa.o_valid |= OBD_MD_FLHANDLE;
 
-                rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL);
-                if (rc)
-                        CERROR("inode %lu object close failed: rc = %d\n",
-                               inode->i_ino, rc);
+                        rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL);
+                        if (rc)
+                                CERROR("inode %lu object close failed: rc = "
+                                       "%d\n", inode->i_ino, rc);
+                }
         }
 
         rc2 = ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
@@ -155,20 +169,24 @@ static int ll_local_open(struct file *file, struct lookup_intent *it)
 {
         struct ptlrpc_request *req = it->it_data;
         struct ll_file_data *fd;
-        struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1);
+        struct mds_body *body;
         ENTRY;
 
+        body = lustre_msg_buf (req->rq_repmsg, 1, sizeof (*body));
+        LASSERT (body != NULL);                 /* reply already checked out */
+        LASSERT_REPSWABBED (req, 1);            /* and swabbed down */
+
         LASSERT(!file->private_data);
 
-        fd = kmem_cache_alloc(ll_file_data_slab, SLAB_KERNEL);
+        OBD_SLAB_ALLOC(fd, ll_file_data_slab, SLAB_KERNEL, sizeof *fd);
         /* We can't handle this well without reorganizing ll_file_open and
          * ll_mdc_close, so don't even try right now. */
         LASSERT(fd != NULL);
 
         memset(fd, 0, sizeof(*fd));
 
-        memcpy(&fd->fd_mdshandle, &body->handle, sizeof(body->handle));
-        fd->fd_req = it->it_data;
+        memcpy(&fd->fd_mds_och.och_fh, &body->handle, sizeof(body->handle));
+        fd->fd_mds_och.och_req = it->it_data;
         file->private_data = fd;
 
         RETURN(0);
@@ -189,16 +207,13 @@ static int ll_osc_open(struct lustre_handle *conn, struct inode *inode,
         oa->o_mode = S_IFREG;
         oa->o_valid = (OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLBLOCKS |
                        OBD_MD_FLMTIME | OBD_MD_FLCTIME);
-        rc = obd_open(conn, oa, lsm, NULL);
+        rc = obd_open(conn, oa, lsm, NULL, &fd->fd_ost_och);
         if (rc)
                 GOTO(out, rc);
 
         file->f_flags &= ~O_LOV_DELAY_CREATE;
-        obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS | OBD_MD_FLMTIME |
-                      OBD_MD_FLCTIME);
-
-        if (oa->o_valid & OBD_MD_FLHANDLE)
-                memcpy(fd->fd_ostdata, obdo_handle(oa), FD_OSTDATA_SIZE);
+        obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
+                                 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
 
         EXIT;
 out:
@@ -219,6 +234,7 @@ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode,
         struct lov_mds_md *lmm = NULL;
         struct obdo *oa;
         struct iattr iattr;
+        struct mdc_op_data op_data;
         int rc, err, lmm_size = 0;
         ENTRY;
 
@@ -230,8 +246,7 @@ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode,
         oa->o_id = inode->i_ino;
         /* Keep these 0 for now, because chown/chgrp does not change the
          * ownership on the OST, and we don't want to allow BA OST NFS
-         * users to access these objects by mistake.
-         */
+         * users to access these objects by mistake. */
         oa->o_uid = 0;
         oa->o_gid = 0;
         oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE |
@@ -247,6 +262,7 @@ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode,
                 }
                 GOTO(out_oa, rc);
         }
+        obdo_to_inode(inode, oa, OBD_MD_FLBLKSZ);
 
         LASSERT(lsm && lsm->lsm_object_id);
         rc = obd_packmd(conn, &lmm, lsm);
@@ -258,11 +274,14 @@ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode,
         /* Save the stripe MD with this file on the MDS */
         memset(&iattr, 0, sizeof(iattr));
         iattr.ia_valid = ATTR_FROM_OPEN;
-        rc = mdc_setattr(&ll_i2sbi(inode)->ll_mdc_conn, inode, &iattr,
-                         lmm, lmm_size, &req);
+
+        ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
+
+        rc = mdc_setattr(&ll_i2sbi(inode)->ll_mdc_conn, &op_data,
+                         &iattr, lmm, lmm_size, &req);
         ptlrpc_req_finished(req);
 
-        obd_free_wiremd(conn, &lmm);
+        obd_free_diskmd (conn, &lmm);
 
         /* If we couldn't complete mdc_open() and store the stripe MD on the
          * MDS, we need to destroy the objects now or they will be leaked.
@@ -273,6 +292,7 @@ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode,
                 GOTO(out_destroy, rc);
         }
         lli->lli_smd = lsm;
+        lli->lli_maxbytes = lsm->lsm_maxbytes;
 
         EXIT;
 out_oa:
@@ -308,7 +328,7 @@ out_destroy:
  */
 extern int ll_it_open_error(int phase, struct lookup_intent *it);
 
-static int ll_file_open(struct inode *inode, struct file *file)
+int ll_file_open(struct inode *inode, struct file *file)
 {
         struct ll_sb_info *sbi = ll_i2sbi(inode);
         struct ll_inode_info *lli = ll_i2info(inode);
@@ -318,7 +338,13 @@ static int ll_file_open(struct inode *inode, struct file *file)
         int rc = 0;
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+               inode->i_generation, inode);
+
+        /* don't do anything for / */
+        if (inode->i_sb->s_root == file->f_dentry)
+                RETURN(0);
+
         LL_GET_INTENT(file->f_dentry, it);
         rc = ll_it_open_error(IT_OPEN_OPEN, it);
         if (rc)
@@ -328,7 +354,10 @@ static int ll_file_open(struct inode *inode, struct file *file)
         if (rc)
                 LBUG();
 
-        mdc_set_open_replay_data((struct ll_file_data *)file->private_data);
+        mdc_set_open_replay_data(&((struct ll_file_data *)
+                                   file->private_data)->fd_mds_och);
+        if (!S_ISREG(inode->i_mode))
+                RETURN(0);
 
         lsm = lli->lli_smd;
         if (lsm == NULL) {
@@ -364,69 +393,86 @@ static int ll_file_open(struct inode *inode, struct file *file)
  * really does the getattr on the inode and updates its fields
  */
 int ll_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm,
-                     char *ostdata)
+                     void *ostdata)
 {
         struct ll_sb_info *sbi = ll_i2sbi(inode);
+        struct ll_inode_info *lli = ll_i2info(inode);
+        struct ptlrpc_request_set *set;
         struct obdo oa;
+        int bef, aft;
+        unsigned long before, after;
         int rc;
         ENTRY;
 
         LASSERT(lsm);
         LASSERT(sbi);
+        LASSERT(lli);
 
         memset(&oa, 0, sizeof oa);
         oa.o_id = lsm->lsm_object_id;
         oa.o_mode = S_IFREG;
         oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
-                OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | OBD_MD_FLCTIME;
+                OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
+                OBD_MD_FLCTIME;
 
         if (ostdata != NULL) {
                 memcpy(&oa.o_inline, ostdata, FD_OSTDATA_SIZE);
                 oa.o_valid |= OBD_MD_FLHANDLE;
         }
 
-        rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm);
-        if (rc)
-                RETURN(rc);
-
-        obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
-                           OBD_MD_FLMTIME | OBD_MD_FLCTIME);
-
-        CDEBUG(D_INODE, "objid "LPX64" size %Lu/%Lu\n", lsm->lsm_object_id,
-               inode->i_size, inode->i_size);
-        RETURN(0);
-}
-
-/*
- * we've acquired a lock and need to see if we should perform a getattr
- * to update the file size that may have been updated by others that had
- * their locks canceled.
- */
-static int ll_size_validate(struct inode *inode, struct lov_stripe_md *lsm,
-                            char *ostdata, struct ldlm_extent *extent)
-{
-        struct ll_inode_info *lli = ll_i2info(inode);
-        int rc = 0;
-        ENTRY;
-
-        if (test_bit(LLI_F_DID_GETATTR, &lli->lli_flags))
+        /* getattr can race with writeback.  we don't want to trust a getattr
+         * that doesn't include the writeback of our farthest cached pages
+         * that it raced with. */
+        do {
+                bef = ll_farthest_dirty(&lli->lli_dirty, &before);
+#if 0
+                rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm);
+#else
+                set = ptlrpc_prep_set ();
+                if (set == NULL) {
+                        CERROR ("ENOMEM allocing request set\n");
+                        rc = -ENOMEM;
+                } else {
+                        rc = obd_getattr_async(&sbi->ll_osc_conn, &oa, lsm, set);
+                        if (rc == 0)
+                                rc = ptlrpc_set_wait (set);
+                        ptlrpc_set_destroy (set);
+                }
+#endif
+                if (rc)
+                        RETURN(rc);
+
+                aft = ll_farthest_dirty(&lli->lli_dirty, &after);
+                CDEBUG(D_INODE, " %d,%lu -> %d,%lu\n", bef, before, aft, after);
+        } while (bef == 0 &&
+                 (aft != 0 || after < before) &&
+                 oa.o_size < ((u64)before + 1) << PAGE_CACHE_SHIFT);
+
+        obdo_to_inode(inode, &oa, (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
+                                   OBD_MD_FLMTIME | OBD_MD_FLCTIME));
+        if (inode->i_blksize < PAGE_CACHE_SIZE)
+                inode->i_blksize = PAGE_CACHE_SIZE;
+
+        /* make sure getattr doesn't return a size that causes writeback
+         * to forget about cached writes */
+        if ((aft == 0) && oa.o_size < ((u64)after + 1) << PAGE_CACHE_SHIFT) {
+                CDEBUG(D_INODE, "cached at %lu, keeping %llu i_size instead "
+                                "of oa "LPU64"\n", after, inode->i_size,
+                                oa.o_size);
                 RETURN(0);
-
-        down(&lli->lli_getattr_sem);
-
-        if (!test_bit(LLI_F_DID_GETATTR, &lli->lli_flags)) {
-                rc = ll_inode_getattr(inode, lsm, ostdata);
-                if ( rc == 0 ) 
-                        set_bit(LLI_F_DID_GETATTR, &lli->lli_flags);
         }
 
-        up(&lli->lli_getattr_sem);
-        RETURN(rc);
+        obdo_to_inode(inode, &oa, OBD_MD_FLSIZE);
+
+        CDEBUG(D_INODE, "objid "LPX64" size %Lu/%Lu blksize %lu\n",
+               lsm->lsm_object_id, inode->i_size, inode->i_size,
+               inode->i_blksize);
+        RETURN(0);
 }
 
 /*
  * some callers, notably truncate, really don't want i_size set based
- * on the the size returned by the getattr, or lock acquisition in 
+ * on the size returned by the getattr, or lock acquisition in
  * the future.
  */
 int ll_extent_lock_no_validate(struct ll_file_data *fd, struct inode *inode,
@@ -438,14 +484,14 @@ int ll_extent_lock_no_validate(struct ll_file_data *fd, struct inode *inode,
         int rc, flags = 0;
         ENTRY;
 
-        LASSERT(lockh->addr == 0 && lockh->cookie == 0);
+        LASSERT(lockh->cookie == 0);
 
         /* XXX phil: can we do this?  won't it screw the file size up? */
         if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
             (sbi->ll_flags & LL_SBI_NOLCK))
                 RETURN(0);
 
-        CDEBUG(D_INFO, "Locking inode %lu, start "LPU64" end "LPU64"\n",
+        CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
                inode->i_ino, extent->start, extent->end);
 
         rc = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT, extent,
@@ -454,30 +500,53 @@ int ll_extent_lock_no_validate(struct ll_file_data *fd, struct inode *inode,
 
         RETURN(rc);
 }
+
 /*
- * this grabs a lock and manually implements behaviour that makes it look
- * like the OST is returning the file size with each lock acquisition
+ * this grabs a lock and manually implements behaviour that makes it look like
+ * the OST is returning the file size with each lock acquisition.
  */
 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
                    struct lov_stripe_md *lsm,
                    int mode, struct ldlm_extent *extent,
                    struct lustre_handle *lockh)
 {
-        int rc;
+        struct ll_inode_info *lli = ll_i2info(inode);
+        struct ldlm_extent size_lock;
+        struct lustre_handle match_lockh = {0};
+        int flags = LDLM_FL_CBPENDING | LDLM_FL_BLOCK_GRANTED;
+        int rc, matched;
         ENTRY;
 
         rc = ll_extent_lock_no_validate(fd, inode, lsm, mode, extent, lockh);
+        if (rc != ELDLM_OK)
+                RETURN(rc);
 
-        if (rc == ELDLM_OK) {
-                rc = ll_size_validate(inode, lsm, fd ? fd->fd_ostdata : NULL,
-                        extent);
-                if ( rc != 0 ) {
-                        ll_extent_unlock(fd, inode, lsm, mode, lockh);
-                        rc = ELDLM_GETATTR_ERROR;
-                }
+        if (test_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags))
+                RETURN(0);
+
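+        /* no size lock cached yet: do the getattr under the extent lock
+         * we just acquired so i_size reflects other clients' writes */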
+        rc = ll_inode_getattr(inode, lsm, fd ? &fd->fd_ost_och : NULL);
+        if (rc) {
+                ll_extent_unlock(fd, inode, lsm, mode, lockh);
+                RETURN(rc);
         }
 
-        RETURN(rc);
+        size_lock.start = inode->i_size;
+        size_lock.end = OBD_OBJECT_EOF;
+
+        /* XXX I bet we should be checking the lock ignore flags.. */
+        matched = obd_match(&ll_i2sbi(inode)->ll_osc_conn, lsm, LDLM_EXTENT,
+                       &size_lock, sizeof(size_lock), LCK_PR, &flags,
+                       &match_lockh);
+
+        /* hey, alright, we hold a size lock that covers the size we
+         * just found, its not going to change for a while.. */
+        if (matched == 1) {
+                set_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags);
+                obd_cancel(&ll_i2sbi(inode)->ll_osc_conn, lsm, LCK_PR,
+                           &match_lockh);
+        }
+
+        RETURN(0);
 }
 
 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
@@ -513,16 +582,13 @@ static inline void ll_remove_suid(struct inode *inode)
         }
 }
 
+#if 0
 static void ll_update_atime(struct inode *inode)
 {
 #ifdef USE_ATIME
         struct iattr attr;
 
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        attr.ia_atime = CURRENT_TIME;
-#else
-        attr.ia_atime = CURRENT_TIME.tv_sec;
-#endif
+        attr.ia_atime = LTIME_S(CURRENT_TIME);
         attr.ia_valid = ATTR_ATIME;
 
         if (inode->i_atime == attr.ia_atime) return;
@@ -536,19 +602,170 @@ static void ll_update_atime(struct inode *inode)
         inode->i_atime = CURRENT_TIME;
 #endif
 }
+#endif
+
+/*
+ * flush the page cache for an extent as it's canceled.  when we're on an
+ * lov we get a lock cancelation for each of the obd locks under the lov
+ * so we have to map the obd's region back onto the stripes in the file
+ * that it held.
+ *
+ * no one can dirty the extent until we've finished our work and they
+ * can enqueue another lock.
+ *
+ * XXX this could be asking the inode's dirty tree for info
+ */
+void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
+                              struct ldlm_lock *lock)
+{
+        struct ldlm_extent *extent = &lock->l_extent;
+        unsigned long start, end, count, skip, i, j;
+        struct page *page;
+        int ret;
+        ENTRY;
+
+        CDEBUG(D_INODE, "obdo %lu inode %p ["LPU64"->"LPU64"] size: %llu\n",
+               inode->i_ino, inode, extent->start, extent->end, inode->i_size);
+
+        start = extent->start >> PAGE_CACHE_SHIFT;
+        count = ~0;
+        skip = 0;
+        end = (extent->end >> PAGE_CACHE_SHIFT) + 1;
+        if ((end << PAGE_CACHE_SHIFT) < extent->end)
+                end = ~0;
+        if (lsm->lsm_stripe_count > 1) {
+                struct {
+                        char name[16];
+                        struct ldlm_lock *lock;
+                        struct lov_stripe_md *lsm;
+                } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
+                __u32 stripe;
+                __u32 vallen = sizeof(stripe);
+                int rc;
+
+                /* get our offset in the lov */
+                rc = obd_get_info(ll_i2obdconn(inode), sizeof(key),
+                                  &key, &vallen, &stripe);
+                if (rc != 0) {
+                        CERROR("obd_get_info: rc = %d\n", rc);
+                        LBUG();
+                }
+                LASSERT(stripe < lsm->lsm_stripe_count);
+
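+                /* convert object page indices to file page indices: each
+                 * 'count'-page stripe unit of this object is 'count + skip'
+                 * file pages apart, and this object's first unit starts
+                 * 'stripe * count' pages into the file */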
+                count = lsm->lsm_stripe_size >> PAGE_CACHE_SHIFT;
+                skip = (lsm->lsm_stripe_count - 1) * count;
+                start += (start/count * skip) + (stripe * count);
+                if (end != ~0)
+                        end += (end/count * skip) + (stripe * count);
+        }
+
+        i = (inode->i_size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+        if (end >= i)
+                clear_bit(LLI_F_HAVE_SIZE_LOCK, &(ll_i2info(inode)->lli_flags));
+        if (i < end)
+                end = i;
+
+        CDEBUG(D_INODE, "start: %lu j: %lu count: %lu skip: %lu end: %lu\n",
+               start, start % count, count, skip, end);
+
+        /* start writeback on dirty pages in the extent when its PW */
+        for (i = start, j = start % count;
+                        lock->l_granted_mode == LCK_PW && i < end; j++, i++) {
+                if (j == count) {
+                        i += skip;
+                        j = 0;
+                }
+                /* it's unlikely, but give us a chance to bail when we're out */
+                PGCACHE_WRLOCK(inode->i_mapping);
+                if (list_empty(&inode->i_mapping->dirty_pages)) {
+                        CDEBUG(D_INODE, "dirty list empty\n");
+                        PGCACHE_WRUNLOCK(inode->i_mapping);
+                        break;
+                }
+                PGCACHE_WRUNLOCK(inode->i_mapping);
+
+                if (need_resched())
+                        schedule();
+
+                page = find_get_page(inode->i_mapping, i);
+                if (page == NULL)
+                        continue;
+                if (!PageDirty(page) || TryLockPage(page)) {
+                        page_cache_release(page);
+                        continue;
+                }
+                if (PageDirty(page)) {
+                        CDEBUG(D_INODE, "writing page %p\n", page);
+                        PGCACHE_WRLOCK(inode->i_mapping);
+                        list_del(&page->list);
+                        list_add(&page->list, &inode->i_mapping->locked_pages);
+                        PGCACHE_WRUNLOCK(inode->i_mapping);
+
+                        /* this writepage might write out pages outside
+                         * this extent, but that's ok, the pages are only
+                         * still dirty because a lock still covers them */
+                        ClearPageDirty(page);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+                        ret = inode->i_mapping->a_ops->writepage(page);
+#else
+                        ret = inode->i_mapping->a_ops->writepage(page, NULL);
+#endif
+                        if (ret != 0)
+                                unlock_page(page);
+                } else {
+                        unlock_page(page);
+                }
+                page_cache_release(page);
+
+        }
+
+        /* our locks are page granular thanks to osc_enqueue, we invalidate the
+         * whole page. */
+        LASSERT((extent->start & ~PAGE_CACHE_MASK) == 0);
+        LASSERT(((extent->end+1) & ~PAGE_CACHE_MASK) == 0);
+        for (i = start, j = start % count; i < end; j++, i++) {
+                if (j == count) {
+                        i += skip;
+                        j = 0;
+                }
+                PGCACHE_WRLOCK(inode->i_mapping);
+                if (list_empty(&inode->i_mapping->dirty_pages) &&
+                     list_empty(&inode->i_mapping->clean_pages) &&
+                     list_empty(&inode->i_mapping->locked_pages)) {
+                        CDEBUG(D_INODE, "nothing left\n");
+                        PGCACHE_WRUNLOCK(inode->i_mapping);
+                        break;
+                }
+                PGCACHE_WRUNLOCK(inode->i_mapping);
+                if (need_resched())
+                        schedule();
+                page = find_get_page(inode->i_mapping, i);
+                if (page == NULL)
+                        continue;
+                CDEBUG(D_INODE, "dropping page %p at %lu\n", page, page->index);
+                lock_page(page);
+                if (page->mapping) /* might have raced */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+                        truncate_complete_page(page);
+#else
+                        truncate_complete_page(page->mapping, page);
+#endif                
+                unlock_page(page);
+                page_cache_release(page);
+        }
+        EXIT;
+}
 
 int ll_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
                      void *data, int flag)
 {
         struct inode *inode = data;
         struct ll_inode_info *lli = ll_i2info(inode);
-        struct lustre_handle lockh = { 0, 0 };
+        struct lustre_handle lockh = { 0 };
         int rc;
         ENTRY;
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
 
-        if (inode == NULL)
-                LBUG();
+        LASSERT(inode != NULL);
 
         switch (flag) {
         case LDLM_CB_BLOCKING:
@@ -562,11 +779,10 @@ int ll_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
                  * could know to write-back or simply throw away the pages
                  * based on if the cancel comes from a desire to, say,
                  * read or truncate.. */
-                CDEBUG(D_INODE, "invalidating obdo/inode %lu\n", inode->i_ino);
-                filemap_fdatasync(inode->i_mapping);
-                filemap_fdatawait(inode->i_mapping);
-                clear_bit(LLI_F_DID_GETATTR, &lli->lli_flags);
-                truncate_inode_pages(inode->i_mapping, 0);
+                LASSERT((unsigned long)inode > 0x1000);
+                LASSERT((unsigned long)lli > 0x1000);
+                LASSERT((unsigned long)lli->lli_smd > 0x1000);
+                ll_pgcache_remove_extent(inode, lli->lli_smd, lock);
                 break;
         default:
                 LBUG();
@@ -582,27 +798,29 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
         struct inode *inode = filp->f_dentry->d_inode;
         struct ll_inode_info *lli = ll_i2info(inode);
         struct lov_stripe_md *lsm = lli->lli_smd;
-        struct lustre_handle lockh = { 0, 0 };
+        struct lustre_handle lockh = { 0 };
         struct ll_read_extent rextent;
         ldlm_error_t err;
         ssize_t retval;
         ENTRY;
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
+               inode->i_ino, inode->i_generation, inode, count, *ppos);
 
         /* "If nbyte is 0, read() will return 0 and have no other results."
          *                      -- Single Unix Spec */
         if (count == 0)
                 RETURN(0);
 
+        /* grab a [*ppos -> EOF] extent to push extending writes out of other
+         * nodes' caches so we can see them at the getattr after lock
+         * acquisition.  this will turn into a separate [*ppos + count, EOF]
+         * 'size intent' lock attempt in the future. */
         rextent.re_extent.start = *ppos;
-        rextent.re_extent.end = *ppos + count - 1;
+        rextent.re_extent.end = OBD_OBJECT_EOF;
 
-        err = ll_extent_lock(fd, inode, lsm, 
-                             LCK_PR, &rextent.re_extent, &lockh);
-        if (err != ELDLM_OK && err != ELDLM_LOCK_MATCHED) {
-                retval = -ENOLCK;
-                RETURN(retval);
-        }
+        err = ll_extent_lock(fd, inode, lsm, LCK_PR, &rextent.re_extent,&lockh);
+        if (err != ELDLM_OK)
+                RETURN(-ENOLCK);
 
         /* XXX tell ll_readpage what pages have a PR lock.. */
         rextent.re_task = current;
@@ -618,9 +836,6 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
         list_del(&rextent.re_lli_item);
         spin_unlock(&lli->lli_read_extent_lock);
 
-        if (retval > 0)
-                ll_update_atime(inode);
-
         /* XXX errors? */
         ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
         RETURN(retval);
@@ -634,40 +849,72 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
 {
         struct ll_file_data *fd = file->private_data;
         struct inode *inode = file->f_dentry->d_inode;
-        struct lustre_handle lockh = { 0, 0 };
         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
+        struct lustre_handle lockh = { 0 };
         struct ldlm_extent extent;
+        loff_t maxbytes = ll_file_maxbytes(inode);
         ldlm_error_t err;
         ssize_t retval;
+        char should_validate = 1;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
+               inode->i_ino, inode->i_generation, inode, count, *ppos);
+
+        /*
+         * sleep doing some writeback work of this mount's dirty data
+         * if the VM thinks we're low on memory.. other dirtying code
+         * paths should think about doing this, too, but they should be
+         * careful not to hold locked pages while they do so.  like
+         * ll_prepare_write.  *cough*
+         */
+        LL_CHECK_DIRTY(inode->i_sb);
 
         /* POSIX, but surprised the VFS doesn't check this already */
         if (count == 0)
                 RETURN(0);
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
-        if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) {
+        if (file->f_flags & O_APPEND) {
                 extent.start = 0;
                 extent.end = OBD_OBJECT_EOF;
         } else  {
                 extent.start = *ppos;
                 extent.end = *ppos + count - 1;
+                /* we really don't care what i_size is if we're doing
+                 * fully page-aligned writes */
+                if ((*ppos & ~PAGE_CACHE_MASK) == 0 &&
+                    (count & ~PAGE_CACHE_MASK) == 0)
+                        should_validate = 0;
         }
 
-        err = ll_extent_lock(fd, inode, lsm, LCK_PW, &extent, &lockh);
-        if (err != ELDLM_OK && err != ELDLM_LOCK_MATCHED) {
-                retval = -ENOLCK;
-                RETURN(retval);
-        }
-
-        if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
+        if (should_validate)
+                err = ll_extent_lock(fd, inode, lsm, LCK_PW, &extent, &lockh);
+        else
+                err = ll_extent_lock_no_validate(fd, inode, lsm, LCK_PW,
+                                                 &extent, &lockh);
+        if (err != ELDLM_OK)
+                RETURN(-ENOLCK);
+
+        /* this is ok; generic_file_write will overwrite this under i_sem if
+         * it races with a local truncate.  it just makes our maxbytes
+         * checking easier */
+        if (file->f_flags & O_APPEND)
                 *ppos = inode->i_size;
 
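+        /* mirror the usual VFS size-limit behaviour: a write starting at or
+         * beyond maxbytes fails with SIGXFSZ/-EFBIG, otherwise the write is
+         * clipped so that it ends at maxbytes */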
+        if (*ppos >= maxbytes) {
+                if (count || *ppos > maxbytes) {
+                        send_sig(SIGXFSZ, current, 0);
+                        GOTO(out, retval = -EFBIG);
+                }
+        }
+        if (*ppos + count > maxbytes)
+                count = maxbytes - *ppos;
+
         CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
                inode->i_ino, count, *ppos);
 
+        /* generic_file_write handles O_APPEND after getting i_sem */
         retval = generic_file_write(file, buf, count, ppos);
 
+out:
         /* XXX errors? */
         ll_extent_unlock(fd, inode, lsm, LCK_PW, &lockh);
         RETURN(retval);
@@ -686,7 +933,7 @@ static int ll_lov_setstripe(struct inode *inode, struct file *file,
         lsm = lli->lli_smd;
         if (lsm) {
                 up(&lli->lli_open_sem);
-                CERROR("stripe already set for ino %lu\n", inode->i_ino);
+                CERROR("stripe already exists for ino %lu\n", inode->i_ino);
                 /* If we haven't already done the open, do so now */
                 if (file->f_flags & O_LOV_DELAY_CREATE) {
                         int rc2 = ll_osc_open(conn, inode, file, lsm);
@@ -694,7 +941,7 @@ static int ll_lov_setstripe(struct inode *inode, struct file *file,
                                 RETURN(rc2);
                 }
 
-                RETURN(-EALREADY);
+                RETURN(-EEXIST);
         }
 
         rc = obd_iocontrol(LL_IOC_LOV_SETSTRIPE, conn, 0, &lsm, (void *)arg);
@@ -730,8 +977,8 @@ int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
         struct ll_file_data *fd = file->private_data;
         struct lustre_handle *conn;
         int flags;
-
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%u\n", inode->i_ino,
+               inode->i_generation, inode, cmd);
 
         if ((cmd & 0xffffff00) == ((int)'T') << 8) /* tty ioctls */
                 return -ENOTTY;
@@ -780,19 +1027,19 @@ loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
         struct inode *inode = file->f_dentry->d_inode;
         struct ll_file_data *fd = file->private_data;
         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
-        struct lustre_handle lockh = {0, 0};
+        struct lustre_handle lockh = {0};
         loff_t retval;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),to=%llu\n", inode->i_ino,
+               inode->i_generation, inode,
+               offset + ((origin==2) ? inode->i_size : file->f_pos));
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
         if (origin == 2) { /* SEEK_END */
                 ldlm_error_t err;
                 struct ldlm_extent extent = {0, OBD_OBJECT_EOF};
                 err = ll_extent_lock(fd, inode, lsm, LCK_PR, &extent, &lockh);
-                if (err != ELDLM_OK && err != ELDLM_LOCK_MATCHED) {
-                        retval = -ENOLCK;
-                        RETURN(retval);
-                }
+                if (err != ELDLM_OK)
+                        RETURN(-ENOLCK);
 
                 offset += inode->i_size;
         } else if (origin == 1) { /* SEEK_CUR */
@@ -800,7 +1047,7 @@ loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
         }
 
         retval = -EINVAL;
-        if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
+        if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
                 if (offset != file->f_pos) {
                         file->f_pos = offset;
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
@@ -819,7 +1066,10 @@ loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
 int ll_fsync(struct file *file, struct dentry *dentry, int data)
 {
         int ret;
+        struct inode *inode = dentry->d_inode;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+               inode->i_generation, inode);
 
         /*
          * filemap_fdata{sync,wait} are also called at PW lock cancelation so
@@ -837,14 +1087,15 @@ int ll_fsync(struct file *file, struct dentry *dentry, int data)
 int ll_inode_revalidate(struct dentry *dentry)
 {
         struct inode *inode = dentry->d_inode;
-        struct lov_stripe_md *lsm;
+        struct lov_stripe_md *lsm = NULL;
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
         if (!inode) {
                 CERROR("REPORT THIS LINE TO PETER\n");
                 RETURN(0);
         }
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
+               inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
 
         /* this is very tricky.  it is unsafe to call ll_have_md_lock
            when we have a referenced lock: because it may cause an RPC
@@ -855,37 +1106,67 @@ int ll_inode_revalidate(struct dentry *dentry)
             !ll_have_md_lock(dentry)) {
                 struct ptlrpc_request *req = NULL;
                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
+                struct ll_fid fid;
                 struct mds_body *body;
+                struct lov_mds_md *lmm;
                 unsigned long valid = 0;
-                int datalen = 0, rc;
+                int eadatalen = 0, rc;
 
                 /* Why don't we update all valid MDS fields here, if we're
                  * doing an RPC anyways?  -phil */
                 if (S_ISREG(inode->i_mode)) {
-                        datalen = obd_size_wiremd(&sbi->ll_osc_conn, NULL);
+                        eadatalen = obd_size_diskmd(&sbi->ll_osc_conn, NULL);
                         valid |= OBD_MD_FLEASIZE;
                 }
-                rc = mdc_getattr(&sbi->ll_mdc_conn, inode->i_ino,
-                                 inode->i_mode, valid, datalen, &req);
+                ll_inode2fid(&fid, inode);
+                rc = mdc_getattr(&sbi->ll_mdc_conn, &fid,
+                                 valid, eadatalen, &req);
                 if (rc) {
                         CERROR("failure %d inode %lu\n", rc, inode->i_ino);
-                        ptlrpc_req_finished(req);
                         RETURN(-abs(rc));
                 }
 
-                body = lustre_msg_buf(req->rq_repmsg, 0);
+                body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
+                LASSERT (body != NULL);         /* checked by mdc_getattr() */
+                LASSERT_REPSWABBED (req, 0);    /* swabbed by mdc_getattr() */
 
                 if (S_ISREG(inode->i_mode) &&
-                    body->valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) {
+                    (body->valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS))) {
                         CERROR("MDS sent back size for regular file\n");
                         body->valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
                 }
 
-                if (body->valid & OBD_MD_FLEASIZE)
-                        ll_update_inode(inode, body,
-                                        lustre_msg_buf(req->rq_repmsg, 1));
-                else
-                        ll_update_inode(inode, body, NULL);
+                /* XXX Too paranoid? */
+                if ((body->valid ^ valid) & OBD_MD_FLEASIZE)
+                        CERROR("Asked for %s eadata but got %s\n",
+                               (valid & OBD_MD_FLEASIZE) ? "some" : "no",
+                               (body->valid & OBD_MD_FLEASIZE) ? "some":"none");
+
+                if (S_ISREG(inode->i_mode) &&
+                    (body->valid & OBD_MD_FLEASIZE)) {
+                        if (body->eadatasize == 0) { /* no EA data */
+                                CERROR("OBD_MD_FLEASIZE set but no data\n");
+                                RETURN(-EPROTO);
+                        }
+                        /* Only bother with this if inode's lsm not set? */
+                        lmm = lustre_msg_buf(req->rq_repmsg,1,body->eadatasize);
+                        LASSERT(lmm != NULL);       /* mdc_getattr() checked */
+                        LASSERT_REPSWABBED(req, 1); /* mdc_getattr() swabbed */
+
+                        rc = obd_unpackmd (&sbi->ll_osc_conn,
+                                           &lsm, lmm, body->eadatasize);
+                        if (rc < 0) {
+                                CERROR("Error %d unpacking eadata\n", rc);
+                                ptlrpc_req_finished(req);
+                                RETURN(rc);
+                        }
+                        LASSERT(rc >= sizeof (*lsm));
+                }
+
+                ll_update_inode(inode, body, lsm);
+                if (lsm != NULL && ll_i2info(inode)->lli_smd != lsm)
+                        obd_free_memmd(&sbi->ll_osc_conn, &lsm);
+
                 ptlrpc_req_finished(req);
         }
 
@@ -901,12 +1182,12 @@ int ll_inode_revalidate(struct dentry *dentry)
          */
         {
                 struct ldlm_extent extent = {0, OBD_OBJECT_EOF};
-                struct lustre_handle lockh = {0, 0};
+                struct lustre_handle lockh = {0};
                 ldlm_error_t err;
 
                 err = ll_extent_lock(NULL, inode, lsm, LCK_PR, &extent, &lockh);
-                if (err != ELDLM_OK && err != ELDLM_LOCK_MATCHED )
-                        RETURN(-abs(err)); /* XXX can't be right */
+                if (err != ELDLM_OK)
+                        RETURN(err);
 
                 ll_extent_unlock(NULL, inode, lsm, LCK_PR, &lockh);
         }
index 3a045f4..f88ed87 100644 (file)
@@ -24,6 +24,7 @@
  *  to force writeback.. the throttling in prepare_write and kupdate's usual
  *  writeback pressure got rid of our thread, but the file name remains.
  */
+
 #include <linux/version.h>
 #include <linux/config.h>
 #include <linux/module.h>
@@ -34,6 +35,9 @@
 #include <linux/kmod.h>
 #include <linux/pagemap.h>
 #include <linux/mm.h>
+#include <linux/rbtree.h>
+#include <linux/seq_file.h>
+#include <linux/time.h>
 
 /* PG_inactive_clean is shorthand for rmap, we want free_high/low here.. */
 #ifdef PG_inactive_clean
 
 extern spinlock_t inode_lock;
 
-#define LLWP_MAX_PAGES (PTL_MD_MAX_IOV)
 struct ll_writeback_pages {
-        unsigned        has_whole_pages:1,
-                        num_frags:2,
-                        num_pages:29;
-        struct brw_page pgs[LLWP_MAX_PAGES];
+        obd_count npgs, max;
+        struct brw_page *pga;
 };
 
-
-/*
- * ugh, we want disk allocation on the target to happen in offset order.  we'll
- * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do
- * fine for our small page arrays and doesn't require allocation.  its an
- * insertion sort that swaps elements that are strides apart, shrinking the
- * stride down until its '1' and the array is sorted.
- */
-void sort_brw_pages(struct brw_page *array, int num)
-{
-        int stride, i, j;
-        struct brw_page tmp;
-
-        if ( num == 1 )
-                return;
-
-        for( stride = 1; stride < num ; stride = (stride*3) +1  )
-                ;
-
-        do {
-                stride /= 3;
-                for ( i = stride ; i < num ; i++ ) {
-                        tmp = array[i];
-                        j = i;
-                        while ( j >= stride &&
-                                        array[j - stride].off > tmp.off ) {
-                                array[j] = array[j - stride];
-                                j -= stride;
-                        }
-                        array[j] = tmp;
-                }
-        } while ( stride > 1 );
-}
-
 /*
- * returns 0 if the page was inserted in the array because it was
- * within i_size.  if we raced with truncate and i_size was less
- * than the page we can unlock the page because truncate_inode_pages will
- * be waiting to cleanup the page
+ * check to see if we're racing with truncate and put the page in
+ * the brw_page array.  returns 0 if there is more room and 1
+ * if the array is full.
  */
 static int llwp_consume_page(struct ll_writeback_pages *llwp,
                              struct inode *inode, struct page *page)
@@ -107,31 +73,24 @@ static int llwp_consume_page(struct ll_writeback_pages *llwp,
 
         /* we raced with truncate? */
         if ( off >= inode->i_size ) {
+                ll_remove_dirty(inode, page->index, page->index);
                 unlock_page(page);
-                goto out;
+                return 0;
         }
 
         page_cache_get(page);
-        pg = &llwp->pgs[llwp->num_pages];
-        llwp->num_pages++;
+        pg = &llwp->pga[llwp->npgs];
+        llwp->npgs++;
+        LASSERT(llwp->npgs <= llwp->max);
 
         pg->pg = page;
         pg->off = off;
         pg->flag = OBD_BRW_CREATE;
-        pg->count = PAGE_SIZE;
+        pg->count = PAGE_CACHE_SIZE;
 
         /* catch partial writes for files that end mid-page */
-        if ( pg->off + pg->count > inode->i_size )
-                pg->count = inode->i_size & ~PAGE_MASK;
-
-        if ( pg->count == PAGE_SIZE ) {
-                if ( ! llwp->has_whole_pages ) {
-                        llwp->has_whole_pages = 1;
-                        llwp->num_frags++;
-                }
-        } else {
-                llwp->num_frags++;
-        }
+        if (pg->off + pg->count > inode->i_size)
+                pg->count = inode->i_size & ~PAGE_CACHE_MASK;
 
         /*
          * matches ptlrpc_bulk_get assert that trickles down
@@ -141,14 +100,10 @@ static int llwp_consume_page(struct ll_writeback_pages *llwp,
         LASSERT(pg->count >= 0);
 
         CDEBUG(D_CACHE, "brw_page %p: off "LPU64" cnt %d, page %p: ind %ld"
-                        " i_size: "LPU64"\n", pg, pg->off, pg->count, page, 
+                        " i_size: %llu\n", pg, pg->off, pg->count, page,
                         page->index, inode->i_size);
 
-        if ( llwp->num_frags == 3 || llwp->num_pages == LLWP_MAX_PAGES )
-                return -1;
-
-out:
-        return 0;
+        return llwp->npgs == llwp->max;
 }
 
 /*
@@ -165,7 +120,7 @@ static void ll_get_dirty_pages(struct inode *inode,
         struct list_head *pos, *n;
         ENTRY;
 
-        spin_lock(&pagecache_lock);
+        PGCACHE_WRLOCK(mapping);
 
         list_for_each_prev_safe(pos, n, &mapping->dirty_pages) {
                 page = list_entry(pos, struct page, list);
@@ -186,46 +141,51 @@ static void ll_get_dirty_pages(struct inode *inode,
                         break;
         }
 
-        spin_unlock(&pagecache_lock);
+        PGCACHE_WRUNLOCK(mapping);
         EXIT;
 }
 
-static void ll_brw_pages_unlock( struct inode *inode,
-                                 struct ll_writeback_pages *llwp)
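+/* send the pages collected in llwp as a single async bulk write, wait for
+ * the request set to complete, then clear the dirty records and unlock and
+ * release the pages */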
+static void ll_writeback(struct inode *inode, struct ll_writeback_pages *llwp)
 {
         int rc, i;
-        struct obd_brw_set *set;
+        struct ptlrpc_request_set *set;
         ENTRY;
 
-        sort_brw_pages(llwp->pgs, llwp->num_pages);
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),bytes=%u\n",
+               inode->i_ino, inode->i_generation, inode,
+               ((llwp->npgs-1) << PAGE_SHIFT) + llwp->pga[llwp->npgs-1].count);
 
-        set = obd_brw_set_new();
+        set = ptlrpc_prep_set();
         if (set == NULL) {
-                EXIT;
-                return;
+                CERROR ("Can't create request set\n");
+                rc = -ENOMEM;
+        } else {
+                rc = obd_brw_async(OBD_BRW_WRITE, ll_i2obdconn(inode),
+                                   ll_i2info(inode)->lli_smd, llwp->npgs,
+                                   llwp->pga, set, NULL);
+                if (rc == 0)
+                        rc = ptlrpc_set_wait (set);
+                ptlrpc_set_destroy (set);
         }
-        set->brw_callback = ll_brw_sync_wait;
-
-        rc = obd_brw(OBD_BRW_WRITE, ll_i2obdconn(inode),
-                     ll_i2info(inode)->lli_smd, llwp->num_pages, llwp->pgs,
-                     set, NULL);
+        /*
+         * b=1038, we need to pass _brw errors up so that writeback
+         * doesn't get stuck in recovery leaving processes stuck in
+         * D waiting for pages
+         */
         if (rc) {
-                CERROR("error from obd_brw: rc = %d\n", rc);
+                CERROR("error from obd_brw_async: rc = %d\n", rc);
+                INODE_IO_STAT_ADD(inode, wb_fail, llwp->npgs);
         } else {
-                rc = ll_brw_sync_wait(set, CB_PHASE_START);
-                if (rc)
-                        CERROR("error from callback: rc = %d\n", rc);
+                INODE_IO_STAT_ADD(inode, wb_ok, llwp->npgs);
         }
-        obd_brw_set_decref(set);
 
-        /* XXX this doesn't make sense to me */
-        rc = 0;
+        for (i = 0 ; i < llwp->npgs ; i++) {
+                struct page *page = llwp->pga[i].pg;
 
-        for ( i = 0 ; i < llwp->num_pages ; i++) {
-                struct page *page = llwp->pgs[i].pg;
-
-                CDEBUG(D_CACHE, "cleaning page %p\n", page);
+                CDEBUG(D_CACHE, "finished page %p at index %lu\n", page,
+                       page->index);
                 LASSERT(PageLocked(page));
+                ll_remove_dirty(inode, page->index, page->index);
                 unlock_page(page);
                 page_cache_release(page);
         }
@@ -233,10 +193,13 @@ static void ll_brw_pages_unlock( struct inode *inode,
         EXIT;
 }
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+
 #ifndef PG_inactive_clean
 #ifdef CONFIG_DISCONTIGMEM
 #error "sorry, we don't support DISCONTIGMEM yet"
 #endif
+
 /*
  * __alloc_pages marks a zone as needing balancing if an allocation is
  * performed when the zone has fewer free pages than its 'low' water
@@ -280,24 +243,35 @@ static int should_writeback(void)
         return 0;
 }
 
-int ll_check_dirty( struct super_block *sb)
+static int ll_alloc_brw(struct inode *inode, struct ll_writeback_pages *llwp)
+{
+        memset(llwp, 0, sizeof(struct ll_writeback_pages));
+
+        llwp->max = inode->i_blksize >> PAGE_CACHE_SHIFT;
+        if (llwp->max == 0) {
+                CERROR("forcing llwp->max to 1.  blksize: %lu\n",
+                       inode->i_blksize);
+                llwp->max = 1;
+        }
+        llwp->pga = kmalloc(llwp->max * sizeof(*llwp->pga), GFP_ATOMIC);
+        if (llwp->pga == NULL)
+                RETURN(-ENOMEM);
+        RETURN(0);
+}
+
+int ll_check_dirty(struct super_block *sb)
 {
         unsigned long old_flags; /* hack? */
         int making_progress;
-        struct ll_writeback_pages *llwp;
         struct inode *inode;
         int rc = 0;
         ENTRY;
 
-        if ( ! should_writeback() )
+        if (!should_writeback())
                 return 0;
 
         old_flags = current->flags;
         current->flags |= PF_MEMALLOC;
-        llwp = kmalloc(sizeof(struct ll_writeback_pages), GFP_ATOMIC);
-        if ( llwp == NULL )
-                GOTO(cleanup, rc = -ENOMEM);
-        memset(llwp, 0, offsetof(struct ll_writeback_pages, pgs));
 
         spin_lock(&inode_lock);
 
@@ -306,6 +280,7 @@ int ll_check_dirty( struct super_block *sb)
          * until the VM thinks we're ok again..
          */
         do {
+                struct ll_writeback_pages llwp;
                 struct list_head *pos;
                 inode = NULL;
                 making_progress = 0;
@@ -313,14 +288,14 @@ int ll_check_dirty( struct super_block *sb)
                 list_for_each_prev(pos, &sb->s_dirty) {
                         inode = list_entry(pos, struct inode, i_list);
 
-                        if ( ! (inode->i_state & I_DIRTY_PAGES) ) {
+                        if (!(inode->i_state & I_DIRTY_PAGES)) {
                                 inode = NULL;
                                 continue;
                         }
                         break;
                 }
 
-                if ( inode == NULL )
+                if (inode == NULL)
                         break;
 
                 /* duplicate __sync_one, *sigh* */
@@ -331,19 +306,25 @@ int ll_check_dirty( struct super_block *sb)
 
                 spin_unlock(&inode_lock);
 
-                do { 
-                        memset(llwp, 0, sizeof(*llwp));
-                        ll_get_dirty_pages(inode, llwp);
-                        if ( llwp->num_pages ) {
-                                ll_brw_pages_unlock(inode, llwp);
-                                rc += llwp->num_pages;
+                rc = ll_alloc_brw(inode, &llwp);
+                if (rc != 0)
+                        GOTO(cleanup, rc);
+
+                do {
+                        llwp.npgs = 0;
+                        ll_get_dirty_pages(inode, &llwp);
+                        if (llwp.npgs) {
+                                INODE_IO_STAT_ADD(inode, wb_from_pressure,
+                                                  llwp.npgs);
+                                ll_writeback(inode, &llwp);
+                                rc += llwp.npgs;
                                 making_progress = 1;
                         }
-                } while (llwp->num_pages && should_writeback() );
+                } while (llwp.npgs && should_writeback());
 
                 spin_lock(&inode_lock);
 
-                if ( ! list_empty(&inode->i_mapping->dirty_pages) )
+                if (!list_empty(&inode->i_mapping->dirty_pages))
                         inode->i_state |= I_DIRTY_PAGES;
 
                 inode->i_state &= ~I_LOCK;
@@ -356,19 +337,19 @@ int ll_check_dirty( struct super_block *sb)
                         list_add(&inode->i_list, &inode->i_sb->s_dirty);
                 }
                 wake_up(&inode->i_wait);
-
-        } while ( making_progress && should_writeback() );
+                kfree(llwp.pga);
+        } while (making_progress && should_writeback());
 
         /*
          * and if that didn't work, we sleep on any data that might
          * be under writeback..
          */
-        while ( should_writeback() ) {
-                if ( list_empty(&sb->s_locked_inodes) )  
+        while (should_writeback()) {
+                if (list_empty(&sb->s_locked_inodes))
                         break;
 
-                inode = list_entry(sb->s_locked_inodes.next, struct inode, 
-                                i_list);
+                inode = list_entry(sb->s_locked_inodes.next, struct inode,
+                                   i_list);
 
                 atomic_inc(&inode->i_count); /* XXX hack? */
                 spin_unlock(&inode_lock);
@@ -380,36 +361,339 @@ int ll_check_dirty( struct super_block *sb)
         spin_unlock(&inode_lock);
 
 cleanup:
-        if ( llwp != NULL )
-                kfree(llwp);
         current->flags = old_flags;
 
         RETURN(rc);
 }
+#endif /* linux 2.5 */
 
-int ll_batch_writepage( struct inode *inode, struct page *page )
+int ll_batch_writepage(struct inode *inode, struct page *page)
 {
         unsigned long old_flags; /* hack? */
-        struct ll_writeback_pages *llwp;
+        struct ll_writeback_pages llwp;
         int rc = 0;
         ENTRY;
 
         old_flags = current->flags;
         current->flags |= PF_MEMALLOC;
-        llwp = kmalloc(sizeof(struct ll_writeback_pages), GFP_ATOMIC);
-        if ( llwp == NULL )
-                GOTO(cleanup, rc = -ENOMEM);
-        memset(llwp, 0, offsetof(struct ll_writeback_pages, pgs));
+        rc = ll_alloc_brw(inode, &llwp);
+        if (rc != 0)
+                GOTO(cleanup, rc);
 
-        llwp_consume_page(llwp, inode, page);
+        if (llwp_consume_page(&llwp, inode, page) == 0)
+                ll_get_dirty_pages(inode, &llwp);
 
-        ll_get_dirty_pages(inode, llwp);
-        if ( llwp->num_pages )
-                ll_brw_pages_unlock(inode, llwp);
+        if (llwp.npgs) {
+                INODE_IO_STAT_ADD(inode, wb_from_writepage, llwp.npgs);
+                ll_writeback(inode, &llwp);
+        }
 
+        kfree(llwp.pga);
 cleanup:
-        if ( llwp != NULL )
-                kfree(llwp);
         current->flags = old_flags;
         RETURN(rc);
 }
+
+/*
+ * we aggressively track offsets of pages that have been dirtied.  we need this
+ * to make file size decisions around lock acquisition and cancelation.  all
+ * extents include the offsets at their endpoints.
+ */
+struct offset_extent {
+        rb_node_t       oe_node;
+        unsigned long   oe_start, oe_end;
+};
+
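+/* walk the rbtree looking for a recorded extent that overlaps the needle;
+ * returns the overlapping extent or NULL if nothing intersects it */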
+static struct offset_extent *ll_find_oe(rb_root_t *root,
+                                        struct offset_extent *needle)
+{
+        struct rb_node_s *node = root->rb_node;
+        struct offset_extent *oe;
+        ENTRY;
+
+        CDEBUG(D_INODE, "searching [%lu -> %lu]\n", needle->oe_start,
+               needle->oe_end);
+
+        while (node) {
+                oe = rb_entry(node, struct offset_extent, oe_node);
+                if (needle->oe_end < oe->oe_start)
+                        node = node->rb_left;
+                else if (needle->oe_start > oe->oe_end)
+                        node = node->rb_right;
+                else {
+                        CDEBUG(D_INODE, "returning [%lu -> %lu]\n",
+                               oe->oe_start, oe->oe_end);
+                        RETURN(oe);
+                }
+        }
+        RETURN(NULL);
+}
+
+/* do the rbtree mechanics to insert a node, callers are responsible
+ * for making sure that this new node doesn't overlap with existing
+ * nodes */
+static void ll_insert_oe(rb_root_t *root, struct offset_extent *new_oe)
+{
+        rb_node_t ** p = &root->rb_node;
+        rb_node_t * parent = NULL;
+        struct offset_extent *oe;
+        ENTRY;
+
+        LASSERT(new_oe->oe_start <= new_oe->oe_end);
+
+        while (*p) {
+                parent = *p;
+                oe = rb_entry(parent, struct offset_extent, oe_node);
+                if ( new_oe->oe_end < oe->oe_start )
+                        p = &(*p)->rb_left;
+                else if ( new_oe->oe_start > oe->oe_end )
+                        p = &(*p)->rb_right;
+                else
+                        LBUG();
+        }
+        rb_link_node(&new_oe->oe_node, parent, p);
+        rb_insert_color(&new_oe->oe_node, root);
+        EXIT;
+}
+
+static inline void lldo_dirty_add(struct inode *inode,
+                                  struct ll_dirty_offsets *lldo,
+                                  long val)
+{
+        lldo->do_num_dirty += val;
+        INODE_IO_STAT_ADD(inode, dirty_pages, val);
+}
+
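+/* record that a single page offset is now dirty: extend an adjacent extent
+ * by one page, insert a new one-page extent, or do nothing if the offset is
+ * already covered */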
+void ll_record_dirty(struct inode *inode, unsigned long offset)
+{
+        struct ll_dirty_offsets *lldo = &ll_i2info(inode)->lli_dirty;
+        struct offset_extent needle, *oe, *new_oe;
+        int rc;
+        ENTRY;
+
+        /* will allocate more intelligently later */
+        OBD_ALLOC(new_oe, sizeof(*new_oe));
+        LASSERT(new_oe); /* will have to do for now :/ */
+
+        spin_lock(&lldo->do_lock);
+
+        /* find neighbours that we might glom on to */
+        needle.oe_start = (offset > 0) ? offset - 1 : offset;
+        needle.oe_end = (offset < ~0) ? offset + 1 : offset;
+        oe = ll_find_oe(&lldo->do_root, &needle);
+        if ( oe == NULL ) {
+                new_oe->oe_start = offset;
+                new_oe->oe_end = offset;
+                ll_insert_oe(&lldo->do_root, new_oe);
+                lldo_dirty_add(inode, lldo, 1);
+                new_oe = NULL;
+                GOTO(out, rc = 1);
+        }
+
+        /* already recorded */
+        if ( offset >= oe->oe_start && offset <= oe->oe_end )
+                GOTO(out, rc = 2);
+
+        /* ok, need to check for adjacent neighbours */
+        needle.oe_start = offset;
+        needle.oe_end = offset;
+        if (ll_find_oe(&lldo->do_root, &needle))
+                GOTO(out, rc = 3);
+
+        /* ok, it's safe to extend the oe we found */
+        if ( offset == oe->oe_start - 1 )
+                oe->oe_start--;
+        else if ( offset == oe->oe_end + 1 )
+                oe->oe_end++;
+        else
+                LBUG();
+        lldo_dirty_add(inode, lldo, 1);
+
+out:
+        CDEBUG(D_INODE, "%lu now dirty\n", lldo->do_num_dirty);
+        spin_unlock(&lldo->do_lock);
+        if ( new_oe )
+                OBD_FREE(new_oe, sizeof(*new_oe));
+        EXIT;
+        return;
+}
+
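+/* forget that [start, end] is dirty: extents wholly inside the range are
+ * freed, extents overlapping an edge are trimmed, and an extent spanning the
+ * whole range is split in two around it */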
+void ll_remove_dirty(struct inode *inode, unsigned long start,
+                     unsigned long end)
+{
+        struct ll_dirty_offsets *lldo = &ll_i2info(inode)->lli_dirty;
+        struct offset_extent needle, *oe, *new_oe;
+        ENTRY;
+
+        /* will allocate more intelligently later */
+        OBD_ALLOC(new_oe, sizeof(*new_oe));
+        LASSERT(new_oe); /* will have to do for now :/ */
+
+        needle.oe_start = start;
+        needle.oe_end = end;
+
+        spin_lock(&lldo->do_lock);
+        for ( ; (oe = ll_find_oe(&lldo->do_root, &needle)) ; ) {
+
+                /* see if we're punching a hole and need to create a node */
+                if (oe->oe_start < start && oe->oe_end > end) {
+                        new_oe->oe_start = end + 1;
+                        new_oe->oe_end = oe->oe_end;
+                        oe->oe_end = start - 1;
+                        ll_insert_oe(&lldo->do_root, new_oe);
+                        new_oe = NULL;
+                        lldo_dirty_add(inode, lldo, -(end - start + 1));
+                        break;
+                }
+
+                /* overlapping edges */
+                if (oe->oe_start < start && oe->oe_end <= end) {
+                        lldo_dirty_add(inode, lldo, -(oe->oe_end - start + 1));
+                        oe->oe_end = start - 1;
+                        oe = NULL;
+                        continue;
+                }
+                if (oe->oe_end > end && oe->oe_start >= start) {
+                        lldo_dirty_add(inode, lldo, -(end - oe->oe_start + 1));
+                        oe->oe_start = end + 1;
+                        oe = NULL;
+                        continue;
+                }
+
+                /* an extent entirely within the one we're clearing */
+                rb_erase(&oe->oe_node, &lldo->do_root);
+                lldo_dirty_add(inode, lldo, -(oe->oe_end - oe->oe_start + 1));
+                spin_unlock(&lldo->do_lock);
+                OBD_FREE(oe, sizeof(*oe));
+                spin_lock(&lldo->do_lock);
+        }
+        CDEBUG(D_INODE, "%lu now dirty\n", lldo->do_num_dirty);
+        spin_unlock(&lldo->do_lock);
+        if (new_oe)
+                OBD_FREE(new_oe, sizeof(*new_oe));
+        EXIT;
+}
+
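+/* look up a dirty extent overlapping [*start, *end]; on success *start and
+ * *end are set to that extent's bounds and 0 is returned, else -ENOENT */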
+int ll_find_dirty(struct ll_dirty_offsets *lldo, unsigned long *start,
+                  unsigned long *end)
+{
+        struct offset_extent needle, *oe;
+        int rc = -ENOENT;
+        ENTRY;
+
+        needle.oe_start = *start;
+        needle.oe_end = *end;
+
+        spin_lock(&lldo->do_lock);
+        oe = ll_find_oe(&lldo->do_root, &needle);
+        if (oe) {
+                *start = oe->oe_start;
+                *end = oe->oe_end;
+                rc = 0;
+        }
+        spin_unlock(&lldo->do_lock);
+
+        RETURN(rc);
+}
+
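+/* walk to the right-most node of the tree for the highest dirtied page
+ * offset; returns 0 and sets *farthest, or -1 if nothing is dirty */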
+int ll_farthest_dirty(struct ll_dirty_offsets *lldo, unsigned long *farthest)
+{
+        struct rb_node_s *last, *node;
+        struct offset_extent *oe;
+        int rc = -1;
+        ENTRY;
+
+        spin_lock(&lldo->do_lock);
+        for (node = lldo->do_root.rb_node, last = NULL;
+             node;
+             last = node, node = node->rb_right)
+                ;
+
+        if (last) {
+                oe = rb_entry(last, struct offset_extent, oe_node);
+                *farthest = oe->oe_end;
+                rc = 0;
+        }
+        spin_unlock(&lldo->do_lock);
+        RETURN(rc);
+}
+
+void ll_lldo_init(struct ll_dirty_offsets *lldo)
+{
+        spin_lock_init(&lldo->do_lock);
+        lldo->do_num_dirty = 0;
+        lldo->do_root.rb_node = NULL;
+}
+
+/* seq file export of some page cache tracking stats */
+static int ll_pgcache_seq_show(struct seq_file *seq, void *v)
+{
+        struct timeval now;
+        struct ll_sb_info *sbi = seq->private;
+        do_gettimeofday(&now);
+
+        seq_printf(seq, "snapshot_time:            %lu:%lu (secs:usecs)\n",
+                   now.tv_sec, now.tv_usec);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        seq_printf(seq, "VM_under_pressure:        %s\n",
+                   should_writeback() ? "yes" : "no");
+#endif        
+        seq_printf(seq, "dirty_pages:              "LPU64"\n",
+                   sbi->ll_iostats.fis_dirty_pages);
+        seq_printf(seq, "dirty_page_hits:          "LPU64"\n",
+                   sbi->ll_iostats.fis_dirty_hits);
+        seq_printf(seq, "dirty_page_misses:        "LPU64"\n",
+                   sbi->ll_iostats.fis_dirty_misses);
+        seq_printf(seq, "writeback_from_writepage: "LPU64"\n",
+                   sbi->ll_iostats.fis_wb_from_writepage);
+        seq_printf(seq, "writeback_from_pressure:  "LPU64"\n",
+                   sbi->ll_iostats.fis_wb_from_pressure);
+        seq_printf(seq, "writeback_ok_pages:       "LPU64"\n",
+                   sbi->ll_iostats.fis_wb_ok);
+        seq_printf(seq, "writeback_failed_pages:   "LPU64"\n",
+                   sbi->ll_iostats.fis_wb_fail);
+        return 0;
+}
+
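+/* trivial single-record seq_file iterator: only position 0 produces output */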
+static void *ll_pgcache_seq_start(struct seq_file *p, loff_t *pos)
+{
+        if (*pos == 0)
+                return (void *)1;
+        return NULL;
+}
+static void *ll_pgcache_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+        ++*pos;
+        return NULL;
+}
+static void ll_pgcache_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+struct seq_operations ll_pgcache_seq_sops = {
+        .start = ll_pgcache_seq_start,
+        .stop = ll_pgcache_seq_stop,
+        .next = ll_pgcache_seq_next,
+        .show = ll_pgcache_seq_show,
+};
+
+static int ll_pgcache_seq_open(struct inode *inode, struct file *file)
+{
+        struct proc_dir_entry *dp = inode->u.generic_ip;
+        struct seq_file *seq;
+        int rc;
+
+        rc = seq_open(file, &ll_pgcache_seq_sops);
+        if (rc)
+                return rc;
+        seq = file->private_data;
+        seq->private = dp->data;
+        return 0;
+}
+
+struct file_operations ll_pgcache_seq_fops = {
+        .open    = ll_pgcache_seq_open,
+        .read    = seq_read,
+        .llseek  = seq_lseek,
+        .release = seq_release,
+};
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
new file mode 100644 (file)
index 0000000..e53b605
--- /dev/null
@@ -0,0 +1,2 @@
+int ll_mdc_cancel_unused(struct lustre_handle *conn, struct inode *inode,
+                         int flags, void *opaque);
index b5e6620..59cec1f 100644 (file)
@@ -106,6 +106,7 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
         struct lprocfs_vars lvars[2];
         struct ll_sb_info *sbi = ll_s2sbi(sb);
         struct obd_device *obd;
+        struct proc_dir_entry *entry;
         char name[MAX_STRING_SIZE + 1];
         struct obd_uuid uuid;
         int err;
@@ -135,6 +136,13 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
         if (err)
                 RETURN(err);
 
+        /* llite page cache stats */
+        entry = create_proc_entry("pgcache", 0444, sbi->ll_proc_root);
+        if (entry == NULL)
+                RETURN(-ENOMEM);
+        entry->proc_fops = &ll_pgcache_seq_fops;
+        entry->data = sbi;
+
         /* MDC info */
         strncpy(uuid.uuid, mdc, sizeof(uuid.uuid));
         obd = class_uuid2obd(&uuid);
index 449cac7..5e37d55 100644 (file)
@@ -98,12 +98,17 @@ static int ll_test_inode(struct inode *inode, void *opaque)
         struct ll_read_inode2_cookie *lic = opaque;
         struct mds_body *body = lic->lic_body;
 
+        if (!(lic->lic_body->valid & (OBD_MD_FLGENER | OBD_MD_FLID)))
+                CERROR("invalid generation\n");
+        CDEBUG(D_VFSTRACE, "comparing inode %p ino %lu/%u to body %lu/%u\n",
+               inode, inode->i_ino, inode->i_generation, ino,
+               lic->lic_body->generation);
+
         if (inode->i_generation != lic->lic_body->generation)
                 return 0;
 
         /* Apply the attributes in 'opaque' to this inode */
-        ll_update_inode(inode, body, lic->lic_lmm);
-
+        ll_update_inode(inode, body, lic->lic_lsm);
         return 1;
 }
 
@@ -118,6 +123,9 @@ int ll_unlock(__u32 mode, struct lustre_handle *lockh)
         RETURN(0);
 }
 
+/* Get an inode by inode number (already instantiated by the intent lookup).
+ * Returns inode or NULL
+ */
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
 extern int ll_read_inode2(struct inode *inode, void *opaque);
 struct inode *ll_iget(struct super_block *sb, ino_t hash,
@@ -127,9 +135,8 @@ struct inode *ll_iget(struct super_block *sb, ino_t hash,
 
         LASSERT(hash != 0);
         inode = iget5_locked(sb, hash, ll_test_inode, ll_read_inode2, lic);
-
-        if (!inode)
-                return ERR_PTR(-ENOMEM);
+        if (inode == NULL)
+                return NULL;              /* removed ERR_PTR(-ENOMEM) -eeb */
 
         if (inode->i_state & I_NEW)
                 unlock_new_inode(inode);
@@ -144,6 +151,8 @@ struct inode *ll_iget(struct super_block *sb, ino_t hash,
         struct inode *inode;
         LASSERT(hash != 0);
         inode = iget4(sb, hash, ll_find_inode, lic);
+        CDEBUG(D_VFSTRACE, "inode: %lu/%u(%p)\n", inode->i_ino,
+               inode->i_generation, inode);
         return inode;
 }
 #endif
@@ -186,18 +195,112 @@ int ll_it_open_error(int phase, struct lookup_intent *it)
         return 0;
 }
 
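+/* ldlm callback for MDS locks: cancel the lock when it blocks another
+ * request, and on cancelation drop cached directory pages and unhash the
+ * inode's dentry aliases */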
+int ll_mdc_blocking_ast(struct ldlm_lock *lock,
+                        struct ldlm_lock_desc *desc,
+                        void *data, int flag)
+{
+        int rc;
+        struct lustre_handle lockh;
+        ENTRY;
+
+        switch (flag) {
+        case LDLM_CB_BLOCKING:
+                ldlm_lock2handle(lock, &lockh);
+                rc = ldlm_cli_cancel(&lockh);
+                if (rc < 0) {
+                        CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
+                        RETURN(rc);
+                }
+                break;
+        case LDLM_CB_CANCELING: {
+                /* Invalidate all dentries associated with this inode */
+                struct inode *inode = lock->l_data;
+                LASSERT(inode != NULL);
+
+                //if (inode->i_state & I_FREEING)
+                //        break;
+
+                if (S_ISDIR(inode->i_mode)) {
+                        CDEBUG(D_INODE, "invalidating inode %lu\n",
+                               inode->i_ino);
+
+                        ll_invalidate_inode_pages(inode);
+                }
+
+                if (inode->i_sb->s_root &&
+                    inode != inode->i_sb->s_root->d_inode)
+                        d_unhash_aliases(inode);
+                break;
+        }
+        default:
+                LBUG();
+        }
+
+        RETURN(0);
+}
+
+void ll_mdc_lock_set_inode(struct lustre_handle *lockh, struct inode *inode)
+{
+        struct ldlm_lock *lock = ldlm_handle2lock(lockh);
+        ENTRY;
+
+        LASSERT(lock != NULL);
+        lock->l_data = inode;
+        LDLM_LOCK_PUT(lock);
+        EXIT;
+}
+
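+/* drop all unused MDS locks held on this inode's resource, which is named
+ * by the inode number and generation */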
+int ll_mdc_cancel_unused(struct lustre_handle *conn, struct inode *inode,
+                         int flags, void *opaque)
+{
+        struct ldlm_res_id res_id =
+                { .name = {inode->i_ino, inode->i_generation} };
+        struct obd_device *obddev = class_conn2obd(conn);
+        ENTRY;
+        RETURN(ldlm_cli_cancel_unused(obddev->obd_namespace, &res_id, flags,
+                                      opaque));
+}
+
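+/* pack the identifying fields of the parent (and optional second) inode,
+ * plus the name being operated on, into the mdc_op_data handed to the MDC */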
+void ll_prepare_mdc_op_data(struct mdc_op_data *data,
+                            struct inode *i1,
+                            struct inode *i2,
+                            const char *name,
+                            int namelen,
+                            int mode)
+{
+        LASSERT(i1);
+
+        data->ino1 = i1->i_ino;
+        data->gen1 = i1->i_generation;
+        data->typ1 = i1->i_mode & S_IFMT;
+        data->gid1 = i1->i_gid;
+
+        if (i2) {
+                data->ino2 = i2->i_ino;
+                data->gen2 = i2->i_generation;
+                data->typ2 = i2->i_mode & S_IFMT;
+                data->gid2 = i2->i_gid;
+        } else {
+                data->ino2 = 0;
+        }
+
+        data->name = name;
+        data->namelen = namelen;
+        data->mode = mode;
+}
+
 #define IT_ENQ_COMPLETE (1<<16)
 
 int ll_intent_lock(struct inode *parent, struct dentry **de,
                    struct lookup_intent *it, intent_finish_cb intent_finish)
 {
         struct dentry *dentry = *de;
+        struct inode *inode = dentry->d_inode;
         struct ll_sb_info *sbi = ll_i2sbi(parent);
         struct lustre_handle lockh;
         struct lookup_intent lookup_it = { .it_op = IT_LOOKUP };
         struct ptlrpc_request *request = NULL;
-        char *data = NULL;
-        int rc = 0, datalen = 0, offset, flag = 0;
+        int rc = 0, offset, flag = 0;
         obd_id ino = 0;
         ENTRY;
 
@@ -208,17 +311,23 @@ int ll_intent_lock(struct inode *parent, struct dentry **de,
         if (it == NULL)
                 it = &lookup_it;
 
-        CDEBUG(D_INFO, "name: %*s, intent: %s\n", dentry->d_name.len,
+        CDEBUG(D_DLMTRACE, "name: %*s, intent: %s\n", dentry->d_name.len,
                dentry->d_name.name, ldlm_it2str(it->it_op));
 
         if (dentry->d_name.len > EXT2_NAME_LEN)
                 RETURN(-ENAMETOOLONG);
 
         if (!(it->it_disposition & IT_ENQ_COMPLETE)) {
+                struct mdc_op_data op_data;
+
+                ll_prepare_mdc_op_data(&op_data, parent, dentry->d_inode,
+                                       dentry->d_name.name, dentry->d_name.len,
+                                       0);
+
                 rc = mdc_enqueue(&sbi->ll_mdc_conn, LDLM_PLAIN, it,
-                                 ll_intent_to_lock_mode(it), parent, dentry,
-                                 &lockh, data, datalen, parent,
-                                 sizeof(*parent));
+                                 ll_intent_to_lock_mode(it), &op_data,
+                                 &lockh, NULL, 0, ldlm_completion_ast,
+                                 ll_mdc_blocking_ast, parent);
                 if (rc < 0)
                         RETURN(rc);
                 memcpy(it->it_lock_handle, &lockh, sizeof(lockh));
@@ -256,14 +365,17 @@ int ll_intent_lock(struct inode *parent, struct dentry **de,
                  */
 
                 offset = 1;
-                mds_body = lustre_msg_buf(request->rq_repmsg, offset);
+                mds_body = lustre_msg_buf(request->rq_repmsg, offset,
+                                          sizeof(*mds_body));
+                LASSERT (mds_body != NULL);           /* mdc_enqueue checked */
+                LASSERT_REPSWABBED (request, offset); /* mdc_enqueue swabbed */
+
                 ino = mds_body->fid1.id;
                 mode = mds_body->mode;
 
                 /* We were called from revalidate2: did we find the same inode? */
-                if ((*de)->d_inode &&
-                    (ino != (*de)->d_inode->i_ino || 
-                   mds_body->fid1.generation != (*de)->d_inode->i_generation)) {
+                if (inode && (ino != inode->i_ino ||
+                   mds_body->fid1.generation != inode->i_generation)) {
                         it->it_disposition |= IT_ENQ_COMPLETE;
                         RETURN(-ESTALE);
                 }
@@ -273,8 +385,13 @@ int ll_intent_lock(struct inode *parent, struct dentry **de,
                  * this request for unconditional replay. */
                 if (it->it_op & IT_OPEN &&
                     (!(it->it_disposition & IT_OPEN_OPEN) ||
-                     it->it_status != 0))
-                        request->rq_flags &= ~PTL_RPC_FL_REPLAY;
+                     it->it_status != 0)) {
+                        unsigned long flags;
+
+                        spin_lock_irqsave (&request->rq_lock, flags);
+                        request->rq_replay = 0;
+                        spin_unlock_irqrestore (&request->rq_lock, flags);
+                }
 
                 if (it->it_op & IT_CREAT) {
                         mdc_store_inode_generation(request, 2, 1);
@@ -329,7 +446,9 @@ int ll_intent_lock(struct inode *parent, struct dentry **de,
                 } else
                         LBUG();
         } else {
+                struct ll_fid fid;
                 obd_flag valid;
+                int eadatalen;
                 int mode;
 
                 LBUG(); /* For the moment, no non-intent locks */
@@ -351,32 +470,44 @@ int ll_intent_lock(struct inode *parent, struct dentry **de,
                 valid = OBD_MD_FLNOTOBD;
 
                 if (S_ISREG(mode)) {
-                        datalen = obd_size_wiremd(&sbi->ll_osc_conn, NULL),
+                        eadatalen = obd_size_diskmd(&sbi->ll_osc_conn, NULL),
                         valid |= OBD_MD_FLEASIZE;
                 } else {
+                        eadatalen = 0;
                         valid |= OBD_MD_FLBLOCKS;
                 }
 
-                rc = mdc_getattr(&sbi->ll_mdc_conn, ino, mode, valid,
-                                 datalen, &request);
+                fid.id = ino;
+                fid.generation = 0;
+                fid.f_type = mode;
+                rc = mdc_getattr(&sbi->ll_mdc_conn, &fid, valid,
+                                 eadatalen, &request);
                 if (rc) {
                         CERROR("failure %d inode "LPX64"\n", rc, ino);
-                        GOTO(drop_req, rc = -abs(rc));
+                        GOTO(drop_lock, rc = -abs(rc));
                 }
         }
 
+        LASSERT (request != NULL);
+
         if (intent_finish != NULL) {
-                rc = intent_finish(flag, request, de, it, offset, ino);
+                rc = intent_finish(flag, request, parent, de, it, offset, ino);
                 dentry = *de; /* intent_finish may change *de */
-        } else {
-                ptlrpc_req_finished(request);
+                inode = dentry->d_inode;
+                if (rc != 0)
+                        GOTO(drop_lock, rc);
         }
+        ptlrpc_req_finished(request);
 
         /* This places the intent in the dentry so that the vfs_xxx
          * operation can lay its hands on it; but that is not always
          * needed...  (we need to save it in the GETATTR case for the
          * benefit of ll_inode_revalidate -phil) */
-        if (it->it_op & (IT_OPEN | IT_GETATTR))
+        /* Ignore trying to save the intent for "special" inodes as
+         * they have special semantics that can cause deadlocks on
+         * the intent semaphore. -mmex */
+        if ((!inode || S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
+             S_ISLNK(inode->i_mode)) && (it->it_op & (IT_OPEN | IT_GETATTR)))
                 LL_SAVE_INTENT(dentry, it);
         else
                 CDEBUG(D_DENTRY,
@@ -389,10 +520,10 @@ int ll_intent_lock(struct inode *parent, struct dentry **de,
 
         RETURN(rc);
 
+ drop_lock:
+        ll_intent_release(dentry, it);
  drop_req:
         ptlrpc_req_finished(request);
- drop_lock:
-#warning FIXME: must release lock here
         RETURN(rc);
 }
 
@@ -440,32 +571,87 @@ struct dentry *ll_find_alias(struct inode *inode, struct dentry *de)
 }
 
 static int
-lookup2_finish(int flag, struct ptlrpc_request *request, struct dentry **de,
+lookup2_finish(int flag, struct ptlrpc_request *request,
+               struct inode *parent, struct dentry **de,
                struct lookup_intent *it, int offset, obd_id ino)
 {
+        struct ll_sb_info *sbi = ll_i2sbi(parent);
         struct dentry *dentry = *de, *saved = *de;
         struct inode *inode = NULL;
-        struct ll_read_inode2_cookie lic = {.lic_body = NULL, .lic_lmm = NULL};
+        struct ll_read_inode2_cookie lic = {.lic_body = NULL, .lic_lsm = NULL};
+
+        /* NB 1 request reference will be taken away by ll_intent_lock()
+         * when I return */
 
         if (!(flag & LL_LOOKUP_NEGATIVE)) {
                 ENTRY;
-                lic.lic_body = lustre_msg_buf(request->rq_repmsg, offset);
+
+                /* We only get called if the mdc_enqueue() called from
+                 * ll_intent_lock() was successful.  Therefore the mds_body
+                 * is present and correct, and the eadata is present if
+                 * body->eadatasize != 0 (but still opaque, so only
+                 * obd_unpackmd() can check the size) */
+                lic.lic_body = lustre_msg_buf(request->rq_repmsg, offset,
+                                              sizeof (*lic.lic_body));
+                LASSERT(lic.lic_body != NULL);
+                LASSERT_REPSWABBED(request, offset);
 
                 if (S_ISREG(lic.lic_body->mode) &&
-                    lic.lic_body->valid & OBD_MD_FLEASIZE) {
-                        LASSERT(request->rq_repmsg->bufcount > offset);
-                        lic.lic_lmm = lustre_msg_buf(request->rq_repmsg,
-                                                     offset + 1);
-                } else {
-                        lic.lic_lmm = NULL;
+                    (lic.lic_body->valid & OBD_MD_FLEASIZE)) {
+                        struct lov_mds_md    *lmm;
+                        int                   lmm_size;
+                        int                   rc;
+
+                        lmm_size = lic.lic_body->eadatasize;
+                        if (lmm_size == 0) {
+                                CERROR("OBD_MD_FLEASIZE set but "
+                                       "eadatasize 0\n");
+                                RETURN(-EPROTO);
+                        }
+                        lmm = lustre_msg_buf(request->rq_repmsg, offset + 1,
+                                             lmm_size);
+                        LASSERT(lmm != NULL);
+                        LASSERT_REPSWABBED(request, offset + 1);
+
+                        rc = obd_unpackmd(&sbi->ll_osc_conn,
+                                          &lic.lic_lsm, lmm, lmm_size);
+                        if (rc < 0) {
+                                CERROR("Error %d unpacking eadata\n", rc);
+                                RETURN(rc);
+                        }
+                        LASSERT(rc >= sizeof(*lic.lic_lsm));
                 }
 
-                /* No rpc's happen during iget4, -ENOMEM's are possible */
+                /* Both ENOMEM and an RPC timeout are possible in ll_iget; which
+                 * to pick?  A more generic EIO?  -phik */
                 inode = ll_iget(dentry->d_sb, ino, &lic);
                 if (!inode) {
-                        /* XXX make sure that request is freed in this case;
-                         * I think it is, but double-check refcounts. -phil */
+                        /* free the lsm if we allocated one above */
+                        if (lic.lic_lsm != NULL)
+                                obd_free_memmd(&sbi->ll_osc_conn, &lic.lic_lsm);
                         RETURN(-ENOMEM);
+                } else if (lic.lic_lsm != NULL &&
+                           ll_i2info(inode)->lli_smd != lic.lic_lsm) {
+                        obd_free_memmd(&sbi->ll_osc_conn, &lic.lic_lsm);
+                }
+
+                /* If this is a stat, get the authoritative file size */
+                if (it->it_op == IT_GETATTR && S_ISREG(inode->i_mode) &&
+                    ll_i2info(inode)->lli_smd != NULL) {
+                        struct ldlm_extent extent = {0, OBD_OBJECT_EOF};
+                        struct lustre_handle lockh = {0};
+                        struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
+                        ldlm_error_t rc;
+
+                        LASSERT(lsm->lsm_object_id != 0);
+
+                        rc = ll_extent_lock(NULL, inode, lsm, LCK_PR, &extent,
+                                            &lockh);
+                        if (rc != ELDLM_OK) {
+                                iput(inode);
+                                RETURN(-EIO);
+                        }
+                        ll_extent_unlock(NULL, inode, lsm, LCK_PR, &lockh);
                 }
 
                 dentry = *de = ll_find_alias(inode, dentry);
@@ -473,14 +659,12 @@ lookup2_finish(int flag, struct ptlrpc_request *request, struct dentry **de,
                 /* We asked for a lock on the directory, and may have been
                  * granted a lock on the inode.  Just in case, fixup the data
                  * pointer. */
-                mdc_lock_set_inode((struct lustre_handle *)it->it_lock_handle,
-                                   inode);
+                ll_mdc_lock_set_inode((struct lustre_handle*)it->it_lock_handle,
+                                      inode);
         } else {
                 ENTRY;
         }
 
-        ptlrpc_req_finished(request);
-
         dentry->d_op = &ll_d_ops;
         ll_set_dd(dentry);
 
@@ -493,21 +677,26 @@ lookup2_finish(int flag, struct ptlrpc_request *request, struct dentry **de,
 static struct dentry *ll_lookup2(struct inode *parent, struct dentry *dentry,
                                  struct lookup_intent *it)
 {
-        struct dentry *save = dentry;
+        struct dentry *save = dentry, *retval;
         int rc;
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n",
+               dentry->d_name.name, parent->i_ino, parent->i_generation,
+               parent, LL_IT2STR(it));
+
         rc = ll_intent_lock(parent, &dentry, it, lookup2_finish);
         if (rc < 0) {
                 CDEBUG(D_INFO, "ll_intent_lock: %d\n", rc);
-                RETURN(ERR_PTR(rc));
+                GOTO(out, retval = ERR_PTR(rc));
         }
 
         if (dentry == save)
-                RETURN(NULL);
+                GOTO(out, retval = NULL);
         else
-                RETURN(dentry);
+                GOTO(out, retval = dentry);
+ out:
+        return retval;
 }
 
 /* We depend on "mode" being set with the proper file type/umask by now */
@@ -519,20 +708,19 @@ static struct inode *ll_create_node(struct inode *dir, const char *name,
         struct inode *inode;
         struct ptlrpc_request *request = NULL;
         struct mds_body *body;
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-        time_t time = CURRENT_TIME.tv_sec;
-#else
-        time_t time = CURRENT_TIME;
-#endif
+        time_t time = LTIME_S(CURRENT_TIME);
         struct ll_sb_info *sbi = ll_i2sbi(dir);
-        struct ll_read_inode2_cookie lic = { .lic_lmm = NULL, };
+        struct ll_read_inode2_cookie lic;
         ENTRY;
 
         if (it && it->it_disposition) {
                 ll_invalidate_inode_pages(dir);
                 request = it->it_data;
-                body = lustre_msg_buf(request->rq_repmsg, 1);
+                body = lustre_msg_buf(request->rq_repmsg, 1, sizeof (*body));
+                LASSERT (body != NULL);         /* checked already */
+                LASSERT_REPSWABBED (request, 1); /* swabbed already */
         } else {
+                struct mdc_op_data op_data;
                 int gid = current->fsgid;
                 int rc;
 
@@ -542,21 +730,29 @@ static struct inode *ll_create_node(struct inode *dir, const char *name,
                                 mode |= S_ISGID;
                 }
 
-                rc = mdc_create(&sbi->ll_mdc_conn, dir, name, namelen,
+                ll_prepare_mdc_op_data(&op_data, dir, NULL, name, namelen, 0);
+                rc = mdc_create(&sbi->ll_mdc_conn, &op_data,
                                 data, datalen, mode, current->fsuid, gid,
                                 time, extra, &request);
                 if (rc) {
                         inode = ERR_PTR(rc);
                         GOTO(out, rc);
                 }
-                body = lustre_msg_buf(request->rq_repmsg, 0);
+                body = lustre_swab_repbuf(request, 0, sizeof (*body),
+                                          lustre_swab_mds_body);
+                if (body == NULL) {
+                        CERROR ("Can't unpack mds_body\n");
+                        GOTO (out, inode = ERR_PTR(-EPROTO));
+                }
         }
 
         lic.lic_body = body;
+        lic.lic_lsm = NULL;
 
         inode = ll_iget(dir->i_sb, body->ino, &lic);
-        if (IS_ERR(inode)) {
-                int rc = PTR_ERR(inode);
+        if (!inode || is_bad_inode(inode)) {
+                /* XXX might need iput() for bad inode */
+                int rc = -EIO;
                 CERROR("new_inode -fatal: rc %d\n", rc);
                 LBUG();
                 GOTO(out, rc);
@@ -576,8 +772,8 @@ static struct inode *ll_create_node(struct inode *dir, const char *name,
                 /* We asked for a lock on the directory, but were
                  * granted a lock on the inode.  Since we finally have
                  * an inode pointer, stuff it in the lock. */
-                mdc_lock_set_inode((struct lustre_handle *)it->it_lock_handle,
-                                   inode);
+                ll_mdc_lock_set_inode((struct lustre_handle*)it->it_lock_handle,
+                                      inode);
         }
 
         EXIT;
@@ -592,22 +788,21 @@ static int ll_mdc_unlink(struct inode *dir, struct inode *child, __u32 mode,
         struct ptlrpc_request *request = NULL;
         struct ll_sb_info *sbi = ll_i2sbi(dir);
         struct mds_body *body;
+        struct lov_mds_md *eadata;
         struct lov_stripe_md *lsm = NULL;
         struct lustre_handle lockh;
         struct lookup_intent it = { .it_op = IT_UNLINK };
         struct obdo *oa;
         int err;
-        struct mdc_unlink_data data;
+        struct mdc_op_data op_data;
         ENTRY;
 
-        data.unl_dir = dir;
-        data.unl_de = child;
-        data.unl_mode = mode;
-        data.unl_name = name;
-        data.unl_len = len;
+        ll_prepare_mdc_op_data(&op_data, dir, child, name, len, mode);
 
-        err = mdc_enqueue(&sbi->ll_mdc_conn, LDLM_PLAIN, &it, LCK_EX, dir,
-                         NULL, &lockh, NULL, 0, &data, sizeof(data));
+        err = mdc_enqueue(&sbi->ll_mdc_conn, LDLM_PLAIN, &it, LCK_EX,
+                         &op_data, &lockh, NULL, 0,
+                         ldlm_completion_ast, ll_mdc_blocking_ast,
+                         dir);
         request = (struct ptlrpc_request *)it.it_data;
         if (err < 0)
                 GOTO(out, err);
@@ -615,21 +810,39 @@ static int ll_mdc_unlink(struct inode *dir, struct inode *child, __u32 mode,
                 GOTO(out, err = it.it_status);
         err = 0;
 
-        body = lustre_msg_buf(request->rq_repmsg, 1);
-        LASSERT(body != NULL);
+        body = lustre_msg_buf (request->rq_repmsg, 1, sizeof (*body));
+        LASSERT (body != NULL);                 /* checked by mdc_enqueue() */
+        LASSERT_REPSWABBED (request, 1);        /* swabbed by mdc_enqueue() */
+
         if (!(body->valid & OBD_MD_FLEASIZE))
                 GOTO(out, 0);
 
+        if (body->eadatasize == 0) {
+                CERROR ("OBD_MD_FLEASIZE set but eadatasize zero\n");
+                GOTO (out, err = -EPROTO);
+        }
+
         /* The MDS sent back the EA because we unlinked the last reference
-         * to this file.  Use this EA to unlink the objects on the OST */
-        err = obd_unpackmd(ll_i2obdconn(dir), &lsm,
-                           lustre_msg_buf(request->rq_repmsg, 2));
-        if (err < 0)
+         * to this file. Use this EA to unlink the objects on the OST.
+         * Note that mdc_enqueue() has already checked there _is_ some EA
+         * data, but this data is opaque to both mdc_enqueue() and the MDS.
+         * We have to leave it to obd_unpackmd() to check it is complete
+         * and sensible. */
+        eadata = lustre_msg_buf (request->rq_repmsg, 2, body->eadatasize);
+        LASSERT (eadata != NULL);
+        LASSERT_REPSWABBED (request, 2);
+
+        err = obd_unpackmd(ll_i2obdconn(dir), &lsm, eadata,
+                           body->eadatasize);
+        if (err < 0) {
                 CERROR("obd_unpackmd: %d\n", err);
+                GOTO (out_unlock, err);
+        }
+        LASSERT (err >= sizeof (*lsm));
 
         oa = obdo_alloc();
         if (oa == NULL)
-                GOTO(out_unlock, err = -ENOMEM);
+                GOTO(out_free_memmd, err = -ENOMEM);
 
         oa->o_id = lsm->lsm_object_id;
         oa->o_mode = body->mode & S_IFMT;
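
The ll_mdc_unlink() hunk above now refuses to unpack EA data unless the reply both sets OBD_MD_FLEASIZE and advertises a non-zero eadatasize. Below is a minimal, self-contained sketch of that validation rule, for illustration only; the flag value and struct layout are simplified stand-ins, not the real Lustre definitions.

    /* Illustrative sketch only: simplified stand-ins, not the Lustre structures. */
    #include <stdio.h>
    #include <errno.h>

    #define DEMO_MD_FLEASIZE  (1ULL << 6)   /* stand-in for OBD_MD_FLEASIZE */

    struct demo_reply_body {
            unsigned long long valid;       /* which fields the reply carries */
            unsigned int       eadatasize;  /* advertised EA payload length */
    };

    /* Mirrors the check in the hunk above: EA claimed but length zero is a
     * protocol error; EA absent means there is nothing to destroy on the OSTs. */
    static int demo_check_ea(const struct demo_reply_body *body)
    {
            if (!(body->valid & DEMO_MD_FLEASIZE))
                    return 0;               /* no EA: nothing to unpack */
            if (body->eadatasize == 0)
                    return -EPROTO;         /* claimed EA with zero length */
            return 1;                       /* EA present and plausibly sized */
    }

    int main(void)
    {
            struct demo_reply_body ok   = { DEMO_MD_FLEASIZE, 56 };
            struct demo_reply_body bad  = { DEMO_MD_FLEASIZE, 0 };
            struct demo_reply_body none = { 0, 0 };

            printf("ok=%d bad=%d none=%d\n",
                   demo_check_ea(&ok), demo_check_ea(&bad), demo_check_ea(&none));
            return 0;
    }
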
@@ -640,7 +853,7 @@ static int ll_mdc_unlink(struct inode *dir, struct inode *child, __u32 mode,
         if (err)
                 CERROR("obd destroy objid 0x"LPX64" error %d\n",
                        lsm->lsm_object_id, err);
-
+ out_free_memmd:
         obd_free_memmd(ll_i2obdconn(dir), &lsm);
  out_unlock:
         ldlm_lock_decref_and_cancel(&lockh, LCK_EX);
@@ -670,7 +883,10 @@ static int ll_create(struct inode *dir, struct dentry *dentry, int mode)
         int rc = 0;
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n",
+               dentry->d_name.name, dir->i_ino, dir->i_generation, dir,
+               LL_IT2STR(dentry->d_it));
+
         it = dentry->d_it;
 
         rc = ll_it_open_error(IT_OPEN_CREATE, it);
@@ -702,16 +918,15 @@ static int ll_mknod2(struct inode *dir, const char *name, int len, int mode,
                      int rdev)
 {
         struct ptlrpc_request *request = NULL;
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-        time_t time = CURRENT_TIME.tv_sec;
-#else
-        time_t time = CURRENT_TIME;
-#endif
+        time_t time = LTIME_S(CURRENT_TIME);
         struct ll_sb_info *sbi = ll_i2sbi(dir);
+        struct mdc_op_data op_data;
         int err = -EMLINK;
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p)\n",
+               name, dir->i_ino, dir->i_generation, dir);
+
         if (dir->i_nlink >= EXT2_LINK_MAX)
                 RETURN(err);
 
@@ -722,8 +937,9 @@ static int ll_mknod2(struct inode *dir, const char *name, int len, int mode,
                 mode |= S_IFREG; /* for mode = 0 case, fallthrough */
         case S_IFCHR: case S_IFBLK:
         case S_IFIFO: case S_IFSOCK:
-                err = mdc_create(&sbi->ll_mdc_conn, dir, name, len, NULL, 0,
-                                 mode, current->fsuid, current->fsgid, time,
+                ll_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0);
+                err = mdc_create(&sbi->ll_mdc_conn, &op_data, NULL, 0, mode,
+                                 current->fsuid, current->fsgid, time,
                                  rdev, &request);
                 ptlrpc_req_finished(request);
                 break;
@@ -743,7 +959,10 @@ static int ll_mknod(struct inode *dir, struct dentry *dentry, int mode,
         struct inode *inode;
         int rc = 0;
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n",
+               dentry->d_name.name, dir->i_ino, dir->i_generation, dir,
+               LL_IT2STR(dentry->d_it));
+
         LL_GET_INTENT(dentry, it);
 
         if ((mode & S_IFMT) == 0)
@@ -767,20 +986,20 @@ static int ll_symlink2(struct inode *dir, const char *name, int len,
                        const char *tgt)
 {
         struct ptlrpc_request *request = NULL;
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-        time_t time = CURRENT_TIME.tv_sec;
-#else
-        time_t time = CURRENT_TIME;
-#endif
+        time_t time = LTIME_S(CURRENT_TIME);
         struct ll_sb_info *sbi = ll_i2sbi(dir);
+        struct mdc_op_data op_data;
         int err = -EMLINK;
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),target=%s\n",
+               name, dir->i_ino, dir->i_generation, dir, tgt);
+
         if (dir->i_nlink >= EXT2_LINK_MAX)
                 RETURN(err);
 
-        err = mdc_create(&sbi->ll_mdc_conn, dir, name, len,
+        ll_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0);
+        err = mdc_create(&sbi->ll_mdc_conn, &op_data,
                          tgt, strlen(tgt) + 1, S_IFLNK | S_IRWXUGO,
                          current->fsuid, current->fsgid, time, 0, &request);
         ptlrpc_req_finished(request);
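
Several hunks above replace long positional argument lists to mdc_create(), mdc_link() and mdc_rename() with a single ll_prepare_mdc_op_data()/mdc_op_data descriptor. The sketch below illustrates that calling convention in miniature; the demo_* names and types are invented for illustration and are not the real Lustre interfaces.

    /* Illustrative sketch only: stand-in types, not the Lustre mdc interfaces. */
    #include <stdio.h>
    #include <string.h>

    /* Stand-in for struct mdc_op_data: one descriptor gathers what the call
     * sites used to pass as separate dir/child/name/len/mode arguments. */
    struct demo_op_data {
            unsigned long dir_ino;
            unsigned long child_ino;        /* 0 when there is no second inode */
            const char   *name;
            int           namelen;
            int           mode;
    };

    static void demo_prepare_op_data(struct demo_op_data *op, unsigned long dir,
                                     unsigned long child, const char *name,
                                     int namelen, int mode)
    {
            op->dir_ino   = dir;
            op->child_ino = child;
            op->name      = name;
            op->namelen   = namelen;
            op->mode      = mode;
    }

    /* A caller in the style of the hunks above: prepare once, then hand over
     * the descriptor instead of a long positional argument list. */
    static int demo_mdc_create(const struct demo_op_data *op)
    {
            printf("create '%.*s' in dir %lu (mode %o)\n",
                   op->namelen, op->name, op->dir_ino, (unsigned)op->mode);
            return 0;
    }

    int main(void)
    {
            struct demo_op_data op;

            demo_prepare_op_data(&op, 42, 0, "newfile",
                                 (int)strlen("newfile"), 0644);
            return demo_mdc_create(&op);
    }
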
@@ -797,7 +1016,10 @@ static int ll_symlink(struct inode *dir, struct dentry *dentry,
         int err = 0;
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n",
+               dentry->d_name.name, dir->i_ino, dir->i_generation, dir,
+               LL_IT2STR(dentry->d_it));
+
         LL_GET_INTENT(dentry, it);
 
         inode = ll_create_node(dir, dentry->d_name.name, dentry->d_name.len,
@@ -830,13 +1052,17 @@ static int ll_link2(struct inode *src, struct inode *dir,
                     const char *name, int len)
 {
         struct ptlrpc_request *request = NULL;
+        struct mdc_op_data op_data;
         int err;
         struct ll_sb_info *sbi = ll_i2sbi(dir);
 
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),dir=%lu/%u(%p),target=%s\n",
+               src->i_ino, src->i_generation, src,
+               dir->i_ino, dir->i_generation, dir, name);
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
-        err = mdc_link(&sbi->ll_mdc_conn, src, dir, name, len, &request);
+        ll_prepare_mdc_op_data(&op_data, src, dir, name, len, 0);
+        err = mdc_link(&sbi->ll_mdc_conn, &op_data, &request);
         ptlrpc_req_finished(request);
 
         RETURN(err);
@@ -848,18 +1074,18 @@ static int ll_link(struct dentry *old_dentry, struct inode * dir,
         struct lookup_intent *it;
         struct inode *inode = old_dentry->d_inode;
         int rc;
+        CDEBUG(D_VFSTRACE,
+               "VFS Op:inode=%lu/%u(%p),dir=%lu/%u(%p),target=%s,intent=%s\n",
+               inode->i_ino, inode->i_generation, inode, dir->i_ino,
+               dir->i_generation, dir, dentry->d_name.name,
+               LL_IT2STR(dentry->d_it));
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
         LL_GET_INTENT(dentry, it);
 
         if (it && it->it_disposition) {
                 if (it->it_status)
                         RETURN(it->it_status);
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-                inode->i_ctime.tv_sec = CURRENT_TIME.tv_sec;
-#else
-                inode->i_ctime = CURRENT_TIME;
-#endif
+                LTIME_S(inode->i_ctime) = LTIME_S(CURRENT_TIME);
                 ext2_inc_count(inode);
                 atomic_inc(&inode->i_count);
                 d_instantiate(dentry, inode);
@@ -878,11 +1104,7 @@ static int ll_link(struct dentry *old_dentry, struct inode * dir,
         if (rc)
                 RETURN(rc);
 
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-                inode->i_ctime.tv_sec = CURRENT_TIME.tv_sec;
-#else
-                inode->i_ctime = CURRENT_TIME;
-#endif
+        LTIME_S(inode->i_ctime) = LTIME_S(CURRENT_TIME);
         ext2_inc_count(inode);
         atomic_inc(&inode->i_count);
 
@@ -892,22 +1114,21 @@ static int ll_link(struct dentry *old_dentry, struct inode * dir,
 static int ll_mkdir2(struct inode *dir, const char *name, int len, int mode)
 {
         struct ptlrpc_request *request = NULL;
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
-        time_t time = CURRENT_TIME.tv_sec;
-#else
-        time_t time = CURRENT_TIME;
-#endif
+        time_t time = LTIME_S(CURRENT_TIME);
         struct ll_sb_info *sbi = ll_i2sbi(dir);
+        struct mdc_op_data op_data;
         int err = -EMLINK;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p)\n",
+               name, dir->i_ino, dir->i_generation, dir);
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
         if (dir->i_nlink >= EXT2_LINK_MAX)
                 RETURN(err);
 
         mode = (mode & (S_IRWXUGO|S_ISVTX) & ~current->fs->umask) | S_IFDIR;
-        err = mdc_create(&sbi->ll_mdc_conn, dir, name, len, NULL, 0,
-                         mode, current->fsuid, current->fsgid,
+        ll_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0);
+        err = mdc_create(&sbi->ll_mdc_conn, &op_data, NULL, 0, mode,
+                         current->fsuid, current->fsgid,
                          time, 0, &request);
         ptlrpc_req_finished(request);
         RETURN(err);
@@ -920,8 +1141,10 @@ static int ll_mkdir(struct inode *dir, struct dentry *dentry, int mode)
         struct inode * inode;
         int err = -EMLINK;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n",
+               dentry->d_name.name, dir->i_ino, dir->i_generation, dir,
+               LL_IT2STR(dentry->d_it));
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
         LL_GET_INTENT(dentry, it);
 
         if (dir->i_nlink >= EXT2_LINK_MAX)
@@ -967,8 +1190,9 @@ static int ll_rmdir2(struct inode *dir, const char *name, int len)
 {
         int rc;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p)\n",
+               name, dir->i_ino, dir->i_generation, dir);
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
         rc = ll_mdc_unlink(dir, NULL, S_IFDIR, name, len);
         RETURN(rc);
 }
@@ -977,8 +1201,9 @@ static int ll_unlink2(struct inode *dir, const char *name, int len)
 {
         int rc;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p)\n",
+               name, dir->i_ino, dir->i_generation, dir);
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
         rc = ll_mdc_unlink(dir, NULL, S_IFREG, name, len);
         RETURN(rc);
 }
@@ -1029,8 +1254,10 @@ static int ll_unlink(struct inode *dir, struct dentry *dentry)
 {
         struct lookup_intent * it;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n",
+               dentry->d_name.name, dir->i_ino, dir->i_generation, dir,
+               LL_IT2STR(dentry->d_it));
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
         LL_GET_INTENT(dentry, it);
 
         RETURN(ll_common_unlink(dir, dentry, it, S_IFREG));
@@ -1042,8 +1269,10 @@ static int ll_rmdir(struct inode *dir, struct dentry *dentry)
         struct lookup_intent *it;
         int rc;
         ENTRY;
-        
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n",
+               dentry->d_name.name, dir->i_ino, dir->i_generation, dir,
+               LL_IT2STR(dentry->d_it));
+
         LL_GET_INTENT(dentry, it);
 
         if ((!it || !it->it_disposition) && !ext2_empty_dir(inode))
@@ -1065,11 +1294,15 @@ static int ll_rename2(struct inode *src, struct inode *tgt,
 {
         struct ptlrpc_request *request = NULL;
         struct ll_sb_info *sbi = ll_i2sbi(src);
+        struct mdc_op_data op_data;
         int err;
         ENTRY;
-        
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
-        err = mdc_rename(&sbi->ll_mdc_conn, src, tgt,
+        CDEBUG(D_VFSTRACE, "VFS Op:oldname=%s,src_dir=%lu/%u(%p),newname=%s,"
+               "tgt_dir=%lu/%u(%p)\n", oldname, src->i_ino, src->i_generation,
+               src, newname, tgt->i_ino, tgt->i_generation, tgt);
+
+        ll_prepare_mdc_op_data(&op_data, src, tgt, NULL, 0, 0);
+        err = mdc_rename(&sbi->ll_mdc_conn, &op_data,
                          oldname, oldlen, newname, newlen, &request);
         ptlrpc_req_finished(request);
 
@@ -1089,8 +1322,12 @@ static int ll_rename(struct inode * old_dir, struct dentry * old_dentry,
         struct ext2_dir_entry_2 * old_de;
         struct page * old_page;
         int err;
+        CDEBUG(D_VFSTRACE, "VFS Op:oldname=%s,src_dir=%lu/%u(%p),newname=%s,"
+               "tgt_dir=%lu/%u(%p),intent=%s\n",
+               old_dentry->d_name.name, old_dir->i_ino, old_dir->i_generation,
+               old_dir, new_dentry->d_name.name, new_dir->i_ino,
+               new_dir->i_generation, new_dir, LL_IT2STR(new_dentry->d_it));
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
         LL_GET_INTENT(new_dentry, it);
 
         if (it && it->it_disposition) {
diff --git a/lustre/llite/recover.c b/lustre/llite/recover.c
deleted file mode 100644 (file)
index 4c7ad42..0000000
+++ /dev/null
@@ -1,56 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Lustre Lite recovery infrastructure.
- *
- * Copyright (C) 2002 Cluster File Systems Inc.
- */
-
-#define DEBUG_SUBSYSTEM S_LLITE
-
-#include <linux/lustre_lite.h>
-#include <linux/lustre_ha.h>
-#include <linux/lustre_dlm.h>
-#include <linux/lustre_idl.h>
-
-static int ll_retry_recovery(struct ptlrpc_connection *conn)
-{
-        ENTRY;
-        RETURN(0);
-}
-
-int ll_recover(struct recovd_data *rd, int phase)
-{
-        struct ptlrpc_connection *conn = class_rd2conn(rd);
-        struct list_head *tmp;
-
-        LASSERT(conn);
-        ENTRY;
-
-        switch (phase) {
-            case PTLRPC_RECOVD_PHASE_PREPARE:
-            case PTLRPC_RECOVD_PHASE_RECOVER:
-                list_for_each(tmp, &conn->c_imports) {
-                        struct obd_import *imp = 
-                                list_entry(tmp, struct obd_import, imp_chain);
-
-                        if (phase == PTLRPC_RECOVD_PHASE_PREPARE) {
-                                unsigned long flags;
-                                spin_lock_irqsave(&imp->imp_lock, flags);
-                                imp->imp_level = LUSTRE_CONN_RECOVD;
-                                spin_unlock_irqrestore(&imp->imp_lock, flags);
-                        }
-                        imp->imp_recover(imp, phase);
-                }
-                
-                if (phase == PTLRPC_RECOVD_PHASE_PREPARE)
-                        RETURN(ptlrpc_run_recovery_upcall(conn));
-                RETURN(0);
-                        
-            case PTLRPC_RECOVD_PHASE_FAILURE:
-                RETURN(ll_retry_recovery(conn));
-        }
-
-        LBUG();
-        RETURN(-ENOSYS);
-}
index 409f308..cd1fa90 100644 (file)
@@ -52,6 +52,7 @@
 #include <linux/lustre_mds.h>
 #include <linux/lustre_lite.h>
 #include <linux/lustre_lib.h>
+#include <linux/lustre_compat25.h>
 
 /*
  * Remove page from dirty list
@@ -64,9 +65,7 @@ static void __set_page_clean(struct page *page)
         if (!mapping)
                 return;
 
-#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
-        spin_lock(&pagecache_lock);
-#endif
+        PGCACHE_WRLOCK(mapping);
 
         list_del(&page->list);
         list_add(&page->list, &mapping->clean_pages);
@@ -77,9 +76,8 @@ static void __set_page_clean(struct page *page)
                 CDEBUG(D_INODE, "inode clean\n");
                 inode->i_state &= ~I_DIRTY_PAGES;
         }
-#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
-        spin_unlock(&pagecache_lock);
-#endif
+
+        PGCACHE_WRUNLOCK(mapping);
         EXIT;
 }
 
@@ -96,15 +94,10 @@ static int ll_brw(int cmd, struct inode *inode, struct page *page, int flags)
 {
         struct ll_inode_info *lli = ll_i2info(inode);
         struct lov_stripe_md *lsm = lli->lli_smd;
-        struct obd_brw_set *set;
         struct brw_page pg;
         int rc;
         ENTRY;
 
-        set = obd_brw_set_new();
-        if (set == NULL)
-                RETURN(-ENOMEM);
-
         pg.pg = page;
         pg.off = ((obd_off)page->index) << PAGE_SHIFT;
 
@@ -125,22 +118,14 @@ static int ll_brw(int cmd, struct inode *inode, struct page *page, int flags)
 
         pg.flag = flags;
 
-        set->brw_callback = ll_brw_sync_wait;
-        rc = obd_brw(cmd, ll_i2obdconn(inode), lsm, 1, &pg, set, NULL);
-        if (rc) {
-                if (rc != -EIO)
-                        CERROR("error from obd_brw: rc = %d\n", rc);
-        } else {
-                rc = ll_brw_sync_wait(set, CB_PHASE_START);
-                if (rc)
-                        CERROR("error from callback: rc = %d\n", rc);
-        }
-        obd_brw_set_decref(set);
+        rc = obd_brw(cmd, ll_i2obdconn(inode), lsm, 1, &pg, NULL);
+        if (rc)
+                CERROR("error from obd_brw: rc = %d\n", rc);
 
         RETURN(rc);
 }
 
-/* 
+/*
  * we were asked to read a single page but we're going to try and read a batch
  * of pages all at once.  this vaguely simulates 2.5's readpages.
  */
@@ -151,14 +136,17 @@ static int ll_readpage(struct file *file, struct page *first_page)
         struct page *page = first_page;
         struct list_head *pos;
         struct brw_page *pgs;
-        struct obd_brw_set *set;
         unsigned long end_index, extent_end = 0;
-        int npgs = 0, rc = 0;
+        struct ptlrpc_request_set *set;
+        int npgs = 0, rc = 0, max_pages;
         ENTRY;
 
         LASSERT(PageLocked(page));
         LASSERT(!PageUptodate(page));
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),offset="LPX64"\n",
+               inode->i_ino, inode->i_generation, inode,
+               (((obd_off)page->index) << PAGE_SHIFT));
+        LASSERT(atomic_read(&file->f_dentry->d_inode->i_count) > 0);
 
         if (inode->i_size <= ((obd_off)page->index) << PAGE_SHIFT) {
                 CERROR("reading beyond EOF\n");
@@ -169,56 +157,58 @@ static int ll_readpage(struct file *file, struct page *first_page)
                 RETURN(rc);
         }
 
-        pgs = kmalloc(PTL_MD_MAX_IOV * sizeof(*pgs), GFP_USER);
-        if ( pgs == NULL )
-                RETURN(-ENOMEM);
-        set = obd_brw_set_new();
-        if ( set == NULL )
-                GOTO(out_pgs, rc = -ENOMEM);
-
-        /* arbitrarily try to read-ahead 8 times what we can pass on 
-         * the wire at once, clamped to file size */
-        end_index = first_page->index + 
-                8 * ((PTL_MD_MAX_IOV * PAGE_SIZE)>>PAGE_CACHE_SHIFT);
-        if ( end_index > inode->i_size >> PAGE_CACHE_SHIFT )
+        /* try to read the file's preferred block size in one go */
+        end_index = first_page->index +
+                (inode->i_blksize >> PAGE_CACHE_SHIFT);
+        if (end_index > (inode->i_size >> PAGE_CACHE_SHIFT))
                 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
 
+        max_pages = ((end_index - first_page->index) << PAGE_CACHE_SHIFT) >>
+                PAGE_SHIFT;
+        pgs = kmalloc(max_pages * sizeof(*pgs), GFP_USER);
+        if (pgs == NULL)
+                RETURN(-ENOMEM);
+
         /*
          * find how far we're allowed to read under the extent ll_file_read
-         * is passing us.. 
+         * is passing us..
          */
         spin_lock(&lli->lli_read_extent_lock);
         list_for_each(pos, &lli->lli_read_extents) {
                 struct ll_read_extent *rextent;
                 rextent = list_entry(pos, struct ll_read_extent, re_lli_item);
-                if ( rextent->re_task != current )
+                if (rextent->re_task != current)
                         continue;
 
                 if (rextent->re_extent.end + PAGE_SIZE < rextent->re_extent.end)
                         /* extent wrapping */
                         extent_end = ~0;
-                else  {
-                        extent_end = ( rextent->re_extent.end + PAGE_SIZE )
+                else {
+                        extent_end = (rextent->re_extent.end + PAGE_SIZE)
                                                         << PAGE_CACHE_SHIFT;
                         /* 32bit indexes, 64bit extents.. */
-                        if ( ((u64)extent_end >> PAGE_CACHE_SHIFT ) < 
-                                        rextent->re_extent.end )
+                        if (((u64)extent_end >> PAGE_CACHE_SHIFT) <
+                                        rextent->re_extent.end)
                                 extent_end = ~0;
                 }
                 break;
         }
         spin_unlock(&lli->lli_read_extent_lock);
 
-        if ( extent_end == 0 ) {
-                CERROR("readpage outside ll_file_read, no lock held?\n");
+        if (extent_end == 0) {
+                static long next_print;
+                if (time_after(jiffies, next_print)) {
+                        next_print = jiffies + 30 * HZ;
+                        CDEBUG(D_INODE, "mmap readpage - check locks\n");
+                }
                 end_index = page->index + 1;
-        } else if ( extent_end < end_index )
+        } else if (extent_end < end_index)
                 end_index = extent_end;
 
         /* to balance the find_get_page ref the other pages get that is
          * decrefed on teardown.. */
         page_cache_get(page);
-        do { 
+        do {
                 unsigned long index ;
 
                 pgs[npgs].pg = page;
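
The ll_readpage() hunk above sizes its batch from the file's preferred block size and clamps it at EOF. The following self-contained sketch reproduces that window arithmetic; the 4 KB page size is an assumption made for the example, not a value taken from the commit.

    /* Illustrative sketch only; 4 KB pages are an assumption. */
    #include <stdio.h>

    #define PAGE_SHIFT        12                    /* assume 4 KB pages */
    #define PAGE_CACHE_SHIFT  PAGE_SHIFT

    /* Same window arithmetic as the readpage hunk above: read up to the
     * file's preferred block size, clamped so we never read past EOF. */
    static unsigned long readahead_pages(unsigned long first_index,
                                         unsigned long blksize,
                                         unsigned long long i_size)
    {
            unsigned long end_index = first_index + (blksize >> PAGE_CACHE_SHIFT);

            if (end_index > (unsigned long)(i_size >> PAGE_CACHE_SHIFT))
                    end_index = i_size >> PAGE_CACHE_SHIFT;

            return ((end_index - first_index) << PAGE_CACHE_SHIFT) >> PAGE_SHIFT;
    }

    int main(void)
    {
            /* 1 MB preferred block size, 100 KB file, reading page 20:
             * only 5 pages remain before EOF, so the batch is clamped to 5. */
            printf("%lu\n", readahead_pages(20, 1 << 20, 100 * 1024ULL));
            /* Plenty of file left: the full 256-page (1 MB) batch. */
            printf("%lu\n", readahead_pages(20, 1 << 20, 1ULL << 30));
            return 0;
    }
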
@@ -240,32 +230,32 @@ static int ll_readpage(struct file *file, struct page *first_page)
                 }
 
                 npgs++;
-                if ( npgs == PTL_MD_MAX_IOV )
+                if (npgs == max_pages)
                         break;
 
                 /*
-                 * find pages ahead of us that we can read in.  
+                 * find pages ahead of us that we can read in.
                  * grab_cache_page waits on pages that are locked so
                  * we first try find_get_page, which doesn't.  this stops
-                 * the worst case behaviour of racing threads waiting on 
+                 * the worst case behaviour of racing threads waiting on
                  * each other, but doesn't remove it entirely.
                  */
-                for ( index = page->index + 1, page = NULL ;
-                        page == NULL && index < end_index ; index++ ) {
+                for (index = page->index + 1, page = NULL;
+                     page == NULL && index < end_index; index++) {
 
                         /* see if the page already exists and needs updating */
                         page = find_get_page(inode->i_mapping, index);
-                        if ( page ) {
-                                if ( Page_Uptodate(page) || TryLockPage(page) )
+                        if (page) {
+                                if (Page_Uptodate(page) || TryLockPage(page))
                                         goto out_release;
-                                if ( !page->mapping || Page_Uptodate(page)) 
+                                if (!page->mapping || Page_Uptodate(page))
                                         goto out_unlock;
                         } else {
                                 /* ok, we have to create it.. */
                                 page = grab_cache_page(inode->i_mapping, index);
-                                if ( page == NULL ) 
+                                if (page == NULL)
                                         continue;
-                                if ( Page_Uptodate(page) )
+                                if (Page_Uptodate(page))
                                         goto out_unlock;
                         }
 
@@ -280,39 +270,45 @@ static int ll_readpage(struct file *file, struct page *first_page)
 
         } while (page);
 
-        set->brw_callback = ll_brw_sync_wait;
-        rc = obd_brw(OBD_BRW_READ, ll_i2obdconn(inode),
-                     ll_i2info(inode)->lli_smd, npgs, pgs, set, NULL);
-        if (rc) {
-                CERROR("error from obd_brw: rc = %d\n", rc);
+        set = ptlrpc_prep_set();
+        if (set == NULL) {
+                CERROR("ENOMEM allocing request set\n");
+                rc = -ENOMEM;
         } else {
-                rc = ll_brw_sync_wait(set, CB_PHASE_START);
-                if (rc)
-                        CERROR("error from callback: rc = %d\n", rc);
+                rc = obd_brw_async(OBD_BRW_READ, ll_i2obdconn(inode),
+                                   ll_i2info(inode)->lli_smd, npgs, pgs,
+                                   set, NULL);
+                if (rc == 0)
+                        rc = ptlrpc_set_wait(set);
+                ptlrpc_set_destroy(set);
+                if (rc && rc != -EIO)
+                        CERROR("error from obd_brw_async: rc = %d\n", rc);
         }
-        obd_brw_set_decref(set);
 
-        while ( --npgs > -1 ) {
+        while (npgs-- > 0) {
                 page = pgs[npgs].pg;
 
-                if ( rc == 0 )
+                if (rc == 0)
                         SetPageUptodate(page);
                 unlock_page(page);
                 page_cache_release(page);
         }
-out_pgs:
+
         kfree(pgs);
         RETURN(rc);
 } /* ll_readpage */
 
+/* This isn't where truncate starts; roughly:
+ * sys_truncate->ll_setattr_raw->vmtruncate->ll_truncate.
+ * We grab the lock back in setattr_raw to avoid races. */
 void ll_truncate(struct inode *inode)
 {
-        struct obdo oa = {0};
         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
-        struct lustre_handle lockh = { 0, 0 };
-        struct ldlm_extent extent = {inode->i_size, OBD_OBJECT_EOF};
+        struct obdo oa = {0};
         int err;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+               inode->i_generation, inode);
 
         if (!lsm) {
                 /* object not yet allocated */
@@ -321,22 +317,20 @@ void ll_truncate(struct inode *inode)
                 return;
         }
 
+        /* vmtruncate just threw away our dirty pages, make sure
+         * we don't think they're still dirty, being careful to round
+         * i_size to the first whole page that was tossed */
+        ll_remove_dirty(inode,
+                        (inode->i_size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT,
+                        ~0);
+
         oa.o_id = lsm->lsm_object_id;
         oa.o_mode = inode->i_mode;
         oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE;
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
         CDEBUG(D_INFO, "calling punch for "LPX64" (all bytes after %Lu)\n",
                oa.o_id, inode->i_size);
 
-         /* i_size has already been set to the new size */
-        err = ll_extent_lock_no_validate(NULL, inode, lsm, LCK_PW, 
-                                        &extent, &lockh);
-        if (err != ELDLM_OK && err != ELDLM_LOCK_MATCHED) {
-                EXIT;
-                return;
-        }
-
         /* truncate == punch from new size to absolute end of file */
         err = obd_punch(ll_i2obdconn(inode), &oa, lsm, inode->i_size,
                         OBD_OBJECT_EOF, NULL);
@@ -345,10 +339,6 @@ void ll_truncate(struct inode *inode)
         else
                 obdo_to_inode(inode, &oa, oa.o_valid);
 
-        err = ll_extent_unlock(NULL, inode, lsm, LCK_PW, &lockh);
-        if (err)
-                CERROR("ll_extent_unlock failed: %d\n", err);
-
         EXIT;
         return;
 } /* ll_truncate */
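
ll_truncate() above rounds i_size up to the first whole page that vmtruncate() discarded before calling ll_remove_dirty(). A small standalone illustration of that rounding, again assuming 4 KB pages:

    /* Illustrative sketch only; 4 KB pages are an assumption. */
    #include <stdio.h>

    #define PAGE_CACHE_SHIFT 12
    #define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

    /* Same rounding as the ll_truncate() hunk above: the first page index
     * whose contents were entirely discarded by vmtruncate().  A partial
     * final page (i_size not page aligned) is still live, so round up. */
    static unsigned long first_tossed_page(unsigned long long i_size)
    {
            return (i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
    }

    int main(void)
    {
            printf("%lu\n", first_tossed_page(0));          /* 0: everything gone   */
            printf("%lu\n", first_tossed_page(4096));       /* 1: page 0 survives   */
            printf("%lu\n", first_tossed_page(6000));       /* 2: page 1 is partial */
            return 0;
    }
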
@@ -359,12 +349,13 @@ static int ll_prepare_write(struct file *file, struct page *page, unsigned from,
                             unsigned to)
 {
         struct inode *inode = page->mapping->host;
+        struct ll_inode_info *lli = ll_i2info(inode);
+        struct lov_stripe_md *lsm = lli->lli_smd;
         obd_off offset = ((obd_off)page->index) << PAGE_SHIFT;
+        struct brw_page pg;
         int rc = 0;
         ENTRY;
 
-        ll_check_dirty(inode->i_sb);
-
         if (!PageLocked(page))
                 LBUG();
 
@@ -373,11 +364,19 @@ static int ll_prepare_write(struct file *file, struct page *page, unsigned from,
 
         //POISON(addr + from, 0xca, to - from);
 
+        /* Check to see if we should return -EIO right away */
+        pg.pg = page;
+        pg.off = offset;
+        pg.count = PAGE_SIZE;
+        pg.flag = 0;
+        rc = obd_brw(OBD_BRW_CHECK, ll_i2obdconn(inode), lsm, 1, &pg, NULL);
+        if (rc)
+                RETURN(rc);
+
         /* We're completely overwriting an existing page, so _don't_ set it up
          * to date until commit_write */
         if (from == 0 && to == PAGE_SIZE)
                 RETURN(0);
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
 
         /* If are writing to a new page, no need to read old data.
          * the extent locking and getattr procedures in ll_file_write have
@@ -411,6 +410,7 @@ static int ll_prepare_write(struct file *file, struct page *page, unsigned from,
  * free some more pages that our allocating writeback may need, but it isn't
  * yet.
  */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 static int ll_writepage(struct page *page)
 {
         struct inode *inode = page->mapping->host;
@@ -418,7 +418,6 @@ static int ll_writepage(struct page *page)
 
         CDEBUG(D_CACHE, "page %p [lau %d] inode %p\n", page,
                         PageLaunder(page), inode);
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
         LASSERT(PageLocked(page));
 
         /* XXX should obd_brw errors trickle up? */
@@ -440,23 +439,50 @@ static int ll_commit_write(struct file *file, struct page *page,
         LASSERT(inode == file->f_dentry->d_inode);
         LASSERT(PageLocked(page));
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
         CDEBUG(D_INODE, "inode %p is writing page %p from %d to %d at %lu\n",
                inode, page, from, to, page->index);
-
         /* to match full page case in prepare_write */
         SetPageUptodate(page);
         /* mark the page dirty, put it on mapping->dirty,
          * mark the inode PAGES_DIRTY, put it on sb->dirty */
-        set_page_dirty(page);
+        if (!PageDirty(page))
+                INODE_IO_STAT_ADD(inode, dirty_misses, 1);
+        else
+                INODE_IO_STAT_ADD(inode, dirty_hits, 1);
 
-        /* this is matched by a hack in obdo_to_inode at the moment */
         size = (((obd_off)page->index) << PAGE_SHIFT) + to;
         if (size > inode->i_size)
                 inode->i_size = size;
 
+        /* XXX temporary, bug 1286 */
+        {
+                struct ll_dirty_offsets *lldo = &ll_i2info(inode)->lli_dirty;
+                int rc;
+                if ((lldo->do_num_dirty * PAGE_CACHE_SIZE) > 10 * 1024 * 1024) {
+                        rc = ll_batch_writepage(inode, page);
+                        lock_page(page); /* caller expects to unlock */
+                        RETURN(rc);
+                }
+        }
+
+        set_page_dirty(page);
+        ll_record_dirty(inode, page->index);
+
         RETURN(0);
 } /* ll_commit_write */
+#else
+static int ll_writepage(struct page *page,
+                        struct writeback_control *wbc)
+{
+
+        return 0;
+}
+static int ll_commit_write(struct file *file, struct page *page,
+                           unsigned from, unsigned to)
+{
+        return 0;
+}
+#endif
 
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 static int ll_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
@@ -465,12 +491,11 @@ static int ll_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
         struct ll_inode_info *lli = ll_i2info(inode);
         struct lov_stripe_md *lsm = lli->lli_smd;
         struct brw_page *pga;
-        struct obd_brw_set *set;
-        loff_t offset;
+        struct ptlrpc_request_set *set;
         int length, i, flags, rc = 0;
+        loff_t offset;
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
         if (!lsm || !lsm->lsm_object_id)
                 RETURN(-ENOMEM);
 
@@ -478,26 +503,18 @@ static int ll_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
             (iobuf->length & (blocksize - 1)))
                 RETURN(-EINVAL);
 
-#if 0
-        /* XXX Keep here until we find ia64 problem, it crashes otherwise */
-        if (blocksize != PAGE_SIZE) {
-                CERROR("direct_IO blocksize != PAGE_SIZE\n");
-                RETURN(-EINVAL);
-        }
-#endif
-
-        set = obd_brw_set_new();
+        set = ptlrpc_prep_set();
         if (set == NULL)
                 RETURN(-ENOMEM);
 
         OBD_ALLOC(pga, sizeof(*pga) * iobuf->nr_pages);
         if (!pga) {
-                obd_brw_set_decref(set);
+                ptlrpc_set_destroy(set);
                 RETURN(-ENOMEM);
         }
 
         flags = (rw == WRITE ? OBD_BRW_CREATE : 0) /* | OBD_BRW_DIRECTIO */;
-        offset = (blocknr << inode->i_blkbits);
+        offset = ((obd_off)blocknr << inode->i_blkbits);
         length = iobuf->length;
 
         for (i = 0, length = iobuf->length; length > 0;
@@ -514,18 +531,18 @@ static int ll_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf,
                 }
         }
 
-        set->brw_callback = ll_brw_sync_wait;
-        rc = obd_brw(rw == WRITE ? OBD_BRW_WRITE : OBD_BRW_READ,
-                     ll_i2obdconn(inode), lsm, iobuf->nr_pages, pga, set, NULL);
+        rc = obd_brw_async(rw == WRITE ? OBD_BRW_WRITE : OBD_BRW_READ,
+                           ll_i2obdconn(inode), lsm, iobuf->nr_pages, pga, set,
+                           NULL);
         if (rc) {
                 CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
-                       "error from obd_brw: rc = %d\n", rc);
+                       "error from obd_brw_async: rc = %d\n", rc);
         } else {
-                rc = ll_brw_sync_wait(set, CB_PHASE_START);
+                rc = ptlrpc_set_wait(set);
                 if (rc)
                         CERROR("error from callback: rc = %d\n", rc);
         }
-        obd_brw_set_decref(set);
+        ptlrpc_set_destroy(set);
         if (rc == 0)
                 rc = iobuf->length;
 
index ff754a0..66563c7 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/lprocfs_status.h>
+#include "llite_internal.h"
 
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 kmem_cache_t *ll_file_data_slab;
@@ -132,7 +133,7 @@ static struct super_block *ll_read_super(struct super_block *sb,
 
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
         OBD_ALLOC(sbi, sizeof(*sbi));
         if (!sbi)
                 RETURN(NULL);
@@ -140,6 +141,7 @@ static struct super_block *ll_read_super(struct super_block *sb,
         INIT_LIST_HEAD(&sbi->ll_conn_chain);
         INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list);
         generate_random_uuid(uuid);
+        spin_lock_init(&sbi->ll_iostats.fis_lock);
         class_uuid_unparse(uuid, &sbi->ll_sb_uuid);
 
         sb->u.generic_sbp = sbi;
@@ -163,15 +165,13 @@ static struct super_block *ll_read_super(struct super_block *sb,
                 GOTO(out_free, sb = NULL);
         }
 
-        err = obd_connect(&sbi->ll_mdc_conn, obd, &sbi->ll_sb_uuid,
-                          ptlrpc_recovd, ll_recover);
+        err = obd_connect(&sbi->ll_mdc_conn, obd, &sbi->ll_sb_uuid);
         if (err) {
                 CERROR("cannot connect to %s: rc = %d\n", mdc, err);
                 GOTO(out_free, sb = NULL);
         }
 
-        mdc_conn = sbi2mdc(sbi)->cl_import.imp_connection;
-        list_add(&mdc_conn->c_sb_chain, &sbi->ll_conn_chain);
+        mdc_conn = sbi2mdc(sbi)->cl_import->imp_connection;
 
         strncpy(param_uuid.uuid, osc, sizeof(param_uuid.uuid));
         obd = class_uuid2obd(&param_uuid);
@@ -180,8 +180,7 @@ static struct super_block *ll_read_super(struct super_block *sb,
                 GOTO(out_mdc, sb = NULL);
         }
 
-        err = obd_connect(&sbi->ll_osc_conn, obd, &sbi->ll_sb_uuid,
-                          ptlrpc_recovd, ll_recover);
+        err = obd_connect(&sbi->ll_osc_conn, obd, &sbi->ll_sb_uuid);
         if (err) {
                 CERROR("cannot connect to %s: rc = %d\n", osc, err);
                 GOTO(out_mdc, sb = NULL);
@@ -190,7 +189,7 @@ static struct super_block *ll_read_super(struct super_block *sb,
         err = mdc_getstatus(&sbi->ll_mdc_conn, &rootfid);
         if (err) {
                 CERROR("cannot mds_connect: rc = %d\n", err);
-                GOTO(out_mdc, sb = NULL);
+                GOTO(out_osc, sb = NULL);
         }
         CDEBUG(D_SUPER, "rootfid "LPU64"\n", rootfid.id);
         sbi->ll_rootino = rootfid.id;
@@ -200,16 +199,17 @@ static struct super_block *ll_read_super(struct super_block *sb,
         sb->s_blocksize = osfs.os_bsize;
         sb->s_blocksize_bits = log2(osfs.os_bsize);
         sb->s_magic = LL_SUPER_MAGIC;
-        sb->s_maxbytes = (1ULL << (32 + 9)) - osfs.os_bsize;
+        sb->s_maxbytes = PAGE_CACHE_MAXBYTES;
 
         sb->s_op = &ll_super_operations;
 
-        /* make root inode */
-        err = mdc_getattr(&sbi->ll_mdc_conn, sbi->ll_rootino, S_IFDIR,
+        /* make root inode 
+         * XXX: move this to after cbd setup? */
+        err = mdc_getattr(&sbi->ll_mdc_conn, &rootfid,
                           OBD_MD_FLNOTOBD|OBD_MD_FLBLOCKS, 0, &request);
         if (err) {
                 CERROR("mdc_getattr failed for root: rc = %d\n", err);
-                GOTO(out_request, sb = NULL);
+                GOTO(out_osc, sb = NULL);
         }
 
         /* initialize committed transaction callback daemon */
@@ -220,23 +220,29 @@ static struct super_block *ll_read_super(struct super_block *sb,
         err = ll_commitcbd_setup(sbi);
         if (err) {
                 CERROR("failed to start commit callback daemon: rc = %d\n",err);
-                GOTO(out_request, sb = NULL);
+                ptlrpc_req_finished (request);
+                GOTO(out_osc, sb = NULL);
         }
 
-        lic.lic_body = lustre_msg_buf(request->rq_repmsg, 0);
-        lic.lic_lmm = NULL;
+        lic.lic_body = lustre_msg_buf(request->rq_repmsg, 0,
+                                      sizeof(*lic.lic_body));
+        LASSERT (lic.lic_body != NULL);         /* checked by mdc_getattr() */
+        LASSERT_REPSWABBED (request, 0);        /* swabbed by mdc_getattr() */
+
+        lic.lic_lsm = NULL;
+
         LASSERT(sbi->ll_rootino != 0);
         root = iget4(sb, sbi->ll_rootino, NULL, &lic);
 
-        if (root) {
-                sb->s_root = d_alloc_root(root);
-        } else {
+        ptlrpc_req_finished(request);
+
+        if (root == NULL || is_bad_inode(root)) {
+                /* XXX might need iput() for bad inode */
                 CERROR("lustre_lite: bad iget4 for root\n");
-                GOTO(out_cdb, sb = NULL);
+                GOTO(out_cbd, sb = NULL);
         }
 
-        ptlrpc_req_finished(request);
-        request = NULL;
+        sb->s_root = d_alloc_root(root);
 
         if (proc_lustre_fs_root) {
                 err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb,
@@ -253,13 +259,12 @@ out_dev:
 
         RETURN(sb);
 
-out_cdb:
+out_cbd:
         ll_commitcbd_cleanup(sbi);
-out_request:
-        ptlrpc_req_finished(request);
-        obd_disconnect(&sbi->ll_osc_conn);
+out_osc:
+        obd_disconnect(&sbi->ll_osc_conn, 0);
 out_mdc:
-        obd_disconnect(&sbi->ll_mdc_conn);
+        obd_disconnect(&sbi->ll_mdc_conn, 0);
 out_free:
         OBD_FREE(sbi, sizeof(*sbi));
 
@@ -271,12 +276,13 @@ static void ll_put_super(struct super_block *sb)
         struct ll_sb_info *sbi = ll_s2sbi(sb);
         struct list_head *tmp, *next;
         struct ll_fid rootfid;
+        struct obd_device *obd = class_conn2obd(&sbi->ll_mdc_conn);
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
         list_del(&sbi->ll_conn_chain);
         ll_commitcbd_cleanup(sbi);
-        obd_disconnect(&sbi->ll_osc_conn);
+        obd_disconnect(&sbi->ll_osc_conn, 0);
 
         /* NULL request to force sync on the MDS, and get the last_committed
          * value to flush remaining RPCs from the sending queue on client.
@@ -284,14 +290,15 @@ static void ll_put_super(struct super_block *sb)
          * XXX This should be an mdc_sync() call to sync the whole MDS fs,
          *     which we can call for other reasons as well.
          */
-        mdc_getstatus(&sbi->ll_mdc_conn, &rootfid);
+        if (!obd->obd_no_recov)
+                mdc_getstatus(&sbi->ll_mdc_conn, &rootfid);
 
         if (sbi->ll_proc_root) {
                 lprocfs_remove(sbi->ll_proc_root);
                 sbi->ll_proc_root = NULL;
         }
 
-        obd_disconnect(&sbi->ll_mdc_conn);
+        obd_disconnect(&sbi->ll_mdc_conn, 0);
 
         spin_lock(&dcache_lock);
         list_for_each_safe(tmp, next, &sbi->ll_orphan_dentry_list) {
@@ -312,27 +319,29 @@ static void ll_clear_inode(struct inode *inode)
         int rc;
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
-        rc = mdc_cancel_unused(&sbi->ll_mdc_conn, inode, LDLM_FL_NO_CALLBACK);
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+               inode->i_generation, inode);
+        rc = ll_mdc_cancel_unused(&sbi->ll_mdc_conn, inode,
+                                  LDLM_FL_NO_CALLBACK, inode);
         if (rc < 0) {
-                CERROR("mdc_cancel_unused: %d\n", rc);
+                CERROR("ll_mdc_cancel_unused: %d\n", rc);
                 /* XXX FIXME do something dramatic */
         }
 
+        if (atomic_read(&inode->i_count) != 0)
+                CERROR("clearing in-use inode %lu: count = %d\n",
+                       inode->i_ino, atomic_read(&inode->i_count));
+
         if (lli->lli_smd) {
-                rc = obd_cancel_unused(&sbi->ll_osc_conn, lli->lli_smd, 0);
+                rc = obd_cancel_unused(&sbi->ll_osc_conn, lli->lli_smd,
+                                       LDLM_FL_WARN, inode);
                 if (rc < 0) {
                         CERROR("obd_cancel_unused: %d\n", rc);
                         /* XXX FIXME do something dramatic */
                 }
-        }
-
-        if (atomic_read(&inode->i_count) != 0)
-                CERROR("clearing in-use inode %lu: count = %d\n",
-                       inode->i_ino, atomic_read(&inode->i_count));
-
-        if (lli->lli_smd)
                 obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd);
+                lli->lli_smd = NULL;
+        }
 
         if (lli->lli_symlink_name) {
                 OBD_FREE(lli->lli_symlink_name,
@@ -347,7 +356,8 @@ static void ll_clear_inode(struct inode *inode)
 static void ll_delete_inode(struct inode *inode)
 {
         ENTRY;
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+               inode->i_generation, inode);
         if (S_ISREG(inode->i_mode)) {
                 int err;
                 struct obdo *oa;
@@ -390,6 +400,10 @@ static int ll_attr2inode(struct inode *inode, struct iattr *attr, int trunc)
         int error = 0;
 
         if ((ia_valid & ATTR_SIZE) && trunc) {
+                if (attr->ia_size > ll_file_maxbytes(inode)) {
+                        error = -EFBIG;
+                        goto out;
+                }
                 error = vmtruncate(inode, attr->ia_size);
                 if (error)
                         goto out;
@@ -423,15 +437,20 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc)
         ENTRY;
 
         /* change incore inode */
-        ll_attr2inode(inode, attr, do_trunc);
+        err = ll_attr2inode(inode, attr, do_trunc);
+        if (err)
+                RETURN(err);
 
         /* Don't send size changes to MDS to avoid "fast EA" problems, and
          * also avoid a pointless RPC (we get file size from OST anyways).
          */
         attr->ia_valid &= ~ATTR_SIZE;
         if (attr->ia_valid) {
-                err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, NULL, 0,
-                                  &request);
+                struct mdc_op_data op_data;
+
+                ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
+                err = mdc_setattr(&sbi->ll_mdc_conn, &op_data,
+                                  attr, NULL, 0, &request);
                 if (err)
                         CERROR("mdc_setattr fails: err = %d\n", err);
 
@@ -461,31 +480,63 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc)
 
 int ll_setattr_raw(struct inode *inode, struct iattr *attr)
 {
-        struct ptlrpc_request *request = NULL;
+        struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
         struct ll_sb_info *sbi = ll_i2sbi(inode);
-        int err = 0;
+        struct ptlrpc_request *request = NULL;
+        struct mdc_op_data op_data;
+        int rc = 0, err;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+               inode->i_generation, inode);
 
         if ((attr->ia_valid & ATTR_SIZE)) {
+                struct ldlm_extent extent = {attr->ia_size, OBD_OBJECT_EOF};
+                struct lustre_handle lockh = { 0 };
+
+                if (attr->ia_size > ll_file_maxbytes(inode))
+                        RETURN(-EFBIG);
+
                 /* writeback uses inode->i_size to determine how far out
                  * its cached pages go.  ll_truncate gets a PW lock, canceling
                  * our lock, _after_ it has updated i_size.  this can confuse
-                 * us into zero extending the file to the newly truncated
-                 * size, and this has bad implications for a racing o_append.
-                 * if we're extending our size we need to flush the pages
-                 * with the correct i_size before vmtruncate stomps on
-                 * the new i_size.  again, this can only find pages to
-                 * purge if the PW lock that generated them is still held.
-                 */
-                if ( attr->ia_size > inode->i_size ) {
-                        filemap_fdatasync(inode->i_mapping);
-                        filemap_fdatawait(inode->i_mapping);
+                 *
+                 * If this file doesn't have stripes yet, it is already,
+                 * by definition, truncated. */
+                if ((attr->ia_valid & ATTR_FROM_OPEN) && lsm == NULL) {
+                        LASSERT(attr->ia_size == 0);
+                        GOTO(skip_extent_lock, rc = 0);
+                }
+
+                /* we really need to get our PW lock before we change
+                 * inode->i_size.  if we don't we can race with other
+                 * i_size updaters on our node, like ll_file_read.  we
+                 * can also race with i_size propogation to other
+                 * nodes through dirtying and writeback of final cached
+                 * pages.  this last one is especially bad for racing
+                 * o_append users on other nodes. */
+                rc = ll_extent_lock_no_validate(NULL, inode, lsm, LCK_PW,
+                                                &extent, &lockh);
+                if (rc != ELDLM_OK) {
+                        if (rc > 0)
+                                RETURN(-ENOLCK);
+                        RETURN(rc);
                 }
-                err = vmtruncate(inode, attr->ia_size);
+
+                rc = vmtruncate(inode, attr->ia_size);
+                if (rc == 0)
+                        set_bit(LLI_F_HAVE_SIZE_LOCK,
+                                &ll_i2info(inode)->lli_flags);
+
+                /* unlock now as we don't mind other file lockers racing with
+                 * the mds updates below? */
+                err = ll_extent_unlock(NULL, inode, lsm, LCK_PW, &lockh);
                 if (err)
-                        RETURN(err);
+                        CERROR("ll_extent_unlock failed: %d\n", err);
+                if (rc)
+                        RETURN(rc);
         }
 
+skip_extent_lock:
         /* Don't send size changes to MDS to avoid "fast EA" problems, and
          * also avoid a pointless RPC (we get file size from OST anyways).
          */
@@ -493,18 +544,25 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
         if (!attr->ia_valid)
                 RETURN(0);
 
-        err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, NULL, 0,
-                          &request);
+        ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
+
+        err = mdc_setattr(&sbi->ll_mdc_conn, &op_data,
+                          attr, NULL, 0, &request);
         if (err)
                 CERROR("mdc_setattr fails: err = %d\n", err);
 
         ptlrpc_req_finished(request);
 
-        if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_MTIME_SET) {
+        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_MTIME_SET)) {
                 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
                 struct obdo oa;
                 int err2;
 
+                if (lsm == NULL) {
+                        CDEBUG(D_INODE, "no lsm: not setting mtime on OSTs\n");
+                        RETURN(err);
+                }
+
                 CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n",
                        inode->i_ino, attr->ia_mtime);
                 oa.o_id = lsm->lsm_object_id;
@@ -524,8 +582,7 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
 int ll_setattr(struct dentry *de, struct iattr *attr)
 {
         int rc = inode_change_ok(de->d_inode, attr);
-
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s\n", de->d_name.name);
         if (rc)
                 return rc;
 
@@ -539,7 +596,7 @@ static int ll_statfs(struct super_block *sb, struct statfs *sfs)
         int rc;
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
+        CDEBUG(D_VFSTRACE, "VFS Op:\n");
         memset(sfs, 0, sizeof(*sfs));
         rc = obd_statfs(&sbi->ll_mdc_conn, &osfs);
         statfs_unpack(sfs, &osfs);
@@ -570,33 +627,68 @@ static int ll_statfs(struct super_block *sb, struct statfs *sfs)
                         osfs.os_bfree >>= 1;
                         osfs.os_bavail >>= 1;
                 }
+
                 sfs->f_blocks = osfs.os_blocks;
                 sfs->f_bfree = osfs.os_bfree;
                 sfs->f_bavail = osfs.os_bavail;
-                if (osfs.os_ffree < (__u64)sfs->f_ffree)
+
+                /* If we don't have as many objects free on the OST as inodes
+                 * on the MDS, we reduce the total number of inodes to
+                 * compensate, so that the "inodes in use" number is correct.
+                 */
+                if (osfs.os_ffree < (__u64)sfs->f_ffree) {
+                        sfs->f_files = (sfs->f_files - sfs->f_ffree) +
+                                       osfs.os_ffree;
                         sfs->f_ffree = osfs.os_ffree;
+                }
         }
 
 out:
         RETURN(rc);
 }
 
+void dump_lsm(int level, struct lov_stripe_md *lsm)
+{
+        CDEBUG(level, "objid "LPX64", maxbytes "LPX64", magic %#08x, "
+               "stripe_size %#08x, offset %u, stripe_count %u\n",
+               lsm->lsm_object_id, lsm->lsm_maxbytes, lsm->lsm_magic,
+               lsm->lsm_stripe_size, lsm->lsm_stripe_offset,
+               lsm->lsm_stripe_count);
+}
+
 void ll_update_inode(struct inode *inode, struct mds_body *body,
-                     struct lov_mds_md *lmm)
+                     struct lov_stripe_md *lsm)
 {
         struct ll_inode_info *lli = ll_i2info(inode);
 
-        if (lmm != NULL)
-                obd_unpackmd(ll_i2obdconn(inode), &lli->lli_smd, lmm);
+        LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
+        if (lsm != NULL) {
+                if (lli->lli_smd == NULL) {
+                        lli->lli_maxbytes = lsm->lsm_maxbytes;
+                        if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES)
+                                lli->lli_maxbytes = PAGE_CACHE_MAXBYTES;
+                        lli->lli_smd = lsm;
+                } else {
+                        if (memcmp(lli->lli_smd, lsm, sizeof(*lsm))) {
+                                CERROR("lsm mismatch for inode %ld\n",
+                                       inode->i_ino);
+                                CERROR("lli_smd:\n");
+                                dump_lsm(D_ERROR, lli->lli_smd);
+                                CERROR("lsm:\n");
+                                dump_lsm(D_ERROR, lsm);
+                                LBUG();
+                        }
+                }
+        }
 
         if (body->valid & OBD_MD_FLID)
                 inode->i_ino = body->ino;
         if (body->valid & OBD_MD_FLATIME)
-                inode->i_atime = body->atime;
+                LTIME_S(inode->i_atime) = body->atime;
         if (body->valid & OBD_MD_FLMTIME)
-                inode->i_mtime = body->mtime;
+                LTIME_S(inode->i_mtime) = body->mtime;
         if (body->valid & OBD_MD_FLCTIME)
-                inode->i_ctime = body->ctime;
+                LTIME_S(inode->i_ctime) = body->ctime;
         if (body->valid & OBD_MD_FLMODE)
                 inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT);
         if (body->valid & OBD_MD_FLTYPE)
@@ -625,37 +717,22 @@ static void ll_read_inode2(struct inode *inode, void *opaque)
         struct mds_body *body = lic->lic_body;
         struct ll_inode_info *lli = ll_i2info(inode);
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
+               inode->i_generation, inode);
 
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
         sema_init(&lli->lli_open_sem, 1);
-        atomic_set(&lli->lli_open_count, 0);
-        lli->lli_flags = 0;
-        init_MUTEX(&lli->lli_getattr_sem);
         spin_lock_init(&lli->lli_read_extent_lock);
         INIT_LIST_HEAD(&lli->lli_read_extents);
+        ll_lldo_init(&lli->lli_dirty);
+        lli->lli_flags = 0;
+        /* We default to 2T-4k until the LSM is created/read, at which point
+         * it'll be updated. */
+        lli->lli_maxbytes = LUSTRE_STRIPE_MAXBYTES;
 
         LASSERT(!lli->lli_smd);
 
         /* core attributes from the MDS first */
-        ll_update_inode(inode, body, lic ? lic->lic_lmm : NULL);
-
-        /* Get the authoritative file size */
-        if (lli->lli_smd && (inode->i_mode & S_IFREG)) {
-                struct ldlm_extent extent = {0, OBD_OBJECT_EOF};
-                struct lustre_handle lockh = {0, 0};
-                struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
-                ldlm_error_t rc;
-
-                LASSERT(lli->lli_smd->lsm_object_id != 0);
-
-                rc = ll_extent_lock(NULL, inode, lsm, LCK_PR, &extent, &lockh);
-                if (rc != ELDLM_OK && rc != ELDLM_LOCK_MATCHED) {
-                        ll_clear_inode(inode);
-                        make_bad_inode(inode);
-                } else {
-                        ll_extent_unlock(NULL, inode, lsm, LCK_PR, &lockh);
-                }
-        }
+        ll_update_inode(inode, body, lic->lic_lsm);
 
         /* OIDEBUG(inode); */
 
@@ -679,41 +756,30 @@ static void ll_read_inode2(struct inode *inode, void *opaque)
         }
 }
 
-static inline void invalidate_request_list(struct list_head *req_list)
-{
-        struct list_head *tmp, *n;
-        list_for_each_safe(tmp, n, req_list) {
-                struct ptlrpc_request *req =
-                        list_entry(tmp, struct ptlrpc_request, rq_list);
-                CERROR("invalidating req xid "LPU64" op %d to %s:%d\n",
-                       req->rq_xid, req->rq_reqmsg->opc,
-                       req->rq_connection->c_remote_uuid.uuid,
-                       req->rq_import->imp_client->cli_request_portal);
-                req->rq_flags |= PTL_RPC_FL_ERR;
-                wake_up(&req->rq_wait_for_rep);
-        }
-}
-
 void ll_umount_begin(struct super_block *sb)
 {
         struct ll_sb_info *sbi = ll_s2sbi(sb);
-        struct list_head *ctmp;
+        struct obd_device *obd;
+        struct obd_ioctl_data ioc_data = { 0 };
 
         ENTRY;
-        CDEBUG(D_VFSTRACE, "VFS Op\n");
-
-        list_for_each(ctmp, &sbi->ll_conn_chain) {
-                struct ptlrpc_connection *conn;
-                conn = list_entry(ctmp, struct ptlrpc_connection, c_sb_chain);
-
-                spin_lock(&conn->c_lock);
-                /* XXX should just be dealing with imports, probably through
-                 * XXX iocontrol, need next-gen recovery! */
-                conn->c_flags |= CONN_INVALID;
-                /* invalidate_request_list(&conn->c_sending_head); */
-                invalidate_request_list(&conn->c_delayed_head);
-                spin_unlock(&conn->c_lock);
-        }
+        CDEBUG(D_VFSTRACE, "VFS Op:\n");
+
+        obd = class_conn2obd(&sbi->ll_mdc_conn);
+        obd->obd_no_recov = 1;
+        obd_iocontrol(IOC_OSC_SET_ACTIVE, &sbi->ll_mdc_conn, sizeof ioc_data,
+                      &ioc_data, NULL);
+
+        obd = class_conn2obd(&sbi->ll_osc_conn);
+        obd->obd_no_recov = 1;
+        obd_iocontrol(IOC_OSC_SET_ACTIVE, &sbi->ll_osc_conn, sizeof ioc_data,
+                      &ioc_data, NULL);
+
+        /* Really, we'd like to wait until there are no requests outstanding,
+         * and then continue.  For now, we just invalidate the requests,
+         * schedule, and hope.
+         */
+        schedule();
 
         EXIT;
 }
index f296d10..680c47f 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/lprocfs_status.h>
+#include "llite_internal.h"
 
 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
 #include <asm/statfs.h>
@@ -136,6 +137,7 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent)
         struct obd_uuid param_uuid;
 
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:\n");
 
         OBD_ALLOC(sbi, sizeof(*sbi));
         if (!sbi)
@@ -167,24 +169,22 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent)
                 GOTO(out_free, sb = NULL);
         }
 
-        err = obd_connect(&sbi->ll_mdc_conn, obd, &sbi->ll_sb_uuid,
-                          ptlrpc_recovd, ll_recover);
+        err = obd_connect(&sbi->ll_mdc_conn, obd, &sbi->ll_sb_uuid);
         if (err) {
                 CERROR("cannot connect to %s: rc = %d\n", mdc, err);
                 GOTO(out_free, sb = NULL);
         }
 
-        mdc_conn = sbi2mdc(sbi)->cl_import.imp_connection;
-        list_add(&mdc_conn->c_sb_chain, &sbi->ll_conn_chain);
+        mdc_conn = sbi2mdc(sbi)->cl_import->imp_connection;
+        strncpy(param_uuid.uuid, osc, sizeof(param_uuid.uuid));
 
-        obd = class_uuid2obd(osc);
+        obd = class_uuid2obd(&param_uuid);
         if (!obd) {
                 CERROR("OSC %s: not setup or attached\n", osc);
                 GOTO(out_mdc, sb = NULL);
         }
 
-        err = obd_connect(&sbi->ll_osc_conn, obd, &sbi->ll_sb_uuid,
-                          ptlrpc_recovd, ll_recover);
+        err = obd_connect(&sbi->ll_osc_conn, obd, &sbi->ll_sb_uuid);
         if (err) {
                 CERROR("cannot connect to %s: rc = %d\n", osc, err);
                 GOTO(out_mdc, sb = NULL);
@@ -193,7 +193,7 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent)
         err = mdc_getstatus(&sbi->ll_mdc_conn, &rootfid);
         if (err) {
                 CERROR("cannot mds_connect: rc = %d\n", err);
-                GOTO(out_mdc, sb = NULL);
+                GOTO(out_osc, sb = NULL);
         }
         CDEBUG(D_SUPER, "rootfid "LPU64"\n", rootfid.id);
         sbi->ll_rootino = rootfid.id;
@@ -203,16 +203,17 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent)
         sb->s_blocksize = osfs.os_bsize;
         sb->s_blocksize_bits = log2(osfs.os_bsize);
         sb->s_magic = LL_SUPER_MAGIC;
-        sb->s_maxbytes = (1ULL << (32 + 9)) - osfs.os_bsize;
+        sb->s_maxbytes = PAGE_CACHE_MAXBYTES;
 
         sb->s_op = &ll_super_operations;
 
-        /* make root inode */
-        err = mdc_getattr(&sbi->ll_mdc_conn, sbi->ll_rootino, S_IFDIR,
+        /* make root inode 
+         * XXX: move this to after cbd setup? */
+        err = mdc_getattr(&sbi->ll_mdc_conn, &rootfid,
                           OBD_MD_FLNOTOBD|OBD_MD_FLBLOCKS, 0, &request);
         if (err) {
                 CERROR("mdc_getattr failed for root: rc = %d\n", err);
-                GOTO(out_request, sb = NULL);
+                GOTO(out_osc, sb = NULL);
         }
 
         /* initialize committed transaction callback daemon */
@@ -223,25 +224,30 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent)
         err = ll_commitcbd_setup(sbi);
         if (err) {
                 CERROR("failed to start commit callback daemon: rc = %d\n",err);
-                GOTO(out_request, sb = NULL);
+                ptlrpc_req_finished (request);
+                GOTO(out_osc, sb = NULL);
         }
 
-        lic.lic_body = lustre_msg_buf(request->rq_repmsg, 0);
-        lic.lic_lmm = NULL;
+        lic.lic_body = lustre_msg_buf(request->rq_repmsg, 0, sizeof (*lic.lic_body));
+        LASSERT (lic.lic_body != NULL);         /* checked by mdc_getattr() */
+        LASSERT_REPSWABBED (request, 0);        /* swabbed by mdc_getattr() */
+
+        lic.lic_lsm = NULL;
+
         root = iget5_locked(sb, sbi->ll_rootino, NULL,
                             ll_read_inode2, &lic);
 
-        if (root) {
-                sb->s_root = d_alloc_root(root);
-                root->i_state &= ~(I_LOCK | I_NEW);
-        } else {
-                CERROR("lustre_lite: bad iget4 for root\n");
-                GOTO(out_cdb, sb = NULL);
-        }
-
         ptlrpc_req_finished(request);
-        request = NULL;
 
+        if (root == NULL || is_bad_inode(root)) {
+                /* XXX might need iput() for bad inode */
+                CERROR("lustre_lite: bad iget5 for root\n");
+                GOTO(out_cbd, sb = NULL);
+        }
+
+        sb->s_root = d_alloc_root(root);
+        root->i_state &= ~(I_LOCK | I_NEW);
+        printk("AMRUT 1\n");
         if (proc_lustre_fs_root) {
                 err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb,
                                                   osc, mdc);
@@ -254,22 +260,88 @@ out_dev:
                 OBD_FREE(mdc, strlen(mdc) + 1);
         if (osc)
                 OBD_FREE(osc, strlen(osc) + 1);
+        printk("AMRUT 2\n");
 
         RETURN(0);
 
-out_cdb:
+out_cbd:
         ll_commitcbd_cleanup(sbi);
-out_request:
-        ptlrpc_req_finished(request);
-        obd_disconnect(&sbi->ll_osc_conn);
+out_osc:
+        obd_disconnect(&sbi->ll_osc_conn, 0);
 out_mdc:
-        obd_disconnect(&sbi->ll_mdc_conn);
+        obd_disconnect(&sbi->ll_mdc_conn, 0);
 out_free:
         OBD_FREE(sbi, sizeof(*sbi));
 
         goto out_dev;
 } /* ll_fill_super */
 
+
+int ll_setattr_raw(struct inode *inode, struct iattr *attr)
+{
+        struct ptlrpc_request *request = NULL;
+        struct ll_sb_info *sbi = ll_i2sbi(inode);
+        struct mdc_op_data op_data;
+        int err = 0;
+        ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", inode->i_ino);
+
+        if ((attr->ia_valid & ATTR_SIZE)) {
+                /* writeback uses inode->i_size to determine how far out
+                 * its cached pages go.  ll_truncate gets a PW lock, canceling
+                 * our lock, _after_ it has updated i_size.  this can confuse
+                 * us into zero extending the file to the newly truncated
+                 * size, and this has bad implications for a racing o_append.
+                 * if we're extending our size we need to flush the pages
+                 * with the correct i_size before vmtruncate stomps on
+                 * the new i_size.  again, this can only find pages to
+                 * purge if the PW lock that generated them is still held.
+                 */
+                if ( attr->ia_size > inode->i_size ) {
+                        filemap_fdatasync(inode->i_mapping);
+                        filemap_fdatawait(inode->i_mapping);
+                }
+                err = vmtruncate(inode, attr->ia_size);
+                if (err)
+                        RETURN(err);
+        }
+
+        /* Don't send size changes to MDS to avoid "fast EA" problems, and
+         * also avoid a pointless RPC (we get file size from OST anyways).
+         */
+        attr->ia_valid &= ~ATTR_SIZE;
+        if (!attr->ia_valid)
+                RETURN(0);
+
+        ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
+
+        err = mdc_setattr(&sbi->ll_mdc_conn, &op_data,
+                          attr, NULL, 0, &request);
+        if (err)
+                CERROR("mdc_setattr fails: err = %d\n", err);
+
+        ptlrpc_req_finished(request);
+
+        if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_MTIME_SET) {
+                struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
+                struct obdo oa;
+                int err2;
+
+                CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n",
+                       inode->i_ino, attr->ia_mtime);
+                oa.o_id = lsm->lsm_object_id;
+                oa.o_mode = S_IFREG;
+                oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMTIME;
+                oa.o_mtime = LTIME_S(attr->ia_mtime);
+                err2 = obd_setattr(&sbi->ll_osc_conn, &oa, lsm, NULL);
+                if (err2) {
+                        CERROR("obd_setattr fails: rc=%d\n", err);
+                        if (!err)
+                                err = err2;
+                }
+        }
+        RETURN(err);
+}
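
The ordering argument in the ATTR_SIZE comment above (flush while i_size still holds the old value, then let truncate publish the new one) can be sketched on its own. Everything below is a user-space stand-in; struct file_state and the flush/wait helpers are placeholders, not kernel or Lustre APIs.

    #include <stdio.h>

    struct file_state { long long i_size; };

    static void flush_dirty_pages(struct file_state *f)
    { printf("flush dirty pages, i_size still %lld\n", f->i_size); }

    static void wait_for_writeback(struct file_state *f)
    { printf("writeback done at i_size %lld\n", f->i_size); }

    static void set_size(struct file_state *f, long long new_size)
    {
            if (new_size > f->i_size) {     /* extending: flush under the old size */
                    flush_dirty_pages(f);
                    wait_for_writeback(f);
            }
            f->i_size = new_size;           /* the step vmtruncate() performs */
    }

    int main(void)
    {
            struct file_state f = { 4096 };
            set_size(&f, 8192);             /* pages are flushed before the size grows */
            return 0;
    }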
 struct super_block * ll_get_sb(struct file_system_type *fs_type,
                                int flags, char *devname, void * data)
 {
@@ -282,10 +354,11 @@ static void ll_put_super(struct super_block *sb)
         struct list_head *tmp, *next;
         struct ll_fid rootfid;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:\n");
 
         list_del(&sbi->ll_conn_chain);
         ll_commitcbd_cleanup(sbi);
-        obd_disconnect(&sbi->ll_osc_conn);
+        obd_disconnect(&sbi->ll_osc_conn, 0);
 
         /* NULL request to force sync on the MDS, and get the last_committed
          * value to flush remaining RPCs from the pending queue on client.
@@ -300,7 +373,7 @@ static void ll_put_super(struct super_block *sb)
         sbi->ll_proc_root = NULL;
         }
 
-        obd_disconnect(&sbi->ll_mdc_conn);
+        obd_disconnect(&sbi->ll_mdc_conn, 0);
 
         spin_lock(&dcache_lock);
         list_for_each_safe(tmp, next, &sbi->ll_orphan_dentry_list){
@@ -320,12 +393,13 @@ static void ll_clear_inode(struct inode *inode)
         struct ll_inode_info *lli = ll_i2info(inode);
         int rc;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", inode->i_ino);
 
 #warning "Is there a reason we don't do this in 2.5, but we do in 2.4?"
 #if 0
-        rc = mdc_cancel_unused(&sbi->ll_mdc_conn, inode, LDLM_FL_NO_CALLBACK);
+        rc = ll_mdc_cancel_unused(&sbi->ll_mdc_conn, inode, LDLM_FL_NO_CALLBACK);
         if (rc < 0) {
-                CERROR("mdc_cancel_unused: %d\n", rc);
+                CERROR("ll_mdc_cancel_unused: %d\n", rc);
                 /* XXX FIXME do something dramatic */
         }
 
@@ -342,8 +416,10 @@ static void ll_clear_inode(struct inode *inode)
                 CERROR("clearing in-use inode %lu: count = %d\n",
                        inode->i_ino, atomic_read(&inode->i_count));
 
-        if (lli->lli_smd)
+        if (lli->lli_smd) {
                 obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd);
+                lli->lli_smd = NULL;
+        }
 
         if (lli->lli_symlink_name) {
                 OBD_FREE(lli->lli_symlink_name,strlen(lli->lli_symlink_name)+1);
@@ -357,6 +433,7 @@ static void ll_clear_inode(struct inode *inode)
 static void ll_delete_inode(struct inode *inode)
 {
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", inode->i_ino);
         if (S_ISREG(inode->i_mode)) {
                 int err;
                 struct obdo *oa;
@@ -399,6 +476,10 @@ static int ll_attr2inode(struct inode * inode, struct iattr * attr, int trunc)
         int error = 0;
 
         if ((ia_valid & ATTR_SIZE) && trunc) {
+                if (attr->ia_size > ll_file_maxbytes(inode)) {
+                        error = -EFBIG;
+                        goto out;
+                }
                 error = vmtruncate(inode, attr->ia_size);
                 if (error)
                         goto out;
@@ -433,15 +514,21 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc)
         ENTRY;
 
         /* change incore inode */
-        ll_attr2inode(inode, attr, do_trunc);
+        err = ll_attr2inode(inode, attr, do_trunc);
+        if (err)
+                RETURN(err);
 
         /* Don't send size changes to MDS to avoid "fast EA" problems, and
          * also avoid a pointless RPC (we get file size from OST anyways).
          */
         attr->ia_valid &= ~ATTR_SIZE;
         if (attr->ia_valid) {
-                err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, NULL, 0,
-                                  &request);
+                struct mdc_op_data op_data;
+
+                ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
+
+                err = mdc_setattr(&sbi->ll_mdc_conn, &op_data,
+                                  attr, NULL, 0, &request);
                 if (err)
                         CERROR("mdc_setattr fails: err = %d\n", err);
 
@@ -455,7 +542,7 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc)
                         oa.o_id = lsm->lsm_object_id;
                         oa.o_mode = S_IFREG;
                         oa.o_valid = OBD_MD_FLID |OBD_MD_FLTYPE |OBD_MD_FLMTIME;
-                        oa.o_mtime = attr->ia_mtime.tv_sec;
+                        oa.o_mtime = LTIME_S(attr->ia_mtime);
                         err2 = obd_setattr(&sbi->ll_osc_conn, &oa, lsm, NULL);
                         if (err2) {
                                 CERROR("obd_setattr fails: rc=%d\n", err);
@@ -471,7 +558,7 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc)
 int ll_setattr(struct dentry *de, struct iattr *attr)
 {
         int rc = inode_change_ok(de->d_inode, attr);
-
+        CDEBUG(D_VFSTRACE, "VFS Op:name=%s\n", de->d_name.name);
         if (rc)
                 return rc;
 
@@ -484,6 +571,7 @@ static int ll_statfs(struct super_block *sb, struct statfs *sfs)
         struct obd_statfs osfs;
         int rc;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:\n");
 
         memset(sfs, 0, sizeof(*sfs));
         rc = obd_statfs(&sbi->ll_mdc_conn, &osfs);
@@ -518,8 +606,11 @@ static int ll_statfs(struct super_block *sb, struct statfs *sfs)
                 sfs->f_blocks = osfs.os_blocks;
                 sfs->f_bfree = osfs.os_bfree;
                 sfs->f_bavail = osfs.os_bavail;
-                if (osfs.os_ffree < (__u64)sfs->f_ffree)
+                if (osfs.os_ffree < (__u64)sfs->f_ffree) {
+                        sfs->f_files = (sfs->f_files - sfs->f_ffree) +
+                                       osfs.os_ffree;
                         sfs->f_ffree = osfs.os_ffree;
+                }
         }
 
 out:
@@ -527,21 +618,30 @@ out:
 }
 
 void ll_update_inode(struct inode *inode, struct mds_body *body,
-                     struct lov_mds_md *lmm)
+                     struct lov_stripe_md *lsm)
 {
         struct ll_inode_info *lli = ll_i2info(inode);
 
-        if (lmm != NULL)
-                obd_unpackmd(ll_i2obdconn(inode), &lli->lli_smd, lmm);
+        LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
+        if (lsm != NULL) {
+                if (lli->lli_smd == NULL) {
+                        lli->lli_smd = lsm;
+                        lli->lli_maxbytes = lsm->lsm_maxbytes;
+                        if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES)
+                                lli->lli_maxbytes = PAGE_CACHE_MAXBYTES;
+                } else {
+                        LASSERT (!memcmp (lli->lli_smd, lsm, sizeof (*lsm)));
+                }
+        }
 
         if (body->valid & OBD_MD_FLID)
                 inode->i_ino = body->ino;
         if (body->valid & OBD_MD_FLATIME)
-                inode->i_atime.tv_sec = body->atime;
+                LTIME_S(inode->i_atime) = body->atime;
         if (body->valid & OBD_MD_FLMTIME)
-                inode->i_mtime.tv_sec = body->mtime;
+                LTIME_S(inode->i_mtime) = body->mtime;
         if (body->valid & OBD_MD_FLCTIME)
-                inode->i_ctime.tv_sec = body->ctime;
+                LTIME_S(inode->i_ctime) = body->ctime;
         if (body->valid & OBD_MD_FLMODE)
                 inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT);
         if (body->valid & OBD_MD_FLTYPE)
@@ -571,36 +671,20 @@ int ll_read_inode2(struct inode *inode, void *opaque)
         struct ll_inode_info *lli = ll_i2info(inode);
         int rc = 0;
         ENTRY;
+        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", inode->i_ino);
 
         sema_init(&lli->lli_open_sem, 1);
-        lli->flags = 0;
-        init_MUTEX(&lli->lli_getattr_sem);
         /* these are 2.4 only, but putting them here for consistency.. */
         spin_lock_init(&lli->lli_read_extent_lock);
         INIT_LIST_HEAD(&lli->lli_read_extents);
+        ll_lldo_init(&lli->lli_dirty);
+        lli->lli_flags = 0;
+        lli->lli_maxbytes = LUSTRE_STRIPE_MAXBYTES;
 
         LASSERT(!lli->lli_smd);
 
         /* core attributes first */
-        ll_update_inode(inode, body, lic ? lic->lic_lmm : NULL);
-
-        /* Get the authoritative file size */
-        if (lli->lli_smd && S_ISREG(inode->i_mode)) {
-                struct ll_file_data *fd = file->private_data;
-                struct ldlm_extent extent = {0, OBD_OBJECT_EOF};
-                struct lustre_handle lockh = {0, 0};
-
-                LASSERT(lli->lli_smd->lsm_object_id != 0);
-
-                rc = ll_extent_lock(fd, inode, lsm, LCK_PR, &extent, &lockh);
-                if (err != ELDLM_OK && err != ELDLM_MATCHED) {
-                        ll_clear_inode(inode);
-                        make_bad_inode(inode);
-                } else {
-                        l_extent_unlock(fd, inode, lsm, LCK_PR, &extent,
-                                        &lockh);
-                }
-        }
+        ll_update_inode(inode, body, lic ? lic->lic_lsm : NULL);
 
         /* OIDEBUG(inode); */
 
@@ -618,6 +702,7 @@ int ll_read_inode2(struct inode *inode, void *opaque)
                 inode->i_op = &ll_fast_symlink_inode_operations;
                 EXIT;
         } else {
+                inode->i_op = &ll_special_inode_operations;
                 init_special_inode(inode, inode->i_mode,
                                    kdev_t_to_nr(inode->i_rdev));
                 EXIT;
@@ -626,62 +711,56 @@ int ll_read_inode2(struct inode *inode, void *opaque)
         return rc;
 }
 
-static inline void invalidate_request_list(struct list_head *req_list)
-{
-        struct list_head *tmp, *n;
-        list_for_each_safe(tmp, n, req_list) {
-                struct ptlrpc_request *req =
-                        list_entry(tmp, struct ptlrpc_request, rq_list);
-                CERROR("invalidating req xid %d op %d to %s:%d\n",
-                       (unsigned long long)req->rq_xid, req->rq_reqmsg->opc,
-                       req->rq_connection->c_remote_uuid,
-                       req->rq_import->imp_client->cli_request_portal);
-                req->rq_flags |= PTL_RPC_FL_ERR;
-                wake_up(&req->rq_wait_for_rep);
-        }
-}
 
 void ll_umount_begin(struct super_block *sb)
 {
         struct ll_sb_info *sbi = ll_s2sbi(sb);
-        struct list_head *ctmp;
+        struct obd_device *obd;
+        struct obd_ioctl_data ioc_data = { 0 };
 
         ENTRY;
-
-        list_for_each(ctmp, &sbi->ll_conn_chain) {
-                struct ptlrpc_connection *conn;
-                conn = list_entry(ctmp, struct ptlrpc_connection, c_sb_chain);
-
-                spin_lock(&conn->c_lock);
-                conn->c_flags |= CONN_INVALID;
-                /*invalidate_request_list(&conn->c_sending_head);*/
-                invalidate_request_list(&conn->c_delayed_head);
-                spin_unlock(&conn->c_lock);
-        }
+        CDEBUG(D_VFSTRACE, "VFS Op:\n");
+
+        obd = class_conn2obd(&sbi->ll_mdc_conn);
+        obd->obd_no_recov = 1;
+        obd_iocontrol(IOC_OSC_SET_ACTIVE, &sbi->ll_mdc_conn, sizeof ioc_data,
+                      &ioc_data, NULL);
+
+        obd = class_conn2obd(&sbi->ll_osc_conn);
+        obd->obd_no_recov = 1;
+        obd_iocontrol(IOC_OSC_SET_ACTIVE, &sbi->ll_osc_conn, sizeof ioc_data,
+                      &ioc_data, NULL);
+        
+        /* Really, we'd like to wait until there are no requests outstanding,
+         * and then continue.  For now, we just invalidate the requests,
+         * schedule, and hope.
+         */
+        schedule();
 
         EXIT;
 }
 
-
 static kmem_cache_t *ll_inode_cachep;
 
 static struct inode *ll_alloc_inode(struct super_block *sb)
 {
         struct ll_inode_info *lli;
-        lli = kmem_cache_alloc(ll_inode_cachep, SLAB_KERNEL);
-        if (!lli)
+        OBD_SLAB_ALLOC(lli, ll_inode_cachep, SLAB_KERNEL, sizeof *lli);
+        if (lli == NULL)
                 return NULL;
 
         memset(lli, 0, (char *)&lli->lli_vfs_inode - (char *)lli);
         sema_init(&lli->lli_open_sem, 1);
         init_MUTEX(&lli->lli_size_valid_sem);
+        lli->lli_maxbytes = LUSTRE_STRIPE_MAXBYTES;
 
         return &lli->lli_vfs_inode;
 }
 
 static void ll_destroy_inode(struct inode *inode)
 {
-        kmem_cache_free(ll_inode_cachep, ll_i2info(inode));
+        OBD_SLAB_FREE(ll_inode_cachep, ll_i2info(inode),
+                      sizeof(struct ll_inode_info));
 }
 
 static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
index 6ebe7de..19d234e 100644 (file)
@@ -36,6 +36,8 @@ static int ll_readlink_internal(struct inode *inode,
 {
         struct ll_inode_info *lli = ll_i2info(inode);
         struct ll_sb_info *sbi = ll_i2sbi(inode);
+        struct ll_fid fid;
+        struct mds_body *body;
         int rc, symlen = inode->i_size + 1;
         ENTRY;
 
@@ -47,14 +49,38 @@ static int ll_readlink_internal(struct inode *inode,
                 RETURN(0);
         }
 
-        rc = mdc_getattr(&sbi->ll_mdc_conn, inode->i_ino, S_IFLNK,
+        ll_inode2fid(&fid, inode);
+        rc = mdc_getattr(&sbi->ll_mdc_conn, &fid,
                          OBD_MD_LINKNAME, symlen, request);
         if (rc) {
                 CERROR("inode %lu: rc = %d\n", inode->i_ino, rc);
                 RETURN(rc);
         }
 
-        *symname = lustre_msg_buf((*request)->rq_repmsg, 1);
+        body = lustre_msg_buf ((*request)->rq_repmsg, 0, sizeof (*body));
+        LASSERT (body != NULL);
+        LASSERT_REPSWABBED (*request, 0);
+
+        if ((body->valid & OBD_MD_LINKNAME) == 0) {
+                CERROR ("OBD_MD_LINKNAME not set on reply\n");
+                GOTO (failed, rc = -EPROTO);
+        }
+        
+        LASSERT (symlen != 0);
+        if (body->eadatasize != symlen) {
+                CERROR ("inode %lu: symlink length %d not expected %d\n",
+                        inode->i_ino, body->eadatasize - 1, symlen - 1);
+                GOTO (failed, rc = -EPROTO);
+        }
+
+        *symname = lustre_msg_buf ((*request)->rq_repmsg, 1, symlen);
+        if (*symname == NULL ||
+            strnlen (*symname, symlen) != symlen - 1) {
+                /* not full/NULL terminated */
+                CERROR ("inode %lu: symlink not NULL terminated string "
+                        "of length %d\n", inode->i_ino, symlen - 1);
+                GOTO (failed, rc = -EPROTO);
+        }
 
         OBD_ALLOC(lli->lli_symlink_name, symlen);
         /* do not return an error if we cannot cache the symlink locally */
@@ -62,6 +88,10 @@ static int ll_readlink_internal(struct inode *inode,
                 memcpy(lli->lli_symlink_name, *symname, symlen);
 
         RETURN(0);
+
+ failed:
+        ptlrpc_req_finished (*request);
+        RETURN (-EPROTO);
 }
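
The reply checks added above follow one pattern: a wire string is trusted only if its buffer is present, exactly the advertised length, and NUL terminated; anything else is treated as -EPROTO. A stand-alone sketch of that check (check_wire_string is an illustrative name, not a Lustre function):

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>

    static int check_wire_string(const char *buf, size_t buflen)
    {
            if (buf == NULL || buflen == 0)
                    return -EPROTO;                 /* buffer missing or empty */
            if (strnlen(buf, buflen) != buflen - 1)
                    return -EPROTO;                 /* not exactly NUL terminated */
            return 0;
    }

    int main(void)
    {
            char good[] = "target";                 /* 6 chars + NUL, buflen 7 */
            printf("good=%d truncated=%d\n",
                   check_wire_string(good, sizeof(good)),
                   check_wire_string(good, sizeof(good) - 1));
            return 0;
    }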
 
 static int ll_readlink(struct dentry *dentry, char *buffer, int buflen)
@@ -81,10 +111,9 @@ static int ll_readlink(struct dentry *dentry, char *buffer, int buflen)
                 GOTO(out, rc);
 
         rc = vfs_readlink(dentry, buffer, buflen, symname);
+        ptlrpc_req_finished(request);
  out:
         up(&lli->lli_open_sem);
-        ptlrpc_req_finished(request);
-
         RETURN(rc);
 }
 
@@ -119,9 +148,8 @@ static int ll_follow_link(struct dentry *dentry, struct nameidata *nd,
         }
 
         rc = vfs_follow_link_it(nd, symname, it);
- out:
         ptlrpc_req_finished(request);
-
+ out:
         RETURN(rc);
 }
 #else
@@ -149,9 +177,9 @@ static int ll_follow_link(struct dentry *dentry, struct nameidata *nd)
         nd->it.it_mode = mode;
 
         rc = vfs_follow_link(nd, symname);
+        ptlrpc_req_finished(request);
  out:
         up(&lli->lli_open_sem);
-        ptlrpc_req_finished(request);
 
         RETURN(rc);
 }
index 6b647b4..879e44d 100644 (file)
@@ -7,18 +7,12 @@ DEFS=
 
 if LIBLUSTRE
 lib_LIBRARIES = liblov.a
-LINX=client.c
-liblov_a_SOURCES = lov_obd.c lov_pack.c $(LINX)
+liblov_a_SOURCES = lov_obd.c lov_pack.c
 else
 MODULE = lov
 modulefs_DATA = lov.o
 EXTRA_PROGRAMS = lov
-LINX=client.c
-lov_SOURCES = lov_obd.c lov_pack.c lproc_lov.c $(LINX)
+lov_SOURCES = lov_obd.c lov_pack.c lproc_lov.c
 endif
 
-
-client.c: 
-       test -e client.c || ln -sf $(top_srcdir)/lib/client.c
-
 include $(top_srcdir)/Rules
index 19738b9..1a4f6c4 100644 (file)
 #include <linux/obd_lov.h>
 #include <linux/lprocfs_status.h>
 
-static kmem_cache_t *lov_file_cache;
-
 struct lov_file_handles {
+        struct portals_handle lfh_handle;
+        atomic_t lfh_refcount;
         struct list_head lfh_list;
-        __u64 lfh_cookie;
         int lfh_count;
-        char *lfh_data; /* an array of opaque data saved on behalf of
-                        * each osc, FD_OSTDATA_SIZE bytes for each */
+        struct obd_client_handle *lfh_och;
 };
 
 struct lov_lock_handles {
-        __u64 llh_cookie;
+        struct portals_handle llh_handle;
+        atomic_t llh_refcount;
+        int llh_stripe_count;
         struct lustre_handle llh_handles[0];
 };
 
-extern int lov_packmd(struct lustre_handle *conn, struct lov_mds_md **lmm,
-                       struct lov_stripe_md *lsm);
-extern int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsm,
-                         struct lov_mds_md *lmm);
-extern int lov_setstripe(struct lustre_handle *conn,
-                         struct lov_stripe_md **lsmp, struct lov_mds_md *lmmu);
-extern int lov_getstripe(struct lustre_handle *conn, struct lov_mds_md *lmmu,
-                         struct lov_stripe_md *lsm);
+/* lov_file_handles helpers */
+static void lov_lfh_addref(void *lfhp)
+{
+        struct lov_file_handles *lfh = lfhp;
+
+        atomic_inc(&lfh->lfh_refcount);
+        CDEBUG(D_INFO, "GETting lfh %p : new refcount %d\n", lfh,
+               atomic_read(&lfh->lfh_refcount));
+}
+
+static struct lov_file_handles *lov_lfh_new(void)
+{
+        struct lov_file_handles *lfh;
+
+        OBD_ALLOC(lfh, sizeof *lfh);
+        if (lfh == NULL) {
+                CERROR("out of memory\n");
+                return NULL;
+        }
+
+        atomic_set(&lfh->lfh_refcount, 2);
+
+        INIT_LIST_HEAD(&lfh->lfh_handle.h_link);
+        class_handle_hash(&lfh->lfh_handle, lov_lfh_addref);
+
+        return lfh;
+}
+
+static struct lov_file_handles *lov_handle2lfh(struct lustre_handle *handle)
+{
+        ENTRY;
+        LASSERT(handle != NULL);
+        RETURN(class_handle2object(handle->cookie));
+}
+
+static void lov_lfh_put(struct lov_file_handles *lfh)
+{
+        CDEBUG(D_INFO, "PUTting lfh %p : new refcount %d\n", lfh,
+               atomic_read(&lfh->lfh_refcount) - 1);
+        LASSERT(atomic_read(&lfh->lfh_refcount) > 0 &&
+                atomic_read(&lfh->lfh_refcount) < 0x5a5a);
+        if (atomic_dec_and_test(&lfh->lfh_refcount)) {
+                LASSERT(list_empty(&lfh->lfh_handle.h_link));
+                OBD_FREE(lfh, sizeof *lfh);
+        }
+}
+
+static void lov_lfh_destroy(struct lov_file_handles *lfh)
+{
+        class_handle_unhash(&lfh->lfh_handle);
+        lov_lfh_put(lfh);
+}
+
+static void lov_llh_addref(void *llhp)
+{
+        struct lov_lock_handles *llh = llhp;
+
+        atomic_inc(&llh->llh_refcount);
+        CDEBUG(D_INFO, "GETting llh %p : new refcount %d\n", llh,
+               atomic_read(&llh->llh_refcount));
+}
+
+static struct lov_lock_handles *lov_llh_new(struct lov_stripe_md *lsm)
+{
+        struct lov_lock_handles *llh;
+
+        OBD_ALLOC(llh, sizeof *llh +
+                  sizeof(*llh->llh_handles) * lsm->lsm_stripe_count);
+        if (llh == NULL) {
+                CERROR("out of memory\n");
+                return NULL;
+        }
+        atomic_set(&llh->llh_refcount, 2);
+        llh->llh_stripe_count = lsm->lsm_stripe_count;
+        INIT_LIST_HEAD(&llh->llh_handle.h_link);
+        class_handle_hash(&llh->llh_handle, lov_llh_addref);
+        return llh;
+}
+
+static struct lov_lock_handles *lov_handle2llh(struct lustre_handle *handle)
+{
+        ENTRY;
+        LASSERT(handle != NULL);
+        RETURN(class_handle2object(handle->cookie));
+}
+
+static void lov_llh_put(struct lov_lock_handles *llh)
+{
+        CDEBUG(D_INFO, "PUTting llh %p : new refcount %d\n", llh,
+               atomic_read(&llh->llh_refcount) - 1);
+        LASSERT(atomic_read(&llh->llh_refcount) > 0 &&
+                atomic_read(&llh->llh_refcount) < 0x5a5a);
+        if (atomic_dec_and_test(&llh->llh_refcount)) {
+                LASSERT(list_empty(&llh->llh_handle.h_link));
+                OBD_FREE(llh, sizeof *llh +
+                         sizeof(*llh->llh_handles) * llh->llh_stripe_count);
+        }
+}
+
+static void lov_llh_destroy(struct lov_lock_handles *llh)
+{
+        class_handle_unhash(&llh->llh_handle);
+        lov_llh_put(llh);
+}
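
The lfh/llh helpers above implement the same refcounted-handle pattern: the object starts with a count of 2 (one reference kept by the creator, one owned by the handle hash), each lookup takes an extra reference that the caller must drop, and destroy unhashes the object and drops the hash's reference. Below is a simplified user-space analog with a toy lookup table; none of the names are the real portals_handle API.

    #include <stdio.h>
    #include <stdlib.h>

    struct handle_obj {
            int refcount;
            unsigned long long cookie;
    };

    static struct handle_obj *table[16];            /* toy "handle hash" */

    static struct handle_obj *obj_new(unsigned long long cookie)
    {
            struct handle_obj *o = calloc(1, sizeof(*o));
            if (o == NULL)
                    return NULL;
            o->refcount = 2;                        /* creator + table */
            o->cookie = cookie;
            table[cookie % 16] = o;
            return o;
    }

    static struct handle_obj *obj_lookup(unsigned long long cookie)
    {
            struct handle_obj *o = table[cookie % 16];
            if (o != NULL)
                    o->refcount++;                  /* like lov_lfh_addref() */
            return o;
    }

    static void obj_put(struct handle_obj *o)
    {
            if (--o->refcount == 0)
                    free(o);
    }

    static void obj_destroy(struct handle_obj *o)
    {
            table[o->cookie % 16] = NULL;           /* like class_handle_unhash() */
            obj_put(o);                             /* drop the table's reference */
    }

    int main(void)
    {
            struct handle_obj *o = obj_new(42), *found;

            found = obj_lookup(42);                 /* refcount: 2 -> 3 */
            obj_put(found);                         /* back to 2 */
            obj_destroy(o);                         /* unhash, 2 -> 1 */
            obj_put(o);                             /* creator's reference: freed */
            return 0;
    }

This replaces the old scheme (visible in the removed lov_handle2lfh further down) of treating handle->addr as a raw pointer: a stale cookie now simply fails the lookup instead of needing kmem_cache_validate() on a possibly recycled pointer.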
 
 /* obd methods */
 int lov_attach(struct obd_device *dev, obd_count len, void *data)
@@ -84,18 +180,18 @@ int lov_detach(struct obd_device *dev)
 }
 
 static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
-                       struct obd_uuid *cluuid, struct recovd_obd *recovd,
-                       ptlrpc_recovery_cb_t recover)
+                       struct obd_uuid *cluuid)
 {
         struct ptlrpc_request *req = NULL;
         struct lov_obd *lov = &obd->u.lov;
         struct client_obd *mdc = &lov->mdcobd->u.cli;
         struct lov_desc *desc = &lov->desc;
+        struct lov_desc *mdesc;
         struct lov_tgt_desc *tgts;
         struct obd_export *exp;
         struct lustre_handle mdc_conn;
         struct obd_uuid lov_mds_uuid = {"LOV_MDS_UUID"};
-        char *tmp;
+        struct obd_uuid *uuids;
         int rc, rc2, i;
         ENTRY;
 
@@ -114,14 +210,14 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
         INIT_LIST_HEAD(&exp->exp_lov_data.led_open_head);
 
         /* retrieve LOV metadata from MDS */
-        rc = obd_connect(&mdc_conn, lov->mdcobd, &lov_mds_uuid, recovd,recover);
+        rc = obd_connect(&mdc_conn, lov->mdcobd, &lov_mds_uuid);
         if (rc) {
                 CERROR("cannot connect to mdc: rc = %d\n", rc);
                 GOTO(out_conn, rc);
         }
 
         rc = mdc_getlovinfo(obd, &mdc_conn, &req);
-        rc2 = obd_disconnect(&mdc_conn);
+        rc2 = obd_disconnect(&mdc_conn, 0);
         if (rc) {
                 CERROR("cannot get lov info %d\n", rc);
                 GOTO(out_conn, rc);
@@ -129,36 +225,24 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
 
         if (rc2) {
                 CERROR("error disconnecting from MDS %d\n", rc2);
-                GOTO(out_conn, rc = rc2);
-        }
-
-        /* sanity... */
-        if (req->rq_repmsg->bufcount < 2 ||
-            req->rq_repmsg->buflens[0] < sizeof(*desc)) {
-                CERROR("LOV desc: invalid descriptor returned\n");
-                GOTO(out_conn, rc = -EINVAL);
+                GOTO(out_req, rc = rc2);
         }
 
-        memcpy(desc, lustre_msg_buf(req->rq_repmsg, 0), sizeof(*desc));
-        lov_unpackdesc(desc);
+        /* mdc_getlovinfo() has checked and swabbed the reply.  It has also
+         * done some simple checks (e.g. #uuids consistent with desc, uuid
+         * array fits in LOV_MAX_UUID_BUFFER_SIZE and all uuids are
+         * terminated), but I still need to verify it makes overall
+         * sense */
+        mdesc = lustre_msg_buf (req->rq_repmsg, 0, sizeof (*mdesc));
+        LASSERT (mdesc != NULL);
+        LASSERT_REPSWABBED (req, 0);
 
-        if (req->rq_repmsg->buflens[1] <
-            sizeof(desc->ld_uuid.uuid) * desc->ld_tgt_count){
-                CERROR("LOV desc: invalid uuid array returned\n");
-                GOTO(out_conn, rc = -EINVAL);
-        }
+        *desc = *mdesc;
 
-        if (memcmp(obd->obd_uuid.uuid, desc->ld_uuid.uuid,
-                   sizeof(desc->ld_uuid.uuid))) {
+        if (!obd_uuid_equals(&obd->obd_uuid, &desc->ld_uuid)) {
                 CERROR("LOV desc: uuid %s not on mds device (%s)\n",
                        obd->obd_uuid.uuid, desc->ld_uuid.uuid);
-                GOTO(out_conn, rc = -EINVAL);
-        }
-
-        if (desc->ld_tgt_count > 1000) {
-                CERROR("LOV desc: target count > 1000 (%d)\n",
-                       desc->ld_tgt_count);
-                GOTO(out_conn, rc = -EINVAL);
+                GOTO(out_req, rc = -EINVAL);
         }
 
         /* Because of 64-bit divide/mod operations only work with a 32-bit
@@ -172,38 +256,45 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
                        desc->ld_default_stripe_size,
                        desc->ld_default_stripe_count ?
                        desc->ld_default_stripe_count : desc->ld_tgt_count,~0UL);
-                GOTO(out_conn, rc = -EINVAL);
+                GOTO(out_req, rc = -EINVAL);
         }
 
+        /* We know ld_tgt_count is reasonable (the array of UUIDs fits in
+         * the maximum buffer size), so we won't be making outrageous
+         * demands on memory here. */
         lov->bufsize = sizeof(struct lov_tgt_desc) * desc->ld_tgt_count;
         OBD_ALLOC(lov->tgts, lov->bufsize);
         if (!lov->tgts) {
                 CERROR("Out of memory\n");
-                GOTO(out_conn, rc = -ENOMEM);
+                GOTO(out_req, rc = -ENOMEM);
         }
 
-        tmp = lustre_msg_buf(req->rq_repmsg, 1);
+        uuids = lustre_msg_buf(req->rq_repmsg, 1,
+                               sizeof(*uuids) * desc->ld_tgt_count);
+        LASSERT (uuids != NULL);
+        LASSERT_REPSWABBED (req, 1);
+
         for (i = 0, tgts = lov->tgts; i < desc->ld_tgt_count; i++, tgts++) {
                 struct obd_uuid *uuid = &tgts->uuid;
                 struct obd_device *tgt_obd;
                 struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" };
 
-                obd_str2uuid(uuid, tmp);
+                /* NULL termination already checked */
+                *uuid = uuids[i];
+
                 tgt_obd = client_tgtuuid2obd(uuid);
-                tmp += sizeof(uuid->uuid);
 
                 if (!tgt_obd) {
                         CERROR("Target %s not attached\n", uuid->uuid);
                         GOTO(out_disc, rc = -EINVAL);
                 }
 
-                if (!(tgt_obd->obd_flags & OBD_SET_UP)) {
+                if (!tgt_obd->obd_set_up) {
                         CERROR("Target %s not set up\n", uuid->uuid);
                         GOTO(out_disc, rc = -EINVAL);
                 }
 
-                rc = obd_connect(&tgts->conn, tgt_obd, &lov_osc_uuid, recovd,
-                                 recover);
+                rc = obd_connect(&tgts->conn, tgt_obd, &lov_osc_uuid);
 
                 if (rc) {
                         CERROR("Target %s connect error %d\n", uuid->uuid, rc);
@@ -215,7 +306,7 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
                 if (rc) {
                         CERROR("Target %s REGISTER_LOV error %d\n",
                                uuid->uuid, rc);
-                        obd_disconnect(&tgts->conn);
+                        obd_disconnect(&tgts->conn, 0);
                         GOTO(out_disc, rc);
                 }
 
@@ -223,11 +314,10 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
                 tgts->active = 1;
         }
 
-        mdc->cl_max_mds_easize = obd_size_wiremd(conn, NULL);
-
- out:
-        ptlrpc_req_finished(req);
-        RETURN(rc);
+        mdc->cl_max_mds_easize = obd_size_diskmd(conn, NULL);
+        ptlrpc_req_finished (req);
+        class_export_put(exp);
+        RETURN (0);
 
  out_disc:
         while (i-- > 0) {
@@ -235,25 +325,30 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
                 --tgts;
                 --desc->ld_active_tgt_count;
                 tgts->active = 0;
-                obd_str2uuid(&uuid, tgts->uuid.uuid);
-                rc2 = obd_disconnect(&tgts->conn);
+                /* save for CERROR below; (we know it's terminated) */
+                uuid = tgts->uuid;
+                rc2 = obd_disconnect(&tgts->conn, 0);
                 if (rc2)
                         CERROR("error: LOV target %s disconnect on OST idx %d: "
                                "rc = %d\n", uuid.uuid, i, rc2);
         }
         OBD_FREE(lov->tgts, lov->bufsize);
+ out_req:
+        ptlrpc_req_finished (req);
  out_conn:
-        class_disconnect(conn);
-        goto out;
+        class_export_put(exp);
+        class_disconnect(conn, 0);
+        RETURN (rc);
 }
 
-static int lov_disconnect(struct lustre_handle *conn)
+static int lov_disconnect(struct lustre_handle *conn, int failover)
 {
         struct obd_device *obd = class_conn2obd(conn);
         struct lov_obd *lov = &obd->u.lov;
         struct obd_export *exp;
         struct list_head *p, *n;
         int rc, i;
+        ENTRY;
 
         if (!lov->tgts)
                 goto out_local;
@@ -264,7 +359,16 @@ static int lov_disconnect(struct lustre_handle *conn)
                 goto out_local;
 
         for (i = 0; i < lov->desc.ld_tgt_count; i++) {
-                rc = obd_disconnect(&lov->tgts[i].conn);
+                if (obd->obd_no_recov) {
+                        /* Pass it on to our clients.
+                         * XXX This should be an argument to disconnect,
+                         * XXX not a back-door flag on the OBD.  Ah well.
+                         */
+                        struct obd_device *osc_obd =
+                                class_conn2obd(&lov->tgts[i].conn);
+                        osc_obd->obd_no_recov = 1;
+                }
+                rc = obd_disconnect(&lov->tgts[i].conn, failover);
                 if (rc) {
                         if (lov->tgts[i].active) {
                                 CERROR("Target %s disconnect error %d\n",
@@ -282,22 +386,29 @@ static int lov_disconnect(struct lustre_handle *conn)
         lov->tgts = NULL;
 
         exp = class_conn2export(conn);
+        if (exp == NULL) {
+                CERROR("export handle "LPU64" invalid!  If you can reproduce, "
+                       "please send a full debug log to phik\n", conn->cookie);
+                RETURN(0);
+        }
         spin_lock(&exp->exp_lov_data.led_lock);
         list_for_each_safe(p, n, &exp->exp_lov_data.led_open_head) {
                 /* XXX close these, instead of just discarding them? */
                 struct lov_file_handles *lfh;
                 lfh = list_entry(p, typeof(*lfh), lfh_list);
                 CERROR("discarding open LOV handle %p:"LPX64"\n",
-                       lfh, lfh->lfh_cookie);
+                       lfh, lfh->lfh_handle.h_cookie);
                 list_del(&lfh->lfh_list);
-                OBD_FREE(lfh->lfh_data, lfh->lfh_count * FD_OSTDATA_SIZE);
-                PORTAL_SLAB_FREE(lfh, lov_file_cache, sizeof(*lfh));
+                OBD_FREE(lfh->lfh_och, lfh->lfh_count * FD_OSTDATA_SIZE);
+                lov_lfh_destroy(lfh);
+                lov_lfh_put(lfh);
         }
         spin_unlock(&exp->exp_lov_data.led_lock);
+        class_export_put(exp);
 
  out_local:
-        rc = class_disconnect(conn);
-        return rc;
+        rc = class_disconnect(conn, 0);
+        RETURN(rc);
 }
 
 /* Error codes:
@@ -305,7 +416,6 @@ static int lov_disconnect(struct lustre_handle *conn)
  *  -EINVAL  : UUID can't be found in the LOV's target list
  *  -ENOTCONN: The UUID is found, but the target connection is bad (!)
  *  -EBADF   : The UUID is found, but the OBD is the wrong type (!)
- *  -EALREADY: The OSC is already marked (in)active
  */
 static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid,
                               int activate)
@@ -321,8 +431,8 @@ static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid,
         spin_lock(&lov->lov_lock);
         for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) {
                 CDEBUG(D_INFO, "lov idx %d is %s conn "LPX64"\n",
-                       i, tgt->uuid.uuid, tgt->conn.addr);
-                if (strncmp(uuid->uuid, tgt->uuid.uuid, sizeof(uuid->uuid)) == 0)
+                       i, tgt->uuid.uuid, tgt->conn.cookie);
+                if (strncmp(uuid->uuid, tgt->uuid.uuid, sizeof uuid->uuid) == 0)
                         break;
         }
 
@@ -331,22 +441,19 @@ static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid,
 
         obd = class_conn2obd(&tgt->conn);
         if (obd == NULL) {
-                LBUG();
+                /* This can happen if OST failure races with node shutdown */
                 GOTO(out, rc = -ENOTCONN);
         }
 
         CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LOV idx %d\n",
                obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd,
                obd->obd_type->typ_name, i);
-        if (strcmp(obd->obd_type->typ_name, "osc") != 0) {
-                LBUG();
-                GOTO(out, rc = -EBADF);
-        }
+        LASSERT(strcmp(obd->obd_type->typ_name, "osc") == 0);
 
         if (tgt->active == activate) {
                 CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
                        activate ? "" : "in");
-                GOTO(out, rc = -EALREADY);
+                GOTO(out, rc);
         }
 
         CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, activate ? "" : "in");
@@ -407,21 +514,55 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf)
         RETURN(rc);
 }
 
-static struct lov_file_handles *lov_handle2lfh(struct lustre_handle *handle)
+/* compute object size given "stripeno" and the ost size */
+static obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
+                                int stripeno)
 {
-        struct lov_file_handles *lfh = NULL;
+        unsigned long ssize  = lsm->lsm_stripe_size;
+        unsigned long swidth = ssize * lsm->lsm_stripe_count;
+        unsigned long stripe_size;
+        obd_size lov_size;
+
+        if (ost_size == 0)
+                return 0;
+
+        /* do_div(a, b) returns a % b, and a = a / b */
+        stripe_size = do_div(ost_size, ssize);
 
-        if (!handle || !handle->addr)
-                RETURN(NULL);
+        if (stripe_size)
+                lov_size = ost_size * swidth + stripeno * ssize + stripe_size;
+        else
+                lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize;
 
-        lfh = (struct lov_file_handles *)(unsigned long)(handle->addr);
-        if (!kmem_cache_validate(lov_file_cache, lfh))
-                RETURN(NULL);
+        return lov_size;
+}
 
-        if (lfh->lfh_cookie != handle->cookie)
-                RETURN(NULL);
+static void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_flag valid,
+                            struct lov_stripe_md *lsm, int stripeno, int *set)
+{
+        if (*set) {
+                if (valid & OBD_MD_FLSIZE) {
+                        /* this handles sparse files properly */
+                        obd_size lov_size;
 
-        return lfh;
+                        lov_size = lov_stripe_size(lsm, src->o_size, stripeno);
+                        if (lov_size > tgt->o_size)
+                                tgt->o_size = lov_size;
+                }
+                if (valid & OBD_MD_FLBLOCKS)
+                        tgt->o_blocks += src->o_blocks;
+                if (valid & OBD_MD_FLBLKSZ)
+                        tgt->o_blksize += src->o_blksize;
+                if (valid & OBD_MD_FLCTIME && tgt->o_ctime < src->o_ctime)
+                        tgt->o_ctime = src->o_ctime;
+                if (valid & OBD_MD_FLMTIME && tgt->o_mtime < src->o_mtime)
+                        tgt->o_mtime = src->o_mtime;
+        } else {
+                obdo_cpy_md(tgt, src, valid);
+                if (valid & OBD_MD_FLSIZE)
+                        tgt->o_size = lov_stripe_size(lsm,src->o_size,stripeno);
+                *set = 1;
+        }
 }
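
lov_stripe_size() plus the OBD_MD_FLSIZE branch of lov_merge_attrs() are how per-stripe object sizes turn into one file size: each stripe's object size is mapped back to the file offset of its last byte, and the merge keeps the maximum. A worked example with made-up geometry (64KB stripes, 2 stripes), reimplementing only the arithmetic in plain user-space C (do_div replaced by / and %):

    #include <stdio.h>

    static unsigned long long stripe_to_file_size(unsigned long long ost_size,
                                                  unsigned long ssize,
                                                  unsigned long count,
                                                  int stripeno)
    {
            unsigned long long swidth = (unsigned long long)ssize * count;
            unsigned long long chunks, rem;

            if (ost_size == 0)
                    return 0;
            chunks = ost_size / ssize;
            rem = ost_size % ssize;
            if (rem)        /* EOF lands inside a partial chunk */
                    return chunks * swidth +
                           (unsigned long long)stripeno * ssize + rem;
            /* EOF is exactly at the end of the object's last full chunk */
            return (chunks - 1) * swidth +
                   (unsigned long long)(stripeno + 1) * ssize;
    }

    int main(void)
    {
            unsigned long ssize = 65536, count = 2;
            unsigned long long s0, s1;

            s0 = stripe_to_file_size(131072, ssize, count, 0); /* -> 196608 */
            s1 = stripe_to_file_size(70000,  ssize, count, 1); /* -> 201072 */
            printf("stripe0 maps to %llu, stripe1 to %llu, file size %llu\n",
                   s0, s1, s0 > s1 ? s0 : s1);
            return 0;
    }

Taking the maximum (201072 here) rather than summing is what the code comment means by handling sparse files properly: a short or empty stripe never shrinks the reported size, it just fails to extend it.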
 
 /* the LOV expects oa->o_id to be set to the LOV object id */
@@ -433,24 +574,24 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa,
         struct lov_stripe_md *lsm;
         struct lov_oinfo *loi;
         struct obdo *tmp;
-        int ost_count, ost_idx;
-        int first = 1, obj_alloc = 0;
+        unsigned ost_count, ost_idx;
+        int set = 0, obj_alloc = 0;
         int rc = 0, i;
         ENTRY;
 
         LASSERT(ea);
 
         if (!export)
-                RETURN(-EINVAL);
+                GOTO(out_exp, rc = -EINVAL);
 
         lov = &export->exp_obd->u.lov;
 
         if (!lov->desc.ld_active_tgt_count)
-                RETURN(-EIO);
+                GOTO(out_exp, rc = -EIO);
 
         tmp = obdo_alloc();
         if (!tmp)
-                RETURN(-ENOMEM);
+                GOTO(out_exp, rc = -ENOMEM);
 
         lsm = *ea;
 
@@ -471,11 +612,8 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa,
                 lsm->lsm_stripe_size = lov->desc.ld_default_stripe_size;
 
         if (!*ea || lsm->lsm_stripe_offset >= ost_count) {
-                int mult = lsm->lsm_object_id * lsm->lsm_stripe_count;
-                int stripe_offset = mult % ost_count;
-                int sub_offset = (mult / ost_count);
-
-                ost_idx = (stripe_offset + sub_offset) % ost_count;
+                get_random_bytes(&ost_idx, 2);
+                ost_idx %= ost_count;
         } else
                 ost_idx = lsm->lsm_stripe_offset;
 
@@ -517,10 +655,9 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa,
                 CDEBUG(D_INODE, "objid "LPX64" has subobj "LPX64" at idx %d\n",
                        lsm->lsm_object_id, loi->loi_id, ost_idx);
 
-                if (first) {
+                if (!set)
                         lsm->lsm_stripe_offset = ost_idx;
-                        first = 0;
-                }
+                lov_merge_attrs(oa, tmp, OBD_MD_FLBLKSZ, lsm, obj_alloc, &set);
 
                 ++obj_alloc;
                 ++loi;
@@ -532,13 +669,15 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa,
                 }
         }
 
-        if (*ea)
+        if (*ea != NULL) {
                 GOTO(out_cleanup, rc);
-        else {
+        } else {
                 struct lov_stripe_md *lsm_new;
                 /* XXX LOV STACKING call into osc for sizes */
-                int size = lov_stripe_md_size(obj_alloc);
+                unsigned size = lov_stripe_md_size(obj_alloc);
 
+                CERROR("reallocating LSM for objid "LPX64": old %u new %u\n",
+                       lsm->lsm_object_id, obj_alloc, lsm->lsm_stripe_count);
                 OBD_ALLOC(lsm_new, size);
                 if (!lsm_new)
                         GOTO(out_cleanup, rc = -ENOMEM);
@@ -554,6 +693,8 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa,
 
  out_tmp:
         obdo_free(tmp);
+ out_exp:
+        class_export_put(export);
         return rc;
 
  out_cleanup:
@@ -564,14 +705,15 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa,
                 /* destroy already created objects here */
                 memcpy(tmp, oa, sizeof(*tmp));
                 tmp->o_id = loi->loi_id;
-                err = obd_destroy(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL, NULL);
+                err = obd_destroy(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL,
+                                  NULL);
                 if (err)
                         CERROR("Failed to uncreate objid "LPX64" subobj "
                                LPX64" on OST idx %d: rc = %d\n",
                                oa->o_id, loi->loi_id, loi->loi_ost_idx,
                                err);
         }
-        if (!*ea)
+        if (*ea == NULL)
                 obd_free_memmd(conn, &lsm);
         goto out_tmp;
 }
@@ -589,17 +731,17 @@ static int lov_destroy(struct lustre_handle *conn, struct obdo *oa,
 
         if (!lsm) {
                 CERROR("LOV requires striping ea for destruction\n");
-                RETURN(-EINVAL);
+                GOTO(out, rc = -EINVAL);
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
                 CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
-                RETURN(-EINVAL);
+                GOTO(out, rc = -EINVAL);
         }
 
         if (!export || !export->exp_obd)
-                RETURN(-ENODEV);
+                GOTO(out, rc = -ENODEV);
 
         if (oa->o_valid & OBD_MD_FLHANDLE)
                 lfh = lov_handle2lfh(obdo_handle(oa));
@@ -616,8 +758,7 @@ static int lov_destroy(struct lustre_handle *conn, struct obdo *oa,
                 memcpy(&tmp, oa, sizeof(tmp));
                 tmp.o_id = loi->loi_id;
                 if (lfh)
-                        memcpy(obdo_handle(&tmp),
-                               lfh->lfh_data + i * FD_OSTDATA_SIZE,
+                        memcpy(obdo_handle(&tmp), lfh->lfh_och + i,
                                FD_OSTDATA_SIZE);
                 else
                         tmp.o_valid &= ~OBD_MD_FLHANDLE;
@@ -625,62 +766,18 @@ static int lov_destroy(struct lustre_handle *conn, struct obdo *oa,
                                   NULL, NULL);
                 if (err && lov->tgts[loi->loi_ost_idx].active) {
                         CERROR("error: destroying objid "LPX64" subobj "
-                               LPX64" on OST idx %d\n: rc = %d",
+                               LPX64" on OST idx %d: rc = %d\n",
                                oa->o_id, loi->loi_id, loi->loi_ost_idx, err);
                         if (!rc)
                                 rc = err;
                 }
         }
-        RETURN(rc);
-}
-
-/* compute object size given "stripeno" and the ost size */
-static obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
-                                int stripeno)
-{
-        unsigned long ssize  = lsm->lsm_stripe_size;
-        unsigned long swidth = ssize * lsm->lsm_stripe_count;
-        unsigned long stripe_size;
-        obd_size lov_size;
-
-        if (ost_size == 0)
-                return 0;
-
-        /* do_div(a, b) returns a % b, and a = a / b */
-        stripe_size = do_div(ost_size, ssize);
-
-        if (stripe_size)
-                lov_size = ost_size * swidth + stripeno * ssize + stripe_size;
-        else
-                lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize;
-
-        return lov_size;
-}
-
-static void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_flag valid,
-                            struct lov_stripe_md *lsm, int stripeno, int *set)
-{
-        if (*set) {
-                if (valid & OBD_MD_FLSIZE) {
-                        /* this handles sparse files properly */
-                        obd_size lov_size;
-
-                        lov_size = lov_stripe_size(lsm, src->o_size, stripeno);
-                        if (lov_size > tgt->o_size)
-                                tgt->o_size = lov_size;
-                }
-                if (valid & OBD_MD_FLBLOCKS)
-                        tgt->o_blocks += src->o_blocks;
-                if (valid & OBD_MD_FLCTIME && tgt->o_ctime < src->o_ctime)
-                        tgt->o_ctime = src->o_ctime;
-                if (valid & OBD_MD_FLMTIME && tgt->o_mtime < src->o_mtime)
-                        tgt->o_mtime = src->o_mtime;
-        } else {
-                obdo_cpy_md(tgt, src, valid);
-                if (valid & OBD_MD_FLSIZE)
-                        tgt->o_size = lov_stripe_size(lsm,src->o_size,stripeno);
-                *set = 1;
-        }
+        if (lfh != NULL)
+                lov_lfh_put(lfh);
+        EXIT;
+ out:
+        class_export_put(export);
+        return rc;
 }
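
Throughout these hunks the early RETURN() exits become GOTO(out, rc = ...) so that every path drops the reference taken by class_conn2export() through class_export_put(). A minimal userspace sketch of that single-exit, reference-counted error-path shape; the export type, export_get()/export_put() and do_op() below are hypothetical stand-ins, not the Lustre API:

#include <stdio.h>

/* hypothetical stand-in for an export carrying a reference count */
struct export {
        int refcount;
};

static struct export *export_get(struct export *exp)
{
        exp->refcount++;
        return exp;
}

static void export_put(struct export *exp)
{
        if (--exp->refcount == 0)
                printf("export freed\n");
}

/* every early error jumps to "out", so the reference taken at entry is
 * always dropped exactly once */
static int do_op(struct export *exp, int arg)
{
        int rc = 0;

        export_get(exp);
        if (arg < 0) {
                rc = -22;               /* -EINVAL analogue */
                goto out;
        }
        /* ... the real work would go here ... */
 out:
        export_put(exp);
        return rc;
}

int main(void)
{
        struct export exp = { 1 };
        int rc;

        rc = do_op(&exp, -1);
        printf("rc = %d, refcount = %d\n", rc, exp.refcount);
        rc = do_op(&exp, 7);
        printf("rc = %d, refcount = %d\n", rc, exp.refcount);
        return 0;
}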
 
 static int lov_getattr(struct lustre_handle *conn, struct obdo *oa,
@@ -691,23 +788,22 @@ static int lov_getattr(struct lustre_handle *conn, struct obdo *oa,
         struct lov_obd *lov;
         struct lov_oinfo *loi;
         struct lov_file_handles *lfh = NULL;
-        int i;
-        int set = 0;
+        int i, rc = 0, set = 0;
         ENTRY;
 
         if (!lsm) {
                 CERROR("LOV requires striping ea\n");
-                RETURN(-EINVAL);
+                GOTO(out, rc = -EINVAL);
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
                 CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
-                RETURN(-EINVAL);
+                GOTO(out, rc = -EINVAL);
         }
 
         if (!export || !export->exp_obd)
-                RETURN(-ENODEV);
+                GOTO(out, rc = -ENODEV);
 
         lov = &export->exp_obd->u.lov;
 
@@ -730,8 +826,7 @@ static int lov_getattr(struct lustre_handle *conn, struct obdo *oa,
                 memcpy(&tmp, oa, sizeof(tmp));
                 tmp.o_id = loi->loi_id;
                 if (lfh)
-                        memcpy(obdo_handle(&tmp),
-                               lfh->lfh_data + i * FD_OSTDATA_SIZE,
+                        memcpy(obdo_handle(&tmp), lfh->lfh_och + i,
                                FD_OSTDATA_SIZE);
                 else
                         tmp.o_valid &= ~OBD_MD_FLHANDLE;
@@ -743,14 +838,145 @@ static int lov_getattr(struct lustre_handle *conn, struct obdo *oa,
                                        LPX64" on OST idx %d: rc = %d\n",
                                        oa->o_id, loi->loi_id, loi->loi_ost_idx,
                                        err);
-                                RETURN(err);
+                                GOTO(out, rc = err);
                         }
                 } else {
                         lov_merge_attrs(oa, &tmp, tmp.o_valid, lsm, i, &set);
                 }
         }
+        if (!set)
+                rc = -EIO;
+        GOTO(out, rc);
+ out:
+        if (lfh != NULL)
+                lov_lfh_put(lfh);
+        class_export_put(export);
+        return rc;
+}
+
+static int lov_getattr_interpret(struct ptlrpc_request_set *rqset,
+                                 struct lov_getattr_async_args *aa, int rc)
+{
+        struct lov_stripe_md *lsm = aa->aa_lsm;
+        struct obdo          *oa = aa->aa_oa;
+        struct obdo          *obdos = aa->aa_stripe_oas;
+        struct lov_oinfo     *loi;
+        int                   i;
+        int                   set = 0;
+        ENTRY;
+
+        if (rc == 0) {
+                /* NB all stripe requests must have succeeded to get here */
+
+                for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count;
+                     i++,loi++) {
+                        if (obdos[i].o_valid == 0)      /* inactive stripe */
+                                continue;
+
+                        lov_merge_attrs(oa, &obdos[i], obdos[i].o_valid, lsm,
+                                        i, &set);
+                }
+
+                if (!set) {
+                        CERROR ("No stripes had valid attrs\n");
+                        rc = -EIO;
+                }
+        }
+
+        OBD_FREE (obdos, lsm->lsm_stripe_count * sizeof (*obdos));
+        RETURN (rc);
+}
+
+static int lov_getattr_async (struct lustre_handle *conn, struct obdo *oa,
+                              struct lov_stripe_md *lsm,
+                              struct ptlrpc_request_set *rqset)
+{
+        struct obdo *obdos;
+        struct obd_export *export = class_conn2export(conn);
+        struct lov_obd *lov;
+        struct lov_oinfo *loi;
+        struct lov_file_handles *lfh = NULL;
+        struct lov_getattr_async_args *aa;
+        int i;
+        int set = 0;
+        int rc = 0;
+        ENTRY;
+
+        if (!lsm) {
+                CERROR("LOV requires striping ea\n");
+                GOTO(out, rc = -EINVAL);
+        }
+
+        if (lsm->lsm_magic != LOV_MAGIC) {
+                CERROR("LOV striping magic bad %#x != %#x\n",
+                       lsm->lsm_magic, LOV_MAGIC);
+                GOTO(out, rc = -EINVAL);
+        }
+
+        if (!export || !export->exp_obd)
+                GOTO(out, rc = -ENODEV);
+
+        lov = &export->exp_obd->u.lov;
+
+        OBD_ALLOC (obdos, lsm->lsm_stripe_count * sizeof (*obdos));
+        if (obdos == NULL)
+                GOTO (out, rc = -ENOMEM);
+
+        if (oa->o_valid & OBD_MD_FLHANDLE)
+                lfh = lov_handle2lfh(obdo_handle(oa));
+
+        CDEBUG(D_INFO, "objid "LPX64": %ux%u byte stripes\n",
+               lsm->lsm_object_id, lsm->lsm_stripe_count, lsm->lsm_stripe_size);
+        for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
+                int err;
+
+                if (lov->tgts[loi->loi_ost_idx].active == 0) {
+                        CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                        /* leaves obdos[i].o_valid unset */
+                        continue;
+                }
+
+                CDEBUG(D_INFO, "objid "LPX64"[%d] has subobj "LPX64" at idx "
+                       "%u\n", oa->o_id, i, loi->loi_id, loi->loi_ost_idx);
+                /* create data objects with "parent" OA */
+                memcpy(&obdos[i], oa, sizeof(obdos[i]));
+                obdos[i].o_id = loi->loi_id;
+                if (lfh)
+                        memcpy(obdo_handle(&obdos[i]), lfh->lfh_och + i,
+                               FD_OSTDATA_SIZE);
+                else
+                        obdos[i].o_valid &= ~OBD_MD_FLHANDLE;
 
-        RETURN(set ? 0 : -EIO);
+                err = obd_getattr_async (&lov->tgts[loi->loi_ost_idx].conn,
+                                         &obdos[i], NULL, rqset);
+                if (err) {
+                        CERROR("error: getattr objid "LPX64" subobj "
+                               LPX64" on OST idx %d: rc = %d\n",
+                               oa->o_id, loi->loi_id, loi->loi_ost_idx,
+                               err);
+                        GOTO(out_obdos, rc = err);
+                }
+                set = 1;
+        }
+        if (!set)
+                GOTO (out_obdos, rc = -EIO);
+
+        LASSERT (rqset->set_interpret == NULL);
+        rqset->set_interpret = lov_getattr_interpret;
+        LASSERT (sizeof (rqset->set_args) >= sizeof (*aa));
+        aa = (struct lov_getattr_async_args *)&rqset->set_args;
+        aa->aa_lsm = lsm;
+        aa->aa_oa = oa;
+        aa->aa_stripe_oas = obdos;
+        GOTO (out, rc = 0);
+
+ out_obdos:
+        OBD_FREE (obdos, lsm->lsm_stripe_count * sizeof (*obdos));
+ out:
+        if (lfh != NULL)
+                lov_lfh_put(lfh);
+        class_export_put(export);
+        RETURN (rc);
 }
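
lov_getattr_async() issues one getattr per active stripe into the caller's request set, parks its per-stripe obdo array in set_args, and installs lov_getattr_interpret() to fold the results into the parent obdo once every RPC has completed. A small userspace sketch of that collect-then-interpret pattern under simplified, made-up types (request_set, getattr_args and the summing merge step are illustrative stand-ins for ptlrpc_request_set, lov_getattr_async_args and lov_merge_attrs()):

#include <stdio.h>
#include <stdlib.h>

/* hypothetical per-stripe attribute record (stands in for struct obdo) */
struct stripe_attr {
        long long size;
        int       valid;
};

struct request_set;
typedef int (*interpret_fn)(struct request_set *set, void *args, int rc);

/* stand-in for a ptlrpc-style request set: a completion hook plus its args */
struct request_set {
        interpret_fn  interpret;
        void         *args;
};

struct getattr_args {
        struct stripe_attr *stripes;
        int                 count;
        long long          *total;
};

/* completion hook: runs once, after all per-stripe requests have finished;
 * a toy sum stands in for lov_merge_attrs() */
static int getattr_interpret(struct request_set *set, void *a, int rc)
{
        struct getattr_args *args = a;
        int i;

        (void)set;
        if (rc == 0)
                for (i = 0; i < args->count; i++)
                        if (args->stripes[i].valid)
                                *args->total += args->stripes[i].size;
        free(args->stripes);
        return rc;
}

int main(void)
{
        struct request_set set = { NULL, NULL };
        struct getattr_args args;
        long long total = 0;
        int i, rc;

        args.count   = 3;
        args.stripes = calloc(args.count, sizeof(*args.stripes));
        args.total   = &total;
        if (args.stripes == NULL)
                return 1;

        /* "issue" one request per stripe; here we just record the replies */
        for (i = 0; i < args.count; i++) {
                args.stripes[i].valid = 1;
                args.stripes[i].size  = 100 * (i + 1);
        }

        /* register the interpreter and its argument block on the set */
        set.interpret = getattr_interpret;
        set.args      = &args;

        /* set completion calls the hook exactly once */
        rc = set.interpret(&set, set.args, 0);
        printf("rc = %d, merged size = %lld\n", rc, total);
        return 0;
}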
 
 static int lov_setattr(struct lustre_handle *conn, struct obdo *oa,
@@ -766,17 +992,17 @@ static int lov_setattr(struct lustre_handle *conn, struct obdo *oa,
 
         if (!lsm) {
                 CERROR("LOV requires striping ea\n");
-                RETURN(-EINVAL);
+                GOTO(out, rc = -EINVAL);
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
                 CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
-                RETURN(-EINVAL);
+                GOTO(out, rc = -EINVAL);
         }
 
         if (!export || !export->exp_obd)
-                RETURN(-ENODEV);
+                GOTO(out, rc = -ENODEV);
 
         /* size changes should go through punch and not setattr */
         LASSERT(!(oa->o_valid & OBD_MD_FLSIZE));
@@ -786,7 +1012,7 @@ static int lov_setattr(struct lustre_handle *conn, struct obdo *oa,
 
         tmp = obdo_alloc();
         if (!tmp)
-                RETURN(-ENOMEM);
+                GOTO(out, rc = -ENOMEM);
 
         if (oa->o_valid & OBD_MD_FLHANDLE)
                 lfh = lov_handle2lfh(obdo_handle(oa));
@@ -803,8 +1029,7 @@ static int lov_setattr(struct lustre_handle *conn, struct obdo *oa,
                 obdo_cpy_md(tmp, oa, oa->o_valid);
 
                 if (lfh)
-                        memcpy(obdo_handle(tmp),
-                               lfh->lfh_data + i * FD_OSTDATA_SIZE,
+                        memcpy(obdo_handle(tmp), lfh->lfh_och + i,
                                FD_OSTDATA_SIZE);
                 else
                         tmp->o_valid &= ~OBD_MD_FLHANDLE;
@@ -828,45 +1053,50 @@ static int lov_setattr(struct lustre_handle *conn, struct obdo *oa,
         obdo_free(tmp);
         if (!set && !rc)
                 rc = -EIO;
-        RETURN(rc);
+        if (lfh != NULL)
+                lov_lfh_put(lfh);
+        GOTO(out, rc);
+ out:
+        class_export_put(export);
+        return rc;
 }
 
 static int lov_open(struct lustre_handle *conn, struct obdo *oa,
-                    struct lov_stripe_md *lsm, struct obd_trans_info *oti)
+                    struct lov_stripe_md *lsm, struct obd_trans_info *oti,
+                    struct obd_client_handle *och)
 {
         struct obdo *tmp; /* on the heap here, on the stack in lov_close? */
         struct obd_export *export = class_conn2export(conn);
         struct lov_obd *lov;
         struct lov_oinfo *loi;
         struct lov_file_handles *lfh = NULL;
-        struct lustre_handle *handle;
-        int set = 0;
-        int rc = 0, i;
+        int set = 0, rc = 0, i;
         ENTRY;
+        LASSERT(och != NULL);
 
         if (!lsm) {
                 CERROR("LOV requires striping ea for opening\n");
-                RETURN(-EINVAL);
+                GOTO(out_exp, rc = -EINVAL);
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
                 CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
-                RETURN(-EINVAL);
+                GOTO(out_exp, rc = -EINVAL);
         }
 
         if (!export || !export->exp_obd)
-                RETURN(-ENODEV);
+                GOTO(out_exp, rc = -ENODEV);
 
         tmp = obdo_alloc();
         if (!tmp)
-                RETURN(-ENOMEM);
+                GOTO(out_exp, rc = -ENOMEM);
 
-        PORTAL_SLAB_ALLOC(lfh, lov_file_cache, sizeof(*lfh));
-        if (!lfh)
+        lfh = lov_lfh_new();
+        if (lfh == NULL)
                 GOTO(out_tmp, rc = -ENOMEM);
-        OBD_ALLOC(lfh->lfh_data, lsm->lsm_stripe_count * FD_OSTDATA_SIZE);
-        if (!lfh->lfh_data)
+        OBD_ALLOC(lfh->lfh_och, lsm->lsm_stripe_count * sizeof *och);
+        if (!lfh->lfh_och)
                 GOTO(out_lfh, rc = -ENOMEM);
 
         lov = &export->exp_obd->u.lov;
@@ -883,10 +1113,12 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa,
                 tmp->o_id = loi->loi_id;
 
                 rc = obd_open(&lov->tgts[loi->loi_ost_idx].conn, tmp,
-                              NULL, NULL);
+                              NULL, NULL, lfh->lfh_och + i);
                 if (rc) {
-                        if (!lov->tgts[loi->loi_ost_idx].active)
+                        if (!lov->tgts[loi->loi_ost_idx].active) {
+                                rc = 0;
                                 continue;
+                        }
                         CERROR("error: open objid "LPX64" subobj "LPX64
                                " on OST idx %d: rc = %d\n",
                                oa->o_id, lsm->lsm_oinfo[i].loi_id,
@@ -895,31 +1127,26 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa,
                 }
 
                 lov_merge_attrs(oa, tmp, tmp->o_valid, lsm, i, &set);
-
-                if (tmp->o_valid & OBD_MD_FLHANDLE)
-                        memcpy(lfh->lfh_data + i * FD_OSTDATA_SIZE,
-                               obdo_handle(tmp), FD_OSTDATA_SIZE);
         }
 
-        handle = obdo_handle(oa);
-
         lfh->lfh_count = lsm->lsm_stripe_count;
-        get_random_bytes(&lfh->lfh_cookie, sizeof(lfh->lfh_cookie));
-
-        handle->addr = (__u64)(unsigned long)lfh;
-        handle->cookie = lfh->lfh_cookie;
+        och->och_fh.cookie = lfh->lfh_handle.h_cookie;
+        obdo_handle(oa)->cookie = lfh->lfh_handle.h_cookie;
         oa->o_valid |= OBD_MD_FLHANDLE;
+
+        /* lfh refcount transfers to list */
         spin_lock(&export->exp_lov_data.led_lock);
         list_add(&lfh->lfh_list, &export->exp_lov_data.led_open_head);
         spin_unlock(&export->exp_lov_data.led_lock);
 
-        if (!set && !rc)
-                rc = -EIO;
-out_tmp:
+        GOTO(out_tmp, rc);
+ out_tmp:
         obdo_free(tmp);
-        RETURN(rc);
+ out_exp:
+        class_export_put(export);
+        return rc;
 
-out_handles:
+ out_handles:
         for (i--, loi = &lsm->lsm_oinfo[i]; i >= 0; i--, loi--) {
                 int err;
 
@@ -928,8 +1155,7 @@ out_handles:
 
                 memcpy(tmp, oa, sizeof(*tmp));
                 tmp->o_id = loi->loi_id;
-                memcpy(obdo_handle(tmp), lfh->lfh_data + i * FD_OSTDATA_SIZE,
-                       FD_OSTDATA_SIZE);
+                memcpy(obdo_handle(tmp), lfh->lfh_och + i, FD_OSTDATA_SIZE);
 
                 err = obd_close(&lov->tgts[loi->loi_ost_idx].conn, tmp,
                                 NULL, NULL);
@@ -940,9 +1166,10 @@ out_handles:
                 }
         }
 
-        OBD_FREE(lfh->lfh_data, lsm->lsm_stripe_count * FD_OSTDATA_SIZE);
-out_lfh:
-        PORTAL_SLAB_FREE(lfh, lov_file_cache, sizeof(*lfh));
+        OBD_FREE(lfh->lfh_och, lsm->lsm_stripe_count * FD_OSTDATA_SIZE);
+ out_lfh:
+        lov_lfh_destroy(lfh);
+        lov_lfh_put(lfh);
         goto out_tmp;
 }
 
@@ -959,17 +1186,17 @@ static int lov_close(struct lustre_handle *conn, struct obdo *oa,
 
         if (!lsm) {
                 CERROR("LOV requires striping ea\n");
-                RETURN(-EINVAL);
+                GOTO(out, rc = -EINVAL);
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
                 CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
-                RETURN(-EINVAL);
+                GOTO(out, rc = -EINVAL);
         }
 
         if (!export || !export->exp_obd)
-                RETURN(-ENODEV);
+                GOTO(out, rc = -ENODEV);
 
         if (oa->o_valid & OBD_MD_FLHANDLE)
                 lfh = lov_handle2lfh(obdo_handle(oa));
@@ -978,17 +1205,11 @@ static int lov_close(struct lustre_handle *conn, struct obdo *oa,
         for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
                 int err;
 
-                if (lov->tgts[loi->loi_ost_idx].active == 0) {
-                        CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
-                        continue;
-                }
-
                 /* create data objects with "parent" OA */
                 memcpy(&tmp, oa, sizeof(tmp));
                 tmp.o_id = loi->loi_id;
                 if (lfh)
-                        memcpy(obdo_handle(&tmp),
-                               lfh->lfh_data + i * FD_OSTDATA_SIZE,
+                        memcpy(obdo_handle(&tmp), lfh->lfh_och + i,
                                FD_OSTDATA_SIZE);
                 else
                         tmp.o_valid &= ~OBD_MD_FLHANDLE;
@@ -1005,50 +1226,140 @@ static int lov_close(struct lustre_handle *conn, struct obdo *oa,
                                 rc = err;
                 }
         }
-        if (lfh) {
+        if (lfh != NULL) {
                 spin_lock(&export->exp_lov_data.led_lock);
                 list_del(&lfh->lfh_list);
                 spin_unlock(&export->exp_lov_data.led_lock);
+                lov_lfh_put(lfh); /* drop the reference owned by the list */
 
-                OBD_FREE(lfh->lfh_data, lsm->lsm_stripe_count*FD_OSTDATA_SIZE);
-                PORTAL_SLAB_FREE(lfh, lov_file_cache, sizeof(*lfh));
+                OBD_FREE(lfh->lfh_och, lsm->lsm_stripe_count * FD_OSTDATA_SIZE);
+                lov_lfh_destroy(lfh);
+                lov_lfh_put(lfh); /* balance handle2lfh above */
         }
-
-        RETURN(rc);
+        GOTO(out, rc);
+ out:
+        class_export_put(export);
+        return rc;
 }
 
 #ifndef log2
 #define log2(n) ffz(~(n))
 #endif
 
-#warning FIXME: merge these two functions now that they are nearly the same
-
-/* compute ost offset in stripe "stripeno" corresponding to offset "lov_off" */
-static obd_off lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
-                                 int stripeno)
+/* we have an offset in file backed by an lov and want to find out where
+ * that offset lands in our given stripe of the file.  for the easy
+ * case where the offset is within the stripe, we just have to scale the
+ * offset down to make it relative to the stripe instead of the lov.
+ *
+ * the harder case is what to do when the offset doesn't intersect the
+ * stripe.  callers will want start offsets clamped ahead to the start
+ * of the nearest stripe in the file.  end offsets similarly clamped to the
+ * nearest ending byte of a stripe in the file:
+ *
+ * all this function does is move offsets to the nearest region of the
+ * stripe, and it does its work "mod" the full length of all the stripes.
+ * consider a file with 3 stripes:
+ *
+ *             S                                              E
+ * ---------------------------------------------------------------------
+ * |    0    |     1     |     2     |    0    |     1     |     2     |
+ * ---------------------------------------------------------------------
+ *
+ * to find stripe 1's offsets for S and E, it divides by the full stripe
+ * width and does its math in the context of a single set of stripes:
+ *
+ *             S         E
+ * -----------------------------------
+ * |    0    |     1     |     2     |
+ * -----------------------------------
+ *
+ * it'll notice that E is outside stripe 1 and clamp it to the end of the
+ * stripe, then multiply it back out by lov_off to give the real offsets in
+ * the stripe:
+ *
+ *   S                   E
+ * ---------------------------------------------------------------------
+ * |    1    |     1     |     1     |    1    |     1     |     1     |
+ * ---------------------------------------------------------------------
+ *
+ * it would have done similarly and pulled S forward to the start of a 1
+ * stripe if, say, S had landed in a 0 stripe.
+ *
+ * this rounding isn't always correct.  consider an E lov offset that lands
+ * on a 0 stripe: the "mod stripe width" math will pull it forward to the
+ * start of a 1 stripe, when it should have been rounded back to the end of
+ * the previous 1 stripe.  this case is handled by callers, and this is why:
+ *
+ * this function returns < 0 when the offset was "before" the stripe and
+ * was moved forward to the start of the stripe in question;  0 when it
+ * falls in the stripe and no shifting was done; > 0 when the offset
+ * was outside the stripe and was pulled back to its final byte. */
+static int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off,
+                             int stripeno, obd_off *obd_off)
 {
         unsigned long ssize  = lsm->lsm_stripe_size;
         unsigned long swidth = ssize * lsm->lsm_stripe_count;
         unsigned long stripe_off, this_stripe;
+        int ret = 0;
 
-        if (lov_off == OBD_OBJECT_EOF || lov_off == 0)
-                return lov_off;
+        if (lov_off == OBD_OBJECT_EOF) {
+                *obd_off = OBD_OBJECT_EOF;
+                return 0;
+        }
 
         /* do_div(a, b) returns a % b, and a = a / b */
         stripe_off = do_div(lov_off, swidth);
 
         this_stripe = stripeno * ssize;
-        if (stripe_off <= this_stripe)
+        if (stripe_off < this_stripe) {
                 stripe_off = 0;
-        else {
+                ret = -1;
+        } else {
                 stripe_off -= this_stripe;
 
-                if (stripe_off > ssize)
+                if (stripe_off >= ssize) {
                         stripe_off = ssize;
+                        ret = 1;
+                }
         }
 
+        *obd_off = lov_off * ssize + stripe_off;
+        return ret;
+}
+
+/* given an extent in an lov and a stripe, calculate the extent of the stripe
+ * that is contained within the lov extent.  this returns true if the given
+ * stripe does intersect with the lov extent. */
+static int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno,
+                                 obd_off start, obd_off end,
+                                 obd_off *obd_start, obd_off *obd_end)
+{
+        int start_side, end_side;
+
+        start_side = lov_stripe_offset(lsm, start, stripeno, obd_start);
+        end_side = lov_stripe_offset(lsm, end, stripeno, obd_end);
+
+        CDEBUG(D_INODE, "["LPU64"->"LPU64"] -> [(%d) "LPU64"->"LPU64" (%d)]\n",
+               start, end, start_side, *obd_start, *obd_end, end_side);
 
-        return lov_off * ssize + stripe_off;
+        /* this stripe doesn't intersect the file extent when neither
+         * the start nor the end intersected the stripe and obd_start and
+         * obd_end got rounded up to the same value. */
+        if (start_side != 0 && end_side != 0 && *obd_start == *obd_end)
+                return 0;
+
+        /* as mentioned in the lov_stripe_offset commentary, end
+         * might have been shifted in the wrong direction.  This
+         * happens when an end offset is before the stripe when viewed
+         * through the "mod stripe size" math. we detect it being shifted
+         * in the wrong direction and touch it up.
+         * interestingly, this can't underflow since end must be > start
+         * if we passed through the previous check.
+         * (should we assert for that somewhere?) */
+        if (end_side != 0)
+                (*obd_end)--;
+
+        return 1;
 }
 
 /* compute which stripe number "lov_off" will be written into */
@@ -1063,7 +1374,6 @@ static int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off)
         return stripe_off / ssize;
 }
 
-
 /* FIXME: maybe we'll just make one node the authoritative attribute node, then
  * we can send this 'punch' to just the authoritative node and the nodes
  * that the punch will affect. */
@@ -1081,36 +1391,39 @@ static int lov_punch(struct lustre_handle *conn, struct obdo *oa,
 
         if (!lsm) {
                 CERROR("LOV requires striping ea\n");
-                RETURN(-EINVAL);
+                GOTO(out, rc = -EINVAL);
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
                 CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
-                RETURN(-EINVAL);
+                GOTO(out, rc = -EINVAL);
         }
 
         if (!export || !export->exp_obd)
-                RETURN(-ENODEV);
+                GOTO(out, rc = -ENODEV);
 
         if (oa->o_valid & OBD_MD_FLHANDLE)
                 lfh = lov_handle2lfh(obdo_handle(oa));
 
         lov = &export->exp_obd->u.lov;
         for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
-                obd_off starti = lov_stripe_offset(lsm, start, i);
-                obd_off endi = lov_stripe_offset(lsm, end, i);
+                obd_off starti, endi;
                 int err;
 
-                if (starti == endi)
+                if (lov->tgts[loi->loi_ost_idx].active == 0) {
+                        CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                        continue;
+                }
+
+                if (!lov_stripe_intersects(lsm, i, start, end, &starti, &endi))
                         continue;
 
                 /* create data objects with "parent" OA */
                 memcpy(&tmp, oa, sizeof(tmp));
                 tmp.o_id = loi->loi_id;
                 if (lfh)
-                        memcpy(obdo_handle(&tmp),
-                               lfh->lfh_data + i * FD_OSTDATA_SIZE,
+                        memcpy(obdo_handle(&tmp), lfh->lfh_och + i,
                                FD_OSTDATA_SIZE);
                 else
                         tmp.o_valid &= ~OBD_MD_FLHANDLE;
@@ -1127,13 +1440,43 @@ static int lov_punch(struct lustre_handle *conn, struct obdo *oa,
                                 rc = err;
                 }
         }
-        RETURN(rc);
+        if (lfh != NULL)
+                lov_lfh_put(lfh);
+        GOTO(out, rc);
+ out:
+        class_export_put(export);
+        return rc;
+}
+
+static int lov_brw_check(struct lov_obd *lov, struct lov_stripe_md *lsm,
+                         obd_count oa_bufs, struct brw_page *pga)
+{
+        int i;
+
+        /* The caller just wants to know if there's a chance that this
+         * I/O can succeed */
+        for (i = 0; i < oa_bufs; i++) {
+                int stripe = lov_stripe_number(lsm, pga[i].off);
+                int ost = lsm->lsm_oinfo[stripe].loi_ost_idx;
+                struct ldlm_extent ext, subext;
+                ext.start = pga[i].off;
+                ext.end = pga[i].off + pga[i].count;
+
+                if (!lov_stripe_intersects(lsm, stripe, ext.start, ext.end,
+                                           &subext.start, &subext.end))
+                        continue;
+
+                if (lov->tgts[ost].active == 0) {
+                        CDEBUG(D_HA, "lov idx %d inactive\n", ost);
+                        return -EIO;
+                }
+        }
+        return 0;
 }
 
-static inline int lov_brw(int cmd, struct lustre_handle *conn,
-                          struct lov_stripe_md *lsm, obd_count oa_bufs,
-                          struct brw_page *pga, struct obd_brw_set *set,
-                          struct obd_trans_info *oti)
+static int lov_brw(int cmd, struct lustre_handle *conn,
+                   struct lov_stripe_md *lsm, obd_count oa_bufs,
+                   struct brw_page *pga, struct obd_trans_info *oti)
 {
         struct {
                 int bufct;
@@ -1151,20 +1494,25 @@ static inline int lov_brw(int cmd, struct lustre_handle *conn,
 
         if (!lsm) {
                 CERROR("LOV requires striping ea\n");
-                RETURN(-EINVAL);
+                GOTO(out_exp, rc = -EINVAL);
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
                 CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
-                RETURN(-EINVAL);
+                GOTO(out_exp, rc = -EINVAL);
         }
 
         lov = &export->exp_obd->u.lov;
 
+        if (cmd == OBD_BRW_CHECK) {
+                rc = lov_brw_check(lov, lsm, oa_bufs, pga);
+                GOTO(out_exp, rc);
+        }
+
         OBD_ALLOC(stripeinfo, stripe_count * sizeof(*stripeinfo));
         if (!stripeinfo)
-                GOTO(out_cbdata, rc = -ENOMEM);
+                GOTO(out_exp, rc = -ENOMEM);
 
         OBD_ALLOC(where, sizeof(*where) * oa_bufs);
         if (!where)
@@ -1194,65 +1542,159 @@ static inline int lov_brw(int cmd, struct lustre_handle *conn,
                 shift = stripeinfo[which].index + stripeinfo[which].subcount;
                 LASSERT(shift < oa_bufs);
                 ioarr[shift] = pga[i];
-                ioarr[shift].off = lov_stripe_offset(lsm, pga[i].off, which);
+                lov_stripe_offset(lsm, pga[i].off, which, &ioarr[shift].off);
                 stripeinfo[which].subcount++;
         }
 
         for (i = 0, si = stripeinfo; i < stripe_count; i++, si++) {
                 int shift = si->index;
 
+                if (lov->tgts[si->ost_idx].active == 0) {
+                        CDEBUG(D_HA, "lov idx %d inactive\n", si->ost_idx);
+                        GOTO(out_ioarr, rc = -EIO);
+                }
+
                 if (si->bufct) {
                         LASSERT(shift < oa_bufs);
                         rc = obd_brw(cmd, &lov->tgts[si->ost_idx].conn,
                                      &si->lsm, si->bufct, &ioarr[shift],
-                                     set, oti);
+                                     oti);
                         if (rc)
                                 GOTO(out_ioarr, rc);
                 }
         }
-
+        GOTO(out_ioarr, rc);
  out_ioarr:
         OBD_FREE(ioarr, sizeof(*ioarr) * oa_bufs);
  out_where:
         OBD_FREE(where, sizeof(*where) * oa_bufs);
  out_sinfo:
         OBD_FREE(stripeinfo, stripe_count * sizeof(*stripeinfo));
- out_cbdata:
-        RETURN(rc);
+ out_exp:
+        class_export_put(export);
+        return rc;
 }
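
lov_brw() spreads the caller's page array across stripes with a counting pass: where[] records each page's stripe, stripeinfo[] accumulates per-stripe counts and start indices, and ioarr[] is then filled so each stripe's pages form one contiguous run handed to obd_brw(). A small userspace sketch of just that bucketing step (the page type and geometry are invented, and the per-page offset remapping via lov_stripe_offset() is omitted):

#include <stdio.h>
#include <stdlib.h>

struct page_ref {
        unsigned long long off;        /* file offset of the page */
};

int main(void)
{
        const unsigned long long ssize = 4096;        /* stripe size */
        const int stripe_count = 3;
        struct page_ref pga[] = {
                { 0 }, { 4096 }, { 8192 }, { 12288 }, { 16384 }, { 20480 }
        };
        const int nbufs = sizeof(pga) / sizeof(pga[0]);
        int *where = malloc(nbufs * sizeof(*where));
        int *bufct = calloc(stripe_count, sizeof(*bufct));
        int *start_idx = calloc(stripe_count, sizeof(*start_idx));
        int *sub = calloc(stripe_count, sizeof(*sub));
        struct page_ref *ioarr = malloc(nbufs * sizeof(*ioarr));
        int i;

        if (!where || !bufct || !start_idx || !sub || !ioarr)
                return 1;

        /* pass 1: which stripe each page lands in, and how many per stripe */
        for (i = 0; i < nbufs; i++) {
                where[i] = (int)((pga[i].off / ssize) % stripe_count);
                bufct[where[i]]++;
        }

        /* pass 2: start index of each stripe's run in the reordered array */
        for (i = 1; i < stripe_count; i++)
                start_idx[i] = start_idx[i - 1] + bufct[i - 1];

        /* pass 3: scatter every page into its stripe's contiguous run */
        for (i = 0; i < nbufs; i++) {
                int which = where[i];
                ioarr[start_idx[which] + sub[which]++] = pga[i];
        }

        for (i = 0; i < nbufs; i++)
                printf("ioarr[%d]: file offset %llu (stripe %d)\n", i,
                       ioarr[i].off, (int)((ioarr[i].off / ssize) % stripe_count));

        free(where); free(bufct); free(start_idx); free(sub); free(ioarr);
        return 0;
}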
 
-static struct lov_lock_handles *lov_newlockh(struct lov_stripe_md *lsm)
+static int lov_brw_interpret (struct ptlrpc_request_set *set,
+                              struct lov_brw_async_args *aa, int rc)
 {
-        struct lov_lock_handles *lov_lockh;
-
-        OBD_ALLOC(lov_lockh, sizeof(*lov_lockh) +
-                  sizeof(*lov_lockh->llh_handles) * lsm->lsm_stripe_count);
-        if (!lov_lockh)
-                return NULL;
-
-        get_random_bytes(&lov_lockh->llh_cookie, sizeof(lov_lockh->llh_cookie));
+        obd_count        oa_bufs = aa->aa_oa_bufs;
+        struct brw_page *ioarr = aa->aa_ioarr;
+        ENTRY;
 
-        return lov_lockh;
+        OBD_FREE (ioarr, sizeof (*ioarr) * oa_bufs);
+        RETURN (rc);
 }
 
-/* We are only ever passed local lock handles here, so we do not need to
- * validate (and we can't really because these structs are variable sized
- * and therefore alloced, and not from a private slab).
- *
- * We just check because we can...
- */
-static struct lov_lock_handles *lov_h2lovlockh(struct lustre_handle *handle)
+static int lov_brw_async(int cmd, struct lustre_handle *conn,
+                         struct lov_stripe_md *lsm, obd_count oa_bufs,
+                         struct brw_page *pga, struct ptlrpc_request_set *set,
+                         struct obd_trans_info *oti)
 {
-        struct lov_lock_handles *lov_lockh = NULL;
+        struct {
+                int bufct;
+                int index;
+                int subcount;
+                struct lov_stripe_md lsm;
+                int ost_idx;
+        } *stripeinfo, *si, *si_last;
+        struct obd_export *export = class_conn2export(conn);
+        struct lov_obd *lov;
+        struct brw_page *ioarr;
+        struct lov_oinfo *loi;
+        struct lov_brw_async_args *aa;
+        int rc = 0, i, *where, stripe_count = lsm->lsm_stripe_count;
+        ENTRY;
+
+        if (!lsm) {
+                CERROR("LOV requires striping ea\n");
+                GOTO(out_exp, rc = -EINVAL);
+        }
+
+        if (lsm->lsm_magic != LOV_MAGIC) {
+                CERROR("LOV striping magic bad %#x != %#x\n",
+                       lsm->lsm_magic, LOV_MAGIC);
+                GOTO(out_exp, rc = -EINVAL);
+        }
+
+        lov = &export->exp_obd->u.lov;
+
+        if (cmd == OBD_BRW_CHECK) {
+                rc = lov_brw_check(lov, lsm, oa_bufs, pga);
+                GOTO(out_exp, rc);
+        }
+
+        OBD_ALLOC(stripeinfo, stripe_count * sizeof(*stripeinfo));
+        if (!stripeinfo)
+                GOTO(out_exp, rc = -ENOMEM);
+
+        OBD_ALLOC(where, sizeof(*where) * oa_bufs);
+        if (!where)
+                GOTO(out_sinfo, rc = -ENOMEM);
+
+        OBD_ALLOC(ioarr, sizeof(*ioarr) * oa_bufs);
+        if (!ioarr)
+                GOTO(out_where, rc = -ENOMEM);
+
+        for (i = 0; i < oa_bufs; i++) {
+                where[i] = lov_stripe_number(lsm, pga[i].off);
+                stripeinfo[where[i]].bufct++;
+        }
+
+        for (i = 0, loi = lsm->lsm_oinfo, si_last = si = stripeinfo;
+             i < stripe_count; i++, loi++, si_last = si, si++) {
+                if (i > 0)
+                        si->index = si_last->index + si_last->bufct;
+                si->lsm.lsm_object_id = loi->loi_id;
+                si->ost_idx = loi->loi_ost_idx;
+        }
+
+        for (i = 0; i < oa_bufs; i++) {
+                int which = where[i];
+                int shift;
+
+                shift = stripeinfo[which].index + stripeinfo[which].subcount;
+                LASSERT(shift < oa_bufs);
+                ioarr[shift] = pga[i];
+                lov_stripe_offset(lsm, pga[i].off, which, &ioarr[shift].off);
+                stripeinfo[which].subcount++;
+        }
 
-        if (!handle || !handle->addr)
-                RETURN(NULL);
+        for (i = 0, si = stripeinfo; i < stripe_count; i++, si++) {
+                int shift = si->index;
+
+                if (si->bufct == 0)
+                        continue;
 
-        lov_lockh = (struct lov_lock_handles *)(unsigned long)(handle->addr);
-        if (lov_lockh->llh_cookie != handle->cookie)
-                RETURN(NULL);
+                if (lov->tgts[si->ost_idx].active == 0) {
+                        CDEBUG(D_HA, "lov idx %d inactive\n", si->ost_idx);
+                        GOTO(out_ioarr, rc = -EIO);
+                }
 
-        return lov_lockh;
+                LASSERT(shift < oa_bufs);
+                rc = obd_brw_async(cmd, &lov->tgts[si->ost_idx].conn,
+                                   &si->lsm, si->bufct, &ioarr[shift],
+                                   set, oti);
+                if (rc)
+                        GOTO(out_ioarr, rc);
+        }
+        LASSERT (rc == 0);
+        LASSERT (set->set_interpret == NULL);
+        set->set_interpret = lov_brw_interpret;
+        LASSERT (sizeof (set->set_args) >= sizeof (struct lov_brw_async_args));
+        aa = (struct lov_brw_async_args *)&set->set_args;
+        aa->aa_oa_bufs = oa_bufs;
+        aa->aa_ioarr = ioarr;
+        GOTO(out_where, rc);
+ out_ioarr:
+        OBD_FREE(ioarr, sizeof(*ioarr) * oa_bufs);
+ out_where:
+        OBD_FREE(where, sizeof(*where) * oa_bufs);
+ out_sinfo:
+        OBD_FREE(stripeinfo, stripe_count * sizeof(*stripeinfo));
+ out_exp:
+        class_export_put(export);
+        return rc;
 }
 
 static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm,
@@ -1267,35 +1709,33 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm,
         struct lov_obd *lov;
         struct lov_oinfo *loi;
         struct lov_stripe_md submd;
-        ldlm_error_t rc = ELDLM_LOCK_MATCHED, err;
+        ldlm_error_t rc;
         int i;
         ENTRY;
 
         if (!lsm) {
                 CERROR("LOV requires striping ea\n");
-                RETURN(-EINVAL);
+                GOTO(out_exp, rc = -EINVAL);
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
                 CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
-                RETURN(-EINVAL);
+                GOTO(out_exp, rc = -EINVAL);
         }
 
-        /* we should never be asked to replay a lock. */
-
+        /* we should never be asked to replay a lock this way. */
         LASSERT((*flags & LDLM_FL_REPLAY) == 0);
 
         if (!export || !export->exp_obd)
-                RETURN(-ENODEV);
+                GOTO(out_exp, rc = -ENODEV);
 
         if (lsm->lsm_stripe_count > 1) {
-                lov_lockh = lov_newlockh(lsm);
-                if (!lov_lockh)
-                        RETURN(-ENOMEM);
+                lov_lockh = lov_llh_new(lsm);
+                if (lov_lockh == NULL)
+                        GOTO(out_exp, rc = -ENOMEM);
 
-                lockh->addr = (__u64)(unsigned long)lov_lockh;
-                lockh->cookie = lov_lockh->llh_cookie;
+                lockh->cookie = lov_lockh->llh_handle.h_cookie;
                 lov_lockhp = lov_lockh->llh_handles;
         } else {
                 lov_lockhp = lockh;
@@ -1307,32 +1747,27 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm,
                 struct ldlm_extent *extent = (struct ldlm_extent *)cookie;
                 struct ldlm_extent sub_ext;
 
+                *flags = 0;
+                if (!lov_stripe_intersects(lsm, i, extent->start, extent->end,
+                                           &sub_ext.start, &sub_ext.end))
+                        continue;
+
                 if (lov->tgts[loi->loi_ost_idx].active == 0) {
                         CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
                         continue;
                 }
 
-                *flags = 0;
-                sub_ext.start = lov_stripe_offset(lsm, extent->start, i);
-                sub_ext.end = lov_stripe_offset(lsm, extent->end, i);
-                if (sub_ext.start == sub_ext.end /* || !active */)
-                        continue;
-
                 /* XXX LOV STACKING: submd should be from the subobj */
                 submd.lsm_object_id = loi->loi_id;
                 submd.lsm_stripe_count = 0;
                 /* XXX submd is not fully initialized here */
                 *flags = 0;
-                err = obd_enqueue(&(lov->tgts[loi->loi_ost_idx].conn), &submd,
+                rc = obd_enqueue(&(lov->tgts[loi->loi_ost_idx].conn), &submd,
                                   parent_lock, type, &sub_ext, sizeof(sub_ext),
                                   mode, flags, cb, data, datalen, lov_lockhp);
 
                 // XXX add a lock debug statement here
-                /* return _MATCHED only when all locks matched.. */
-                if (err == ELDLM_OK) {
-                        rc = ELDLM_OK;
-                } else if (err != ELDLM_LOCK_MATCHED) {
-                        rc = err;
+                if (rc != ELDLM_OK) {
                         memset(lov_lockhp, 0, sizeof(*lov_lockhp));
                         if (lov->tgts[loi->loi_ost_idx].active) {
                                 CERROR("error: enqueue objid "LPX64" subobj "
@@ -1343,15 +1778,16 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm,
                         }
                 }
         }
-        RETURN(rc);
+        if (lsm->lsm_stripe_count > 1)
+                lov_llh_put(lov_lockh);
+        GOTO(out_exp, rc = ELDLM_OK);
 
-out_locks:
+ out_locks:
         while (loi--, lov_lockhp--, i-- > 0) {
                 struct lov_stripe_md submd;
                 int err;
 
-                if (lov_lockhp->cookie == 0 ||
-                    lov->tgts[loi->loi_ost_idx].active == 0)
+                if (lov_lockhp->cookie == 0)
                         continue;
 
                 /* XXX LOV STACKING: submd should be from the subobj */
@@ -1367,13 +1803,112 @@ out_locks:
         }
 
         if (lsm->lsm_stripe_count > 1) {
-                lov_lockh->llh_cookie = DEAD_HANDLE_MAGIC;
-                OBD_FREE(lov_lockh, sizeof(*lov_lockh) +
-                          sizeof(*lov_lockh->llh_handles) *
-                          lsm->lsm_stripe_count);
+                lov_llh_destroy(lov_lockh);
+                lov_llh_put(lov_lockh);
+        }
+ out_exp:
+        class_export_put(export);
+        RETURN(rc);
+}
+
+static int lov_match(struct lustre_handle *conn, struct lov_stripe_md *lsm,
+                       __u32 type, void *cookie, int cookielen, __u32 mode,
+                       int *flags, struct lustre_handle *lockh)
+{
+        struct obd_export *export = class_conn2export(conn);
+        struct lov_lock_handles *lov_lockh = NULL;
+        struct lustre_handle *lov_lockhp;
+        struct lov_obd *lov;
+        struct lov_oinfo *loi;
+        struct lov_stripe_md submd;
+        ldlm_error_t rc = 0;
+        int i;
+        ENTRY;
+
+        if (!lsm) {
+                CERROR("LOV requires striping ea\n");
+                GOTO(out_exp, rc = -EINVAL);
+        }
+
+        if (lsm->lsm_magic != LOV_MAGIC) {
+                CERROR("LOV striping magic bad %#x != %#x\n",
+                       lsm->lsm_magic, LOV_MAGIC);
+                GOTO(out_exp, rc = -EINVAL);
+        }
+
+        if (!export || !export->exp_obd)
+                GOTO(out_exp, rc = -ENODEV);
+
+        if (lsm->lsm_stripe_count > 1) {
+                lov_lockh = lov_llh_new(lsm);
+                if (lov_lockh == NULL)
+                        GOTO(out_exp, rc = -ENOMEM);
+
+                lockh->cookie = lov_lockh->llh_handle.h_cookie;
+                lov_lockhp = lov_lockh->llh_handles;
+        } else {
+                lov_lockhp = lockh;
         }
-        lockh->cookie = DEAD_HANDLE_MAGIC;
 
+        lov = &export->exp_obd->u.lov;
+        for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count;
+             i++, loi++, lov_lockhp++) {
+                struct ldlm_extent *extent = (struct ldlm_extent *)cookie;
+                struct ldlm_extent sub_ext;
+                int lov_flags;
+
+                if (!lov_stripe_intersects(lsm, i, extent->start, extent->end,
+                                           &sub_ext.start, &sub_ext.end))
+                        continue;
+
+                if (lov->tgts[loi->loi_ost_idx].active == 0) {
+                        CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+                        rc = -EIO;
+                        break;
+                }
+
+                /* XXX LOV STACKING: submd should be from the subobj */
+                submd.lsm_object_id = loi->loi_id;
+                submd.lsm_stripe_count = 0;
+                lov_flags = *flags;
+                /* XXX submd is not fully initialized here */
+                rc = obd_match(&(lov->tgts[loi->loi_ost_idx].conn), &submd,
+                               type, &sub_ext, sizeof(sub_ext), mode,
+                               &lov_flags, lov_lockhp);
+                if (rc != 1)
+                        break;
+        }
+        if (rc == 1) {
+                if (lsm->lsm_stripe_count > 1)
+                        lov_llh_put(lov_lockh);
+                GOTO(out_exp, 1);
+        }
+
+        while (loi--, lov_lockhp--, i-- > 0) {
+                struct lov_stripe_md submd;
+                int err;
+
+                if (lov_lockhp->cookie == 0)
+                        continue;
+
+                /* XXX LOV STACKING: submd should be from the subobj */
+                submd.lsm_object_id = loi->loi_id;
+                submd.lsm_stripe_count = 0;
+                err = obd_cancel(&lov->tgts[loi->loi_ost_idx].conn, &submd,
+                                 mode, lov_lockhp);
+                if (err && lov->tgts[loi->loi_ost_idx].active) {
+                        CERROR("error: cancelling objid "LPX64" on OST "
+                               "idx %d after match failure: rc = %d\n",
+                               loi->loi_id, loi->loi_ost_idx, err);
+                }
+        }
+
+        if (lsm->lsm_stripe_count > 1) {
+                lov_llh_destroy(lov_lockh);
+                lov_llh_put(lov_lockh);
+        }
+ out_exp:
+        class_export_put(export);
         RETURN(rc);
 }
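
lov_enqueue() and lov_match() take one lock per intersecting stripe, recording each sub-lock handle in llh_handles[]; when a stripe fails, the backward loop (out_locks in lov_enqueue, the while loop above in lov_match) walks over the handles already filled in and cancels them. A toy userspace sketch of that acquire-all-or-roll-back shape (acquire()/release(), the handle values and the failing stripe are invented):

#include <stdio.h>

#define NSTRIPES 4

/* hypothetical per-stripe lock acquire/release; stripe 2 refuses to lock */
static int acquire(int stripe, int *handle)
{
        if (stripe == 2)
                return -1;
        *handle = 1000 + stripe;        /* pretend lock handle */
        return 0;
}

static void release(int handle)
{
        printf("released handle %d\n", handle);
}

/* take a lock on every stripe, or roll back the ones already taken */
static int lock_all(int *handles)
{
        int i, rc = 0;

        for (i = 0; i < NSTRIPES; i++) {
                handles[i] = 0;
                rc = acquire(i, &handles[i]);
                if (rc)
                        goto rollback;
        }
        return 0;

 rollback:
        while (i-- > 0)
                if (handles[i] != 0)    /* skip stripes that were never locked */
                        release(handles[i]);
        return rc;
}

int main(void)
{
        int handles[NSTRIPES];

        printf("lock_all rc = %d\n", lock_all(handles));
        return 0;
}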
 
@@ -1390,29 +1925,30 @@ static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm,
 
         if (!lsm) {
                 CERROR("LOV requires striping ea\n");
-                RETURN(-EINVAL);
+                GOTO(out, rc = -EINVAL);
         }
 
         if (lsm->lsm_magic != LOV_MAGIC) {
                 CERROR("LOV striping magic bad %#x != %#x\n",
                        lsm->lsm_magic, LOV_MAGIC);
-                RETURN(-EINVAL);
+                GOTO(out, rc = -EINVAL);
         }
 
         if (!export || !export->exp_obd)
-                RETURN(-ENODEV);
+                GOTO(out, rc = -ENODEV);
 
         LASSERT(lockh);
         if (lsm->lsm_stripe_count > 1) {
-                lov_lockh = lov_h2lovlockh(lockh);
+                lov_lockh = lov_handle2llh(lockh);
                 if (!lov_lockh) {
                         CERROR("LOV: invalid lov lock handle %p\n", lockh);
-                        RETURN(-EINVAL);
+                        GOTO(out, rc = -EINVAL);
                 }
 
                 lov_lockhp = lov_lockh->llh_handles;
-        } else
+        } else {
                 lov_lockhp = lockh;
+        }
 
         lov = &export->exp_obd->u.lov;
         for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count;
@@ -1421,7 +1957,8 @@ static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm,
                 int err;
 
                 if (lov_lockhp->cookie == 0) {
-                        CDEBUG(D_HA, "lov idx %d no lock?\n", loi->loi_ost_idx);
+                        CDEBUG(D_HA, "lov idx %d subobj "LPX64" no lock?\n",
+                               loi->loi_ost_idx, loi->loi_id);
                         continue;
                 }
 
@@ -1442,19 +1979,18 @@ static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm,
                 }
         }
 
-        if (lsm->lsm_stripe_count > 1) {
-                lov_lockh->llh_cookie = DEAD_HANDLE_MAGIC;
-                OBD_FREE(lov_lockh, sizeof(*lov_lockh) +
-                          sizeof(*lov_lockh->llh_handles) *
-                          lsm->lsm_stripe_count);
-        }
-        lockh->cookie = DEAD_HANDLE_MAGIC;
-
-        RETURN(rc);
+        if (lsm->lsm_stripe_count > 1)
+                lov_llh_destroy(lov_lockh);
+        if (lov_lockh != NULL)
+                lov_llh_put(lov_lockh);
+        GOTO(out, rc);
+ out:
+        class_export_put(export);
+        return rc;
 }
 
 static int lov_cancel_unused(struct lustre_handle *conn,
-                             struct lov_stripe_md *lsm, int flags)
+                             struct lov_stripe_md *lsm, int flags, void *opaque)
 {
         struct obd_export *export = class_conn2export(conn);
         struct lov_obd *lov;
@@ -1464,21 +2000,24 @@ static int lov_cancel_unused(struct lustre_handle *conn,
 
         if (!lsm) {
                 CERROR("LOV requires striping ea for lock cancellation\n");
-                RETURN(-EINVAL);
+                GOTO(out, rc = -EINVAL);
         }
 
         if (!export || !export->exp_obd)
-                RETURN(-ENODEV);
+                GOTO(out, rc = -ENODEV);
 
         lov = &export->exp_obd->u.lov;
         for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
                 struct lov_stripe_md submd;
                 int err;
 
+                if (lov->tgts[loi->loi_ost_idx].active == 0)
+                        CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
+
                 submd.lsm_object_id = loi->loi_id;
                 submd.lsm_stripe_count = 0;
                 err = obd_cancel_unused(&lov->tgts[loi->loi_ost_idx].conn,
-                                       &submd, flags);
+                                        &submd, flags, opaque);
                 if (err && lov->tgts[loi->loi_ost_idx].active) {
                         CERROR("error: cancel unused objid "LPX64" subobj "LPX64
                                " on OST idx %d: rc = %d\n", lsm->lsm_object_id,
@@ -1487,10 +2026,21 @@ static int lov_cancel_unused(struct lustre_handle *conn,
                                 rc = err;
                 }
         }
-
-        RETURN(rc);
+        GOTO(out, rc);
+ out:
+        class_export_put(export);
+        return rc;
 }
 
+#define LOV_U64_MAX ((__u64)~0ULL)
+#define LOV_SUM_MAX(tot, add)                                           \
+        do {                                                            \
+                if ((tot) + (add) < (tot))                              \
+                        (tot) = LOV_U64_MAX;                            \
+                else                                                    \
+                        (tot) += (add);                                 \
+        } while(0)
+
 static int lov_statfs(struct lustre_handle *conn, struct obd_statfs *osfs)
 {
         struct obd_export *export = class_conn2export(conn);
@@ -1502,7 +2052,7 @@ static int lov_statfs(struct lustre_handle *conn, struct obd_statfs *osfs)
         ENTRY;
 
         if (!export || !export->exp_obd)
-                RETURN(-ENODEV);
+                GOTO(out, rc = -ENODEV);
 
         lov = &export->exp_obd->u.lov;
 
@@ -1539,14 +2089,30 @@ static int lov_statfs(struct lustre_handle *conn, struct obd_statfs *osfs)
                          *     if one of the OBDs has no more objects left)
                          *   - could be sum if we stripe whole objects
                          *   - could be average, just to give a nice number
-                         *   - we just pick first OST and hope it is enough
-                        sfs->f_ffree += lov_sfs.f_ffree;
+                         *
+                         * To give a "reasonable" (if not wholly accurate)
+                         * number, we divide the total number of free objects
+                         * by expected stripe count (watch out for overflow).
                          */
+                        LOV_SUM_MAX(osfs->os_files, lov_sfs.os_files);
+                        LOV_SUM_MAX(osfs->os_ffree, lov_sfs.os_ffree);
                 }
         }
-        if (!set && !rc)
+        if (set) {
+                __u32 expected_stripes = lov->desc.ld_default_stripe_count ?
+                                         lov->desc.ld_default_stripe_count :
+                                         lov->desc.ld_active_tgt_count;
+
+                if (osfs->os_files != LOV_U64_MAX)
+                        do_div(osfs->os_files, expected_stripes);
+                if (osfs->os_ffree != LOV_U64_MAX)
+                        do_div(osfs->os_ffree, expected_stripes);
+        } else if (!rc)
                 rc = -EIO;
-        RETURN(rc);
+        GOTO(out, rc);
+ out:
+        class_export_put(export);
+        return rc;
 }
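
lov_statfs() accumulates per-OST object counts with LOV_SUM_MAX(), which saturates at the 64-bit maximum instead of wrapping, and then divides the totals by the expected stripe count to report a rough number of creatable files. A minimal sketch of that saturating accumulate-then-divide step (the sample values and helper names are arbitrary):

#include <stdio.h>
#include <stdint.h>

#define U64_MAX  ((uint64_t)~0ULL)

/* add "add" into "tot", clamping at U64_MAX on overflow (the same idea as
 * LOV_SUM_MAX in the patch) */
static void sum_max(uint64_t *tot, uint64_t add)
{
        if (*tot + add < *tot)
                *tot = U64_MAX;
        else
                *tot += add;
}

int main(void)
{
        uint64_t per_ost_ffree[] = { 1000000, 2500000, 1500000 };
        uint64_t total = 0;
        unsigned expected_stripes = 2;   /* e.g. the default stripe count */
        int i;

        for (i = 0; i < 3; i++)
                sum_max(&total, per_ost_ffree[i]);

        if (total != U64_MAX)
                total /= expected_stripes;
        printf("reported free objects: %llu\n", (unsigned long long)total);

        /* overflow case: the total clamps rather than wrapping */
        total = U64_MAX - 1;
        sum_max(&total, 10);
        printf("clamped total: %s\n", total == U64_MAX ? "U64_MAX" : "wrapped");
        return 0;
}

Run as written it prints 2500000: five million free objects spread over an expected two stripes per file.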
 
 static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
@@ -1601,7 +2167,7 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
                 rc = copy_to_user((void *)uarg, buf, len);
                 if (rc)
                         rc = -EFAULT;
-                OBD_FREE(buf, len);
+                obd_ioctl_freedata(buf, len);
                 break;
         }
         case LL_IOC_LOV_SETSTRIPE:
@@ -1639,6 +2205,49 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
         RETURN(rc);
 }
 
+static int lov_get_info(struct lustre_handle *conn, __u32 keylen,
+                        void *key, __u32 *vallen, void *val)
+{
+        struct obd_device *obddev = class_conn2obd(conn);
+        struct lov_obd *lov = &obddev->u.lov;
+        int i;
+        ENTRY;
+
+        if (!vallen || !val)
+                RETURN(-EFAULT);
+
+        if (keylen > strlen("lock_to_stripe") &&
+            strcmp(key, "lock_to_stripe") == 0) {
+                struct {
+                        char name[16];
+                        struct ldlm_lock *lock;
+                        struct lov_stripe_md *lsm;
+                } *data = key;
+                __u32 *stripe = val;
+                struct lov_oinfo *loi;
+
+                if (*vallen < sizeof(*stripe))
+                        RETURN(-EFAULT);
+                *vallen = sizeof(*stripe);
+
+                /* XXX This is another one of those bits that will need to
+                 * change if we ever actually support nested LOVs.  It uses
+                 * the lock's connection to find out which stripe it is. */
+                for (i = 0, loi = data->lsm->lsm_oinfo;
+                     i < data->lsm->lsm_stripe_count;
+                     i++, loi++) {
+                        if (lov->tgts[loi->loi_ost_idx].conn.cookie ==
+                            data->lock->l_connh->cookie) {
+                                *stripe = i;
+                                RETURN(0);
+                        }
+                }
+                RETURN(-ENXIO);
+        }
+
+        RETURN(-EINVAL);
+}
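
lov_get_info() answers string-keyed queries; for "lock_to_stripe" the key buffer carries a 16-byte name followed by the lock and lsm pointers, and the stripe index comes back through *val. A hedged userspace sketch of that keyed-struct convention (the handler, the stand-in types and the error constants below are illustrative, not the obd_get_info API):

#include <stdio.h>
#include <string.h>

/* stand-ins for the kernel types referenced by the "lock_to_stripe" key */
struct ldlm_lock      { int which_conn; };
struct lov_stripe_md  { int stripe_count; int conn_of_stripe[4]; };

/* key layout expected by the handler: name, then the two pointers */
struct lock_to_stripe_key {
        char                  name[16];
        struct ldlm_lock     *lock;
        struct lov_stripe_md *lsm;
};

/* toy handler in the spirit of lov_get_info(): find which stripe's
 * connection the lock was taken on */
static int get_info(unsigned keylen, void *key, unsigned *vallen, void *val)
{
        if (keylen > strlen("lock_to_stripe") &&
            strcmp(key, "lock_to_stripe") == 0) {
                struct lock_to_stripe_key *data = key;
                unsigned *stripe = val;
                int i;

                if (*vallen < sizeof(*stripe))
                        return -14;                     /* -EFAULT analogue */
                *vallen = sizeof(*stripe);

                for (i = 0; i < data->lsm->stripe_count; i++)
                        if (data->lsm->conn_of_stripe[i] == data->lock->which_conn) {
                                *stripe = i;
                                return 0;
                        }
                return -6;                              /* -ENXIO analogue */
        }
        return -22;                                     /* -EINVAL analogue */
}

int main(void)
{
        struct lov_stripe_md lsm = { 3, { 10, 11, 12 } };
        struct ldlm_lock lock = { 11 };
        struct lock_to_stripe_key key = { "lock_to_stripe", &lock, &lsm };
        unsigned stripe = 0, vallen = sizeof(stripe);
        int rc = get_info(sizeof(key), &key, &vallen, &stripe);

        printf("rc = %d, stripe = %u\n", rc, stripe);
        return 0;
}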
+
 struct obd_ops lov_obd_ops = {
         o_owner:       THIS_MODULE,
         o_attach:      lov_attach,
@@ -1652,15 +2261,19 @@ struct obd_ops lov_obd_ops = {
         o_create:      lov_create,
         o_destroy:     lov_destroy,
         o_getattr:     lov_getattr,
+        o_getattr_async: lov_getattr_async,
         o_setattr:     lov_setattr,
         o_open:        lov_open,
         o_close:       lov_close,
         o_brw:         lov_brw,
+        o_brw_async:   lov_brw_async,
         o_punch:       lov_punch,
         o_enqueue:     lov_enqueue,
+        o_match:       lov_match,
         o_cancel:      lov_cancel,
         o_cancel_unused: lov_cancel_unused,
-        o_iocontrol:   lov_iocontrol
+        o_iocontrol:   lov_iocontrol,
+        o_get_info:    lov_get_info
 };
 
 int __init lov_init(void)
@@ -1670,12 +2283,6 @@ int __init lov_init(void)
 
         printk(KERN_INFO "Lustre Logical Object Volume driver; "
                "info@clusterfs.com\n");
-        lov_file_cache = kmem_cache_create("ll_lov_file_data",
-                                           sizeof(struct lov_file_handles),
-                                           0, 0, NULL, NULL);
-        if (!lov_file_cache)
-                RETURN(-ENOMEM);
-
         lprocfs_init_vars(&lvars);
         rc = class_register_type(&lov_obd_ops, lvars.module_vars,
                                  OBD_LOV_DEVICENAME);
@@ -1684,8 +2291,6 @@ int __init lov_init(void)
 
 static void __exit lov_exit(void)
 {
-        if (kmem_cache_destroy(lov_file_cache))
-                CERROR("couldn't free LOV open cache\n");
         class_unregister_type(OBD_LOV_DEVICENAME);
 }
 
index 463dd72..620dd5c 100644
 #include <linux/obd_class.h>
 #include <linux/obd_support.h>
 
-/* lov_packdesc() is in mds/mds_lov.c */
-void lov_unpackdesc(struct lov_desc *ld)
-{
-        ld->ld_tgt_count = NTOH__u32(ld->ld_tgt_count);
-        ld->ld_default_stripe_count = HTON__u32(ld->ld_default_stripe_count);
-        ld->ld_default_stripe_size = HTON__u32(ld->ld_default_stripe_size);
-        ld->ld_pattern = HTON__u32(ld->ld_pattern);
-}
-
 void lov_dump_lmm(int level, struct lov_mds_md *lmm)
 {
         struct lov_object_id *loi;
@@ -65,7 +56,8 @@ do {                                                                    \
         LASSERT(test); /* so we know what assertion failed */           \
 } while(0)
 
-/* Pack LOV object metadata for shipment to the MDS.
+/* Pack LOV object metadata for disk storage.  It is packed in LE byte
+ * order and is opaque to the networking layer.
  *
  * XXX In the future, this will be enhanced to get the EA size from the
  *     underlying OSC device(s) to get their EA sizes so we can stack
@@ -108,8 +100,7 @@ int lov_packmd(struct lustre_handle *conn, struct lov_mds_md **lmmp,
                 RETURN(lmm_size);
 
         if (*lmmp && !lsm) {
-                /* endianness */
-                ost_count = ((*lmmp)->lmm_ost_count);
+                ost_count = le32_to_cpu ((*lmmp)->lmm_ost_count);
                 OBD_FREE(*lmmp, lov_mds_md_size(ost_count));
                 *lmmp = NULL;
                 RETURN(0);
@@ -122,25 +113,24 @@ int lov_packmd(struct lustre_handle *conn, struct lov_mds_md **lmmp,
         }
 
         lmm = *lmmp;
+        lmm->lmm_magic = cpu_to_le32 (LOV_MAGIC);
+        lmm->lmm_ost_count = cpu_to_le16 (ost_count);
 
-        lmm->lmm_stripe_count = (stripe_count);
         if (!lsm)
                 RETURN(lmm_size);
 
-        /* XXX endianness */
-        lmm->lmm_magic = (lsm->lsm_magic);
-        lmm->lmm_object_id = (lsm->lsm_object_id);
-        LASSERT(lsm->lsm_object_id);
-        lmm->lmm_stripe_size = (lsm->lsm_stripe_size);
-        lmm->lmm_stripe_offset = (lsm->lsm_stripe_offset);
-        lmm->lmm_ost_count = (ost_count);
+        lmm->lmm_object_id = cpu_to_le64 (lsm->lsm_object_id);
+        lmm->lmm_stripe_count = cpu_to_le16 (stripe_count);
+        lmm->lmm_stripe_size = cpu_to_le32 (lsm->lsm_stripe_size);
+        lmm->lmm_stripe_offset = cpu_to_le32 (lsm->lsm_stripe_offset);
 
         /* Only fill in the object ids which we are actually using.
          * Assumes lmm_objects is otherwise zero-filled. */
         for (i = 0, loi = lsm->lsm_oinfo; i < stripe_count; i++, loi++) {
                 /* XXX call down to osc_packmd() to do the packing */
-                LASSERT(loi->loi_id);
-                lmm->lmm_objects[loi->loi_ost_idx].l_object_id = (loi->loi_id);
+                LASSERT (loi->loi_id);
+                lmm->lmm_objects[loi->loi_ost_idx].l_object_id = 
+                        cpu_to_le64 (loi->loi_id);
         }
 
         RETURN(lmm_size);
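
The packmd hunk above converts each lov_mds_md field with cpu_to_le16/32/64 so the on-disk EA is always little-endian, regardless of the host CPU's byte order. A rough userspace sketch of the same packing idea, using glibc's <endian.h> helpers and a made-up record layout (this is not the real lov_mds_md structure):

/* Sketch: store a hypothetical record in little-endian order independent
 * of host byte order.  Field names and sizes are illustrative only. */
#include <endian.h>
#include <stdint.h>

struct disk_rec {
        uint32_t magic;
        uint16_t stripe_count;
        uint16_t ost_count;
        uint64_t object_id;
};

static void pack_disk_rec(struct disk_rec *d, uint32_t magic,
                          uint16_t stripes, uint16_t osts, uint64_t oid)
{
        d->magic        = htole32(magic);   /* cpu_to_le32() analogue */
        d->stripe_count = htole16(stripes); /* cpu_to_le16() analogue */
        d->ost_count    = htole16(osts);
        d->object_id    = htole64(oid);     /* cpu_to_le64() analogue */
}
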
@@ -156,14 +146,17 @@ static int lov_get_stripecnt(struct lov_obd *lov, int stripe_count)
         return stripe_count;
 }
 
+/* Unpack LOV object metadata from disk storage.  It is packed in LE byte
+ * order and is opaque to the networking layer.
+ */
 int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
-                 struct lov_mds_md *lmm)
+                 struct lov_mds_md *lmm, int lmm_bytes)
 {
         struct obd_device *obd = class_conn2obd(conn);
         struct lov_obd *lov = &obd->u.lov;
         struct lov_stripe_md *lsm;
         struct lov_oinfo *loi;
-        int ost_count;
+        int ost_count = 0;
         int ost_offset = 0;
         int stripe_count;
         int lsm_size;
@@ -171,14 +164,31 @@ int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
         ENTRY;
 
         if (lmm) {
-                /* endianness */
-                if (lmm->lmm_magic != LOV_MAGIC) {
-                        CERROR("bad wire LOV MAGIC: %#08x != %#08x\n",
-                               lmm->lmm_magic, LOV_MAGIC);
+                if (lmm_bytes < sizeof (*lmm)) {
+                        CERROR("lov_mds_md too small: %d, need %d\n",
+                                lmm_bytes, (int)sizeof(*lmm));
+                        RETURN(-EINVAL);
+                }
+                if (le32_to_cpu (lmm->lmm_magic) != LOV_MAGIC) {
+                        CERROR("bad disk LOV MAGIC: %#08x != %#08x\n",
+                               le32_to_cpu (lmm->lmm_magic), LOV_MAGIC);
                         RETURN(-EINVAL);
                 }
-                stripe_count = (lmm->lmm_stripe_count);
-                LASSERT(stripe_count);
+
+                ost_count = le16_to_cpu (lmm->lmm_ost_count);
+                stripe_count = le16_to_cpu (lmm->lmm_stripe_count);
+
+                if (ost_count == 0 || stripe_count == 0) {
+                        CERROR ("zero ost %d or stripe %d count\n",
+                                ost_count, stripe_count);
+                        RETURN (-EINVAL);
+                }
+
+                if (lmm_bytes < lov_mds_md_size (ost_count)) {
+                        CERROR ("lov_mds_md too small: %d, need %d\n",
+                                lmm_bytes, lov_mds_md_size (ost_count));
+                        RETURN (-EINVAL);
+                }
         } else
                 stripe_count = lov_get_stripecnt(lov, 0);
 
@@ -202,18 +212,16 @@ int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
         }
 
         lsm = *lsmp;
-
+        lsm->lsm_magic = LOV_MAGIC;
         lsm->lsm_stripe_count = stripe_count;
+        lsm->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count;
+
         if (!lmm)
                 RETURN(lsm_size);
 
-        /* XXX endianness */
-        ost_offset = lsm->lsm_stripe_offset = (lmm->lmm_stripe_offset);
-        lsm->lsm_magic = (lmm->lmm_magic);
-        lsm->lsm_object_id = (lmm->lmm_object_id);
-        lsm->lsm_stripe_size = (lmm->lmm_stripe_size);
-
-        ost_count = (lmm->lmm_ost_count);
+        lsm->lsm_object_id = le64_to_cpu (lmm->lmm_object_id);
+        lsm->lsm_stripe_size = le32_to_cpu (lmm->lmm_stripe_size);
+        ost_offset = lsm->lsm_stripe_offset = le32_to_cpu (lmm->lmm_stripe_offset);
 
         LMM_ASSERT(lsm->lsm_object_id);
         LMM_ASSERT(ost_count);
@@ -226,7 +234,7 @@ int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
 
                 LMM_ASSERT(loi - lsm->lsm_oinfo < stripe_count);
                 /* XXX LOV STACKING call down to osc_unpackmd() */
-                loi->loi_id = (lmm->lmm_objects[ost_offset].l_object_id);
+                loi->loi_id = le64_to_cpu (lmm->lmm_objects[ost_offset].l_object_id);
                 loi->loi_ost_idx = ost_offset;
                 loi++;
         }
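
The unpack path above only trusts the record after a series of checks: the buffer must at least hold the fixed header, the magic must match after le32_to_cpu(), the ost and stripe counts must be non-zero, and the buffer must cover every object entry before the per-stripe ids are converted with le64_to_cpu(). A minimal userspace analogue of that checking order (hypothetical layout and magic, not Lustre's):

/* Sketch: validate a little-endian on-disk record before unpacking it.
 * The layout and magic are made up; only the order of checks matters. */
#include <endian.h>
#include <stddef.h>
#include <stdint.h>

#define REC_MAGIC 0x0bd00bd0u

struct disk_rec {
        uint32_t magic;
        uint16_t ost_count;
        uint16_t pad;
        uint64_t objects[];              /* ost_count entries follow */
};

static int check_disk_rec(const struct disk_rec *r, size_t nbytes)
{
        uint16_t osts;

        if (nbytes < sizeof(*r))
                return -1;               /* too small for the header */
        if (le32toh(r->magic) != REC_MAGIC)
                return -1;               /* bad or foreign magic */
        osts = le16toh(r->ost_count);
        if (osts == 0)
                return -1;               /* an empty record is invalid */
        if (nbytes < sizeof(*r) + osts * sizeof(r->objects[0]))
                return -1;               /* object array is truncated */
        return 0;
}
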
@@ -258,8 +266,10 @@ int lov_setstripe(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
         if (rc)
                 RETURN(-EFAULT);
 
+        /* Bug 1185 FIXME: struct lov_mds_md is little-endian everywhere else */
+
         if (lmm.lmm_magic != LOV_MAGIC) {
-                CERROR("bad wire LOV MAGIC: %#08x != %#08x\n",
+                CERROR("bad userland LOV MAGIC: %#08x != %#08x\n",
                        lmm.lmm_magic, LOV_MAGIC);
                 RETURN(-EINVAL);
         }
@@ -299,6 +309,7 @@ int lov_setstripe(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
         lsm->lsm_stripe_count = stripe_count;
         lsm->lsm_stripe_offset = lmm.lmm_stripe_offset;
         lsm->lsm_stripe_size = lmm.lmm_stripe_size;
+        lsm->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count;
 
         *lsmp = lsm;
 
@@ -314,10 +325,8 @@ int lov_setstripe(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
 int lov_getstripe(struct lustre_handle *conn, struct lov_stripe_md *lsm,
                   struct lov_mds_md *lmmu)
 {
-        struct obd_device *obd = class_conn2obd(conn);
-        struct lov_obd *lov = &obd->u.lov;
         struct lov_mds_md lmm, *lmmk = NULL;
-        int ost_count, rc, lmm_size;
+        int rc, lmm_size;
         ENTRY;
 
         if (!lsm)
@@ -330,23 +339,20 @@ int lov_getstripe(struct lustre_handle *conn, struct lov_stripe_md *lsm,
         if (lmm.lmm_magic != LOV_MAGIC)
                 RETURN(-EINVAL);
 
-        ost_count = lov->desc.ld_tgt_count;
-
-        /* XXX we _could_ check if indices > user lmm_ost_count are zero */
-        if (lmm.lmm_ost_count < ost_count)
-                RETURN(-EOVERFLOW);
-
         rc = lov_packmd(conn, &lmmk, lsm);
         if (rc < 0)
                 RETURN(rc);
-
+        /* Bug 1185 FIXME: convert lmmk to big-endian before copy to userspace */
         lmm_size = rc;
         rc = 0;
 
-        if (lmm_size && copy_to_user(lmmu, lmmk, lmm_size))
+        /* User wasn't expecting this many OST entries */
+        if (lmm.lmm_ost_count < lmmk->lmm_ost_count)
+                rc = -EOVERFLOW;
+        else if (copy_to_user(lmmu, lmmk, lmm_size))
                 rc = -EFAULT;
 
-        obd_free_wiremd(conn, &lmmk);
+        obd_free_diskmd (conn, &lmmk);
 
         RETURN(rc);
 }
index 1d9c099..f4f0218 100644 (file)
@@ -5,16 +5,15 @@
 
 DEFS=
 
+if LIBLUSTRE
+lib_LIBRARIES = libmdc.a
+libmdc_a_SOURCES = mdc_request.c mdc_reint.c mdc_lib.c mdc_internal.h
+else
 MODULE = mdc
 modulefs_DATA = mdc.o
 EXTRA_PROGRAMS = mdc
 
-LINX= mds_updates.c client.c
-mdc_SOURCES =  mdc_request.c mdc_reint.c lproc_mdc.c $(LINX)
-
-mds_updates.c: 
-       test -e mds_updates.c || ln -sf $(top_srcdir)/lib/mds_updates.c .
-client.c: 
-       test -e client.c || ln -sf $(top_srcdir)/lib/client.c .
+mdc_SOURCES = mdc_request.c mdc_reint.c lproc_mdc.c mdc_lib.c mdc_internal.h
+endif
 
 include $(top_srcdir)/Rules
diff --git a/lustre/mdc/mdc_internal.h b/lustre/mdc/mdc_internal.h
new file mode 100644 (file)
index 0000000..e39a0aa
--- /dev/null
@@ -0,0 +1,24 @@
+void mds_pack_req_body(struct ptlrpc_request *);
+void mds_pack_rep_body(struct ptlrpc_request *);
+void mds_readdir_pack(struct ptlrpc_request *req, __u64 offset, __u32 size,
+                      obd_id ino, int type);
+void mds_getattr_pack(struct ptlrpc_request *req, int valid, int offset,
+                      int flags, struct mdc_op_data *data);
+void mds_setattr_pack(struct ptlrpc_request *req,
+                      struct mdc_op_data *data,
+                      struct iattr *iattr, void *ea, int ealen);
+void mds_create_pack(struct ptlrpc_request *req, int offset,
+                     struct mdc_op_data *op_data,
+                     __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time,
+                     const void *data, int datalen);
+void mds_open_pack(struct ptlrpc_request *req, int offset,
+                   struct mdc_op_data *op_data,
+                   __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time,
+                   __u32 flags, const void *data, int datalen);
+void mds_unlink_pack(struct ptlrpc_request *req, int offset,
+                     struct mdc_op_data *data);
+void mds_link_pack(struct ptlrpc_request *req, int offset,
+                   struct mdc_op_data *data);
+void mds_rename_pack(struct ptlrpc_request *req, int offset,
+                     struct mdc_op_data *data,
+                     const char *old, int oldlen, const char *new, int newlen);
diff --git a/lustre/mdc/mdc_lib.c b/lustre/mdc/mdc_lib.c
new file mode 100644 (file)
index 0000000..1396f8d
--- /dev/null
@@ -0,0 +1,282 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDS
+#ifndef __KERNEL__
+# include <liblustre.h>
+#endif
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_mds.h>
+#include <linux/lustre_lite.h>
+
+void mds_readdir_pack(struct ptlrpc_request *req, __u64 offset, __u32 size,
+                      obd_id ino, int type, __u64 xid)
+{
+        struct mds_body *b;
+
+        b = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*b));
+        b->fsuid = current->fsuid;
+        b->fsgid = current->fsgid;
+        b->capability = current->cap_effective;
+        b->fid1.id = ino;
+        b->fid1.f_type = type;
+        b->size = offset;               /* !! reused to carry the dir offset */
+        b->suppgid = -1;
+        b->blocks = xid;                /* !! reused to carry the bulk xid */
+        b->nlink = size;                /* !! reused to carry the buffer size */
+}
+
+static void mds_pack_body(struct mds_body *b)
+{
+        LASSERT (b != NULL);
+
+        b->fsuid = current->fsuid;
+        b->fsgid = current->fsgid;
+        b->capability = current->cap_effective;
+}
+
+void mds_pack_req_body(struct ptlrpc_request *req)
+{
+        struct mds_body *b = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*b));
+        mds_pack_body(b);
+}
+
+/* packing of MDS records */
+void mds_create_pack(struct ptlrpc_request *req, int offset,
+                     struct mdc_op_data *op_data,
+                     __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time,
+                     const void *data, int datalen)
+{
+        struct mds_rec_create *rec;
+        char *tmp;
+        rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec));
+
+        rec->cr_opcode = REINT_CREATE;
+        rec->cr_fsuid = current->fsuid;
+        rec->cr_fsgid = current->fsgid;
+        rec->cr_cap = current->cap_effective;
+        ll_ino2fid(&rec->cr_fid, op_data->ino1, op_data->gen1, op_data->typ1);
+        memset(&rec->cr_replayfid, 0, sizeof(rec->cr_replayfid));
+        rec->cr_mode = mode;
+        rec->cr_rdev = rdev;
+        rec->cr_uid = uid;
+        rec->cr_gid = gid;
+        rec->cr_time = time;
+        if (in_group_p(op_data->gid1))
+                rec->cr_suppgid = op_data->gid1;
+        else
+                rec->cr_suppgid = -1;
+
+        tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, op_data->namelen + 1);
+        LOGL0(op_data->name, op_data->namelen, tmp);
+
+        if (data) {
+                tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, datalen);
+                memcpy (tmp, data, datalen);
+        }
+}
+/* packing of MDS records */
+void mds_open_pack(struct ptlrpc_request *req, int offset,
+                   struct mdc_op_data *op_data,
+                   __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time,
+                   __u32 flags,
+                   const void *data, int datalen)
+{
+        struct mds_rec_create *rec;
+        char *tmp;
+        rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec));
+
+        /* XXX do something about time, uid, gid */
+        rec->cr_opcode = REINT_OPEN;
+        rec->cr_fsuid = current->fsuid;
+        rec->cr_fsgid = current->fsgid;
+        rec->cr_cap = current->cap_effective;
+        ll_ino2fid(&rec->cr_fid, op_data->ino1,
+                   op_data->gen1, op_data->typ1);
+        memset(&rec->cr_replayfid, 0, sizeof(rec->cr_replayfid));
+        rec->cr_mode = mode;
+        rec->cr_flags = flags;
+        rec->cr_rdev = rdev;
+        rec->cr_uid = uid;
+        rec->cr_gid = gid;
+        rec->cr_time = time;
+        if (in_group_p(op_data->gid1))
+                rec->cr_suppgid = op_data->gid1;
+        else
+                rec->cr_suppgid = -1;
+
+        tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, op_data->namelen + 1);
+        LOGL0(op_data->name, op_data->namelen, tmp);
+
+        if (data) {
+                tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, datalen);
+                memcpy (tmp, data, datalen);
+        }
+}
+void mds_setattr_pack(struct ptlrpc_request *req,
+                      struct mdc_op_data *data,
+                      struct iattr *iattr, void *ea, int ealen)
+{
+        struct mds_rec_setattr *rec = lustre_msg_buf(req->rq_reqmsg, 0,
+                                                     sizeof (*rec));
+        rec->sa_opcode = REINT_SETATTR;
+        rec->sa_fsuid = current->fsuid;
+        rec->sa_fsgid = current->fsgid;
+        rec->sa_cap = current->cap_effective;
+        ll_ino2fid(&rec->sa_fid, data->ino1, data->gen1, data->typ1);
+
+        if (iattr) {
+                rec->sa_valid = iattr->ia_valid;
+                rec->sa_mode = iattr->ia_mode;
+                rec->sa_uid = iattr->ia_uid;
+                rec->sa_gid = iattr->ia_gid;
+                rec->sa_size = iattr->ia_size;
+                rec->sa_atime = LTIME_S(iattr->ia_atime);
+                rec->sa_mtime = LTIME_S(iattr->ia_mtime);
+                rec->sa_ctime = LTIME_S(iattr->ia_ctime);
+                rec->sa_attr_flags = iattr->ia_attr_flags;
+
+                if ((iattr->ia_valid & ATTR_GID) && in_group_p(iattr->ia_gid))
+                        rec->sa_suppgid = iattr->ia_gid;
+                else if ((iattr->ia_valid & ATTR_MODE) &&
+                         in_group_p(data->gid1))
+                        rec->sa_suppgid = data->gid1;
+                else
+                        rec->sa_suppgid = -1;
+        }
+
+        if (ealen != 0)
+                memcpy(lustre_msg_buf(req->rq_reqmsg, 1, ealen), ea, ealen);
+}
+
+void mds_unlink_pack(struct ptlrpc_request *req, int offset,
+                     struct mdc_op_data *data)
+{
+        struct mds_rec_unlink *rec;
+        char *tmp;
+
+        rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec));
+        LASSERT (rec != NULL);
+
+        rec->ul_opcode = REINT_UNLINK;
+        rec->ul_fsuid = current->fsuid;
+        rec->ul_fsgid = current->fsgid;
+        rec->ul_cap = current->cap_effective;
+        rec->ul_mode = data->mode;
+        if (in_group_p(data->gid1))
+                rec->ul_suppgid = data->gid1;
+        else
+                rec->ul_suppgid = -1;
+        ll_ino2fid(&rec->ul_fid1, data->ino1, data->gen1, data->typ1);
+        if (data->ino2)
+                ll_ino2fid(&rec->ul_fid2, data->ino2, data->gen2, data->typ2);
+
+        tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, data->namelen + 1);
+        LASSERT (tmp != NULL);
+        LOGL0(data->name, data->namelen, tmp);
+}
+
+void mds_link_pack(struct ptlrpc_request *req, int offset,
+                   struct mdc_op_data *data)
+{
+        struct mds_rec_link *rec;
+        char *tmp;
+
+        rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec));
+
+        rec->lk_opcode = REINT_LINK;
+        rec->lk_fsuid = current->fsuid;
+        rec->lk_fsgid = current->fsgid;
+        rec->lk_cap = current->cap_effective;
+        if (in_group_p(data->gid1))
+                rec->lk_suppgid1 = data->gid1;
+        else
+                rec->lk_suppgid1 = -1;
+        if (in_group_p(data->gid2))
+                rec->lk_suppgid2 = data->gid2;
+        else
+                rec->lk_suppgid2 = -1;
+        ll_ino2fid(&rec->lk_fid1, data->ino1, data->gen1, data->typ1);
+        ll_ino2fid(&rec->lk_fid2, data->ino2, data->gen2, data->typ2);
+
+        tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, data->namelen + 1);
+        LOGL0(data->name, data->namelen, tmp);
+}
+
+void mds_rename_pack(struct ptlrpc_request *req, int offset,
+                     struct mdc_op_data *data,
+                     const char *old, int oldlen, const char *new, int newlen)
+{
+        struct mds_rec_rename *rec;
+        char *tmp;
+
+        rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec));
+
+        /* XXX do something about time, uid, gid */
+        rec->rn_opcode = REINT_RENAME;
+        rec->rn_fsuid = current->fsuid;
+        rec->rn_fsgid = current->fsgid;
+        rec->rn_cap = current->cap_effective;
+        if (in_group_p(data->gid1))
+                rec->rn_suppgid1 = data->gid1;
+        else
+                rec->rn_suppgid1 = -1;
+        if (in_group_p(data->gid2))
+                rec->rn_suppgid2 = data->gid2;
+        else
+                rec->rn_suppgid2 = -1;
+        ll_ino2fid(&rec->rn_fid1, data->ino1, data->gen1, data->typ1);
+        ll_ino2fid(&rec->rn_fid2, data->ino2, data->gen2, data->typ2);
+
+        tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, oldlen + 1);
+        LOGL0(old, oldlen, tmp);
+
+        if (new) {
+                tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, newlen + 1);
+                LOGL0(new, newlen, tmp);
+        }
+}
+
+void mds_getattr_pack(struct ptlrpc_request *req, int valid, int offset,
+                      int flags, struct mdc_op_data *data)
+{
+        struct mds_body *b;
+        b = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*b));
+
+        b->fsuid = current->fsuid;
+        b->fsgid = current->fsgid;
+        b->capability = current->cap_effective;
+        b->valid = valid;
+        b->flags = flags;
+        if (in_group_p(data->gid1))
+                b->suppgid = data->gid1;
+        else
+                b->suppgid = -1;
+
+        ll_ino2fid(&b->fid1, data->ino1, data->gen1, data->typ1);
+        if (data->name) {
+                char *tmp;
+                tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1,
+                                     data->namelen + 1);
+                LOGL0(data->name, data->namelen, tmp);
+        }
+}
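
Every pack helper in this new file picks the supplementary group id the same way: if the caller belongs to the relevant directory's group (in_group_p()), that gid goes into the request, otherwise -1. A userspace analogue of that selection, using getgroups() in place of the kernel's in_group_p() (the helper name and the fixed 64-entry buffer are only illustrative):

/* Sketch: send a supplementary gid only if the caller is actually a
 * member of the directory's group; otherwise report "none" as -1. */
#include <sys/types.h>
#include <unistd.h>

static gid_t choose_suppgid(gid_t dir_gid)
{
        gid_t groups[64];                /* enough for a sketch */
        int i, n = getgroups(64, groups);

        for (i = 0; i < n; i++)
                if (groups[i] == dir_gid)
                        return dir_gid;  /* caller is in the group */
        return (gid_t)-1;                /* no supplementary gid */
}
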
index 3553a45..68d7f0d 100644 (file)
  */
 
 #define EXPORT_SYMTAB
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-
 #define DEBUG_SUBSYSTEM S_MDC
 
+#ifdef __KERNEL__
+# include <linux/config.h>
+# include <linux/module.h>
+# include <linux/kernel.h>
+#else
+# include <liblustre.h>
+#endif
+
 #include <linux/obd_class.h>
 #include <linux/lustre_mds.h>
+#include "mdc_internal.h"
 
 /* mdc_setattr does its own semaphore handling */
 static int mdc_reint(struct ptlrpc_request *request, int level)
 {
         int rc;
-        __u32 *opcodeptr = lustre_msg_buf(request->rq_reqmsg, 0);
+        __u32 *opcodeptr;
 
+        opcodeptr = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*opcodeptr));
         request->rq_level = level;
 
         if (!(*opcodeptr == REINT_SETATTR))
@@ -44,12 +49,8 @@ static int mdc_reint(struct ptlrpc_request *request, int level)
         if (!(*opcodeptr == REINT_SETATTR))
                 mdc_put_rpc_lock(&mdc_rpc_lock, NULL);
 
-        if (rc) {
+        if (rc)
                 CDEBUG(D_INFO, "error in handling %d\n", rc);
-        } else {
-                /* For future resend/replays. */
-                *opcodeptr |= REINT_REPLAYING;
-        }
         return rc;
 }
 
@@ -59,7 +60,8 @@ static int mdc_reint(struct ptlrpc_request *request, int level)
  * If it is called with iattr->ia_valid & ATTR_FROM_OPEN, then it is a
  * magic open-path setattr that should take the setattr semaphore and
  * go to the setattr portal. */
-int mdc_setattr(struct lustre_handle *conn, struct inode *inode,
+int mdc_setattr(struct lustre_handle *conn,
+                struct mdc_op_data *data,
                 struct iattr *iattr, void *ea, int ealen,
                 struct ptlrpc_request **request)
 {
@@ -85,7 +87,7 @@ int mdc_setattr(struct lustre_handle *conn, struct inode *inode,
         } else
                 rpc_lock = &mdc_rpc_lock;
 
-        mds_setattr_pack(req, inode, iattr, ea, ealen);
+        mds_setattr_pack(req, data, iattr, ea, ealen);
 
         size[0] = sizeof(struct mds_body);
         req->rq_replen = lustre_msg_size(1, size);
@@ -101,15 +103,17 @@ int mdc_setattr(struct lustre_handle *conn, struct inode *inode,
         RETURN(rc);
 }
 
-int mdc_create(struct lustre_handle *conn, struct inode *dir,
-               const char *name, int namelen, const void *data, int datalen,
+int mdc_create(struct lustre_handle *conn,
+               struct mdc_op_data *op_data,
+               const void *data, int datalen,
                int mode, __u32 uid, __u32 gid, __u64 time, __u64 rdev,
                struct ptlrpc_request **request)
 {
         struct ptlrpc_request *req;
-        int rc, size[3] = {sizeof(struct mds_rec_create), namelen + 1, 0};
+        int rc, size[3] = {sizeof(struct mds_rec_create),
+                           op_data->namelen + 1, 0};
         int level, bufcount = 2;
-        ENTRY;
+//        ENTRY;
 
         if (data && datalen) {
                 size[bufcount] = datalen;
@@ -119,12 +123,14 @@ int mdc_create(struct lustre_handle *conn, struct inode *dir,
         req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_REINT, bufcount,
                               size, NULL);
         if (!req)
-                RETURN(-ENOMEM);
+                return -ENOMEM;
+//                RETURN(-ENOMEM);
 
         /* mds_create_pack fills msg->bufs[1] with name
          * and msg->bufs[2] with tgt, for symlinks or lov MD data */
-        mds_create_pack(req, 0, dir, mode, rdev, uid, gid, time,
-                        name, namelen, data, datalen);
+        mds_create_pack(req, 0, op_data,
+                        mode, rdev, uid, gid, time,
+                        data, datalen);
 
         size[0] = sizeof(struct mds_body);
         req->rq_replen = lustre_msg_size(1, size);
@@ -135,7 +141,6 @@ int mdc_create(struct lustre_handle *conn, struct inode *dir,
         /* Resend if we were told to. */
         if (rc == -ERESTARTSYS) {
                 level = LUSTRE_CONN_RECOVD;
-                req->rq_flags = 0;
                 goto resend;
         }
 
@@ -143,16 +148,17 @@ int mdc_create(struct lustre_handle *conn, struct inode *dir,
                 mdc_store_inode_generation(req, 0, 0);
 
         *request = req;
-        RETURN(rc);
+        return rc;
+//        RETURN(rc);
 }
 
-int mdc_unlink(struct lustre_handle *conn, struct inode *dir,
-               struct inode *child, __u32 mode, const char *name, int namelen,
+int mdc_unlink(struct lustre_handle *conn,
+               struct mdc_op_data *data,
                struct ptlrpc_request **request)
 {
         struct obd_device *obddev = class_conn2obd(conn);
         struct ptlrpc_request *req = *request;
-        int rc, size[2] = {sizeof(struct mds_rec_unlink), namelen + 1};
+        int rc, size[2] = {sizeof(struct mds_rec_unlink), data->namelen + 1};
         ENTRY;
 
         LASSERT(req == NULL);
@@ -167,7 +173,7 @@ int mdc_unlink(struct lustre_handle *conn, struct inode *dir,
         size[1] = obddev->u.cli.cl_max_mds_easize;
         req->rq_replen = lustre_msg_size(2, size);
 
-        mds_unlink_pack(req, 0, dir, child, mode, name, namelen);
+        mds_unlink_pack(req, 0, data);
 
         rc = mdc_reint(req, LUSTRE_CONN_FULL);
         if (rc == -ERESTARTSYS)
@@ -176,11 +182,11 @@ int mdc_unlink(struct lustre_handle *conn, struct inode *dir,
 }
 
 int mdc_link(struct lustre_handle *conn,
-             struct inode *src, struct inode *dir, const char *name,
-             int namelen, struct ptlrpc_request **request)
+             struct mdc_op_data *data,
+             struct ptlrpc_request **request)
 {
         struct ptlrpc_request *req;
-        int rc, size[2] = {sizeof(struct mds_rec_link), namelen + 1};
+        int rc, size[2] = {sizeof(struct mds_rec_link), data->namelen + 1};
         ENTRY;
 
         req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_REINT, 2, size,
@@ -188,7 +194,7 @@ int mdc_link(struct lustre_handle *conn,
         if (!req)
                 RETURN(-ENOMEM);
 
-        mds_link_pack(req, 0, src, dir, name, namelen);
+        mds_link_pack(req, 0, data);
 
         size[0] = sizeof(struct mds_body);
         req->rq_replen = lustre_msg_size(1, size);
@@ -202,8 +208,9 @@ int mdc_link(struct lustre_handle *conn,
 }
 
 int mdc_rename(struct lustre_handle *conn,
-               struct inode *src, struct inode *tgt, const char *old,
-               int oldlen, const char *new, int newlen,
+               struct mdc_op_data *data,
+               const char *old, int oldlen,
+               const char *new, int newlen,
                struct ptlrpc_request **request)
 {
         struct ptlrpc_request *req;
@@ -216,7 +223,7 @@ int mdc_rename(struct lustre_handle *conn,
         if (!req)
                 RETURN(-ENOMEM);
 
-        mds_rename_pack(req, 0, src, tgt, old, oldlen, new, newlen);
+        mds_rename_pack(req, 0, data, old, oldlen, new, newlen);
 
         size[0] = sizeof(struct mds_body);
         req->rq_replen = lustre_msg_size(1, size);
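
The reint calls above size each request as an array of buffers: buffer 0 carries the fixed-size reint record and buffer 1 carries the name plus its terminating NUL, which is why the size arrays use data->namelen + 1. A standalone sketch of that name-buffer convention (pack_name() below merely plays the role assumed for LOGL0 and is not a Lustre API):

/* Sketch: the "namelen bytes plus a trailing NUL" convention used when a
 * name is packed into its own request buffer. */
#include <stdlib.h>
#include <string.h>

static char *pack_name(const char *name, int namelen)
{
        char *buf = malloc(namelen + 1); /* matches size[] = namelen + 1 */

        if (buf == NULL)
                return NULL;
        memcpy(buf, name, namelen);      /* copy exactly namelen bytes */
        buf[namelen] = '\0';             /* terminate explicitly */
        return buf;
}
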
index 68075f5..dfcd7af 100644 (file)
 #define EXPORT_SYMTAB
 #define DEBUG_SUBSYSTEM S_MDC
 
-#include <linux/module.h>
-#include <linux/pagemap.h>
-#include <linux/miscdevice.h>
+#ifdef __KERNEL__
+# include <linux/module.h>
+# include <linux/pagemap.h>
+# include <linux/miscdevice.h>
+# include <linux/init.h>
+#else
+# include <liblustre.h>
+# include <linux/obd_class.h>
+#endif
+
 #include <linux/lustre_mds.h>
 #include <linux/lustre_lite.h>
 #include <linux/lustre_dlm.h>
-#include <linux/init.h>
 #include <linux/lprocfs_status.h>
+#include "mdc_internal.h"
 
 #define REQUEST_MINOR 244
 
@@ -51,19 +58,22 @@ static int send_getstatus(struct obd_import *imp, struct ll_fid *rootfid,
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
         req->rq_level = level;
         req->rq_replen = lustre_msg_size(1, &size);
 
         mds_pack_req_body(req);
         req->rq_reqmsg->flags |= msg_flags;
-        mdc_get_rpc_lock(&mdc_rpc_lock, NULL);
         rc = ptlrpc_queue_wait(req);
-        mdc_put_rpc_lock(&mdc_rpc_lock, NULL);
 
         if (!rc) {
-                body = lustre_msg_buf(req->rq_repmsg, 0);
-                mds_unpack_body(body);
+                body = lustre_swab_repbuf (req, 0, sizeof (*body),
+                                           lustre_swab_mds_body);
+                if (body == NULL) {
+                        CERROR ("Can't extract mds_body\n");
+                        GOTO (out, rc = -EPROTO);
+                }
+
                 memcpy(rootfid, &body->fid1, sizeof(*rootfid));
 
                 CDEBUG(D_NET, "root ino="LPU64", last_committed="LPU64
@@ -90,85 +100,158 @@ int mdc_getlovinfo(struct obd_device *obd, struct lustre_handle *mdc_connh,
 {
         struct ptlrpc_request *req;
         struct mds_status_req *streq;
+        struct lov_desc       *desc;
+        struct obd_uuid       *uuids;
         int rc, size[2] = {sizeof(*streq)};
+        int i;
         ENTRY;
 
         req = ptlrpc_prep_req(class_conn2cliimp(mdc_connh), MDS_GETLOVINFO, 1,
                               size, NULL);
         if (!req)
-                GOTO(out, rc = -ENOMEM);
+                RETURN (-ENOMEM);
 
         *request = req;
-        streq = lustre_msg_buf(req->rq_reqmsg, 0);
-        streq->flags = HTON__u32(MDS_STATUS_LOV);
-        streq->repbuf = HTON__u32(8192);
+        streq = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*streq));
+        streq->flags = MDS_STATUS_LOV;
+        streq->repbuf = LOV_MAX_UUID_BUFFER_SIZE;
 
         /* prepare for reply */
         req->rq_level = LUSTRE_CONN_CON;
-        size[0] = 512;
-        size[1] = 8192;
+        size[0] = sizeof (*desc);
+        size[1] = LOV_MAX_UUID_BUFFER_SIZE;
         req->rq_replen = lustre_msg_size(2, size);
+
         mdc_get_rpc_lock(&mdc_rpc_lock, NULL);
         rc = ptlrpc_queue_wait(req);
         mdc_put_rpc_lock(&mdc_rpc_lock, NULL);
 
- out:
-        RETURN(rc);
+        if (rc != 0) {
+                CERROR ("rpc failed\n");
+                GOTO (failed, rc);
+        }
+        
+        desc = lustre_swab_repbuf (req, 0, sizeof (*desc),
+                                   lustre_swab_lov_desc);
+        if (desc == NULL) {
+                CERROR ("Can't unpack lov_desc\n");
+                GOTO (failed, rc = -EPROTO);
+        }
+        
+        LASSERT_REPSWAB (req, 1);
+        /* the array of uuids is byte-sex insensitive; just verify they are
+         * all there and terminated */
+        uuids = lustre_msg_buf (req->rq_repmsg, 1,
+                                desc->ld_tgt_count * sizeof (*uuids));
+        if (uuids == NULL) {
+                CERROR ("Can't unpack %d uuids\n", desc->ld_tgt_count);
+                GOTO (failed, rc = -EPROTO);
+        }
+
+        for (i = 0; i < desc->ld_tgt_count; i++) {
+                int uid_len = strnlen (uuids[i].uuid, sizeof (uuids[i].uuid));
+                
+                if (uid_len == sizeof (uuids[i].uuid)) {
+                        CERROR ("Unterminated uuid %d:%*s\n",
+                                i, (int)sizeof (uuids[i].uuid), uuids[i].uuid);
+                        GOTO (failed, rc = -EPROTO);
+                }
+        }
+        RETURN(0);
+
+ failed:
+        ptlrpc_req_finished (req);
+        RETURN (rc);
 }
 
-int mdc_getattr(struct lustre_handle *conn,
-                obd_id ino, int type, unsigned long valid, unsigned int ea_size,
+int mdc_getattr_common (struct lustre_handle *conn,
+                        unsigned int ea_size, struct ptlrpc_request *req)
+{
+        struct mds_body *body;
+        void            *eadata;
+        int              rc; 
+        int              size[2] = {sizeof(*body), 0};
+        int              bufcount = 1;
+        ENTRY;
+
+        /* request message already built */
+
+        if (ea_size != 0) {
+                size[bufcount++] = ea_size;
+                CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n",
+                       ea_size);
+        }
+        req->rq_replen = lustre_msg_size(bufcount, size);
+
+        mdc_get_rpc_lock(&mdc_rpc_lock, NULL);
+        rc = ptlrpc_queue_wait(req);
+        mdc_put_rpc_lock(&mdc_rpc_lock, NULL);
+        if (rc != 0)
+                RETURN (rc);
+        
+        body = lustre_swab_repbuf (req, 0, sizeof (*body),
+                                   lustre_swab_mds_body);
+        if (body == NULL) {
+                CERROR ("Can't unpack mds_body\n");
+                RETURN (-EPROTO);
+        }
+
+        CDEBUG(D_NET, "mode: %o\n", body->mode);
+
+        LASSERT_REPSWAB (req, 1);
+        if (body->eadatasize != 0) {
+                /* reply indicates presence of eadata; check it's there... */
+                eadata = lustre_msg_buf (req->rq_repmsg, 1, body->eadatasize);
+                if (eadata == NULL) {
+                        CERROR ("Missing/short eadata\n");
+                        RETURN (-EPROTO);
+                }
+        }
+
+        RETURN (0);
+}
+                        
+int mdc_getattr(struct lustre_handle *conn, struct ll_fid *fid,
+                unsigned long valid, unsigned int ea_size,
                 struct ptlrpc_request **request)
 {
         struct ptlrpc_request *req;
         struct mds_body *body;
-        int rc, size[2] = {sizeof(*body), 0}, bufcount = 1;
+        int size = sizeof(*body);
+        int rc;
         ENTRY;
 
         /* XXX do we need to make another request here?  We just did a getattr
          *     to do the lookup in the first place.
          */
-        req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_GETATTR, 1, size,
+        req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_GETATTR, 1, &size,
                               NULL);
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
-        ll_ino2fid(&body->fid1, ino, 0, type);
+        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
+        memcpy(&body->fid1, fid, sizeof(*fid));
         body->valid = valid;
-
-        if (ea_size) {
-                size[bufcount] = ea_size;
-                bufcount++;
-                body->size = ea_size;
-                CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n",
-                       ea_size);
-        }
-        req->rq_replen = lustre_msg_size(bufcount, size);
+        body->eadatasize = ea_size;
         mds_pack_req_body(req);
 
-        mdc_get_rpc_lock(&mdc_rpc_lock, NULL);
-        rc = ptlrpc_queue_wait(req);
-        mdc_put_rpc_lock(&mdc_rpc_lock, NULL);
-        if (!rc) {
-                body = lustre_msg_buf(req->rq_repmsg, 0);
-                mds_unpack_body(body);
-                CDEBUG(D_NET, "mode: %o\n", body->mode);
+        rc = mdc_getattr_common (conn, ea_size, req);
+        if (rc != 0) {
+                ptlrpc_req_finished (req);
+                req = NULL;
         }
-
-        GOTO(out, rc);
  out:
         *request = req;
-        return rc;
+        RETURN (rc);
 }
 
-int mdc_getattr_name(struct lustre_handle *conn, struct inode *parent,
+int mdc_getattr_name(struct lustre_handle *conn, struct ll_fid *fid,
                      char *filename, int namelen, unsigned long valid,
                      unsigned int ea_size, struct ptlrpc_request **request)
 {
         struct ptlrpc_request *req;
         struct mds_body *body;
-        int rc, size[2] = {sizeof(*body), namelen}, bufcount = 1;
+        int rc, size[2] = {sizeof(*body), namelen};
         ENTRY;
 
         req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_GETATTR_NAME, 2,
@@ -176,32 +259,20 @@ int mdc_getattr_name(struct lustre_handle *conn, struct inode *parent,
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
-        ll_inode2fid(&body->fid1, parent);
+        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
+        memcpy(&body->fid1, fid, sizeof(*fid));
         body->valid = valid;
-        memcpy(lustre_msg_buf(req->rq_reqmsg, 1), filename, namelen);
-
-        if (ea_size) {
-                size[1] = ea_size;
-                bufcount++;
-                body->size = ea_size;
-                CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n",
-                       ea_size);
-                valid |= OBD_MD_FLEASIZE;
-        }
-
-        req->rq_replen = lustre_msg_size(bufcount, size);
+        body->eadatasize = ea_size;
         mds_pack_req_body(req);
 
-        mdc_get_rpc_lock(&mdc_rpc_lock, NULL);
-        rc = ptlrpc_queue_wait(req);
-        mdc_put_rpc_lock(&mdc_rpc_lock, NULL);
-        if (!rc) {
-                body = lustre_msg_buf(req->rq_repmsg, 0);
-                mds_unpack_body(body);
-        }
+        LASSERT (strnlen (filename, namelen) == namelen - 1);
+        memcpy(lustre_msg_buf(req->rq_reqmsg, 1, namelen), filename, namelen);
 
-        EXIT;
+        rc = mdc_getattr_common (conn, ea_size, req);
+        if (rc != 0) {
+                ptlrpc_req_finished (req);
+                req = NULL;
+        }
  out:
         *request = req;
         return rc;
@@ -211,143 +282,104 @@ int mdc_getattr_name(struct lustre_handle *conn, struct inode *parent,
 void mdc_store_inode_generation(struct ptlrpc_request *req, int reqoff,
                                 int repoff)
 {
-        struct mds_rec_create *rec = lustre_msg_buf(req->rq_reqmsg, reqoff);
-        struct mds_body *body = lustre_msg_buf(req->rq_repmsg, repoff);
-
+        struct mds_rec_create *rec =
+                lustre_msg_buf(req->rq_reqmsg, reqoff, sizeof (*rec));
+        struct mds_body *body =
+                lustre_msg_buf(req->rq_repmsg, repoff, sizeof (*body));
+
+        LASSERT (rec != NULL);
+        LASSERT (body != NULL);
+        
         memcpy(&rec->cr_replayfid, &body->fid1, sizeof rec->cr_replayfid);
         DEBUG_REQ(D_HA, req, "storing generation %x for ino "LPD64,
                   rec->cr_replayfid.generation, rec->cr_replayfid.id);
 }
 
-static int mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
-                            void *data, int flag)
-{
-        int rc;
-        struct lustre_handle lockh;
-        ENTRY;
-
-
-        switch (flag) {
-        case LDLM_CB_BLOCKING:
-                ldlm_lock2handle(lock, &lockh);
-                rc = ldlm_cli_cancel(&lockh);
-                if (rc < 0) {
-                        CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
-                        RETURN(rc);
-                }
-                break;
-        case LDLM_CB_CANCELING: {
-                /* Invalidate all dentries associated with this inode */
-                struct inode *inode = lock->l_data;
-
-                LASSERT(data != NULL);
-
-                /* XXX what tells us that 'data' is a valid inode at all?
-                 *     we should probably validate the lock handle first?
-                 */
-                inode = igrab(inode);
-
-                if (inode == NULL) /* inode->i_state & I_FREEING */
-                        break;
-
-                if (S_ISDIR(inode->i_mode)) {
-                        CDEBUG(D_INODE, "invalidating inode %lu\n",
-                               inode->i_ino);
-
-                        ll_invalidate_inode_pages(inode);
-                }
-
-                if (inode->i_sb->s_root && 
-                    inode != inode->i_sb->s_root->d_inode)
-                        d_unhash_aliases(inode);
-
-                iput(inode);
-                break;
-        }
-        default:
-                LBUG();
-        }
-
-        RETURN(0);
-}
-
 /* We always reserve enough space in the reply packet for a stripe MD, because
  * we don't know in advance the file type.
  *
  * XXX we could get that from ext2_dir_entry_2 file_type
  */
-int mdc_enqueue(struct lustre_handle *conn, int lock_type,
-                struct lookup_intent *it, int lock_mode, struct inode *dir,
-                struct dentry *de, struct lustre_handle *lockh,
-                char *tgt, int tgtlen, void *data, int datalen)
+int mdc_enqueue(struct lustre_handle *conn,
+                int lock_type,
+                struct lookup_intent *it,
+                int lock_mode,
+                struct mdc_op_data *data,
+                struct lustre_handle *lockh,
+                char *tgt,
+                int tgtlen,
+                ldlm_completion_callback cb_completion,
+                ldlm_blocking_callback cb_blocking,
+                void *cb_data)
 {
         struct ptlrpc_request *req;
         struct obd_device *obddev = class_conn2obd(conn);
         struct ldlm_res_id res_id =
-                { .name = {dir->i_ino, dir->i_generation} };
+                { .name = {data->ino1, data->gen1} };
         int size[6] = {sizeof(struct ldlm_request), sizeof(struct ldlm_intent)};
         int rc, flags = LDLM_FL_HAS_INTENT;
         int repsize[3] = {sizeof(struct ldlm_reply),
                           sizeof(struct mds_body),
                           obddev->u.cli.cl_max_mds_easize};
-        struct mdc_unlink_data *d = data;
         struct ldlm_reply *dlm_rep;
         struct ldlm_intent *lit;
         struct ldlm_request *lockreq;
+        void *eadata;
+        unsigned long irqflags;
+        int   reply_buffers = 0;
         ENTRY;
 
-        LDLM_DEBUG_NOLOCK("mdsintent %s parent dir %lu",
-                          ldlm_it2str(it->it_op), dir->i_ino);
+//        LDLM_DEBUG_NOLOCK("mdsintent=%s,name=%s,dir=%lu",
+//                          ldlm_it2str(it->it_op), it_name, it_inode->i_ino);
 
         if (it->it_op & IT_OPEN) {
                 it->it_mode |= S_IFREG;
                 it->it_mode &= ~current->fs->umask;
 
                 size[2] = sizeof(struct mds_rec_create);
-                size[3] = de->d_name.len + 1;
+                size[3] = data->namelen + 1;
                 req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 4,
                                       size, NULL);
                 if (!req)
                         RETURN(-ENOMEM);
 
-                req->rq_flags |= PTL_RPC_FL_REPLAY;
+                spin_lock_irqsave (&req->rq_lock, irqflags);
+                req->rq_replay = 1;
+                spin_unlock_irqrestore (&req->rq_lock, irqflags);
 
                 /* pack the intent */
-                lit = lustre_msg_buf(req->rq_reqmsg, 1);
-                lit->opc = NTOH__u64((__u64)it->it_op);
+                lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit));
+                lit->opc = (__u64)it->it_op;
 
                 /* pack the intended request */
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-                mds_open_pack(req, 2, dir, it->it_mode, 0, current->fsuid,
-                              current->fsgid, CURRENT_TIME, it->it_flags,
-                              de->d_name.name, de->d_name.len, tgt, tgtlen);
-#else
-                mds_open_pack(req, 2, dir, it->it_mode, 0, current->fsuid,
-                              current->fsgid, CURRENT_TIME.tv_sec, it->it_flags,
-                              de->d_name.name, de->d_name.len, tgt, tgtlen);
-#endif
+                mds_open_pack(req, 2, data, it->it_mode, 0,
+                              current->fsuid, current->fsgid,
+                              LTIME_S(CURRENT_TIME), it->it_flags,
+                              tgt, tgtlen);
+                /* get ready for the reply */
+                reply_buffers = 3;
                 req->rq_replen = lustre_msg_size(3, repsize);
         } else if (it->it_op & IT_UNLINK) {
                 size[2] = sizeof(struct mds_rec_unlink);
-                size[3] = d->unl_len + 1;
+                size[3] = data->namelen + 1;
                 req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 4,
                                       size, NULL);
                 if (!req)
                         RETURN(-ENOMEM);
 
                 /* pack the intent */
-                lit = lustre_msg_buf(req->rq_reqmsg, 1);
-                lit->opc = NTOH__u64((__u64)it->it_op);
+                lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit));
+                lit->opc = (__u64)it->it_op;
 
                 /* pack the intended request */
-                mds_unlink_pack(req, 2, d->unl_dir, 
-                                d->unl_de, d->unl_mode,
-                                d->unl_name, d->unl_len);
+                mds_unlink_pack(req, 2, data);
+                /* get ready for the reply */
+                reply_buffers = 3;
                 req->rq_replen = lustre_msg_size(3, repsize);
         } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) {
                 int valid = OBD_MD_FLNOTOBD | OBD_MD_FLEASIZE;
                 size[2] = sizeof(struct mds_body);
-                size[3] = de->d_name.len + 1;
+                size[3] = data->namelen + 1;
 
                 req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 4,
                                       size, NULL);
@@ -355,13 +387,13 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type,
                         RETURN(-ENOMEM);
 
                 /* pack the intent */
-                lit = lustre_msg_buf(req->rq_reqmsg, 1);
-                lit->opc = NTOH__u64((__u64)it->it_op);
+                lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit));
+                lit->opc = (__u64)it->it_op;
 
                 /* pack the intended request */
-                mds_getattr_pack(req, valid, 2, it->it_flags,  dir,
-                                 de->d_name.name, de->d_name.len);
+                mds_getattr_pack(req, valid, 2, it->it_flags, data);
                 /* get ready for the reply */
+                reply_buffers = 3;
                 req->rq_replen = lustre_msg_size(3, repsize);
         } else if (it->it_op == IT_READDIR) {
                 req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 1,
@@ -370,6 +402,7 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type,
                         RETURN(-ENOMEM);
 
                 /* get ready for the reply */
+                reply_buffers = 1;
                 req->rq_replen = lustre_msg_size(1, repsize);
         }  else {
                 LBUG();
@@ -379,20 +412,13 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type,
         mdc_get_rpc_lock(&mdc_rpc_lock, it);
         rc = ldlm_cli_enqueue(conn, req, obddev->obd_namespace, NULL, res_id,
                               lock_type, NULL, 0, lock_mode, &flags,
-                              ldlm_completion_ast, mdc_blocking_ast, dir, NULL,
-                              lockh);
+                              cb_completion, cb_blocking, cb_data, lockh);
         mdc_put_rpc_lock(&mdc_rpc_lock, it);
 
-        /* If we successfully created, mark the request so that replay will
-         * do the right thing */
-        if (req->rq_transno) {
-                struct mds_rec_create *rec = lustre_msg_buf(req->rq_reqmsg, 2);
-                rec->cr_opcode |= REINT_REPLAYING;
-        }
         /* Similarly, if we're going to replay this request, we don't want to
          * actually get a lock, just perform the intent. */
-        if (req->rq_transno || (req->rq_flags & PTL_RPC_FL_REPLAY)) {
-                lockreq = lustre_msg_buf(req->rq_reqmsg, 0);
+        if (req->rq_transno || req->rq_replay) {
+                lockreq = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*lockreq));
                 lockreq->lock_flags |= LDLM_FL_INTENT_ONLY;
         }
 
@@ -403,6 +429,8 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type,
                 memset(lockh, 0, sizeof(*lockh));
         } else if (rc != 0) {
                 CERROR("ldlm_cli_enqueue: %d\n", rc);
+                LASSERT (rc < 0);
+                ptlrpc_req_finished(req);
                 RETURN(rc);
         } else { /* rc = 0 */
                 struct ldlm_lock *lock = ldlm_handle2lock(lockh);
@@ -432,47 +460,57 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type,
                 LDLM_LOCK_PUT(lock);
         }
 
-        dlm_rep = lustre_msg_buf(req->rq_repmsg, 0);
+        dlm_rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*dlm_rep));
+        LASSERT (dlm_rep != NULL);           /* checked by ldlm_cli_enqueue() */
+        LASSERT_REPSWABBED (req, 0);         /* swabbed by ldlm_cli_enqueue() */
+        
         it->it_disposition = (int) dlm_rep->lock_policy_res1;
         it->it_status = (int) dlm_rep->lock_policy_res2;
         it->it_lock_mode = lock_mode;
         it->it_data = req;
 
-        RETURN(rc);
-}
-
-void mdc_lock_set_inode(struct lustre_handle *lockh, struct inode *inode)
-{
-        struct ldlm_lock *lock = ldlm_handle2lock(lockh);
-        ENTRY;
+        /* We know what to expect, so we do any byte flipping required here */
+        LASSERT (reply_buffers == 3 || reply_buffers == 1);
+        if (reply_buffers == 3) {
+                struct mds_body *body;
 
-        LASSERT(lock != NULL);
-        lock->l_data = inode;
-        LDLM_LOCK_PUT(lock);
-        EXIT;
-}
+                body = lustre_swab_repbuf (req, 1, sizeof (*body),
+                                           lustre_swab_mds_body);
+                if (body == NULL) {
+                        CERROR ("Can't swab mds_body\n");
+                        RETURN (-EPROTO);
+                }
 
-int mdc_cancel_unused(struct lustre_handle *conn, struct inode *inode,
-                      int flags)
-{
-        struct ldlm_res_id res_id =
-                { .name = {inode->i_ino, inode->i_generation} };
-        struct obd_device *obddev = class_conn2obd(conn);
-        ENTRY;
-        RETURN(ldlm_cli_cancel_unused(obddev->obd_namespace, &res_id, flags));
+                if ((body->valid & OBD_MD_FLEASIZE) != 0) {
+                        /* The eadata is opaque; just check that it is
+                         * there.  Eventually, obd_unpackmd() will check
+                         * the contents */
+                        eadata = lustre_swab_repbuf (req, 2, body->eadatasize, 
+                                                     NULL);
+                        if (eadata == NULL) {
+                                CERROR ("Missing/short eadata\n");
+                                RETURN (-EPROTO);
+                        }
+                }
+        }
+        
+        RETURN(rc);
 }
 
 static void mdc_replay_open(struct ptlrpc_request *req)
 {
-        struct lustre_handle old, *file_fh = req->rq_replay_data;
+        struct obd_client_handle *och = req->rq_replay_data;
+        struct lustre_handle old, *file_fh = &och->och_fh;
         struct list_head *tmp;
-        struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 0);
+        struct mds_body *body;
 
-        mds_unpack_body(body);
+        body = lustre_swab_repbuf (req, 1, sizeof (*body),
+                                   lustre_swab_mds_body);
+        LASSERT (body != NULL);
+        
         memcpy(&old, file_fh, sizeof(old));
-        CDEBUG(D_HA, "updating from "LPD64"/"LPD64" to "LPD64"/"LPD64"\n",
-               file_fh->addr, file_fh->cookie, body->handle.addr,
-               body->handle.cookie);
+        CDEBUG(D_HA, "updating handle from "LPD64" to "LPD64"\n",
+               file_fh->cookie, body->handle.cookie);
         memcpy(file_fh, &body->handle, sizeof(body->handle));
 
         /* A few frames up, ptlrpc_replay holds the lock, so this is safe. */
@@ -480,7 +518,7 @@ static void mdc_replay_open(struct ptlrpc_request *req)
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
                 if (req->rq_reqmsg->opc != MDS_CLOSE)
                         continue;
-                body = lustre_msg_buf(req->rq_reqmsg, 0);
+                body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
                 if (memcmp(&body->handle, &old, sizeof(old)))
                         continue;
 
@@ -489,15 +527,23 @@ static void mdc_replay_open(struct ptlrpc_request *req)
         }
 }
 
-void mdc_set_open_replay_data(struct ll_file_data *fd)
+void mdc_set_open_replay_data(struct obd_client_handle *och)
 {
-        struct ptlrpc_request *req = fd->fd_req;
-        struct mds_rec_create *rec = lustre_msg_buf(req->rq_reqmsg, 2);
-        struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1);
+        struct ptlrpc_request *req = och->och_req;
+        struct mds_rec_create *rec =
+                lustre_msg_buf(req->rq_reqmsg, 2, sizeof (*rec));
+        struct mds_body *body =
+                lustre_msg_buf(req->rq_repmsg, 1, sizeof (*body));
+
+        LASSERT (rec != NULL);
+        /* outgoing messages always in my byte order */
+        LASSERT (body != NULL);
+        /* incoming message in my byte order (it's been swabbed) */
+        LASSERT_REPSWABBED (req, 1);
 
         memcpy(&rec->cr_replayfid, &body->fid1, sizeof rec->cr_replayfid);
-        fd->fd_req->rq_replay_cb = mdc_replay_open;
-        fd->fd_req->rq_replay_data = &fd->fd_mdshandle;
+        req->rq_replay_cb = mdc_replay_open;
+        req->rq_replay_data = och;
 }
 
 int mdc_close(struct lustre_handle *conn, obd_id ino, int type,
@@ -513,7 +559,7 @@ int mdc_close(struct lustre_handle *conn, obd_id ino, int type,
         if (!req)
                 GOTO(out, rc = -ENOMEM);
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
         ll_ino2fid(&body->fid1, ino, 0, type);
         memcpy(&body->handle, fh, sizeof(body->handle));
 
@@ -530,72 +576,83 @@ int mdc_close(struct lustre_handle *conn, obd_id ino, int type,
 }
 
 int mdc_readpage(struct lustre_handle *conn, obd_id ino, int type, __u64 offset,
-                 char *addr, struct ptlrpc_request **request)
+                 struct page *page, struct ptlrpc_request **request)
 {
         struct obd_import *imp = class_conn2cliimp(conn);
-        struct ptlrpc_connection *connection =
-                client_conn2cli(conn)->cl_import.imp_connection;
         struct ptlrpc_request *req = NULL;
         struct ptlrpc_bulk_desc *desc = NULL;
-        struct ptlrpc_bulk_page *bulk = NULL;
         struct mds_body *body;
         int rc, size = sizeof(*body);
         ENTRY;
 
         CDEBUG(D_INODE, "inode: %ld\n", (long)ino);
 
-        desc = ptlrpc_prep_bulk(connection);
-        if (desc == NULL)
-                GOTO(out, rc = -ENOMEM);
-
         req = ptlrpc_prep_req(imp, MDS_READPAGE, 1, &size, NULL);
         if (!req)
-                GOTO(out2, rc = -ENOMEM);
-
+                GOTO(out, rc = -ENOMEM);
         /* XXX FIXME bug 249 */
         req->rq_request_portal = MDS_READPAGE_PORTAL;
 
-        bulk = ptlrpc_prep_bulk_page(desc);
-        if (bulk == NULL)
-                GOTO(out2, rc = -ENOMEM);
-
-        bulk->bp_xid = ptlrpc_next_xid();
-        bulk->bp_buflen = PAGE_CACHE_SIZE;
-        bulk->bp_buf = addr;
-
-        desc->bd_ptl_ev_hdlr = NULL;
-        desc->bd_portal = MDS_BULK_PORTAL;
-
-        rc = ptlrpc_register_bulk_put(desc);
-        if (rc) {
-                CERROR("couldn't setup bulk sink: error %d.\n", rc);
-                GOTO(out2, rc);
+        desc = ptlrpc_prep_bulk_imp (req, BULK_PUT_SINK, MDS_BULK_PORTAL);
+        if (desc == NULL) {
+                GOTO(out, rc = -ENOMEM);
         }
+        /* NB req now owns desc and will free it when the request is freed */
 
-        mds_readdir_pack(req, offset, ino, type, bulk->bp_xid);
+        rc = ptlrpc_prep_bulk_page(desc, page, 0, PAGE_CACHE_SIZE);
+        if (rc != 0)
+                GOTO(out, rc);
+
+        mds_readdir_pack(req, offset, PAGE_CACHE_SIZE, ino, type);
 
         req->rq_replen = lustre_msg_size(1, &size);
         rc = ptlrpc_queue_wait(req);
-        if (rc) {
-                ptlrpc_abort_bulk(desc);
-                GOTO(out2, rc);
-        } else {
-                body = lustre_msg_buf(req->rq_repmsg, 0);
-                mds_unpack_body(body);
+
+        if (rc == 0) {
+                LASSERT (desc->bd_page_count == 1);
+                body = lustre_swab_repbuf (req, 0, sizeof (*body),
+                                           lustre_swab_mds_body);
+                if (body == NULL) {
+                        CERROR ("Can't unpack mds_body\n");
+                        GOTO (out, rc = -EPROTO);
+                }
         }
 
         EXIT;
- out2:
-        ptlrpc_bulk_decref(desc);
  out:
         *request = req;
         return rc;
 }
 
+static int mdc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
+                         void *karg, void *uarg)
+{
+        struct obd_device *obddev = class_conn2obd(conn);
+        struct obd_ioctl_data *data = karg;
+        struct obd_import *imp = obddev->u.cli.cl_import;
+        ENTRY;
+
+        switch (cmd) {
+        case OBD_IOC_CLIENT_RECOVER:
+                RETURN(ptlrpc_recover_import(imp, data->ioc_inlbuf1));
+        case IOC_OSC_SET_ACTIVE:
+                if (data->ioc_offset) {
+                        CERROR("%s: can't reactivate MDC\n",
+                               obddev->obd_uuid.uuid);
+                        RETURN(-ENOTTY);
+                }
+                RETURN(ptlrpc_set_import_active(imp, 0));
+        default:
+                CERROR("mdc_iocontrol(): unrecognised ioctl %#x\n", cmd);
+                RETURN(-ENOTTY);
+        }
+}
+
 static int mdc_statfs(struct lustre_handle *conn, struct obd_statfs *osfs)
 {
         struct ptlrpc_request *req;
-        int rc, size = sizeof(*osfs);
+        struct obd_statfs *msfs;
+        int rc, size = sizeof(*msfs);
         ENTRY;
 
         req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_STATFS, 0, NULL,
@@ -612,8 +669,14 @@ static int mdc_statfs(struct lustre_handle *conn, struct obd_statfs *osfs)
         if (rc)
                 GOTO(out, rc);
 
-        obd_statfs_unpack(osfs, lustre_msg_buf(req->rq_repmsg, 0));
-
+        msfs = lustre_swab_repbuf (req, 0, sizeof (*msfs),
+                                   lustre_swab_obd_statfs);
+        if (msfs == NULL) {
+                CERROR ("Can't unpack obd_statfs\n");
+                GOTO (out, rc = -EPROTO);
+        }
+
+        memcpy (osfs, msfs, sizeof (*msfs));
         EXIT;
 out:
         ptlrpc_req_finished(req);
@@ -634,122 +697,19 @@ static int mdc_detach(struct obd_device *dev)
         return lprocfs_obd_detach(dev);
 }
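 
 mdc_statfs (and mdc_readpage above) now go through lustre_swab_repbuf() rather than reading lustre_msg_buf() and unpacking by hand, so byte-swapping for a peer of different endianness happens in one place. The reply-side pattern as it appears in these hunks, sketched with the statfs swabber; the callback name is taken from the hunk, not invented here:
 
         struct obd_statfs *msfs;
 
         rc = ptlrpc_queue_wait(req);
         if (rc)
                 GOTO(out, rc);
 
         /* returns a pointer into the reply buffer, swabbed if needed;
          * NULL means the buffer was missing or too small */
         msfs = lustre_swab_repbuf(req, 0, sizeof(*msfs),
                                   lustre_swab_obd_statfs);
         if (msfs == NULL)
                 GOTO(out, rc = -EPROTO);
         memcpy(osfs, msfs, sizeof(*msfs));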
 
-/* Send a mostly-dummy GETSTATUS request and indicate that we're done replay. */
-static int signal_completed_replay(struct obd_import *imp)
-{
-        struct ll_fid fid;
-
-        return send_getstatus(imp, &fid, LUSTRE_CONN_RECOVD, MSG_LAST_REPLAY);
-}
-
-static int mdc_recover(struct obd_import *imp, int phase)
-{
-        int rc;
-        unsigned long flags;
-        struct ptlrpc_request *req;
-        struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
-        ENTRY;
-
-        switch(phase) {
-            case PTLRPC_RECOVD_PHASE_PREPARE:
-                ldlm_cli_cancel_unused(ns, NULL, LDLM_FL_LOCAL_ONLY);
-                RETURN(0);
-
-            case PTLRPC_RECOVD_PHASE_NOTCONN:
-                ldlm_namespace_cleanup(ns, 1);
-                ptlrpc_abort_inflight(imp, 0);
-                /* FALL THROUGH */
-            case PTLRPC_RECOVD_PHASE_RECOVER:
-        reconnect:
-                rc = ptlrpc_reconnect_import(imp, MDS_CONNECT, &req);
-
-                flags = req->rq_repmsg
-                        ? lustre_msg_get_op_flags(req->rq_repmsg)
-                        : 0;
-
-                if (rc == -EBUSY && (flags & MSG_CONNECT_RECOVERING))
-                        CERROR("reconnect denied by recovery; should retry\n");
-
-                if (rc) {
-                        if (phase != PTLRPC_RECOVD_PHASE_NOTCONN) {
-                                CERROR("can't reconnect, invalidating\n");
-                                ldlm_namespace_cleanup(ns, 1);
-                                ptlrpc_abort_inflight(imp, 0);
-                        }
-                        ptlrpc_req_finished(req);
-                        RETURN(rc);
-                }
-
-                if (flags & MSG_CONNECT_RECOVERING) {
-                        /* Replay if they want it. */
-                        DEBUG_REQ(D_HA, req, "MDS wants replay");
-                        rc = ptlrpc_replay(imp);
-                        if (rc)
-                                GOTO(check_rc, rc);
-
-                        rc = ldlm_replay_locks(imp);
-                        if (rc)
-                                GOTO(check_rc, rc);
-
-                        rc = signal_completed_replay(imp);
-                        if (rc)
-                                GOTO(check_rc, rc);
-                } else if (flags & MSG_CONNECT_RECONNECT) {
-                        DEBUG_REQ(D_HA, req, "reconnecting to MDS");
-                        /* Nothing else to do here. */
-                } else {
-                        DEBUG_REQ(D_HA, req, "evicted: invalidating");
-                        /* Otherwise, clean everything up. */
-                        ldlm_namespace_cleanup(ns, 1);
-                        ptlrpc_abort_inflight(imp, 0);
-                }
-
-                ptlrpc_req_finished(req);
-                spin_lock_irqsave(&imp->imp_lock, flags);
-                imp->imp_level = LUSTRE_CONN_FULL;
-                spin_unlock_irqrestore(&imp->imp_lock, flags);
-
-                ptlrpc_wake_delayed(imp);
-
-                rc = ptlrpc_resend(imp);
-                if (rc)
-                        GOTO(check_rc, rc);
-
-                RETURN(0);
-        check_rc:
-                /* If we get disconnected in the middle, recovery has probably
-                 * failed.  Reconnect and find out.
-                 */
-                if (rc == -ENOTCONN)
-                        goto reconnect;
-                RETURN(rc);
-
-            default:
-                RETURN(-EINVAL);
-        }
-}
-
-static int mdc_connect(struct lustre_handle *conn, struct obd_device *obd,
-                       struct obd_uuid *cluuid, struct recovd_obd *recovd,
-                       ptlrpc_recovery_cb_t recover)
-{
-        struct obd_import *imp = &obd->u.cli.cl_import;
-        imp->imp_recover = mdc_recover;
-        return client_obd_connect(conn, obd, cluuid, recovd, recover);
-}
-
 struct obd_ops mdc_obd_ops = {
         o_owner:       THIS_MODULE,
         o_attach:      mdc_attach,
         o_detach:      mdc_detach,
         o_setup:       client_obd_setup,
         o_cleanup:     client_obd_cleanup,
-        o_connect:     mdc_connect,
-        o_disconnect:  client_obd_disconnect,
+        o_connect:     client_import_connect,
+        o_disconnect:  client_import_disconnect,
+        o_iocontrol:   mdc_iocontrol,
         o_statfs:      mdc_statfs
 };
 
-static int __init ptlrpc_request_init(void)
+int __init mdc_init(void)
 {
         struct lprocfs_static_vars lvars;
         mdc_init_rpc_lock(&mdc_rpc_lock);
@@ -759,11 +719,12 @@ static int __init ptlrpc_request_init(void)
                                    LUSTRE_MDC_NAME);
 }
 
-static void __exit ptlrpc_request_exit(void)
+static void __exit mdc_exit(void)
 {
         class_unregister_type(LUSTRE_MDC_NAME);
 }
 
+#ifdef __KERNEL__
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
 MODULE_DESCRIPTION("Lustre Metadata Client");
 MODULE_LICENSE("GPL");
@@ -771,7 +732,6 @@ MODULE_LICENSE("GPL");
 EXPORT_SYMBOL(mdc_getstatus);
 EXPORT_SYMBOL(mdc_getlovinfo);
 EXPORT_SYMBOL(mdc_enqueue);
-EXPORT_SYMBOL(mdc_cancel_unused);
 EXPORT_SYMBOL(mdc_getattr);
 EXPORT_SYMBOL(mdc_getattr_name);
 EXPORT_SYMBOL(mdc_create);
@@ -781,10 +741,10 @@ EXPORT_SYMBOL(mdc_link);
 EXPORT_SYMBOL(mdc_readpage);
 EXPORT_SYMBOL(mdc_setattr);
 EXPORT_SYMBOL(mdc_close);
-EXPORT_SYMBOL(mdc_lock_set_inode);
 EXPORT_SYMBOL(mdc_set_open_replay_data);
 
 EXPORT_SYMBOL(mdc_store_inode_generation);
 
-module_init(ptlrpc_request_init);
-module_exit(ptlrpc_request_exit);
+module_init(mdc_init);
+module_exit(mdc_exit);
+#endif
index f789c22..cb63910 100644 (file)
@@ -4,21 +4,10 @@
 # See the file COPYING in this distribution
 
 DEFS= 
-
 MODULE = mds
-
 modulefs_DATA = mds.o
 EXTRA_PROGRAMS = mds
-
-LINX= mds_updates.c mds_open.c simple.c target.c
-
-mds_updates.c: 
-       test -e mds_updates.c || ln -sf $(top_srcdir)/lib/mds_updates.c
-simple.c: 
-       test -e simple.c || ln -sf $(top_srcdir)/lib/simple.c
-target.c: 
-       test -e target.c || ln -sf $(top_srcdir)/lib/target.c
-
-mds_SOURCES = mds_lov.c handler.c mds_reint.c mds_fs.c lproc_mds.c $(LINX)
+mds_SOURCES = mds_lov.c handler.c mds_reint.c mds_fs.c lproc_mds.c mds_open.c \
+mds_lib.c mds_internal.h
 
 include $(top_srcdir)/Rules
diff --git a/lustre/mds/Makefile.mk b/lustre/mds/Makefile.mk
new file mode 100644 (file)
index 0000000..6b712fb
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include fs/lustre/portals/Kernelenv
+
+obj-y += mds.o
+
+mds-objs    := mds_lov.o handler.o mds_reint.o mds_fs.o lproc_mds.o mds_updates.o mds_open.o simple.o target.o
index 58cfa20..259a6bc 100644 (file)
 #include <linux/init.h>
 #include <linux/obd_class.h>
 #include <linux/random.h>
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/ext3_fs.h>
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-#include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
-#include <linux/workqueue.h>
-#include <linux/mount.h>
-#else 
-#include <linux/locks.h>
+# include <linux/smp_lock.h>
+# include <linux/buffer_head.h>
+# include <linux/workqueue.h>
+# include <linux/mount.h>
+#else
+# include <linux/locks.h>
 #endif
 #include <linux/obd_lov.h>
 #include <linux/lustre_mds.h>
 #include <linux/lustre_fsfilt.h>
 #include <linux/lprocfs_status.h>
-
-kmem_cache_t *mds_file_cache;
+#include "mds_internal.h"
 
 extern int mds_get_lovtgts(struct mds_obd *obd, int tgt_count,
                            struct obd_uuid *uuidarray);
 extern int mds_get_lovdesc(struct mds_obd  *obd, struct lov_desc *desc);
 int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle,
                        struct ptlrpc_request *req, int rc, int disp);
-static int mds_cleanup(struct obd_device * obddev);
+static int mds_cleanup(struct obd_device * obddev, int force, int failover);
 
 inline struct mds_obd *mds_req2mds(struct ptlrpc_request *req)
 {
@@ -65,9 +67,13 @@ inline struct mds_obd *mds_req2mds(struct ptlrpc_request *req)
 static int mds_bulk_timeout(void *data)
 {
         struct ptlrpc_bulk_desc *desc = data;
+        struct obd_export *exp = desc->bd_export;
 
-        ENTRY;
-        recovd_conn_fail(desc->bd_connection);
+        CERROR("bulk send timed out: evicting %s@%s\n",
+               exp->exp_client_uuid.uuid,
+               exp->exp_connection->c_remote_uuid.uuid);
+        ptlrpc_fail_export(exp);
+        ptlrpc_abort_bulk (desc);
         RETURN(1);
 }
 
@@ -76,39 +82,35 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file,
                         __u64 offset, __u64 xid)
 {
         struct ptlrpc_bulk_desc *desc;
-        struct ptlrpc_bulk_page *bulk;
         struct l_wait_info lwi;
-        char *buf;
+        struct page *page;
         int rc = 0;
         ENTRY;
 
-        desc = ptlrpc_prep_bulk(req->rq_connection);
+        LASSERT ((offset & (PAGE_CACHE_SIZE - 1)) == 0);
+
+        desc = ptlrpc_prep_bulk_exp (req, BULK_PUT_SOURCE, MDS_BULK_PORTAL);
         if (desc == NULL)
                 GOTO(out, rc = -ENOMEM);
 
-        bulk = ptlrpc_prep_bulk_page(desc);
-        if (bulk == NULL)
+        LASSERT (PAGE_SIZE == PAGE_CACHE_SIZE);
+        page = alloc_pages (GFP_KERNEL, 0);
+        if (page == NULL)
                 GOTO(cleanup_bulk, rc = -ENOMEM);
 
-        OBD_ALLOC(buf, PAGE_CACHE_SIZE);
-        if (buf == NULL)
-                GOTO(cleanup_bulk, rc = -ENOMEM);
+        rc = ptlrpc_prep_bulk_page(desc, page, 0, PAGE_CACHE_SIZE);
+        if (rc != 0)
+                GOTO(cleanup_buf, rc);
 
         CDEBUG(D_EXT2, "reading %lu@"LPU64" from dir %lu (size %llu)\n",
                PAGE_CACHE_SIZE, offset, file->f_dentry->d_inode->i_ino,
                file->f_dentry->d_inode->i_size);
-        rc = fsfilt_readpage(req->rq_export->exp_obd, file, buf,
+        rc = fsfilt_readpage(req->rq_export->exp_obd, file, page_address (page),
                              PAGE_CACHE_SIZE, (loff_t *)&offset);
 
         if (rc != PAGE_CACHE_SIZE)
                 GOTO(cleanup_buf, rc = -EIO);
 
-        bulk->bp_xid = xid;
-        bulk->bp_buf = buf;
-        bulk->bp_buflen = PAGE_CACHE_SIZE;
-        desc->bd_ptl_ev_hdlr = NULL;
-        desc->bd_portal = MDS_BULK_PORTAL;
-
         rc = ptlrpc_bulk_put(desc);
         if (rc)
                 GOTO(cleanup_buf, rc);
@@ -121,19 +123,17 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file,
         }
 
         lwi = LWI_TIMEOUT(obd_timeout * HZ, mds_bulk_timeout, desc);
-        rc = l_wait_event(desc->bd_waitq, desc->bd_flags & PTL_BULK_FL_SENT,
-                          &lwi);
+        rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete (desc), &lwi);
         if (rc) {
-                if (rc != -ETIMEDOUT)
-                        LBUG();
+                LASSERT (rc == -ETIMEDOUT);
                 GOTO(cleanup_buf, rc);
         }
 
         EXIT;
  cleanup_buf:
-        OBD_FREE(buf, PAGE_SIZE);
+        __free_pages (page, 0);
  cleanup_bulk:
-        ptlrpc_bulk_decref(desc);
+        ptlrpc_free_bulk (desc);
  out:
         return rc;
 }
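 
 On the server side, mds_sendpage pairs ptlrpc_bulk_put() with a timed wait on ptlrpc_bulk_complete(); the timeout callback (mds_bulk_timeout above) now evicts the client directly instead of going through recovd. How the two pieces fit together, condensed from this hunk:
 
         rc = ptlrpc_bulk_put(desc);
         if (rc)
                 GOTO(cleanup_buf, rc);
 
         /* if the client never takes the bulk, mds_bulk_timeout() fails
          * the export and aborts the bulk, so this wait returns
          * -ETIMEDOUT rather than blocking forever */
         lwi = LWI_TIMEOUT(obd_timeout * HZ, mds_bulk_timeout, desc);
         rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete(desc), &lwi);
 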
@@ -157,7 +157,7 @@ struct dentry *mds_fid2locked_dentry(struct obd_device *obd, struct ll_fid *fid,
         rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL,
                               res_id, LDLM_PLAIN, NULL, 0, lock_mode,
                               &flags, ldlm_completion_ast,
-                              mds_blocking_ast, NULL, NULL, lockh);
+                              mds_blocking_ast, NULL, lockh);
         if (rc != ELDLM_OK) {
                 l_dput(de);
                 retval = ERR_PTR(-ENOLCK); /* XXX translate ldlm code */
@@ -171,67 +171,52 @@ struct dentry *mds_fid2locked_dentry(struct obd_device *obd, struct ll_fid *fid,
 #endif
 
 
-
 /* Look up an entry by inode number. */
 /* this function ONLY returns valid dget'd dentries with an initialized inode
    or errors */
 struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid,
                               struct vfsmount **mnt)
 {
-        /* stolen from NFS */
-        struct super_block *sb = mds->mds_sb;
+        char fid_name[32];
         unsigned long ino = fid->id;
         __u32 generation = fid->generation;
         struct inode *inode;
-        struct list_head *lp;
         struct dentry *result;
 
         if (ino == 0)
                 RETURN(ERR_PTR(-ESTALE));
 
-        inode = iget(sb, ino);
-        if (inode == NULL)
-                RETURN(ERR_PTR(-ENOMEM));
+        snprintf(fid_name, sizeof(fid_name), "0x%lx", ino);
 
-        CDEBUG(D_DENTRY, "--> mds_fid2dentry: sb %p\n", inode->i_sb);
+        /* under ext3 this is supposed to return neither bad inodes
+           nor NULL inodes. */
+        result = ll_lookup_one_len(fid_name, mds->mds_fid_de, strlen(fid_name));
+        if (IS_ERR(result))
+                RETURN(result);
 
-        if (is_bad_inode(inode) ||
-            (generation && inode->i_generation != generation)) {
+        inode = result->d_inode;
+        if (!inode)
+                RETURN(ERR_PTR(-ENOENT));
+
+        CDEBUG(D_DENTRY, "--> mds_fid2dentry: ino %lu, gen %u, sb %p\n",
+               inode->i_ino, inode->i_generation, inode->i_sb);
+
+        if (generation && inode->i_generation != generation) {
                 /* we didn't find the right inode.. */
-                CERROR("bad inode %lu, link: %d ct: %d or version  %u/%u\n",
+                CERROR("bad inode %lu, link: %d ct: %d or generation %u/%u\n",
                        inode->i_ino, inode->i_nlink,
                        atomic_read(&inode->i_count), inode->i_generation,
                        generation);
-                iput(inode);
+                dput(result);
                 RETURN(ERR_PTR(-ENOENT));
         }
 
-        /* now to find a dentry. If possible, get a well-connected one */
-        if (mnt)
+        if (mnt) {
                 *mnt = mds->mds_vfsmnt;
-        spin_lock(&dcache_lock);
-        list_for_each(lp, &inode->i_dentry) {
-                result = list_entry(lp, struct dentry, d_alias);
-                if (!(result->d_flags & DCACHE_DISCONNECTED)) {
-                        dget_locked(result);
-                        result->d_vfs_flags |= DCACHE_REFERENCED;
-                        spin_unlock(&dcache_lock);
-                        iput(inode);
-                        if (mnt)
-                                mntget(*mnt);
-                        return result;
-                }
-        }
-        spin_unlock(&dcache_lock);
-        result = d_alloc_root(inode);
-        if (result == NULL) {
-                iput(inode);
-                return ERR_PTR(-ENOMEM);
-        }
-        if (mnt)
                 mntget(*mnt);
-        result->d_flags |= DCACHE_DISCONNECTED;
-        return result;
+        }
+
+        RETURN(result);
 }
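 
 mds_fid2dentry now resolves an inode number by a name lookup under the iopen pseudo-directory dentry (mds->mds_fid_de; the filesystem is mounted with iopen_nopriv later in this patch) instead of iget() plus a dcache walk. A sketch of a caller and the checks it still has to make; passing NULL for mnt is an assumption made here for brevity, not something this hunk requires:
 
         struct dentry *de;
 
         de = mds_fid2dentry(mds, fid, NULL);
         if (IS_ERR(de))
                 GOTO(out, rc = PTR_ERR(de));    /* -ESTALE, -ENOENT, ... */
 
         /* the dentry is dget'd and d_inode is non-NULL with a matching
          * generation; drop it with l_dput() when done */
         /* ... use de->d_inode ... */
         l_dput(de);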
 
 
@@ -242,13 +227,12 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid,
  * on the server, etc.
  */
 static int mds_connect(struct lustre_handle *conn, struct obd_device *obd,
-                       struct obd_uuid *cluuid, struct recovd_obd *recovd,
-                       ptlrpc_recovery_cb_t recover)
+                       struct obd_uuid *cluuid)
 {
         struct obd_export *exp;
         struct mds_export_data *med;
         struct mds_client_data *mcd;
-        int rc;
+        int rc, abort_recovery;
         ENTRY;
 
         if (!conn || !obd || !cluuid)
@@ -256,9 +240,10 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd,
 
         /* Check for aborted recovery. */
         spin_lock_bh(&obd->obd_processing_task_lock);
-        if (obd->obd_flags & OBD_ABORT_RECOVERY)
-                target_abort_recovery(obd);
+        abort_recovery = obd->obd_abort_recovery;
         spin_unlock_bh(&obd->obd_processing_task_lock);
+        if (abort_recovery)
+                target_abort_recovery(obd);
 
         /* XXX There is a small race between checking the list and adding a
          * new connection for the same UUID, but the real threat (list
@@ -276,6 +261,7 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd,
         exp = class_conn2export(conn);
         LASSERT(exp);
         med = &exp->exp_mds_data;
+        class_export_put(exp);
 
         OBD_ALLOC(mcd, sizeof(*mcd));
         if (!mcd) {
@@ -289,7 +275,7 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd,
         INIT_LIST_HEAD(&med->med_open_head);
         spin_lock_init(&med->med_open_lock);
 
-        rc = mds_client_add(&obd->u.mds, med, -1);
+        rc = mds_client_add(obd, &obd->u.mds, med, -1);
         if (rc)
                 GOTO(out_mcd, rc);
 
@@ -298,42 +284,116 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd,
 out_mcd:
         OBD_FREE(mcd, sizeof(*mcd));
 out_export:
-        class_disconnect(conn);
+        class_disconnect(conn, 0);
 
         return rc;
 }
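 
 mds_connect now samples the abort flag under obd_processing_task_lock and calls target_abort_recovery() only after dropping the spinlock; mds_handle later in this file does the same. The pattern on its own, with the comment reflecting this editor's reading of why the call was moved out of the locked region:
 
         int abort_recovery;
 
         spin_lock_bh(&obd->obd_processing_task_lock);
         abort_recovery = obd->obd_abort_recovery;
         spin_unlock_bh(&obd->obd_processing_task_lock);
 
         /* run the heavyweight abort path outside the spinlock */
         if (abort_recovery)
                 target_abort_recovery(obd);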
 
+static void mds_mfd_addref(void *mfdp)
+{
+        struct mds_file_data *mfd = mfdp;
+
+        atomic_inc(&mfd->mfd_refcount);
+        CDEBUG(D_INFO, "GETting mfd %p : new refcount %d\n", mfd,
+               atomic_read(&mfd->mfd_refcount));
+}
+
+struct mds_file_data *mds_mfd_new(void)
+{
+        struct mds_file_data *mfd;
+
+        OBD_ALLOC(mfd, sizeof *mfd);
+        if (mfd == NULL) {
+                CERROR("mds: out of memory\n");
+                return NULL;
+        }
+
+        atomic_set(&mfd->mfd_refcount, 2);
+
+        INIT_LIST_HEAD(&mfd->mfd_handle.h_link);
+        class_handle_hash(&mfd->mfd_handle, mds_mfd_addref);
+
+        return mfd;
+}
+
+static struct mds_file_data *mds_handle2mfd(struct lustre_handle *handle)
+{
+        ENTRY;
+        LASSERT(handle != NULL);
+        RETURN(class_handle2object(handle->cookie));
+}
+
+void mds_mfd_put(struct mds_file_data *mfd)
+{
+        CDEBUG(D_INFO, "PUTting mfd %p : new refcount %d\n", mfd,
+               atomic_read(&mfd->mfd_refcount) - 1);
+        LASSERT(atomic_read(&mfd->mfd_refcount) > 0 &&
+                atomic_read(&mfd->mfd_refcount) < 0x5a5a);
+        if (atomic_dec_and_test(&mfd->mfd_refcount)) {
+                LASSERT(list_empty(&mfd->mfd_handle.h_link));
+                OBD_FREE(mfd, sizeof *mfd);
+        }
+}
+
+void mds_mfd_destroy(struct mds_file_data *mfd)
+{
+        class_handle_unhash(&mfd->mfd_handle);
+        mds_mfd_put(mfd);
+}
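 
 The mds_file_data helpers above replace the kmem_cache/servercookie scheme with refcounted objects hashed by lustre_handle. The close path later in this file uses them roughly as sketched below (condensed; the reply packing between the steps is dropped):
 
         mfd = mds_handle2mfd(&body->handle);    /* takes a reference via
                                                  * mds_mfd_addref() */
         if (mfd == NULL)
                 RETURN(-ESTALE);                /* stale or bogus handle */
 
         /* mds_close_mfd(), under med->med_open_lock, unhashes the handle
          * and drops the hash reference through mds_mfd_destroy() */
 
         mds_mfd_put(mfd);                       /* drop the lookup ref */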
+
 /* Call with med->med_open_lock held, please. */
-inline int mds_close_mfd(struct mds_file_data *mfd, struct mds_export_data *med)
+static int mds_close_mfd(struct mds_file_data *mfd, struct mds_export_data *med)
 {
-        struct file *file = mfd->mfd_file;
-        int rc;
         struct dentry *de = NULL;
-        LASSERT(file->private_data == mfd);
-
-        LASSERT(mfd->mfd_servercookie != DEAD_HANDLE_MAGIC);
 
+#ifdef CONFIG_SMP
+        LASSERT(spin_is_locked(&med->med_open_lock));
+#endif
         list_del(&mfd->mfd_list);
-        mfd->mfd_servercookie = DEAD_HANDLE_MAGIC;
-        kmem_cache_free(mds_file_cache, mfd);
 
-        if (file->f_dentry->d_parent) {
-                LASSERT(atomic_read(&file->f_dentry->d_parent->d_count));
-                de = dget(file->f_dentry->d_parent);
+        if (mfd->mfd_dentry->d_parent) {
+                LASSERT(atomic_read(&mfd->mfd_dentry->d_parent->d_count));
+                de = dget(mfd->mfd_dentry->d_parent);
         }
-        rc = filp_close(file, 0);
+
+        /* this is the actual "close" */
+        l_dput(mfd->mfd_dentry);
+
         if (de)
                 l_dput(de);
-        RETURN(rc);
+
+        mds_mfd_destroy(mfd);
+        RETURN(0);
 }
 
-static int mds_disconnect(struct lustre_handle *conn)
+static int mds_disconnect(struct lustre_handle *conn, int failover)
 {
         struct obd_export *export = class_conn2export(conn);
-        struct list_head *tmp, *n;
+        int rc;
+        unsigned long flags;
+        ENTRY;
+
+        ldlm_cancel_locks_for_export(export);
+
+        spin_lock_irqsave(&export->exp_lock, flags);
+        export->exp_failover = failover;
+        spin_unlock_irqrestore(&export->exp_lock, flags);
+
+        rc = class_disconnect(conn, failover);
+        class_export_put(export);
+
+        RETURN(rc);
+}
+
+static void mds_destroy_export(struct obd_export *export)
+{
         struct mds_export_data *med = &export->exp_mds_data;
+        struct list_head *tmp, *n;
         int rc;
+
         ENTRY;
+        LASSERT(!strcmp(export->exp_obd->obd_type->typ_name,
+                        LUSTRE_MDS_NAME));
 
         /*
          * Close any open files.
@@ -342,28 +402,39 @@ static int mds_disconnect(struct lustre_handle *conn)
         list_for_each_safe(tmp, n, &med->med_open_head) {
                 struct mds_file_data *mfd =
                         list_entry(tmp, struct mds_file_data, mfd_list);
-                CERROR("force closing client file handle for %*s\n",
-                       mfd->mfd_file->f_dentry->d_name.len,
-                       mfd->mfd_file->f_dentry->d_name.name);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+                struct dentry *dentry = mfd->mfd_dentry;
+                CERROR("force closing client file handle for %*s (%s:%lu)\n",
+                       dentry->d_name.len, dentry->d_name.name,
+                       kdevname(dentry->d_inode->i_sb->s_dev),
+                       dentry->d_inode->i_ino);
+#endif
                 rc = mds_close_mfd(mfd, med);
                 if (rc)
                         CDEBUG(D_INODE, "Error closing file: %d\n", rc);
         }
         spin_unlock(&med->med_open_lock);
 
-        ldlm_cancel_locks_for_export(export);
-        if (med->med_outstanding_reply) {
+        if (export->exp_outstanding_reply) {
+                struct ptlrpc_request *req = export->exp_outstanding_reply;
+                unsigned long          flags;
+
                 /* Fake the ack, so the locks get cancelled. */
-                med->med_outstanding_reply->rq_flags &= ~PTL_RPC_FL_WANT_ACK;
-                med->med_outstanding_reply->rq_flags |= PTL_RPC_FL_ERR;
-                wake_up(&med->med_outstanding_reply->rq_wait_for_rep);
-                med->med_outstanding_reply = NULL;
-        }
-        mds_client_free(export);
+                LBUG ();
+                /* Actually we can't do this because it prevents us knowing
+                 * if the ACK callback ran or not */
+                spin_lock_irqsave (&req->rq_lock, flags);
+                req->rq_want_ack = 0;
+                req->rq_err = 1;
+                wake_up(&req->rq_wait_for_rep);
+                spin_unlock_irqrestore (&req->rq_lock, flags);
 
-        rc = class_disconnect(conn);
+                export->exp_outstanding_reply = NULL;
+        }
 
-        RETURN(rc);
+        if (!export->exp_failover)
+                mds_client_free(export);
+        EXIT;
 }
 
 /*
@@ -393,7 +464,7 @@ static int mds_getstatus(struct ptlrpc_request *req)
         rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
         if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_GETSTATUS_PACK)) {
                 CERROR("mds: out of memory for message: size=%d\n", size);
-                req->rq_status = -ENOMEM;
+                req->rq_status = -ENOMEM;       /* superfluous? */
                 RETURN(-ENOMEM);
         }
 
@@ -404,7 +475,7 @@ static int mds_getstatus(struct ptlrpc_request *req)
          */
         mds_fsync_super(mds->mds_sb);
 
-        body = lustre_msg_buf(req->rq_repmsg, 0);
+        body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
         memcpy(&body->fid1, &mds->mds_rootfid, sizeof(body->fid1));
 
         /* the last_committed and last_xid fields are filled in for all
@@ -418,19 +489,28 @@ static int mds_getlovinfo(struct ptlrpc_request *req)
         struct mds_obd *mds = mds_req2mds(req);
         struct mds_status_req *streq;
         struct lov_desc *desc;
+        struct obd_uuid *uuid0;
         int tgt_count;
         int rc, size[2] = {sizeof(*desc)};
         ENTRY;
 
-        streq = lustre_msg_buf(req->rq_reqmsg, 0);
-        streq->flags = NTOH__u32(streq->flags);
-        streq->repbuf = NTOH__u32(streq->repbuf);
+        streq = lustre_swab_reqbuf (req, 0, sizeof (*streq),
+                                    lustre_swab_mds_status_req);
+        if (streq == NULL) {
+                CERROR ("Can't unpack mds_status_req\n");
+                RETURN (-EFAULT);
+        }
+
+        if (streq->repbuf > LOV_MAX_UUID_BUFFER_SIZE) {
+                CERROR ("Illegal request for uuid array > %d\n",
+                        streq->repbuf);
+                RETURN (-EINVAL);
+        }
         size[1] = streq->repbuf;
 
         rc = lustre_pack_msg(2, size, NULL, &req->rq_replen, &req->rq_repmsg);
         if (rc) {
                 CERROR("mds: out of memory for message: size=%d\n", size[1]);
-                req->rq_status = -ENOMEM;
                 RETURN(-ENOMEM);
         }
 
@@ -439,18 +519,21 @@ static int mds_getlovinfo(struct ptlrpc_request *req)
                 RETURN(0);
         }
 
-        desc = lustre_msg_buf(req->rq_repmsg, 0);
-        memcpy(desc, &mds->mds_lov_desc, sizeof *desc);
-        lov_packdesc(desc);
-        tgt_count = le32_to_cpu(desc->ld_tgt_count);
-        if (tgt_count * sizeof(struct obd_uuid) > streq->repbuf) {
+        /* XXX We're sending the lov_desc in my byte order.
+         * Receiver will swab... */
+        desc = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*desc));
+        memcpy(desc, &mds->mds_lov_desc, sizeof (*desc));
+
+        tgt_count = mds->mds_lov_desc.ld_tgt_count;
+        uuid0 = lustre_msg_buf (req->rq_repmsg, 1,
+                                tgt_count * sizeof (*uuid0));
+        if (uuid0 == NULL) {
                 CERROR("too many targets, enlarge client buffers\n");
                 req->rq_status = -ENOSPC;
                 RETURN(0);
         }
 
-        rc = mds_get_lovtgts(mds, tgt_count,
-                             lustre_msg_buf(req->rq_repmsg, 1));
+        rc = mds_get_lovtgts(mds, tgt_count, uuid0);
         if (rc) {
                 CERROR("get_lovtgts error %d\n", rc);
                 req->rq_status = rc;
@@ -507,17 +590,19 @@ int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg,
 {
         struct mds_obd *mds = &obd->u.mds;
         struct lov_mds_md *lmm;
-        int lmm_size = msg->buflens[offset];
+        int lmm_size;
         int rc;
         ENTRY;
 
-        if (lmm_size == 0) {
+        lmm = lustre_msg_buf(msg, offset, 0);
+        if (lmm == NULL) {
+                /* Some problem with getting eadata when I sized the reply
+                 * buffer... */
                 CDEBUG(D_INFO, "no space reserved for inode %lu MD\n",
                        inode->i_ino);
                 RETURN(0);
         }
-
-        lmm = lustre_msg_buf(msg, offset);
+        lmm_size = msg->buflens[offset];
 
         /* I don't really like this, but it is a sanity check on the client
          * MD request.  However, if the client doesn't know how much space
@@ -529,15 +614,13 @@ int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg,
                 // RETURN(-EINVAL);
         }
 
-        /* We don't need to store the reply size, because this buffer is
-         * discarded right after unpacking, and the LOV can figure out the
-         * size itself from the ost count.
-         */
-        if ((rc = fsfilt_get_md(obd, inode, lmm, lmm_size)) < 0) {
-                CDEBUG(D_INFO, "No md for ino %lu: rc = %d\n",
-                       inode->i_ino, rc);
+        rc = fsfilt_get_md(obd, inode, lmm, lmm_size);
+        if (rc < 0) {
+                CERROR ("Error %d reading eadata for ino %lu\n",
+                        rc, inode->i_ino);
         } else if (rc > 0) {
                 body->valid |= OBD_MD_FLEASIZE;
+                body->eadatasize = rc;
                 rc = 0;
         }
 
@@ -556,24 +639,36 @@ static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry,
         if (inode == NULL)
                 RETURN(-ENOENT);
 
-        body = lustre_msg_buf(req->rq_repmsg, reply_off);
+        body = lustre_msg_buf(req->rq_repmsg, reply_off, sizeof (*body));
+        LASSERT (body != NULL);                 /* caller prepped reply */
 
         mds_pack_inode2fid(&body->fid1, inode);
         mds_pack_inode2body(body, inode);
 
-        if (S_ISREG(inode->i_mode) && reqbody->valid & OBD_MD_FLEASIZE) {
+        if (S_ISREG(inode->i_mode) &&
+            (reqbody->valid & OBD_MD_FLEASIZE) != 0) {
                 rc = mds_pack_md(obd, req->rq_repmsg, reply_off + 1,
                                  body, inode);
-        } else if (S_ISLNK(inode->i_mode) && reqbody->valid & OBD_MD_LINKNAME) {
-                char *symname = lustre_msg_buf(req->rq_repmsg, reply_off + 1);
-                int len = req->rq_repmsg->buflens[reply_off + 1];
+        } else if (S_ISLNK(inode->i_mode) &&
+                   (reqbody->valid & OBD_MD_LINKNAME) != 0) {
+                char *symname = lustre_msg_buf(req->rq_repmsg, reply_off + 1, 0);
+                int len;
+
+                LASSERT (symname != NULL);       /* caller prepped reply */
+                len = req->rq_repmsg->buflens[reply_off + 1];
 
                 rc = inode->i_op->readlink(dentry, symname, len);
                 if (rc < 0) {
                         CERROR("readlink failed: %d\n", rc);
+                } else if (rc != len - 1) {
+                        CERROR ("Unexpected readlink rc %d: expecting %d\n",
+                                rc, len - 1);
+                        rc = -EINVAL;
                 } else {
                         CDEBUG(D_INODE, "read symlink dest %s\n", symname);
                         body->valid |= OBD_MD_LINKNAME;
+                        body->eadatasize = rc + 1;
+                        symname[rc] = 0;        /* NULL terminate */
                         rc = 0;
                 }
         }
@@ -588,9 +683,12 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode,
         int rc = 0, size[2] = {sizeof(*body)}, bufcount = 1;
         ENTRY;
 
-        body = lustre_msg_buf(req->rq_reqmsg, offset);
+        body = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*body));
+        LASSERT (body != NULL);                 /* checked by caller */
+        LASSERT_REQSWABBED (req, offset);       /* swabbed by caller */
 
-        if (S_ISREG(inode->i_mode) && body->valid & OBD_MD_FLEASIZE) {
+        if (S_ISREG(inode->i_mode) &&
+            (body->valid & OBD_MD_FLEASIZE) != 0) {
                 int rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0);
                 CDEBUG(D_INODE, "got %d bytes MD data for inode %lu\n",
                        rc, inode->i_ino);
@@ -606,11 +704,15 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode,
                 } else
                         size[bufcount] = rc;
                 bufcount++;
-        } else if (body->valid & OBD_MD_LINKNAME) {
-                size[bufcount] = MIN(inode->i_size + 1, body->size);
+        } else if (S_ISLNK (inode->i_mode) &&
+                   (body->valid & OBD_MD_LINKNAME) != 0) {
+                if (inode->i_size + 1 != body->eadatasize)
+                        CERROR ("symlink size: %Lu, reply space: %d\n",
+                                inode->i_size + 1, body->eadatasize);
+                size[bufcount] = MIN(inode->i_size + 1, body->eadatasize);
                 bufcount++;
-                CDEBUG(D_INODE, "symlink size: %Lu, reply space: "LPU64"\n",
-                       inode->i_size + 1, body->size);
+                CDEBUG(D_INODE, "symlink size: %Lu, reply space: %d\n",
+                       inode->i_size + 1, body->eadatasize);
         }
 
         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_PACK)) {
@@ -636,8 +738,6 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode,
 static void reconstruct_getattr_name(int offset, struct ptlrpc_request *req,
                                      struct lustre_handle *client_lockh)
 {
-        struct mds_export_data *med = &req->rq_export->exp_mds_data;
-        struct mds_client_data *mcd = med->med_mcd;
         struct obd_device *obd = req->rq_export->exp_obd;
         struct mds_obd *mds = mds_req2mds(req);
         struct dentry *parent, *child;
@@ -648,18 +748,19 @@ static void reconstruct_getattr_name(int offset, struct ptlrpc_request *req,
         int namelen, rc = 0;
         char *name;
 
-        req->rq_transno = mcd->mcd_last_transno;
-        req->rq_status = mcd->mcd_last_result;
-
-        if (med->med_outstanding_reply)
-                mds_steal_ack_locks(med, req);
+        if (req->rq_export->exp_outstanding_reply)
+                mds_steal_ack_locks(req->rq_export, req);
 
-        if (req->rq_status)
-                return;
+        body = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*body));
+        LASSERT (body != NULL);                 /* checked by caller */
+        LASSERT_REQSWABBED (req, offset);       /* swabbed by caller */
 
-        body = lustre_msg_buf(req->rq_reqmsg, offset);
-        name = lustre_msg_buf(req->rq_reqmsg, offset + 1);
+        name = lustre_msg_string(req->rq_reqmsg, offset + 1, 0);
+        LASSERT (name != NULL);                 /* checked by caller */
+        LASSERT_REQSWABBED (req, offset + 1);   /* swabbed by caller */
         namelen = req->rq_reqmsg->buflens[offset + 1];
+
+        LASSERT (offset == 2 || offset == 0);
         /* requests were at offset 2, replies go back at 1 */
         if (offset)
                 offset = 1;
@@ -674,19 +775,17 @@ static void reconstruct_getattr_name(int offset, struct ptlrpc_request *req,
         LASSERT(!IS_ERR(parent));
         dir = parent->d_inode;
         LASSERT(dir);
-        child = lookup_one_len(name, parent, namelen - 1);
+        child = ll_lookup_one_len(name, parent, namelen - 1);
         LASSERT(!IS_ERR(child));
 
-        if (!med->med_outstanding_reply) {
-                /* XXX need to enqueue client lock */
-                LBUG();
+        if (req->rq_repmsg == NULL) {
+                rc = mds_getattr_pack_msg(req, child->d_inode, offset);
+                /* XXX need to handle error here */
+                LASSERT (rc == 0);
         }
 
-        if (req->rq_repmsg == NULL)
-                mds_getattr_pack_msg(req, child->d_inode, offset);
-        
         rc = mds_getattr_internal(obd, child, req, body, offset);
-        LASSERT(!rc);
+        req->rq_status = rc;
         l_dput(child);
         l_dput(parent);
 }
@@ -703,24 +802,41 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req,
         struct obd_ucred uc;
         struct ldlm_res_id child_res_id = { .name = {0} };
         struct lustre_handle parent_lockh;
-        int namelen, flags = 0, rc = 0, cleanup_phase = 0;
+        int namesize;
+        int flags = 0, rc = 0, cleanup_phase = 0, req_was_resent;
         char *name;
         ENTRY;
 
         LASSERT(!strcmp(obd->obd_type->typ_name, "mds"));
 
-        MDS_CHECK_RESENT(req, 
-                         reconstruct_getattr_name(offset, req, child_lockh));
+        /* Swab now, before anyone looks inside the request */
 
-        if (req->rq_reqmsg->bufcount <= offset + 1) {
-                LBUG();
-                GOTO(cleanup, rc = -EINVAL);
+        body = lustre_swab_reqbuf (req, offset, sizeof (*body),
+                                   lustre_swab_mds_body);
+        if (body == NULL) {
+                CERROR ("Can't swab mds_body\n");
+                GOTO (cleanup, rc = -EFAULT);
         }
 
-        body = lustre_msg_buf(req->rq_reqmsg, offset);
-        name = lustre_msg_buf(req->rq_reqmsg, offset + 1);
-        namelen = req->rq_reqmsg->buflens[offset + 1];
-        /* requests were at offset 2, replies go back at 1 */
+        LASSERT_REQSWAB (req, offset + 1);
+        name = lustre_msg_string (req->rq_reqmsg, offset + 1, 0);
+        if (name == NULL) {
+                CERROR ("Can't unpack name\n");
+                GOTO (cleanup, rc = -EFAULT);
+        }
+        namesize = req->rq_reqmsg->buflens[offset + 1];
+
+        req_was_resent = lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT;
+        if (child_lockh->cookie) {
+                LASSERT(req_was_resent);
+                reconstruct_getattr_name(offset, req, child_lockh);
+                RETURN(0);
+        } else if (req_was_resent) {
+                DEBUG_REQ(D_HA, req, "no reply for RESENT req");
+        }
+
+        LASSERT (offset == 0 || offset == 2);
+        /* if requests were at offset 2, replies go back at 1 */
         if (offset)
                 offset = 1;
 
@@ -740,10 +856,10 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req,
 
         cleanup_phase = 1; /* parent dentry and lock */
 
-        CDEBUG(D_INODE, "parent ino %lu, name %*s\n", dir->i_ino,namelen,name);
+        CDEBUG(D_INODE, "parent ino %lu, name %s\n", dir->i_ino, name);
 
         /* Step 2: Lookup child */
-        dchild = lookup_one_len(name, de, namelen - 1);
+        dchild = ll_lookup_one_len(name, de, namesize - 1);
         if (IS_ERR(dchild)) {
                 CDEBUG(D_INODE, "child lookup error %ld\n", PTR_ERR(dchild));
                 GOTO(cleanup, rc = PTR_ERR(dchild));
@@ -761,7 +877,7 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req,
         rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL,
                               child_res_id, LDLM_PLAIN, NULL, 0, LCK_PR,
                               &flags, ldlm_completion_ast, mds_blocking_ast,
-                              NULL, NULL, child_lockh);
+                              NULL, child_lockh);
         if (rc != ELDLM_OK) {
                 CERROR("ldlm_cli_enqueue: %d\n", rc);
                 GOTO(cleanup, rc = -EIO);
@@ -769,15 +885,18 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req,
 
         cleanup_phase = 3; /* child lock */
 
-        if (req->rq_repmsg == NULL)
-                mds_getattr_pack_msg(req, dchild->d_inode, offset);
+        if (req->rq_repmsg == NULL) {
+                rc = mds_getattr_pack_msg(req, dchild->d_inode, offset);
+                if (rc != 0) {
+                        CERROR ("mds_getattr_pack_msg: %d\n", rc);
+                        GOTO (cleanup, rc);
+                }
+        }
 
         rc = mds_getattr_internal(obd, dchild, req, body, offset);
         GOTO(cleanup, rc); /* returns the lock to the client */
-        
+
  cleanup:
-        rc = mds_finish_transno(mds, dchild ? dchild->d_inode : NULL, NULL,
-                                req, rc, 0);
         switch (cleanup_phase) {
         case 3:
                 if (rc)
@@ -812,7 +931,13 @@ static int mds_getattr(int offset, struct ptlrpc_request *req)
         int rc = 0;
         ENTRY;
 
-        body = lustre_msg_buf(req->rq_reqmsg, offset);
+        body = lustre_swab_reqbuf (req, offset, sizeof (*body),
+                                   lustre_swab_mds_body);
+        if (body == NULL) {
+                CERROR ("Can't unpack body\n");
+                RETURN (-EFAULT);
+        }
+
         uc.ouc_fsuid = body->fsuid;
         uc.ouc_fsgid = body->fsgid;
         uc.ouc_cap = body->capability;
@@ -824,6 +949,10 @@ static int mds_getattr(int offset, struct ptlrpc_request *req)
         }
 
         rc = mds_getattr_pack_msg(req, de->d_inode, offset);
+        if (rc != 0) {
+                CERROR ("mds_getattr_pack_msg: %d\n", rc);
+                GOTO (out_pop, rc);
+        }
 
         req->rq_status = mds_getattr_internal(obd, de, req, body, 0);
 
@@ -847,13 +976,12 @@ static int mds_statfs(struct ptlrpc_request *req)
                 GOTO(out, rc);
         }
 
-        osfs = lustre_msg_buf(req->rq_repmsg, 0);
+        osfs = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*osfs));
         rc = fsfilt_statfs(obd, obd->u.mds.mds_sb, osfs);
         if (rc) {
                 CERROR("mds: statfs failed: rc %d\n", rc);
                 GOTO(out, rc);
         }
-        obd_statfs_pack(osfs, osfs);
 
         EXIT;
 out:
@@ -861,69 +989,6 @@ out:
         return 0;
 }
 
-static struct mds_file_data *mds_handle2mfd(struct lustre_handle *handle)
-{
-        struct mds_file_data *mfd = NULL;
-        ENTRY;
-
-        if (!handle || !handle->addr)
-                RETURN(NULL);
-
-        mfd = (struct mds_file_data *)(unsigned long)(handle->addr);
-        if (!kmem_cache_validate(mds_file_cache, mfd))
-                RETURN(NULL);
-
-        if (mfd->mfd_servercookie != handle->cookie)
-                RETURN(NULL);
-
-        RETURN(mfd);
-}
-
-#if 0
-
-static int mds_store_md(struct mds_obd *mds, struct ptlrpc_request *req,
-                        int offset, struct mds_body *body, struct inode *inode)
-{
-        struct obd_device *obd = req->rq_export->exp_obd;
-        struct lov_mds_md *lmm = lustre_msg_buf(req->rq_reqmsg, offset);
-        int lmm_size = req->rq_reqmsg->buflens[offset];
-        struct obd_run_ctxt saved;
-        struct obd_ucred uc;
-        void *handle;
-        int rc, rc2;
-        ENTRY;
-
-        /* I don't really like this, but it is a sanity check on the client
-         * MD request.
-         */
-        if (lmm_size > mds->mds_max_mdsize) {
-                CERROR("Saving MD for inode %lu of %d bytes > max %d\n",
-                       inode->i_ino, lmm_size, mds->mds_max_mdsize);
-                //RETURN(-EINVAL);
-        }
-
-        CDEBUG(D_INODE, "storing %d bytes MD for inode %lu\n",
-               lmm_size, inode->i_ino);
-        uc.ouc_fsuid = body->fsuid;
-        uc.ouc_fsgid = body->fsgid;
-        uc.ouc_cap = body->capability;
-        push_ctxt(&saved, &mds->mds_ctxt, &uc);
-        handle = fsfilt_start(obd, inode, FSFILT_OP_SETATTR);
-        if (IS_ERR(handle)) {
-                rc = PTR_ERR(handle);
-                GOTO(out_ea, rc);
-        }
-
-        rc = fsfilt_set_md(obd, inode,handle,lmm,lmm_size);
-        rc = mds_finish_transno(mds, inode, handle, req, rc, 0);
-out_ea:
-        pop_ctxt(&saved, &mds->mds_ctxt, &uc);
-
-        RETURN(rc);
-}
-
-#endif
-
 static void reconstruct_close(struct ptlrpc_request *req)
 {
         struct mds_export_data *med = &req->rq_export->exp_mds_data;
@@ -948,13 +1013,17 @@ static int mds_close(struct ptlrpc_request *req)
 
         MDS_CHECK_RESENT(req, reconstruct_close(req));
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        body = lustre_swab_reqbuf(req, 0, sizeof (*body),
+                                  lustre_swab_mds_body);
+        if (body == NULL) {
+                CERROR ("Can't unpack body\n");
+                RETURN (-EFAULT);
+        }
 
         mfd = mds_handle2mfd(&body->handle);
         if (mfd == NULL) {
                 DEBUG_REQ(D_ERROR, req, "no handle for file close "LPD64
-                          ": addr "LPX64", cookie "LPX64"\n",
-                          body->fid1.id, body->handle.addr,
+                          ": cookie "LPX64"\n", body->fid1.id,
                           body->handle.cookie);
                 RETURN(-ESTALE);
         }
@@ -966,6 +1035,7 @@ static int mds_close(struct ptlrpc_request *req)
         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_CLOSE_PACK)) {
                 CERROR("test case OBD_FAIL_MDS_CLOSE_PACK\n");
                 req->rq_status = -ENOMEM;
+                mds_mfd_put(mfd);
                 RETURN(-ENOMEM);
         }
 
@@ -975,6 +1045,7 @@ static int mds_close(struct ptlrpc_request *req)
                 req->rq_status = rc;
         }
 
+        mds_mfd_put(mfd);
         RETURN(0);
 }
 
@@ -986,7 +1057,7 @@ static int mds_readpage(struct ptlrpc_request *req)
         struct file *file;
         struct mds_body *body, *repbody;
         struct obd_run_ctxt saved;
-        int rc, size = sizeof(*body);
+        int rc, size = sizeof(*repbody);
         struct obd_ucred uc;
         ENTRY;
 
@@ -996,7 +1067,23 @@ static int mds_readpage(struct ptlrpc_request *req)
                 GOTO(out, rc = -ENOMEM);
         }
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        body = lustre_swab_reqbuf (req, 0, sizeof (*body),
+                                   lustre_swab_mds_body);
+        if (body == NULL)
+                GOTO (out, rc = -EFAULT);
+
+        /* body->size is actually the offset -eeb */
+        if ((body->size & (PAGE_SIZE - 1)) != 0) {
+                CERROR ("offset "LPU64" not on a page boundary\n", body->size);
+                GOTO (out, rc = -EFAULT);
+        }
+
+        /* body->nlink is actually the #bytes to read -eeb */
+        if (body->nlink != PAGE_SIZE) {
+                CERROR ("size %d is not PAGE_SIZE\n", body->nlink);
+                GOTO (out, rc = -EFAULT);
+        }
+
         uc.ouc_fsuid = body->fsuid;
         uc.ouc_fsgid = body->fsgid;
         uc.ouc_cap = body->capability;
@@ -1012,7 +1099,7 @@ static int mds_readpage(struct ptlrpc_request *req)
         if (IS_ERR(file))
                 GOTO(out_pop, rc = PTR_ERR(file));
 
-        repbody = lustre_msg_buf(req->rq_repmsg, 0);
+        repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*repbody));
         repbody->size = file->f_dentry->d_inode->i_size;
         repbody->valid = OBD_MD_FLSIZE;
 
@@ -1020,6 +1107,7 @@ static int mds_readpage(struct ptlrpc_request *req)
            doesn't send a reply when this function completes. Instead a
            callback function would send the reply */
         /* body->blocks is actually the xid -phil */
+        /* body->size is actually the offset -eeb */
         rc = mds_sendpage(req, file, body->size, body->blocks);
 
         filp_close(file, 0);
@@ -1057,12 +1145,15 @@ static int filter_recovery_request(struct ptlrpc_request *req,
 {
         switch (req->rq_reqmsg->opc) {
         case MDS_CONNECT: /* This will never get here, but for completeness. */
+        case OST_CONNECT: /* This will never get here, but for completeness. */
         case MDS_DISCONNECT:
+        case OST_DISCONNECT:
                *process = 1;
                RETURN(0);
 
         case MDS_CLOSE:
         case MDS_GETSTATUS: /* used in unmounting */
+        case OBD_PING:
         case MDS_REINT:
         case LDLM_ENQUEUE:
                 *process = target_queue_recovery_request(req, obd);
@@ -1072,7 +1163,8 @@ static int filter_recovery_request(struct ptlrpc_request *req,
                 DEBUG_REQ(D_ERROR, req, "not permitted during recovery");
                 *process = 0;
                 /* XXX what should we set rq_status to here? */
-                RETURN(ptlrpc_error(req->rq_svc, req));
+                req->rq_status = -EAGAIN;
+                RETURN(ptlrpc_error(req));
         }
 }
 
@@ -1085,106 +1177,42 @@ static char *reint_names[] = {
         [REINT_OPEN]    "open",
 };
 
-void mds_steal_ack_locks(struct mds_export_data *med,
+void mds_steal_ack_locks(struct obd_export *exp,
                          struct ptlrpc_request *req)
 {
-        struct ptlrpc_request *oldrep = med->med_outstanding_reply;
+        unsigned long  flags;
+
+        struct ptlrpc_request *oldrep = exp->exp_outstanding_reply;
         memcpy(req->rq_ack_locks, oldrep->rq_ack_locks,
                sizeof req->rq_ack_locks);
-        oldrep->rq_flags |= PTL_RPC_FL_RESENT;
+        spin_lock_irqsave (&req->rq_lock, flags);
+        oldrep->rq_resent = 1;
         wake_up(&oldrep->rq_wait_for_rep);
+        spin_unlock_irqrestore (&req->rq_lock, flags);
         DEBUG_REQ(D_HA, oldrep, "stole locks from");
         DEBUG_REQ(D_HA, req, "stole locks for");
 }
 
-static void mds_send_reply(struct ptlrpc_request *req, int rc)
-{
-        int i;
-        struct ptlrpc_req_ack_lock *ack_lock;
-        struct l_wait_info lwi;
-        struct mds_export_data *med =
-                (req->rq_export && req->rq_ack_locks[0].mode) ?
-                &req->rq_export->exp_mds_data : NULL;
-
-        if (med) {
-                med->med_outstanding_reply = req;
-                req->rq_flags |= PTL_RPC_FL_WANT_ACK;
-                init_waitqueue_head(&req->rq_wait_for_rep);
-        }
-
-        if (!OBD_FAIL_CHECK(OBD_FAIL_MDS_ALL_REPLY_NET | OBD_FAIL_ONCE)) {
-                if (rc) {
-                        DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc);
-                        ptlrpc_error(req->rq_svc, req);
-                } else {
-                        DEBUG_REQ(D_NET, req, "sending reply");
-                        ptlrpc_reply(req->rq_svc, req);
-                }
-        } else {
-                obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED;
-                DEBUG_REQ(D_ERROR, req, "dropping reply");
-                if (!med && req->rq_repmsg)
-                        OBD_FREE(req->rq_repmsg, req->rq_replen);
-        }
-
-        if (!med) {
-                DEBUG_REQ(D_HA, req, "not waiting for ack");
-                return;
-        }
-
-        lwi = LWI_TIMEOUT(obd_timeout / 2 * HZ, NULL, NULL);
-        rc = l_wait_event(req->rq_wait_for_rep, 
-                          (req->rq_flags & PTL_RPC_FL_WANT_ACK) == 0 ||
-                          (req->rq_flags & PTL_RPC_FL_RESENT),
-                          &lwi);
-
-        if (req->rq_flags & PTL_RPC_FL_RESENT) {
-                /* The client resent this request, so abort the
-                 * waiting-ack portals stuff, and don't decref the
-                 * locks.
-                 */
-                DEBUG_REQ(D_HA, req, "resent: not cancelling locks");
-                ptlrpc_abort(req);
-                return;
-        }
-
-        if (rc == -ETIMEDOUT) {
-                ptlrpc_abort(req);
-                recovd_conn_fail(req->rq_export->exp_connection);
-                DEBUG_REQ(D_HA, req, "cancelling locks for timeout");
-        } else {
-                DEBUG_REQ(D_HA, req, "cancelling locks for ack");
-        }
-        
-        med->med_outstanding_reply = NULL;
-        
-        for (ack_lock = req->rq_ack_locks, i = 0; i < 4; i++, ack_lock++) {
-                if (!ack_lock->mode)
-                        break;
-                ldlm_lock_decref(&ack_lock->lock, ack_lock->mode);
-        }
-}
-
 int mds_handle(struct ptlrpc_request *req)
 {
-        int should_process, rc;
+        int should_process;
+        int rc = 0;
         struct mds_obd *mds = NULL; /* quell gcc overwarning */
         struct obd_device *obd = NULL;
         ENTRY;
 
-        rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen);
-        if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_HANDLE_UNPACK)) {
-                DEBUG_REQ(D_ERROR, req, "invalid request (%d)", rc);
-                GOTO(out, rc);
-        }
-
         OBD_FAIL_RETURN(OBD_FAIL_MDS_ALL_REQUEST_NET | OBD_FAIL_ONCE, 0);
 
         LASSERT(!strcmp(req->rq_obd->obd_type->typ_name, LUSTRE_MDT_NAME));
 
+        /* XXX identical to OST */
         if (req->rq_reqmsg->opc != MDS_CONNECT) {
                 struct mds_export_data *med;
+                int recovering, abort_recovery;
+
                 if (req->rq_export == NULL) {
+                        CERROR("lustre_mds: operation %d on unconnected MDS\n",
+                               req->rq_reqmsg->opc);
                         req->rq_status = -ENOTCONN;
                         GOTO(out, rc = -ENOTCONN);
                 }
@@ -1192,12 +1220,15 @@ int mds_handle(struct ptlrpc_request *req)
                 med = &req->rq_export->exp_mds_data;
                 obd = req->rq_export->exp_obd;
                 mds = &obd->u.mds;
+
+                /* Check for aborted recovery. */
                 spin_lock_bh(&obd->obd_processing_task_lock);
-                if (obd->obd_flags & OBD_ABORT_RECOVERY)
-                        target_abort_recovery(obd);
+                abort_recovery = obd->obd_abort_recovery;
+                recovering = obd->obd_recovering;
                 spin_unlock_bh(&obd->obd_processing_task_lock);
-
-                if (obd->obd_flags & OBD_RECOVERING) {
+                if (abort_recovery) {
+                        target_abort_recovery(obd);
+                } else if (recovering) {
                         rc = filter_recovery_request(req, obd, &should_process);
                         if (rc || !should_process)
                                 RETURN(rc);
@@ -1224,7 +1255,7 @@ int mds_handle(struct ptlrpc_request *req)
                 /* Make sure that last_rcvd is correct. */
                 if (!rc)
                         mds_fsync_super(mds->mds_sb);
-                req->rq_status = rc;
+                req->rq_status = rc;            /* superfluous? */
                 break;
 
         case MDS_GETSTATUS:
@@ -1253,9 +1284,9 @@ int mds_handle(struct ptlrpc_request *req)
                  * acquiring any new locks in mds_getattr_name, so we don't
                  * want to cancel.
                  */
-                lockh.addr = 0;
+                lockh.cookie = 0;
                 rc = mds_getattr_name(0, req, &lockh);
-                if (rc == 0 && lockh.addr)
+                if (rc == 0 && lockh.cookie)
                         ldlm_lock_decref(&lockh, LCK_PR);
                 break;
         }
@@ -1275,13 +1306,24 @@ int mds_handle(struct ptlrpc_request *req)
                 break;
 
         case MDS_REINT: {
-                int opc = *(u32 *)lustre_msg_buf(req->rq_reqmsg, 0);
+                __u32 *opcp = lustre_msg_buf (req->rq_reqmsg, 0, sizeof (*opcp));
+                __u32  opc;
                 int size[2] = {sizeof(struct mds_body), mds->mds_max_mdsize};
                 int bufcount;
 
-                DEBUG_REQ(D_INODE, req, "reint (%s%s)",
-                          reint_names[opc & REINT_OPCODE_MASK],
-                          opc & REINT_REPLAYING ? "|REPLAYING" : "");
+                /* NB only peek inside req now; mds_reint() will swab it */
+                if (opcp == NULL) {
+                        CERROR ("Can't inspect opcode\n");
+                        rc = -EINVAL;
+                        break;
+                }
+                opc = *opcp;
+                if (lustre_msg_swabbed (req->rq_reqmsg))
+                        __swab32s (&opc);
+
+                DEBUG_REQ(D_INODE, req, "reint %d (%s)", opc,
+                          (opc < sizeof (reint_names) / sizeof (reint_names[0]) &&
+                           reint_names[opc] != NULL) ? reint_names[opc] : "unknown opcode");
 
                 OBD_FAIL_RETURN(OBD_FAIL_MDS_REINT_NET, 0);
 
@@ -1306,6 +1348,11 @@ int mds_handle(struct ptlrpc_request *req)
                 rc = mds_close(req);
                 break;
 
+        case OBD_PING:
+                DEBUG_REQ(D_INODE, req, "ping");
+                rc = target_handle_ping(req);
+                break;
+
         case LDLM_ENQUEUE:
                 DEBUG_REQ(D_INODE, req, "enqueue");
                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_ENQUEUE, 0);
@@ -1325,7 +1372,8 @@ int mds_handle(struct ptlrpc_request *req)
                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_BL_CALLBACK, 0);
                 break;
         default:
-                rc = ptlrpc_error(req->rq_svc, req);
+                req->rq_status = -ENOTSUPP;
+                rc = ptlrpc_error(req);
                 RETURN(rc);
         }
 
@@ -1337,10 +1385,11 @@ int mds_handle(struct ptlrpc_request *req)
                 struct obd_device *obd = list_entry(mds, struct obd_device,
                                                     u.mds);
                 req->rq_repmsg->last_xid =
-                        HTON__u64(le64_to_cpu(med->med_mcd->mcd_last_xid));
-                if ((obd->obd_flags & OBD_NO_TRANSNO) == 0) {
+                        le64_to_cpu (med->med_mcd->mcd_last_xid);
+
+                if (!obd->obd_no_transno) {
                         req->rq_repmsg->last_committed =
-                                HTON__u64(obd->obd_last_committed);
+                                obd->obd_last_committed;
                 } else {
                         DEBUG_REQ(D_IOCTL, req,
                                   "not sending last_committed update");
@@ -1348,12 +1397,12 @@ int mds_handle(struct ptlrpc_request *req)
                 CDEBUG(D_INFO, "last_transno "LPU64", last_committed "LPU64
                        ", xid "LPU64"\n",
                        mds->mds_last_transno, obd->obd_last_committed,
-                       NTOH__u64(req->rq_xid));
+                       req->rq_xid);
         }
  out:
 
         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) {
-                if (obd && (obd->obd_flags & OBD_RECOVERING)) {
+                if (obd && obd->obd_recovering) {
                         DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply");
                         return target_queue_final_reply(req, rc);
                 }
@@ -1361,7 +1410,7 @@ int mds_handle(struct ptlrpc_request *req)
                 rc = req->rq_status = -ENOTCONN;
         }
 
-        mds_send_reply(req, rc);
+        target_send_reply(req, rc, OBD_FAIL_MDS_ALL_REPLY_NET);
         return 0;
 }
 
@@ -1414,8 +1463,10 @@ static int mds_setup(struct obd_device *obddev, obd_count len, void *buf)
         struct mds_obd *mds = &obddev->u.mds;
         struct vfsmount *mnt;
         int rc = 0;
+        unsigned long page;
         ENTRY;
 
+
 #ifdef CONFIG_DEV_RDONLY
         dev_clear_rdonly(2);
 #endif
@@ -1426,7 +1477,15 @@ static int mds_setup(struct obd_device *obddev, obd_count len, void *buf)
         if (IS_ERR(obddev->obd_fsops))
                 RETURN(rc = PTR_ERR(obddev->obd_fsops));
 
-        mnt = do_kern_mount(data->ioc_inlbuf2, 0, data->ioc_inlbuf1, NULL);
+        if (!(page = __get_free_page(GFP_KERNEL)))
+                RETURN(-ENOMEM);
+
+        memset((void *)page, 0, PAGE_SIZE);
+        sprintf((char *)page, "iopen_nopriv");
+
+        mnt = do_kern_mount(data->ioc_inlbuf2, 0,
+                            data->ioc_inlbuf1, (void *)page);
+        free_page(page);
         if (IS_ERR(mnt)) {
                 rc = PTR_ERR(mnt);
                 CERROR("do_kern_mount failed: rc = %d\n", rc);
@@ -1449,7 +1508,7 @@ static int mds_setup(struct obd_device *obddev, obd_count len, void *buf)
         obddev->obd_namespace =
                 ldlm_namespace_new("mds_server", LDLM_NAMESPACE_SERVER);
         if (obddev->obd_namespace == NULL) {
-                mds_cleanup(obddev);
+                mds_cleanup(obddev, 0, 0);
                 GOTO(err_fs, rc = -ENOMEM);
         }
 
@@ -1461,7 +1520,7 @@ static int mds_setup(struct obd_device *obddev, obd_count len, void *buf)
         RETURN(0);
 
 err_fs:
-        mds_fs_cleanup(obddev);
+        mds_fs_cleanup(obddev, 0);
 err_put:
         unlock_kernel();
         mntput(mds->mds_vfsmnt);
@@ -1472,7 +1531,7 @@ err_ops:
         return rc;
 }
 
-static int mds_cleanup(struct obd_device *obddev)
+static int mds_cleanup(struct obd_device *obddev, int force, int failover)
 {
         struct super_block *sb;
         struct mds_obd *mds = &obddev->u.mds;
@@ -1483,14 +1542,25 @@ static int mds_cleanup(struct obd_device *obddev)
                 RETURN(0);
 
         mds_update_server_data(mds);
-        mds_fs_cleanup(obddev);
+        mds_fs_cleanup(obddev, failover);
 
         unlock_kernel();
+
+        /* A mnt_count of 2 seems normal on the MDS (may_umount() also
+         * expects 2, fwiw), but we only see 1 at this point in obdfilter. */
+        if (atomic_read(&obddev->u.mds.mds_vfsmnt->mnt_count) > 2){
+                CERROR("%s: mount point busy, mnt_count: %d\n",
+                       obddev->obd_name,
+                       atomic_read(&obddev->u.mds.mds_vfsmnt->mnt_count));
+        }
+
         mntput(mds->mds_vfsmnt);
         mds->mds_sb = 0;
 
         ldlm_namespace_free(obddev->obd_namespace);
 
+        if (obddev->obd_recovering)
+                target_cancel_recovery_timer(obddev);
         lock_kernel();
 #ifdef CONFIG_DEV_RDONLY
         dev_clear_rdonly(2);
@@ -1503,18 +1573,32 @@ static int mds_cleanup(struct obd_device *obddev)
 inline void fixup_handle_for_resent_req(struct ptlrpc_request *req,
                                         struct lustre_handle *lockh)
 {
-        struct mds_export_data *med = &req->rq_export->exp_mds_data;
-        struct mds_client_data *mcd = med->med_mcd;
-        struct ptlrpc_request *oldrep = med->med_outstanding_reply;
-        struct ldlm_reply *dlm_rep;
+        struct obd_export *exp = req->rq_export;
+        struct obd_device *obd = exp->exp_obd;
+        struct ldlm_request *dlmreq =
+                lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*dlmreq));
+        struct lustre_handle remote_hdl = dlmreq->lock_handle1;
+        struct list_head *iter;
+
+        if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT))
+                return;
+
+        l_lock(&obd->obd_namespace->ns_lock);
+        list_for_each(iter, &exp->exp_ldlm_data.led_held_locks) {
+                struct ldlm_lock *lock;
+                lock = list_entry(iter, struct ldlm_lock, l_export_chain);
+                if (lock->l_remote_handle.cookie == remote_hdl.cookie) {
+                        lockh->cookie = lock->l_handle.h_cookie;
+                        DEBUG_REQ(D_HA, req, "restoring lock cookie "LPX64,
+                                  lockh->cookie);
+                        l_unlock(&obd->obd_namespace->ns_lock);
+                        return;
+                }
 
-        if ((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) &&
-            (mcd->mcd_last_xid == req->rq_xid) && (oldrep != NULL)) {
-                DEBUG_REQ(D_HA, req, "restoring lock handle from %p", oldrep);
-                dlm_rep = lustre_msg_buf(oldrep->rq_repmsg, 0);
-                lockh->addr = dlm_rep->lock_handle.addr;
-                lockh->cookie = dlm_rep->lock_handle.cookie;
         }
+        l_unlock(&obd->obd_namespace->ns_lock);
+        DEBUG_REQ(D_HA, req, "no existing lock with rhandle "LPX64,
+                  remote_hdl.cookie);
 }
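
fixup_handle_for_resent_req() above walks the export's list of held locks for one whose remote handle matches the handle the client resent, and copies that lock's local cookie into lockh so the intent code reuses the existing lock. A standalone sketch of the same lookup, with a plain array standing in for the l_export_chain list and made-up cookie values:

#include <stdint.h>
#include <stdio.h>

struct toy_lock {
        uint64_t remote_cookie;   /* the handle the client knows the lock by */
        uint64_t local_cookie;    /* the handle this server knows it by      */
};

/* Return the local cookie of the lock the client is re-requesting, or 0 if
 * no held lock matches (the resent enqueue then just proceeds normally). */
static uint64_t restore_handle(const struct toy_lock *held, int nheld,
                               uint64_t remote_cookie)
{
        int i;

        for (i = 0; i < nheld; i++)
                if (held[i].remote_cookie == remote_cookie)
                        return held[i].local_cookie;
        return 0;
}

int main(void)
{
        struct toy_lock held[] = {
                { .remote_cookie = 0x1111, .local_cookie = 0xaaaa },
                { .remote_cookie = 0x2222, .local_cookie = 0xbbbb },
        };

        printf("0x%llx\n", (unsigned long long)restore_handle(held, 2, 0x2222));
        printf("0x%llx\n", (unsigned long long)restore_handle(held, 2, 0x3333));
        return 0;
}
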
 
 static int ldlm_intent_policy(struct ldlm_namespace *ns,
@@ -1531,17 +1615,23 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns,
 
         if (req->rq_reqmsg->bufcount > 1) {
                 /* an intent needs to be considered */
-                struct ldlm_intent *it = lustre_msg_buf(req->rq_reqmsg, 1);
+                struct ldlm_intent *it;
                 struct mds_obd *mds = &req->rq_export->exp_obd->u.mds;
                 struct mds_body *mds_body;
                 struct ldlm_reply *rep;
-                struct lustre_handle lockh;
+                struct lustre_handle lockh = { 0 };
                 struct ldlm_lock *new_lock;
                 int rc, offset = 2, repsize[3] = {sizeof(struct ldlm_reply),
                                                   sizeof(struct mds_body),
                                                   mds->mds_max_mdsize};
 
-                it->opc = NTOH__u64(it->opc);
+                it = lustre_swab_reqbuf (req, 1, sizeof (*it),
+                                         lustre_swab_ldlm_intent);
+                if (it == NULL) {
+                        CERROR ("Intent missing\n");
+                        rc = req->rq_status = -EFAULT;
+                        RETURN (rc);
+                }
 
                 LDLM_DEBUG(lock, "intent policy, opc: %s",
                            ldlm_it2str(it->opc));
@@ -1553,7 +1643,7 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns,
                         RETURN(rc);
                 }
 
-                rep = lustre_msg_buf(req->rq_repmsg, 0);
+                rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*rep));
                 rep->lock_policy_res1 = IT_INTENT_EXEC;
 
                 fixup_handle_for_resent_req(req, &lockh);
@@ -1584,7 +1674,7 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns,
                                 rep->lock_policy_res2 = req->rq_status;
                                 RETURN(ELDLM_LOCK_ABORTED);
                         }
-                        mds_body = lustre_msg_buf(req->rq_repmsg, 1);
+                        mds_body = lustre_msg_buf(req->rq_repmsg, 1, sizeof (*mds_body));
                         if (!(mds_body->valid & OBD_MD_FLEASIZE)) {
                                 rep->lock_policy_res2 = rc;
                                 RETURN(ELDLM_LOCK_ABORTED);
@@ -1611,17 +1701,37 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns,
                         LBUG();
                 }
 
-                if (flags & LDLM_FL_INTENT_ONLY) {
-                        LDLM_DEBUG(lock, "INTENT_ONLY, aborting lock");
-                        RETURN(ELDLM_LOCK_ABORTED);
-                }
-
                 /* By this point, whatever function we called above must have
                  * filled in 'lockh' or returned an error.  We want to give the
                  * new lock to the client instead of whatever lock it was about
                  * to get. */
                 new_lock = ldlm_handle2lock(&lockh);
                 LASSERT(new_lock != NULL);
+
+                /* If we've already given this lock to a client once, then we
+                 * should have no readers or writers.  Otherwise, we should
+                 * have one reader _or_ writer ref (which will be zeroed below
+                 * before returning the lock to a client).
+                 */
+                if (new_lock->l_export == req->rq_export)
+                        LASSERT(new_lock->l_readers + new_lock->l_writers == 0);
+                else
+                        LASSERT(new_lock->l_readers + new_lock->l_writers == 1);
+
+                /* If we're running an intent only, we want to abort the new
+                 * lock, and let the client abort the original lock. */
+                if (flags & LDLM_FL_INTENT_ONLY) {
+                        LDLM_DEBUG(lock, "INTENT_ONLY, aborting locks");
+                        l_lock(&new_lock->l_resource->lr_namespace->ns_lock);
+                        if (new_lock->l_readers)
+                                ldlm_lock_decref(&lockh, LCK_PR);
+                        else
+                                ldlm_lock_decref(&lockh, LCK_PW);
+                        l_unlock(&new_lock->l_resource->lr_namespace->ns_lock);
+                        LDLM_LOCK_PUT(new_lock);
+                        RETURN(ELDLM_LOCK_ABORTED);
+                }
+
                 *lockp = new_lock;
 
                 rep->lock_policy_res2 = req->rq_status;
@@ -1629,14 +1739,13 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns,
                 if (new_lock->l_export == req->rq_export) {
                         /* Already gave this to the client, which means that we
                          * reconstructed a reply. */
-                        LASSERT(lustre_msg_get_flags(req->rq_reqmsg) & 
+                        LASSERT(lustre_msg_get_flags(req->rq_reqmsg) &
                                 MSG_RESENT);
                         RETURN(ELDLM_LOCK_REPLACED);
                 }
 
                 /* Fixup the lock to be given to the client */
                 l_lock(&new_lock->l_resource->lr_namespace->ns_lock);
-                LASSERT(new_lock->l_readers + new_lock->l_writers == 1);
                 new_lock->l_readers = 0;
                 new_lock->l_writers = 0;
 
@@ -1706,7 +1815,8 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf)
         mds->mds_service = ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS,
                                            MDS_BUFSIZE, MDS_MAXREQSIZE,
                                            MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL,
-                                           mds_handle, "mds");
+                                           mds_handle, "mds", obddev);
+
         if (!mds->mds_service) {
                 CERROR("failed to start service\n");
                 RETURN(rc = -ENOMEM);
@@ -1726,7 +1836,7 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf)
                 ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS,
                                 MDS_BUFSIZE, MDS_MAXREQSIZE,
                                 MDS_SETATTR_PORTAL, MDC_REPLY_PORTAL,
-                                mds_handle, "mds");
+                                mds_handle, "mds_setattr", obddev);
         if (!mds->mds_setattr_service) {
                 CERROR("failed to start getattr service\n");
                 GOTO(err_thread, rc = -ENOMEM);
@@ -1748,7 +1858,7 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf)
                 ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS,
                                 MDS_BUFSIZE, MDS_MAXREQSIZE,
                                 MDS_READPAGE_PORTAL, MDC_REPLY_PORTAL,
-                                mds_handle, "mds");
+                                mds_handle, "mds_readpage", obddev);
         if (!mds->mds_readpage_service) {
                 CERROR("failed to start readpage service\n");
                 GOTO(err_thread2, rc = -ENOMEM);
@@ -1781,7 +1891,7 @@ err_thread:
 }
 
 
-static int mdt_cleanup(struct obd_device *obddev)
+static int mdt_cleanup(struct obd_device *obddev, int force, int failover)
 {
         struct mds_obd *mds = &obddev->u.mds;
         ENTRY;
@@ -1803,14 +1913,15 @@ extern int mds_iocontrol(unsigned int cmd, struct lustre_handle *conn,
 
 /* use obd ops to offer management infrastructure */
 static struct obd_ops mds_obd_ops = {
-        o_owner:       THIS_MODULE,
-        o_attach:      mds_attach,
-        o_detach:      mds_detach,
-        o_connect:     mds_connect,
-        o_disconnect:  mds_disconnect,
-        o_setup:       mds_setup,
-        o_cleanup:     mds_cleanup,
-        o_iocontrol:   mds_iocontrol
+        o_owner:          THIS_MODULE,
+        o_attach:         mds_attach,
+        o_detach:         mds_detach,
+        o_connect:        mds_connect,
+        o_disconnect:     mds_disconnect,
+        o_setup:          mds_setup,
+        o_cleanup:        mds_cleanup,
+        o_iocontrol:      mds_iocontrol,
+        o_destroy_export: mds_destroy_export
 };
 
 static struct obd_ops mdt_obd_ops = {
@@ -1825,11 +1936,6 @@ static struct obd_ops mdt_obd_ops = {
 static int __init mds_init(void)
 {
         struct lprocfs_static_vars lvars;
-        mds_file_cache = kmem_cache_create("ll_mds_file_data",
-                                           sizeof(struct mds_file_data),
-                                           0, 0, NULL, NULL);
-        if (mds_file_cache == NULL)
-                return -ENOMEM;
 
         lprocfs_init_multi_vars(0, &lvars);
         class_register_type(&mds_obd_ops, lvars.module_vars, LUSTRE_MDS_NAME);
@@ -1845,8 +1951,6 @@ static void __exit mds_exit(void)
         ldlm_unregister_intent();
         class_unregister_type(LUSTRE_MDS_NAME);
         class_unregister_type(LUSTRE_MDT_NAME);
-        if (kmem_cache_destroy(mds_file_cache))
-                CERROR("couldn't free MDS file cache\n");
 }
 
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
index e4522fb..5d6fa57 100644 (file)
@@ -37,8 +37,7 @@ struct lprocfs_vars lprocfs_mdt_module_vars[] = { {0} };
 
 #else
 
-static inline
-int lprocfs_mds_statfs(void *data, struct statfs *sfs)
+static inline int lprocfs_mds_statfs(void *data, struct statfs *sfs)
 {
         struct obd_device* dev = (struct obd_device*) data;
         struct mds_obd *mds;
@@ -66,16 +65,28 @@ int rd_fstype(char *page, char **start, off_t off, int count, int *eof,
         return snprintf(page, count, "%s\n", obd->obd_fsops->fs_type);
 }
 
+int lprocfs_mds_rd_mntdev(char *page, char **start, off_t off, int count,
+                          int *eof, void *data)
+{
+        struct obd_device* obd = (struct obd_device *)data;
+
+        LASSERT(obd != NULL);
+        LASSERT(obd->u.mds.mds_vfsmnt->mnt_devname);
+        *eof = 1;
+        return snprintf(page, count, "%s\n",
+                        obd->u.mds.mds_vfsmnt->mnt_devname);
+}
 
 struct lprocfs_vars lprocfs_mds_obd_vars[] = {
         { "uuid",       lprocfs_rd_uuid, 0, 0 },
         { "blocksize",  rd_blksize,      0, 0 },
-        { "bytestotal", rd_kbytestotal,  0, 0 },
+        { "kbytestotal", rd_kbytestotal, 0, 0 },
         { "kbytesfree", rd_kbytesfree,   0, 0 },
         { "fstype",     rd_fstype,       0, 0 },
         { "filestotal", rd_filestotal,   0, 0 },
         { "filesfree",  rd_filesfree,    0, 0 },
         { "filegroups", rd_filegroups,   0, 0 },
+        { "mntdev",     lprocfs_mds_rd_mntdev,    0, 0 },
         { 0 }
 };
 
@@ -101,5 +112,5 @@ struct lprocfs_static_vars lprocfs_array_vars[] = { {lprocfs_mds_module_vars,
                                                      lprocfs_mdt_obd_vars}};
 
 LPROCFS_INIT_MULTI_VARS(lprocfs_array_vars,
-                        (sizeof(lprocfs_array_vars)/
-                         sizeof(struct lprocfs_static_vars))) 
+                        (sizeof(lprocfs_array_vars) /
+                         sizeof(struct lprocfs_static_vars)))
index 7952101..cefc680 100644 (file)
@@ -42,8 +42,6 @@
 #define MDS_MAX_CLIENTS (PAGE_SIZE * 8)
 #define MDS_MAX_CLIENT_WORDS (MDS_MAX_CLIENTS / sizeof(unsigned long))
 
-static unsigned long last_rcvd_slots[MDS_MAX_CLIENT_WORDS];
-
 #define LAST_RCVD "last_rcvd"
 
 /* Add client data to the MDS.  We use a bitmap to locate a free space
@@ -51,29 +49,37 @@ static unsigned long last_rcvd_slots[MDS_MAX_CLIENT_WORDS];
  * Otherwise, we have just read the data from the last_rcvd file and
  * we know its offset.
  */
-int mds_client_add(struct mds_obd *mds, struct mds_export_data *med, int cl_off)
+int mds_client_add(struct obd_device *obd, struct mds_obd *mds,
+                   struct mds_export_data *med, int cl_off)
 {
+        unsigned long *bitmap = mds->mds_client_bitmap;
         int new_client = (cl_off == -1);
 
+        LASSERT(bitmap != NULL);
+
+        /* XXX if mcd_uuid were a real obd_uuid, I could use obd_uuid_equals */
+        if (!strcmp(med->med_mcd->mcd_uuid, "OBD_CLASS_UUID"))
+                RETURN(0);
+
         /* the bitmap operations can handle cl_off > sizeof(long) * 8, so
          * there's no need for extra complication here
          */
         if (new_client) {
-                cl_off = find_first_zero_bit(last_rcvd_slots, MDS_MAX_CLIENTS);
+                cl_off = find_first_zero_bit(bitmap, MDS_MAX_CLIENTS);
         repeat:
                 if (cl_off >= MDS_MAX_CLIENTS) {
                         CERROR("no room for clients - fix MDS_MAX_CLIENTS\n");
                         return -ENOMEM;
                 }
-                if (test_and_set_bit(cl_off, last_rcvd_slots)) {
+                if (test_and_set_bit(cl_off, bitmap)) {
                         CERROR("MDS client %d: found bit is set in bitmap\n",
                                cl_off);
-                        cl_off = find_next_zero_bit(last_rcvd_slots,
-                                                    MDS_MAX_CLIENTS, cl_off);
+                        cl_off = find_next_zero_bit(bitmap, MDS_MAX_CLIENTS,
+                                                    cl_off);
                         goto repeat;
                 }
         } else {
-                if (test_and_set_bit(cl_off, last_rcvd_slots)) {
+                if (test_and_set_bit(cl_off, bitmap)) {
                         CERROR("MDS client %d: bit already set in bitmap!!\n",
                                cl_off);
                         LBUG();
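
The client slots in last_rcvd are handed out from the in-memory mds_client_bitmap above: find_first_zero_bit() proposes a slot and test_and_set_bit() claims it, retrying from the next zero bit if someone else won the race. A minimal userspace sketch of that allocation pattern, single-threaded and non-atomic where the kernel helpers are atomic:

#include <limits.h>
#include <stdio.h>

#define MAX_SLOTS      64
#define BITS_PER_WORD  (sizeof(unsigned long) * CHAR_BIT)
#define NWORDS         ((MAX_SLOTS + BITS_PER_WORD - 1) / BITS_PER_WORD)

static unsigned long bitmap[NWORDS];

/* Scan for the first clear bit, like find_first_zero_bit(). */
static unsigned int find_first_zero(void)
{
        unsigned int i;

        for (i = 0; i < MAX_SLOTS; i++)
                if (!(bitmap[i / BITS_PER_WORD] & (1UL << (i % BITS_PER_WORD))))
                        return i;
        return MAX_SLOTS;
}

/* Set a bit and return its old value, like test_and_set_bit() (but not
 * atomic: this sketch is single-threaded). */
static int test_and_set(unsigned int i)
{
        unsigned long mask = 1UL << (i % BITS_PER_WORD);
        int old = (bitmap[i / BITS_PER_WORD] & mask) != 0;

        bitmap[i / BITS_PER_WORD] |= mask;
        return old;
}

static int slot_alloc(void)
{
        unsigned int slot = find_first_zero();

        /* In the kernel code a concurrent claimer can set the bit between
         * the scan and the claim, so a set bit means "look again". */
        while (slot < MAX_SLOTS && test_and_set(slot))
                slot = find_first_zero();

        return slot < MAX_SLOTS ? (int)slot : -1;   /* -1: no room */
}

int main(void)
{
        printf("%d %d %d\n", slot_alloc(), slot_alloc(), slot_alloc());
        return 0;
}
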
@@ -89,11 +95,36 @@ int mds_client_add(struct mds_obd *mds, struct mds_export_data *med, int cl_off)
                 struct obd_run_ctxt saved;
                 loff_t off = MDS_LR_CLIENT + (cl_off * MDS_LR_SIZE);
                 ssize_t written;
+                void *handle;
 
                 push_ctxt(&saved, &mds->mds_ctxt, NULL);
-                written = lustre_fwrite(mds->mds_rcvd_filp,
-                                        (char *)med->med_mcd,
-                                        sizeof(*med->med_mcd), &off);
+                /* We need to start a transaction here first, to avoid a
+                 * possible ordering deadlock on last_rcvd->i_sem and the
+                 * journal lock. In most places we start the journal handle
+                 * first (because we do compound transactions), and then
+                 * later do the write into last_rcvd, which gets i_sem.
+                 *
+                 * Without this transaction, a client connecting while other
+                 * MDS operations are in progress takes last_rcvd->i_sem
+                 * first (in generic_file_write()) and starts the journal
+                 * transaction afterwards, so it can deadlock with those ops.
+                 *
+                 * We use FSFILT_OP_SETATTR because it is smallest, but all
+                 * ops include enough space for the last_rcvd update so we
+                 * could use any of them, or maybe an FSFILT_OP_NONE is best?
+                 */
+                handle = fsfilt_start(obd,mds->mds_rcvd_filp->f_dentry->d_inode,
+                                      FSFILT_OP_SETATTR);
+                if (IS_ERR(handle)) {
+                        written = PTR_ERR(handle);
+                        CERROR("unable to start transaction: rc %d\n",
+                               (int)written);
+                } else {
+                        written = lustre_fwrite(mds->mds_rcvd_filp,med->med_mcd,
+                                                sizeof(*med->med_mcd), &off);
+                        fsfilt_commit(obd,mds->mds_rcvd_filp->f_dentry->d_inode,
+                                      handle, 0);
+                }
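
The comment above boils down to enforcing a single lock order between the journal handle and last_rcvd's i_sem: start the transaction first, and let the i_sem taken inside the file write nest within it. A compressed sketch of that ordering, with pthread mutexes standing in for the journal lock and i_sem, and hypothetical helpers in place of fsfilt_start()/lustre_fwrite()/fsfilt_commit():

#include <pthread.h>
#include <stdio.h>

/* Stand-ins for the two locks involved: the journal transaction that
 * fsfilt_start()/fsfilt_commit() bracket, and last_rcvd's i_sem, which
 * generic_file_write() takes around the actual write.  The deadlock
 * described above comes from two paths taking them in opposite orders;
 * the fix is that every path starts the transaction first. */
static pthread_mutex_t journal = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t i_sem   = PTHREAD_MUTEX_INITIALIZER;

static void write_last_rcvd_slot(void)
{
        pthread_mutex_lock(&i_sem);       /* i_sem nests inside the journal */
        puts("updating last_rcvd slot");
        pthread_mutex_unlock(&i_sem);
}

static void client_add_safe(void)
{
        pthread_mutex_lock(&journal);     /* fsfilt_start() analogue  */
        write_last_rcvd_slot();
        pthread_mutex_unlock(&journal);   /* fsfilt_commit() analogue */
}

int main(void)
{
        /* With every last_rcvd update using this order there is no
         * journal-then-i_sem versus i_sem-then-journal cycle to deadlock on. */
        client_add_safe();
        return 0;
}
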
                 pop_ctxt(&saved, &mds->mds_ctxt, NULL);
 
                 if (written != sizeof(*med->med_mcd)) {
@@ -115,17 +146,23 @@ int mds_client_free(struct obd_export *exp)
         struct mds_client_data zero_mcd;
         struct obd_run_ctxt saved;
         int written;
+        unsigned long *bitmap = mds->mds_client_bitmap;
         loff_t off;
 
+        LASSERT(bitmap);
         if (!med->med_mcd)
                 RETURN(0);
 
+        /* XXX if mcd_uuid were a real obd_uuid, I could use obd_uuid_equals */
+        if (!strcmp(med->med_mcd->mcd_uuid, "OBD_CLASS_UUID"))
+                GOTO(free_and_out, 0);
+
         off = MDS_LR_CLIENT + (med->med_off * MDS_LR_SIZE);
 
         CDEBUG(D_INFO, "freeing client at offset %u (%lld)with UUID '%s'\n",
                med->med_off, off, med->med_mcd->mcd_uuid);
 
-        if (!test_and_clear_bit(med->med_off, last_rcvd_slots)) {
+        if (!test_and_clear_bit(med->med_off, bitmap)) {
                 CERROR("MDS client %u: bit already clear in bitmap!!\n",
                        med->med_off);
                 LBUG();
@@ -146,6 +183,7 @@ int mds_client_free(struct obd_export *exp)
                        med->med_mcd->mcd_uuid, med->med_off);
         }
 
+ free_and_out:
         OBD_FREE(med->med_mcd, sizeof(*med->med_mcd));
 
         return 0;
@@ -153,6 +191,8 @@ int mds_client_free(struct obd_export *exp)
 
 static int mds_server_free_data(struct mds_obd *mds)
 {
+        OBD_FREE(mds->mds_client_bitmap,
+                 MDS_MAX_CLIENT_WORDS * sizeof(unsigned long));
         OBD_FREE(mds->mds_server_data, sizeof(*mds->mds_server_data));
         mds->mds_server_data = NULL;
 
@@ -170,18 +210,27 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f)
         __u64 last_transno = 0;
         __u64 last_mount;
         int rc = 0;
+
         LASSERT(sizeof(struct mds_client_data) == MDS_LR_SIZE);
         LASSERT(sizeof(struct mds_server_data) <= MDS_LR_CLIENT);
 
         OBD_ALLOC(msd, sizeof(*msd));
         if (!msd)
                 RETURN(-ENOMEM);
+
+        OBD_ALLOC(mds->mds_client_bitmap,
+                  MDS_MAX_CLIENT_WORDS * sizeof(unsigned long));
+        if (!mds->mds_client_bitmap) {
+                OBD_FREE(msd, sizeof(*msd));
+                RETURN(-ENOMEM);
+        }
+
         rc = lustre_fread(f, (char *)msd, sizeof(*msd), &off);
 
         mds->mds_server_data = msd;
         if (rc == 0) {
-                CERROR("empty MDS %s, new MDS?\n", LAST_RCVD);
+                CERROR("%s: empty MDS %s, new MDS?\n", obddev->obd_name,
+                       LAST_RCVD);
                 RETURN(0);
         }
 
@@ -252,21 +301,21 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f)
                                sizeof exp->exp_client_uuid.uuid);
                         med = &exp->exp_mds_data;
                         med->med_mcd = mcd;
-                        mds_client_add(mds, med, cl_off);
+                        mds_client_add(obddev, mds, med, cl_off);
                         /* create helper if export init gets more complex */
                         INIT_LIST_HEAD(&med->med_open_head);
                         spin_lock_init(&med->med_open_lock);
 
                         mcd = NULL;
                         obddev->obd_recoverable_clients++;
+                        class_export_put(exp);
                 } else {
-                        CDEBUG(D_INFO,
-                               "discarded client %d, UUID '%s', count %Ld\n",
-                               cl_off, mcd->mcd_uuid,
-                               (long long)le64_to_cpu(mcd->mcd_mount_count));
+                        CDEBUG(D_INFO, "discarded client %d, UUID '%s', count "
+                               LPU64"\n", cl_off, mcd->mcd_uuid,
+                               le64_to_cpu(mcd->mcd_mount_count));
                 }
 
-                CDEBUG(D_OTHER, "client at offset %d has last_rcvd = %Lu\n",
+                CDEBUG(D_OTHER, "client at offset %d has last_transno = %Lu\n",
                        cl_off, (unsigned long long)last_transno);
 
                 if (last_transno > mds->mds_last_transno)
@@ -280,7 +329,7 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f)
                        obddev->obd_recoverable_clients, mds->mds_last_transno);
                 obddev->obd_next_recovery_transno = obddev->obd_last_committed
                         + 1;
-                obddev->obd_flags |= OBD_RECOVERING;
+                obddev->obd_recovering = 1;
         }
 
         if (mcd)
@@ -315,14 +364,14 @@ static int mds_fs_prep(struct obd_device *obddev)
 
         dput(dentry);
 
-        dentry = simple_mkdir(current->fs->pwd, "FH", 0700);
-        if (IS_ERR(dentry)) {
-                rc = PTR_ERR(dentry);
-                CERROR("cannot create FH directory: rc = %d\n", rc);
+        dentry = lookup_one_len("__iopen__", current->fs->pwd,
+                                strlen("__iopen__"));
+        if (IS_ERR(dentry) || !dentry->d_inode) {
+                rc = IS_ERR(dentry) ? PTR_ERR(dentry) : -ENOENT;
+                CERROR("cannot open iopen FH directory: rc = %d\n", rc);
                 GOTO(err_pop, rc);
         }
-        /* XXX probably want to hold on to this later... */
-        dput(dentry);
+        mds->mds_fid_de = dentry;
 
         f = filp_open(LAST_RCVD, O_RDWR | O_CREAT, 0644);
         if (IS_ERR(f)) {
@@ -354,7 +403,7 @@ err_pop:
         return rc;
 
 err_client:
-        class_disconnect_all(obddev);
+        class_disconnect_exports(obddev, 0);
 err_filp:
         if (filp_close(f, 0))
                 CERROR("can't close %s after error\n", LAST_RCVD);
@@ -372,28 +421,33 @@ int mds_fs_setup(struct obd_device *obddev, struct vfsmount *mnt)
         mds->mds_ctxt.pwdmnt = mnt;
         mds->mds_ctxt.pwd = mnt->mnt_root;
         mds->mds_ctxt.fs = get_ds();
-
         RETURN(mds_fs_prep(obddev));
 }
 
-int mds_fs_cleanup(struct obd_device *obddev)
+int mds_fs_cleanup(struct obd_device *obddev, int failover)
 {
         struct mds_obd *mds = &obddev->u.mds;
         struct obd_run_ctxt saved;
         int rc = 0;
 
-        class_disconnect_all(obddev); /* this cleans up client info too */
+        if (failover)
+                CERROR("%s: shutting down for failover; client state will"
+                       " be preserved.\n", obddev->obd_name);
+
+        class_disconnect_exports(obddev, failover); /* this cleans up client
+                                                   info too */
         mds_server_free_data(mds);
 
         push_ctxt(&saved, &mds->mds_ctxt, NULL);
         if (mds->mds_rcvd_filp) {
                 rc = filp_close(mds->mds_rcvd_filp, 0);
                 mds->mds_rcvd_filp = NULL;
-
                 if (rc)
                         CERROR("last_rcvd file won't close, rc=%d\n", rc);
         }
         pop_ctxt(&saved, &mds->mds_ctxt, NULL);
+        shrink_dcache_parent(mds->mds_fid_de);
+        dput(mds->mds_fid_de);
 
         return rc;
 }
diff --git a/lustre/mds/mds_internal.h b/lustre/mds/mds_internal.h
new file mode 100644 (file)
index 0000000..0b62a92
--- /dev/null
@@ -0,0 +1,15 @@
+struct mds_file_data *mds_mfd_new(void);
+void mds_mfd_put(struct mds_file_data *mfd);
+void mds_mfd_destroy(struct mds_file_data *mfd);
+int mds_update_unpack(struct ptlrpc_request *, int offset,
+                      struct mds_update_record *);
+
+/* mds/mds_fs.c */
+int mds_client_add(struct obd_device *obd, struct mds_obd *mds,
+                  struct mds_export_data *med, int cl_off);
+int mds_client_free(struct obd_export *exp);
+
+#ifdef __KERNEL__
+void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode);
+void mds_pack_inode2body(struct mds_body *body, struct inode *inode);
+#endif
diff --git a/lustre/mds/mds_lib.c b/lustre/mds/mds_lib.c
new file mode 100644 (file)
index 0000000..8f16795
--- /dev/null
@@ -0,0 +1,310 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_MDS
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+# include <linux/locks.h>   // for wait_on_buffer
+#else
+# include <linux/buffer_head.h>   // for wait_on_buffer
+#endif
+#include <linux/unistd.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <linux/slab.h>
+#include <asm/segment.h>
+
+#include <linux/obd_support.h>
+#include <linux/lustre_lib.h>
+#include <linux/lustre_mds.h>
+#include <linux/lustre_lite.h>
+
+void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode)
+{
+        fid->id = inode->i_ino;
+        fid->generation = inode->i_generation;
+        fid->f_type = (S_IFMT & inode->i_mode);
+}
+
+void mds_pack_inode2body(struct mds_body *b, struct inode *inode)
+{
+        b->valid = OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME |
+                OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
+                OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLTYPE | OBD_MD_FLMODE |
+                OBD_MD_FLNLINK | OBD_MD_FLGENER;
+
+        /* The MDS file size isn't authoritative for regular files, so don't
+         * even pretend. */
+        if (S_ISREG(inode->i_mode))
+                b->valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
+
+        b->ino = inode->i_ino;
+        b->atime = LTIME_S(inode->i_atime);
+        b->mtime = LTIME_S(inode->i_mtime);
+        b->ctime = LTIME_S(inode->i_ctime);
+        b->mode = inode->i_mode;
+        b->size = inode->i_size;
+        b->blocks = inode->i_blocks;
+        b->uid = inode->i_uid;
+        b->gid = inode->i_gid;
+        b->flags = inode->i_flags;
+        b->rdev = inode->i_rdev;
+        b->nlink = inode->i_nlink;
+        b->generation = inode->i_generation;
+        b->suppgid = -1;
+}
+/* unpacking */
+static int mds_setattr_unpack(struct ptlrpc_request *req, int offset,
+                              struct mds_update_record *r)
+{
+        struct iattr *attr = &r->ur_iattr;
+        struct mds_rec_setattr *rec;
+        ENTRY;
+
+        rec = lustre_swab_reqbuf (req, offset, sizeof (*rec),
+                                  lustre_swab_mds_rec_setattr);
+        if (rec == NULL)
+                RETURN (-EFAULT);
+
+        r->ur_fsuid = rec->sa_fsuid;
+        r->ur_fsgid = rec->sa_fsgid;
+        r->ur_cap = rec->sa_cap;
+        r->ur_suppgid1 = rec->sa_suppgid;
+        r->ur_suppgid2 = -1;
+        r->ur_fid1 = &rec->sa_fid;
+        attr->ia_valid = rec->sa_valid;
+        attr->ia_mode = rec->sa_mode;
+        attr->ia_uid = rec->sa_uid;
+        attr->ia_gid = rec->sa_gid;
+        attr->ia_size = rec->sa_size;
+        LTIME_S(attr->ia_atime) = rec->sa_atime;
+        LTIME_S(attr->ia_mtime) = rec->sa_mtime;
+        LTIME_S(attr->ia_ctime) = rec->sa_ctime;
+        attr->ia_attr_flags = rec->sa_attr_flags;
+
+        LASSERT_REQSWAB (req, offset + 1);
+        if (req->rq_reqmsg->bufcount > offset + 1) {
+                r->ur_eadata = lustre_msg_buf (req->rq_reqmsg,
+                                               offset + 1, 0);
+                if (r->ur_eadata == NULL)
+                        RETURN (-EFAULT);
+                r->ur_eadatalen = req->rq_reqmsg->buflens[offset + 1];
+        } else {
+                r->ur_eadata = NULL;
+                r->ur_eadatalen = 0;
+        }
+
+        RETURN(0);
+}
+
+static int mds_create_unpack(struct ptlrpc_request *req, int offset,
+                             struct mds_update_record *r)
+{
+        struct mds_rec_create *rec;
+        ENTRY;
+
+        rec = lustre_swab_reqbuf (req, offset, sizeof (*rec),
+                                  lustre_swab_mds_rec_create);
+        if (rec == NULL)
+                RETURN (-EFAULT);
+
+        r->ur_fsuid = rec->cr_fsuid;
+        r->ur_fsgid = rec->cr_fsgid;
+        r->ur_cap = rec->cr_cap;
+        r->ur_fid1 = &rec->cr_fid;
+        r->ur_fid2 = &rec->cr_replayfid;
+        r->ur_mode = rec->cr_mode;
+        r->ur_rdev = rec->cr_rdev;
+        r->ur_uid = rec->cr_uid;
+        r->ur_gid = rec->cr_gid;
+        r->ur_time = rec->cr_time;
+        r->ur_flags = rec->cr_flags;
+        r->ur_suppgid1 = rec->cr_suppgid;
+        r->ur_suppgid2 = -1;
+
+        LASSERT_REQSWAB (req, offset + 1);
+        r->ur_name = lustre_msg_string (req->rq_reqmsg, offset + 1, 0);
+        if (r->ur_name == NULL)
+                RETURN (-EFAULT);
+        r->ur_namelen = req->rq_reqmsg->buflens[offset + 1];
+
+        LASSERT_REQSWAB (req, offset + 2);
+        if (req->rq_reqmsg->bufcount > offset + 2) {
+                /* NB for now, we only seem to pass NULL terminated symlink
+                 * target strings here.  If this ever changes, we'll have
+                 * to stop checking for a buffer filled completely with a
+                 * NULL terminated string here, and make the callers check
+                 * depending on what they expect.  We should probably stash
+                 * it in r->ur_eadata in that case, so it's obvious... -eeb
+                 */
+                r->ur_tgt = lustre_msg_string(req->rq_reqmsg, offset + 2, 0);
+                if (r->ur_tgt == NULL)
+                        RETURN (-EFAULT);
+                r->ur_tgtlen = req->rq_reqmsg->buflens[offset + 2];
+        } else {
+                r->ur_tgt = NULL;
+                r->ur_tgtlen = 0;
+        }
+        RETURN(0);
+}
+
+static int mds_link_unpack(struct ptlrpc_request *req, int offset,
+                           struct mds_update_record *r)
+{
+        struct mds_rec_link *rec;
+        ENTRY;
+
+        rec = lustre_swab_reqbuf (req, offset, sizeof (*rec),
+                                  lustre_swab_mds_rec_link);
+        if (rec == NULL)
+                RETURN (-EFAULT);
+
+        r->ur_fsuid = rec->lk_fsuid;
+        r->ur_fsgid = rec->lk_fsgid;
+        r->ur_cap = rec->lk_cap;
+        r->ur_suppgid1 = rec->lk_suppgid1;
+        r->ur_suppgid2 = rec->lk_suppgid2;
+        r->ur_fid1 = &rec->lk_fid1;
+        r->ur_fid2 = &rec->lk_fid2;
+
+        LASSERT_REQSWAB (req, offset + 1);
+        r->ur_name = lustre_msg_string (req->rq_reqmsg, offset + 1, 0);
+        if (r->ur_name == NULL)
+                RETURN (-EFAULT);
+        r->ur_namelen = req->rq_reqmsg->buflens[offset + 1];
+        RETURN(0);
+}
+
+static int mds_unlink_unpack(struct ptlrpc_request *req, int offset,
+                             struct mds_update_record *r)
+{
+        struct mds_rec_unlink *rec;
+        ENTRY;
+
+        rec = lustre_swab_reqbuf (req, offset, sizeof (*rec),
+                                  lustre_swab_mds_rec_unlink);
+        if (rec == NULL)
+                RETURN(-EFAULT);
+
+        r->ur_fsuid = rec->ul_fsuid;
+        r->ur_fsgid = rec->ul_fsgid;
+        r->ur_cap = rec->ul_cap;
+        r->ur_mode = rec->ul_mode;
+        r->ur_suppgid1 = rec->ul_suppgid;
+        r->ur_suppgid2 = -1;
+        r->ur_fid1 = &rec->ul_fid1;
+        r->ur_fid2 = &rec->ul_fid2;
+
+        LASSERT_REQSWAB (req, offset + 1);
+        r->ur_name = lustre_msg_string(req->rq_reqmsg, offset + 1, 0);
+        if (r->ur_name == NULL)
+                RETURN(-EFAULT);
+        r->ur_namelen = req->rq_reqmsg->buflens[offset + 1];
+        RETURN(0);
+}
+
+static int mds_rename_unpack(struct ptlrpc_request *req, int offset,
+                             struct mds_update_record *r)
+{
+        struct mds_rec_rename *rec;
+        ENTRY;
+
+        rec = lustre_swab_reqbuf (req, offset, sizeof (*rec),
+                                  lustre_swab_mds_rec_unlink);
+        if (rec == NULL)
+                RETURN(-EFAULT);
+
+        r->ur_fsuid = rec->rn_fsuid;
+        r->ur_fsgid = rec->rn_fsgid;
+        r->ur_cap = rec->rn_cap;
+        r->ur_suppgid1 = rec->rn_suppgid1;
+        r->ur_suppgid2 = rec->rn_suppgid2;
+        r->ur_fid1 = &rec->rn_fid1;
+        r->ur_fid2 = &rec->rn_fid2;
+
+        LASSERT_REQSWAB (req, offset + 1);
+        r->ur_name = lustre_msg_string(req->rq_reqmsg, offset + 1, 0);
+        if (r->ur_name == NULL)
+                RETURN(-EFAULT);
+        r->ur_namelen = req->rq_reqmsg->buflens[offset + 1];
+
+        LASSERT_REQSWAB (req, offset + 2);
+        r->ur_tgt = lustre_msg_string(req->rq_reqmsg, offset + 2, 0);
+        if (r->ur_tgt == NULL)
+                RETURN(-EFAULT);
+        r->ur_tgtlen = req->rq_reqmsg->buflens[offset + 2];
+        RETURN(0);
+}
+
+typedef int (*update_unpacker)(struct ptlrpc_request *req, int offset,
+                               struct mds_update_record *r);
+
+static update_unpacker mds_unpackers[REINT_MAX + 1] = {
+        [REINT_SETATTR] mds_setattr_unpack,
+        [REINT_CREATE] mds_create_unpack,
+        [REINT_LINK] mds_link_unpack,
+        [REINT_UNLINK] mds_unlink_unpack,
+        [REINT_RENAME] mds_rename_unpack,
+        [REINT_OPEN] mds_create_unpack,
+};
+
+int mds_update_unpack(struct ptlrpc_request *req, int offset,
+                      struct mds_update_record *rec)
+{
+        __u32 *opcodep;
+        __u32  opcode;
+        int rc;
+        ENTRY;
+
+        /* NB don't lustre_swab_reqbuf() here.  We're just taking a peek
+         * and we want to leave it to the specific unpacker once we've
+         * identified the message type */
+        opcodep = lustre_msg_buf (req->rq_reqmsg, offset, sizeof (*opcodep));
+        if (opcodep == NULL)
+                RETURN(-EFAULT);
+
+        opcode = *opcodep;
+        if (lustre_msg_swabbed (req->rq_reqmsg))
+                __swab32s (&opcode);
+
+        if (opcode > REINT_MAX ||
+            mds_unpackers[opcode] == NULL) {
+                CERROR ("Unexpected opcode %d\n", opcode);
+                RETURN(-EFAULT);
+        }
+
+        rec->ur_opcode = opcode;
+        rc = mds_unpackers[opcode](req, offset, rec);
+        RETURN(rc);
+}
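
mds_update_unpack() above peeks at the opcode the same way mds_handle() does, then dispatches through mds_unpackers[], an opcode-indexed array built with designated initializers (the old GNU `[INDEX] value` spelling of C99's `[INDEX] = value`). A standalone sketch of that table-plus-bounds-check idiom, with made-up opcodes and handlers:

#include <stdio.h>

enum { OP_SETATTR, OP_CREATE, OP_UNLINK, OP_MAX = OP_UNLINK };

typedef int (*op_unpacker)(int arg);

static int setattr_unpack(int arg) { printf("setattr %d\n", arg); return 0; }
static int create_unpack(int arg)  { printf("create %d\n", arg);  return 0; }

/* Opcodes without an entry stay NULL, so "in range but unimplemented" is
 * caught by the NULL check below, exactly like mds_unpackers[]. */
static const op_unpacker unpackers[OP_MAX + 1] = {
        [OP_SETATTR] = setattr_unpack,
        [OP_CREATE]  = create_unpack,
};

static int dispatch(unsigned int opcode, int arg)
{
        if (opcode > OP_MAX || unpackers[opcode] == NULL) {
                fprintf(stderr, "unexpected opcode %u\n", opcode);
                return -1;              /* the real code returns -EFAULT */
        }
        return unpackers[opcode](arg);
}

int main(void)
{
        dispatch(OP_CREATE, 42);        /* handled                  */
        dispatch(OP_UNLINK, 7);         /* in range, NULL: rejected */
        dispatch(99, 0);                /* out of range: rejected   */
        return 0;
}
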
index 796fcd2..02c53cc 100644 (file)
 #include <linux/obd_lov.h>
 #include <linux/lustre_lib.h>
 
-/* lov_unpackdesc() is in lov/lov_pack.c */
+void le_lov_desc_to_cpu (struct lov_desc *ld)
+{
+        ld->ld_tgt_count = le32_to_cpu (ld->ld_tgt_count);
+        ld->ld_default_stripe_count = le32_to_cpu (ld->ld_default_stripe_count);
+        ld->ld_default_stripe_size = le32_to_cpu (ld->ld_default_stripe_size);
+        ld->ld_pattern = le32_to_cpu (ld->ld_pattern);
+}
 
-void lov_packdesc(struct lov_desc *ld)
+void cpu_to_le_lov_desc (struct lov_desc *ld)
 {
-        ld->ld_tgt_count = HTON__u32(ld->ld_tgt_count);
-        ld->ld_default_stripe_count = HTON__u32(ld->ld_default_stripe_count);
-        ld->ld_default_stripe_size = HTON__u32(ld->ld_default_stripe_size);
-        ld->ld_pattern = HTON__u32(ld->ld_pattern);
+        ld->ld_tgt_count = cpu_to_le32 (ld->ld_tgt_count);
+        ld->ld_default_stripe_count = cpu_to_le32 (ld->ld_default_stripe_count);
+        ld->ld_default_stripe_size = cpu_to_le32 (ld->ld_default_stripe_size);
+        ld->ld_pattern = cpu_to_le32 (ld->ld_pattern);
 }
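
The two helpers above pin struct lov_desc to little-endian on disk: cpu_to_le_lov_desc() converts each field just before the descriptor is written to LOVDESC, and le_lov_desc_to_cpu() converts back after mds_get_lovdesc() reads it in, so the file is portable across hosts of either byte order. A standalone sketch of that round trip, with a hypothetical two-field descriptor and hand-written conversion helpers:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_desc {
        uint32_t tgt_count;
        uint32_t stripe_size;
};

/* Produce a value whose in-memory bytes are the little-endian encoding of
 * v, whatever the host order (what cpu_to_le32() does). */
static uint32_t to_le32(uint32_t v)
{
        unsigned char b[4] = { (unsigned char)v,
                               (unsigned char)(v >> 8),
                               (unsigned char)(v >> 16),
                               (unsigned char)(v >> 24) };
        uint32_t le;

        memcpy(&le, b, 4);
        return le;
}

/* Decode a little-endian value back into host order (le32_to_cpu). */
static uint32_t from_le32(uint32_t v)
{
        unsigned char b[4];

        memcpy(b, &v, 4);
        return (uint32_t)b[0] | ((uint32_t)b[1] << 8) |
               ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
}

int main(void)
{
        struct toy_desc d = { .tgt_count = 4, .stripe_size = 65536 };
        struct toy_desc disk = d;

        /* convert every field before writing, like cpu_to_le_lov_desc() */
        disk.tgt_count   = to_le32(disk.tgt_count);
        disk.stripe_size = to_le32(disk.stripe_size);

        /* ... the descriptor would be written to and read from disk here ... */

        /* convert back after reading, like le_lov_desc_to_cpu() */
        disk.tgt_count   = from_le32(disk.tgt_count);
        disk.stripe_size = from_le32(disk.stripe_size);

        printf("%u %u\n", disk.tgt_count, disk.stripe_size);
        return 0;
}
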
 
 int mds_set_lovdesc(struct obd_device *obd, struct lov_desc *desc,
@@ -52,6 +58,7 @@ int mds_set_lovdesc(struct obd_device *obd, struct lov_desc *desc,
         int tgt_count;
         int rc;
         int i;
+        struct lov_desc *disk_desc;
         ENTRY;
 
         tgt_count = desc->ld_tgt_count;
@@ -76,36 +83,44 @@ int mds_set_lovdesc(struct obd_device *obd, struct lov_desc *desc,
                 RETURN(-EINVAL);
         }
 
-        memcpy(&mds->mds_lov_desc, desc, sizeof *desc);
-        mds->mds_has_lov_desc = 1;
-        /* XXX the MDS should not really know about this */
-        mds->mds_max_mdsize = lov_mds_md_size(desc->ld_tgt_count);
+        OBD_ALLOC (disk_desc, sizeof (*disk_desc));
+        if (disk_desc == NULL) {
+                CERROR ("Can't allocate disk_desc\n");
+                RETURN (-ENOMEM);
+        }
 
-        lov_packdesc(desc);
+        *disk_desc = *desc;
+        cpu_to_le_lov_desc (disk_desc);
 
+        rc = 0;
         push_ctxt(&saved, &mds->mds_ctxt, NULL);
+
+        /* Bug 1186: FIXME: if there is an existing LOVDESC, verify new
+         * tgt_count > old */
         f = filp_open("LOVDESC", O_CREAT|O_RDWR, 0644);
         if (IS_ERR(f)) {
                 CERROR("Cannot open/create LOVDESC file\n");
                 GOTO(out, rc = PTR_ERR(f));
         }
 
-#warning FIXME: if there is an existing LOVDESC, verify new tgt_count > old
-        rc = lustre_fwrite(f, (char *)desc, sizeof(*desc), &f->f_pos);
+        rc = lustre_fwrite(f, (char *)disk_desc, sizeof(*disk_desc), &f->f_pos);
         if (filp_close(f, 0))
                 CERROR("Error closing LOVDESC file\n");
         if (rc != sizeof(*desc)) {
                 CERROR("Cannot open/create LOVDESC file\n");
-                GOTO(out, rc = PTR_ERR(f));
+                if (rc >= 0)
+                        rc = -EIO;
+                GOTO(out, rc);
         }
 
+        /* Bug 1186: FIXME: if there is an existing LOVTGTS, verify
+         * existing UUIDs same */
         f = filp_open("LOVTGTS", O_CREAT|O_RDWR, 0644);
         if (IS_ERR(f)) {
                 CERROR("Cannot open/create LOVTGTS file\n");
                 GOTO(out, rc = PTR_ERR(f));
         }
 
-#warning FIXME: if there is an existing LOVTGTS, verify existing UUIDs same
         rc = 0;
         for (i = 0; i < tgt_count ; i++) {
                 rc = lustre_fwrite(f, uuidarray[i].uuid,
@@ -116,14 +131,21 @@ int mds_set_lovdesc(struct obd_device *obd, struct lov_desc *desc,
                         if (rc >= 0)
                                 rc = -EIO;
                         break;
-                } else
-                        rc = 0;
+                }
+                rc = 0;
         }
         if (filp_close(f, 0))
                 CERROR("Error closing LOVTGTS file\n");
 
+        memcpy(&mds->mds_lov_desc, desc, sizeof *desc);
+        mds->mds_has_lov_desc = 1;
+        /* XXX the MDS should not really know about this */
+        mds->mds_max_mdsize = lov_mds_md_size(desc->ld_tgt_count);
+
 out:
         pop_ctxt(&saved, &mds->mds_ctxt, NULL);
+        OBD_FREE (disk_desc, sizeof (*disk_desc));
+
         RETURN(rc);
 }
 
@@ -150,6 +172,9 @@ int mds_get_lovdesc(struct mds_obd *mds, struct lov_desc *desc)
                 GOTO(out, rc = -EIO);
         } else
                 rc = 0;
+
+        le_lov_desc_to_cpu (desc);              /* convert to my byte order */
+
         EXIT;
 out:
         pop_ctxt(&saved, &mds->mds_ctxt, NULL);
@@ -192,7 +217,7 @@ out:
 }
 
 int mds_iocontrol(unsigned int cmd, struct lustre_handle *conn,
-                          int len, void *karg, void *uarg)
+                  int len, void *karg, void *uarg)
 {
         struct obd_device *obd = class_conn2obd(conn);
         struct obd_ioctl_data *data = karg;
@@ -236,11 +261,12 @@ int mds_iocontrol(unsigned int cmd, struct lustre_handle *conn,
                         CERROR("UUID array size too small\n");
                         RETURN(-ENOSPC);
                 }
-                rc = mds_get_lovtgts(&obd->u.mds, desc->ld_tgt_count, uuidarray);
+                rc = mds_get_lovtgts(&obd->u.mds, desc->ld_tgt_count,
+                                     uuidarray);
 
                 RETURN(rc);
 
-            case OBD_IOC_SET_READONLY:
+        case OBD_IOC_SET_READONLY:
                 CERROR("setting device %s read-only\n",
                        ll_bdevname(obd->u.mds.mds_sb->s_dev));
 #ifdef CONFIG_DEV_RDONLY
@@ -248,6 +274,11 @@ int mds_iocontrol(unsigned int cmd, struct lustre_handle *conn,
 #endif
                 RETURN(0);
 
+        case OBD_IOC_ABORT_RECOVERY:
+                CERROR("aborting recovery for device %s\n", obd->obd_name);
+                target_abort_recovery(obd);
+                RETURN(0);
+
         default:
                 RETURN(-EINVAL);
         }
index 50ca592..d83e4ee 100644 (file)
 #include <linux/obd_class.h>
 #include <linux/random.h>
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-#include <linux/buffer_head.h>
-#include <linux/workqueue.h>
+# include <linux/buffer_head.h>
+# include <linux/workqueue.h>
 #else
-#include <linux/locks.h>
+# include <linux/locks.h>
 #endif
 #include <linux/obd_lov.h>
 #include <linux/lustre_mds.h>
 #include <linux/lustre_fsfilt.h>
 #include <linux/lprocfs_status.h>
 
-extern kmem_cache_t *mds_file_cache;
+#include "mds_internal.h"
+
 extern inline struct mds_obd *mds_req2mds(struct ptlrpc_request *req);
 int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle,
                        struct ptlrpc_request *req, int rc, __u32 op_data);
@@ -57,7 +58,53 @@ extern int enqueue_ordered_locks(int lock_mode, struct obd_device *obd,
                                  struct lustre_handle *c1_lockh,
                                  struct lustre_handle *c2_lockh);
 
-void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req,
+struct mds_file_data *mds_dentry_open(struct dentry *dentry,
+                                      struct vfsmount *mnt,
+                                      int flags,
+                                      struct ptlrpc_request *req)
+{
+        struct mds_export_data *med = &req->rq_export->exp_mds_data;
+        struct inode *inode;
+        int mode;
+        struct mds_file_data *mfd;
+        int error;
+
+        mfd = mds_mfd_new();
+        if (!mfd) {
+                CERROR("mds: out of memory\n");
+                GOTO(cleanup_dentry, error = -ENOMEM);
+        }
+
+        mode = (flags+1) & O_ACCMODE;
+        inode = dentry->d_inode;
+
+        if (mode & FMODE_WRITE) {
+                error = get_write_access(inode);
+                if (error)
+                        goto cleanup_mfd;
+        }
+
+        mfd->mfd_mode = mode;
+        mfd->mfd_dentry = dentry;
+        mfd->mfd_xid = req->rq_xid;
+
+        spin_lock(&med->med_open_lock);
+        list_add(&mfd->mfd_list, &med->med_open_head);
+        spin_unlock(&med->med_open_lock);
+        mds_mfd_put(mfd);
+        return mfd;
+
+cleanup_mfd:
+        mds_mfd_put(mfd);
+        mds_mfd_destroy(mfd);
+cleanup_dentry:
+        dput(dentry);
+        mntput(mnt);
+        return ERR_PTR(error);
+}
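
mds_dentry_open() above turns the client's open flags into FMODE_* bits with the classic `(flags + 1) & O_ACCMODE` trick: O_RDONLY, O_WRONLY and O_RDWR are 0, 1 and 2, so adding one yields 1 (read), 2 (write) or 3 (both); the acc_table in mds_open() further down leans on the same 0/1/2 numbering to index a small table directly by the O_* value. A standalone illustration, with local constants mirroring the usual values rather than pulling in the kernel headers:

#include <stdio.h>

/* Local copies of the usual values; the real code gets these from the
 * kernel headers. */
#define XO_RDONLY    0
#define XO_WRONLY    1
#define XO_RDWR      2
#define XO_ACCMODE   3

#define XFMODE_READ  1
#define XFMODE_WRITE 2

static int open_flags_to_fmode(int flags)
{
        /* 0 -> 1 (read), 1 -> 2 (write), 2 -> 3 (read|write) */
        return (flags + 1) & XO_ACCMODE;
}

int main(void)
{
        int f;

        for (f = XO_RDONLY; f <= XO_RDWR; f++) {
                int mode = open_flags_to_fmode(f);

                printf("accmode %d ->%s%s\n", f,
                       (mode & XFMODE_READ)  ? " read"  : "",
                       (mode & XFMODE_WRITE) ? " write" : "");
        }
        return 0;
}
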
+
+void reconstruct_open(struct mds_update_record *rec, int offset,
+                      struct ptlrpc_request *req,
                       struct lustre_handle *child_lockh)
 {
         struct mds_export_data *med = &req->rq_export->exp_mds_data;
@@ -66,21 +113,23 @@ void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req,
         struct mds_file_data *mfd;
         struct obd_device *obd = req->rq_export->exp_obd;
         struct dentry *parent, *child;
-        struct ldlm_reply *rep = lustre_msg_buf(req->rq_repmsg, 0);
-        struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1);
+        struct ldlm_reply *rep;
+        struct mds_body *body;
         int disp, rc;
         ENTRY;
 
-        ENTRY;
+        LASSERT(offset == 2);                  /* only called via intent */
+        rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*rep));
+        body = lustre_msg_buf(req->rq_repmsg, 1, sizeof (*body));
 
         /* copy rc, transno and disp; steal locks */
         req->rq_transno = mcd->mcd_last_transno;
         req->rq_status = mcd->mcd_last_result;
         disp = rep->lock_policy_res1 = mcd->mcd_last_data;
-        
-        if (med->med_outstanding_reply)
-                mds_steal_ack_locks(med, req);
-        
+
+        if (req->rq_export->exp_outstanding_reply)
+                mds_steal_ack_locks(req->rq_export, req);
+
         /* We never care about these. */
         disp &= ~(IT_OPEN_LOOKUP | IT_OPEN_POS | IT_OPEN_NEG);
         if (!disp) {
@@ -91,10 +140,9 @@ void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req,
         parent = mds_fid2dentry(mds, rec->ur_fid1, NULL);
         LASSERT(!IS_ERR(parent));
 
-        child = lookup_one_len(lustre_msg_buf(req->rq_reqmsg, 3),
-                               parent, req->rq_reqmsg->buflens[3] - 1);
+        child = ll_lookup_one_len(rec->ur_name, parent, rec->ur_namelen - 1);
         LASSERT(!IS_ERR(child));
-        
+
         if (!child->d_inode) {
                 GOTO(out_dput, 0); /* child not present to open */
         }
@@ -108,12 +156,8 @@ void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req,
                 GOTO(out_dput, 0);
         }
 
-        if (!med->med_outstanding_reply) {
-                LBUG(); /* XXX need to get enqueue client lock */
-        }
-
         /* get lock (write for O_CREAT, read otherwise) */
-        
+
         mds_pack_inode2fid(&body->fid1, child->d_inode);
         mds_pack_inode2body(body, child->d_inode);
         if (S_ISREG(child->d_inode->i_mode)) {
@@ -127,7 +171,7 @@ void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req,
 
         /* If we're opening a file without an EA, change to a write
            lock (unless we already have one). */
-                   
+
         /* If we have -EEXIST as the status, and we were asked to create
          * exclusively, we can tell we failed because the file already existed.
          */
@@ -150,7 +194,7 @@ void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req,
                 GOTO(out_dput, 0);
         }
 
-        if (med->med_outstanding_reply) {
+        if (req->rq_export->exp_outstanding_reply) {
                 struct list_head *t;
                 mfd = NULL;
                 /* XXX can we just look in the old reply to find the handle in
@@ -164,28 +208,16 @@ void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req,
                 /* if we're not recovering, it had better be found */
                 LASSERT(mfd);
         } else {
-                struct file *file;
-                mfd = kmem_cache_alloc(mds_file_cache, GFP_KERNEL);
+                mntget(mds->mds_vfsmnt);
+                mfd = mds_dentry_open(child, mds->mds_vfsmnt,
+                                   rec->ur_flags & ~(O_DIRECT | O_TRUNC), req);
                 if (!mfd) {
                         CERROR("mds: out of memory\n");
                         GOTO(out_dput, req->rq_status = -ENOMEM);
                 }
-                mntget(mds->mds_vfsmnt);
-                file = dentry_open(child, mds->mds_vfsmnt,
-                                   rec->ur_flags & ~(O_DIRECT | O_TRUNC));
-                LASSERT(!IS_ERR(file)); /* XXX -ENOMEM? */
-                file->private_data = mfd;
-                mfd->mfd_file = file;
-                mfd->mfd_xid = req->rq_xid;
-                get_random_bytes(&mfd->mfd_servercookie,
-                                 sizeof(mfd->mfd_servercookie));
-                spin_lock(&med->med_open_lock);
-                list_add(&mfd->mfd_list, &med->med_open_head);
-                spin_unlock(&med->med_open_lock);
         }
-                
-        body->handle.addr = (__u64)(unsigned long)mfd;
-        body->handle.cookie = mfd->mfd_servercookie;
+
+        body->handle.cookie = mfd->mfd_handle.h_cookie;
 
  out_dput:
         l_dput(child);
@@ -196,11 +228,13 @@ void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req,
 int mds_open(struct mds_update_record *rec, int offset,
              struct ptlrpc_request *req, struct lustre_handle *child_lockh)
 {
+        static const char acc_table [] = {[O_RDONLY] MAY_READ,
+                                          [O_WRONLY] MAY_WRITE,
+                                          [O_RDWR]   MAY_READ | MAY_WRITE};
         struct mds_obd *mds = mds_req2mds(req);
         struct obd_device *obd = req->rq_export->exp_obd;
-        struct ldlm_reply *rep = lustre_msg_buf(req->rq_repmsg, 0);
-        struct file *file;
-        struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1);
+        struct ldlm_reply *rep;
+        struct mds_body *body;
         struct dentry *dchild = NULL, *parent;
         struct mds_export_data *med;
         struct mds_file_data *mfd = NULL;
@@ -209,9 +243,14 @@ int mds_open(struct mds_update_record *rec, int offset,
         int rc = 0, parent_mode, child_mode = LCK_PR, lock_flags, created = 0;
         int cleanup_phase = 0;
         void *handle = NULL;
+        int acc_mode;
         ENTRY;
 
-        MDS_CHECK_RESENT(req, reconstruct_open(rec, req, child_lockh));
+        LASSERT(offset == 2);                  /* only called via intent */
+        rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*rep));
+        body = lustre_msg_buf(req->rq_repmsg, 1, sizeof (*body));
+
+        MDS_CHECK_RESENT(req, reconstruct_open(rec, offset, req, child_lockh));
 
         med = &req->rq_export->exp_mds_data;
         rep->lock_policy_res1 |= IT_OPEN_LOOKUP;
@@ -221,6 +260,12 @@ int mds_open(struct mds_update_record *rec, int offset,
                 RETURN(-ENOMEM);
         }
 
+        if ((rec->ur_flags & O_ACCMODE) >= sizeof (acc_table))
+                RETURN(-EINVAL);
+        acc_mode = acc_table [rec->ur_flags & O_ACCMODE];
+        if ((rec->ur_flags & O_TRUNC) != 0)
+                acc_mode |= MAY_WRITE;
+
         /* Step 1: Find and lock the parent */
         parent_mode = (rec->ur_flags & O_CREAT) ? LCK_PW : LCK_PR;
         parent = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, parent_mode,
@@ -235,8 +280,7 @@ int mds_open(struct mds_update_record *rec, int offset,
         cleanup_phase = 1; /* parent dentry and lock */
 
         /* Step 2: Lookup the child */
-        dchild = lookup_one_len(lustre_msg_buf(req->rq_reqmsg, 3),
-                                parent, req->rq_reqmsg->buflens[3] - 1);
+        dchild = ll_lookup_one_len(rec->ur_name, parent, rec->ur_namelen - 1);
         if (IS_ERR(dchild))
                 GOTO(cleanup, rc = PTR_ERR(dchild));
 
@@ -267,6 +311,7 @@ int mds_open(struct mds_update_record *rec, int offset,
                         GOTO(cleanup, rc);
                 created = 1;
                 child_mode = LCK_PW;
+                acc_mode = 0;                  /* Don't check for permissions */
         }
 
         /* Step 4: It's positive, so lock the child */
@@ -277,7 +322,7 @@ int mds_open(struct mds_update_record *rec, int offset,
         rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL,
                               child_res_id, LDLM_PLAIN, NULL, 0, child_mode,
                               &lock_flags, ldlm_completion_ast,
-                              mds_blocking_ast, NULL, NULL, child_lockh);
+                              mds_blocking_ast, NULL, child_lockh);
         if (rc != ELDLM_OK) {
                 CERROR("ldlm_cli_enqueue: %d\n", rc);
                 GOTO(cleanup, rc = -EIO);
@@ -287,21 +332,32 @@ int mds_open(struct mds_update_record *rec, int offset,
 
         mds_pack_inode2fid(&body->fid1, dchild->d_inode);
         mds_pack_inode2body(body, dchild->d_inode);
+
         if (S_ISREG(dchild->d_inode->i_mode)) {
+                /* Check permissions etc */
+                rc = permission(dchild->d_inode, acc_mode);
+                if (rc != 0)
+                        GOTO(cleanup, rc);
+
+                /* Can't write to a read-only file */
+                if (IS_RDONLY(dchild->d_inode) && (acc_mode & MAY_WRITE) != 0)
+                        GOTO(cleanup, rc = -EPERM);
+
+                /* An append-only file must be opened in append mode for
+                 * writing */
+                if (IS_APPEND(dchild->d_inode) &&
+                    (acc_mode & MAY_WRITE) != 0 &&
+                    ((rec->ur_flags & O_APPEND) == 0 ||
+                     (rec->ur_flags & O_TRUNC) != 0))
+                        GOTO (cleanup, rc = -EPERM);
+
                 rc = mds_pack_md(obd, req->rq_repmsg, 2, body, dchild->d_inode);
                 if (rc)
                         GOTO(cleanup, rc);
-        } else {
-                /* If this isn't a regular file, we can't open it. */
-
-                /* We want to drop the child dentry, because we're not returning
-                 * failure (which would do this for us in step 2), and we're not
-                 * handing it off to the open file in dentry_open. */
-                l_dput(dchild);
-                GOTO(cleanup, rc = 0); /* returns the lock to the client */
         }
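
The read-only and append-only checks above follow the usual open-time rules; a minimal userspace sketch, with ATTR_RDONLY/ATTR_APPEND standing in for IS_RDONLY()/IS_APPEND() on the inode:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>

#define MAY_WRITE   0x2                 /* stand-in for the kernel's MAY_WRITE */
#define ATTR_RDONLY 0x1                 /* stand-in for IS_RDONLY(inode) */
#define ATTR_APPEND 0x2                 /* stand-in for IS_APPEND(inode) */

static int check_open(int inode_attrs, int open_flags, int acc_mode)
{
        /* can't write to a read-only file */
        if ((inode_attrs & ATTR_RDONLY) && (acc_mode & MAY_WRITE))
                return -EPERM;
        /* append-only: write opens need O_APPEND and must not truncate */
        if ((inode_attrs & ATTR_APPEND) && (acc_mode & MAY_WRITE) &&
            (!(open_flags & O_APPEND) || (open_flags & O_TRUNC)))
                return -EPERM;
        return 0;
}

int main(void)
{
        printf("%d\n", check_open(ATTR_APPEND, O_WRONLY | O_APPEND, MAY_WRITE)); /* 0 */
        printf("%d\n", check_open(ATTR_APPEND, O_WRONLY | O_TRUNC, MAY_WRITE));  /* -EPERM */
        return 0;
}
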
 
-        if (!created && (rec->ur_flags & O_CREAT) && (rec->ur_flags & O_EXCL)) {
+        if (!created && (rec->ur_flags & O_CREAT) &&
+            (rec->ur_flags & O_EXCL)) {
                 /* File already exists, we didn't just create it, and we
                  * were passed O_EXCL; err-or. */
                 GOTO(cleanup, rc = -EEXIST); // returns a lock to the client
@@ -309,43 +365,33 @@ int mds_open(struct mds_update_record *rec, int offset,
 
         /* If we're opening a file without an EA, the client needs a write
          * lock. */
-        if (child_mode != LCK_PW && !(body->valid & OBD_MD_FLEASIZE)) {
+        if (S_ISREG(dchild->d_inode->i_mode) &&
+            child_mode != LCK_PW && !(body->valid & OBD_MD_FLEASIZE)) {
                 ldlm_lock_decref(child_lockh, child_mode);
                 child_mode = LCK_PW;
                 goto reacquire;
         }
 
-        /* Step 5: Open it */
+        /* if we are following a symlink, don't open */
+        if (S_ISLNK(dchild->d_inode->i_mode))
+                GOTO(cleanup, rc = 0);
+
+        /* Step 5: mds_open it */
         rep->lock_policy_res1 |= IT_OPEN_OPEN;
-        mfd = kmem_cache_alloc(mds_file_cache, GFP_KERNEL);
+
+        /* dentry_open does a dput(de) and mntput(mds->mds_vfsmnt) on error */
+        mfd = mds_dentry_open(dchild, mds->mds_vfsmnt,
+                              rec->ur_flags & ~(O_DIRECT | O_TRUNC), req);
         if (!mfd) {
                 CERROR("mds: out of memory\n");
+                dchild = NULL; /* prevent a double dput in step 2 */
                 GOTO(cleanup, rc = -ENOMEM);
         }
 
         cleanup_phase = 4; /* mfd allocated */
-
-        /* dentry_open does a dput(de) and mntput(mds->mds_vfsmnt) on error */
-        mntget(mds->mds_vfsmnt);
-        file = dentry_open(dchild, mds->mds_vfsmnt,
-                           rec->ur_flags & ~(O_DIRECT | O_TRUNC));
-        if (IS_ERR(file)) {
-                dchild = NULL; /* prevent a double dput in step 2 */
-                GOTO(cleanup, rc = PTR_ERR(file));
-        }
-
-        file->private_data = mfd;
-        mfd->mfd_file = file;
-        mfd->mfd_xid = req->rq_xid;
-        get_random_bytes(&mfd->mfd_servercookie, sizeof(mfd->mfd_servercookie));
-        spin_lock(&med->med_open_lock);
-        list_add(&mfd->mfd_list, &med->med_open_head);
-        spin_unlock(&med->med_open_lock);
-
-        body->handle.addr = (__u64)(unsigned long)mfd;
-        body->handle.cookie = mfd->mfd_servercookie;
-        CDEBUG(D_INODE, "file %p: mfd %p, cookie "LPX64"\n",
-               mfd->mfd_file, mfd, mfd->mfd_servercookie);
+        body->handle.cookie = mfd->mfd_handle.h_cookie;
+        CDEBUG(D_INODE, "mfd %p, cookie "LPX64"\n", mfd,
+               mfd->mfd_handle.h_cookie);
         GOTO(cleanup, rc = 0); /* returns a lock to the client */
 
  cleanup:
@@ -353,18 +399,18 @@ int mds_open(struct mds_update_record *rec, int offset,
                                 req, rc, rep->lock_policy_res1);
         switch (cleanup_phase) {
         case 4:
-                if (rc)
-                        kmem_cache_free(mds_file_cache, mfd);
+                if (rc && !S_ISLNK(dchild->d_inode->i_mode))
+                        mds_mfd_destroy(mfd);
         case 3:
-                /* This is the same logic as in the IT_OPEN part of 
+                /* This is the same logic as in the IT_OPEN part of
                  * ldlm_intent_policy: if we found the dentry, or we tried to
                  * open it (meaning that we created, if it wasn't found), then
                  * we return the lock to the caller and client. */
                 if (!(rep->lock_policy_res1 & (IT_OPEN_OPEN | IT_OPEN_POS)))
                         ldlm_lock_decref(child_lockh, child_mode);
         case 2:
-                if (rc)
-                        l_dput(dchild);
+                if (rc || S_ISLNK(dchild->d_inode->i_mode))
+                        l_dput(dchild);
         case 1:
                 l_dput(parent);
                 if (rc) {
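
The cleanup label uses the staged-teardown idiom seen throughout these handlers: bump cleanup_phase after each acquisition, then fall through a switch to release everything in reverse order. A generic sketch (the resources here are plain malloc buffers, purely for illustration):

#include <stdio.h>
#include <stdlib.h>

/* Each successfully acquired resource bumps the phase; the switch falls
 * through so a failure at phase N releases resources N..1 in reverse. */
static int do_operation(void)
{
        int cleanup_phase = 0, rc = 0;
        char *bufa = NULL, *bufb = NULL;

        bufa = malloc(64);
        if (bufa == NULL) {
                rc = -1;
                goto cleanup;
        }
        cleanup_phase = 1;              /* bufa allocated */

        bufb = malloc(64);
        if (bufb == NULL) {
                rc = -1;
                goto cleanup;
        }
        cleanup_phase = 2;              /* bufb allocated */

        /* ... real work would go here ... */

cleanup:
        switch (cleanup_phase) {
        case 2:
                free(bufb);             /* fall through */
        case 1:
                free(bufa);
        }
        return rc;
}

int main(void)
{
        printf("rc = %d\n", do_operation());
        return 0;
}
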
index 583ba4a..823a7a6 100644 (file)
 #include <linux/lustre_mds.h>
 #include <linux/lustre_dlm.h>
 #include <linux/lustre_fsfilt.h>
+#include "mds_internal.h"
 
 extern inline struct mds_obd *mds_req2mds(struct ptlrpc_request *req);
 
-static void mds_last_rcvd_cb(struct obd_device *obd, __u64 last_rcvd, int error)
+static void mds_commit_cb(struct obd_device *obd, __u64 transno, int error)
 {
-        CDEBUG(D_HA, "got callback for last_rcvd "LPD64": rc = %d\n",
-               last_rcvd, error);
-        if (!error && last_rcvd > obd->obd_last_committed)
-                obd->obd_last_committed = last_rcvd;
+        obd_transno_commit_cb(obd, transno, error);
 }
 
 /* Assumes caller has already pushed us into the kernel context. */
@@ -56,15 +54,19 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle,
         struct mds_export_data *med = &req->rq_export->exp_mds_data;
         struct mds_client_data *mcd = med->med_mcd;
         struct obd_device *obd = req->rq_export->exp_obd;
-        int started_handle = 0, err;
+        int err;
         __u64 transno;
         loff_t off;
         ssize_t written;
         ENTRY;
 
-        /* we don't allocate new transnos for replayed requests */
-        if (req->rq_level == LUSTRE_CONN_RECOVD)
-                GOTO(out, rc = rc);
+        /* if the export has already been failed, we have no last_rcvd slot */
+        if (req->rq_export->exp_failed) {
+                CERROR("committing transaction for disconnected client\n");
+                if (handle)
+                        GOTO(commit, rc);
+                GOTO(out, rc);
+        }
 
         if (!handle) {
                 /* if we're starting our own xaction, use our own inode */
@@ -74,15 +76,17 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle,
                         CERROR("fsfilt_start: %ld\n", PTR_ERR(handle));
                         GOTO(out, rc = PTR_ERR(handle));
                 }
-                started_handle = 1;
         }
 
         off = MDS_LR_CLIENT + med->med_off * MDS_LR_SIZE;
 
-        spin_lock(&mds->mds_transno_lock);
-        transno = ++mds->mds_last_transno;
-        spin_unlock(&mds->mds_transno_lock);
-        req->rq_repmsg->transno = req->rq_transno = HTON__u64(transno);
+        transno = req->rq_reqmsg->transno;
+        if (transno == 0) {
+                spin_lock(&mds->mds_transno_lock);
+                transno = ++mds->mds_last_transno;
+                spin_unlock(&mds->mds_transno_lock);
+        }
+        req->rq_repmsg->transno = req->rq_transno = transno;
         mcd->mcd_last_transno = cpu_to_le64(transno);
         mcd->mcd_mount_count = cpu_to_le64(mds->mds_mount_count);
         mcd->mcd_last_xid = cpu_to_le64(req->rq_xid);
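
The transno logic above allocates a fresh transaction number only when the request does not already carry one, so replayed requests keep their original transno. A userspace sketch of that rule, with a pthread mutex standing in for mds_transno_lock:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t last_transno;
static pthread_mutex_t transno_lock = PTHREAD_MUTEX_INITIALIZER;

/* Reuse the transno the request carries (replay); otherwise allocate
 * the next one under the lock. */
static uint64_t assign_transno(uint64_t req_transno)
{
        uint64_t transno = req_transno;

        if (transno == 0) {
                pthread_mutex_lock(&transno_lock);
                transno = ++last_transno;
                pthread_mutex_unlock(&transno_lock);
        }
        return transno;
}

int main(void)
{
        printf("new:    %llu\n", (unsigned long long)assign_transno(0));
        printf("replay: %llu\n", (unsigned long long)assign_transno(7));
        printf("new:    %llu\n", (unsigned long long)assign_transno(0));
        return 0;
}
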
@@ -90,9 +94,8 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle,
         mcd->mcd_last_data = cpu_to_le32(op_data);
 
         fsfilt_set_last_rcvd(req->rq_export->exp_obd, transno, handle,
-                             mds_last_rcvd_cb);
-        written = lustre_fwrite(mds->mds_rcvd_filp, (char *)mcd, sizeof(*mcd),
-                                &off);
+                             mds_commit_cb);
+        written = lustre_fwrite(mds->mds_rcvd_filp, mcd, sizeof(*mcd), &off);
         CDEBUG(D_INODE, "wrote trans "LPU64" client %s at #%u: written = "
                LPSZ"\n", transno, mcd->mcd_uuid, med->med_off, written);
 
@@ -106,7 +109,8 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle,
                 }
         }
 
-        err = fsfilt_commit(obd, i, handle);
+commit:
+        err = fsfilt_commit(obd, i, handle, 0);
         if (err) {
                 CERROR("error committing transaction: %d\n", err);
                 if (!rc)
@@ -125,11 +129,7 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle,
  */
 int mds_fix_attr(struct inode *inode, struct mds_update_record *rec)
 {
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        time_t now = CURRENT_TIME;
-#else
-        time_t now = CURRENT_TIME.tv_sec;
-#endif
+        time_t now = LTIME_S(CURRENT_TIME);
         struct iattr *attr = &rec->ur_iattr;
         unsigned int ia_valid = attr->ia_valid;
         int error;
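
LTIME_S() collapses the removed #if blocks: 2.4 inode times are a plain time_t while 2.5 uses struct timespec, and the macro hides which seconds field gets assigned. A minimal userspace analogue, assuming the timespec layout:

#include <stdio.h>
#include <time.h>

/* On timespec-style kernels the seconds live in .tv_sec; a macro like
 * this lets callers assign the seconds field without #ifdefs. */
#define LTIME_S(time) ((time).tv_sec)

struct fake_iattr {
        struct timespec ia_ctime;
        struct timespec ia_mtime;
};

int main(void)
{
        struct fake_iattr attr;
        time_t now = time(NULL);

        LTIME_S(attr.ia_ctime) = now;
        LTIME_S(attr.ia_mtime) = now;
        printf("ctime = %ld\n", (long)LTIME_S(attr.ia_ctime));
        return 0;
}
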
@@ -142,19 +142,11 @@ int mds_fix_attr(struct inode *inode, struct mds_update_record *rec)
         if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
                 RETURN(-EPERM);
 
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        attr->ia_ctime = now;
-        if (!(ia_valid & ATTR_ATIME_SET))
-                attr->ia_atime = now;
-        if (!(ia_valid & ATTR_MTIME_SET))
-                attr->ia_mtime = now;
-#else
-        attr->ia_ctime.tv_sec = now;
+        LTIME_S(attr->ia_ctime) = now;
         if (!(ia_valid & ATTR_ATIME_SET))
-                attr->ia_atime.tv_sec = now;
+                LTIME_S(attr->ia_atime) = now;
         if (!(ia_valid & ATTR_MTIME_SET))
-                attr->ia_mtime.tv_sec = now;
-#endif
+                LTIME_S(attr->ia_mtime) = now;
 
         /* times */
         if ((ia_valid & (ATTR_MTIME|ATTR_ATIME))==(ATTR_MTIME|ATTR_ATIME) &&
@@ -227,8 +219,8 @@ static void reconstruct_reint_setattr(struct mds_update_record *rec,
         req->rq_transno = mcd->mcd_last_transno;
         req->rq_status = mcd->mcd_last_result;
 
-        if (med->med_outstanding_reply)
-                mds_steal_ack_locks(med, req);
+        if (req->rq_export->exp_outstanding_reply)
+                mds_steal_ack_locks(req->rq_export, req);
 
         de = mds_fid2dentry(obd, rec->ur_fid1, NULL);
         if (IS_ERR(de)) {
@@ -236,7 +228,7 @@ static void reconstruct_reint_setattr(struct mds_update_record *rec,
                 return;
         }
 
-        body = lustre_msg_buf(req->rq_repmsg, 0);
+        body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
         mds_pack_inode2fid(&body->fid1, de->d_inode);
         mds_pack_inode2body(body, de->d_inode);
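
MDS_CHECK_RESENT plus the reconstruct_* helpers give retried requests exactly-once behaviour: the per-client last_rcvd data (last xid, transno, result) is consulted, and a resent request gets its saved reply rebuilt instead of being re-executed. A simplified sketch of the pattern; the structures below are illustrative, not the Lustre ones:

#include <stdint.h>
#include <stdio.h>

/* Per-client replay state, a stand-in for mds_client_data. */
struct client_state {
        uint64_t last_xid;
        uint64_t last_transno;
        int      last_result;
};

struct reply {
        uint64_t transno;
        int      result;
};

static uint64_t global_transno;

/* Execute a request once; if the same xid arrives again (a resend after
 * a lost reply), rebuild the saved reply instead of re-executing. */
static struct reply handle_request(struct client_state *cs, uint64_t xid,
                                   int (*execute)(void))
{
        struct reply rep;

        if (xid != 0 && xid == cs->last_xid) {
                rep.transno = cs->last_transno;
                rep.result  = cs->last_result;
                return rep;
        }
        rep.transno = ++global_transno;
        rep.result  = execute();
        cs->last_xid     = xid;
        cs->last_transno = rep.transno;
        cs->last_result  = rep.result;
        return rep;
}

static int do_mkdir(void) { return 0; }

int main(void)
{
        struct client_state cs = { 0, 0, 0 };
        struct reply r1 = handle_request(&cs, 42, do_mkdir);
        struct reply r2 = handle_request(&cs, 42, do_mkdir);   /* resend */

        printf("transno %llu/%llu rc %d/%d\n",
               (unsigned long long)r1.transno, (unsigned long long)r2.transno,
               r1.result, r2.result);
        return 0;
}
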
 
@@ -262,6 +254,8 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
         int rc = 0, cleanup_phase = 0, err, locked = 0;
         ENTRY;
 
+        LASSERT(offset == 0);
+
         MDS_CHECK_RESENT(req, reconstruct_reint_setattr(rec, offset, req));
 
         if (rec->ur_iattr.ia_valid & ATTR_FROM_OPEN) {
@@ -297,14 +291,14 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset,
                 GOTO(cleanup, rc);
 
         rc = fsfilt_setattr(obd, de, handle, &rec->ur_iattr);
-        if (rc == 0 && S_ISREG(inode->i_mode) &&
-            req->rq_reqmsg->bufcount > 1) {
+        if (rc == 0 &&
+            S_ISREG(inode->i_mode) &&
+            rec->ur_eadata != NULL) {
                 rc = fsfilt_set_md(obd, inode, handle,
-                                   lustre_msg_buf(req->rq_reqmsg, 1),
-                                   req->rq_reqmsg->buflens[1]);
+                                   rec->ur_eadata, rec->ur_eadatalen);
         }
 
-        body = lustre_msg_buf(req->rq_repmsg, 0);
+        body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
         mds_pack_inode2fid(&body->fid1, inode);
         mds_pack_inode2body(body, inode);
 
@@ -343,21 +337,21 @@ static void reconstruct_reint_create(struct mds_update_record *rec, int offset,
         struct mds_obd *obd = &req->rq_export->exp_obd->u.mds;
         struct dentry *parent, *child;
         struct mds_body *body;
-        
+
         req->rq_transno = mcd->mcd_last_transno;
         req->rq_status = mcd->mcd_last_result;
 
-        if (med->med_outstanding_reply)
-                mds_steal_ack_locks(med, req);
-        
+        if (req->rq_export->exp_outstanding_reply)
+                mds_steal_ack_locks(req->rq_export, req);
+
         if (req->rq_status)
                 return;
 
         parent = mds_fid2dentry(obd, rec->ur_fid1, NULL);
         LASSERT(!IS_ERR(parent));
-        child = lookup_one_len(rec->ur_name, parent, rec->ur_namelen - 1);
+        child = ll_lookup_one_len(rec->ur_name, parent, rec->ur_namelen - 1);
         LASSERT(!IS_ERR(child));
-        body = lustre_msg_buf(req->rq_repmsg, offset);
+        body = lustre_msg_buf(req->rq_repmsg, offset, sizeof (*body));
         mds_pack_inode2fid(&body->fid1, child->d_inode);
         mds_pack_inode2body(body, child->d_inode);
         l_dput(parent);
@@ -401,7 +395,7 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
 
         ldlm_lock_dump_handle(D_OTHER, &lockh);
 
-        dchild = lookup_one_len(rec->ur_name, de, rec->ur_namelen - 1);
+        dchild = ll_lookup_one_len(rec->ur_name, de, rec->ur_namelen - 1);
         if (IS_ERR(dchild)) {
                 rc = PTR_ERR(dchild);
                 CERROR("child lookup error %d\n", rc);
@@ -421,8 +415,6 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
 
         if (rec->ur_fid2->id)
                 dchild->d_fsdata = (void *)(unsigned long)rec->ur_fid2->id;
-        else
-                LASSERT(!(rec->ur_opcode & REINT_REPLAYING));
 
         switch (type) {
         case S_IFREG:{
@@ -445,7 +437,10 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                 handle = fsfilt_start(obd, dir, FSFILT_OP_SYMLINK);
                 if (IS_ERR(handle))
                         GOTO(cleanup, rc = PTR_ERR(handle));
-                rc = vfs_symlink(dir, dchild, rec->ur_tgt);
+                if (rec->ur_tgt == NULL)        /* no target supplied */
+                        rc = -EINVAL;           /* -EPROTO? */
+                else
+                        rc = vfs_symlink(dir, dchild, rec->ur_tgt);
                 EXIT;
                 break;
         }
@@ -480,15 +475,9 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                 struct mds_body *body;
 
                 created = 1;
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-                iattr.ia_atime = rec->ur_time;
-                iattr.ia_ctime = rec->ur_time;
-                iattr.ia_mtime = rec->ur_time;
-#else
-                iattr.ia_atime.tv_sec = rec->ur_time;
-                iattr.ia_ctime.tv_sec = rec->ur_time;
-                iattr.ia_mtime.tv_sec = rec->ur_time;
-#endif
+                LTIME_S(iattr.ia_atime) = rec->ur_time;
+                LTIME_S(iattr.ia_ctime) = rec->ur_time;
+                LTIME_S(iattr.ia_mtime) = rec->ur_time;
                 iattr.ia_uid = rec->ur_uid;
                 iattr.ia_gid = rec->ur_gid;
                 iattr.ia_valid = ATTR_UID | ATTR_GID | ATTR_ATIME |
@@ -511,7 +500,7 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
                         /* XXX should we abort here in case of error? */
                 }
 
-                body = lustre_msg_buf(req->rq_repmsg, offset);
+                body = lustre_msg_buf(req->rq_repmsg, offset, sizeof (*body));
                 mds_pack_inode2fid(&body->fid1, inode);
                 mds_pack_inode2body(body, inode);
         }
@@ -519,7 +508,7 @@ static int mds_reint_create(struct mds_update_record *rec, int offset,
 
 cleanup:
         err = mds_finish_transno(mds, dir, handle, req, rc, 0);
-                
+
         if (rc && created) {
                 /* Destroy the file we just created.  This should not need
                  * extra journal credits, as we have already modified all of
@@ -604,7 +593,7 @@ int enqueue_ordered_locks(int lock_mode, struct obd_device *obd,
         rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, res_id[0],
                               LDLM_PLAIN, NULL, 0, lock_mode, &flags,
                               ldlm_completion_ast, mds_blocking_ast, NULL,
-                              NULL, handles[0]);
+                              handles[0]);
         if (rc != ELDLM_OK)
                 RETURN(-EIO);
         ldlm_lock_dump_handle(D_OTHER, handles[0]);
@@ -617,7 +606,7 @@ int enqueue_ordered_locks(int lock_mode, struct obd_device *obd,
                 rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL,
                                       res_id[1], LDLM_PLAIN, NULL, 0, lock_mode,
                                       &flags, ldlm_completion_ast,
-                                      mds_blocking_ast, NULL, 0, handles[1]);
+                                      mds_blocking_ast, NULL, handles[1]);
                 if (rc != ELDLM_OK) {
                         ldlm_lock_decref(handles[0], lock_mode);
                         RETURN(-EIO);
@@ -638,9 +627,9 @@ static void reconstruct_reint_unlink(struct mds_update_record *rec, int offset,
         req->rq_transno = mcd->mcd_last_transno;
         req->rq_status = mcd->mcd_last_result;
 
-        if (med->med_outstanding_reply)
-                mds_steal_ack_locks(med, req);
-        
+        if (req->rq_export->exp_outstanding_reply)
+                mds_steal_ack_locks(req->rq_export, req);
+
         DEBUG_REQ(D_ERROR, req,
                   "can't get EA for reconstructed unlink, leaking OST inodes");
 }
@@ -658,12 +647,13 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
         struct lustre_handle parent_lockh;
         void *handle = NULL;
         struct ldlm_res_id child_res_id = { .name = {0} };
-        char *name;
-        int namelen, rc = 0, flags = 0, return_lock = 0;
+        int rc = 0, flags = 0, return_lock = 0;
         int cleanup_phase = 0;
         ENTRY;
 
-        MDS_CHECK_RESENT(req, reconstruct_reint_unlink(rec, offset, req, 
+        LASSERT(offset == 0 || offset == 2);
+
+        MDS_CHECK_RESENT(req, reconstruct_reint_unlink(rec, offset, req,
                                                        child_lockh));
 
         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_UNLINK))
@@ -680,28 +670,18 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
         cleanup_phase = 1; /* Have parent dentry lock */
 
         /* Step 2: Lookup the child */
-        name = lustre_msg_buf(req->rq_reqmsg, offset + 1);
-        namelen = req->rq_reqmsg->buflens[offset + 1] - 1;
-
-        dchild = lookup_one_len(name, dir_de, namelen);
+        dchild = ll_lookup_one_len(rec->ur_name, dir_de, rec->ur_namelen - 1);
         if (IS_ERR(dchild))
                 GOTO(cleanup, rc = PTR_ERR(dchild));
-        
+
         cleanup_phase = 2; /* child dentry */
 
         child_inode = dchild->d_inode;
         if (child_inode == NULL) {
-                if (rec->ur_opcode & REINT_REPLAYING) {
-                        CDEBUG(D_INODE,
-                               "child missing (%lu/%s); OK for REPLAYING\n",
-                               dir_inode->i_ino, rec->ur_name);
-                        rc = 0;
-                } else {
-                        CDEBUG(D_INODE,
-                               "child doesn't exist (dir %lu, name %s)\n",
-                               dir_inode->i_ino, rec->ur_name);
-                        rc = -ENOENT;
-                }
+                CDEBUG(D_INODE,
+                       "child doesn't exist (dir %lu, name %s)\n",
+                       dir_inode->i_ino, rec->ur_name);
+                rc = -ENOENT;
                 GOTO(cleanup, rc);
         }
 
@@ -715,7 +695,7 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
         rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL,
                               child_res_id, LDLM_PLAIN, NULL, 0, LCK_EX,
                               &flags, ldlm_completion_ast, mds_blocking_ast,
-                              NULL, NULL, child_lockh);
+                              NULL, child_lockh);
         if (rc != ELDLM_OK)
                 GOTO(cleanup, rc);
 
@@ -724,11 +704,12 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
         OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_UNLINK_WRITE,
                        to_kdev_t(dir_inode->i_sb->s_dev));
 
-        /* Slightly magical; see ldlm_intent_policy */
+        /* ldlm_reply in buf[0] if called via intent */
         if (offset)
                 offset = 1;
 
-        body = lustre_msg_buf(req->rq_repmsg, offset);
+        body = lustre_msg_buf(req->rq_repmsg, offset, sizeof (*body));
+        LASSERT(body != NULL);
 
         /* Step 4: Do the unlink: client decides between rmdir/unlink!
          * (bug 72) */
@@ -742,8 +723,7 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
         case S_IFREG:
                 /* If this is the last reference to this inode, get the OBD EA
                  * data first so the client can destroy OST objects */
-                if ((child_inode->i_mode & S_IFMT) == S_IFREG &&
-                    child_inode->i_nlink == 1) {
+                if (S_ISREG(child_inode->i_mode) && child_inode->i_nlink == 1) {
                         mds_pack_inode2fid(&body->fid1, child_inode);
                         mds_pack_inode2body(body, child_inode);
                         mds_pack_md(obd, req->rq_repmsg, offset + 1,
@@ -763,7 +743,7 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset,
                 rc = vfs_unlink(dir_inode, dchild);
                 break;
         default:
-                CERROR("bad file type %o unlinking %s\n", rec->ur_mode, name);
+                CERROR("bad file type %o unlinking %s\n", rec->ur_mode, rec->ur_name);
                 LBUG();
                 GOTO(cleanup, rc = -EINVAL);
         }
@@ -807,11 +787,9 @@ static void reconstruct_reint_link(struct mds_update_record *rec, int offset,
 
         req->rq_transno = mcd->mcd_last_transno;
         req->rq_status = mcd->mcd_last_result;
-        
-        if (med->med_outstanding_reply)
-                mds_steal_ack_locks(med, req);
-        else
-                LBUG(); /* don't support it yet, but it'll be fun! */
+
+        if (req->rq_export->exp_outstanding_reply)
+                mds_steal_ack_locks(req->rq_export, req);
 }
 
 static int mds_reint_link(struct mds_update_record *rec, int offset,
@@ -829,6 +807,8 @@ static int mds_reint_link(struct mds_update_record *rec, int offset,
         int lock_mode = 0, rc = 0, cleanup_phase = 0;
         ENTRY;
 
+        LASSERT(offset == 0);
+
         MDS_CHECK_RESENT(req, reconstruct_reint_link(rec, offset, req));
 
         if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_LINK))
@@ -866,7 +846,7 @@ static int mds_reint_link(struct mds_update_record *rec, int offset,
         cleanup_phase = 3; /* locks */
 
         /* Step 3: Lookup the child */
-        dchild = lookup_one_len(rec->ur_name, de_tgt_dir, rec->ur_namelen - 1);
+        dchild = ll_lookup_one_len(rec->ur_name, de_tgt_dir, rec->ur_namelen-1);
         if (IS_ERR(dchild)) {
                 CERROR("child lookup error %ld\n", PTR_ERR(dchild));
                 GOTO(cleanup, rc = PTR_ERR(dchild));
@@ -875,17 +855,9 @@ static int mds_reint_link(struct mds_update_record *rec, int offset,
         cleanup_phase = 4; /* child dentry */
 
         if (dchild->d_inode) {
-                if (rec->ur_opcode & REINT_REPLAYING) {
-                        /* XXX verify that the link is to the the right file? */
-                        CDEBUG(D_INODE,
-                               "child exists (dir %lu, name %s) (REPLAYING)\n",
-                               de_tgt_dir->d_inode->i_ino, rec->ur_name);
-                        rc = 0;
-                } else {
-                        CDEBUG(D_INODE, "child exists (dir %lu, name %s)\n",
-                               de_tgt_dir->d_inode->i_ino, rec->ur_name);
-                        rc = -EEXIST;
-                }
+                CDEBUG(D_INODE, "child exists (dir %lu, name %s)\n",
+                       de_tgt_dir->d_inode->i_ino, rec->ur_name);
+                rc = -EEXIST;
                 GOTO(cleanup, rc);
         }
 
@@ -944,9 +916,9 @@ static void reconstruct_reint_rename(struct mds_update_record *rec,
 
         req->rq_transno = mcd->mcd_last_transno;
         req->rq_status = mcd->mcd_last_result;
-        
-        if (med->med_outstanding_reply)
-                mds_steal_ack_locks(med, req);
+
+        if (req->rq_export->exp_outstanding_reply)
+                mds_steal_ack_locks(req->rq_export, req);
         else
                 LBUG(); /* don't support it yet, but it'll be fun! */
 
@@ -972,12 +944,14 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset,
         void *handle = NULL;
         ENTRY;
 
+        LASSERT(offset == 0);
+
         MDS_CHECK_RESENT(req, reconstruct_reint_rename(rec, offset, req));
 
         de_srcdir = mds_fid2dentry(mds, rec->ur_fid1, NULL);
         if (IS_ERR(de_srcdir))
                 GOTO(cleanup, rc = PTR_ERR(de_srcdir));
-        
+
         cleanup_phase = 1; /* source directory dentry */
 
         de_tgtdir = mds_fid2dentry(mds, rec->ur_fid2, NULL);
@@ -1014,7 +988,7 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset,
         cleanup_phase = 3; /* parent locks */
 
         /* Step 2: Lookup the children */
-        de_old = lookup_one_len(rec->ur_name, de_srcdir, rec->ur_namelen - 1);
+        de_old = ll_lookup_one_len(rec->ur_name, de_srcdir, rec->ur_namelen-1);
         if (IS_ERR(de_old)) {
                 CERROR("old child lookup error (%*s): %ld\n",
                        rec->ur_namelen - 1, rec->ur_name, PTR_ERR(de_old));
@@ -1031,7 +1005,7 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset,
             de_old->d_inode->i_ino == de_tgtdir->d_inode->i_ino)
                 GOTO(cleanup, rc = -EINVAL);
 
-        de_new = lookup_one_len(rec->ur_tgt, de_tgtdir, rec->ur_tgtlen - 1);
+        de_new = ll_lookup_one_len(rec->ur_tgt, de_tgtdir, rec->ur_tgtlen - 1);
         if (IS_ERR(de_new)) {
                 CERROR("new child lookup error (%*s): %ld\n",
                        rec->ur_tgtlen - 1, rec->ur_tgt, PTR_ERR(de_new));
@@ -1054,7 +1028,7 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset,
                 rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL,
                                       c1_res_id, LDLM_PLAIN, NULL, 0, LCK_EX,
                                       &flags, ldlm_completion_ast,
-                                      mds_blocking_ast, NULL, NULL,
+                                      mds_blocking_ast, NULL,
                                       &(dlm_handles[2]));
                 lock_count = 3;
         } else {
@@ -1150,26 +1124,16 @@ int mds_reint_rec(struct mds_update_record *rec, int offset,
 {
         struct mds_obd *mds = mds_req2mds(req);
         struct obd_run_ctxt saved;
-        struct obd_ucred uc;
-        int realop = rec->ur_opcode & REINT_OPCODE_MASK, rc;
+        int rc;
         ENTRY;
 
-        if (realop < 1 || realop > REINT_MAX) {
-                CERROR("opcode %d not valid (%sREPLAYING)\n", realop,
-                       rec->ur_opcode & REINT_REPLAYING ? "" : "not ");
-                rc = req->rq_status = -EINVAL;
-                RETURN(rc);
-        }
-
-        uc.ouc_fsuid = rec->ur_fsuid;
-        uc.ouc_fsgid = rec->ur_fsgid;
-        uc.ouc_cap = rec->ur_cap;
-        uc.ouc_suppgid1 = rec->ur_suppgid1;
-        uc.ouc_suppgid2 = rec->ur_suppgid2;
+        /* checked by unpacker */
+        LASSERT(rec->ur_opcode <= REINT_MAX &&
+                reinters[rec->ur_opcode] != NULL);
 
-        push_ctxt(&saved, &mds->mds_ctxt, &uc);
-        rc = reinters[realop] (rec, offset, req, lockh);
-        pop_ctxt(&saved, &mds->mds_ctxt, &uc);
+        push_ctxt(&saved, &mds->mds_ctxt, &rec->ur_uc);
+        rc = reinters[rec->ur_opcode] (rec, offset, req, lockh);
+        pop_ctxt(&saved, &mds->mds_ctxt, &rec->ur_uc);
 
         RETURN(rc);
 }
index fb04cc1..7b7c5b9 100644 (file)
@@ -1,18 +1,17 @@
-
 # FIXME: we need to make it clear that obdclass.o depends on
 # lustre_build_version, or 'make -j2' breaks!
 DEFS=
 MODULE = obdclass
 
-if LINUX25
-FSMOD = fsfilt_ext3
-else
+if EXTN
 FSMOD = fsfilt_extN
+else
+FSMOD = fsfilt_ext3
 endif
 
 if LIBLUSTRE
 lib_LIBRARIES = liblustreclass.a
-liblustreclass_a_SOURCES = uuid.c statfs_pack.c genops.c debug.c class_obd.c lustre_handles.c lustre_peer.c lprocfs_status.c
+liblustreclass_a_SOURCES = uuid.c statfs_pack.c genops.c debug.c class_obd.c lustre_handles.c lustre_peer.c lprocfs_status.c simple.c
 
 class_obd.o: lustre_version
 
@@ -24,7 +23,8 @@ else
 modulefs_DATA = lustre_build_version obdclass.o $(FSMOD).o fsfilt_reiserfs.o
 EXTRA_PROGRAMS = obdclass $(FSMOD) fsfilt_reiserfs
 
-obdclass_SOURCES = class_obd.c debug.c genops.c sysctl.c uuid.c lprocfs_status.c lustre_handles.c lustre_peer.c
+obdclass_SOURCES = class_obd.c debug.c genops.c sysctl.c uuid.c simple.c
+obdclass_SOURCES += lprocfs_status.c lustre_handles.c lustre_peer.c
 obdclass_SOURCES += fsfilt.c statfs_pack.c
 endif
 
@@ -33,7 +33,7 @@ include $(top_srcdir)/Rules
 # XXX I'm sure there's some automake mv-if-different helper for this.
 lustre_build_version:
        perl $(top_srcdir)/scripts/version_tag.pl $(top_srcdir) $(top_builddir) > tmpver
-       cmp -z $(top_builddir)/include/linux/lustre_build_version.h tmpver \
+       cmp -s $(top_builddir)/include/linux/lustre_build_version.h tmpver \
                2> /dev/null &&                                            \
                $(RM) tmpver ||                                            \
                mv tmpver $(top_builddir)/include/linux/lustre_build_version.h
index 6209d75..1e180a8 100644 (file)
@@ -51,6 +51,7 @@
 #include <asm/poll.h>
 #include <asm/uaccess.h>
 #include <linux/miscdevice.h>
+#include <linux/smp_lock.h>
 #else
 
 # include <liblustre.h>
@@ -60,7 +61,6 @@
 #include <linux/obd_support.h>
 #include <linux/obd_class.h>
 #include <linux/lustre_debug.h>
-#include <linux/smp_lock.h>
 #include <linux/lprocfs_status.h>
 #include <portals/lib-types.h> /* for PTL_MD_MAX_IOV */
 #include <linux/lustre_build_version.h>
@@ -77,9 +77,11 @@ struct proc_dir_entry *proc_lustre_root = NULL;
 /* The following are visible and mutable through /proc/sys/lustre/. */
 unsigned long obd_fail_loc;
 unsigned long obd_timeout = 100;
-char obd_recovery_upcall[128] = "/usr/lib/lustre/ha_assist";
+unsigned long obd_bulk_timeout = 1;
+char obd_lustre_upcall[128] = "/usr/lib/lustre/lustre_upcall";
 unsigned long obd_sync_filter; /* = 0, don't sync by default */
 
+#ifdef __KERNEL__
 /*  opening /dev/obd */
 static int obd_class_open(struct inode * inode, struct file * file)
 {
@@ -93,10 +95,38 @@ static int obd_class_open(struct inode * inode, struct file * file)
         INIT_LIST_HEAD(&ocus->ocus_conns);
         file->private_data = ocus;
 
-        MOD_INC_USE_COUNT;
+        PORTAL_MODULE_USE;
         RETURN(0);
 }
 
+/*  closing /dev/obd */
+static int obd_class_release(struct inode * inode, struct file * file)
+{
+        struct obd_class_user_state *ocus = file->private_data;
+        struct obd_class_user_conn  *c;
+        ENTRY;
+
+        while (!list_empty (&ocus->ocus_conns)) {
+                c = list_entry (ocus->ocus_conns.next,
+                                struct obd_class_user_conn, ocuc_chain);
+                list_del (&c->ocuc_chain);
+
+                CDEBUG (D_IOCTL, "Auto-disconnect %p\n", &c->ocuc_conn);
+
+                down (&obd_conf_sem);
+                obd_disconnect (&c->ocuc_conn, 0);
+                up (&obd_conf_sem);
+
+                OBD_FREE (c, sizeof (*c));
+        }
+
+        OBD_FREE (ocus, sizeof (*ocus));
+
+        PORTAL_MODULE_UNUSE;
+        RETURN(0);
+}
+#endif
+
 static int
 obd_class_add_user_conn (struct obd_class_user_state *ocus,
                          struct lustre_handle *conn)
@@ -125,7 +155,7 @@ obd_class_remove_user_conn (struct obd_class_user_state *ocus,
 
         list_for_each (e, &ocus->ocus_conns) {
                 c = list_entry (e, struct obd_class_user_conn, ocuc_chain);
-                if (!memcmp (conn, &c->ocuc_conn, sizeof (*conn))) {
+                if (conn->cookie == c->ocuc_conn.cookie) {
                         list_del (&c->ocuc_chain);
                         OBD_FREE (c, sizeof (*c));
                         return;
@@ -133,76 +163,39 @@ obd_class_remove_user_conn (struct obd_class_user_state *ocus,
         }
 }
 
-/*  closing /dev/obd */
-static int obd_class_release(struct inode * inode, struct file * file)
-{
-        struct obd_class_user_state *ocus = file->private_data;
-        struct obd_class_user_conn  *c;
-        ENTRY;
-
-        while (!list_empty (&ocus->ocus_conns)) {
-                c = list_entry (ocus->ocus_conns.next,
-                                struct obd_class_user_conn, ocuc_chain);
-                list_del (&c->ocuc_chain);
-
-                CDEBUG (D_IOCTL, "Auto-disconnect %p\n", &c->ocuc_conn);
-
-                down (&obd_conf_sem);
-                obd_disconnect (&c->ocuc_conn);
-                up (&obd_conf_sem);
-
-                OBD_FREE (c, sizeof (*c));
-        }
-
-        OBD_FREE (ocus, sizeof (*ocus));
-
-        MOD_DEC_USE_COUNT;
-        RETURN(0);
-}
-
 static inline void obd_data2conn(struct lustre_handle *conn,
                                  struct obd_ioctl_data *data)
 {
-        conn->addr = data->ioc_addr;
+        memset(conn, 0, sizeof *conn);
         conn->cookie = data->ioc_cookie;
 }
 
 static inline void obd_conn2data(struct obd_ioctl_data *data,
                                  struct lustre_handle *conn)
 {
-        data->ioc_addr = conn->addr;
         data->ioc_cookie = conn->cookie;
 }
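
With the addr field gone, a lustre_handle is just an opaque cookie that the server must look up in its own table rather than dereference as a pointer from the wire. A toy sketch of cookie-based handle lookup (a fixed array stands in for the real handle hash, rand() for get_random_bytes()):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy cookie->object table: the client only ever sees the cookie, and a
 * stale or forged cookie simply fails to match anything. */
struct handle_entry {
        uint64_t cookie;
        void    *object;
};

#define MAX_HANDLES 16
static struct handle_entry handle_table[MAX_HANDLES];

static uint64_t handle_register(void *object)
{
        int i;

        for (i = 0; i < MAX_HANDLES; i++) {
                if (handle_table[i].cookie == 0) {
                        handle_table[i].cookie = ((uint64_t)rand() << 32) |
                                                 (uint64_t)rand() | 1;
                        handle_table[i].object = object;
                        return handle_table[i].cookie;
                }
        }
        return 0;                       /* table full */
}

static void *handle_lookup(uint64_t cookie)
{
        int i;

        for (i = 0; i < MAX_HANDLES; i++)
                if (handle_table[i].cookie == cookie)
                        return handle_table[i].object;
        return NULL;                    /* stale or forged handle */
}

int main(void)
{
        int obj = 7;
        uint64_t cookie = handle_register(&obj);

        printf("good lookup: %p\n", handle_lookup(cookie));
        printf("bad lookup:  %p\n", handle_lookup(cookie ^ 1));
        return 0;
}
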
 
-static void forcibly_detach_exports(struct obd_device *obd)
+static void dump_exports(struct obd_device *obd)
 {
-        int rc;
         struct list_head *tmp, *n;
-        struct lustre_handle fake_conn;
 
-        CDEBUG(D_IOCTL, "OBD device %d (%p) has exports, "
-               "disconnecting them", obd->obd_minor, obd);
         list_for_each_safe(tmp, n, &obd->obd_exports) {
                 struct obd_export *exp = list_entry(tmp, struct obd_export,
                                                     exp_obd_chain);
-                fake_conn.addr = (__u64)(unsigned long)exp;
-                fake_conn.cookie = exp->exp_cookie;
-                rc = obd_disconnect(&fake_conn);
-                if (rc) {
-                        CDEBUG(D_IOCTL, "disconnecting export %p failed: %d\n",
-                               exp, rc);
-                } else {
-                        CDEBUG(D_IOCTL, "export %p disconnected\n", exp);
-                }
+                CDEBUG(D_ERROR, "%s: %p %s %d %d %p\n",
+                       obd->obd_name, exp, exp->exp_client_uuid.uuid,
+                       atomic_read(&exp->exp_refcount),
+                       exp->exp_failed, exp->exp_outstanding_reply );
         }
 }
 
-
 int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
                        unsigned long arg)
 {
         char *buf = NULL;
         struct obd_ioctl_data *data;
+        struct portals_debug_ioctl_data *debug_data;
         struct obd_device *obd = ocus->ocus_current_obd;
         struct lustre_handle conn;
         int err = 0, len = 0, serialised = 0;
@@ -211,6 +204,14 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
         if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */
                 RETURN(err = -ENOTTY);
 
+        /* only for debugging */
+        if (cmd == PTL_IOC_DEBUG_MASK) {
+                debug_data = (struct portals_debug_ioctl_data*)arg;
+                portal_subsystem_debug = debug_data->subs;
+                portal_debug = debug_data->debug;
+                return 0;
+        }
+
         switch (cmd) {
         case OBD_IOC_BRW_WRITE:
         case OBD_IOC_BRW_READ:
@@ -227,9 +228,9 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
         CDEBUG(D_IOCTL, "cmd = %x, obd = %p\n", cmd, obd);
         if (!obd && cmd != OBD_IOC_DEVICE &&
             cmd != OBD_IOC_LIST && cmd != OBD_GET_VERSION &&
-            cmd != OBD_IOC_NAME2DEV && cmd != OBD_IOC_NEWDEV &&
-            cmd != OBD_IOC_ADD_UUID && cmd != OBD_IOC_DEL_UUID  &&
-            cmd != OBD_IOC_CLOSE_UUID) {
+            cmd != OBD_IOC_NAME2DEV && cmd != OBD_IOC_UUID2DEV &&
+            cmd != OBD_IOC_NEWDEV && cmd != OBD_IOC_ADD_UUID &&
+            cmd != OBD_IOC_DEL_UUID && cmd != OBD_IOC_CLOSE_UUID) {
                 CERROR("OBD ioctl: No device\n");
                 GOTO(out, err = -EINVAL);
         }
@@ -244,12 +245,12 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
                 CDEBUG(D_IOCTL, "\n");
                 if (data->ioc_dev >= MAX_OBD_DEVICES || data->ioc_dev < 0) {
                         CERROR("OBD ioctl: DEVICE insufficient devices\n");
-                        GOTO(out, err=-EINVAL);
+                        GOTO(out, err = -EINVAL);
                 }
                 CDEBUG(D_IOCTL, "device %d\n", data->ioc_dev);
 
                 ocus->ocus_current_obd = &obd_dev[data->ioc_dev];
-                GOTO(out, err=0);
+                GOTO(out, err = 0);
         }
 
         case OBD_IOC_LIST: {
@@ -259,7 +260,7 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
 
                 if (!data->ioc_inlbuf1) {
                         CERROR("No buffer passed!\n");
-                        GOTO(out, err=-EINVAL);
+                        GOTO(out, err = -EINVAL);
                 }
 
 
@@ -270,9 +271,11 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
 
                         if (!obd->obd_type)
                                 continue;
-                        if (obd->obd_flags & OBD_SET_UP)
+                        if (obd->obd_stopping)
+                                status = "ST";
+                        else if (obd->obd_set_up)
                                 status = "UP";
-                        else if (obd->obd_flags & OBD_ATTACHED)
+                        else if (obd->obd_attached)
                                 status = "AT";
                         else
                                 status = "-";
@@ -321,11 +324,11 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
 
                 if (!data->ioc_inllen1 || !data->ioc_inlbuf1 ) {
                         CERROR("No name passed,!\n");
-                        GOTO(out, err=-EINVAL);
+                        GOTO(out, err = -EINVAL);
                 }
-                if (data->ioc_inlbuf1[data->ioc_inllen1-1] !=0) {
+                if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) {
                         CERROR("Name not nul terminated!\n");
-                        GOTO(out, err=-EINVAL);
+                        GOTO(out, err = -EINVAL);
                 }
 
                 CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1);
@@ -334,7 +337,7 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
                 if (dev == -1) {
                         CDEBUG(D_IOCTL, "No device for name %s!\n",
                                data->ioc_inlbuf1);
-                        GOTO(out, err=-EINVAL);
+                        GOTO(out, err = -EINVAL);
                 }
 
                 CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1,
@@ -354,11 +357,11 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
 
                 if (!data->ioc_inllen1 || !data->ioc_inlbuf1) {
                         CERROR("No UUID passed!\n");
-                        GOTO(out, err=-EINVAL);
+                        GOTO(out, err = -EINVAL);
                 }
-                if (data->ioc_inlbuf1[data->ioc_inllen1-1] !=0) {
-                        CERROR("Name not nul terminated!\n");
-                        GOTO(out, err=-EINVAL);
+                if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) {
+                        CERROR("UUID not NUL terminated!\n");
+                        GOTO(out, err = -EINVAL);
                 }
 
                 CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1);
@@ -366,9 +369,9 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
                 dev = class_uuid2dev(&uuid);
                 data->ioc_dev = dev;
                 if (dev == -1) {
-                        CDEBUG(D_IOCTL, "No device for name %s!\n",
+                        CDEBUG(D_IOCTL, "No device for UUID %s!\n",
                                data->ioc_inlbuf1);
-                        GOTO(out, err=-EINVAL);
+                        GOTO(out, err = -EINVAL);
                 }
 
                 CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1,
@@ -379,6 +382,8 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
                 GOTO(out, err);
         }
 
+
+
         case OBD_IOC_NEWDEV: {
                 int dev = -1;
                 int i;
@@ -396,7 +401,7 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
 
                 data->ioc_dev = dev;
                 if (dev == -1)
-                        GOTO(out, err=-EINVAL);
+                        GOTO(out, err = -EINVAL);
 
                 err = copy_to_user((void *)arg, data, sizeof(*data));
                 if (err)
@@ -409,7 +414,7 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
                 int minor, len;
 
                 /* have we attached a type to this device */
-                if (obd->obd_flags & OBD_ATTACHED || obd->obd_type) {
+                if (obd->obd_attached || obd->obd_type) {
                         CERROR("OBD: Device %d already typed as %s.\n",
                                obd->obd_minor, MKSTR(obd->obd_type->typ_name));
                         GOTO(out, err = -EBUSY);
@@ -419,7 +424,7 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
                         CERROR("No type passed!\n");
                         GOTO(out, err = -EINVAL);
                 }
-                if (data->ioc_inlbuf1[data->ioc_inllen1-1] !=0) {
+                if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) {
                         CERROR("Type not nul terminated!\n");
                         GOTO(out, err = -EINVAL);
                 }
@@ -427,6 +432,19 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
                         CERROR("No name passed!\n");
                         GOTO(out, err = -EINVAL);
                 }
+                if (data->ioc_inlbuf2[data->ioc_inllen2 - 1] != 0) {
+                        CERROR("Name not nul terminated!\n");
+                        GOTO(out, err = -EINVAL);
+                }
+                if (!data->ioc_inllen3 || !data->ioc_inlbuf3) {
+                        CERROR("No UUID passed!\n");
+                        GOTO(out, err = -EINVAL);
+                }
+                if (data->ioc_inlbuf3[data->ioc_inllen3 - 1] != 0) {
+                        CERROR("UUID not nul terminated!\n");
+                        GOTO(out, err = -EINVAL);
+                }
+
                 CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n",
                        MKSTR(data->ioc_inlbuf1),
                        MKSTR(data->ioc_inlbuf2), MKSTR(data->ioc_inlbuf3));
@@ -445,6 +463,7 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
                 INIT_LIST_HEAD(&obd->obd_exports);
                 INIT_LIST_HEAD(&obd->obd_imports);
                 spin_lock_init(&obd->obd_dev_lock);
+                init_waitqueue_head(&obd->obd_refcount_waitq);
 
                 /* XXX belong ins setup not attach  */
                 /* recovery data */
@@ -453,6 +472,8 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
                 INIT_LIST_HEAD(&obd->obd_recovery_queue);
                 INIT_LIST_HEAD(&obd->obd_delayed_reply_queue);
 
+                init_waitqueue_head(&obd->obd_commit_waitq);
+
                 len = strlen(data->ioc_inlbuf2) + 1;
                 OBD_ALLOC(obd->obd_name, len);
                 if (!obd->obd_name) {
@@ -462,20 +483,19 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
                 }
                 memcpy(obd->obd_name, data->ioc_inlbuf2, len);
 
-                if (data->ioc_inlbuf3) {
-                        int len = strlen(data->ioc_inlbuf3);
-                        if (len >= sizeof(obd->obd_uuid)) {
-                                CERROR("uuid must be < "LPSZ" bytes long\n",
-                                       sizeof(obd->obd_uuid));
-                                if (obd->obd_name)
-                                        OBD_FREE(obd->obd_name,
-                                                 strlen(obd->obd_name) + 1);
-                                class_put_type(obd->obd_type);
-                                obd->obd_type = NULL;
-                                GOTO(out, err=-EINVAL);
-                        }
-                        memcpy(obd->obd_uuid.uuid, data->ioc_inlbuf3, len);
+                len = strlen(data->ioc_inlbuf3);
+                if (len >= sizeof(obd->obd_uuid)) {
+                        CERROR("uuid must be < "LPSZ" bytes long\n",
+                               sizeof(obd->obd_uuid));
+                        if (obd->obd_name)
+                                OBD_FREE(obd->obd_name,
+                                         strlen(obd->obd_name) + 1);
+                        class_put_type(obd->obd_type);
+                        obd->obd_type = NULL;
+                        GOTO(out, err = -EINVAL);
                 }
+                memcpy(obd->obd_uuid.uuid, data->ioc_inlbuf3, len);
+
                 /* do the attach */
                 if (OBP(obd, attach))
                         err = OBP(obd,attach)(obd, sizeof(*data), data);
@@ -486,7 +506,7 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
                         class_put_type(obd->obd_type);
                         obd->obd_type = NULL;
                 } else {
-                        obd->obd_flags |= OBD_ATTACHED;
+                        obd->obd_attached = 1;
 
                         type->typ_refcnt++;
                         CDEBUG(D_IOCTL, "OBD: dev %d attached type %s\n",
@@ -498,13 +518,13 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
 
         case OBD_IOC_DETACH: {
                 ENTRY;
-                if (obd->obd_flags & OBD_SET_UP) {
+                if (obd->obd_set_up) {
                         CERROR("OBD device %d still set up\n", obd->obd_minor);
-                        GOTO(out, err=-EBUSY);
+                        GOTO(out, err = -EBUSY);
                 }
-                if (!(obd->obd_flags & OBD_ATTACHED) ) {
+                if (!obd->obd_attached) {
                         CERROR("OBD device %d not attached\n", obd->obd_minor);
-                        GOTO(out, err=-ENODEV);
+                        GOTO(out, err = -ENODEV);
                 }
                 if (OBP(obd, detach))
                         err = OBP(obd,detach)(obd);
@@ -514,7 +534,7 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
                         obd->obd_name = NULL;
                 }
 
-                obd->obd_flags &= ~OBD_ATTACHED;
+                obd->obd_attached = 0;
                 obd->obd_type->typ_refcnt--;
                 class_put_type(obd->obd_type);
                 obd->obd_type = NULL;
@@ -523,49 +543,106 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
 
         case OBD_IOC_SETUP: {
                 /* have we attached a type to this device? */
-                if (!(obd->obd_flags & OBD_ATTACHED)) {
+                if (!obd->obd_attached) {
                         CERROR("Device %d not attached\n", obd->obd_minor);
-                        GOTO(out, err=-ENODEV);
+                        GOTO(out, err = -ENODEV);
                 }
 
                 /* has this been done already? */
-                if ( obd->obd_flags & OBD_SET_UP ) {
+                if (obd->obd_set_up) {
                         CERROR("Device %d already setup (type %s)\n",
                                obd->obd_minor, obd->obd_type->typ_name);
-                        GOTO(out, err=-EBUSY);
+                        GOTO(out, err = -EBUSY);
                 }
 
+                atomic_set(&obd->obd_refcount, 0);
+
                 if ( OBT(obd) && OBP(obd, setup) )
                         err = obd_setup(obd, sizeof(*data), data);
 
                 if (!err) {
                         obd->obd_type->typ_refcnt++;
-                        obd->obd_flags |= OBD_SET_UP;
+                        obd->obd_set_up = 1;
+                        atomic_inc(&obd->obd_refcount);
                 }
 
                 GOTO(out, err);
         }
         case OBD_IOC_CLEANUP: {
-                /* have we attached a type to this device? */
-                if (!(obd->obd_flags & OBD_ATTACHED)) {
-                        CERROR("Device %d not attached\n", obd->obd_minor);
-                        GOTO(out, err=-ENODEV);
-                }
-                if (!list_empty(&obd->obd_exports)) {
-                        if (!data->ioc_inlbuf1 || data->ioc_inlbuf1[0] != 'F') {
-                                CERROR("OBD device %d (%p) has exports\n",
-                                       obd->obd_minor, obd);
+                int force = 0, failover = 0;
+                char * flag;
+
+                if (!obd->obd_set_up) {
+                        CERROR("Device %d not setup\n", obd->obd_minor);
+                        GOTO(out, err = -ENODEV);
+                }
+
+                if (data->ioc_inlbuf1) {
+                        for (flag = data->ioc_inlbuf1; *flag != 0; flag++)
+                                switch (*flag) {
+                                case 'F':
+                                        force = 1;
+                                        break;
+                                case 'A':
+                                        failover = 1;
+                                        break;
+                                default:
+                                        CERROR("unrecognised flag '%c'\n", 
+                                               *flag);
+                                }
+                }
+                
+                if (atomic_read(&obd->obd_refcount) == 1 || force) {
+                        /* this will stop new connections, and need to
+                           do it before class_disconnect_exports() */
+                        obd->obd_stopping = 1;
+                }
+
+                if (atomic_read(&obd->obd_refcount) > 1) {
+                        struct l_wait_info lwi = LWI_TIMEOUT_INTR(60 * HZ, NULL,
+                                                                  NULL, NULL);
+                        int rc;
+                        
+                        if (!force) {
+                                CERROR("OBD device %d (%p) has refcount %d\n",
+                                       obd->obd_minor, obd, 
+                                       atomic_read(&obd->obd_refcount));
+                                dump_exports(obd);
                                 GOTO(out, err = -EBUSY);
                         }
-                        forcibly_detach_exports(obd);
+                        class_disconnect_exports(obd, failover);
+                        CDEBUG(D_IOCTL, 
+                               "%s: waiting for obd refs to go away: %d\n", 
+                               obd->obd_name, atomic_read(&obd->obd_refcount));
+                
+                        rc = l_wait_event(obd->obd_refcount_waitq,
+                                     atomic_read(&obd->obd_refcount) < 2, &lwi);
+                        if (rc == 0) {
+                                LASSERT(atomic_read(&obd->obd_refcount) == 1);
+                        } else {
+                                CERROR("wait cancelled, cleaning up anyway; "
+                                       "refcount: %d\n",
+                                       atomic_read(&obd->obd_refcount));
+                                dump_exports(obd);
+                        }
+                        CDEBUG(D_IOCTL, "%s: awake, now finishing cleanup\n", 
+                               obd->obd_name);
                 }
+
                 if (OBT(obd) && OBP(obd, cleanup))
-                        err = obd_cleanup(obd);
+                        err = obd_cleanup(obd, force, failover);
 
                 if (!err) {
-                        obd->obd_flags &= ~OBD_SET_UP;
+                        obd->obd_set_up = obd->obd_stopping = 0;
                         obd->obd_type->typ_refcnt--;
+                        atomic_dec(&obd->obd_refcount);
+                        /* XXX this should be an LASSERT */
+                        if (atomic_read(&obd->obd_refcount) > 0) 
+                                CERROR("%s still has refcount %d after "
+                                       "cleanup.\n", obd->obd_name,
+                                       atomic_read(&obd->obd_refcount));
                 }
+
                 GOTO(out, err);
         }
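
The new OBD_IOC_CLEANUP path sets obd_stopping to refuse new references, then waits for the refcount to drain down to the single reference held by setup before tearing the device down. A userspace analogue, with a mutex and condition variable standing in for obd_refcount_waitq and l_wait_event():

#include <pthread.h>
#include <stdio.h>

/* Stop new users, then sleep until the count drops to the one reference
 * held by setup itself before doing the real teardown. */
struct device {
        int             refcount;
        int             stopping;
        pthread_mutex_t lock;
        pthread_cond_t  drained;
};

static void device_put(struct device *dev)
{
        pthread_mutex_lock(&dev->lock);
        if (--dev->refcount < 2)
                pthread_cond_signal(&dev->drained);
        pthread_mutex_unlock(&dev->lock);
}

static void device_cleanup(struct device *dev)
{
        pthread_mutex_lock(&dev->lock);
        dev->stopping = 1;              /* refuse new references */
        while (dev->refcount > 1)
                pthread_cond_wait(&dev->drained, &dev->lock);
        pthread_mutex_unlock(&dev->lock);
        /* ... actual teardown would go here ... */
}

int main(void)
{
        struct device dev = { 2, 0, PTHREAD_MUTEX_INITIALIZER,
                              PTHREAD_COND_INITIALIZER };

        device_put(&dev);               /* last user goes away */
        device_cleanup(&dev);           /* returns immediately now */
        printf("cleaned up, refcount %d\n", dev.refcount);
        return 0;
}
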
 
@@ -573,24 +650,24 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
                 struct obd_uuid cluuid = { "OBD_CLASS_UUID" };
                 obd_data2conn(&conn, data);
 
-                err = obd_connect(&conn, obd, &cluuid, NULL, NULL);
+                err = obd_connect(&conn, obd, &cluuid);
 
-                CDEBUG(D_IOCTL, "assigned export "LPX64"\n", conn.addr);
+                CDEBUG(D_IOCTL, "assigned export "LPX64"\n", conn.cookie);
                 obd_conn2data(data, &conn);
                 if (err)
                         GOTO(out, err);
 
                 err = obd_class_add_user_conn (ocus, &conn);
                 if (err != 0) {
-                        obd_disconnect (&conn);
+                        obd_disconnect (&conn, 0);
                         GOTO (out, err);
                 }
 
                 err = copy_to_user((void *)arg, data, sizeof(*data));
                 if (err != 0) {
                         obd_class_remove_user_conn (ocus, &conn);
-                        obd_disconnect (&conn);
-                        GOTO (out, err=-EFAULT);
+                        obd_disconnect (&conn, 0);
+                        GOTO (out, err = -EFAULT);
                 }
                 GOTO(out, err);
         }
@@ -598,19 +675,19 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
         case OBD_IOC_DISCONNECT: {
                 obd_data2conn(&conn, data);
                 obd_class_remove_user_conn (ocus, &conn);
-                err = obd_disconnect(&conn);
+                err = obd_disconnect(&conn, 0);
                 GOTO(out, err);
         }
 
         case OBD_IOC_NO_TRANSNO: {
-                if (!(obd->obd_flags & OBD_ATTACHED)) {
+                if (!obd->obd_attached) {
                         CERROR("Device %d not attached\n", obd->obd_minor);
-                        GOTO(out, err=-ENODEV);
+                        GOTO(out, err = -ENODEV);
                 }
                 CDEBUG(D_IOCTL,
                        "disabling committed-transno notifications on %d\n",
                        obd->obd_minor);
-                obd->obd_flags |= OBD_NO_TRANSNO;
+                obd->obd_no_transno = 1;
                 GOTO(out, err = 0);
         }
 
@@ -654,11 +731,11 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd,
 
  out:
         if (buf)
-                OBD_FREE(buf, len);
+                obd_ioctl_freedata(buf, len);
         if (serialised)
                 up(&obd_conf_sem);
         RETURN(err);
-} /* obd_class_ioctl */
+} /* class_handle_ioctl */
 
 
 
@@ -688,86 +765,20 @@ static struct miscdevice obd_psdev = {
 void *obd_psdev = NULL;
 #endif
 
-void (*class_signal_connection_failure)(struct ptlrpc_connection *);
-
-#ifdef CONFIG_HIGHMEM
-/* Allow at most 3/4 of the kmap mappings to be consumed by vector I/O
- * requests.  This avoids deadlocks on servers which have a lot of clients
- * doing vector I/O.  We don't need to do this for non-vector I/O requests
- * because singleton requests will just block on the kmap itself and never
- * deadlock waiting for additional kmaps to complete.
- *
- * If we are a "server" task, we can have at most a single reservation
- * in excess of the maximum.  This avoids a deadlock when multiple client
- * threads are on the same machine as the server threads, and the clients
- * have consumed all of the available mappings.  As long as a single server
- * thread is can make progress, we are guaranteed to avoid deadlock.
- */
-#define OBD_KMAP_MAX (LAST_PKMAP * 3 / 4)
-static atomic_t obd_kmap_count = ATOMIC_INIT(OBD_KMAP_MAX);
-static DECLARE_WAIT_QUEUE_HEAD(obd_kmap_waitq);
-
-void obd_kmap_get(int count, int server)
-{
-        //CERROR("getting %d kmap counts (%d/%d)\n", count,
-        //       atomic_read(&obd_kmap_count), OBD_KMAP_MAX);
-        if (count == 1)
-                atomic_dec(&obd_kmap_count);
-        else while (atomic_add_negative(-count, &obd_kmap_count)) {
-                struct l_wait_info lwi = { 0 };
-                static long next_show = 0;
-                static int skipped = 0;
-
-                if (server && atomic_read(&obd_kmap_count) >= -PTL_MD_MAX_IOV)
-                        break;
-
-                CDEBUG(D_OTHER, "negative kmap reserved count: %d\n",
-                       atomic_read(&obd_kmap_count));
-                atomic_add(count, &obd_kmap_count);
-
-                if (time_after(jiffies, next_show)) {
-                        CERROR("blocking %s (and %d others) for kmaps\n",
-                               current->comm, skipped);
-                        next_show = jiffies + 5*HZ;
-                        skipped = 0;
-                } else
-                        skipped++;
-                l_wait_event(obd_kmap_waitq,
-                             atomic_read(&obd_kmap_count) >= count, &lwi);
-        }
-}
-
-void obd_kmap_put(int count)
-{
-        atomic_add(count, &obd_kmap_count);
-        /* Wake up sleepers.  Sadly, this wakes up all of the tasks at once.
-         * We could have something smarter here like:
-        while (atomic_read(&obd_kmap_count) > 0)
-                wake_up_nr(obd_kmap_waitq, 1);
-        although we would need to set somewhere (probably obd_class_init):
-        obd_kmap_waitq.flags |= WQ_FLAG_EXCLUSIVE;
-        For now the wait_event() condition will handle this OK I believe.
-         */
-        if (atomic_read(&obd_kmap_count) > 0)
-                wake_up(&obd_kmap_waitq);
-}
-
-EXPORT_SYMBOL(obd_kmap_get);
-EXPORT_SYMBOL(obd_kmap_put);
-#endif
-
 EXPORT_SYMBOL(obd_dev);
 EXPORT_SYMBOL(obdo_cachep);
 EXPORT_SYMBOL(obd_memory);
 EXPORT_SYMBOL(obd_memmax);
 EXPORT_SYMBOL(obd_fail_loc);
 EXPORT_SYMBOL(obd_timeout);
-EXPORT_SYMBOL(obd_recovery_upcall);
+EXPORT_SYMBOL(obd_lustre_upcall);
 EXPORT_SYMBOL(obd_sync_filter);
 EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
 EXPORT_SYMBOL(ptlrpc_abort_inflight_superhack);
 EXPORT_SYMBOL(proc_lustre_root);
 
+EXPORT_SYMBOL(lctl_fake_uuid);
+
 EXPORT_SYMBOL(class_register_type);
 EXPORT_SYMBOL(class_unregister_type);
 EXPORT_SYMBOL(class_get_type);
@@ -775,19 +786,26 @@ EXPORT_SYMBOL(class_put_type);
 EXPORT_SYMBOL(class_name2dev);
 EXPORT_SYMBOL(class_uuid2dev);
 EXPORT_SYMBOL(class_uuid2obd);
+EXPORT_SYMBOL(class_export_get);
+EXPORT_SYMBOL(class_export_put);
 EXPORT_SYMBOL(class_new_export);
-EXPORT_SYMBOL(class_destroy_export);
+EXPORT_SYMBOL(class_unlink_export);
+EXPORT_SYMBOL(class_import_get);
+EXPORT_SYMBOL(class_import_put);
+EXPORT_SYMBOL(class_new_import);
+EXPORT_SYMBOL(class_destroy_import);
 EXPORT_SYMBOL(class_connect);
 EXPORT_SYMBOL(class_conn2export);
 EXPORT_SYMBOL(class_conn2obd);
 EXPORT_SYMBOL(class_conn2cliimp);
 EXPORT_SYMBOL(class_conn2ldlmimp);
 EXPORT_SYMBOL(class_disconnect);
-EXPORT_SYMBOL(class_disconnect_all);
-EXPORT_SYMBOL(class_uuid_unparse);
+EXPORT_SYMBOL(class_disconnect_exports);
 EXPORT_SYMBOL(lustre_uuid_to_peer);
 
-EXPORT_SYMBOL(class_signal_connection_failure);
+/* uuid.c */
+EXPORT_SYMBOL(class_uuid_unparse);
+EXPORT_SYMBOL(client_tgtuuid2obd);
 
 EXPORT_SYMBOL(class_handle_hash);
 EXPORT_SYMBOL(class_handle_unhash);
@@ -851,7 +869,7 @@ static void cleanup_obdclass(void)
         misc_deregister(&obd_psdev);
         for (i = 0; i < MAX_OBD_DEVICES; i++) {
                 struct obd_device *obd = &obd_dev[i];
-                if (obd->obd_type && (obd->obd_flags & OBD_SET_UP) &&
+                if (obd->obd_type && obd->obd_set_up &&
                     OBT(obd) && OBP(obd, detach)) {
                         /* XXX should this call generic detach otherwise? */
                         OBP(obd, detach)(obd);
@@ -879,14 +897,15 @@ static void cleanup_obdclass(void)
  * kernel patch */
 #ifdef __KERNEL__
 #include <linux/lustre_version.h>
-#define LUSTRE_SOURCE_VERSION 13
-#if (LUSTRE_KERNEL_VERSION < LUSTRE_SOURCE_VERSION)
+#define LUSTRE_MIN_VERSION 18
+#define LUSTRE_MAX_VERSION 19
+#if (LUSTRE_KERNEL_VERSION < LUSTRE_MIN_VERSION)
 # error Cannot continue: Your Lustre kernel patch is older than the sources
-#elif (LUSTRE_KERNEL_VERSION > LUSTRE_SOURCE_VERSION)
+#elif (LUSTRE_KERNEL_VERSION > LUSTRE_MAX_VERSION)
 # error Cannot continue: Your Lustre sources are older than the kernel patch
 #endif
-#else
-#warning "Lib Lustre - no versioning information"
+ #else
+# warning "Lib Lustre - no versioning information"
 #endif
 
 #ifdef __KERNEL__
index 6118084..f824b98 100644 (file)
@@ -1,20 +1,31 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- * Helper routines for dumping data structs for debugging.
+ *  Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
  *
- * This code is issued under the GNU General Public License.
- * See the file COPYING in this distribution
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
  *
- * Copryright (C) 2002 Cluster File Systems, Inc.
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
+ * Helper routines for dumping data structs for debugging.
  */
 
 #define DEBUG_SUBSYSTEM D_OTHER
 
 #define EXPORT_SYMTAB
 #ifndef __KERNEL__
-#include <liblustre.h>
+# include <liblustre.h>
 #endif
 
 #include <linux/obd_ost.h>
 
 int dump_ioo(struct obd_ioobj *ioo)
 {
-        CERROR("obd_ioobj: ioo_id="LPD64", ioo_gr="LPD64", ioo_type=%d, ioo_bufct=%d\n",
+        CERROR("obd_ioobj: ioo_id="LPD64", ioo_gr="LPD64", ioo_type=%d, "
+               "ioo_bufct=%d\n",
                ioo->ioo_id, ioo->ioo_gr, ioo->ioo_type, ioo->ioo_bufcnt);
         return -EINVAL;
 }
 
 int dump_lniobuf(struct niobuf_local *nb)
 {
-        CERROR("niobuf_local: addr=%p, offset="LPD64", len=%d, xid=%d, page=%p\n",
-               nb->addr, nb->offset, nb->len, nb->xid, nb->page);
+        CERROR("niobuf_local: offset="LPD64", len=%d, page=%p, rc=%d\n",
+               nb->offset, nb->len, nb->page, nb->rc);
         CERROR("nb->page: index = %ld\n", nb->page ? nb->page->index : -1);
 
         return -EINVAL;
@@ -40,8 +52,8 @@ int dump_lniobuf(struct niobuf_local *nb)
 
 int dump_rniobuf(struct niobuf_remote *nb)
 {
-        CERROR("niobuf_remote: offset="LPD64", len=%d, flags=%x, xid=%d\n",
-               nb->offset, nb->len, nb->flags, nb->xid);
+        CERROR("niobuf_remote: offset="LPU64", len=%d, flags=%x\n",
+               nb->offset, nb->len, nb->flags);
 
         return -EINVAL;
 }
@@ -104,8 +116,8 @@ int page_debug_setup(void *addr, int len, __u64 off, __u64 id)
 {
         LASSERT(addr);
 
-        off = HTON__u64(off);
-        id = HTON__u64(id);
+        off = cpu_to_le64 (off);
+        id = cpu_to_le64 (id);
         memcpy(addr, (char *)&off, LPDS);
         memcpy(addr + LPDS, (char *)&id, LPDS);
 
@@ -123,28 +135,28 @@ int page_debug_check(char *who, void *addr, int end, __u64 off, __u64 id)
 
         LASSERT(addr);
 
-        ne_off = HTON__u64(off);
-        id = HTON__u64(id);
+        ne_off = le64_to_cpu (off);
+        id = le64_to_cpu (id);
         if (memcmp(addr, (char *)&ne_off, LPDS)) {
-                CERROR("%s: id "LPU64" offset "LPU64" off: "LPX64" != "LPX64"\n",
-                       who, id, off, *(__u64 *)addr, ne_off);
+                CERROR("%s: id "LPX64" offset "LPU64" off: "LPX64" != "
+                       LPX64"\n", who, id, off, *(__u64 *)addr, ne_off);
                 err = -EINVAL;
         }
         if (memcmp(addr + LPDS, (char *)&id, LPDS)) {
-                CERROR("%s: id "LPU64" offset "LPU64" id: "LPX64" != "LPX64"\n",
+                CERROR("%s: id "LPX64" offset "LPU64" id: "LPX64" != "LPX64"\n",
                        who, id, off, *(__u64 *)(addr + LPDS), id);
                 err = -EINVAL;
         }
 
         addr += end - LPDS - LPDS;
         if (memcmp(addr, (char *)&ne_off, LPDS)) {
-                CERROR("%s: id "LPU64" offset "LPU64" end off: "LPX64" != "LPX64"\n",
-                       who, id, off, *(__u64 *)addr, ne_off);
+                CERROR("%s: id "LPX64" offset "LPU64" end off: "LPX64" != "
+                       LPX64"\n", who, id, off, *(__u64 *)addr, ne_off);
                 err = -EINVAL;
         }
         if (memcmp(addr + LPDS, (char *)&id, LPDS)) {
-                CERROR("%s: id "LPU64" offset "LPU64" end id: "LPX64" != "LPX64"\n",
-                       who, id, off, *(__u64 *)(addr + LPDS), id);
+                CERROR("%s: id "LPX64" offset "LPU64" end id: "LPX64" != "
+                       LPX64"\n", who, id, off, *(__u64 *)(addr + LPDS), id);
                 err = -EINVAL;
         }
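
page_debug_setup()/page_debug_check() above stamp a fixed-endian offset and object id at both ends of a page so that corruption found later can be attributed to a specific object and offset. A small, self-contained sketch of the same stamp-and-verify idea; it uses host byte order and plain memcpy, whereas the code above converts with cpu_to_le64() first:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define LPDS sizeof(uint64_t)

/* stamp offset+id at the start and at the very end of the buffer */
static void stamp(void *addr, size_t len, uint64_t off, uint64_t id)
{
        memcpy(addr, &off, LPDS);
        memcpy((char *)addr + LPDS, &id, LPDS);
        memcpy((char *)addr + len - 2 * LPDS, &off, LPDS);
        memcpy((char *)addr + len - LPDS, &id, LPDS);
}

/* verify the leading offset and the trailing id, as page_debug_check does */
static int check(const void *addr, size_t len, uint64_t off, uint64_t id)
{
        uint64_t v;

        memcpy(&v, addr, LPDS);
        if (v != off)
                return -1;
        memcpy(&v, (const char *)addr + len - LPDS, LPDS);
        if (v != id)
                return -1;
        return 0;
}

int main(void)
{
        char page[4096];

        stamp(page, sizeof(page), 0x1000, 42);
        printf("check: %d\n", check(page, sizeof(page), 0x1000, 42)); /* 0 */
        return 0;
}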
 
index 07ce0b3..4357b79 100644 (file)
@@ -38,7 +38,7 @@ int fsfilt_register_ops(struct fsfilt_operations *fs_ops)
                         RETURN(-EEXIST);
                 }
         } else {
-               MOD_INC_USE_COUNT;
+                PORTAL_MODULE_USE;
                list_add(&fs_ops->fs_list, &fsfilt_types);
        }
 
@@ -57,7 +57,7 @@ void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops)
                 found = list_entry(p, typeof(*found), fs_list);
                 if (found == fs_ops) {
                         list_del(p);
-                        MOD_DEC_USE_COUNT;
+                        PORTAL_MODULE_UNUSE;
                         break;
                 }
         }
index 72f2830..a02f1f5 100644 (file)
@@ -4,7 +4,7 @@
  *  lustre/lib/fsfilt_ext3.c
  *  Lustre filesystem abstraction routines
  *
- *  Copyright (C) 2002 Cluster File Systems, Inc.
+ *  Copyright (C) 2002, 2003 Cluster File Systems, Inc.
  *   Author: Andreas Dilger <adilger@clusterfs.com>
  *
  *   This file is part of Lustre, http://www.lustre.org.
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-//#error "FIXME: this needs to be updated to match fsfilt_extN.c"
-
 #define DEBUG_SUBSYSTEM S_FILTER
 
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/slab.h>
-#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
 #include <linux/ext3_fs.h>
 #include <linux/ext3_jbd.h>
-#include <linux/version.h>
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-# include <linux/ext3_xattr.h>
-#else
-# include <asm/statfs.h>
-#endif
+#include <linux/ext3_xattr.h>
 #include <linux/kp30.h>
 #include <linux/lustre_fsfilt.h>
 #include <linux/obd.h>
+#include <linux/obd_class.h>
 #include <linux/module.h>
 
 static kmem_cache_t *fcb_cache;
@@ -75,18 +70,21 @@ static void *fsfilt_ext3_start(struct inode *inode, int op)
                 nblocks += EXT3_DELETE_TRANS_BLOCKS;
                 break;
         case FSFILT_OP_RENAME:
-                /* We may be modifying two directories */
+                /* modify additional directory */
                 nblocks += EXT3_DATA_TRANS_BLOCKS;
+                /* no break */
         case FSFILT_OP_SYMLINK:
-                /* Possible new block + block bitmap + GDT for long symlink */
+                /* additional block + block bitmap + GDT for long symlink */
                 nblocks += 3;
+                /* no break */
         case FSFILT_OP_CREATE:
         case FSFILT_OP_MKDIR:
         case FSFILT_OP_MKNOD:
-                /* New inode + block bitmap + GDT for new file */
+                /* modify one inode + block bitmap + GDT */
                 nblocks += 3;
+                /* no break */
         case FSFILT_OP_LINK:
-                /* Change parent directory */
+                /* modify parent directory */
                 nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS;
                 break;
         case FSFILT_OP_SETATTR:
@@ -97,6 +95,7 @@ static void *fsfilt_ext3_start(struct inode *inode, int op)
                  LBUG();
         }
 
+        LASSERT(!current->journal_info);
         lock_kernel();
         handle = journal_start(EXT3_JOURNAL(inode), nblocks);
         unlock_kernel();
@@ -104,12 +103,135 @@ static void *fsfilt_ext3_start(struct inode *inode, int op)
         return handle;
 }
 
-static int fsfilt_ext3_commit(struct inode *inode, void *handle)
+/*
+ * Calculate the number of buffer credits needed to write multiple pages in
+ * a single ext3 transaction.  No, this shouldn't be here, but as yet ext3
+ * doesn't have a nice API for calculating this sort of thing in advance.
+ *
+ * See comment above ext3_writepage_trans_blocks for details.  We assume
+ * no data journaling is being done, but it does allow for all of the pages
+ * being non-contiguous.  If we are guaranteed contiguous pages we could
+ * reduce the number of (d)indirect blocks a lot.
+ *
+ * With N blocks per page and P pages, for each inode we have at most:
+ * N*P indirect
+ * min(N*P, blocksize/4 + 1) dindirect blocks
+ * niocount tindirect
+ *
+ * For the entire filesystem, we have at most:
+ * min(sum(nindir + P), ngroups) bitmap blocks (from the above)
+ * min(sum(nindir + P), gdblocks) group descriptor blocks (from the above)
+ * objcount inode blocks
+ * 1 superblock
+ * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
+ * 
+ * 1 EXT3_DATA_TRANS_BLOCKS for the last_rcvd update.
+ */
+static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso)
+{
+        struct super_block *sb = fso->fso_dentry->d_inode->i_sb;
+        int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
+        int addrpp = EXT3_ADDR_PER_BLOCK(sb) * blockpp;
+        int nbitmaps = 0;
+        int ngdblocks = 0;
+        int needed = objcount + 1;
+        int i;
+
+        for (i = 0; i < objcount; i++, fso++) {
+                int nblocks = fso->fso_bufcnt * blockpp;
+                int ndindirect = min(nblocks, addrpp + 1);
+                int nindir = nblocks + ndindirect + 1;
+
+                nbitmaps += nindir + nblocks;
+                ngdblocks += nindir + nblocks;
+
+                needed += nindir;
+        }
+
+        /* Assumes ext3 and extN have the same sb_info layout at the start. */
+        if (nbitmaps > EXT3_SB(sb)->s_groups_count)
+                nbitmaps = EXT3_SB(sb)->s_groups_count;
+        if (ngdblocks > EXT3_SB(sb)->s_gdb_count)
+                ngdblocks = EXT3_SB(sb)->s_gdb_count;
+
+        needed += nbitmaps + ngdblocks;
+        
+        /* last_rcvd update */
+        needed += EXT3_DATA_TRANS_BLOCKS;
+
+#ifdef CONFIG_QUOTA
+        /* We assume that there will be 1 bit set in s_dquot.flags for each
+         * quota file that is active.  This is at least true for now.
+         */
+        needed += hweight32(sb_any_quota_enabled(sb)) *
+                EXT3_SINGLEDATA_TRANS_BLOCKS;
+#endif
+
+        return needed;
+}
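
To make the credit arithmetic above concrete, here is a hedged worked example for a single object of 16 pages on a filesystem with 4 KB pages and 4 KB blocks (one block per page, 1024 block addresses per indirect block). The EXT3_DATA_TRANS_BLOCKS stand-in and the final total are illustrative only, and the bitmap/group-descriptor terms are shown before the clamp to the real group counts:

#include <stdio.h>

int main(void)
{
        int objcount = 1, pages = 16;
        int blockpp  = 1;                  /* 4 KB page / 4 KB block        */
        int addrpp   = 1024 * blockpp;     /* EXT3_ADDR_PER_BLOCK * blockpp */
        int last_rcvd_blocks = 1;          /* stand-in for
                                            * EXT3_DATA_TRANS_BLOCKS        */

        int needed  = objcount + 1;        /* inode blocks + superblock     */
        int nblocks = pages * blockpp;     /* 16 data blocks                */
        int ndind   = nblocks < addrpp + 1 ? nblocks : addrpp + 1;
        int nindir  = nblocks + ndind + 1; /* (d/t)indirect blocks          */

        needed += nindir;
        needed += (nindir + nblocks) * 2;  /* bitmaps + group descriptors,
                                            * before clamping to the real
                                            * group/gdb counts              */
        needed += last_rcvd_blocks;        /* last_rcvd update              */

        printf("journal credits needed: %d\n", needed);    /* 134 here */
        return 0;
}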
+
+/* We have to start a huge journal transaction here to hold all of the
+ * metadata for the pages being written here.  This is necessitated by
+ * the fact that we do lots of prepare_write operations before we do
+ * any of the matching commit_write operations, so even if we split
+ * up to use "smaller" transactions none of them could complete until
+ * all of them were opened.  By having a single journal transaction,
+ * we eliminate duplicate reservations for common blocks like the
+ * superblock and group descriptors or bitmaps.
+ *
+ * We will start the transaction here, but each prepare_write will
+ * add a refcount to the transaction, and each commit_write will
+ * remove a refcount.  The transaction will be closed when all of
+ * the pages have been written.
+ */
+static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso,
+                                   int niocount, struct niobuf_remote *nb)
+{
+        journal_t *journal;
+        handle_t *handle;
+        int needed;
+        ENTRY;
+
+        LASSERT(!current->journal_info);
+        journal = EXT3_SB(fso->fso_dentry->d_inode->i_sb)->s_journal;
+        needed = fsfilt_ext3_credits_needed(objcount, fso);
+
+        /* The number of blocks we could _possibly_ dirty can be very large.
+         * We reduce our request if it is absurd (and we couldn't get that
+         * many credits for a single handle anyways).
+         *
+         * At some point we have to limit the size of I/Os sent at one time,
+         * increase the size of the journal, or we have to calculate the
+         * actual journal requirements more carefully by checking all of
+         * the blocks instead of being maximally pessimistic.  It remains to
+         * be seen if this is a real problem or not.
+         */
+        if (needed > journal->j_max_transaction_buffers) {
+                CERROR("want too many journal credits (%d), using %d instead\n",
+                       needed, journal->j_max_transaction_buffers);
+                needed = journal->j_max_transaction_buffers;
+        }
+
+        lock_kernel();
+        handle = journal_start(journal, needed);
+        unlock_kernel();
+        if (IS_ERR(handle))
+                CERROR("can't get handle for %d credits: rc = %ld\n", needed,
+                       PTR_ERR(handle));
+
+        RETURN(handle);
+}
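
The comment above describes one large, refcounted handle shared by all of the prepare_write/commit_write pairs of a bulk write. A hedged user-space sketch of that pattern; the brw_* names and the handle struct are invented for illustration and are not the kernel journal API:

#include <stdio.h>
#include <stdlib.h>

struct brw_handle {
        int credits;
        int users;              /* one per outstanding prepare_write */
};

static struct brw_handle *brw_start(int credits)
{
        struct brw_handle *h = malloc(sizeof(*h));

        if (h != NULL) {
                h->credits = credits;
                h->users = 0;
        }
        return h;
}

static void brw_prepare_page(struct brw_handle *h)
{
        h->users++;             /* prepare_write takes a reference */
}

static void brw_commit_page(struct brw_handle *h)
{
        if (--h->users == 0) {  /* last commit_write closes the handle */
                printf("closing handle (%d credits)\n", h->credits);
                free(h);
        }
}

int main(void)
{
        struct brw_handle *h = brw_start(134);
        int i;

        if (h == NULL)
                return 1;
        for (i = 0; i < 4; i++)
                brw_prepare_page(h);
        for (i = 0; i < 4; i++)
                brw_commit_page(h);     /* freed on the fourth commit */
        return 0;
}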
+
+static int fsfilt_ext3_commit(struct inode *inode, void *h, int force_sync)
 {
         int rc;
+        handle_t *handle = h;
+
+        if (force_sync)
+                handle->h_sync = 1; /* recovery likes this */
 
         lock_kernel();
-        rc = journal_stop((handle_t *)handle);
+        rc = journal_stop(handle);
         unlock_kernel();
 
         return rc;
@@ -122,10 +244,38 @@ static int fsfilt_ext3_setattr(struct dentry *dentry, void *handle,
         int rc;
 
         lock_kernel();
+
+        /* A _really_ horrible hack to avoid removing the data stored
+         * in the block pointers; this is really the "small" stripe MD data.
+         * We can avoid further hackery by virtue of the MDS file size being
+         * zero all the time (which doesn't invoke block truncate at unlink
+         * time), so we assert we never change the MDS file size from zero.
+         */
+        if (iattr->ia_valid & ATTR_SIZE) {
+                CERROR("hmm, setting %*s file size to %lld\n",
+                       dentry->d_name.len, dentry->d_name.name, iattr->ia_size);
+                LASSERT(iattr->ia_size == 0);
+#if 0
+                /* ATTR_SIZE would invoke truncate: clear it */
+                iattr->ia_valid &= ~ATTR_SIZE;
+                inode->i_size = iattr->ia_size;
+
+                /* make sure _something_ gets set - so new inode
+                 * goes to disk (probably won't work over XFS)
+                 */
+                if (!(iattr->ia_valid & ATTR_MODE)) {
+                        iattr->ia_valid |= ATTR_MODE;
+                        iattr->ia_mode = inode->i_mode;
+                }
+#endif
+        }
         if (inode->i_op->setattr)
                 rc = inode->i_op->setattr(dentry, iattr);
-        else
-                rc = inode_setattr(inode, iattr);
+        else {
+                rc = inode_change_ok(inode, iattr);
+                if (!rc)
+                        rc = inode_setattr(inode, iattr);
+        }
 
         unlock_kernel();
 
@@ -137,29 +287,58 @@ static int fsfilt_ext3_set_md(struct inode *inode, void *handle,
 {
         int rc;
 
-        down(&inode->i_sem);
-        lock_kernel();
-        rc = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_LUSTRE,
-                            XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0);
-        unlock_kernel();
-        up(&inode->i_sem);
+        /* Nasty hack city - store stripe MD data in the block pointers if
+         * it will fit, because putting it in an EA currently kills the MDS
+         * performance.  We'll fix this with "fast EAs" in the future.
+         */
+        if (lmm_size <= sizeof(EXT3_I(inode)->i_data) -
+                        sizeof(EXT3_I(inode)->i_data[0])) {
+                /* XXX old_size is debugging only */
+                int old_size = EXT3_I(inode)->i_data[0];
+                if (old_size != 0) {
+                        LASSERT(old_size < sizeof(EXT3_I(inode)->i_data));
+                        CERROR("setting EA on %lu again... interesting\n",
+                               inode->i_ino);
+                }
 
-        if (rc) {
+                EXT3_I(inode)->i_data[0] = cpu_to_le32(lmm_size);
+                memcpy(&EXT3_I(inode)->i_data[1], lmm, lmm_size);
+                mark_inode_dirty(inode);
+                return 0;
+        } else {
+                down(&inode->i_sem);
+                lock_kernel();
+                rc = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_LUSTRE,
+                                    XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0);
+                unlock_kernel();
+                up(&inode->i_sem);
+        }
+
+        if (rc)
                 CERROR("error adding MD data to inode %lu: rc = %d\n",
                        inode->i_ino, rc);
-                if (rc != -ENOSPC) LBUG();
-        }
         return rc;
 }
 
-static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int size)
+static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size)
 {
         int rc;
 
+        if (EXT3_I(inode)->i_data[0]) {
+                int size = le32_to_cpu(EXT3_I(inode)->i_data[0]);
+                LASSERT(size < sizeof(EXT3_I(inode)->i_data));
+                if (lmm) {
+                        if (size > lmm_size)
+                                return -ERANGE;
+                        memcpy(lmm, &EXT3_I(inode)->i_data[1], size);
+                }
+                return size;
+        }
+
         down(&inode->i_sem);
         lock_kernel();
         rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_LUSTRE,
-                            XATTR_LUSTRE_MDS_OBJID, lmm, size);
+                            XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size);
         unlock_kernel();
         up(&inode->i_sem);
 
@@ -170,7 +349,7 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int size)
         if (rc < 0) {
                 CDEBUG(D_INFO, "error getting EA %s from inode %lu: "
                        "rc = %d\n", XATTR_LUSTRE_MDS_OBJID, inode->i_ino, rc);
-                memset(lmm, 0, size);
+                memset(lmm, 0, lmm_size);
                 return (rc == -ENODATA) ? 0 : rc;
         }
 
@@ -178,26 +357,55 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int size)
 }
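
The fast path above keeps the stripe MD inline only when it fits in i_data with one 32-bit slot reserved for the length. Assuming the usual fifteen 32-bit block pointers, that threshold is 15*4 - 4 = 56 bytes; a hedged sketch of the same fits-inline test (the struct here is a stand-in, not the real ext3_inode_info):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define FAKE_N_BLOCKS 15        /* assumed: 12 direct + 3 indirect pointers */

struct fake_inode_info {
        uint32_t i_data[FAKE_N_BLOCKS];   /* i_data[0] reused as the length */
};

static int md_fits_inline(size_t lmm_size)
{
        return lmm_size <= sizeof(((struct fake_inode_info *)0)->i_data)
                           - sizeof(uint32_t);
}

int main(void)
{
        printf("inline threshold: %zu bytes\n",
               sizeof(((struct fake_inode_info *)0)->i_data)
               - sizeof(uint32_t));                               /* 56 */
        printf("32 bytes fit inline: %d\n", md_fits_inline(32));  /* 1  */
        printf("64 bytes fit inline: %d\n", md_fits_inline(64));  /* 0  */
        return 0;
}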
 
 static ssize_t fsfilt_ext3_readpage(struct file *file, char *buf, size_t count,
-                                    loff_t *offset)
+                                    loff_t *off)
 {
         struct inode *inode = file->f_dentry->d_inode;
         int rc = 0;
 
         if (S_ISREG(inode->i_mode))
-                rc = file->f_op->read(file, buf, count, offset);
+                rc = file->f_op->read(file, buf, count, off);
         else {
-                struct buffer_head *bh;
-
-                /* FIXME: this assumes the blocksize == count, but the calling
-                 *        function will detect this as an error for now */
-                bh = ext3_bread(NULL, inode,
-                                *offset >> inode->i_sb->s_blocksize_bits,
-                                0, &rc);
-
-                if (bh) {
-                        memcpy(buf, bh->b_data, inode->i_blksize);
-                        brelse(bh);
-                        rc = inode->i_blksize;
+                const int blkbits = inode->i_sb->s_blocksize_bits;
+                const int blksize = inode->i_sb->s_blocksize;
+
+                CDEBUG(D_EXT2, "reading "LPSZ" at dir %lu+%llu\n",
+                       count, inode->i_ino, *off);
+                while (count > 0) {
+                        struct buffer_head *bh;
+
+                        bh = NULL;
+                        if (*off < inode->i_size) {
+                                int err = 0;
+
+                                bh = ext3_bread(NULL, inode, *off >> blkbits,
+                                                0, &err);
+
+                                CDEBUG(D_EXT2, "read %u@%llu\n", blksize, *off);
+
+                                if (bh) {
+                                        memcpy(buf, bh->b_data, blksize);
+                                        brelse(bh);
+                                } else if (err) {
+                                        /* XXX in theory we should just fake
+                                         * this buffer and continue like ext3,
+                                         * especially if this is a partial read
+                                         */
+                                        CERROR("error read dir %lu+%llu: %d\n",
+                                               inode->i_ino, *off, err);
+                                        RETURN(err);
+                                }
+                        }
+                        if (!bh) {
+                                struct ext3_dir_entry_2 *fake = (void *)buf;
+
+                                CDEBUG(D_EXT2, "fake %u@%llu\n", blksize, *off);
+                                memset(fake, 0, sizeof(*fake));
+                                fake->rec_len = cpu_to_le16(blksize);
+                        }
+                        count -= blksize;
+                        buf += blksize;
+                        *off += blksize;
+                        rc += blksize;
                 }
         }
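
When ext3_bread() returns no buffer for a hole in a directory, the loop above synthesizes an empty entry whose rec_len spans the whole block so readers simply skip it. A minimal sketch of that fill-the-hole step; the fake_dirent layout mirrors ext3_dir_entry_2 but is illustrative only:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fake_dirent {            /* hypothetical mirror of ext3_dir_entry_2 */
        uint32_t inode;         /* 0 => unused entry                       */
        uint16_t rec_len;       /* distance to the next entry              */
        uint8_t  name_len;
        uint8_t  file_type;
};

/* make a directory hole look like one empty entry covering the block */
static void fill_hole(void *buf, uint16_t blksize)
{
        struct fake_dirent *fake = buf;

        memset(buf, 0, blksize);
        fake->rec_len = blksize;
}

int main(void)
{
        uint16_t blksize = 1024;
        unsigned char *block = malloc(blksize);

        if (block == NULL)
                return 1;
        fill_hole(block, blksize);
        printf("inode=%u rec_len=%u\n",
               ((struct fake_dirent *)block)->inode,
               ((struct fake_dirent *)block)->rec_len);   /* 0, 1024 */
        free(block);
        return 0;
}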
 
@@ -210,18 +418,17 @@ static void fsfilt_ext3_cb_func(struct journal_callback *jcb, int error)
 
         fcb->cb_func(fcb->cb_obd, fcb->cb_last_rcvd, error);
 
-        kmem_cache_free(fcb_cache, fcb);
+        OBD_SLAB_FREE(fcb, fcb_cache, sizeof *fcb);
         atomic_dec(&fcb_cache_count);
 }
 
 static int fsfilt_ext3_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd,
                                      void *handle, fsfilt_cb_t cb_func)
 {
-#ifdef HAVE_JOURNAL_CALLBACK_STATUS
         struct fsfilt_cb_data *fcb;
 
-        fcb = kmem_cache_alloc(fcb_cache, GFP_NOFS);
-        if (!fcb)
+        OBD_SLAB_ALLOC(fcb, fcb_cache, GFP_NOFS, sizeof *fcb);
+        if (fcb == NULL)
                 RETURN(-ENOMEM);
 
         atomic_inc(&fcb_cache_count);
@@ -235,17 +442,6 @@ static int fsfilt_ext3_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd,
         journal_callback_set(handle, fsfilt_ext3_cb_func,
                              (struct journal_callback *)fcb);
         unlock_kernel();
-#else
-#warning "no journal callback kernel patch, faking it..."
-        static long next = 0;
-
-        if (time_after(jiffies, next)) {
-                CERROR("no journal callback kernel patch, faking it...\n");
-                next = jiffies + 300 * HZ;
-        }
-
-        cb_func(obd, last_rcvd, 0);
-#endif
 
         return 0;
 }
@@ -266,13 +462,17 @@ static int fsfilt_ext3_journal_data(struct file *filp)
  *
  * This can be removed when the ext3 EA code is fixed.
  */
-static int fsfilt_ext3_statfs(struct super_block *sb, struct statfs *sfs)
+static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs)
 {
-        int rc = vfs_statfs(sb, sfs);
+        struct statfs sfs;
+        int rc = vfs_statfs(sb, &sfs);
 
-        if (!rc && sfs->f_bfree < sfs->f_ffree)
-                sfs->f_ffree = sfs->f_bfree;
+        if (!rc && sfs.f_bfree < sfs.f_ffree) {
+                sfs.f_files = (sfs.f_files - sfs.f_ffree) + sfs.f_bfree;
+                sfs.f_ffree = sfs.f_bfree;
+        }
 
+        statfs_pack(osfs, &sfs);
         return rc;
 }
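
The statfs adjustment above caps the reported free-inode count at the free-block count (and shrinks f_files to match), since creating an object also consumes at least one block for its EA. A small arithmetic sketch with made-up numbers:

#include <stdio.h>

struct simple_statfs { long f_files, f_ffree, f_bfree; };

static void clamp_inodes_to_blocks(struct simple_statfs *s)
{
        if (s->f_bfree < s->f_ffree) {
                /* used inodes stay the same; the free pool shrinks to
                 * whatever the free blocks can actually back */
                s->f_files = (s->f_files - s->f_ffree) + s->f_bfree;
                s->f_ffree = s->f_bfree;
        }
}

int main(void)
{
        struct simple_statfs s = { .f_files = 1000, .f_ffree = 900,
                                   .f_bfree = 100 };

        clamp_inodes_to_blocks(&s);
        printf("f_files=%ld f_ffree=%ld\n", s.f_files, s.f_ffree);
        /* -> f_files=200 f_ffree=100 */
        return 0;
}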
 
@@ -281,10 +481,19 @@ static int fsfilt_ext3_sync(struct super_block *sb)
         return ext3_force_commit(sb);
 }
 
+extern int ext3_prep_san_write(struct inode *inode, long *blocks,
+                              int nblocks, loff_t newsize);
+static int fsfilt_ext3_prep_san_write(struct inode *inode, long *blocks,
+                                      int nblocks, loff_t newsize)
+{
+        return ext3_prep_san_write(inode, blocks, nblocks, newsize);
+}
+
 static struct fsfilt_operations fsfilt_ext3_ops = {
         fs_type:                "ext3",
         fs_owner:               THIS_MODULE,
         fs_start:               fsfilt_ext3_start,
+        fs_brw_start:           fsfilt_ext3_brw_start,
         fs_commit:              fsfilt_ext3_commit,
         fs_setattr:             fsfilt_ext3_setattr,
         fs_set_md:              fsfilt_ext3_set_md,
@@ -294,6 +503,7 @@ static struct fsfilt_operations fsfilt_ext3_ops = {
         fs_set_last_rcvd:       fsfilt_ext3_set_last_rcvd,
         fs_statfs:              fsfilt_ext3_statfs,
         fs_sync:                fsfilt_ext3_sync,
+        fs_prep_san_write:      fsfilt_ext3_prep_san_write,
 };
 
 static int __init fsfilt_ext3_init(void)
index d029785..ddec807 100644 (file)
@@ -222,15 +222,13 @@ static void *fsfilt_extN_brw_start(int objcount, struct fsfilt_objinfo *fso,
         RETURN(handle);
 }
 
-static int fsfilt_extN_commit(struct inode *inode, void *h /*, force_sync */)
+static int fsfilt_extN_commit(struct inode *inode, void *h, int force_sync)
 {
         int rc;
         handle_t *handle = h;
 
-#if 0
         if (force_sync)
                 handle->h_sync = 1; /* recovery likes this */
-#endif
 
         lock_kernel();
         rc = journal_stop(handle);
@@ -273,8 +271,11 @@ static int fsfilt_extN_setattr(struct dentry *dentry, void *handle,
         }
         if (inode->i_op->setattr)
                 rc = inode->i_op->setattr(dentry, iattr);
-        else
-                rc = inode_setattr(inode, iattr);
+        else {
+                rc = inode_change_ok(inode, iattr);
+                if (!rc)
+                        rc = inode_setattr(inode, iattr);
+        }
 
         unlock_kernel();
 
@@ -386,7 +387,7 @@ static ssize_t fsfilt_extN_readpage(struct file *file, char *buf, size_t count,
                                         brelse(bh);
                                 } else if (err) {
                                         /* XXX in theory we should just fake
-                                         * this buffer and continue like ext3,
+                                         * this buffer and continue like extN,
                                          * especially if this is a partial read
                                          */
                                         CERROR("error read dir %lu+%llu: %d\n",
@@ -417,7 +418,7 @@ static void fsfilt_extN_cb_func(struct journal_callback *jcb, int error)
 
         fcb->cb_func(fcb->cb_obd, fcb->cb_last_rcvd, error);
 
-        kmem_cache_free(fcb_cache, fcb);
+        OBD_SLAB_FREE(fcb, fcb_cache, sizeof *fcb);
         atomic_dec(&fcb_cache_count);
 }
 
@@ -426,8 +427,8 @@ static int fsfilt_extN_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd,
 {
         struct fsfilt_cb_data *fcb;
 
-        fcb = kmem_cache_alloc(fcb_cache, GFP_NOFS);
-        if (!fcb)
+        OBD_SLAB_ALLOC(fcb, fcb_cache, GFP_NOFS, sizeof *fcb);
+        if (fcb == NULL)
                 RETURN(-ENOMEM);
 
         atomic_inc(&fcb_cache_count);
@@ -466,8 +467,10 @@ static int fsfilt_extN_statfs(struct super_block *sb, struct obd_statfs *osfs)
         struct statfs sfs;
         int rc = vfs_statfs(sb, &sfs);
 
-        if (!rc && sfs.f_bfree < sfs.f_ffree)
+        if (!rc && sfs.f_bfree < sfs.f_ffree) {
+                sfs.f_files = (sfs.f_files - sfs.f_ffree) + sfs.f_bfree;
                 sfs.f_ffree = sfs.f_bfree;
+        }
 
         statfs_pack(osfs, &sfs);
         return rc;
index 06302c5..2aba0f1 100644 (file)
@@ -59,7 +59,8 @@ static void *fsfilt_reiserfs_brw_start(int objcount, struct fsfilt_objinfo *fso,
         return (void *)0xf00f00be;
 }
 
-static int fsfilt_reiserfs_commit(struct inode *inode, void *handle)
+static int fsfilt_reiserfs_commit(struct inode *inode, void *handle, 
+                                  int force_sync)
 {
         if (handle != (void *)0xf00f00be) {
                 CERROR("bad handle %p", handle);
index bd43554..9000771 100644 (file)
 #include <linux/lprocfs_status.h>
 
 extern struct list_head obd_types;
+static spinlock_t obd_types_lock = SPIN_LOCK_UNLOCKED;
 kmem_cache_t *obdo_cachep = NULL;
 kmem_cache_t *import_cachep = NULL;
-kmem_cache_t *export_cachep = NULL;
 
 int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
-void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp,
-                                        int dying_import);
+void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp);
+
+struct obd_uuid lctl_fake_uuid = { .uuid = "OBD_CLASS_UUID" };
 
 /*
  * support functions: we could use inter-module communication, but this
@@ -53,17 +54,17 @@ static struct obd_type *class_search_type(char *name)
 {
         struct list_head *tmp;
         struct obd_type *type;
-        CDEBUG(D_INFO, "SEARCH %s\n", name);
 
-        tmp = &obd_types;
+        spin_lock(&obd_types_lock);
         list_for_each(tmp, &obd_types) {
                 type = list_entry(tmp, struct obd_type, typ_chain);
-                CDEBUG(D_INFO, "TYP %s\n", type->typ_name);
                 if (strlen(type->typ_name) == strlen(name) &&
                     strcmp(type->typ_name, name) == 0) {
+                        spin_unlock(&obd_types_lock);
                         return type;
                 }
         }
+        spin_unlock(&obd_types_lock);
         return NULL;
 }
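
class_search_type() now walks the registered-type list only while holding obd_types_lock, so a concurrent register/unregister cannot invalidate the iteration. A hedged user-space analogue that uses a pthread mutex in place of the spinlock; the types and names are invented for illustration:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct type_entry {
        const char *name;
        struct type_entry *next;
};

static struct type_entry *type_list;                 /* head of the registry */
static pthread_mutex_t type_lock = PTHREAD_MUTEX_INITIALIZER;

static struct type_entry *search_type(const char *name)
{
        struct type_entry *t, *found = NULL;

        pthread_mutex_lock(&type_lock);              /* spin_lock() analogue */
        for (t = type_list; t != NULL; t = t->next) {
                if (strcmp(t->name, name) == 0) {
                        found = t;
                        break;
                }
        }
        pthread_mutex_unlock(&type_lock);
        return found;
}

int main(void)
{
        static struct type_entry osc = { "osc", NULL };
        static struct type_entry mdc = { "mdc", &osc };

        type_list = &mdc;
        printf("%s\n", search_type("osc") ? "found" : "missing");
        return 0;
}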
 
@@ -117,17 +118,19 @@ int class_register_type(struct obd_ops *ops, struct lprocfs_vars *vars,
 
         *(type->typ_ops) = *ops;
         strcpy(type->typ_name, name);
-        list_add(&type->typ_chain, &obd_types);
 
         type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root,
                                               vars, type);
         if (IS_ERR(type->typ_procroot)) {
                 rc = PTR_ERR(type->typ_procroot);
                 type->typ_procroot = NULL;
-                list_del(&type->typ_chain);
                 GOTO (failed, rc);
         }
 
+        spin_lock(&obd_types_lock);
+        list_add(&type->typ_chain, &obd_types);
+        spin_unlock(&obd_types_lock);
+
         RETURN (0);
 
  failed:
@@ -161,7 +164,9 @@ int class_unregister_type(char *name)
                 type->typ_procroot = NULL;
         }
 
+        spin_lock(&obd_types_lock);
         list_del(&type->typ_chain);
+        spin_unlock(&obd_types_lock);
         OBD_FREE(type->typ_name, strlen(name) + 1);
         if (type->typ_ops != NULL)
                 OBD_FREE(type->typ_ops, sizeof(*type->typ_ops));
@@ -171,7 +176,6 @@ int class_unregister_type(char *name)
 
 int class_name2dev(char *name)
 {
-        int res = -1;
         int i;
 
         if (!name)
@@ -179,39 +183,33 @@ int class_name2dev(char *name)
 
         for (i = 0; i < MAX_OBD_DEVICES; i++) {
                 struct obd_device *obd = &obd_dev[i];
-                if (obd->obd_name && strcmp(name, obd->obd_name) == 0) {
-                        res = i;
-                        return res;
-                }
+                if (obd->obd_name && strcmp(name, obd->obd_name) == 0)
+                        return i;
         }
 
-        return res;
+        return -1;
 }
 
 int class_uuid2dev(struct obd_uuid *uuid)
 {
-        int res = -1;
         int i;
 
         for (i = 0; i < MAX_OBD_DEVICES; i++) {
                 struct obd_device *obd = &obd_dev[i];
-                if (strncmp(uuid->uuid, obd->obd_uuid.uuid, sizeof(obd->obd_uuid.uuid)) == 0) {
-                        res = i;
-                        return res;
-                }
+                if (obd_uuid_equals(uuid, &obd->obd_uuid))
+                        return i;
         }
 
-        return res;
+        return -1;
 }
 
-
 struct obd_device *class_uuid2obd(struct obd_uuid *uuid)
 {
         int i;
 
         for (i = 0; i < MAX_OBD_DEVICES; i++) {
                 struct obd_device *obd = &obd_dev[i];
-                if (strncmp(uuid->uuid, obd->obd_uuid.uuid, sizeof(obd->obd_uuid.uuid)) == 0)
+                if (obd_uuid_equals(uuid, &obd->obd_uuid))
                         return obd;
         }
 
@@ -234,12 +232,6 @@ void obd_cleanup_caches(void)
                         CERROR("Cannot destroy ll_import_cache\n");
                 import_cachep = NULL;
         }
-        if (export_cachep) {
-                rc = kmem_cache_destroy(export_cachep);
-                if (rc)
-                        CERROR("Cannot destory ll_export_cache\n");
-                export_cachep = NULL;
-        }
         EXIT;
 }
 
@@ -252,13 +244,6 @@ int obd_init_caches(void)
         if (!obdo_cachep)
                 GOTO(out, -ENOMEM);
 
-        LASSERT(export_cachep == NULL);
-        export_cachep = kmem_cache_create("ll_export_cache",
-                                          sizeof(struct obd_export),
-                                          0, 0, NULL, NULL);
-        if (!export_cachep)
-                GOTO(out, -ENOMEM);
-
         LASSERT(import_cachep == NULL);
         import_cachep = kmem_cache_create("ll_import_cache",
                                           sizeof(struct obd_import),
@@ -284,262 +269,277 @@ struct obd_export *class_conn2export(struct lustre_handle *conn)
                 RETURN(NULL);
         }
 
-        if (conn->addr == -1) {  /* this means assign a new connection */
+        if (conn->cookie == -1) {  /* this means assign a new connection */
                 CDEBUG(D_CACHE, "want a new connection\n");
                 RETURN(NULL);
         }
 
-        if (!conn->addr) {
-                CDEBUG(D_CACHE, "looking for null addr\n");
-                fixme();
-                RETURN(NULL);
-        }
-
-        CDEBUG(D_IOCTL, "looking for export addr "LPX64" cookie "LPX64"\n",
-               conn->addr, conn->cookie);
-        export = (struct obd_export *) (unsigned long)conn->addr;
-        if (!kmem_cache_validate(export_cachep, (void *)export))
-                RETURN(NULL);
-
-        if (export->exp_cookie != conn->cookie)
-                RETURN(NULL);
+        CDEBUG(D_IOCTL, "looking for export cookie "LPX64"\n", conn->cookie);
+        export = class_handle2object(conn->cookie);
         RETURN(export);
-} /* class_conn2export */
+}
 
 struct obd_device *class_conn2obd(struct lustre_handle *conn)
 {
         struct obd_export *export;
         export = class_conn2export(conn);
-        if (export)
-                return export->exp_obd;
-        fixme();
+        if (export) {
+                struct obd_device *obd = export->exp_obd;
+                class_export_put(export);
+                return obd;
+        }
         return NULL;
 }
 
 struct obd_import *class_conn2cliimp(struct lustre_handle *conn)
 {
-        return &class_conn2obd(conn)->u.cli.cl_import;
+        return class_conn2obd(conn)->u.cli.cl_import;
 }
 
 struct obd_import *class_conn2ldlmimp(struct lustre_handle *conn)
 {
-        return &class_conn2export(conn)->exp_ldlm_data.led_import;
+        struct obd_export *export;
+        export = class_conn2export(conn);
+        if (export) {
+                struct obd_import *imp = export->exp_ldlm_data.led_import;
+                class_export_put(export);
+                return imp;
+        }
+        fixme();
+        return NULL;
+}
+
+/* Export management functions */
+static void export_handle_addref(void *export)
+{
+        class_export_get(export);
+}
+
+struct obd_export *class_export_get(struct obd_export *exp)
+{
+        atomic_inc(&exp->exp_refcount);
+        CDEBUG(D_INFO, "GETting export %p : new refcount %d\n", exp,
+               atomic_read(&exp->exp_refcount));
+        return exp;
+}
+
+void class_export_put(struct obd_export *exp)
+{
+        ENTRY;
+
+        CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp,
+               atomic_read(&exp->exp_refcount) - 1);
+        LASSERT(atomic_read(&exp->exp_refcount) > 0);
+        LASSERT(atomic_read(&exp->exp_refcount) < 0x5a5a5a);
+        if (atomic_dec_and_test(&exp->exp_refcount)) {
+                struct obd_device *obd = exp->exp_obd;
+                CDEBUG(D_IOCTL, "destroying export %p/%s\n", exp,
+                       exp->exp_client_uuid.uuid);
+
+                LASSERT(obd != NULL);
+
+                /* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. */
+                if (exp->exp_connection)
+                        ptlrpc_put_connection_superhack(exp->exp_connection);
+
+                LASSERT(list_empty(&exp->exp_handle.h_link));
+
+                obd_destroy_export(exp);
+
+                OBD_FREE(exp, sizeof(*exp));
+                atomic_dec(&obd->obd_refcount);
+                wake_up(&obd->obd_refcount_waitq);
+        }
+        EXIT;
 }
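
class_export_get()/class_export_put() above are plain reference counting: the export is torn down, and the owning obd's refcount released, only when the last reference is dropped. A minimal single-threaded sketch of the same pattern with illustrative names:

#include <stdio.h>
#include <stdlib.h>

struct fake_export {
        int refcount;
        const char *uuid;
};

static struct fake_export *export_get(struct fake_export *exp)
{
        exp->refcount++;
        return exp;
}

static void export_put(struct fake_export *exp)
{
        if (--exp->refcount == 0) {
                printf("destroying export %s\n", exp->uuid);
                free(exp);
        }
}

int main(void)
{
        struct fake_export *exp = malloc(sizeof(*exp));

        if (exp == NULL)
                return 1;
        exp->refcount = 1;                /* reference held by the creator */
        exp->uuid = "client-uuid";

        export_get(exp);                  /* e.g. a handle lookup          */
        export_put(exp);                  /* lookup done                   */
        export_put(exp);                  /* creator drops last ref: freed */
        return 0;
}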
 
 struct obd_export *class_new_export(struct obd_device *obddev)
 {
         struct obd_export *export;
 
-        PORTAL_SLAB_ALLOC(export, export_cachep, sizeof(*export));
+        OBD_ALLOC(export, sizeof(*export));
         if (!export) {
                 CERROR("no memory! (minor %d)\n", obddev->obd_minor);
                 return NULL;
         }
 
-        get_random_bytes(&export->exp_cookie, sizeof(export->exp_cookie));
+        atomic_set(&export->exp_refcount, 2);
         export->exp_obd = obddev;
         /* XXX this should be in LDLM init */
         INIT_LIST_HEAD(&export->exp_ldlm_data.led_held_locks);
-        INIT_LIST_HEAD(&export->exp_conn_chain);
+
+        INIT_LIST_HEAD(&export->exp_handle.h_link);
+        class_handle_hash(&export->exp_handle, export_handle_addref);
+        spin_lock_init(&export->exp_lock);
+
         spin_lock(&obddev->obd_dev_lock);
+        LASSERT(!obddev->obd_stopping); /* shouldn't happen, but might race */
+        atomic_inc(&obddev->obd_refcount);
         list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports);
         spin_unlock(&obddev->obd_dev_lock);
         return export;
 }
 
-void class_destroy_export(struct obd_export *exp)
+void class_unlink_export(struct obd_export *exp)
 {
-        LASSERT(exp->exp_cookie != DEAD_HANDLE_MAGIC);
-
-        CDEBUG(D_IOCTL, "destroying export %p/%s\n", exp,
-               exp->exp_client_uuid.uuid);
+        class_handle_unhash(&exp->exp_handle);
 
         spin_lock(&exp->exp_obd->obd_dev_lock);
-        list_del(&exp->exp_obd_chain);
+        list_del_init(&exp->exp_obd_chain);
         spin_unlock(&exp->exp_obd->obd_dev_lock);
 
-        /* XXXshaver no connection here... */
-        if (exp->exp_connection)
-                spin_lock(&exp->exp_connection->c_lock);
-        list_del(&exp->exp_conn_chain);
-        if (exp->exp_connection) {
-                spin_unlock(&exp->exp_connection->c_lock);
-                ptlrpc_put_connection_superhack(exp->exp_connection);
-        }
+        class_export_put(exp);
+}
 
-        /* Abort any inflight DLM requests and NULL out their (about to be
-         * freed) import. */
-        if (exp->exp_ldlm_data.led_import.imp_obd)
-                ptlrpc_abort_inflight_superhack(&exp->exp_ldlm_data.led_import,
-                                                1);
+/* Import management functions */
+static void import_handle_addref(void *import)
+{
+        class_import_get(import);
+}
 
-        PORTAL_SLAB_FREE(exp, export_cachep, sizeof(*exp));
+struct obd_import *class_import_get(struct obd_import *import)
+{
+        atomic_inc(&import->imp_refcount);
+        CDEBUG(D_IOCTL, "import %p refcount=%d\n", import,
+               atomic_read(&import->imp_refcount));
+        return import;
 }
 
-/* a connection defines an export context in which preallocation can
-   be managed. */
-int class_connect(struct lustre_handle *exporth, struct obd_device *obd,
-                  struct obd_uuid *cluuid)
+void class_import_put(struct obd_import *import)
 {
-        struct obd_export * export;
-        if (exporth == NULL) {
-                LBUG();
-                return -EINVAL;
-        }
+        ENTRY;
 
-        if (obd == NULL) {
-                LBUG();
-                return -EINVAL;
-        }
+        CDEBUG(D_IOCTL, "import %p refcount=%d\n", import,
+               atomic_read(&import->imp_refcount) - 1);
 
-        if (cluuid == NULL) {
-                LBUG();
-                return -EINVAL;
+        LASSERT(atomic_read(&import->imp_refcount) > 0);
+        LASSERT(atomic_read(&import->imp_refcount) < 0x5a5a5a);
+        if (!atomic_dec_and_test(&import->imp_refcount)) {
+                EXIT;
+                return;
         }
 
-        export = class_new_export(obd);
-        if (!export)
-                return -ENOMEM;
+        CDEBUG(D_IOCTL, "destroying import %p\n", import);
 
-        exporth->addr = (__u64) (unsigned long)export;
-        exporth->cookie = export->exp_cookie;
-        memcpy(&export->exp_client_uuid, cluuid, sizeof(export->exp_client_uuid));
+        ptlrpc_put_connection_superhack(import->imp_connection);
 
-        CDEBUG(D_IOCTL, "connect: addr %Lx cookie %Lx\n",
-               (long long)exporth->addr, (long long)exporth->cookie);
-        return 0;
+        LASSERT(list_empty(&import->imp_handle.h_link));
+        OBD_FREE(import, sizeof(*import));
+        EXIT;
 }
 
-int class_disconnect(struct lustre_handle *conn)
+struct obd_import *class_new_import(void)
 {
-        struct obd_export *export;
-        ENTRY;
+        struct obd_import *imp;
 
-        if (!(export = class_conn2export(conn))) {
-                fixme();
-                CDEBUG(D_IOCTL, "disconnect: attempting to free "
-                       "nonexistent client "LPX64"\n", conn->addr);
-                RETURN(-EINVAL);
-        }
+        OBD_ALLOC(imp, sizeof(*imp));
+        if (imp == NULL)
+                return NULL;
 
-        CDEBUG(D_IOCTL, "disconnect: addr %Lx cookie %Lx\n",
-                       (long long)conn->addr, (long long)conn->cookie);
+        INIT_LIST_HEAD(&imp->imp_replay_list);
+        INIT_LIST_HEAD(&imp->imp_sending_list);
+        INIT_LIST_HEAD(&imp->imp_delayed_list);
+        spin_lock_init(&imp->imp_lock);
+        imp->imp_max_transno = 0;
+        imp->imp_peer_committed_transno = 0;
 
-        class_destroy_export(export);
+        atomic_set(&imp->imp_refcount, 2);
+        INIT_LIST_HEAD(&imp->imp_handle.h_link);
+        class_handle_hash(&imp->imp_handle, import_handle_addref);
 
-        RETURN(0);
+        return imp;
 }
 
-void class_disconnect_all(struct obd_device *obddev)
+void class_destroy_import(struct obd_import *import)
 {
-        int again = 1;
-
-        while (again) {
-                spin_lock(&obddev->obd_dev_lock);
-                if (!list_empty(&obddev->obd_exports)) {
-                        struct obd_export *export;
-                        struct lustre_handle conn;
-                        int rc;
-
-                        export = list_entry(obddev->obd_exports.next,
-                                            struct obd_export,
-                                            exp_obd_chain);
-                        conn.addr = (__u64)(unsigned long)export;
-                        conn.cookie = export->exp_cookie;
-                        spin_unlock(&obddev->obd_dev_lock);
-                        CERROR("force disconnecting %s:%s export %p\n",
-                               export->exp_obd->obd_type->typ_name,
-                               export->exp_connection ?
-                               (char *)export->exp_connection->c_remote_uuid.uuid :
-                               "<unconnected>", export);
-                        rc = obd_disconnect(&conn);
-                        if (rc < 0) {
-                                /* AED: not so sure about this...  We can't
-                                 * loop here forever, yet we shouldn't leak
-                                 * exports on a struct we will soon destroy.
-                                 */
-                                CERROR("destroy export %p with err: rc = %d\n",
-                                       export, rc);
-                                class_destroy_export(export);
-                        }
-                } else {
-                        spin_unlock(&obddev->obd_dev_lock);
-                        again = 0;
-                }
-        }
-}
+        LASSERT(import != NULL);
 
-#if 0
+        class_handle_unhash(&import->imp_handle);
 
-/* FIXME: Data is a space- or comma-separated list of device IDs.  This will
- * have to change. */
-int class_multi_setup(struct obd_device *obddev, uint32_t len, void *data)
-{
-        int count, rc;
-        char *p;
-        ENTRY;
+        /* Abort any inflight DLM requests and NULL out their (about to be
+         * freed) import. */
+        ptlrpc_abort_inflight_superhack(import);
 
-        for (p = data, count = 0; p < (char *)data + len; count++) {
-                char *end;
-                int tmp = simple_strtoul(p, &end, 0);
+        class_import_put(import);
+}
 
-                if (p == end) {
-                        CERROR("invalid device ID starting at: %s\n", p);
-                        GOTO(err_disconnect, rc = -EINVAL);
-                }
+/* a connection defines an export context in which preallocation can
+   be managed. */
+int class_connect(struct lustre_handle *exporth, struct obd_device *obd,
+                  struct obd_uuid *cluuid)
+{
+        struct obd_export *export;
+        LASSERT(exporth != NULL);
+        LASSERT(obd != NULL);
+        LASSERT(cluuid != NULL);
 
-                if (tmp < 0 || tmp >= MAX_OBD_DEVICES) {
-                        CERROR("Trying to sub dev %d  - dev no too large\n",
-                               tmp);
-                        GOTO(err_disconnect, rc  = -EINVAL);
-                }
+        export = class_new_export(obd);
+        if (export == NULL)
+                return -ENOMEM;
 
-                rc = obd_connect(&obddev->obd_multi_conn[count], &obd_dev[tmp]);
-                if (rc) {
-                        CERROR("cannot connect to device %d: rc = %d\n", tmp,
-                               rc);
-                        GOTO(err_disconnect, rc);
-                }
+        exporth->cookie = export->exp_handle.h_cookie;
+        memcpy(&export->exp_client_uuid, cluuid,
+               sizeof(export->exp_client_uuid));
+        class_export_put(export);
 
-                CDEBUG(D_INFO, "target OBD %d is of type %s\n", count,
-                       obd_dev[tmp].obd_type->typ_name);
+        CDEBUG(D_IOCTL, "connect: client %s, cookie "LPX64"\n",
+               cluuid->uuid, exporth->cookie);
+        return 0;
+}
 
-                p = end + 1;
+int class_disconnect(struct lustre_handle *conn, int failover)
+{
+        struct obd_export *export = class_conn2export(conn);
+        ENTRY;
+
+        if (export == NULL) {
+                fixme();
+                CDEBUG(D_IOCTL, "disconnect: attempting to free "
+                       "nonexistent client "LPX64"\n", conn->cookie);
+                RETURN(-EINVAL);
         }
 
-        obddev->obd_multi_count = count;
+        CDEBUG(D_IOCTL, "disconnect: cookie "LPX64"\n", conn->cookie);
 
+        class_unlink_export(export);
+        class_export_put(export);
         RETURN(0);
-
- err_disconnect:
-        for (count--; count >= 0; count--)
-                obd_disconnect(&obddev->obd_multi_conn[count]);
-        return rc;
 }
 
-/*
- *    remove all connections to this device
- *    close all connections to lower devices
- *    needed for forced unloads of OBD client drivers
- */
-int class_multi_cleanup(struct obd_device *obddev)
+void class_disconnect_exports(struct obd_device *obd, int failover)
 {
-        int i;
+        int rc;
+        struct list_head *tmp, *n, work_list;
+        struct lustre_handle fake_conn;
+        ENTRY;
 
-        for (i = 0; i < obddev->obd_multi_count; i++) {
-                int rc;
-                struct obd_device *obd =
-                        class_conn2obd(&obddev->obd_multi_conn[i]);
+        /* Move all of the exports from obd_exports to a work list, en masse. */
+        spin_lock(&obd->obd_dev_lock);
+        list_add(&work_list, &obd->obd_exports);
+        list_del_init(&obd->obd_exports);
+        spin_unlock(&obd->obd_dev_lock);
+
+        CDEBUG(D_IOCTL, "OBD device %d (%p) has exports, "
+               "disconnecting them\n", obd->obd_minor, obd);
+        list_for_each_safe(tmp, n, &work_list) {
+                struct obd_export *exp = list_entry(tmp, struct obd_export,
+                                                    exp_obd_chain);
+
+                class_export_get(exp);
+                fake_conn.cookie = exp->exp_handle.h_cookie;
+                rc = obd_disconnect(&fake_conn, failover);
+                /* exports created from last_rcvd data, and "fake"
+                   exports created by lctl don't have an import */
+                if (exp->exp_ldlm_data.led_import != NULL)
+                        class_destroy_import(exp->exp_ldlm_data.led_import);
+                class_export_put(exp);
 
-                if (!obd) {
-                        CERROR("no such device [i %d]\n", i);
-                        RETURN(-EINVAL);
+                if (rc) {
+                        CDEBUG(D_IOCTL, "disconnecting export %p failed: %d\n",
+                               exp, rc);
+                } else {
+                        CDEBUG(D_IOCTL, "export %p disconnected\n", exp);
                 }
-
-                rc = obd_disconnect(&obddev->obd_multi_conn[i]);
-                if (rc)
-                        CERROR("disconnect failure %d\n", obd->obd_minor);
         }
-        return 0;
+        EXIT;
 }
-#endif
index 26bbdf7..2984e9c 100644 (file)
@@ -31,6 +31,7 @@
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
 #include <asm/statfs.h>
 #endif
+#include <linux/seq_file.h>
 
 #else
 #include <liblustre.h>
@@ -100,7 +101,7 @@ int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list,
 
                 OBD_FREE(pathcopy, pathsize);
 
-                if ((cur_root==NULL) || (proc==NULL)) {
+                if ((cur_root == NULL) || (proc == NULL)) {
                         CERROR("LprocFS: No memory to create /proc entry %s",
                                list->name);
                         return -ENOMEM;
@@ -259,13 +260,14 @@ int lprocfs_rd_filegroups(char* page, char **start, off_t off, int count,
 int lprocfs_rd_server_uuid(char* page, char **start, off_t off, int count,
                            int *eof, void *data)
 {
-        struct obd_device* obd = (struct obd_device*)data;
-        struct client_obdcli;
+        struct obd_device *obd = (struct obd_device *)data;
+        struct client_obd *cli;
 
         LASSERT(obd != NULL);
         cli = &obd->u.cli;
         *eof = 1;
-        return snprintf(page, count, "%s\n", cli->cl_target_uuid.uuid);
+        return snprintf(page, count, "%s\n",
+                        cli->cl_import->imp_target_uuid.uuid);
 }
 
 int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, int count,
@@ -275,7 +277,7 @@ int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, int count,
         struct ptlrpc_connection *conn;
 
         LASSERT(obd != NULL);
-        conn = obd->u.cli.cl_import.imp_connection;
+        conn = obd->u.cli.cl_import->imp_connection;
         LASSERT(conn != NULL);
         *eof = 1;
         return snprintf(page, count, "%s\n", conn->c_remote_uuid.uuid);
@@ -318,6 +320,251 @@ int lprocfs_obd_detach(struct obd_device *dev)
         return 0;
 }
 
+struct lprocfs_counters* lprocfs_alloc_counters(unsigned int num)
+{
+        struct lprocfs_counters* cntrs;
+        int csize;
+        if (num == 0)
+                return NULL;
+
+        csize = offsetof(struct lprocfs_counters, cntr[num]);
+        OBD_ALLOC(cntrs, csize);
+        if (cntrs != NULL) {
+                cntrs->num = num;
+        }
+        return cntrs;
+}
+
+void lprocfs_free_counters(struct lprocfs_counters* cntrs)
+{
+        if (cntrs != NULL) {
+                int csize = offsetof(struct lprocfs_counters, cntr[cntrs->num]);
+                OBD_FREE(cntrs, csize);
+        }
+}
+
+/* Reset counter under lock */
+int lprocfs_counter_write(struct file *file, const char *buffer,
+                          unsigned long count, void *data)
+{
+        struct lprocfs_counters *cntrs = (struct lprocfs_counters*) data;
+        unsigned int i;
+        LASSERT(cntrs != NULL);
+
+        for (i = 0; i < cntrs->num; i++) {
+                struct lprocfs_counter *cntr = &(cntrs->cntr[i]);
+                spinlock_t *lock = (cntr->config & LPROCFS_CNTR_EXTERNALLOCK) ?
+                        cntr->l.external : &cntr->l.internal;
+
+                spin_lock(lock);
+                cntr->count     = 0;
+                cntr->sum       = 0;
+                cntr->min       = (~(__u64)0);
+                cntr->max       = 0;
+                cntr->sumsquare = 0;
+                spin_unlock(lock);
+        }
+        return 0;
+}
+
+static void *lprocfs_counters_seq_start(struct seq_file *p, loff_t *pos)
+{
+        struct lprocfs_counters *cntrs = p->private;
+        return (*pos >= cntrs->num) ? NULL : (void*) &cntrs->cntr[*pos];
+}
+
+static void lprocfs_counters_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+static void *lprocfs_counters_seq_next(struct seq_file *p, void *v,
+                                       loff_t *pos)
+{
+        struct lprocfs_counters *cntrs = p->private;
+        ++*pos;
+        return (*pos >= cntrs->num) ? NULL : (void*) &(cntrs->cntr[*pos]);
+}
+
+/* seq file export of one lprocfs counter */
+static int lprocfs_counters_seq_show(struct seq_file *p, void *v)
+{
+       struct lprocfs_counters *cntrs = p->private;
+       struct lprocfs_counter  *cntr = v;
+       spinlock_t              *lock;
+       struct lprocfs_counter  c;
+       int rc = 0;
+
+       if (cntr == &(cntrs->cntr[0])) {
+               struct timeval now;
+               do_gettimeofday(&now);
+               rc = seq_printf(p, "%-25s %lu.%lu secs.usecs\n",
+                               "snapshot_time", now.tv_sec, now.tv_usec);
+               if (rc < 0)
+                       return rc;
+       }
+
+       /* Take a snapshot of the counter under lock */
+       lock = (cntr->config & LPROCFS_CNTR_EXTERNALLOCK) ?
+               cntr->l.external : &cntr->l.internal;
+       spin_lock(lock);
+
+       c.count = cntr->count;
+       c.sum = cntr->sum;
+       c.min = cntr->min;
+       c.max = cntr->max;
+       c.sumsquare = cntr->sumsquare;
+
+       spin_unlock(lock);
+
+       rc = seq_printf(p, "%-25s "LPU64" samples [%s]", cntr->name, c.count,
+                       cntr->units);
+       if (rc < 0)
+               goto out;
+
+       if ((cntr->config & LPROCFS_CNTR_AVGMINMAX) && (c.count > 0)) {
+               rc = seq_printf(p, " "LPU64" "LPU64" "LPU64, c.min,c.max,c.sum);
+               if (rc < 0)
+                       goto out;
+               if (cntr->config & LPROCFS_CNTR_STDDEV)
+                       rc = seq_printf(p, " "LPU64, c.sumsquare);
+               if (rc < 0)
+                       goto out;
+       }
+       rc = seq_printf(p, "\n");
+ out:
+       return (rc < 0) ? rc : 0;
+}
+
+struct seq_operations lprocfs_counters_seq_sops = {
+        .start = lprocfs_counters_seq_start,
+        .stop = lprocfs_counters_seq_stop,
+        .next = lprocfs_counters_seq_next,
+        .show = lprocfs_counters_seq_show,
+};
+
+static int lprocfs_counters_seq_open(struct inode *inode, struct file *file)
+{
+        struct proc_dir_entry *dp = inode->u.generic_ip;
+        struct seq_file *seq;
+        int rc;
+
+        rc = seq_open(file, &lprocfs_counters_seq_sops);
+        if (rc)
+                return rc;
+        seq = file->private_data;
+        seq->private = dp->data;
+        return 0;
+}
+
+struct file_operations lprocfs_counters_seq_fops = {
+        .open    = lprocfs_counters_seq_open,
+        .read    = seq_read,
+        .llseek  = seq_lseek,
+        .release = seq_release,
+};
+
+int lprocfs_register_counters(struct proc_dir_entry *root, const char* name,
+                              struct lprocfs_counters *cntrs)
+{
+        struct proc_dir_entry *entry;
+        LASSERT(root != NULL);
+
+        entry = create_proc_entry(name, 0444, root);
+        if (entry == NULL)
+                return -ENOMEM;
+        entry->proc_fops = &lprocfs_counters_seq_fops;
+        entry->data = (void*) cntrs;
+        entry->write_proc = lprocfs_counter_write;
+        return 0;
+}
+
+#define LPROCFS_OBD_OP_INIT(base, cntrs, op)                               \
+do {                                                                       \
+        unsigned int coffset = base + OBD_COUNTER_OFFSET(op);              \
+        LASSERT(coffset < cntrs->num);                                     \
+        LPROCFS_COUNTER_INIT(&cntrs->cntr[coffset], 0, NULL, #op, "reqs"); \
+} while (0)
+
+
+int lprocfs_alloc_obd_counters(struct obd_device *obddev,
+                               unsigned int num_private_counters)
+{
+        struct lprocfs_counters* obdops_cntrs;
+        unsigned int num_counters;
+        int rc, i;
+
+        LASSERT(obddev->counters == NULL);
+        LASSERT(obddev->obd_proc_entry != NULL);
+        LASSERT(obddev->cntr_base == 0);
+
+        num_counters = 1 + OBD_COUNTER_OFFSET(san_preprw)+num_private_counters;
+        obdops_cntrs = lprocfs_alloc_counters(num_counters);
+        if (!obdops_cntrs)
+                return -ENOMEM;
+
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, iocontrol);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, get_info);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, set_info);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, attach);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, detach);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, setup);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, cleanup);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, connect);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, disconnect);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, statfs);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, syncfs);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, packmd);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, unpackmd);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, preallocate);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, create);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, destroy);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, setattr);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, getattr);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, getattr_async);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, open);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, close);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, brw);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, brw_async);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, punch);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, sync);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, migrate);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, copy);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, iterate);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, preprw);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, commitrw);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, enqueue);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, match);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, cancel);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, cancel_unused);
+        LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, san_preprw);
+
+        for (i = num_private_counters; i < num_counters; i++) {
+                /* If this assertion fails, it is likely that an obd
+                 * operation was added to struct obd_ops in
+                 * <linux/obd.h>, and that the corresponding line item
+                 * LPROCFS_OBD_OP_INIT(.., .., opname)
+                 * is missing from the list above. */
+                LASSERT(obdops_cntrs->cntr[i].name != NULL);
+        }
+        rc = lprocfs_register_counters(obddev->obd_proc_entry, "obd_stats",
+                                       obdops_cntrs);
+        if (rc < 0) {
+                lprocfs_free_counters(obdops_cntrs);
+        } else {
+                obddev->counters  = obdops_cntrs;
+                obddev->cntr_base = num_private_counters;
+        }
+        return rc;
+}
+
+void lprocfs_free_obd_counters(struct obd_device *obddev)
+{
+        struct lprocfs_counters* obdops_cntrs = obddev->counters;
+        if (obdops_cntrs != NULL) {
+                obddev->counters = NULL;
+                lprocfs_free_counters(obdops_cntrs);
+        }
+}
+
 #endif /* LPROCFS*/
 
 EXPORT_SYMBOL(lprocfs_register);
@@ -325,6 +572,11 @@ EXPORT_SYMBOL(lprocfs_remove);
 EXPORT_SYMBOL(lprocfs_add_vars);
 EXPORT_SYMBOL(lprocfs_obd_attach);
 EXPORT_SYMBOL(lprocfs_obd_detach);
+EXPORT_SYMBOL(lprocfs_alloc_counters);
+EXPORT_SYMBOL(lprocfs_free_counters);
+EXPORT_SYMBOL(lprocfs_register_counters);
+EXPORT_SYMBOL(lprocfs_alloc_obd_counters);
+EXPORT_SYMBOL(lprocfs_free_obd_counters);
 
 EXPORT_SYMBOL(lprocfs_rd_u64);
 EXPORT_SYMBOL(lprocfs_rd_uuid);
index 01dd75b..06f86ad 100644 (file)
@@ -20,7 +20,7 @@
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_CLASS
 #ifdef __KERNEL__
 #include <linux/types.h>
 #include <linux/random.h>
@@ -146,7 +146,7 @@ static void cleanup_all_handles(void)
                         class_handle_unhash_nolock(h);
                 }
         }
-        spin_lock(&handle_lock);
+        spin_unlock(&handle_lock);
 }
 
 void class_handle_cleanup(void)
similarity index 83%
rename from lustre/lib/simple.c
rename to lustre/obdclass/simple.c
index c0d4f31..0ce54a3 100644 (file)
 
 #define DEBUG_SUBSYSTEM S_FILTER
 
-#include <linux/obd_support.h>
 #include <linux/obd.h>
-#include <linux/lustre_mds.h>
 #include <linux/lustre_lib.h>
-#include <linux/lustre_net.h>
 
-#ifdef OBD_CTXT_DEBUG
 /* Debugging check only needed during development */
-#define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
-#define ASSERT_NOT_KERNEL_CTXT(msg) LASSERT(!segment_eq(get_fs(), get_ds()))
-#define ASSERT_KERNEL_CTXT(msg) LASSERT(segment_eq(get_fs(), get_ds()))
+#ifdef OBD_CTXT_DEBUG
+# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC)
+# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERT(!segment_eq(get_fs(), get_ds()))
+# define ASSERT_KERNEL_CTXT(msg) LASSERT(segment_eq(get_fs(), get_ds()))
 #else
-#define ASSERT_CTXT_MAGIC(magic) do {} while(0)
-#define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
-#define ASSERT_KERNEL_CTXT(msg) do {} while(0)
+# define ASSERT_CTXT_MAGIC(magic) do {} while(0)
+# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0)
+# define ASSERT_KERNEL_CTXT(msg) do {} while(0)
 #endif
 
 /* push / pop to root of obd store */
@@ -70,6 +67,7 @@ void push_ctxt(struct obd_run_ctxt *save, struct obd_run_ctxt *new_ctx,
         LASSERT(atomic_read(&new_ctx->pwd->d_count));
         save->pwd = dget(current->fs->pwd);
         save->pwdmnt = mntget(current->fs->pwdmnt);
+        save->ngroups = current->ngroups;
 
         LASSERT(save->pwd);
         LASSERT(save->pwdmnt);
@@ -77,13 +75,17 @@ void push_ctxt(struct obd_run_ctxt *save, struct obd_run_ctxt *new_ctx,
         LASSERT(new_ctx->pwdmnt);
 
         if (uc) {
-                save->fsuid = current->fsuid;
-                save->fsgid = current->fsgid;
-                save->cap = current->cap_effective;
+                save->ouc.ouc_fsuid = current->fsuid;
+                save->ouc.ouc_fsgid = current->fsgid;
+                save->ouc.ouc_cap = current->cap_effective;
+                save->ouc.ouc_suppgid1 = current->groups[0];
+                save->ouc.ouc_suppgid2 = current->groups[1];
 
                 current->fsuid = uc->ouc_fsuid;
                 current->fsgid = uc->ouc_fsgid;
                 current->cap_effective = uc->ouc_cap;
+                current->ngroups = 0;
+
                 if (uc->ouc_suppgid1 != -1)
                         current->groups[current->ngroups++] = uc->ouc_suppgid1;
                 if (uc->ouc_suppgid2 != -1)
@@ -103,6 +105,7 @@ void push_ctxt(struct obd_run_ctxt *save, struct obd_run_ctxt *new_ctx,
                atomic_read(&current->fs->pwdmnt->mnt_count));
         */
 }
+EXPORT_SYMBOL(push_ctxt);
 
 void pop_ctxt(struct obd_run_ctxt *saved, struct obd_run_ctxt *new_ctx,
               struct obd_ucred *uc)
@@ -132,14 +135,13 @@ void pop_ctxt(struct obd_run_ctxt *saved, struct obd_run_ctxt *new_ctx,
         dput(saved->pwd);
         mntput(saved->pwdmnt);
         if (uc) {
-                current->fsuid = saved->fsuid;
-                current->fsgid = saved->fsgid;
-                current->cap_effective = saved->cap;
+                current->fsuid = saved->ouc.ouc_fsuid;
+                current->fsgid = saved->ouc.ouc_fsgid;
+                current->cap_effective = saved->ouc.ouc_cap;
+                current->ngroups = saved->ngroups;
 
-                if (uc->ouc_suppgid1 != -1)
-                        current->ngroups--;
-                if (uc->ouc_suppgid2 != -1)
-                        current->ngroups--;
+                current->groups[0] = saved->ouc.ouc_suppgid1;
+                current->groups[1] = saved->ouc.ouc_suppgid2;
         }
 
         /*
@@ -153,6 +155,7 @@ void pop_ctxt(struct obd_run_ctxt *saved, struct obd_run_ctxt *new_ctx,
                atomic_read(&current->fs->pwdmnt->mnt_count));
         */
 }
+EXPORT_SYMBOL(pop_ctxt);
 
 /* utility to make a file */
 struct dentry *simple_mknod(struct dentry *dir, char *name, int mode)
@@ -169,7 +172,7 @@ struct dentry *simple_mknod(struct dentry *dir, char *name, int mode)
                 GOTO(out_up, dchild);
 
         if (dchild->d_inode) {
-                if ((dchild->d_inode->i_mode & S_IFMT) != S_IFREG)
+                if (!S_ISREG(dchild->d_inode->i_mode))
                         GOTO(out_err, err = -EEXIST);
 
                 GOTO(out_up, dchild);
@@ -187,6 +190,7 @@ out_err:
 out_up:
         return dchild;
 }
+EXPORT_SYMBOL(simple_mknod);
 
 /* utility to make a directory */
 struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode)
@@ -220,25 +224,27 @@ out_err:
 out_up:
         return dchild;
 }
+EXPORT_SYMBOL(simple_mkdir);
 
 /*
  * Read a file from within kernel context.  Prior to calling this
  * function we should already have done a push_ctxt().
  */
-int lustre_fread(struct file *file, char *str, int len, loff_t *off)
+int lustre_fread(struct file *file, void *buf, int len, loff_t *off)
 {
         ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n");
         if (!file || !file->f_op || !file->f_op->read || !off)
                 RETURN(-ENOSYS);
 
-        return file->f_op->read(file, str, len, off);
+        return file->f_op->read(file, buf, len, off);
 }
+EXPORT_SYMBOL(lustre_fread);
 
 /*
  * Write a file from within kernel context.  Prior to calling this
  * function we should already have done a push_ctxt().
  */
-int lustre_fwrite(struct file *file, const char *str, int len, loff_t *off)
+int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off)
 {
         ENTRY;
         ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n");
@@ -252,8 +258,9 @@ int lustre_fwrite(struct file *file, const char *str, int len, loff_t *off)
         if (!file->f_op->write)
                 RETURN(-EROFS);
 
-        RETURN(file->f_op->write(file, str, len, off));
+        RETURN(file->f_op->write(file, buf, len, off));
 }
+EXPORT_SYMBOL(lustre_fwrite);
 
 /*
  * Sync a file from within kernel context.  Prior to calling this
@@ -268,3 +275,4 @@ int lustre_fsync(struct file *file)
 
         RETURN(file->f_op->fsync(file, file->f_dentry, 0));
 }
+EXPORT_SYMBOL(lustre_fsync);
index 1998ba3..1a5f6fa 100644 (file)
 #define EXPORT_SYMTAB
 #ifndef __KERNEL__
 #include <liblustre.h>
-#endif
-
+#else
 #include <linux/version.h>
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
 #include <asm/statfs.h>
 #endif
+#endif
 
 #include <linux/lustre_export.h>
 #include <linux/lustre_net.h>
 #include <linux/obd_support.h>
 #include <linux/obd_class.h>
 
-void obd_statfs_pack(struct obd_statfs *tgt, struct obd_statfs *src)
-{
-        tgt->os_type = HTON__u64(src->os_type);
-        tgt->os_blocks = HTON__u64(src->os_blocks);
-        tgt->os_bfree = HTON__u64(src->os_bfree);
-        tgt->os_bavail = HTON__u64(src->os_bavail);
-        tgt->os_files = HTON__u64(src->os_files);
-        tgt->os_ffree = HTON__u64(src->os_ffree);
-        tgt->os_bsize = HTON__u32(src->os_bsize);
-        tgt->os_namelen = HTON__u32(src->os_namelen);
-}
-
-void obd_statfs_unpack(struct obd_statfs *tgt, struct obd_statfs *src)
-{
-        obd_statfs_pack(tgt, src);
-}
-
 void statfs_pack(struct obd_statfs *osfs, struct statfs *sfs)
 {
         osfs->os_type = sfs->f_type;
@@ -89,27 +72,33 @@ int obd_self_statfs(struct obd_device *obd, struct statfs *sfs)
         int rc;
         ENTRY;
 
+        LASSERT( obd != NULL );
+
+        spin_lock(&obd->obd_dev_lock);
         if (list_empty(&obd->obd_exports)) {
+                spin_unlock(&obd->obd_dev_lock);
                 export = my_export = class_new_export(obd);
                 if (export == NULL)
                         RETURN(-ENOMEM);
-        } else
+        } else {
                 export = list_entry(obd->obd_exports.next, typeof(*export),
                                     exp_obd_chain);
-        conn.addr = (unsigned long)export;
-        conn.cookie = export->exp_cookie;
+                export = class_export_get(export);
+                spin_unlock(&obd->obd_dev_lock);
+        }
+        conn.cookie = export->exp_handle.h_cookie;
 
         rc = obd_statfs(&conn, &osfs);
         if (!rc)
                 statfs_unpack(sfs, &osfs);
 
         if (my_export)
-                class_destroy_export(my_export);
+                class_unlink_export(my_export);
+
+        class_export_put(export);
         RETURN(rc);
 }
 
-EXPORT_SYMBOL(obd_statfs_pack);
-EXPORT_SYMBOL(obd_statfs_unpack);
 EXPORT_SYMBOL(statfs_pack);
 EXPORT_SYMBOL(statfs_unpack);
 EXPORT_SYMBOL(obd_self_statfs);
index 125f392..3d68f2e 100644 (file)
@@ -75,7 +75,7 @@ static ctl_table obd_table[] = {
         {OBD_RESET, "reset", NULL, 0, 0644, NULL, &obd_sctl_reset},
         {OBD_TIMEOUT, "timeout", &obd_timeout, sizeof(int), 0644, NULL, &proc_dointvec},
         /* XXX need to lock so we avoid update races with the recovery upcall! */
-        {OBD_UPCALL, "recovery_upcall", obd_recovery_upcall, 128, 0644, NULL,
+        {OBD_UPCALL, "upcall", obd_lustre_upcall, 128, 0644, NULL,
          &proc_dostring, &sysctl_string },
         {OBD_SYNCFILTER, "filter_sync_on_commit", &obd_sync_filter, sizeof(int),
                 0644, NULL, &proc_dointvec},
index fed9a8f..9f103df 100644 (file)
 #define DEBUG_SUBSYSTEM S_CLASS
 
 #ifdef __KERNEL__
-#include <linux/ctype.h>
-#include <linux/kernel.h>
-#else 
-#include <liblustre.h>
+# include <linux/ctype.h>
+# include <linux/kernel.h>
+# include <linux/sched.h>
+# include <linux/smp_lock.h>
+#else
+# include <liblustre.h>
 #endif
 
 #include <linux/obd_support.h>
 #include <linux/obd_class.h>
-#include <linux/smp_lock.h>
+#include <linux/obd_ost.h>
 
 struct uuid {
        __u32   time_low;
@@ -138,3 +140,26 @@ void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out)
                uuid.node[0], uuid.node[1], uuid.node[2],
                uuid.node[3], uuid.node[4], uuid.node[5]);
 }
+
+struct obd_device *client_tgtuuid2obd(struct obd_uuid *tgtuuid)
+{
+        int i;
+
+        for (i = 0; i < MAX_OBD_DEVICES; i++) {
+                struct obd_device *obd = &obd_dev[i];
+                if (obd->obd_type == NULL)
+                        continue;
+                if ((strncmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME,
+                             sizeof LUSTRE_OSC_NAME) == 0) ||
+                    (strncmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME,
+                             sizeof LUSTRE_MDC_NAME) == 0)) {
+                        struct client_obd *cli = &obd->u.cli;
+                        struct obd_import *imp = cli->cl_import;
+                        if (strncmp(tgtuuid->uuid, imp->imp_target_uuid.uuid,
+                                    sizeof(imp->imp_target_uuid)) == 0)
+                                return obd;
+                }
+        }
+
+        return NULL;
+}
index f8ed503..08136d7 100644 (file)
@@ -17,4 +17,3 @@ obdecho_SOURCES = echo.c echo_client.c lproc_echo.c $(LINX)
 endif
 
 include $(top_srcdir)/Rules
-
index 1796957..1eaa282 100644 (file)
@@ -63,7 +63,7 @@ struct xprocfs_io_stat {
         __u64    st_create_reqs;
         __u64    st_destroy_reqs;
         __u64    st_statfs_reqs;
-        __u64    st_sync_reqs;
+        __u64    st_syncfs_reqs;
         __u64    st_open_reqs;
         __u64    st_close_reqs;
         __u64    st_punch_reqs;
@@ -77,6 +77,7 @@ do {                                                            \
         xprocfs_iostats[smp_processor_id()].field += (count);   \
 } while (0)
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 #define DECLARE_XPROCFS_SUM_STAT(field)                 \
 static long long                                        \
 xprocfs_sum_##field (void)                              \
@@ -88,7 +89,7 @@ xprocfs_sum_##field (void)                              \
                 stat += xprocfs_iostats[i].field;       \
         return (stat);                                  \
 }
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+
 DECLARE_XPROCFS_SUM_STAT (st_read_bytes)
 DECLARE_XPROCFS_SUM_STAT (st_read_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_write_bytes)
@@ -98,7 +99,7 @@ DECLARE_XPROCFS_SUM_STAT (st_setattr_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_create_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_destroy_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_statfs_reqs)
-DECLARE_XPROCFS_SUM_STAT (st_sync_reqs)
+DECLARE_XPROCFS_SUM_STAT (st_syncfs_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_open_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_close_reqs)
 DECLARE_XPROCFS_SUM_STAT (st_punch_reqs)
@@ -146,7 +147,7 @@ xprocfs_init (char *name)
 
         xprocfs_dir = proc_mkdir (dirname, NULL);
         if (xprocfs_dir == NULL) {
-                CERROR ("Can't make dir\n");
+                CERROR ("Can't make procfs dir %s\n", dirname);
                 return;
         }
 
@@ -160,7 +161,7 @@ xprocfs_init (char *name)
         xprocfs_add_stat ("create_reqs",  xprocfs_sum_st_create_reqs);
         xprocfs_add_stat ("destroy_reqs", xprocfs_sum_st_destroy_reqs);
         xprocfs_add_stat ("statfs_reqs",  xprocfs_sum_st_statfs_reqs);
-        xprocfs_add_stat ("sync_reqs",    xprocfs_sum_st_sync_reqs);
+        xprocfs_add_stat ("syncfs_reqs",  xprocfs_sum_st_syncfs_reqs);
         xprocfs_add_stat ("open_reqs",    xprocfs_sum_st_open_reqs);
         xprocfs_add_stat ("close_reqs",   xprocfs_sum_st_close_reqs);
         xprocfs_add_stat ("punch_reqs",   xprocfs_sum_st_punch_reqs);
@@ -181,7 +182,7 @@ void xprocfs_fini (void)
         remove_proc_entry ("create_reqs",  xprocfs_dir);
         remove_proc_entry ("destroy_reqs", xprocfs_dir);
         remove_proc_entry ("statfs_reqs",  xprocfs_dir);
-        remove_proc_entry ("sync_reqs",    xprocfs_dir);
+        remove_proc_entry ("syncfs_reqs",  xprocfs_dir);
         remove_proc_entry ("open_reqs",    xprocfs_dir);
         remove_proc_entry ("close_reqs",   xprocfs_dir);
         remove_proc_entry ("punch_reqs",   xprocfs_dir);
@@ -191,20 +192,20 @@ void xprocfs_fini (void)
 }
 
 static int echo_connect(struct lustre_handle *conn, struct obd_device *obd,
-                        struct obd_uuid *cluuid, struct recovd_obd *recovd,
-                        ptlrpc_recovery_cb_t recover)
+                        struct obd_uuid *cluuid)
 {
         return class_connect(conn, obd, cluuid);
 }
 
-static int echo_disconnect(struct lustre_handle *conn)
+static int echo_disconnect(struct lustre_handle *conn, int failover)
 {
         struct obd_export *exp = class_conn2export(conn);
 
         LASSERT (exp != NULL);
 
-        ldlm_cancel_locks_for_export (exp);
-        return (class_disconnect (conn));
+        ldlm_cancel_locks_for_export(exp);
+        class_export_put(exp);
+        return (class_disconnect(conn, failover));
 }
 
 static __u64 echo_next_id(struct obd_device *obddev)
@@ -226,7 +227,7 @@ int echo_create(struct lustre_handle *conn, struct obdo *oa,
         XPROCFS_BUMP_MYCPU_IOSTAT (st_create_reqs, 1);
 
         if (!obd) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+                CERROR("invalid client cookie "LPX64"\n", conn->cookie);
                 return -EINVAL;
         }
 
@@ -255,7 +256,7 @@ int echo_destroy(struct lustre_handle *conn, struct obdo *oa,
         XPROCFS_BUMP_MYCPU_IOSTAT (st_destroy_reqs, 1);
 
         if (!obd) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+                CERROR("invalid client cookie "LPX64"\n", conn->cookie);
                 RETURN(-EINVAL);
         }
 
@@ -275,7 +276,8 @@ int echo_destroy(struct lustre_handle *conn, struct obdo *oa,
 }
 
 static int echo_open(struct lustre_handle *conn, struct obdo *oa,
-                     struct lov_stripe_md *md, struct obd_trans_info *oti)
+                     struct lov_stripe_md *md, struct obd_trans_info *oti,
+                     struct obd_client_handle *och)
 {
         struct lustre_handle *fh = obdo_handle (oa);
         struct obd_device    *obd = class_conn2obd (conn);
@@ -283,7 +285,7 @@ static int echo_open(struct lustre_handle *conn, struct obdo *oa,
         XPROCFS_BUMP_MYCPU_IOSTAT (st_open_reqs, 1);
 
         if (!obd) {
-                CERROR ("invalid client "LPX64"\n", conn->addr);
+                CERROR("invalid client cookie "LPX64"\n", conn->cookie);
                 return (-EINVAL);
         }
 
@@ -292,7 +294,6 @@ static int echo_open(struct lustre_handle *conn, struct obdo *oa,
                 return (-EINVAL);
         }
 
-        fh->addr = oa->o_id;
         fh->cookie = ECHO_HANDLE_MAGIC;
 
         oa->o_valid |= OBD_MD_FLHANDLE;
@@ -308,7 +309,7 @@ static int echo_close(struct lustre_handle *conn, struct obdo *oa,
         XPROCFS_BUMP_MYCPU_IOSTAT (st_close_reqs, 1);
 
         if (!obd) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+                CERROR("invalid client cookie "LPX64"\n", conn->cookie);
                 return (-EINVAL);
         }
 
@@ -334,7 +335,7 @@ static int echo_getattr(struct lustre_handle *conn, struct obdo *oa,
         XPROCFS_BUMP_MYCPU_IOSTAT (st_getattr_reqs, 1);
 
         if (!obd) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+                CERROR("invalid client cookie "LPX64"\n", conn->cookie);
                 RETURN(-EINVAL);
         }
 
@@ -357,7 +358,7 @@ static int echo_setattr(struct lustre_handle *conn, struct obdo *oa,
         XPROCFS_BUMP_MYCPU_IOSTAT (st_setattr_reqs, 1);
 
         if (!obd) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+                CERROR("invalid client cookie "LPX64"\n", conn->cookie);
                 RETURN(-EINVAL);
         }
 
@@ -376,7 +377,7 @@ static int echo_setattr(struct lustre_handle *conn, struct obdo *oa,
 /* This allows us to verify that desc_private is passed unmolested */
 #define DESC_PRIV 0x10293847
 
-int echo_preprw(int cmd, struct lustre_handle *conn, int objcount,
+int echo_preprw(int cmd, struct obd_export *export, int objcount,
                 struct obd_ioobj *obj, int niocount, struct niobuf_remote *nb,
                 struct niobuf_local *res, void **desc_private,
                 struct obd_trans_info *oti)
@@ -392,11 +393,9 @@ int echo_preprw(int cmd, struct lustre_handle *conn, int objcount,
         else
                 XPROCFS_BUMP_MYCPU_IOSTAT (st_read_reqs, 1);
 
-        obd = class_conn2obd(conn);
-        if (!obd) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+        obd = export->exp_obd;
+        if (obd == NULL)
                 RETURN(-EINVAL);
-        }
 
         memset(res, 0, sizeof(*res) * niocount);
 
@@ -405,8 +404,6 @@ int echo_preprw(int cmd, struct lustre_handle *conn, int objcount,
 
         *desc_private = (void *)DESC_PRIV;
 
-        obd_kmap_get(niocount, 1);
-
         for (i = 0; i < objcount; i++, obj++) {
                 int gfp_mask = (obj->ioo_id & 1) ? GFP_HIGHUSER : GFP_KERNEL;
                 int isobj0 = obj->ioo_id == 0;
@@ -434,24 +431,30 @@ int echo_preprw(int cmd, struct lustre_handle *conn, int objcount,
                         atomic_inc(&obd->u.echo.eo_prep);
 
                         r->offset = nb->offset;
-                        r->addr = kmap(r->page);
                         r->len = nb->len;
+                        LASSERT ((r->offset & (PAGE_SIZE - 1)) + r->len <= PAGE_SIZE);
 
-                        CDEBUG(D_PAGE, "$$$$ get page %p, addr %p@"LPU64"\n",
-                               r->page, r->addr, r->offset);
+                        CDEBUG(D_PAGE, "$$$$ get page %p @ "LPU64" for %d\n",
+                               r->page, r->offset, r->len);
 
                         if (cmd == OBD_BRW_READ) {
+                                r->rc = r->len;
                                 XPROCFS_BUMP_MYCPU_IOSTAT(st_read_bytes,r->len);
-                                if (verify)
-                                        page_debug_setup(r->addr, r->len,
+                                if (verify) {
+                                        page_debug_setup(kmap (r->page), r->len,
                                                          r->offset,obj->ioo_id);
+                                        kunmap (r->page);
+                                }
+                                r->rc = r->len;
                         } else {
                                 XPROCFS_BUMP_MYCPU_IOSTAT(st_write_bytes,
                                                           r->len);
-                                if (verify)
-                                        page_debug_setup(r->addr, r->len,
+                                if (verify) {
+                                        page_debug_setup(kmap (r->page), r->len,
                                                          0xecc0ecc0ecc0ecc0,
                                                          0xecc0ecc0ecc0ecc0);
+                                        kunmap (r->page);
+                                }
                         }
                 }
         }
@@ -474,28 +477,23 @@ preprw_cleanup:
                 __free_pages(r->page, 0);
                 atomic_dec(&obd->u.echo.eo_prep);
         }
-        obd_kmap_put(niocount);
         memset(res, 0, sizeof(*res) * niocount);
 
         return rc;
 }
 
-int echo_commitrw(int cmd, struct lustre_handle *conn, int objcount,
+int echo_commitrw(int cmd, struct obd_export *export, int objcount,
                   struct obd_ioobj *obj, int niocount, struct niobuf_local *res,
                   void *desc_private, struct obd_trans_info *oti)
 {
         struct obd_device *obd;
         struct niobuf_local *r = res;
-        int rc = 0;
-        int vrc = 0;
-        int i;
+        int i, vrc = 0, rc = 0;
         ENTRY;
 
-        obd = class_conn2obd(conn);
-        if (!obd) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+        obd = export->exp_obd;
+        if (obd == NULL)
                 RETURN(-EINVAL);
-        }
 
         if ((cmd & OBD_BRW_RWMASK) == OBD_BRW_READ) {
                 CDEBUG(D_PAGE, "reading %d obdos with %d IOs\n",
@@ -520,11 +518,14 @@ int echo_commitrw(int cmd, struct lustre_handle *conn, int objcount,
                         struct page *page = r->page;
                         void *addr;
 
+                        kmap (page);
+                        
                         if (!page || !(addr = page_address(page)) ||
                             !kern_addr_valid(addr)) {
 
                                 CERROR("bad page objid "LPU64":%p, buf %d/%d\n",
                                        obj->ioo_id, page, j, obj->ioo_bufcnt);
+                                kunmap (page);
                                 GOTO(commitrw_cleanup, rc = -EFAULT);
                         }
 
@@ -541,7 +542,6 @@ int echo_commitrw(int cmd, struct lustre_handle *conn, int objcount,
 
                         kunmap(page);
                         /* NB see comment above regarding object0 pages */
-                        obd_kmap_put(1);
                         __free_pages(page, 0);
                         atomic_dec(&obd->u.echo.eo_prep);
                 }
@@ -556,8 +556,6 @@ commitrw_cleanup:
         while (++r < res + niocount) {
                 struct page *page = r->page;
 
-                kunmap(page);
-                obd_kmap_put(1);
                 /* NB see comment above regarding object0 pages */
                 __free_pages(page, 0);
                 atomic_dec(&obd->u.echo.eo_prep);
@@ -584,7 +582,7 @@ static int echo_setup(struct obd_device *obddev, obd_count len, void *buf)
         RETURN(0);
 }
 
-static int echo_cleanup(struct obd_device *obddev)
+static int echo_cleanup(struct obd_device *obddev, int force, int failover)
 {
         ENTRY;
 
index 2239762..31f7334 100644 (file)
@@ -355,8 +355,14 @@ echo_put_object (struct ec_object *eco)
         eco->eco_refcount--;
         LASSERT (eco->eco_refcount >= 0);
 
-        if (eco->eco_refcount != 0 ||
-            !eco->eco_deleted) {
+        CDEBUG(D_INFO, "put %p: "LPX64"=%u#%u&%d refs %d del %d\n",
+               eco, eco->eco_id,
+               eco->eco_lsm->lsm_stripe_size,
+               eco->eco_lsm->lsm_stripe_count,
+               eco->eco_lsm->lsm_stripe_offset,
+               eco->eco_refcount, eco->eco_deleted);
+
+        if (eco->eco_refcount != 0 || !eco->eco_deleted) {
                 spin_unlock (&ec->ec_lock);
                 return;
         }
@@ -367,7 +373,7 @@ echo_put_object (struct ec_object *eco)
          * attempting to enqueue on this object number until we can be
          * sure there will be no more lock callbacks.
          */
-        obd_cancel_unused (&ec->ec_conn, eco->eco_lsm, 0);
+        obd_cancel_unused(&ec->ec_conn, eco->eco_lsm, 0, NULL);
 
         /* now we can let it go */
         spin_lock (&ec->ec_lock);
@@ -414,7 +420,6 @@ echo_client_kbrw (struct obd_device *obd, int rw,
                   obd_off offset, obd_size count)
 {
         struct echo_client_obd *ec = &obd->u.echo_client;
-        struct obd_brw_set     *set;
         obd_count               npages;
         struct brw_page        *pga;
         struct brw_page        *pgp;
@@ -438,17 +443,12 @@ echo_client_kbrw (struct obd_device *obd, int rw,
              lsm->lsm_object_id != oa->o_id))
                 return (-EINVAL);
 
-        set = obd_brw_set_new();
-        if (set == NULL)
-                return (-ENOMEM);
-
         /* XXX think again with misaligned I/O */
         npages = count >> PAGE_SHIFT;
 
-        rc = -ENOMEM;
         OBD_ALLOC(pga, npages * sizeof(*pga));
         if (pga == NULL)
-                goto out_0;
+                return (-ENOMEM);
 
         for (i = 0, pgp = pga, off = offset;
              i < npages;
@@ -459,7 +459,7 @@ echo_client_kbrw (struct obd_device *obd, int rw,
                 rc = -ENOMEM;
                 pgp->pg = alloc_pages (gfp_mask, 0);
                 if (pgp->pg == NULL)
-                        goto out_1;
+                        goto out;
 
                 pgp->count = PAGE_SIZE;
                 pgp->off = off;
@@ -484,12 +484,9 @@ echo_client_kbrw (struct obd_device *obd, int rw,
                 }
         }
 
-        set->brw_callback = ll_brw_sync_wait;
-        rc = obd_brw(rw, &ec->ec_conn, lsm, npages, pga, set, NULL);
-        if (rc == 0)
-                rc = ll_brw_sync_wait(set, CB_PHASE_START);
+        rc = obd_brw(rw, &ec->ec_conn, lsm, npages, pga, NULL);
 
- out_1:
+ out:
         if (rc != 0)
                 verify = 0;
 
@@ -514,8 +511,6 @@ echo_client_kbrw (struct obd_device *obd, int rw,
                 __free_pages(pgp->pg, 0);
         }
         OBD_FREE(pga, npages * sizeof(*pga));
- out_0:
-        obd_brw_set_decref(set);
         return (rc);
 }
 
@@ -526,7 +521,6 @@ static int echo_client_ubrw(struct obd_device *obd, int rw,
                             obd_off offset, obd_size count, char *buffer)
 {
         struct echo_client_obd *ec = &obd->u.echo_client;
-        struct obd_brw_set     *set;
         obd_count               npages;
         struct brw_page        *pga;
         struct brw_page        *pgp;
@@ -546,17 +540,12 @@ static int echo_client_ubrw(struct obd_device *obd, int rw,
             (lsm != NULL && lsm->lsm_object_id != oa->o_id))
                 return (-EINVAL);
 
-        set = obd_brw_set_new();
-        if (set == NULL)
-                return (-ENOMEM);
-
         /* XXX think again with misaligned I/O */
         npages = count >> PAGE_SHIFT;
 
-        rc = -ENOMEM;
         OBD_ALLOC(pga, npages * sizeof(*pga));
         if (pga == NULL)
-                goto out_0;
+                return (-ENOMEM);
 
         rc = alloc_kiovec (1, &kiobuf);
         if (rc != 0)
@@ -579,11 +568,7 @@ static int echo_client_ubrw(struct obd_device *obd, int rw,
                 pgp->flag = 0;
         }
 
-        set->brw_callback = ll_brw_sync_wait;
-        rc = obd_brw(rw, &ec->ec_conn, lsm, npages, pga, set, NULL);
-
-        if (rc == 0)
-                rc = ll_brw_sync_wait(set, CB_PHASE_START);
+        rc = obd_brw(rw, &ec->ec_conn, lsm, npages, pga, NULL);
 
         //        if (rw == OBD_BRW_READ)
         //                mark_dirty_kiobuf (kiobuf, count);
@@ -593,8 +578,6 @@ static int echo_client_ubrw(struct obd_device *obd, int rw,
         free_kiovec (1, &kiobuf);
  out_1:
         OBD_FREE(pga, npages * sizeof(*pga));
- out_0:
-        obd_brw_set_decref(set);
         return (rc);
 }
 #else
@@ -620,14 +603,14 @@ echo_open (struct obd_export *exp, struct obdo *oa)
 
         rc = echo_get_object (&eco, obd, oa);
         if (rc != 0)
-                return (rc);
+                return rc;
 
         rc = -ENOMEM;
         OBD_ALLOC (ecoo, sizeof (*ecoo));
         if (ecoo == NULL)
                 goto failed_0;
 
-        rc = obd_open (&ec->ec_conn, oa, eco->eco_lsm, NULL);
+        rc = obd_open(&ec->ec_conn, oa, eco->eco_lsm, NULL, &ecoo->ecoo_och);
         if (rc != 0)
                 goto failed_1;
 
@@ -638,12 +621,9 @@ echo_open (struct obd_export *exp, struct obdo *oa)
         spin_lock (&ec->ec_lock);
 
         list_add (&ecoo->ecoo_exp_chain, &exp->exp_ec_data.eced_open_head);
-
-        ufh->addr = (__u64)((long) ecoo);
         ufh->cookie = ecoo->ecoo_cookie = ec->ec_unique++;
-
         spin_unlock (&ec->ec_lock);
-        return (0);
+        return 0;
 
  failed_1:
         OBD_FREE (ecoo, sizeof (*ecoo));
@@ -664,24 +644,23 @@ echo_close (struct obd_export *exp, struct obdo *oa)
         int                     rc;
 
         if ((oa->o_valid & OBD_MD_FLHANDLE) == 0)
-                return (-EINVAL);
+                return -EINVAL;
 
         spin_lock (&ec->ec_lock);
 
         list_for_each (el, &exp->exp_ec_data.eced_open_head) {
                 ecoo = list_entry (el, struct ec_open_object, ecoo_exp_chain);
-                if ((__u64)((long)ecoo) == ufh->addr) {
-                        found = (ecoo->ecoo_cookie == ufh->cookie);
-                        if (found)
-                                list_del (&ecoo->ecoo_exp_chain);
+                found = (ecoo->ecoo_cookie == ufh->cookie);
+                if (found) {
+                        list_del (&ecoo->ecoo_exp_chain);
                         break;
                 }
         }
 
         spin_unlock (&ec->ec_lock);
 
-        if (!found)
-                return (-EINVAL);
+        memcpy(&ecoo->ecoo_oa.o_inline, &ecoo->ecoo_och, FD_OSTDATA_SIZE);
+        ecoo->ecoo_oa.o_valid |= OBD_MD_FLHANDLE;
 
         rc = obd_close (&ec->ec_conn, &ecoo->ecoo_oa,
                         ecoo->ecoo_object->eco_lsm, NULL);
@@ -718,16 +697,16 @@ echo_ldlm_callback (struct ldlm_lock *lock, struct ldlm_lock_desc *new,
 
         switch (flag) {
         case LDLM_CB_BLOCKING:
-                CDEBUG (D_INFO, "blocking callback on "LPX64", handle "LPX64"."
-                        LPX64"\n", eco->eco_id, lockh.addr, lockh.cookie);
+                CDEBUG(D_INFO, "blocking callback on "LPX64", handle "LPX64"\n",
+                       eco->eco_id, lockh.cookie);
                 rc = ldlm_cli_cancel (&lockh);
                 if (rc != ELDLM_OK)
                         CERROR ("ldlm_cli_cancel failed: %d\n", rc);
                 break;
 
         case LDLM_CB_CANCELING:
-                CDEBUG (D_INFO, "canceling callback on "LPX64", handle "LPX64"."
-                        LPX64"\n", eco->eco_id, lockh.addr, lockh.cookie);
+                CDEBUG(D_INFO, "cancel callback on "LPX64", handle "LPX64"\n",
+                       eco->eco_id, lockh.cookie);
                 break;
 
         default:
@@ -750,15 +729,15 @@ echo_enqueue (struct obd_export *exp, struct obdo *oa,
         int                     rc;
 
         if (!(mode == LCK_PR || mode == LCK_PW))
-                return (-EINVAL);
+                return -EINVAL;
 
         if ((offset & (PAGE_SIZE - 1)) != 0 ||
             (nob & (PAGE_SIZE - 1)) != 0)
-                return (-EINVAL);
+                return -EINVAL;
 
         rc = echo_get_object (&eco, obd, oa);
         if (rc != 0)
-                return (rc);
+                return rc;
 
         rc = -ENOMEM;
         OBD_ALLOC (ecl, sizeof (*ecl));
@@ -768,32 +747,28 @@ echo_enqueue (struct obd_export *exp, struct obdo *oa,
         ecl->ecl_mode = mode;
         ecl->ecl_object = eco;
         ecl->ecl_extent.start = offset;
-        ecl->ecl_extent.end = (nob == 0) ? ((obd_off)-1) : (offset + nob - 1);
+        ecl->ecl_extent.end = (nob == 0) ? ((obd_off) -1) : (offset + nob - 1);
 
         flags = 0;
         rc = obd_enqueue (&ec->ec_conn, eco->eco_lsm, NULL, LDLM_EXTENT,
                           &ecl->ecl_extent,sizeof(ecl->ecl_extent), mode,
                           &flags, echo_ldlm_callback, eco, sizeof (*eco),
-                          &ecl->ecl_handle);
+                          &ecl->ecl_lock_handle);
         if (rc != 0)
                 goto failed_1;
 
-        CDEBUG (D_INFO, "enqueue handle "LPX64"."LPX64"\n",
-                ecl->ecl_handle.addr, ecl->ecl_handle.cookie);
+        CDEBUG(D_INFO, "enqueue handle "LPX64"\n", ecl->ecl_lock_handle.cookie);
 
         /* NB ecl takes object ref from echo_get_object() above */
+        spin_lock(&ec->ec_lock);
 
-        spin_lock (&ec->ec_lock);
-
-        list_add (&ecl->ecl_exp_chain, &exp->exp_ec_data.eced_locks);
-
-        ulh->addr = (__u64)((long)ecl);
+        list_add(&ecl->ecl_exp_chain, &exp->exp_ec_data.eced_locks);
         ulh->cookie = ecl->ecl_cookie = ec->ec_unique++;
 
-        spin_unlock (&ec->ec_lock);
+        spin_unlock(&ec->ec_lock);
 
         oa->o_valid |= OBD_MD_FLHANDLE;
-        return (0);
+        return 0;
 
  failed_1:
         OBD_FREE (ecl, sizeof (*ecl));
@@ -814,17 +789,15 @@ echo_cancel (struct obd_export *exp, struct obdo *oa)
         int                     rc;
 
         if ((oa->o_valid & OBD_MD_FLHANDLE) == 0)
-                return (-EINVAL);
+                return -EINVAL;
 
         spin_lock (&ec->ec_lock);
 
         list_for_each (el, &exp->exp_ec_data.eced_locks) {
                 ecl = list_entry (el, struct ec_lock, ecl_exp_chain);
-
-                if ((__u64)((long)ecl) == ulh->addr) {
-                        found = (ecl->ecl_cookie == ulh->cookie);
-                        if (found)
-                                list_del (&ecl->ecl_exp_chain);
+                found = (ecl->ecl_cookie == ulh->cookie);
+                if (found) {
+                        list_del (&ecl->ecl_exp_chain);
                         break;
                 }
         }
@@ -834,15 +807,13 @@ echo_cancel (struct obd_export *exp, struct obdo *oa)
         if (!found)
                 return (-ENOENT);
 
-        rc = obd_cancel (&ec->ec_conn,
-                         ecl->ecl_object->eco_lsm,
-                         ecl->ecl_mode,
-                         &ecl->ecl_handle);
+        rc = obd_cancel(&ec->ec_conn, ecl->ecl_object->eco_lsm, ecl->ecl_mode,
+                        &ecl->ecl_lock_handle);
 
         echo_put_object (ecl->ecl_object);
         OBD_FREE (ecl, sizeof (*ecl));
 
-        return (rc);
+        return rc;
 }
 
 static int echo_iocontrol(unsigned int cmd, struct lustre_handle *obdconn,
@@ -987,8 +958,10 @@ static int echo_iocontrol(unsigned int cmd, struct lustre_handle *obdconn,
                 GOTO (out, rc = -ENOTTY);
         }
 
+        EXIT;
  out:
-        RETURN(rc);
+        class_export_put(exp);
+        return rc;
 }
 
 static int echo_setup(struct obd_device *obddev, obd_count len, void *buf)
@@ -1013,8 +986,7 @@ static int echo_setup(struct obd_device *obddev, obd_count len, void *buf)
 
         obd_str2uuid(&uuid, data->ioc_inlbuf1);
         tgt = class_uuid2obd(&uuid);
-        if (!tgt || !(tgt->obd_flags & OBD_ATTACHED) ||
-            !(tgt->obd_flags & OBD_SET_UP)) {
+        if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) {
                 CERROR("device not attached or not set up (%d)\n",
                        data->ioc_dev);
                 RETURN(rc = -EINVAL);
@@ -1024,7 +996,7 @@ static int echo_setup(struct obd_device *obddev, obd_count len, void *buf)
         INIT_LIST_HEAD (&ec->ec_objects);
         ec->ec_unique = 0;
 
-        rc = obd_connect(&ec->ec_conn, tgt, &echo_uuid, NULL, NULL);
+        rc = obd_connect(&ec->ec_conn, tgt, &echo_uuid);
         if (rc) {
                 CERROR("fail to connect to device %d\n", data->ioc_dev);
                 return (rc);
@@ -1033,7 +1005,7 @@ static int echo_setup(struct obd_device *obddev, obd_count len, void *buf)
         ec->ec_lsmsize = obd_alloc_memmd (&ec->ec_conn, &lsm);
         if (ec->ec_lsmsize < 0) {
                 CERROR ("Can't get # stripes: %d\n", rc);
-                obd_disconnect (&ec->ec_conn);
+                obd_disconnect (&ec->ec_conn, 0);
                 rc = ec->ec_lsmsize;
         } else {
                 ec->ec_nstripes = lsm->lsm_stripe_count;
@@ -1043,7 +1015,7 @@ static int echo_setup(struct obd_device *obddev, obd_count len, void *buf)
         RETURN(rc);
 }
 
-static int echo_cleanup(struct obd_device * obddev)
+static int echo_cleanup(struct obd_device * obddev, int force, int failover)
 {
         struct list_head       *el;
         struct ec_object       *eco;
@@ -1067,7 +1039,7 @@ static int echo_cleanup(struct obd_device * obddev)
                 echo_put_object (eco);
         }
 
-        rc = obd_disconnect (&ec->ec_conn);
+        rc = obd_disconnect (&ec->ec_conn, 0);
         if (rc != 0)
                 CERROR("fail to disconnect device: %d\n", rc);
 
@@ -1075,8 +1047,7 @@ static int echo_cleanup(struct obd_device * obddev)
 }
 
 static int echo_connect(struct lustre_handle *conn, struct obd_device *src,
-                        struct obd_uuid *cluuid, struct recovd_obd *recovd,
-                        ptlrpc_recovery_cb_t recover)
+                        struct obd_uuid *cluuid)
 {
         struct obd_export *exp;
         int                rc;
@@ -1084,14 +1055,15 @@ static int echo_connect(struct lustre_handle *conn, struct obd_device *src,
         rc = class_connect(conn, src, cluuid);
         if (rc == 0) {
                 exp = class_conn2export (conn);
-                INIT_LIST_HEAD (&exp->exp_ec_data.eced_open_head);
-                INIT_LIST_HEAD (&exp->exp_ec_data.eced_locks);
+                INIT_LIST_HEAD(&exp->exp_ec_data.eced_open_head);
+                INIT_LIST_HEAD(&exp->exp_ec_data.eced_locks);
+                class_export_put(exp);
         }
 
         RETURN (rc);
 }
 
-static int echo_disconnect(struct lustre_handle *conn)
+static int echo_disconnect(struct lustre_handle *conn, int failover)
 {
         struct obd_export      *exp = class_conn2export (conn);
         struct obd_device      *obd;
@@ -1101,7 +1073,7 @@ static int echo_disconnect(struct lustre_handle *conn)
         int                     rc;
 
         if (exp == NULL)
-                return (-EINVAL);
+                GOTO(out, rc = -EINVAL);
 
         obd = exp->exp_obd;
         ec = &obd->u.echo_client;
@@ -1113,9 +1085,9 @@ static int echo_disconnect(struct lustre_handle *conn)
                 list_del (&ecl->ecl_exp_chain);
 
                 rc = obd_cancel (&ec->ec_conn, ecl->ecl_object->eco_lsm,
-                                 ecl->ecl_mode, &ecl->ecl_handle);
+                                 ecl->ecl_mode, &ecl->ecl_lock_handle);
 
-                CERROR ("Cancel lock on object "LPX64" on disconnect (%d)\n",
+                CDEBUG (D_INFO, "Cancel lock on object "LPX64" on disconnect (%d)\n",
                         ecl->ecl_object->eco_id, rc);
 
                 echo_put_object (ecl->ecl_object);
@@ -1128,6 +1100,10 @@ static int echo_disconnect(struct lustre_handle *conn)
                                    struct ec_open_object, ecoo_exp_chain);
                 list_del (&ecoo->ecoo_exp_chain);
 
+                memcpy (&ecoo->ecoo_oa.o_inline, &ecoo->ecoo_och, 
+                        FD_OSTDATA_SIZE);
+                ecoo->ecoo_oa.o_valid |= OBD_MD_FLHANDLE;
+                
                 rc = obd_close (&ec->ec_conn, &ecoo->ecoo_oa,
                                 ecoo->ecoo_object->eco_lsm, NULL);
 
@@ -1138,8 +1114,11 @@ static int echo_disconnect(struct lustre_handle *conn)
                 OBD_FREE (ecoo, sizeof (*ecoo));
         }
 
-        rc = class_disconnect (conn);
-        RETURN (rc);
+        rc = class_disconnect (conn, 0);
+        GOTO(out, rc);
+ out:
+        class_export_put(exp);
+        return rc;
 }
 
 static struct obd_ops echo_obd_ops = {
index 4e4e8b1..b9addf1 100644 (file)
@@ -6,16 +6,6 @@
 MODULE = obdfilter
 modulefs_DATA = obdfilter.o
 EXTRA_PROGRAMS = obdfilter
-
-LINX=simple.c
-simple.c:
-       test -e simple.c || ln -sf $(top_srcdir)/lib/simple.c
-
-FILTERC = filter.c lproc_obdfilter.c
-obdfilter_SOURCES = $(FILTERC) $(LINX)
-
-dist-hook:
-       list='$(LINX)'; for f in $$list; do rm -f $(distdir)/$$f; done
+obdfilter_SOURCES = filter.c lproc_obdfilter.c
 
 include $(top_srcdir)/Rules
-
index 0632af0..21d05ef 100644 (file)
 #include <linux/mount.h>
 #endif
 
-static kmem_cache_t *filter_open_cache;
-static kmem_cache_t *filter_dentry_cache;
+enum {
+        LPROC_FILTER_READS = 0,
+        LPROC_FILTER_READ_BYTES = 1,
+        LPROC_FILTER_WRITES = 2,
+        LPROC_FILTER_WRITE_BYTES = 3,
+        LPROC_FILTER_LAST = LPROC_FILTER_WRITE_BYTES +1
+};
 
 /* should be generic per-obd stats... */
 struct xprocfs_io_stat {
@@ -149,9 +154,9 @@ xprocfs_init (char *name)
 
         snprintf (dirname, sizeof (dirname), "sys/%s", name);
 
-        xprocfs_dir = proc_mkdir ("sys/obdfilter", NULL);
+        xprocfs_dir = proc_mkdir (dirname, NULL);
         if (xprocfs_dir == NULL) {
-                CERROR ("Can't make dir\n");
+                CERROR ("Can't make procfs dir %s\n", dirname);
                 return;
         }
 
@@ -212,25 +217,66 @@ static inline const char *obd_mode_to_type(int mode)
         return obd_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
 }
 
-static void filter_last_rcvd_cb(struct obd_device *obd, __u64 last_rcvd,
-                                int error)
+static void filter_ffd_addref(void *ffdp)
+{
+        struct filter_file_data *ffd = ffdp;
+
+        atomic_inc(&ffd->ffd_refcount);
+        CDEBUG(D_INFO, "GETting ffd %p : new refcount %d\n", ffd,
+               atomic_read(&ffd->ffd_refcount));
+}
+
+static struct filter_file_data *filter_ffd_new(void)
 {
-        CDEBUG(D_HA, "got callback for last_rcvd "LPD64": rc = %d\n",
-               last_rcvd, error);
-        if (!error && last_rcvd > obd->obd_last_committed)
-                obd->obd_last_committed = last_rcvd;
+        struct filter_file_data *ffd;
+
+        OBD_ALLOC(ffd, sizeof *ffd);
+        if (ffd == NULL) {
+                CERROR("out of memory\n");
+                return NULL;
+        }
+
+        atomic_set(&ffd->ffd_refcount, 2);
+
+        INIT_LIST_HEAD(&ffd->ffd_handle.h_link);
+        class_handle_hash(&ffd->ffd_handle, filter_ffd_addref);
+
+        return ffd;
 }
 
-void filter_start_transno(struct obd_export *export)
+static struct filter_file_data *filter_handle2ffd(struct lustre_handle *handle)
 {
-#ifdef FILTER_TRANSNO_SEM
-        struct obd_device * obd = export->exp_obd;
+        struct filter_file_data *ffd = NULL;
         ENTRY;
+        LASSERT(handle != NULL);
+        ffd = class_handle2object(handle->cookie);
+        if (ffd != NULL)
+                LASSERT(ffd->ffd_file->private_data == ffd);
+        RETURN(ffd);
+}
 
-        down(&obd->u.filter.fo_transno_sem);
-#endif
+static void filter_ffd_put(struct filter_file_data *ffd)
+{
+        CDEBUG(D_INFO, "PUTting ffd %p : new refcount %d\n", ffd,
+               atomic_read(&ffd->ffd_refcount) - 1);
+        LASSERT(atomic_read(&ffd->ffd_refcount) > 0 &&
+                atomic_read(&ffd->ffd_refcount) < 0x5a5a);
+        if (atomic_dec_and_test(&ffd->ffd_refcount)) {
+                LASSERT(list_empty(&ffd->ffd_handle.h_link));
+                OBD_FREE(ffd, sizeof *ffd);
+        }
 }
 
+static void filter_ffd_destroy(struct filter_file_data *ffd)
+{
+        class_handle_unhash(&ffd->ffd_handle);
+        filter_ffd_put(ffd);
+}
+
+static void filter_commit_cb(struct obd_device *obd, __u64 transno, int error)
+{
+        obd_transno_commit_cb(obd, transno, error);
+}
 /* Assumes caller has already pushed us into the kernel context. */
 int filter_finish_transno(struct obd_export *export, void *handle,
                           struct obd_trans_info *oti, int rc)
@@ -244,16 +290,11 @@ int filter_finish_transno(struct obd_export *export, void *handle,
         ssize_t written;
 
         /* Propagate error code. */
-        if (rc) {
-#ifdef FILTER_TRANSNO_SEM
-                up(&filter->fo_transno_sem);
-#endif
+        if (rc)
                 RETURN(rc);
-        }
 
-        if (!(obd->obd_flags & OBD_REPLAYABLE)) {
-                RETURN(0);
-        }
+        if (!obd->obd_replayable)
+                RETURN(rc);
 
         /* we don't allocate new transnos for replayed requests */
 #if 0
@@ -264,14 +305,10 @@ int filter_finish_transno(struct obd_export *export, void *handle,
 
         off = fed->fed_lr_off;
 
-#ifndef FILTER_TRANSNO_SEM
         spin_lock(&filter->fo_translock);
-#endif
         last_rcvd = le64_to_cpu(filter->fo_fsd->fsd_last_rcvd);
         filter->fo_fsd->fsd_last_rcvd = cpu_to_le64(last_rcvd + 1);
-#ifndef FILTER_TRANSNO_SEM
         spin_unlock(&filter->fo_translock);
-#endif
         if (oti)
                 oti->oti_transno = last_rcvd;
         fcd->fcd_last_rcvd = cpu_to_le64(last_rcvd);
@@ -285,15 +322,12 @@ int filter_finish_transno(struct obd_export *export, void *handle,
 #else
         fcd->fcd_last_xid = 0;
 #endif
-        fsfilt_set_last_rcvd(obd, last_rcvd, handle, filter_last_rcvd_cb);
+        fsfilt_set_last_rcvd(obd, last_rcvd, handle, filter_commit_cb);
         written = lustre_fwrite(filter->fo_rcvd_filp, (char *)fcd, sizeof(*fcd),
                                 &off);
         CDEBUG(D_INODE, "wrote trans #"LPD64" for client %s at #%d: written = "
                LPSZ"\n", last_rcvd, fcd->fcd_uuid, fed->fed_lr_idx, written);
 
-#ifdef FILTER_TRANSNO_SEM
-        up(&filter->fo_transno_sem);
-#endif
         if (written == sizeof(*fcd))
                 RETURN(0);
         CERROR("error writing to last_rcvd file: rc = %d\n", (int)written);
@@ -305,9 +339,9 @@ int filter_finish_transno(struct obd_export *export, void *handle,
 
 /* write the pathname into the string */
 static char *filter_id(char *buf, struct filter_obd *filter, obd_id id,
-                     obd_mode mode)
+                       obd_mode mode)
 {
-        if ((mode & S_IFMT) != S_IFREG || filter->fo_subdir_count == 0)
+        if (!S_ISREG(mode) || filter->fo_subdir_count == 0)
                 sprintf(buf, "O/%s/"LPU64, obd_mode_to_type(mode), id);
         else
                 sprintf(buf, "O/%s/d%d/"LPU64, obd_mode_to_type(mode),
@@ -330,7 +364,7 @@ static inline void f_dput(struct dentry *dentry)
 static void filter_drelease(struct dentry *dentry)
 {
         if (dentry->d_fsdata)
-                kmem_cache_free(filter_dentry_cache, dentry->d_fsdata);
+                OBD_FREE(dentry->d_fsdata, sizeof(struct filter_dentry_data));
 }
 
 struct dentry_operations filter_dops = {
@@ -349,34 +383,38 @@ struct dentry_operations filter_dops = {
  * Otherwise, we have just read the data from the last_rcvd file and
  * we know its offset.
  */
-int filter_client_add(struct filter_obd *filter,
+int filter_client_add(struct obd_device *obd, struct filter_obd *filter,
                       struct filter_export_data *fed, int cl_idx)
 {
+        unsigned long *bitmap = filter->fo_last_rcvd_slots;
         int new_client = (cl_idx == -1);
 
-        LASSERT(filter->fo_last_rcvd_slots != NULL);
+        LASSERT(bitmap != NULL);
+
+        /* XXX if mcd_uuid were a real obd_uuid, I could use obd_uuid_equals */
+        if (!strcmp(fed->fed_fcd->fcd_uuid, "OBD_CLASS_UUID"))
+                RETURN(0);
 
         /* the bitmap operations can handle cl_idx > sizeof(long) * 8, so
          * there's no need for extra complication here
          */
         if (new_client) {
-                cl_idx = find_first_zero_bit(filter->fo_last_rcvd_slots,
-                                             FILTER_LR_MAX_CLIENTS);
+                cl_idx = find_first_zero_bit(bitmap, FILTER_LR_MAX_CLIENTS);
         repeat:
                 if (cl_idx >= FILTER_LR_MAX_CLIENTS) {
                         CERROR("no client slots - fix FILTER_LR_MAX_CLIENTS\n");
                         return -ENOMEM;
                 }
-                if (test_and_set_bit(cl_idx, filter->fo_last_rcvd_slots)) {
+                if (test_and_set_bit(cl_idx, bitmap)) {
                         CERROR("FILTER client %d: found bit is set in bitmap\n",
                                cl_idx);
-                        cl_idx = find_next_zero_bit(filter->fo_last_rcvd_slots,
+                        cl_idx = find_next_zero_bit(bitmap,
                                                     FILTER_LR_MAX_CLIENTS,
                                                     cl_idx);
                         goto repeat;
                 }
         } else {
-                if (test_and_set_bit(cl_idx, filter->fo_last_rcvd_slots)) {
+                if (test_and_set_bit(cl_idx, bitmap)) {
                         CERROR("FILTER client %d: bit already set in bitmap!\n",
                                cl_idx);
                         LBUG();
@@ -394,14 +432,28 @@ int filter_client_add(struct filter_obd *filter,
                 struct obd_run_ctxt saved;
                 loff_t off = fed->fed_lr_off;
                 ssize_t written;
+                void *handle;
 
                 CDEBUG(D_INFO, "writing client fcd at idx %u (%llu) (len %u)\n",
                        fed->fed_lr_idx,off,(unsigned int)sizeof(*fed->fed_fcd));
 
                 push_ctxt(&saved, &filter->fo_ctxt, NULL);
-                written = lustre_fwrite(filter->fo_rcvd_filp,
+                /* Transaction needed to fix bug 1403 */
+                handle = fsfilt_start(obd,
+                                      filter->fo_rcvd_filp->f_dentry->d_inode,
+                                      FSFILT_OP_SETATTR);
+                if (IS_ERR(handle)) {
+                        written = PTR_ERR(handle);
+                        CERROR("unable to start transaction: rc %d\n",
+                               (int)written);
+                } else {
+                        written = lustre_fwrite(filter->fo_rcvd_filp,
                                                 (char *)fed->fed_fcd,
                                                 sizeof(*fed->fed_fcd), &off);
+                        fsfilt_commit(obd,
+                                      filter->fo_rcvd_filp->f_dentry->d_inode,
+                                      handle, 0);
+                }
                 pop_ctxt(&saved, &filter->fo_ctxt, NULL);
 
                 if (written != sizeof(*fed->fed_fcd)) {
@@ -413,7 +465,7 @@ int filter_client_add(struct filter_obd *filter,
         return 0;
 }
 
-int filter_client_free(struct obd_export *exp)
+int filter_client_free(struct obd_export *exp, int failover)
 {
         struct filter_export_data *fed = &exp->exp_filter_data;
         struct filter_obd *filter = &exp->exp_obd->u.filter;
@@ -425,6 +477,11 @@ int filter_client_free(struct obd_export *exp)
         if (!fed->fed_fcd)
                 RETURN(0);
 
+        if (failover != 0) {
+                OBD_FREE(fed->fed_fcd, sizeof(*fed->fed_fcd));
+                RETURN(0);
+        }
+
         LASSERT(filter->fo_last_rcvd_slots != NULL);
 
         off = fed->fed_lr_off;
@@ -444,7 +501,9 @@ int filter_client_free(struct obd_export *exp)
                                 sizeof(zero_fcd), &off);
 
         /* XXX: this write gets lost sometimes, unless this sync is here. */
-        file_fsync(filter->fo_rcvd_filp, filter->fo_rcvd_filp->f_dentry, 1);
+        if (written > 0)
+                file_fsync(filter->fo_rcvd_filp,
+                           filter->fo_rcvd_filp->f_dentry, 1);
         pop_ctxt(&saved, &filter->fo_ctxt, NULL);
 
         if (written != sizeof(zero_fcd)) {
@@ -522,7 +581,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp,
                 RETURN(-ENOMEM);
         filter->fo_fsd = fsd;
 
-        OBD_ALLOC(filter->fo_last_rcvd_slots, 
+        OBD_ALLOC(filter->fo_last_rcvd_slots,
                   FILTER_LR_MAX_CLIENT_WORDS * sizeof(unsigned long));
         if (filter->fo_last_rcvd_slots == NULL) {
                 OBD_FREE(fsd, sizeof(*fsd));
@@ -585,7 +644,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp,
          * the header.  If we find clients with higher last_rcvd values
          * then those clients may need recovery done.
          */
-        if (!(obd->obd_flags & OBD_REPLAYABLE)) {
+        if (!obd->obd_replayable) {
                 CERROR("%s: recovery support OFF\n", obd->obd_name);
                 GOTO(out, rc = 0);
         }
@@ -634,9 +693,8 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp,
                                LPU64"\n", fcd->fcd_uuid, cl_idx,
                                last_rcvd, le64_to_cpu(fsd->fsd_last_rcvd),
                                le64_to_cpu(fcd->fcd_mount_count), mount_count);
-                        /* disabled until OST recovery is actually working */
-
-                        if (!exp) {
+                        if (exp == NULL) {
+                                /* XXX this rc is ignored  */
                                 rc = -ENOMEM;
                                 break;
                         }
@@ -644,13 +702,14 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp,
                                sizeof exp->exp_client_uuid.uuid);
                         fed = &exp->exp_filter_data;
                         fed->fed_fcd = fcd;
-                        filter_client_add(filter, fed, cl_idx);
+                        filter_client_add(obd, filter, fed, cl_idx);
                         /* create helper if export init gets more complex */
                         INIT_LIST_HEAD(&fed->fed_open_head);
                         spin_lock_init(&fed->fed_lock);
 
                         fcd = NULL;
                         obd->obd_recoverable_clients++;
+                        class_export_put(exp);
                 } else {
                         CDEBUG(D_INFO,
                                "discarded client %d UUID '%s' count "LPU64"\n",
@@ -664,20 +723,22 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp,
                 if (last_rcvd > le64_to_cpu(filter->fo_fsd->fsd_last_rcvd))
                         filter->fo_fsd->fsd_last_rcvd = cpu_to_le64(last_rcvd);
 
-                obd->obd_last_committed = le64_to_cpu(filter->fo_fsd->fsd_last_rcvd);
+                obd->obd_last_committed =
+                        le64_to_cpu(filter->fo_fsd->fsd_last_rcvd);
                 if (obd->obd_recoverable_clients) {
                         CERROR("RECOVERY: %d recoverable clients, last_rcvd "
                                LPU64"\n", obd->obd_recoverable_clients,
                                le64_to_cpu(filter->fo_fsd->fsd_last_rcvd));
-                        obd->obd_next_recovery_transno = obd->obd_last_committed + 1;
-                        obd->obd_flags |= OBD_RECOVERING;
+                        obd->obd_next_recovery_transno =
+                                obd->obd_last_committed + 1;
+                        obd->obd_recovering = 1;
                 }
 
-                if (fcd)
-                        OBD_FREE(fcd, sizeof(*fcd));
-
         }
 
+        if (fcd)
+                OBD_FREE(fcd, sizeof(*fcd));
+
 out:
         fsd->fsd_mount_count = cpu_to_le64(mount_count + 1);
 
@@ -805,7 +866,7 @@ err_O_sub:
         OBD_FREE(filter->fo_dentry_O_sub,
                  filter->fo_subdir_count * sizeof(dentry));
 err_client:
-        class_disconnect_all(obd);
+        class_disconnect_exports(obd, 0);
 err_filp:
         if (filp_close(file, 0))
                 CERROR("can't close %s after error\n", LAST_RCVD);
@@ -932,78 +993,174 @@ static struct dentry *filter_fid2dentry(struct obd_device *obd,
         RETURN(dchild);
 }
 
+/* direct cut-n-paste of mds_blocking_ast() */
+int filter_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
+                     void *data, int flag)
+{
+        int do_ast;
+        ENTRY;
+
+        if (flag == LDLM_CB_CANCELING) {
+                /* Don't need to do anything here. */
+                RETURN(0);
+        }
+
+        /* XXX layering violation!  -phil */
+        l_lock(&lock->l_resource->lr_namespace->ns_lock);
+        /* Get this: if filter_blocking_ast is racing with ldlm_intent_policy,
+         * such that mds_blocking_ast is called just before l_i_p takes the
+         * ns_lock, then by the time we get the lock, we might not be the
+         * correct blocking function anymore.  So check, and return early, if
+         * so. */
+        if (lock->l_blocking_ast != filter_blocking_ast) {
+                l_unlock(&lock->l_resource->lr_namespace->ns_lock);
+                RETURN(0);
+        }
+
+        lock->l_flags |= LDLM_FL_CBPENDING;
+        do_ast = (!lock->l_readers && !lock->l_writers);
+        l_unlock(&lock->l_resource->lr_namespace->ns_lock);
+
+        if (do_ast) {
+                struct lustre_handle lockh;
+                int rc;
+
+                LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel");
+                ldlm_lock2handle(lock, &lockh);
+                rc = ldlm_cli_cancel(&lockh);
+                if (rc < 0)
+                        CERROR("ldlm_cli_cancel: %d\n", rc);
+        } else {
+                LDLM_DEBUG(lock, "Lock still has references, will be "
+                           "cancelled later");
+        }
+        RETURN(0);
+}
+
+static int filter_lock_dentry(struct obd_device *obd, struct dentry *de,
+                              int lock_mode, struct lustre_handle *lockh)
+{
+        struct ldlm_res_id res_id = { .name = {0} };
+        int flags = 0, rc;
+        ENTRY;
+
+        res_id.name[0] = de->d_inode->i_ino;
+        res_id.name[1] = de->d_inode->i_generation;
+        rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL,
+                              res_id, LDLM_PLAIN, NULL, 0, lock_mode,
+                              &flags, ldlm_completion_ast,
+                              filter_blocking_ast, NULL, lockh);
+
+        RETURN(rc == ELDLM_OK ? 0 : -ENOLCK);  /* XXX translate ldlm code */
+}
+
 static inline struct dentry *filter_parent(struct obd_device *obd,
                                            obd_mode mode, obd_id objid)
 {
         struct filter_obd *filter = &obd->u.filter;
 
-        LASSERT((mode & S_IFMT) == S_IFREG);   /* only regular files for now */
-        if ((mode & S_IFMT) != S_IFREG || filter->fo_subdir_count == 0)
+        LASSERT(S_ISREG(mode));   /* only regular files for now */
+        if (!S_ISREG(mode) || filter->fo_subdir_count == 0)
                 return filter->fo_dentry_O_mode[(mode & S_IFMT) >> S_SHIFT];
 
         return filter->fo_dentry_O_sub[objid & (filter->fo_subdir_count - 1)];
 }
 
+static inline struct dentry *filter_parent_lock(struct obd_device *obd,
+                                                obd_mode mode, obd_id objid,
+                                                int lock_mode,
+                                                struct lustre_handle *lockh)
+{
+        struct dentry *de = filter_parent(obd, mode, objid);
+        int rc;
+
+        if (IS_ERR(de))
+                return de;
+
+        rc = filter_lock_dentry(obd, de, lock_mode, lockh);
+        return rc ? ERR_PTR(rc) : de;
+}
+
 static struct file *filter_obj_open(struct obd_export *export,
-                                    __u64 id, __u32 type)
+                                    __u64 id, __u32 type, int parent_mode,
+                                    struct lustre_handle *parent_lockh)
 {
-        struct filter_obd *filter = &export->exp_obd->u.filter;
+        struct obd_device *obd = export->exp_obd;
+        struct filter_obd *filter = &obd->u.filter;
         struct super_block *sb = filter->fo_sb;
-        struct dentry *dentry;
+        struct dentry *dchild = NULL,  *parent;
         struct filter_export_data *fed = &export->exp_filter_data;
-        struct filter_dentry_data *fdd;
-        struct filter_file_data *ffd;
+        struct filter_dentry_data *fdd = NULL;
+        struct filter_file_data *ffd = NULL;
         struct obd_run_ctxt saved;
         char name[24];
         struct file *file;
+        int len, cleanup_phase = 0;
         ENTRY;
 
+        push_ctxt(&saved, &filter->fo_ctxt, NULL);
+
         if (!sb || !sb->s_dev) {
                 CERROR("fatal: device not initialized.\n");
-                RETURN(ERR_PTR(-ENXIO));
+                GOTO(cleanup, file = ERR_PTR(-ENXIO));
         }
 
         if (!id) {
                 CERROR("fatal: invalid obdo "LPU64"\n", id);
-                RETURN(ERR_PTR(-ESTALE));
+                GOTO(cleanup, file = ERR_PTR(-ESTALE));
         }
 
         if (!(type & S_IFMT)) {
                 CERROR("OBD %s, object "LPU64" has bad type: %o\n",
                        __FUNCTION__, id, type);
-                RETURN(ERR_PTR(-EINVAL));
+                GOTO(cleanup, file = ERR_PTR(-EINVAL));
         }
 
-        PORTAL_SLAB_ALLOC(ffd, filter_open_cache, sizeof(*ffd));
-        if (!ffd) {
+        ffd = filter_ffd_new();
+        if (ffd == NULL) {
                 CERROR("obdfilter: out of memory\n");
-                RETURN(ERR_PTR(-ENOMEM));
+                GOTO(cleanup, file = ERR_PTR(-ENOMEM));
         }
 
+        cleanup_phase = 1;
+
         /* We preallocate this to avoid blocking while holding fo_fddlock */
-        fdd = kmem_cache_alloc(filter_dentry_cache, SLAB_KERNEL);
-        if (!fdd) {
+        OBD_ALLOC(fdd, sizeof *fdd);
+        if (fdd == NULL) {
                 CERROR("obdfilter: out of memory\n");
-                GOTO(out_ffd, file = ERR_PTR(-ENOMEM));
+                GOTO(cleanup, file = ERR_PTR(-ENOMEM));
         }
 
-        push_ctxt(&saved, &filter->fo_ctxt, NULL);
-        file = filp_open(filter_id(name, filter, id, type),
-                         O_RDWR | O_LARGEFILE, type);
-        pop_ctxt(&saved, &filter->fo_ctxt, NULL);
+        cleanup_phase = 2;
+
+        parent = filter_parent_lock(obd, type, id, parent_mode, parent_lockh);
+        if (IS_ERR(parent))
+                GOTO(cleanup, file = (void *)parent);
+
+        cleanup_phase = 3;
+
+        len = snprintf(name, sizeof(name), LPU64, id);
+        dchild = lookup_one_len(name, parent, len);
+        if (IS_ERR(dchild))
+                GOTO(cleanup, file = (void *)dchild);
+        LASSERT(dchild->d_inode);
 
+        cleanup_phase = 4;
+
+        /* dentry_open does a dput(dchild) and mntput(fo_vfsmnt) on error */
+        mntget(filter->fo_vfsmnt);
+        file = dentry_open(dchild, filter->fo_vfsmnt, O_RDWR | O_LARGEFILE);
         if (IS_ERR(file)) {
+                dchild = NULL; /* prevent a double dput in step 4 */
                 CERROR("error opening %s: rc %ld\n", name, PTR_ERR(file));
-                GOTO(out_fdd, file);
+                GOTO(cleanup, file);
         }
 
-        dentry = file->f_dentry;
         spin_lock(&filter->fo_fddlock);
-        if (dentry->d_fsdata) {
+        if (dchild->d_fsdata) {
                 spin_unlock(&filter->fo_fddlock);
-                kmem_cache_free(filter_dentry_cache, fdd);
-                fdd = dentry->d_fsdata;
-                LASSERT(kmem_cache_validate(filter_dentry_cache, fdd));
+                OBD_FREE(fdd, sizeof *fdd);
+                fdd = dchild->d_fsdata;
                 /* should only happen during client recovery */
                 if (fdd->fdd_flags & FILTER_FLAG_DESTROY)
                         CDEBUG(D_INODE,"opening destroyed object "LPU64"\n",id);
@@ -1013,35 +1170,43 @@ static struct file *filter_obj_open(struct obd_export *export,
                 fdd->fdd_flags = 0;
                 fdd->fdd_objid = id;
                 /* If this is racy, then we can use {cmp}xchg and atomic_add */
-                dentry->d_fsdata = fdd;
+                dchild->d_fsdata = fdd;
                 spin_unlock(&filter->fo_fddlock);
         }
 
-        get_random_bytes(&ffd->ffd_servercookie, sizeof(ffd->ffd_servercookie));
         ffd->ffd_file = file;
         LASSERT(file->private_data == NULL);
         file->private_data = ffd;
 
-        if (!dentry->d_op)
-                dentry->d_op = &filter_dops;
+        if (!dchild->d_op)
+                dchild->d_op = &filter_dops;
         else
-                LASSERT(dentry->d_op == &filter_dops);
+                LASSERT(dchild->d_op == &filter_dops);
 
         spin_lock(&fed->fed_lock);
         list_add(&ffd->ffd_export_list, &fed->fed_open_head);
         spin_unlock(&fed->fed_lock);
 
         CDEBUG(D_INODE, "opened objid "LPU64": rc = %p\n", id, file);
-        EXIT;
-out:
-        return file;
-
-out_fdd:
-        kmem_cache_free(filter_dentry_cache, fdd);
-out_ffd:
-        ffd->ffd_servercookie = DEAD_HANDLE_MAGIC;
-        PORTAL_SLAB_FREE(ffd, filter_open_cache, sizeof(*ffd));
-        goto out;
+cleanup:
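+        /* each case falls through: on error (IS_ERR(file)) every phase
+         * undoes its setup step; on success only the extra ffd reference
+         * is dropped and the saved ctxt popped */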
+        switch (cleanup_phase) {
+        case 4:
+                if (IS_ERR(file))
+                        l_dput(dchild);
+        case 3:
+                if (IS_ERR(file))
+                        ldlm_lock_decref(parent_lockh, parent_mode);
+        case 2:
+                if (IS_ERR(file))
+                        OBD_FREE(fdd, sizeof *fdd);
+        case 1:
+                if (IS_ERR(file))
+                        filter_ffd_destroy(ffd);
+                filter_ffd_put(ffd);
+        case 0:
+                pop_ctxt(&saved, &filter->fo_ctxt, NULL);
+        }
+        RETURN(file);
 }
 
 /* Caller must hold i_sem on dir_dentry->d_inode */
@@ -1071,16 +1236,23 @@ static int filter_destroy_internal(struct obd_device *obd,
         RETURN(rc);
 }
 
+/* If closing because we are failing this device, then don't do the unlink
+ * on close.
+ */
 static int filter_close_internal(struct obd_export *export,
                                  struct filter_file_data *ffd,
-                                 struct obd_trans_info *oti)
+                                 struct obd_trans_info *oti,
+                                 int failover)
 {
         struct obd_device *obd = export->exp_obd;
         struct filter_obd *filter = &obd->u.filter;
         struct file *filp = ffd->ffd_file;
         struct dentry *object_dentry = dget(filp->f_dentry);
         struct filter_dentry_data *fdd = object_dentry->d_fsdata;
-        int rc, rc2;
+        struct lustre_handle parent_lockh;
+        int rc, rc2, cleanup_phase = 0;
+        struct dentry *dir_dentry;
+        struct obd_run_ctxt saved;
         ENTRY;
 
         LASSERT(filp->private_data == ffd);
@@ -1089,39 +1261,56 @@ static int filter_close_internal(struct obd_export *export,
         rc = filp_close(filp, 0);
 
         if (atomic_dec_and_test(&fdd->fdd_open_count) &&
-            fdd->fdd_flags & FILTER_FLAG_DESTROY) {
-                struct dentry *dir_dentry = filter_parent(obd, S_IFREG, fdd->fdd_objid);
-                struct obd_run_ctxt saved;
+            fdd->fdd_flags & FILTER_FLAG_DESTROY && !failover) {
                 void *handle;
 
-                down(&dir_dentry->d_inode->i_sem);
                 push_ctxt(&saved, &filter->fo_ctxt, NULL);
-                filter_start_transno(export);
+                cleanup_phase = 1;
+
+                dir_dentry = filter_parent_lock(obd, S_IFREG, fdd->fdd_objid,
+                                                LCK_PW, &parent_lockh);
+                if (IS_ERR(dir_dentry))
+                        GOTO(cleanup, rc = PTR_ERR(dir_dentry));
+                cleanup_phase = 2;
+
                 handle = fsfilt_start(obd, dir_dentry->d_inode,
                                       FSFILT_OP_UNLINK);
-                if (IS_ERR(handle)) {
-                        rc = filter_finish_transno(export, handle, oti,
-                                                   PTR_ERR(handle));
-                        GOTO(out, rc);
-                }
+                if (IS_ERR(handle))
+                        GOTO(cleanup, rc = PTR_ERR(handle));
+
                 /* XXX unlink from PENDING directory now too */
                 rc2 = filter_destroy_internal(obd, dir_dentry, object_dentry);
                 if (rc2 && !rc)
                         rc = rc2;
                 rc = filter_finish_transno(export, handle, oti, rc);
-                rc2 = fsfilt_commit(obd, dir_dentry->d_inode, handle);
+                rc2 = fsfilt_commit(obd, dir_dentry->d_inode, handle, 0);
                 if (rc2) {
                         CERROR("error on commit, err = %d\n", rc2);
                         if (!rc)
                                 rc = rc2;
                 }
-        out:
-                pop_ctxt(&saved, &filter->fo_ctxt, NULL);
-                up(&dir_dentry->d_inode->i_sem);
         }
 
-        f_dput(object_dentry);
-        PORTAL_SLAB_FREE(ffd, filter_open_cache, sizeof(*ffd));
+cleanup:
+        switch(cleanup_phase) {
+        case 2:
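+                /* on error drop the parent lock now; otherwise stash it in
+                 * oti_ack_locks so it is released later, presumably once
+                 * the client has acked the reply */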
+                if (rc || oti == NULL) {
+                        ldlm_lock_decref(&parent_lockh, LCK_PW);
+                } else {
+                        memcpy(&oti->oti_ack_locks[0].lock, &parent_lockh,
+                               sizeof(parent_lockh));
+                        oti->oti_ack_locks[0].mode = LCK_PW;
+                }
+        case 1:
+                pop_ctxt(&saved, &filter->fo_ctxt, NULL);
+        case 0:
+                f_dput(object_dentry);
+                filter_ffd_destroy(ffd);
+                break;
+        default:
+                CERROR("invalid cleanup_phase %d\n", cleanup_phase);
+                LBUG();
+        }
 
         RETURN(rc);
 }
@@ -1149,9 +1338,17 @@ static int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
         if (IS_ERR(mnt))
                 GOTO(err_ops, rc);
 
-#if OST_RECOVERY
-        obd->obd_flags |= OBD_REPLAYABLE;
-#endif
+        if (data->ioc_inllen3 > 0 && data->ioc_inlbuf3) {
+                if (*data->ioc_inlbuf3 == 'f') {
+                        obd->obd_replayable = 1;
+                        obd_sync_filter = 1;
+                        CERROR("%s: configured for recovery and sync write\n",
+                               obd->obd_name);
+                } else {
+                        CERROR("unrecognised flag '%c'\n",
+                               *data->ioc_inlbuf3);
+                }
+        }
 
         filter = &obd->u.filter;
         filter->fo_vfsmnt = mnt;
@@ -1168,11 +1365,7 @@ static int filter_common_setup(struct obd_device *obd, obd_count len, void *buf,
         if (rc)
                 GOTO(err_kfree, rc);
 
-#ifdef FILTER_TRANSNO_SEM
-        init_MUTEX(&filter->fo_transno_sem);
-#else
         spin_lock_init(&filter->fo_translock);
-#endif
         spin_lock_init(&filter->fo_fddlock);
         spin_lock_init(&filter->fo_objidlock);
         INIT_LIST_HEAD(&filter->fo_export_list);
@@ -1202,7 +1395,13 @@ err_ops:
 
 static int filter_setup(struct obd_device *obd, obd_count len, void *buf)
 {
-        return filter_common_setup(obd, len, buf, NULL);
+        struct obd_ioctl_data *data = buf;
+        char *option = NULL;
+
+        if (!strcmp(data->ioc_inlbuf2, "ext3"))
+                option = "asyncdel";
+
+        return filter_common_setup(obd, len, buf, option);
 }
 
 /* sanobd setup methods - use a specific mount option */
@@ -1215,23 +1414,28 @@ static int filter_san_setup(struct obd_device *obd, obd_count len, void *buf)
                 RETURN(-EINVAL);
 
         /* for extN/ext3 filesystem, we must mount it with 'writeback' mode */
-        if (!strcmp(data->ioc_inlbuf2, "extN") ||
-            !strcmp(data->ioc_inlbuf2, "ext3"))
+        if (!strcmp(data->ioc_inlbuf2, "extN"))
                 option = "data=writeback";
+        else if (!strcmp(data->ioc_inlbuf2, "ext3"))
+                option = "data=writeback,asyncdel";
         else
                 LBUG(); /* just a reminder */
 
         return filter_common_setup(obd, len, buf, option);
 }
 
-static int filter_cleanup(struct obd_device *obd)
+static int filter_cleanup(struct obd_device *obd, int force, int failover)
 {
         struct super_block *sb;
         ENTRY;
 
+        if (failover)
+                CERROR("%s: shutting down for failover; client state will"
+                       " be preserved.\n", obd->obd_name);
+
         if (!list_empty(&obd->obd_exports)) {
-                CERROR("still has clients!\n");
-                class_disconnect_all(obd);
+                CERROR("%s: still has clients!\n", obd->obd_name);
+                class_disconnect_exports(obd, failover);
                 if (!list_empty(&obd->obd_exports)) {
                         CERROR("still has exports after forced cleanup?\n");
                         RETURN(-EBUSY);
@@ -1248,8 +1452,16 @@ static int filter_cleanup(struct obd_device *obd)
 
         shrink_dcache_parent(sb->s_root);
         unlock_kernel();
+
+        if (atomic_read(&obd->u.filter.fo_vfsmnt->mnt_count) > 1) {
+                CERROR("%s: mount point busy, mnt_count: %d\n", obd->obd_name,
+                       atomic_read(&obd->u.filter.fo_vfsmnt->mnt_count));
+        }
+
         mntput(obd->u.filter.fo_vfsmnt);
         obd->u.filter.fo_sb = 0;
+/*        destroy_buffers(obd->u.filter.fo_sb->s_dev);*/
+
         kfree(obd->u.filter.fo_fstype);
         fsfilt_put_ops(obd->obd_fsops);
 
@@ -1261,20 +1473,43 @@ static int filter_cleanup(struct obd_device *obd)
 int filter_attach(struct obd_device *dev, obd_count len, void *data)
 {
         struct lprocfs_static_vars lvars;
+        struct lprocfs_counters *cntrs;
+        int rc;
 
         lprocfs_init_vars(&lvars);
-        return lprocfs_obd_attach(dev, lvars.obd_vars);
+        rc = lprocfs_obd_attach(dev, lvars.obd_vars);
+        if (rc != 0)
+                return rc;
+
+        rc = lprocfs_alloc_obd_counters(dev, LPROC_FILTER_LAST);
+        if (rc != 0)
+                return rc;
+
+        /* Init obdfilter private counters here */
+        cntrs = dev->counters;
+        LPROCFS_COUNTER_INIT(&cntrs->cntr[LPROC_FILTER_READS],
+                             0, NULL, "read", "reqs");
+        LPROCFS_COUNTER_INIT(&cntrs->cntr[LPROC_FILTER_READ_BYTES],
+                             LPROCFS_CNTR_AVGMINMAX,
+                             NULL, "read_bytes", "bytes");
+        LPROCFS_COUNTER_INIT(&cntrs->cntr[LPROC_FILTER_WRITES],
+                             0, NULL, "write", "reqs");
+
+        LPROCFS_COUNTER_INIT(&cntrs->cntr[LPROC_FILTER_WRITE_BYTES],
+                             LPROCFS_CNTR_AVGMINMAX,
+                             NULL, "write_bytes", "bytes");
+        return rc;
 }
 
 int filter_detach(struct obd_device *dev)
 {
+        lprocfs_free_obd_counters(dev);
         return lprocfs_obd_detach(dev);
 }
 
 /* nearly identical to mds_connect */
 static int filter_connect(struct lustre_handle *conn, struct obd_device *obd,
-                          struct obd_uuid *cluuid, struct recovd_obd *recovd,
-                          ptlrpc_recovery_cb_t recover)
+                          struct obd_uuid *cluuid)
 {
         struct obd_export *exp;
         struct filter_export_data *fed;
@@ -1294,11 +1529,12 @@ static int filter_connect(struct lustre_handle *conn, struct obd_device *obd,
         LASSERT(exp);
 
         fed = &exp->exp_filter_data;
+        class_export_put(exp);
 
         INIT_LIST_HEAD(&exp->exp_filter_data.fed_open_head);
         spin_lock_init(&exp->exp_filter_data.fed_lock);
 
-        if (!(obd->obd_flags & OBD_REPLAYABLE))
+        if (!obd->obd_replayable)
                 RETURN(0);
 
         OBD_ALLOC(fcd, sizeof(*fcd));
@@ -1311,7 +1547,7 @@ static int filter_connect(struct lustre_handle *conn, struct obd_device *obd,
         fed->fed_fcd = fcd;
         fcd->fcd_mount_count = cpu_to_le64(filter->fo_fsd->fsd_mount_count);
 
-        rc = filter_client_add(filter, fed, -1);
+        rc = filter_client_add(obd, filter, fed, -1);
         if (rc)
                 GOTO(out_fcd, rc);
 
@@ -1320,21 +1556,16 @@ static int filter_connect(struct lustre_handle *conn, struct obd_device *obd,
 out_fcd:
         OBD_FREE(fcd, sizeof(*fcd));
 out_export:
-        class_disconnect(conn);
+        class_disconnect(conn, 0);
 
         RETURN(rc);
 }
 
-/* also incredibly similar to mds_disconnect */
-static int filter_disconnect(struct lustre_handle *conn)
+static void filter_destroy_export(struct obd_export *exp)
 {
-        struct obd_export *exp = class_conn2export(conn);
-        struct filter_export_data *fed;
-        int rc;
-        ENTRY;
+        struct filter_export_data *fed = &exp->exp_filter_data;
 
-        LASSERT(exp);
-        fed = &exp->exp_filter_data;
+        ENTRY;
         spin_lock(&fed->fed_lock);
         while (!list_empty(&fed->fed_open_head)) {
                 struct filter_file_data *ffd;
@@ -1347,20 +1578,37 @@ static int filter_disconnect(struct lustre_handle *conn)
                 CERROR("force close file %*s (hdl %p:"LPX64") on disconnect\n",
                        ffd->ffd_file->f_dentry->d_name.len,
                        ffd->ffd_file->f_dentry->d_name.name,
-                       ffd, ffd->ffd_servercookie);
+                       ffd, ffd->ffd_handle.h_cookie);
 
-                filter_close_internal(exp, ffd, NULL);
+                filter_close_internal(exp, ffd, NULL, exp->exp_failover);
                 spin_lock(&fed->fed_lock);
         }
         spin_unlock(&fed->fed_lock);
 
+        if (exp->exp_obd->obd_replayable)
+                filter_client_free(exp, exp->exp_failover);
+        EXIT;
+}
+
+/* also incredibly similar to mds_disconnect */
+static int filter_disconnect(struct lustre_handle *conn, int failover)
+{
+        struct obd_export *exp = class_conn2export(conn);
+        int rc;
+        unsigned long flags;
+        ENTRY;
+
+        LASSERT(exp);
         ldlm_cancel_locks_for_export(exp);
 
-        if (exp->exp_obd->obd_flags & OBD_REPLAYABLE)
-                filter_client_free(exp);
+        spin_lock_irqsave(&exp->exp_lock, flags);
+        exp->exp_failover = failover;
+        spin_unlock_irqrestore(&exp->exp_lock, flags);
 
-        rc = class_disconnect(conn);
+        rc = class_disconnect(conn, failover);
 
+        fsfilt_sync(exp->exp_obd, exp->exp_obd->u.filter.fo_sb);
+        class_export_put(exp);
         /* XXX cleanup preallocated inodes */
         RETURN(rc);
 }
@@ -1386,25 +1634,6 @@ static void filter_from_inode(struct obdo *oa, struct inode *inode, int valid)
         EXIT;
 }
 
-static struct filter_file_data *filter_handle2ffd(struct lustre_handle *handle)
-{
-        struct filter_file_data *ffd = NULL;
-        ENTRY;
-
-        if (!handle || !handle->addr)
-                RETURN(NULL);
-
-        ffd = (struct filter_file_data *)(unsigned long)(handle->addr);
-        if (!kmem_cache_validate(filter_open_cache, (void *)ffd))
-                RETURN(NULL);
-
-        if (ffd->ffd_servercookie != handle->cookie)
-                RETURN(NULL);
-
-        LASSERT(ffd->ffd_file->private_data == ffd);
-        RETURN(ffd);
-}
-
 static struct dentry *__filter_oa2dentry(struct lustre_handle *conn,
                                          struct obdo *oa, int locked,char *what)
 {
@@ -1414,14 +1643,16 @@ static struct dentry *__filter_oa2dentry(struct lustre_handle *conn,
                 struct lustre_handle *ost_handle = obdo_handle(oa);
                 struct filter_file_data *ffd = filter_handle2ffd(ost_handle);
 
-                if (ffd)
+                if (ffd != NULL) {
                         dentry = dget(ffd->ffd_file->f_dentry);
+                        filter_ffd_put(ffd);
+                }
         }
 
         if (!dentry) {
                 struct obd_device *obd = class_conn2obd(conn);
                 if (!obd) {
-                        CERROR("invalid client "LPX64"\n", conn->addr);
+                        CERROR("invalid client cookie "LPX64"\n", conn->cookie);
                         RETURN(ERR_PTR(-EINVAL));
                 }
                 dentry = filter_fid2dentry(obd, filter_parent(obd, oa->o_mode,
@@ -1437,7 +1668,6 @@ static struct dentry *__filter_oa2dentry(struct lustre_handle *conn,
         if (!dentry->d_inode) {
                 CERROR("%s on non-existent object: "LPU64"\n", what, oa->o_id);
                 f_dput(dentry);
-                LBUG();
                 RETURN(ERR_PTR(-ENOENT));
         }
 
@@ -1486,7 +1716,7 @@ static int filter_setattr(struct lustre_handle *conn, struct obdo *oa,
         dentry = filter_oa2dentry(conn, oa, 0);
 
         if (IS_ERR(dentry))
-                RETURN(PTR_ERR(dentry));
+                GOTO(out_exp, rc = PTR_ERR(dentry));
 
         iattr_from_obdo(&iattr, oa, oa->o_valid);
         iattr.ia_mode = (iattr.ia_mode & ~S_IFMT) | S_IFREG;
@@ -1497,19 +1727,16 @@ static int filter_setattr(struct lustre_handle *conn, struct obdo *oa,
         if (iattr.ia_valid & ATTR_SIZE)
                 down(&inode->i_sem);
 
-        filter_start_transno(export);
         handle = fsfilt_start(obd, dentry->d_inode, FSFILT_OP_SETATTR);
-        if (IS_ERR(handle)) {
-                rc = filter_finish_transno(export, handle, oti,PTR_ERR(handle));
-                GOTO(out_unlock, rc);
-        }
+        if (IS_ERR(handle))
+                GOTO(out_unlock, rc = PTR_ERR(handle));
 
         if (inode->i_op->setattr)
                 rc = inode->i_op->setattr(dentry, &iattr);
         else
                 rc = inode_setattr(inode, &iattr);
         rc = filter_finish_transno(export, handle, oti, rc);
-        rc2 = fsfilt_commit(obd, dentry->d_inode, handle);
+        rc2 = fsfilt_commit(obd, dentry->d_inode, handle, 0);
         if (rc2) {
                 CERROR("error on commit, err = %d\n", rc2);
                 if (!rc)
@@ -1527,28 +1754,34 @@ out_unlock:
         pop_ctxt(&saved, &filter->fo_ctxt, NULL);
 
         f_dput(dentry);
+ out_exp:
+        class_export_put(export);
         RETURN(rc);
 }
 
 static int filter_open(struct lustre_handle *conn, struct obdo *oa,
-                       struct lov_stripe_md *ea, struct obd_trans_info *oti)
+                       struct lov_stripe_md *ea, struct obd_trans_info *oti,
+                       struct obd_client_handle *och)
 {
         struct obd_export *export;
         struct lustre_handle *handle;
         struct filter_file_data *ffd;
         struct file *filp;
+        struct lustre_handle parent_lockh;
         int rc = 0;
         ENTRY;
 
         export = class_conn2export(conn);
         if (!export) {
-                CDEBUG(D_IOCTL, "fatal: invalid client "LPX64"\n", conn->addr);
-                RETURN(-EINVAL);
+                CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n",
+                       conn->cookie);
+                GOTO(out, rc = -EINVAL);
         }
 
         XPROCFS_BUMP_MYCPU_IOSTAT (st_open_reqs, 1);
 
-        filp = filter_obj_open(export, oa->o_id, oa->o_mode);
+        filp = filter_obj_open(export, oa->o_id, oa->o_mode,
+                               LCK_PR, &parent_lockh);
         if (IS_ERR(filp))
                 GOTO(out, rc = PTR_ERR(filp));
 
@@ -1556,42 +1789,45 @@ static int filter_open(struct lustre_handle *conn, struct obdo *oa,
 
         ffd = filp->private_data;
         handle = obdo_handle(oa);
-        handle->addr = (__u64)(unsigned long)ffd;
-        handle->cookie = ffd->ffd_servercookie;
+        handle->cookie = ffd->ffd_handle.h_cookie;
         oa->o_valid |= OBD_MD_FLHANDLE;
-        EXIT;
+
 out:
-        return rc;
-} /* filter_open */
+        class_export_put(export);
+        if (!rc) {
+                memcpy(&oti->oti_ack_locks[0].lock, &parent_lockh,
+                       sizeof(parent_lockh));
+                oti->oti_ack_locks[0].mode = LCK_PR;
+        }
+        RETURN(rc);
+}
 
 static int filter_close(struct lustre_handle *conn, struct obdo *oa,
                         struct lov_stripe_md *ea, struct obd_trans_info *oti)
 {
-        struct obd_export *exp;
+        struct obd_export *exp = class_conn2export(conn);
         struct filter_file_data *ffd;
         struct filter_export_data *fed;
         int rc;
         ENTRY;
 
-        exp = class_conn2export(conn);
         if (!exp) {
-                CDEBUG(D_IOCTL, "fatal: invalid client "LPX64"\n", conn->addr);
-                RETURN(-EINVAL);
+                CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n", conn->cookie);
+                GOTO(out, rc = -EINVAL);
         }
 
         XPROCFS_BUMP_MYCPU_IOSTAT (st_close_reqs, 1);
 
         if (!(oa->o_valid & OBD_MD_FLHANDLE)) {
                 CERROR("no handle for close of objid "LPU64"\n", oa->o_id);
-                RETURN(-EINVAL);
+                GOTO(out, rc = -EINVAL);
         }
 
         ffd = filter_handle2ffd(obdo_handle(oa));
-        if (!ffd) {
-                struct lustre_handle *handle = obdo_handle(oa);
-                CERROR("bad handle ("LPX64") or cookie ("LPX64") for close\n",
-                       handle->addr, handle->cookie);
-                RETURN(-ESTALE);
+        if (ffd == NULL) {
+                CERROR("bad handle ("LPX64") for close\n",
+                       obdo_handle(oa)->cookie);
+                GOTO(out, rc = -ESTALE);
         }
 
         fed = &exp->exp_filter_data;
@@ -1599,57 +1835,69 @@ static int filter_close(struct lustre_handle *conn, struct obdo *oa,
         list_del(&ffd->ffd_export_list);
         spin_unlock(&fed->fed_lock);
 
-        rc = filter_close_internal(exp, ffd, oti);
-
-        RETURN(rc);
-} /* filter_close */
+        rc = filter_close_internal(exp, ffd, oti, 0);
+        filter_ffd_put(ffd);
+        GOTO(out, rc);
+ out:
+        class_export_put(exp);
+        return rc;
+}
 
 static int filter_create(struct lustre_handle *conn, struct obdo *oa,
                          struct lov_stripe_md **ea, struct obd_trans_info *oti)
 {
-        struct obd_export *export = class_conn2export(conn);
+        struct obd_export *export;
         struct obd_device *obd = class_conn2obd(conn);
         struct filter_obd *filter = &obd->u.filter;
         struct obd_run_ctxt saved;
         struct dentry *dir_dentry;
-        struct dentry *new;
+        struct lustre_handle parent_lockh;
+        struct dentry *new = NULL;
         struct iattr;
         void *handle;
-        int err, rc;
+        int err, rc, cleanup_phase;
         ENTRY;
 
         if (!obd) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
-                return -EINVAL;
+                CERROR("invalid client cookie "LPX64"\n", conn->cookie);
+                RETURN(-EINVAL);
         }
 
+        export = class_conn2export(conn);
         XPROCFS_BUMP_MYCPU_IOSTAT (st_create_reqs, 1);
 
         oa->o_id = filter_next_id(obd);
 
         push_ctxt(&saved, &filter->fo_ctxt, NULL);
-        dir_dentry = filter_parent(obd, S_IFREG, oa->o_id);
-        down(&dir_dentry->d_inode->i_sem);
+ retry:
+        cleanup_phase = 0;
+        dir_dentry = filter_parent_lock(obd, S_IFREG, oa->o_id, LCK_PW,
+                                        &parent_lockh);
+        if (IS_ERR(dir_dentry))
+                GOTO(cleanup, rc = PTR_ERR(dir_dentry));
+        cleanup_phase = 1;
+
         new = filter_fid2dentry(obd, dir_dentry, oa->o_id, 0);
         if (IS_ERR(new))
-                GOTO(out, rc = PTR_ERR(new));
-
+                GOTO(cleanup, rc = PTR_ERR(new));
         if (new->d_inode) {
                 char buf[32];
 
                 /* This would only happen if lastobjid was bad on disk */
-                CERROR("objid %s already exists\n",
-                       filter_id(buf, filter, oa->o_mode, oa->o_id));
-                LBUG();
-                GOTO(out, rc = -EEXIST);
+                CERROR("Serious error: objid %s already exists; is this "
+                       "filesystem corrupt?  I will try to work around it.\n",
+                       filter_id(buf, filter, oa->o_id, oa->o_mode));
+                f_dput(new);
+                ldlm_lock_decref(&parent_lockh, LCK_PW);
+                oa->o_id = filter_next_id(obd);
+                goto retry;
         }
 
-        filter_start_transno(export);
+        cleanup_phase = 2;
         handle = fsfilt_start(obd, dir_dentry->d_inode, FSFILT_OP_CREATE);
-        if (IS_ERR(handle)) {
-                rc = filter_finish_transno(export, handle, oti,PTR_ERR(handle));
-                GOTO(out_put, rc);
-        }
+        if (IS_ERR(handle))
+                GOTO(cleanup, rc = PTR_ERR(handle));
+
         rc = vfs_create(dir_dentry->d_inode, new, oa->o_mode);
         if (rc)
                 CERROR("create failed rc = %d\n", rc);
@@ -1661,7 +1909,7 @@ static int filter_create(struct lustre_handle *conn, struct obdo *oa,
                 if (!rc)
                         rc = err;
         }
-        err = fsfilt_commit(obd, dir_dentry->d_inode, handle);
+        err = fsfilt_commit(obd, dir_dentry->d_inode, handle, 0);
         if (err) {
                 CERROR("error on commit, err = %d\n", err);
                 if (!rc)
@@ -1669,7 +1917,7 @@ static int filter_create(struct lustre_handle *conn, struct obdo *oa,
         }
 
         if (rc)
-                GOTO(out_put, rc);
+                GOTO(cleanup, rc);
 
         /* Set flags for fields we have set in the inode struct */
         oa->o_valid = OBD_MD_FLID | OBD_MD_FLBLKSZ | OBD_MD_FLBLOCKS |
@@ -1677,50 +1925,70 @@ static int filter_create(struct lustre_handle *conn, struct obdo *oa,
         filter_from_inode(oa, new->d_inode, oa->o_valid);
 
         EXIT;
-out_put:
-        f_dput(new);
-out:
-        up(&dir_dentry->d_inode->i_sem);
-        pop_ctxt(&saved, &filter->fo_ctxt, NULL);
-        return rc;
+cleanup:
+        switch(cleanup_phase) {
+        case 2:
+                f_dput(new);
+        case 1: /* locked parent dentry */
+                if (rc || oti == NULL) {
+                        ldlm_lock_decref(&parent_lockh, LCK_PW);
+                } else {
+                        memcpy(&oti->oti_ack_locks[0].lock, &parent_lockh,
+                               sizeof(parent_lockh));
+                        oti->oti_ack_locks[0].mode = LCK_PW;
+                }
+        case 0:
+                pop_ctxt(&saved, &filter->fo_ctxt, NULL);
+                class_export_put(export);
+                break;
+        default:
+                CERROR("invalid cleanup_phase %d\n", cleanup_phase);
+                LBUG();
+        }
+
+        RETURN(rc);
 }
 
 static int filter_destroy(struct lustre_handle *conn, struct obdo *oa,
                           struct lov_stripe_md *ea, struct obd_trans_info *oti)
 {
-        struct obd_export *export = class_conn2export(conn);
+        struct obd_export *export;
         struct obd_device *obd = class_conn2obd(conn);
         struct filter_obd *filter = &obd->u.filter;
-        struct dentry *dir_dentry, *object_dentry;
+        struct dentry *dir_dentry, *object_dentry = NULL;
         struct filter_dentry_data *fdd;
         struct obd_run_ctxt saved;
-        void *handle;
-        int rc, rc2;
+        void *handle = NULL;
+        struct lustre_handle parent_lockh;
+        int rc, rc2, cleanup_phase = 0;
         ENTRY;
 
         if (!obd) {
-                CERROR("invalid client "LPX64"\n", conn->addr);
+                CERROR("invalid client cookie "LPX64"\n", conn->cookie);
                 RETURN(-EINVAL);
         }
 
+        export = class_conn2export(conn);
         XPROCFS_BUMP_MYCPU_IOSTAT (st_destroy_reqs, 1);
 
         CDEBUG(D_INODE, "destroying objid "LPU64"\n", oa->o_id);
 
-        dir_dentry = filter_parent(obd, oa->o_mode, oa->o_id);
-        down(&dir_dentry->d_inode->i_sem);
+        push_ctxt(&saved, &filter->fo_ctxt, NULL);
+        dir_dentry = filter_parent_lock(obd, oa->o_mode, oa->o_id,
+                                        LCK_PW, &parent_lockh);
+        if (IS_ERR(dir_dentry))
+                GOTO(cleanup, rc = PTR_ERR(dir_dentry));
+        cleanup_phase = 1;
 
         object_dentry = filter_oa2dentry(conn, oa, 0);
         if (IS_ERR(object_dentry))
-                GOTO(out, rc = -ENOENT);
+                GOTO(cleanup, rc = -ENOENT);
+        cleanup_phase = 2;
 
-        push_ctxt(&saved, &filter->fo_ctxt, NULL);
-        filter_start_transno(export);
         handle = fsfilt_start(obd, dir_dentry->d_inode, FSFILT_OP_UNLINK);
-        if (IS_ERR(handle)) {
-                rc = filter_finish_transno(export, handle, oti,PTR_ERR(handle));
-                GOTO(out_ctxt, rc);
-        }
+        if (IS_ERR(handle))
+                GOTO(cleanup, rc = PTR_ERR(handle));
+        cleanup_phase = 3;
 
         fdd = object_dentry->d_fsdata;
         if (fdd && atomic_read(&fdd->fdd_open_count)) {
@@ -1734,28 +2002,41 @@ static int filter_destroy(struct lustre_handle *conn, struct obdo *oa,
                         CDEBUG(D_INODE,
                                "repeat destroy of %dx open objid "LPU64"\n",
                                atomic_read(&fdd->fdd_open_count), oa->o_id);
-                GOTO(out_commit, rc = 0);
+                GOTO(cleanup, rc = 0);
         }
 
         rc = filter_destroy_internal(obd, dir_dentry, object_dentry);
 
-out_commit:
-        /* XXX save last_rcvd on disk */
-        rc = filter_finish_transno(export, handle, oti, rc);
-        rc2 = fsfilt_commit(obd, dir_dentry->d_inode, handle);
-        if (rc2) {
-                CERROR("error on commit, err = %d\n", rc2);
-                if (!rc)
-                        rc = rc2;
+cleanup:
+        switch(cleanup_phase) {
+        case 3:
+                rc = filter_finish_transno(export, handle, oti, rc);
+                rc2 = fsfilt_commit(obd, dir_dentry->d_inode, handle, 0);
+                if (rc2) {
+                        CERROR("error on commit, err = %d\n", rc2);
+                        if (!rc)
+                                rc = rc2;
+                }
+        case 2:
+                f_dput(object_dentry);
+        case 1:
+                if (rc || oti == NULL) {
+                        ldlm_lock_decref(&parent_lockh, LCK_PW);
+                } else {
+                        memcpy(&oti->oti_ack_locks[0].lock, &parent_lockh,
+                               sizeof(parent_lockh));
+                        oti->oti_ack_locks[0].mode = LCK_PW;
+                }
+        case 0:
+                pop_ctxt(&saved, &filter->fo_ctxt, NULL);
+                class_export_put(export);
+                break;
+        default:
+                CERROR("invalid cleanup_phase %d\n", cleanup_phase);
+                LBUG();
         }
-out_ctxt:
-        pop_ctxt(&saved, &filter->fo_ctxt, NULL);
-        f_dput(object_dentry);
 
-        EXIT;
-out:
-        up(&dir_dentry->d_inode->i_sem);
-        return rc;
+        RETURN(rc);
 }
 
 /* NB start and end are used for punch, but not truncate */
@@ -1770,7 +2051,8 @@ static int filter_truncate(struct lustre_handle *conn, struct obdo *oa,
         XPROCFS_BUMP_MYCPU_IOSTAT (st_punch_reqs, 1);
 
         if (end != OBD_OBJECT_EOF)
-                CERROR("PUNCH not supported, only truncate works\n");
+                CERROR("PUNCH not supported, only truncate: end = "LPX64"\n",
+                       end);
 
         CDEBUG(D_INODE, "calling truncate for object "LPU64", valid = %x, "
                "o_size = "LPD64"\n", oa->o_id, oa->o_valid, start);
@@ -1781,43 +2063,73 @@ static int filter_truncate(struct lustre_handle *conn, struct obdo *oa,
 
 static inline void lustre_put_page(struct page *page)
 {
-        kunmap(page);
         page_cache_release(page);
 }
 
-
-static struct page *
-lustre_get_page_read(struct inode *inode, struct niobuf_local *lnb)
+static int filter_start_page_read(struct inode *inode, struct niobuf_local *lnb)
 {
-        unsigned long index = lnb->offset >> PAGE_SHIFT;
         struct address_space *mapping = inode->i_mapping;
         struct page *page;
+        unsigned long index = lnb->offset >> PAGE_SHIFT;
         int rc;
 
-        page = read_cache_page(mapping, index,
-                               (filler_t*)mapping->a_ops->readpage, NULL);
-        if (!IS_ERR(page)) {
-                wait_on_page(page);
-                lnb->addr = kmap(page);
-                lnb->page = page;
-                if (!PageUptodate(page)) {
-                        CERROR("page index %lu not uptodate\n", index);
-                        GOTO(err_page, rc = -EIO);
-                }
-                if (PageError(page)) {
-                        CERROR("page index %lu has error\n", index);
-                        GOTO(err_page, rc = -EIO);
-                }
+        page = grab_cache_page(mapping, index); /* locked page */
+        if (IS_ERR(page))
+                return lnb->rc = PTR_ERR(page);
+
+        lnb->page = page;
+
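+        /* record in lnb->rc how many bytes of this niobuf are actually
+         * backed by the file, clipping the count at EOF */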
+        if (inode->i_size < lnb->offset + lnb->len - 1)
+                lnb->rc = inode->i_size - lnb->offset;
+        else
+                lnb->rc = lnb->len;
+
+        if (PageUptodate(page)) {
+                unlock_page(page);
+                return 0;
+        }
+
+        rc = mapping->a_ops->readpage(NULL, page);
+        if (rc < 0) {
+                CERROR("page index %lu, rc = %d\n", index, rc);
+                lnb->page = NULL;
+                lustre_put_page(page);
+                return lnb->rc = rc;
         }
-        return page;
+
+        return 0;
+}
+
+static int filter_finish_page_read(struct niobuf_local *lnb)
+{
+        if (lnb->page == NULL)
+                return 0;
+
+        if (PageUptodate(lnb->page))
+                return 0;
+
+        wait_on_page(lnb->page);
+        if (!PageUptodate(lnb->page)) {
+                CERROR("page index %lu/offset "LPX64" not uptodate\n",
+                       lnb->page->index, lnb->offset);
+                GOTO(err_page, lnb->rc = -EIO);
+        }
+        if (PageError(lnb->page)) {
+                CERROR("page index %lu/offset "LPX64" has error\n",
+                       lnb->page->index, lnb->offset);
+                GOTO(err_page, lnb->rc = -EIO);
+        }
+
+        return 0;
 
 err_page:
-        lustre_put_page(page);
-        return ERR_PTR(rc);
+        lustre_put_page(lnb->page);
+        lnb->page = NULL;
+        return lnb->rc;
 }
 
-static struct page *
-lustre_get_page_write(struct inode *inode, unsigned long index)
+static struct page *lustre_get_page_write(struct inode *inode,
+                                          unsigned long index)
 {
         struct address_space *mapping = inode->i_mapping;
         struct page *page;
@@ -1826,7 +2138,6 @@ lustre_get_page_write(struct inode *inode, unsigned long index)
         page = grab_cache_page(mapping, index); /* locked page */
 
         if (!IS_ERR(page)) {
-                kmap(page);
                 /* Note: Called with "O" and "PAGE_SIZE" this is essentially
                  * a no-op for most filesystems, because we write the whole
                  * page.  For partial-page I/O this will read in the page.
@@ -1888,7 +2199,7 @@ static int lustre_commit_write(struct niobuf_local *lnb)
         LASSERT(to <= PAGE_SIZE);
         err = page->mapping->a_ops->commit_write(NULL, page, from, to);
         if (!err && IS_SYNC(inode))
-                err = waitfor_one_page(page);
+                waitfor_one_page(page);
         //SetPageUptodate(page); // the client commit_write will do this
 
         SetPageReferenced(page);
@@ -1897,8 +2208,8 @@ static int lustre_commit_write(struct niobuf_local *lnb)
         return err;
 }
 
-struct page *filter_get_page_write(struct inode *inode,
-                                   struct niobuf_local *lnb, int *pglocked)
+int filter_get_page_write(struct inode *inode, struct niobuf_local *lnb,
+                          int *pglocked)
 {
         unsigned long index = lnb->offset >> PAGE_SHIFT;
         struct address_space *mapping = inode->i_mapping;
@@ -1923,14 +2234,11 @@ struct page *filter_get_page_write(struct inode *inode,
                 }
                 POISON((void *)addr, 0xBA, PAGE_SIZE);
                 page = virt_to_page(addr);
-                kmap(page);
                 page->index = index;
-                lnb->addr = (void *)addr;
                 lnb->page = page;
                 lnb->flags |= N_LOCAL_TEMP_PAGE;
         } else if (!IS_ERR(page)) {
                 (*pglocked)++;
-                kmap(page);
 
                 rc = mapping->a_ops->prepare_write(NULL, page,
                                                    lnb->offset & ~PAGE_MASK,
@@ -1946,17 +2254,16 @@ struct page *filter_get_page_write(struct inode *inode,
                         LBUG();
                         GOTO(err_unlock, rc = -EIO);
                 }
-                lnb->addr = page_address(page);
                 lnb->page = page;
         }
 
-        return page;
+        return 0;
 
 err_unlock:
         unlock_page(page);
         lustre_put_page(page);
 err:
-        return ERR_PTR(rc);
+        return lnb->rc = rc;
 }
 
 /*
@@ -1987,30 +2294,34 @@ static int filter_commit_write(struct niobuf_local *lnb, int err)
                 for (bh = head, block_start = 0; bh != head || !block_start;
                      block_start = block_end, bh = bh->b_this_page) {
                         block_end = block_start + blocksize;
-                        if (buffer_new(bh))
-                                memset(lnb->addr + block_start, 0, blocksize);
+                        if (buffer_new(bh)) {
+                                memset(kmap(lnb->page) + block_start, 0,
+                                       blocksize);
+                                kunmap(lnb->page);
+                        }
                 }
         }
 #endif
         return lustre_commit_write(lnb);
 }
 
-static int filter_preprw(int cmd, struct lustre_handle *conn,
+static int filter_preprw(int cmd, struct obd_export *export,
                          int objcount, struct obd_ioobj *obj,
                          int niocount, struct niobuf_remote *nb,
                          struct niobuf_local *res, void **desc_private,
                          struct obd_trans_info *oti)
 {
         struct obd_run_ctxt saved;
-        struct obd_export *export;
         struct obd_device *obd;
         struct obd_ioobj *o;
-        struct niobuf_remote *rnb = nb;
-        struct niobuf_local *lnb = res;
+        struct niobuf_remote *rnb;
+        struct niobuf_local *lnb;
         struct fsfilt_objinfo *fso;
-        int pglocked = 0;
-        int rc = 0;
-        int i;
+        struct dentry *dentry;
+        struct inode *inode;
+        struct lprocfs_counters *cntrs;
+        int pglocked = 0, rc = 0, i, j;
+
         ENTRY;
 
         if ((cmd & OBD_BRW_WRITE) != 0)
@@ -2020,14 +2331,18 @@ static int filter_preprw(int cmd, struct lustre_handle *conn,
 
         memset(res, 0, niocount * sizeof(*res));
 
-        export = class_conn2export(conn);
-        obd = class_conn2obd(conn);
-        if (!obd) {
-                CDEBUG(D_IOCTL, "invalid client "LPX64"\n", conn->addr);
+        obd = export->exp_obd;
+        if (obd == NULL)
                 RETURN(-EINVAL);
-        }
 
-        LASSERT(objcount < 16); // theoretically we support multi-obj BRW
+        cntrs = obd->counters;
+        if ((cmd & OBD_BRW_WRITE) != 0)
+                LPROCFS_COUNTER_INCBY1(&cntrs->cntr[LPROC_FILTER_WRITES]);
+        else
+                LPROCFS_COUNTER_INCBY1(&cntrs->cntr[LPROC_FILTER_READS]);
+
+        // theoretically we support multi-obj BRW RPCs, but until then...
+        LASSERT(objcount == 1);
 
         OBD_ALLOC(fso, objcount * sizeof(*fso));
         if (!fso)
@@ -2037,7 +2352,6 @@ static int filter_preprw(int cmd, struct lustre_handle *conn,
 
         for (i = 0, o = obj; i < objcount; i++, o++) {
                 struct filter_dentry_data *fdd;
-                struct dentry *dentry;
 
                 LASSERT(o->ioo_bufcnt);
 
@@ -2045,7 +2359,7 @@ static int filter_preprw(int cmd, struct lustre_handle *conn,
                                                               o->ioo_id),
                                            o->ioo_id, 0);
 
-                if (IS_ERR(dentry)) 
+                if (IS_ERR(dentry))
                         GOTO(out_objinfo, rc = PTR_ERR(dentry));
 
                 fso[i].fso_dentry = dentry;
@@ -2054,10 +2368,26 @@ static int filter_preprw(int cmd, struct lustre_handle *conn,
                 if (!dentry->d_inode) {
                         CERROR("trying to BRW to non-existent file "LPU64"\n",
                                o->ioo_id);
-                        f_dput(dentry);
                         GOTO(out_objinfo, rc = -ENOENT);
                 }
 
+                /* If we ever start to support multi-object BRW RPCs, we will
+                 * need to get locks on multiple inodes (in order) or use the
+                 * DLM to do the locking for us (and use the same locking in
+                 * filter_setattr() for truncate).  That isn't all, because
+                 * there is still the possibility of a truncate starting a new
+                 * transaction while holding the ext3 rwsem for write, while
+                 * some writes (which have started their transactions here)
+                 * block on the ext3 rwsem for read => lock inversion.
+                 *
+                 * The handling gets very ugly when dealing with locked pages.
+                 * It may be easier to just get rid of the locked page code
+                 * (which has problems of its own) and either discover we do
+                 * not need it anymore (i.e. it was a symptom of another bug)
+                 * or ensure we get the page locks in an appropriate order.
+                 */
+                if (cmd & OBD_BRW_WRITE)
+                        down(&dentry->d_inode->i_sem);
                 fdd = dentry->d_fsdata;
                 if (!fdd || !atomic_read(&fdd->fdd_open_count))
                         CDEBUG(D_PAGE, "I/O to unopened object "LPU64"\n",
@@ -2065,22 +2395,6 @@ static int filter_preprw(int cmd, struct lustre_handle *conn,
         }
 
         if (cmd & OBD_BRW_WRITE) {
-#warning "FIXME: we need inode->i_sem for each object to protect vs truncate"
-                /* Even worse, we need to get locks on mulitple inodes (in
-                 * order) or use the DLM to do the locking for us (and use
-                 * the same locking in filter_setattr() for truncate.  The
-                 * handling gets very ugly when dealing with locked pages.
-                 * It may be easier to just get rid of the locked page code
-                 * (which has problems of its own) and either discover we do
-                 * not need it anymore (i.e. it was a symptom of another bug)
-                 * or ensure we get the page locks in an appropriate order.
-                 */
-                /* Danger, Will Robinson! You are taking a lock here and also
-                 * starting a transaction and releasing/finishing then in
-                 * filter_commitrw(), so you must call fsfilt_commit() and
-                 * finish_transno() if an error occurs in this function.
-                 */
-                filter_start_transno(export);
                 *desc_private = fsfilt_brw_start(obd, objcount, fso,
                                                  niocount, nb);
                 if (IS_ERR(*desc_private)) {
@@ -2092,52 +2406,65 @@ static int filter_preprw(int cmd, struct lustre_handle *conn,
                 }
         }
 
-        obd_kmap_get(niocount, 1);
-
-        for (i = 0, o = obj; i < objcount; i++, o++) {
-                struct dentry *dentry;
-                struct inode *inode;
-                int j;
-
+        for (i = 0, o = obj, rnb = nb, lnb = res; i < objcount; i++, o++) {
                 dentry = fso[i].fso_dentry;
                 inode = dentry->d_inode;
 
                 for (j = 0; j < o->ioo_bufcnt; j++, rnb++, lnb++) {
-                        struct page *page;
-
                         if (j == 0)
                                 lnb->dentry = dentry;
                         else
                                 lnb->dentry = dget(dentry);
 
-                        /* lnb->offset is aligned, while rnb->offset isn't,
-                         * and we need to copy the fields to lnb anyways.
-                         */
-                        memcpy(lnb, rnb, sizeof(*rnb));
+                        lnb->offset = rnb->offset;
+                        lnb->len    = rnb->len;
+                        lnb->flags  = rnb->flags;
+
                         if (cmd & OBD_BRW_WRITE) {
-                                page = filter_get_page_write(inode, lnb,
-                                                             &pglocked);
+                                rc = filter_get_page_write(inode,lnb,&pglocked);
 
                                 XPROCFS_BUMP_MYCPU_IOSTAT(st_write_bytes,
                                                           lnb->len);
+                                LPROCFS_COUNTER_INCR(&cntrs->cntr[LPROC_FILTER_WRITE_BYTES], lnb->len);
+                        } else if (inode->i_size <= rnb->offset) {
+                                /* If there's no more data, abort early.
+                                 * lnb->page == NULL and lnb->rc == 0, so it's
+                                 * easy to detect later. */
+                                f_dput(lnb->dentry);
+                                lnb->dentry = NULL;
+                                break;
                         } else {
-                                page = lustre_get_page_read(inode, lnb);
+                                rc = filter_start_page_read(inode, lnb);
 
                                 XPROCFS_BUMP_MYCPU_IOSTAT(st_read_bytes,
                                                           lnb->len);
+                                LPROCFS_COUNTER_INCR(&cntrs->cntr[LPROC_FILTER_READ_BYTES], lnb->len);
                         }
 
-                        if (IS_ERR(page)) {
-                                rc = PTR_ERR(page);
+                        if (rc) {
                                 CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
                                        "error on page @"LPU64"%u/%u: rc = %d\n",
                                        lnb->offset, j, o->ioo_bufcnt, rc);
                                 f_dput(dentry);
                                 GOTO(out_pages, rc);
                         }
+
+                        if ((cmd & OBD_BRW_READ) && lnb->rc < lnb->len) {
+                                /* Likewise with a partial read */
+                                break;
+                        }
                 }
         }
 
+        while ((cmd & OBD_BRW_READ) && lnb-- > res) {
+                rc = filter_finish_page_read(lnb);
+                if (rc) {
+                        CERROR("error on page %u@"LPU64": rc = %d\n",
+                               lnb->len, lnb->offset, rc);
+                        f_dput(lnb->dentry);
+                        GOTO(out_pages, rc);
+                }
+        }
         EXIT;
 out:
         OBD_FREE(fso, objcount * sizeof(*fso));
@@ -2147,30 +2474,36 @@ out:
 
 out_pages:
         while (lnb-- > res) {
-                if (cmd & OBD_BRW_WRITE)
+                if (cmd & OBD_BRW_WRITE) {
                         filter_commit_write(lnb, rc);
-                else
+                        up(&lnb->dentry->d_inode->i_sem);
+                } else {
                         lustre_put_page(lnb->page);
+                }
                 f_dput(lnb->dentry);
         }
-        obd_kmap_put(niocount);
         if (cmd & OBD_BRW_WRITE) {
                 filter_finish_transno(export, *desc_private, oti, rc);
                 fsfilt_commit(obd,
                               filter_parent(obd,S_IFREG,obj->ioo_id)->d_inode,
-                              *desc_private);
+                              *desc_private, 0);
         }
         goto out; /* dropped the dentry refs already (one per page) */
 
 out_objinfo:
-        for (i = 0; i < objcount && fso[i].fso_dentry; i++)
+        for (i = 0; i < objcount && fso[i].fso_dentry; i++) {
+                if (cmd & OBD_BRW_WRITE)
+                        up(&fso[i].fso_dentry->d_inode->i_sem);
                 f_dput(fso[i].fso_dentry);
+        }
         goto out;
 }
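
Editorial note: the read path in filter_preprw() now runs in two phases. The first loop starts a read on every page, marking past-EOF niobufs with lnb->page == NULL and lnb->rc == 0 so later stages can skip them; the second loop walks the niobuf array backwards with the `while (lnb-- > res)` idiom, waiting on each read. Below is a rough stand-alone sketch of that shape only; start_read() and wait_read() are hypothetical stand-ins for filter_start_page_read() and filter_finish_page_read().

#include <stdio.h>

struct nio { int idx; int started; };

static void start_read(struct nio *n)
{
        n->started = 1;                 /* stands in for ->readpage() being issued */
}

static int wait_read(struct nio *n)
{
        printf("waiting on page %d\n", n->idx);
        return n->started ? 0 : -1;
}

int main(void)
{
        struct nio res[4] = { {0, 0}, {1, 0}, {2, 0}, {3, 0} }, *lnb;
        int rc = 0;

        for (lnb = res; lnb < res + 4; lnb++)
                start_read(lnb);        /* phase 1: start every read first */

        lnb = res + 4;
        while (lnb > res) {             /* phase 2: reap them, last page first */
                lnb--;
                rc = wait_read(lnb);
                if (rc)
                        break;
        }
        return rc;
}
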
 
 static int filter_write_locked_page(struct niobuf_local *lnb)
 {
         struct page *lpage;
+        void        *lpage_addr;
+        void        *lnb_addr;
         int rc;
         ENTRY;
 
@@ -2195,11 +2528,15 @@ static int filter_write_locked_page(struct niobuf_local *lnb)
                 RETURN(rc);
         }
 
-        /* lpage is kmapped in lustre_get_page_write() above and kunmapped in
-         * lustre_commit_write() below, lnb->page was kmapped previously in
-         * filter_get_page_write() and kunmapped in lustre_put_page() below.
-         */
-        memcpy(page_address(lpage), page_address(lnb->page), PAGE_SIZE);
+        /* 2 kmaps == vanishingly small deadlock opportunity */
+        lpage_addr = kmap(lpage);
+        lnb_addr = kmap(lnb->page);
+
+        memcpy(lpage_addr, lnb_addr, PAGE_SIZE);
+
+        kunmap(lnb->page);
+        kunmap(lpage);
+
         lustre_put_page(lnb->page);
 
         lnb->page = lpage;
@@ -2211,19 +2548,17 @@ static int filter_write_locked_page(struct niobuf_local *lnb)
         RETURN(rc);
 }
 
-static int filter_syncfs(struct lustre_handle *conn)
+static int filter_syncfs(struct obd_export *exp)
 {
-        struct obd_device *obd;
+        struct obd_device *obd = exp->exp_obd;
         ENTRY;
 
-        obd = class_conn2obd(conn);
-
         XPROCFS_BUMP_MYCPU_IOSTAT (st_syncfs_reqs, 1);
 
         RETURN(fsfilt_sync(obd, obd->u.filter.fo_sb));
 }
 
-static int filter_commitrw(int cmd, struct lustre_handle *conn,
+static int filter_commitrw(int cmd, struct obd_export *export,
                            int objcount, struct obd_ioobj *obj,
                            int niocount, struct niobuf_local *res,
                            void *desc_private, struct obd_trans_info *oti)
@@ -2231,11 +2566,8 @@ static int filter_commitrw(int cmd, struct lustre_handle *conn,
         struct obd_run_ctxt saved;
         struct obd_ioobj *o;
         struct niobuf_local *lnb;
-        struct obd_export *export = class_conn2export(conn);
-        struct obd_device *obd = class_conn2obd(conn);
-        int found_locked = 0;
-        int rc = 0;
-        int i;
+        struct obd_device *obd = export->exp_obd;
+        int found_locked = 0, rc = 0, i;
         ENTRY;
 
         push_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL);
@@ -2246,9 +2578,14 @@ static int filter_commitrw(int cmd, struct lustre_handle *conn,
         for (i = 0, o = obj, lnb = res; i < objcount; i++, o++) {
                 int j;
 
-                if (cmd & OBD_BRW_WRITE)
+                if (cmd & OBD_BRW_WRITE) {
                         inode_update_time(lnb->dentry->d_inode, 1);
+                        up(&lnb->dentry->d_inode->i_sem);
+                }
                 for (j = 0 ; j < o->ioo_bufcnt ; j++, lnb++) {
+                        if (lnb->page == NULL) {
+                                continue;
+                        }
                         if (lnb->flags & N_LOCAL_TEMP_PAGE) {
                                 found_locked++;
                                 continue;
@@ -2259,16 +2596,16 @@ static int filter_commitrw(int cmd, struct lustre_handle *conn,
 
                                 if (!rc)
                                         rc = err;
-                        } else
+                        } else {
                                 lustre_put_page(lnb->page);
+                        }
 
-                        obd_kmap_put(1);
                         f_dput(lnb->dentry);
                 }
         }
 
         for (i = 0, o = obj, lnb = res; found_locked > 0 && i < objcount;
-                        i++, o++) {
+             i++, o++) {
                 int j;
                 for (j = 0 ; j < o->ioo_bufcnt ; j++, lnb++) {
                         int err;
@@ -2276,7 +2613,6 @@ static int filter_commitrw(int cmd, struct lustre_handle *conn,
                                 continue;
 
                         err = filter_write_locked_page(lnb);
-                        obd_kmap_put(1);
                         if (!rc)
                                 rc = err;
                         f_dput(lnb->dentry);
@@ -2290,14 +2626,13 @@ static int filter_commitrw(int cmd, struct lustre_handle *conn,
                 int err;
 
                 rc = filter_finish_transno(export, desc_private, oti, rc);
-                err = fsfilt_commit(obd, dir_dentry->d_inode, desc_private);
+                err = fsfilt_commit(obd, dir_dentry->d_inode, desc_private,
+                                    obd_sync_filter);
                 if (err)
                         rc = err;
-                if (obd_sync_filter) {
-                        /* this can fail with ENOMEM, what should we do then? */
-                        filter_syncfs(conn);
-                }
-                /* XXX <adilger> LASSERT(last_rcvd == last_committed)*/
+                if (obd_sync_filter)
+                        LASSERT(oti->oti_transno <= obd->obd_last_committed);
+
         }
 
         LASSERT(!current->journal_info);
@@ -2308,9 +2643,9 @@ static int filter_commitrw(int cmd, struct lustre_handle *conn,
 
 static int filter_brw(int cmd, struct lustre_handle *conn,
                       struct lov_stripe_md *lsm, obd_count oa_bufs,
-                      struct brw_page *pga, struct obd_brw_set *set,
-                      struct obd_trans_info *oti)
+                      struct brw_page *pga, struct obd_trans_info *oti)
 {
+        struct obd_export *export = class_conn2export(conn);
         struct obd_ioobj        ioo;
         struct niobuf_local     *lnb;
         struct niobuf_remote    *rnb;
@@ -2319,6 +2654,9 @@ static int filter_brw(int cmd, struct lustre_handle *conn,
         int                     ret = 0;
         ENTRY;
 
+        if (export == NULL)
+                RETURN(-EINVAL);
+
         OBD_ALLOC(lnb, oa_bufs * sizeof(struct niobuf_local));
         OBD_ALLOC(rnb, oa_bufs * sizeof(struct niobuf_remote));
 
@@ -2335,7 +2673,7 @@ static int filter_brw(int cmd, struct lustre_handle *conn,
         ioo.ioo_type = S_IFREG;
         ioo.ioo_bufcnt = oa_bufs;
 
-        ret = filter_preprw(cmd, conn, 1, &ioo, oa_bufs, rnb, lnb,
+        ret = filter_preprw(cmd, export, 1, &ioo, oa_bufs, rnb, lnb,
                             &desc_private, oti);
         if (ret != 0)
                 GOTO(out, ret);
@@ -2343,16 +2681,20 @@ static int filter_brw(int cmd, struct lustre_handle *conn,
         for (i = 0; i < oa_bufs; i++) {
                 void *virt = kmap(pga[i].pg);
                 obd_off off = pga[i].off & ~PAGE_MASK;
+                void *addr = kmap(lnb[i].page);
+
+                /* 2 kmaps == vanishingly small deadlock opportunity */
 
                 if (cmd & OBD_BRW_WRITE)
-                        memcpy(lnb[i].addr + off, virt + off, pga[i].count);
+                        memcpy(addr + off, virt + off, pga[i].count);
                 else
-                        memcpy(virt + off, lnb[i].addr + off, pga[i].count);
+                        memcpy(virt + off, addr + off, pga[i].count);
 
+                kunmap(addr);
                 kunmap(virt);
         }
 
-        ret = filter_commitrw(cmd, conn, 1, &ioo, oa_bufs, lnb, desc_private,
+        ret = filter_commitrw(cmd, export, 1, &ioo, oa_bufs, lnb, desc_private,
                               oti);
 
 out:
@@ -2360,6 +2702,7 @@ out:
                 OBD_FREE(lnb, oa_bufs * sizeof(struct niobuf_local));
         if (rnb)
                 OBD_FREE(rnb, oa_bufs * sizeof(struct niobuf_remote));
+        class_export_put(export);
         RETURN(ret);
 }
 
@@ -2381,7 +2724,8 @@ static int filter_san_preprw(int cmd, struct lustre_handle *conn,
 
         obd = class_conn2obd(conn);
         if (!obd) {
-                CDEBUG(D_IOCTL, "invalid client "LPX64"\n", conn->addr);
+                CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n",
+                       conn->cookie);
                 RETURN(-EINVAL);
         }
 
@@ -2451,29 +2795,32 @@ static int filter_statfs(struct lustre_handle *conn, struct obd_statfs *osfs)
         RETURN(fsfilt_statfs(obd, obd->u.filter.fo_sb, osfs));
 }
 
-static int filter_get_info(struct lustre_handle *conn, obd_count keylen,
-                           void *key, obd_count *vallen, void **val)
+static int filter_get_info(struct lustre_handle *conn, __u32 keylen,
+                           void *key, __u32 *vallen, void *val)
 {
         struct obd_device *obd;
         ENTRY;
 
         obd = class_conn2obd(conn);
         if (!obd) {
-                CDEBUG(D_IOCTL, "invalid client "LPX64"\n", conn->addr);
+                CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n",
+                       conn->cookie);
                 RETURN(-EINVAL);
         }
 
-        if ( keylen == strlen("blocksize") &&
-             memcmp(key, "blocksize", keylen) == 0 ) {
-                *vallen = sizeof(long);
-                *val = (void *)(long)obd->u.filter.fo_sb->s_blocksize;
+        if (keylen == strlen("blocksize") &&
+            memcmp(key, "blocksize", keylen) == 0) {
+                __u32 *blocksize = val;
+                *vallen = sizeof(*blocksize);
+                *blocksize = obd->u.filter.fo_sb->s_blocksize;
                 RETURN(0);
         }
 
-        if ( keylen == strlen("blocksize_bits") &&
-             memcmp(key, "blocksize_bits", keylen) == 0 ){
-                *vallen = sizeof(long);
-                *val = (void *)(long)obd->u.filter.fo_sb->s_blocksize_bits;
+        if (keylen == strlen("blocksize_bits") &&
+            memcmp(key, "blocksize_bits", keylen) == 0) {
+                __u32 *blocksize_bits = val;
+                *vallen = sizeof(*blocksize_bits);
+                *blocksize_bits = obd->u.filter.fo_sb->s_blocksize_bits;
                 RETURN(0);
         }
 
@@ -2505,12 +2852,7 @@ int filter_copy_data(struct lustre_handle *dst_conn, struct obdo *dst,
         if (page == NULL)
                 RETURN(-ENOMEM);
 
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-        while (TryLockPage(page))
-                ___wait_on_page(page);
-#else
-        wait_on_page_locked(page);
-#endif
+        wait_on_page(page);
 
         /* XXX with brw vector I/O, we could batch up reads and writes here,
          *     all we need to do is allocate multiple pages to handle the I/Os
@@ -2518,14 +2860,6 @@ int filter_copy_data(struct lustre_handle *dst_conn, struct obdo *dst,
          */
         while (index < ((src->o_size + PAGE_SIZE - 1) >> PAGE_SHIFT)) {
                 struct brw_page pg;
-                struct obd_brw_set *set;
-
-                set = obd_brw_set_new();
-                if (set == NULL) {
-                        err = -ENOMEM;
-                        EXIT;
-                        break;
-                }
 
                 pg.pg = page;
                 pg.count = PAGE_SIZE;
@@ -2533,26 +2867,16 @@ int filter_copy_data(struct lustre_handle *dst_conn, struct obdo *dst,
                 pg.flag = 0;
 
                 page->index = index;
-                set->brw_callback = ll_brw_sync_wait;
-                err = obd_brw(OBD_BRW_READ, src_conn, &srcmd, 1, &pg, set,NULL);
-                obd_brw_set_decref(set);
+                err = obd_brw(OBD_BRW_READ, src_conn, &srcmd, 1, &pg, NULL);
                 if (err) {
                         EXIT;
                         break;
                 }
 
-                set = obd_brw_set_new();
-                if (set == NULL) {
-                        err = -ENOMEM;
-                        EXIT;
-                        break;
-                }
                 pg.flag = OBD_BRW_CREATE;
                 CDEBUG(D_INFO, "Read page %ld ...\n", page->index);
 
-                set->brw_callback = ll_brw_sync_wait;
-                err = obd_brw(OBD_BRW_WRITE, dst_conn, &dstmd, 1, &pg, set,oti);
-                obd_brw_set_decref(set);
+                err = obd_brw(OBD_BRW_WRITE, dst_conn, &dstmd, 1, &pg, oti);
 
                 /* XXX should handle dst->o_size, dst->o_blocks here */
                 if (err) {
@@ -2574,26 +2898,27 @@ int filter_copy_data(struct lustre_handle *dst_conn, struct obdo *dst,
 }
 
 static struct obd_ops filter_obd_ops = {
-        o_owner:        THIS_MODULE,
-        o_attach:       filter_attach,
-        o_detach:       filter_detach,
-        o_get_info:     filter_get_info,
-        o_setup:        filter_setup,
-        o_cleanup:      filter_cleanup,
-        o_connect:      filter_connect,
-        o_disconnect:   filter_disconnect,
-        o_statfs:       filter_statfs,
-        o_syncfs:       filter_syncfs,
-        o_getattr:      filter_getattr,
-        o_create:       filter_create,
-        o_setattr:      filter_setattr,
-        o_destroy:      filter_destroy,
-        o_open:         filter_open,
-        o_close:        filter_close,
-        o_brw:          filter_brw,
-        o_punch:        filter_truncate,
-        o_preprw:       filter_preprw,
-        o_commitrw:     filter_commitrw
+        o_owner:          THIS_MODULE,
+        o_attach:         filter_attach,
+        o_detach:         filter_detach,
+        o_get_info:       filter_get_info,
+        o_setup:          filter_setup,
+        o_cleanup:        filter_cleanup,
+        o_connect:        filter_connect,
+        o_disconnect:     filter_disconnect,
+        o_statfs:         filter_statfs,
+        o_syncfs:         filter_syncfs,
+        o_getattr:        filter_getattr,
+        o_create:         filter_create,
+        o_setattr:        filter_setattr,
+        o_destroy:        filter_destroy,
+        o_open:           filter_open,
+        o_close:          filter_close,
+        o_brw:            filter_brw,
+        o_punch:          filter_truncate,
+        o_preprw:         filter_preprw,
+        o_commitrw:       filter_commitrw,
+        o_destroy_export: filter_destroy_export,
 #if 0
         o_san_preprw:  filter_san_preprw,
         o_preallocate: filter_preallocate_inodes,
@@ -2604,26 +2929,27 @@ static struct obd_ops filter_obd_ops = {
 };
 
 static struct obd_ops filter_sanobd_ops = {
-        o_owner:        THIS_MODULE,
-        o_attach:       filter_attach,
-        o_detach:       filter_detach,
-        o_get_info:     filter_get_info,
-        o_setup:        filter_san_setup,
-        o_cleanup:      filter_cleanup,
-        o_connect:      filter_connect,
-        o_disconnect:   filter_disconnect,
-        o_statfs:       filter_statfs,
-        o_getattr:      filter_getattr,
-        o_create:       filter_create,
-        o_setattr:      filter_setattr,
-        o_destroy:      filter_destroy,
-        o_open:         filter_open,
-        o_close:        filter_close,
-        o_brw:          filter_brw,
-        o_punch:        filter_truncate,
-        o_preprw:       filter_preprw,
-        o_commitrw:     filter_commitrw,
-        o_san_preprw:   filter_san_preprw,
+        o_owner:          THIS_MODULE,
+        o_attach:         filter_attach,
+        o_detach:         filter_detach,
+        o_get_info:       filter_get_info,
+        o_setup:          filter_san_setup,
+        o_cleanup:        filter_cleanup,
+        o_connect:        filter_connect,
+        o_disconnect:     filter_disconnect,
+        o_statfs:         filter_statfs,
+        o_getattr:        filter_getattr,
+        o_create:         filter_create,
+        o_setattr:        filter_setattr,
+        o_destroy:        filter_destroy,
+        o_open:           filter_open,
+        o_close:          filter_close,
+        o_brw:            filter_brw,
+        o_punch:          filter_truncate,
+        o_preprw:         filter_preprw,
+        o_commitrw:       filter_commitrw,
+        o_san_preprw:     filter_san_preprw,
+        o_destroy_export: filter_destroy_export
 #if 0
         o_preallocate:  filter_preallocate_inodes,
         o_migrate:      filter_migrate,
@@ -2639,41 +2965,19 @@ static int __init obdfilter_init(void)
         int rc;
 
         printk(KERN_INFO "Lustre Filtering OBD driver; info@clusterfs.com\n");
-        filter_open_cache = kmem_cache_create("ll_filter_fdata",
-                                              sizeof(struct filter_file_data),
-                                              0, 0, NULL, NULL);
-        if (!filter_open_cache)
-                RETURN(-ENOMEM);
-
-        filter_dentry_cache = kmem_cache_create("ll_filter_dentry",
-                                        sizeof(struct filter_dentry_data),
-                                        0, 0, NULL, NULL);
-        if (!filter_dentry_cache) {
-                rc = -ENOMEM;
-                goto err1;
-        }
 
         xprocfs_init ("filter");
-
         lprocfs_init_vars(&lvars);
 
         rc = class_register_type(&filter_obd_ops, lvars.module_vars,
                                  OBD_FILTER_DEVICENAME);
         if (rc)
-                goto err2;
+                return rc;
 
         rc = class_register_type(&filter_sanobd_ops, lvars.module_vars,
                                  OBD_FILTER_SAN_DEVICENAME);
         if (rc)
-                goto err3;
-
-        return 0;
-err3:
-        class_unregister_type(OBD_FILTER_DEVICENAME);
-err2:
-        kmem_cache_destroy(filter_dentry_cache);
-err1:
-        kmem_cache_destroy(filter_open_cache);
+                class_unregister_type(OBD_FILTER_DEVICENAME);
         return rc;
 }
 
@@ -2681,10 +2985,6 @@ static void __exit obdfilter_exit(void)
 {
         class_unregister_type(OBD_FILTER_SAN_DEVICENAME);
         class_unregister_type(OBD_FILTER_DEVICENAME);
-        if (kmem_cache_destroy(filter_dentry_cache))
-                CERROR("couldn't free obdfilter dentry cache\n");
-        if (kmem_cache_destroy(filter_open_cache))
-                CERROR("couldn't free obdfilter open cache\n");
         xprocfs_fini ();
 }
 
index c4e0747..89203e5 100644 (file)
@@ -55,6 +55,18 @@ int rd_fstype(char *page, char **start, off_t off, int count, int *eof,
         return snprintf(page, count, "%s\n", dev->u.filter.fo_fstype);
 }
 
+int lprocfs_filter_rd_mntdev(char *page, char **start, off_t off, int count,
+                             int *eof, void *data)
+{
+        struct obd_device *obd = (struct obd_device *)data;
+
+        LASSERT(obd != NULL);
+        LASSERT(obd->u.filter.fo_vfsmnt->mnt_devname);
+        *eof = 1;
+        return snprintf(page, count, "%s\n",
+                        obd->u.filter.fo_vfsmnt->mnt_devname);
+}
+
 struct lprocfs_vars lprocfs_obd_vars[] = {
         { "uuid",        lprocfs_rd_uuid,    0, 0 },
         { "blocksize",   rd_blksize,         0, 0 },
@@ -64,6 +76,7 @@ struct lprocfs_vars lprocfs_obd_vars[] = {
         { "filesfree",   rd_filesfree,       0, 0 },
         { "filegroups",  rd_filegroups,      0, 0 },
         { "fstype",      rd_fstype,          0, 0 },
+        { "mntdev",      lprocfs_filter_rd_mntdev,    0, 0 },
         { 0 }
 };
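
Editorial note: the new "mntdev" entry simply exposes the name of the backing block device under the obdfilter's procfs directory. A small userspace sketch of reading it follows; the exact path is an assumption and the obd name "ost1" is only an example.

#include <stdio.h>

int main(void)
{
        char buf[256];
        FILE *f = fopen("/proc/fs/lustre/obdfilter/ost1/mntdev", "r");

        if (f == NULL) {
                perror("mntdev");
                return 1;
        }
        if (fgets(buf, sizeof(buf), f) != NULL)
                printf("backing device: %s", buf);
        fclose(f);
        return 0;
}
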
 
index 19fd65c..dc0b4d8 100644 (file)
@@ -5,25 +5,14 @@
 
 DEFS=
 
-
 if LIBLUSTRE
 lib_LIBRARIES = libosc.a
-LINX= obd_pack.c client.c
-libosc_a_SOURCES = osc_request.c  $(LINX)
+libosc_a_SOURCES = osc_request.c
 else
 MODULE = osc
 modulefs_DATA = osc.o
 EXTRA_PROGRAMS = osc
-LINX= obd_pack.c client.c
-osc_SOURCES = osc_request.c lproc_osc.c $(LINX)
+osc_SOURCES = osc_request.c lproc_osc.c osc_lib.c
 endif
 
-obd_pack.c: 
-       test -e obd_pack.c || ln -sf $(top_srcdir)/lib/obd_pack.c
-client.c: 
-       test -e client.c || ln -sf $(top_srcdir)/lib/client.c
-
-dist-hook:
-       list='$(LINX)'; for f in $$list; do rm -f $(distdir)/$$f; done
-
 include $(top_srcdir)/Rules
diff --git a/lustre/osc/osc_lib.c b/lustre/osc/osc_lib.c
new file mode 100644 (file)
index 0000000..aa04a1a
--- /dev/null
@@ -0,0 +1,76 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define EXPORT_SYMTAB
+#define DEBUG_SUBSYSTEM S_OSC
+
+#ifdef __KERNEL__
+# include <linux/module.h>
+# include <linux/obd.h>
+# include <linux/obd_ost.h>
+# include <linux/lustre_net.h>
+# include <linux/lustre_dlm.h>
+
+/* convert a pathname into a kdev_t */
+static kdev_t path2dev(char *path)
+{
+        struct dentry *dentry;
+        struct nameidata nd;
+        kdev_t dev;
+        KDEVT_VAL(dev, 0);
+
+        if (!path_init(path, LOOKUP_FOLLOW, &nd))
+                return 0;
+
+        if (path_walk(path, &nd))
+                return 0;
+
+        dentry = nd.dentry;
+        if (dentry->d_inode && !is_bad_inode(dentry->d_inode) &&
+            S_ISBLK(dentry->d_inode->i_mode))
+                dev = dentry->d_inode->i_rdev;
+        path_release(&nd);
+
+        return dev;
+}
+
+int client_sanobd_setup(struct obd_device *obddev, obd_count len, void *buf)
+{
+        struct obd_ioctl_data* data = buf;
+        struct client_obd *cli = &obddev->u.cli;
+        ENTRY;
+
+        if (data->ioc_inllen3 < 1) {
+                CERROR("setup requires a SAN device pathname\n");
+                RETURN(-EINVAL);
+        }
+
+        client_obd_setup(obddev, len, buf);
+
+        cli->cl_sandev = path2dev(data->ioc_inlbuf3);
+        if (!kdev_t_to_nr(cli->cl_sandev)) {
+                CERROR("%s seems not a valid SAN device\n", data->ioc_inlbuf3);
+                RETURN(-EINVAL);
+        }
+
+        RETURN(0);
+}
+#endif
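
Editorial note: path2dev() resolves the SAN device pathname passed in ioc_inlbuf3 to a kdev_t, and client_sanobd_setup() rejects the setup if the path is not a block device. The following is a userspace analogue of the same check, using stat(2) in place of path_init()/path_walk(); path2dev_user() is a hypothetical name and this is illustration only, not part of the patch.

#include <stdio.h>
#include <sys/stat.h>

/* return the device number for a block-device path, or 0 if it is not one */
static unsigned long path2dev_user(const char *path)
{
        struct stat st;

        if (stat(path, &st) != 0 || !S_ISBLK(st.st_mode))
                return 0;
        return (unsigned long)st.st_rdev;
}

int main(int argc, char **argv)
{
        unsigned long dev = argc > 1 ? path2dev_user(argv[1]) : 0;

        if (dev == 0) {
                fprintf(stderr, "not a valid SAN device\n");
                return 1;
        }
        printf("device number: %#lx\n", dev);
        return 0;
}
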
index 515aa70..2289c74 100644 (file)
 #include <linux/kp30.h>
 #include <linux/lustre_mds.h> /* for mds_objid */
 #include <linux/obd_ost.h>
-#include <linux/obd_lov.h> /* for IOC_LOV_SET_OSC_ACTIVE */
+
+#ifndef  __CYGWIN__
 #include <linux/ctype.h>
 #include <linux/init.h>
+#else
+#include <ctype.h>
+#endif
+
 #include <linux/lustre_ha.h>
 #include <linux/obd_support.h> /* for OBD_FAIL_CHECK */
 #include <linux/lustre_lite.h> /* for ll_i2info */
 #include <portals/lib-types.h> /* for PTL_MD_MAX_IOV */
 #include <linux/lprocfs_status.h>
 
-/* It is important that ood_fh remain the first item in this structure: that
- * way, we don't have to re-pack the obdo's inline data before we send it to
- * the server, we can just send the whole struct unaltered. */
-#define OSC_OBDO_DATA_MAGIC 0xD15EA5ED
-struct osc_obdo_data {
-        struct lustre_handle ood_fh;
-        struct ptlrpc_request *ood_request;
-        __u32 ood_magic;
-};
-#include <linux/obd_lov.h> /* just for the startup assertion; is that wrong? */
-
-static int send_sync(struct obd_import *imp, struct ll_fid *rootfid,
-                          int level, int msg_flags)
-{
-        struct ptlrpc_request *req;
-        struct mds_body *body;
-        int rc, size = sizeof(*body);
-        ENTRY;
-
-        req = ptlrpc_prep_req(imp, OST_SYNCFS, 1, &size, NULL);
-        if (!req)
-                GOTO(out, rc = -ENOMEM);
-
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
-        req->rq_level = level;
-        req->rq_replen = lustre_msg_size(1, &size);
-
-        req->rq_reqmsg->flags |= msg_flags;
-        rc = ptlrpc_queue_wait(req);
-
-        if (!rc) {
-                CDEBUG(D_NET, "last_committed="LPU64
-                       ", last_xid="LPU64"\n",
-                       req->rq_repmsg->last_committed,
-                       req->rq_repmsg->last_xid);
-        }
-
-        EXIT;
- out:
-        ptlrpc_req_finished(req);
-        return rc;
-}
-
-static int signal_completed_replay(struct obd_import *imp)
-{
-        struct ll_fid fid;
-
-        return send_sync(imp, &fid, LUSTRE_CONN_RECOVD, MSG_LAST_REPLAY);
-}
-
 static int osc_attach(struct obd_device *dev, obd_count len, void *data)
 {
         struct lprocfs_static_vars lvars;
@@ -120,7 +75,7 @@ static int osc_detach(struct obd_device *dev)
         return lprocfs_obd_detach(dev);
 }
 
-/* Pack OSC object metadata for shipment to the MDS. */
+/* Pack OSC object metadata for disk storage (LE byte order). */
 static int osc_packmd(struct lustre_handle *conn, struct lov_mds_md **lmmp,
                       struct lov_stripe_md *lsm)
 {
@@ -142,20 +97,36 @@ static int osc_packmd(struct lustre_handle *conn, struct lov_mds_md **lmmp,
                 if (!*lmmp)
                         RETURN(-ENOMEM);
         }
+
         if (lsm) {
                 LASSERT(lsm->lsm_object_id);
-                (*lmmp)->lmm_object_id = (lsm->lsm_object_id);
+                (*lmmp)->lmm_object_id = cpu_to_le64 (lsm->lsm_object_id);
         }
 
         RETURN(lmm_size);
 }
 
+/* Unpack OSC object metadata from disk storage (LE byte order). */
 static int osc_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
-                        struct lov_mds_md *lmm)
+                        struct lov_mds_md *lmm, int lmm_bytes)
 {
         int lsm_size;
         ENTRY;
 
+        if (lmm != NULL) {
+                if (lmm_bytes < sizeof (*lmm)) {
+                        CERROR("lov_mds_md too small: %d, need %d\n",
+                               lmm_bytes, (int)sizeof(*lmm));
+                        RETURN (-EINVAL);
+                }
+                /* XXX LOV_MAGIC etc check? */
+
+                if (lmm->lmm_object_id == cpu_to_le64 (0)) {
+                        CERROR ("lov_mds_md: zero lmm_object_id\n");
+                        RETURN (-EINVAL);
+                }
+        }
+
         lsm_size = sizeof(**lsmp);
         if (!lsmp)
                 RETURN(lsm_size);
@@ -172,21 +143,76 @@ static int osc_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp,
                         RETURN(-ENOMEM);
         }
 
-        /* XXX endianness */
         if (lmm) {
-                (*lsmp)->lsm_object_id = (lmm->lmm_object_id);
+                /* XXX zero *lsmp? */
+                (*lsmp)->lsm_object_id = le64_to_cpu (lmm->lmm_object_id);
+                (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES;
                 LASSERT((*lsmp)->lsm_object_id);
         }
 
         RETURN(lsm_size);
 }
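
Editorial note: osc_packmd()/osc_unpackmd() now store lmm_object_id in little-endian byte order, so the on-disk LOV metadata is identical regardless of the CPU that wrote it. The sketch below is a userspace analogue using glibc's htole64()/le64toh() in place of cpu_to_le64()/le64_to_cpu(); it is illustration only, not Lustre code.

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        uint64_t object_id = 0x1122334455667788ULL;
        uint64_t ondisk = htole64(object_id);   /* pack: cpu -> little endian */
        unsigned char bytes[8];

        memcpy(bytes, &ondisk, sizeof(bytes));
        /* the first on-disk byte is 0x88 on both LE and BE hosts */
        printf("first on-disk byte: 0x%02x\n", bytes[0]);
        /* unpack: little endian -> cpu recovers the original id */
        printf("unpacked id: 0x%llx\n", (unsigned long long)le64toh(ondisk));
        return 0;
}
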
 
-inline void oti_from_request(struct obd_trans_info *oti,
-                             struct ptlrpc_request *req)
+#warning "FIXME: make this be sent from OST"
+#define OSC_BRW_MAX_SIZE 65536
+#define OSC_BRW_MAX_IOV min_t(int, PTL_MD_MAX_IOV, OSC_BRW_MAX_SIZE/PAGE_SIZE)
+
+static int osc_getattr_interpret(struct ptlrpc_request *req,
+                                 struct osc_getattr_async_args *aa, int rc)
 {
-        if (oti && req->rq_repmsg)
-                oti->oti_transno = NTOH__u64(req->rq_repmsg->transno);
-        EXIT;
+        struct obdo     *oa = aa->aa_oa;
+        struct ost_body *body;
+        ENTRY;
+
+        if (rc != 0) {
+                CERROR("failed: rc = %d\n", rc);
+                RETURN (rc);
+        }
+
+        body = lustre_swab_repbuf (req, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        if (body == NULL) {
+                CERROR ("can't unpack ost_body\n");
+                RETURN (-EPROTO);
+        }
+
+        CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
+        memcpy(oa, &body->oa, sizeof(*oa));
+
+        /* This should really be sent by the OST */
+        oa->o_blksize = OSC_BRW_MAX_SIZE;
+        oa->o_valid |= OBD_MD_FLBLKSZ;
+
+        RETURN (0);
+}
+
+static int osc_getattr_async(struct lustre_handle *conn, struct obdo *oa,
+                             struct lov_stripe_md *md,
+                             struct ptlrpc_request_set *set)
+{
+        struct ptlrpc_request *request;
+        struct ost_body *body;
+        int size = sizeof(*body);
+        struct osc_getattr_async_args *aa;
+        ENTRY;
+
+        request = ptlrpc_prep_req(class_conn2cliimp(conn), OST_GETATTR, 1,
+                                  &size, NULL);
+        if (!request)
+                RETURN(-ENOMEM);
+
+        body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
+        memcpy(&body->oa, oa, sizeof(*oa));
+
+        request->rq_replen = lustre_msg_size(1, &size);
+        request->rq_interpret_reply = osc_getattr_interpret;
+
+        LASSERT (sizeof (*aa) <= sizeof (request->rq_async_args));
+        aa = (struct osc_getattr_async_args *)&request->rq_async_args;
+        aa->aa_oa = oa;
+
+        ptlrpc_set_add_req (set, request);
+        RETURN (0);
 }
 
 static int osc_getattr(struct lustre_handle *conn, struct obdo *oa,
@@ -202,8 +228,7 @@ static int osc_getattr(struct lustre_handle *conn, struct obdo *oa,
         if (!request)
                 RETURN(-ENOMEM);
 
-        body = lustre_msg_buf(request->rq_reqmsg, 0);
-#warning FIXME: pack only valid fields instead of memcpy, endianness
+        body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
         memcpy(&body->oa, oa, sizeof(*oa));
 
         request->rq_replen = lustre_msg_size(1, &size);
@@ -214,32 +239,103 @@ static int osc_getattr(struct lustre_handle *conn, struct obdo *oa,
                 GOTO(out, rc);
         }
 
-        body = lustre_msg_buf(request->rq_repmsg, 0);
+        body = lustre_swab_repbuf(request, 0, sizeof (*body),
+                                  lustre_swab_ost_body);
+        if (body == NULL) {
+                CERROR ("can't unpack ost_body\n");
+                GOTO (out, rc = -EPROTO);
+        }
+
         CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode);
         memcpy(oa, &body->oa, sizeof(*oa));
 
+        /* This should really be sent by the OST */
+        oa->o_blksize = OSC_BRW_MAX_SIZE;
+        oa->o_valid |= OBD_MD_FLBLKSZ;
+
         EXIT;
  out:
         ptlrpc_req_finished(request);
         return rc;
 }
 
+/* The import lock must already be held. */
+static inline void osc_update_body_handle(struct list_head *head,
+                                          struct lustre_handle *old,
+                                          struct lustre_handle *new, int op)
+{
+        struct list_head *tmp;
+        struct ost_body *body;
+        struct ptlrpc_request *req;
+        struct ptlrpc_request *last_req = NULL; /* temporary fire escape */
+
+        list_for_each(tmp, head) {
+                req = list_entry(tmp, struct ptlrpc_request, rq_list);
+
+                /* XXX ok to remove when bug 1303 resolved - rread 05/27/03  */
+                LASSERT (req != last_req);
+                last_req = req;
+
+                if (req->rq_reqmsg->opc != op)
+                        continue;
+                body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body));
+                if (memcmp(obdo_handle(&body->oa), old, sizeof(*old)))
+                        continue;
+
+                DEBUG_REQ(D_HA, req, "updating close body with new fh");
+                memcpy(obdo_handle(&body->oa), new, sizeof(*new));
+        }
+}
+
+static void osc_replay_open(struct ptlrpc_request *req)
+{
+        struct lustre_handle old;
+        struct ost_body *body;
+        struct obd_client_handle *och = req->rq_replay_data;
+        struct lustre_handle *oa_handle;
+        ENTRY;
+
+        body = lustre_swab_repbuf (req, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        LASSERT (body != NULL);
+
+        oa_handle = obdo_handle(&body->oa);
+
+        memcpy(&old, &och->och_fh, sizeof(old));
+        CDEBUG(D_HA, "updating cookie from "LPD64" to "LPD64"\n",
+               och->och_fh.cookie, oa_handle->cookie);
+        memcpy(&och->och_fh, oa_handle, sizeof(och->och_fh));
+
+        /* A few frames up, ptlrpc_replay holds the lock, so this is safe. */
+        osc_update_body_handle(&req->rq_import->imp_sending_list, &old,
+                              &och->och_fh, OST_CLOSE);
+        osc_update_body_handle(&req->rq_import->imp_delayed_list, &old,
+                              &och->och_fh, OST_CLOSE);
+        EXIT;
+}
+
+
 static int osc_open(struct lustre_handle *conn, struct obdo *oa,
-                    struct lov_stripe_md *md, struct obd_trans_info *oti)
+                    struct lov_stripe_md *md, struct obd_trans_info *oti,
+                    struct obd_client_handle *och)
 {
         struct ptlrpc_request *request;
         struct ost_body *body;
+        unsigned long flags;
         int rc, size = sizeof(*body);
         ENTRY;
+        LASSERT(och != NULL);
 
         request = ptlrpc_prep_req(class_conn2cliimp(conn), OST_OPEN, 1, &size,
                                   NULL);
         if (!request)
                 RETURN(-ENOMEM);
 
-        request->rq_flags |= PTL_RPC_FL_REPLAY;
-        body = lustre_msg_buf(request->rq_reqmsg, 0);
-#warning FIXME: pack only valid fields instead of memcpy, endianness
+        spin_lock_irqsave (&request->rq_lock, flags);
+        request->rq_replay = 1;
+        spin_unlock_irqrestore (&request->rq_lock, flags);
+
+        body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
         memcpy(&body->oa, oa, sizeof(*oa));
 
         request->rq_replen = lustre_msg_size(1, &size);
@@ -248,28 +344,34 @@ static int osc_open(struct lustre_handle *conn, struct obdo *oa,
         if (rc)
                 GOTO(out, rc);
 
-        if (oa) {
-                struct osc_obdo_data ood;
-                body = lustre_msg_buf(request->rq_repmsg, 0);
-                memcpy(oa, &body->oa, sizeof(*oa));
-
-                /* If the open succeeded, we better have a handle */
-                /* BlueArc OSTs don't send back (o_valid | FLHANDLE).  sigh.
-                 * Temporary workaround until fixed. -phil 24 Feb 03 */
-                //LASSERT(oa->o_valid & OBD_MD_FLHANDLE);
-                oa->o_valid |= OBD_MD_FLHANDLE;
-
-                memcpy(&ood.ood_fh, obdo_handle(oa), sizeof(ood.ood_fh));
-                ood.ood_request = ptlrpc_request_addref(request);
-                ood.ood_magic = OSC_OBDO_DATA_MAGIC;
-
-                /* Save this data in the request; it will be passed back to us
-                 * in future obdos.  This memcpy is guaranteed to be safe,
-                 * because we check at compile-time that sizeof(ood) is smaller
-                 * than oa->o_inline. */
-                memcpy(&oa->o_inline, &ood, sizeof(ood));
+        body = lustre_swab_repbuf (request, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        if (body == NULL) {
+                CERROR ("Can't unpack ost_body\n");
+                GOTO (out, rc = -EPROTO);
         }
 
+        memcpy(oa, &body->oa, sizeof(*oa));
+
+        /* If the open succeeded, we better have a handle */
+        /* BlueArc OSTs don't send back (o_valid | FLHANDLE).  sigh.
+         * Temporary workaround until fixed. -phil 24 Feb 03 */
+        // if ((oa->o_valid & OBD_MD_FLHANDLE) == 0) {
+        //         CERROR ("No file handle\n");
+        //         GOTO (out, rc = -EPROTO);
+        // }
+        oa->o_valid |= OBD_MD_FLHANDLE;
+
+        /* This should really be sent by the OST */
+        oa->o_blksize = OSC_BRW_MAX_SIZE;
+        oa->o_valid |= OBD_MD_FLBLKSZ;
+
+        memcpy(&och->och_fh, obdo_handle(oa), sizeof(och->och_fh));
+        request->rq_replay_cb = osc_replay_open;
+        request->rq_replay_data = och;
+        och->och_req = ptlrpc_request_addref(request);
+        och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
+
         EXIT;
  out:
         ptlrpc_req_finished(request);
@@ -282,55 +384,70 @@ static int osc_close(struct lustre_handle *conn, struct obdo *oa,
         struct obd_import *import = class_conn2cliimp(conn);
         struct ptlrpc_request *request;
         struct ost_body *body;
-        struct osc_obdo_data *ood;
+        struct obd_client_handle *och;
         unsigned long flags;
         int rc, size = sizeof(*body);
         ENTRY;
 
         LASSERT(oa != NULL);
-        ood = (struct osc_obdo_data *)&oa->o_inline;
-        LASSERT(ood->ood_magic == OSC_OBDO_DATA_MAGIC);
+        och = (struct obd_client_handle *)&oa->o_inline;
+        if (och->och_magic == 0) {
+                /* Zero magic means that this file was never opened on this
+                 * OST--almost certainly because the OST was inactive at
+                 * open-time */
+                RETURN(0);
+        }
+        LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC);
 
         request = ptlrpc_prep_req(import, OST_CLOSE, 1, &size, NULL);
         if (!request)
                 RETURN(-ENOMEM);
 
-        body = lustre_msg_buf(request->rq_reqmsg, 0);
-#warning FIXME: pack only valid fields instead of memcpy, endianness
+        body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
         memcpy(&body->oa, oa, sizeof(*oa));
 
         request->rq_replen = lustre_msg_size(1, &size);
 
         rc = ptlrpc_queue_wait(request);
-        if (rc) {
-                /* FIXME: Does this mean that the file is still open locally?
-                 * If not, and I somehow suspect not, we need to cleanup
-                 * below */
-                GOTO(out, rc);
-        }
-
-        spin_lock_irqsave(&import->imp_lock, flags);
-        ood->ood_request->rq_flags &= ~PTL_RPC_FL_REPLAY;
-        /* see comments in llite/file.c:ll_mdc_close() */
-        if (ood->ood_request->rq_transno) {
-                LBUG(); /* this can't happen yet */
-                if (!request->rq_transno) {
-                        request->rq_transno = ood->ood_request->rq_transno;
-                        ptlrpc_retain_replayable_request(request, import);
+        if (rc)
+                CDEBUG(D_HA, "Suppressing close error %d\n", rc); // bug 1036
+
+        /* och_req == NULL can't happen any more, right? --phik */
+        if (och->och_req != NULL) {
+                spin_lock_irqsave(&import->imp_lock, flags);
+                spin_lock (&och->och_req->rq_lock);
+                och->och_req->rq_replay = 0;
+                spin_unlock (&och->och_req->rq_lock);
+                /* see comments in llite/file.c:ll_mdc_close() */
+                if (och->och_req->rq_transno) {
+                        /* this can't happen yet, because the OSTs don't yet
+                         * issue transnos for OPEN requests -phik 21 Apr 2003 */
+                        LBUG();
+                        if (!request->rq_transno && import->imp_replayable) {
+                                request->rq_transno = och->och_req->rq_transno;
+                                ptlrpc_retain_replayable_request(request,
+                                                                 import);
+                        }
+                        spin_unlock_irqrestore(&import->imp_lock, flags);
+                } else {
+                        spin_unlock_irqrestore(&import->imp_lock, flags);
                 }
-                spin_unlock_irqrestore(&import->imp_lock, flags);
-        } else {
-                spin_unlock_irqrestore(&import->imp_lock, flags);
-                ptlrpc_req_finished(ood->ood_request);
+
+                ptlrpc_req_finished(och->och_req);
         }
 
-        body = lustre_msg_buf(request->rq_repmsg, 0);
-        memcpy(oa, &body->oa, sizeof(*oa));
+        if (!rc) {
+                body = lustre_swab_repbuf (request, 0, sizeof (*body),
+                                           lustre_swab_ost_body);
+                if (body == NULL) {
+                        rc = -EPROTO;
+                        CDEBUG(D_HA, "Suppressing close error %d\n", rc); // bug 1036
+                } else
+                        memcpy(oa, &body->oa, sizeof(*oa));
+        }
 
-        EXIT;
- out:
         ptlrpc_req_finished(request);
-        return rc;
+        RETURN(0);
 }
 
 static int osc_setattr(struct lustre_handle *conn, struct obdo *oa,
@@ -346,7 +463,7 @@ static int osc_setattr(struct lustre_handle *conn, struct obdo *oa,
         if (!request)
                 RETURN(-ENOMEM);
 
-        body = lustre_msg_buf(request->rq_reqmsg, 0);
+        body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
         memcpy(&body->oa, oa, sizeof(*oa));
 
         request->rq_replen = lustre_msg_size(1, &size);
@@ -358,12 +475,11 @@ static int osc_setattr(struct lustre_handle *conn, struct obdo *oa,
 }
 
 static int osc_create(struct lustre_handle *conn, struct obdo *oa,
-                      struct lov_stripe_md **ea, struct obd_trans_info *oti_in)
+                      struct lov_stripe_md **ea, struct obd_trans_info *oti)
 {
         struct ptlrpc_request *request;
         struct ost_body *body;
         struct lov_stripe_md *lsm;
-        struct obd_trans_info *oti, trans_info;
         int rc, size = sizeof(*body);
         ENTRY;
 
@@ -377,17 +493,12 @@ static int osc_create(struct lustre_handle *conn, struct obdo *oa,
                         RETURN(rc);
         }
 
-        if (oti_in)
-                oti = oti_in;
-        else
-                oti = &trans_info;
-
         request = ptlrpc_prep_req(class_conn2cliimp(conn), OST_CREATE, 1, &size,
                                   NULL);
         if (!request)
                 GOTO(out, rc = -ENOMEM);
 
-        body = lustre_msg_buf(request->rq_reqmsg, 0);
+        body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
         memcpy(&body->oa, oa, sizeof(*oa));
 
         request->rq_replen = lustre_msg_size(1, &size);
@@ -396,15 +507,28 @@ static int osc_create(struct lustre_handle *conn, struct obdo *oa,
         if (rc)
                 GOTO(out_req, rc);
 
-        body = lustre_msg_buf(request->rq_repmsg, 0);
+        body = lustre_swab_repbuf (request, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        if (body == NULL) {
+                CERROR ("can't unpack ost_body\n");
+                GOTO (out_req, rc = -EPROTO);
+        }
+
         memcpy(oa, &body->oa, sizeof(*oa));
 
+        /* This should really be sent by the OST */
+        oa->o_blksize = OSC_BRW_MAX_SIZE;
+        oa->o_valid |= OBD_MD_FLBLKSZ;
+
         lsm->lsm_object_id = oa->o_id;
         lsm->lsm_stripe_count = 0;
+        lsm->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES;
         *ea = lsm;
 
-        oti_from_request(oti, request);
-        CDEBUG(D_HA, "transno: "LPD64"\n", oti->oti_transno);
+        if (oti != NULL)
+                oti->oti_transno = request->rq_repmsg->transno;
+
+        CDEBUG(D_HA, "transno: "LPD64"\n", request->rq_repmsg->transno);
         EXIT;
 out_req:
         ptlrpc_req_finished(request);
@@ -433,14 +557,13 @@ static int osc_punch(struct lustre_handle *conn, struct obdo *oa,
         if (!request)
                 RETURN(-ENOMEM);
 
-        body = lustre_msg_buf(request->rq_reqmsg, 0);
-#warning FIXME: pack only valid fields instead of memcpy, endianness, valid
+        body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
         memcpy(&body->oa, oa, sizeof(*oa));
 
         /* overload the size and blocks fields in the oa with start/end */
-        body->oa.o_size = HTON__u64(start);
-        body->oa.o_blocks = HTON__u64(end);
-        body->oa.o_valid |= HTON__u32(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
+        body->oa.o_size = start;
+        body->oa.o_blocks = end;
+        body->oa.o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
 
         request->rq_replen = lustre_msg_size(1, &size);
 
@@ -448,7 +571,13 @@ static int osc_punch(struct lustre_handle *conn, struct obdo *oa,
         if (rc)
                 GOTO(out, rc);
 
-        body = lustre_msg_buf(request->rq_repmsg, 0);
+        body = lustre_swab_repbuf (request, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        if (body == NULL) {
+                CERROR ("can't unpack ost_body\n");
+                GOTO (out, rc = -EPROTO);
+        }
+
         memcpy(oa, &body->oa, sizeof(*oa));
 
         EXIT;
@@ -474,8 +603,7 @@ static int osc_destroy(struct lustre_handle *conn, struct obdo *oa,
         if (!request)
                 RETURN(-ENOMEM);
 
-        body = lustre_msg_buf(request->rq_reqmsg, 0);
-#warning FIXME: pack only valid fields instead of memcpy, endianness
+        body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
         memcpy(&body->oa, oa, sizeof(*oa));
 
         request->rq_replen = lustre_msg_size(1, &size);
@@ -484,7 +612,13 @@ static int osc_destroy(struct lustre_handle *conn, struct obdo *oa,
         if (rc)
                 GOTO(out, rc);
 
-        body = lustre_msg_buf(request->rq_repmsg, 0);
+        body = lustre_swab_repbuf (request, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        if (body == NULL) {
+                CERROR ("Can't unpack body\n");
+                GOTO (out, rc = -EPROTO);
+        }
+
         memcpy(oa, &body->oa, sizeof(*oa));
 
         EXIT;
@@ -493,191 +627,259 @@ static int osc_destroy(struct lustre_handle *conn, struct obdo *oa,
         return rc;
 }
 
-/* Our bulk-unmapping bottom half. */
-static void unmap_and_decref_bulk_desc(void *data)
+/* We assume that the reason this OSC got a short read is that it read
+ * beyond the end of a stripe file; i.e. lustre is reading a sparse file
+ * via the LOV, and it _knows_ it's reading inside the file -- it's just
+ * that this stripe has never been written at or beyond this stripe offset
+ * yet.  (A worked example follows the function.) */
+static void handle_short_read(int nob_read, obd_count page_count,
+                              struct brw_page *pga)
 {
-        struct ptlrpc_bulk_desc *desc = data;
-        struct list_head *tmp;
-        ENTRY;
-
-        list_for_each(tmp, &desc->bd_page_list) {
-                struct ptlrpc_bulk_page *bulk;
-                bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
+        char *ptr;
+
+        /* skip bytes read OK */
+        while (nob_read > 0) {
+                LASSERT (page_count > 0);
+
+                if (pga->count > nob_read) {
+                        /* EOF inside this page */
+                        ptr = kmap(pga->pg) + (pga->off & ~PAGE_MASK);
+                        memset(ptr + nob_read, 0, pga->count - nob_read);
+                        kunmap(pga->pg);
+                        page_count--;
+                        pga++;
+                        break;
+                }
 
-                kunmap(bulk->bp_page);
-                obd_kmap_put(1);
+                nob_read -= pga->count;
+                page_count--;
+                pga++;
         }
 
-        ptlrpc_bulk_decref(desc);
-        EXIT;
+        /* zero remaining pages */
+        while (page_count-- > 0) {
+                ptr = kmap(pga->pg) + (pga->off & ~PAGE_MASK);
+                memset(ptr, 0, pga->count);
+                kunmap(pga->pg);
+                pga++;
+        }
 }
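+/* For example (assuming 4096-byte pages): if three full pages were
+ * requested (12288 bytes) and the server returned nob_read == 5000, the
+ * first loop consumes page 0 whole (nob_read drops to 904), finds EOF
+ * inside page 1 and zeroes its bytes 904..4095, and the second loop then
+ * zeroes all of page 2. */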
 
-
-/*  this is the callback function which is invoked by the Portals
- *  event handler associated with the bulk_sink queue and bulk_source queue.
- */
-static void osc_ptl_ev_hdlr(struct ptlrpc_bulk_desc *desc)
+static int check_write_rcs (struct ptlrpc_request *request,
+                            int niocount, obd_count page_count,
+                            struct brw_page *pga)
 {
-        ENTRY;
-
-        LASSERT(desc->bd_brw_set != NULL);
-        LASSERT(desc->bd_brw_set->brw_callback != NULL);
-
-        /* It's important that you don't use desc->bd_brw_set after this
-         * callback runs.  If you do, take a reference on it. */
-        desc->bd_brw_set->brw_callback(desc->bd_brw_set, CB_PHASE_FINISH);
-
-        /* We can't kunmap the desc from interrupt context, so we do it from
-         * the bottom half above. */
-        prepare_work(&desc->bd_queue, unmap_and_decref_bulk_desc, desc);
-        schedule_work(&desc->bd_queue);
+        int    i;
+        __u32 *remote_rcs;
+
+        /* return error if any niobuf was in error */
+        remote_rcs = lustre_swab_repbuf(request, 1,
+                                        sizeof(*remote_rcs) * niocount, NULL);
+        if (remote_rcs == NULL) {
+                CERROR ("Missing/short RC vector on BRW_WRITE reply\n");
+                return (-EPROTO);
+        }
+        if (lustre_msg_swabbed (request->rq_repmsg))
+                for (i = 0; i < niocount; i++)
+                        __swab32s (&remote_rcs[i]);
+
+        for (i = 0; i < niocount; i++) {
+                if (remote_rcs[i] < 0)
+                        return (remote_rcs[i]);
+
+                if (remote_rcs[i] != 0) {
+                        CERROR ("rc[%d] invalid (%d) req %p\n",
+                                i, remote_rcs[i], request);
+                        return (-EPROTO);
+                }
+        }
 
-        EXIT;
+        return (0);
 }
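+/* For example, with niocount == 3 a clean BRW_WRITE reply carries
+ * remote_rcs == {0, 0, 0} and check_write_rcs() returns 0; a negative
+ * entry is meant to be handed straight back as that niobuf's error, and
+ * any other non-zero value is treated as a protocol error (-EPROTO). */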
 
-/*
- * This is called when there was a bulk error return.  However, we don't know
- * whether the bulk completed or not.  We cancel the portals bulk descriptors,
- * so that if the OST decides to send them later we don't double free.  Then
- * remove this descriptor from the set so that the set callback doesn't wait
- * forever for the last CB_PHASE_FINISH to be called, and finally dump all of
- * the bulk descriptor references.
- */
-static void osc_ptl_ev_abort(struct ptlrpc_bulk_desc *desc)
+static inline int can_merge_pages (struct brw_page *p1, struct brw_page *p2)
 {
-        ENTRY;
-
-        LASSERT(desc->bd_brw_set != NULL);
-
-        /* XXX reconcile this with ll_sync_brw_timeout() handling, and/or
-         *     just make osc_ptl_ev_hdlr() check desc->bd_flags for either
-         *     PTL_BULK_FL_RCVD or PTL_BULK_FL_SENT, and pass CB_PHASE_ABORT
-         *     to brw_callback() and do the rest of the cleanup there.  I
-         *     also think ll_sync_brw_timeout() is missing an PtlMEUnlink,
-         *     but I could be wrong.
-         */
-        if (ptlrpc_abort_bulk(desc)) {
-                EXIT;
-                return;
+        if (p1->flag != p2->flag) {
+                /* XXX we don't make much use of 'flag' right now
+                 * but this will warn about usage when we do */
+                CERROR ("different flags set %d, %d\n",
+                        p1->flag, p2->flag);
+                return (0);
         }
-        obd_brw_set_del(desc);
-        unmap_and_decref_bulk_desc(desc);
 
-        EXIT;
+        return (p1->off + p1->count == p2->off);
 }
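+/* For example, two 4096-byte pages at file offsets 0 and 4096 with equal
+ * flags are mergeable, so osc_brw_prep_request() below describes them with
+ * a single niobuf_remote of len 8192; a gap between pages or a flag
+ * mismatch starts a new niobuf instead. */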
 
-static int osc_brw_read(struct lustre_handle *conn, struct lov_stripe_md *lsm,
-                        obd_count page_count, struct brw_page *pga,
-                        struct obd_brw_set *set)
+#if CHECKSUM_BULK
+static __u64 cksum_pages(int nob, obd_count page_count, struct brw_page *pga)
 {
-        struct obd_import *imp = class_conn2cliimp(conn);
-        struct ptlrpc_connection *connection = imp->imp_connection;
-        struct ptlrpc_request *request = NULL;
-        struct ptlrpc_bulk_desc *desc = NULL;
-        struct ost_body *body;
-        int rc, size[3] = {sizeof(*body)}, mapped = 0;
-        struct obd_ioobj *iooptr;
-        struct niobuf_remote *nioptr;
-        __u32 xid;
-        ENTRY;
-
-restart_bulk:
-        size[1] = sizeof(struct obd_ioobj);
-        size[2] = page_count * sizeof(struct niobuf_remote);
+        __u64 cksum = 0;
+        char *ptr;
+        int   i;
 
-        request = ptlrpc_prep_req(imp, OST_READ, 3, size, NULL);
-        if (!request)
-                RETURN(-ENOMEM);
+        while (nob > 0) {
+                LASSERT (page_count > 0);
 
-        body = lustre_msg_buf(request->rq_reqmsg, 0);
-        body->oa.o_valid = HTON__u32(OBD_MD_FLCKSUM * CHECKSUM_BULK);
+                ptr = kmap (pga->pg);
+                ost_checksum (&cksum, ptr + (pga->off & (PAGE_SIZE - 1)),
+                              pga->count > nob ? nob : pga->count);
+                kunmap (pga->pg);
 
-        desc = ptlrpc_prep_bulk(connection);
-        if (!desc)
-                GOTO(out_req, rc = -ENOMEM);
-        desc->bd_portal = OST_BULK_PORTAL;
-        desc->bd_ptl_ev_hdlr = osc_ptl_ev_hdlr;
-        CDEBUG(D_PAGE, "desc = %p\n", desc);
+                nob -= pga->count;
+                page_count--;
+                pga++;
+        }
 
-        iooptr = lustre_msg_buf(request->rq_reqmsg, 1);
-        nioptr = lustre_msg_buf(request->rq_reqmsg, 2);
-        ost_pack_ioo(iooptr, lsm, page_count);
-        /* end almost identical to brw_write case */
+        return (cksum);
+}
+#endif
 
-        xid = ptlrpc_next_xid();       /* single xid for all pages */
+static int osc_brw_prep_request(struct obd_import *imp,
+                                struct lov_stripe_md *lsm, obd_count page_count,
+                                struct brw_page *pga, int cmd,
+                                int *requested_nobp, int *niocountp,
+                                struct ptlrpc_request **reqp)
+{
+        struct ptlrpc_request   *req;
+        struct ptlrpc_bulk_desc *desc;
+        struct ost_body         *body;
+        struct obd_ioobj        *ioobj;
+        struct niobuf_remote    *niobuf;
+        unsigned long            flags;
+        int                      niocount;
+        int                      size[3];
+        int                      i;
+        int                      requested_nob;
+        int                      opc;
+        int                      rc;
+
+        opc = ((cmd & OBD_BRW_WRITE) != 0) ? OST_WRITE : OST_READ;
+
+        for (niocount = i = 1; i < page_count; i++)
+                if (!can_merge_pages (&pga[i - 1], &pga[i]))
+                        niocount++;
+
+        size[0] = sizeof (*body);
+        size[1] = sizeof (*ioobj);
+        size[2] = niocount * sizeof (*niobuf);
+
+        req = ptlrpc_prep_req (imp, opc, 3, size, NULL);
+        if (req == NULL)
+                return (-ENOMEM);
+
+        if (opc == OST_WRITE)
+                desc = ptlrpc_prep_bulk_imp(req, BULK_GET_SOURCE,
+                                            OST_BULK_PORTAL);
+        else
+                desc = ptlrpc_prep_bulk_imp(req, BULK_PUT_SINK,
+                                            OST_BULK_PORTAL);
+        if (desc == NULL)
+                GOTO (out, rc = -ENOMEM);
+        /* NB the request now owns desc and frees it when the request
+         * itself is freed */
+
+        body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body));
+        ioobj = lustre_msg_buf(req->rq_reqmsg, 1, sizeof(*ioobj));
+        niobuf = lustre_msg_buf(req->rq_reqmsg, 2, niocount * sizeof(*niobuf));
+
+        ioobj->ioo_id = lsm->lsm_object_id;
+        ioobj->ioo_gr = 0;
+        ioobj->ioo_type = S_IFREG;
+        ioobj->ioo_bufcnt = niocount;
+
+        LASSERT (page_count > 0);
+        for (requested_nob = i = 0; i < page_count; i++, niobuf++) {
+                struct brw_page *pg = &pga[i];
+                struct brw_page *pg_prev = pg - 1;
+
+                LASSERT (pg->count > 0);
+                LASSERT ((pg->off & (PAGE_SIZE - 1)) + pg->count <= PAGE_SIZE);
+                LASSERT (i == 0 || pg->off > pg_prev->off);
+
+                rc = ptlrpc_prep_bulk_page (desc, pg->pg,
+                                            pg->off & (PAGE_SIZE - 1),
+                                            pg->count);
+                if (rc != 0)
+                        GOTO (out, rc);
 
-        obd_kmap_get(page_count, 0);
+                requested_nob += pg->count;
 
-        for (mapped = 0; mapped < page_count; mapped++, nioptr++) {
-                struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc);
-                if (bulk == NULL) {
-                        unmap_and_decref_bulk_desc(desc);
-                        GOTO(out_req, rc = -ENOMEM);
+                if (i > 0 &&
+                    can_merge_pages (pg_prev, pg)) {
+                        niobuf--;
+                        niobuf->len += pg->count;
+                } else {
+                        niobuf->offset = pg->off;
+                        niobuf->len    = pg->count;
+                        niobuf->flags  = pg->flag;
                 }
-
-                LASSERT(mapped == 0 || pga[mapped].off > pga[mapped - 1].off);
-
-                bulk->bp_xid = xid;           /* single xid for all pages */
-                bulk->bp_buf = kmap(pga[mapped].pg);
-                bulk->bp_page = pga[mapped].pg;
-                bulk->bp_buflen = PAGE_SIZE;
-                ost_pack_niobuf(nioptr, pga[mapped].off, pga[mapped].count,
-                                pga[mapped].flag, bulk->bp_xid);
         }
 
-        /*
-         * Register the bulk first, because the reply could arrive out of order,
-         * and we want to be ready for the bulk data.
-         *
-         * One reference is released when osc_ptl_ev_hdlr() is called by
-         * portals, the other when the caller removes us from the "set" list.
-         *
-         * On error, we never do the brw_finish, so we handle all decrefs.
-         */
-        if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_READ_BULK)) {
-                CERROR("obd_fail_loc=%x, skipping register_bulk\n",
-                       OBD_FAIL_OSC_BRW_READ_BULK);
+        LASSERT ((void *)(niobuf - niocount) ==
+                 lustre_msg_buf(req->rq_reqmsg, 2, niocount * sizeof(*niobuf)));
+#if CHECKSUM_BULK
+        body->oa.o_valid |= OBD_MD_FLCKSUM;
+        if (opc == OST_BRW_WRITE)
+                body->oa.o_rdev = cksum_pages (requested_nob, page_count, pga);
+#endif
+        spin_lock_irqsave (&req->rq_lock, flags);
+        req->rq_no_resend = 1;
+        spin_unlock_irqrestore (&req->rq_lock, flags);
+
+        /* size[0] still sizeof (*body) */
+        if (opc == OST_WRITE) {
+                /* 1 RC per niobuf */
+                size[1] = sizeof(__u32) * niocount;
+                req->rq_replen = lustre_msg_size(2, size);
         } else {
-                rc = ptlrpc_register_bulk_put(desc);
-                if (rc) {
-                        unmap_and_decref_bulk_desc(desc);
-                        GOTO(out_req, rc);
-                }
-                obd_brw_set_add(set, desc);
+                /* 1 RC for the whole I/O */
+                req->rq_replen = lustre_msg_size(1, size);
         }
 
-        request->rq_flags |= PTL_RPC_FL_NO_RESEND;
-        request->rq_replen = lustre_msg_size(1, size);
-        rc = ptlrpc_queue_wait(request);
+        *niocountp = niocount;
+        *requested_nobp = requested_nob;
+        *reqp = req;
+        return (0);
 
-        /* XXX bug 937 here */
-        if (rc == -ETIMEDOUT && (request->rq_flags & PTL_RPC_FL_RESEND)) {
-                DEBUG_REQ(D_HA, request,  "BULK TIMEOUT");
-                ptlrpc_req_finished(request);
-                goto restart_bulk;
+ out:
+        ptlrpc_req_finished (req);
+        return (rc);
+}
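+/* Rough sketch of the messages osc_brw_prep_request() builds (from the
+ * size[] arrays above): the request carries { ost_body, obd_ioobj,
+ * niobuf_remote[niocount] }; the expected reply is { ost_body,
+ * __u32 rcs[niocount] } for OST_WRITE and just { ost_body } for OST_READ.
+ * The pages themselves move over the bulk descriptor, not in the reply. */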
+
+static int osc_brw_fini_request (struct ptlrpc_request *req,
+                                 int requested_nob, int niocount,
+                                 obd_count page_count, struct brw_page *pga,
+                                 int rc)
+{
+        if (rc < 0)
+                return (rc);
+
+        if (req->rq_reqmsg->opc == OST_WRITE) {
+                if (rc > 0) {
+                        CERROR ("Unexpected +ve rc %d\n", rc);
+                        return (-EPROTO);
+                }
+
+                return (check_write_rcs(req, niocount, page_count, pga));
         }
 
-        if (rc) {
-                osc_ptl_ev_abort(desc);
-                GOTO(out_req, rc);
+        if (rc > requested_nob) {
+                CERROR ("Unexpected rc %d (%d requested)\n",
+                        rc, requested_nob);
+                return (-EPROTO);
         }
 
+        if (rc < requested_nob)
+                handle_short_read (rc, page_count, pga);
+
 #if CHECKSUM_BULK
-        body = lustre_msg_buf(request->rq_repmsg, 0);
-        if (body->oa.o_valid & NTOH__u32(OBD_MD_FLCKSUM)) {
+        imp = req->rq_import;
+        body = lustre_swab_repmsg (req, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        if (body == NULL) {
+                CERROR ("Can't unpack body\n");
+        } else if (body->oa.o_valid & OBD_MD_FLCKSUM) {
                 static int cksum_counter;
-                __u64 server_cksum = NTOH__u64(body->oa.o_rdev);
-                __u64 cksum = 0;
-
-                for (mapped = 0; mapped < page_count; mapped++) {
-                        char *ptr = kmap(pga[mapped].pg);
-                        int   off = pga[mapped].off & (PAGE_SIZE - 1);
-                        int   len = pga[mapped].count;
-
-                        LASSERT(off + len <= PAGE_SIZE);
-                        ost_checksum(&cksum, ptr + off, len);
-                        kunmap(pga[mapped].pg);
-                }
+                __u64 server_cksum = body->oa.o_rdev;
+                __u64 cksum = cksum_pages (rc, page_count, pga);
 
                 cksum_counter++;
                 if (server_cksum != cksum) {
@@ -698,143 +900,208 @@ restart_bulk:
                                imp->imp_connection->c_peer.peer_nid);
         }
 #endif
-
-        EXIT;
- out_req:
-        ptlrpc_req_finished(request);
-        return rc;
+        return (0);
 }
 
-static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *lsm,
-                         obd_count page_count, struct brw_page *pga,
-                         struct obd_brw_set *set, struct obd_trans_info *oti)
+static int osc_brw_internal(struct lustre_handle *conn,
+                            struct lov_stripe_md *lsm,
+                            obd_count page_count, struct brw_page *pga, int cmd)
 {
-        struct obd_import *imp = class_conn2cliimp(conn);
-        struct ptlrpc_connection *connection = imp->imp_connection;
-        struct ptlrpc_request *request = NULL;
-        struct ptlrpc_bulk_desc *desc = NULL;
-        struct ost_body *body;
-        int rc, size[3] = {sizeof(*body)}, mapped = 0;
-        struct obd_ioobj *iooptr;
-        struct niobuf_remote *nioptr;
-        __u32 xid;
-#if CHECKSUM_BULK
-        __u64 cksum = 0;
-#endif
+        int                    requested_nob;
+        int                    niocount;
+        struct ptlrpc_request *request;
+        int                    rc;
         ENTRY;
 
 restart_bulk:
-        size[1] = sizeof(struct obd_ioobj);
-        size[2] = page_count * sizeof(struct niobuf_remote);
+        rc = osc_brw_prep_request(class_conn2cliimp(conn), lsm, page_count, pga,
+                                  cmd, &requested_nob, &niocount, &request);
+        /* NB ^ sets rq_no_resend */
 
-        request = ptlrpc_prep_req(imp, OST_WRITE, 3, size, NULL);
-        if (!request)
-                RETURN(-ENOMEM);
+        if (rc != 0)
+                return (rc);
 
-        body = lustre_msg_buf(request->rq_reqmsg, 0);
+        rc = ptlrpc_queue_wait(request);
 
-        desc = ptlrpc_prep_bulk(connection);
-        if (!desc)
-                GOTO(out_req, rc = -ENOMEM);
-        desc->bd_portal = OSC_BULK_PORTAL;
-        desc->bd_ptl_ev_hdlr = osc_ptl_ev_hdlr;
-        CDEBUG(D_PAGE, "desc = %p\n", desc);
+        if (rc == -ETIMEDOUT && request->rq_resend) {
+                DEBUG_REQ(D_HA, request,  "BULK TIMEOUT");
+                ptlrpc_req_finished(request);
+                goto restart_bulk;
+        }
 
-        iooptr = lustre_msg_buf(request->rq_reqmsg, 1);
-        nioptr = lustre_msg_buf(request->rq_reqmsg, 2);
-        ost_pack_ioo(iooptr, lsm, page_count);
-        /* end almost identical to brw_read case */
+        rc = osc_brw_fini_request (request, requested_nob, niocount,
+                                   page_count, pga, rc);
 
-        xid = ptlrpc_next_xid();       /* single xid for all pages */
+        ptlrpc_req_finished(request);
+        RETURN (rc);
+}
 
-        obd_kmap_get(page_count, 0);
+static int brw_interpret(struct ptlrpc_request *request,
+                         struct osc_brw_async_args *aa, int rc)
+{
+        int requested_nob    = aa->aa_requested_nob;
+        int niocount         = aa->aa_nio_count;
+        obd_count page_count = aa->aa_page_count;
+        struct brw_page *pga = aa->aa_pga;
+        ENTRY;
 
-        for (mapped = 0; mapped < page_count; mapped++, nioptr++) {
-                struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc);
-                if (bulk == NULL) {
-                        unmap_and_decref_bulk_desc(desc);
-                        GOTO(out_req, rc = -ENOMEM);
-                }
+        /* XXX bug 937 here */
+        if (rc == -ETIMEDOUT && request->rq_resend) {
+                DEBUG_REQ(D_HA, request,  "BULK TIMEOUT");
+                LBUG(); /* re-send.  later. */
+                //goto restart_bulk;
+        }
 
-                LASSERT(mapped == 0 || pga[mapped].off > pga[mapped - 1].off);
+        rc = osc_brw_fini_request (request, requested_nob, niocount,
+                                   page_count, pga, rc);
+        RETURN (rc);
+}
 
-                bulk->bp_xid = xid;           /* single xid for all pages */
-                bulk->bp_buf = kmap(pga[mapped].pg);
-                bulk->bp_page = pga[mapped].pg;
-                /* matching ptlrpc_bulk_get assert */
-                LASSERT(pga[mapped].count > 0);
-                bulk->bp_buflen = pga[mapped].count;
-                ost_pack_niobuf(nioptr, pga[mapped].off, pga[mapped].count,
-                                pga[mapped].flag, bulk->bp_xid);
-                ost_checksum(&cksum, bulk->bp_buf, bulk->bp_buflen);
+static int async_internal(struct lustre_handle *conn, struct lov_stripe_md *lsm,
+                          obd_count page_count, struct brw_page *pga,
+                          struct ptlrpc_request_set *set, int cmd)
+{
+        struct ptlrpc_request     *request;
+        int                        requested_nob;
+        int                        nio_count;
+        struct osc_brw_async_args *aa;
+        int                        rc;
+        ENTRY;
+
+        rc = osc_brw_prep_request (class_conn2cliimp(conn),
+                                   lsm, page_count, pga, cmd,
+                                   &requested_nob, &nio_count, &request);
+        /* NB ^ sets rq_no_resend */
+
+        if (rc == 0) {
+                LASSERT (sizeof (*aa) <= sizeof (request->rq_async_args));
+                aa = (struct osc_brw_async_args *)&request->rq_async_args;
+                aa->aa_requested_nob = requested_nob;
+                aa->aa_nio_count = nio_count;
+                aa->aa_page_count = page_count;
+                aa->aa_pga = pga;
+
+                request->rq_interpret_reply = brw_interpret;
+                ptlrpc_set_add_req(set, request);
         }
+        RETURN (rc);
+}
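+/* The async path keeps its per-request state in the request's own
+ * rq_async_args scratch space (the LASSERT above guards the size), and
+ * brw_interpret() pulls it back out when the reply arrives, so no extra
+ * allocation is needed for the async bookkeeping. */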
 
-#if CHECKSUM_BULK
-        body->oa.o_rdev = HTON__u64(cksum);
-        body->oa.o_valid |= HTON__u32(OBD_MD_FLCKSUM);
+#ifndef min_t
+#define min_t(type,x,y) \
+        ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
 #endif
-        /*
-         * Register the bulk first, because the reply could arrive out of
-         * order, and we want to be ready for the bulk data.
-         *
-         * One reference is released when brw_finish is complete, the other
-         * when the caller removes us from the "set" list.
-         *
-         * On error, we never do the brw_finish, so we handle all decrefs.
-         */
-        if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_WRITE_BULK)) {
-                CERROR("obd_fail_loc=%x, skipping register_bulk\n",
-                       OBD_FAIL_OSC_BRW_WRITE_BULK);
-        } else {
-                rc = ptlrpc_register_bulk_get(desc);
-                if (rc) {
-                        unmap_and_decref_bulk_desc(desc);
-                        GOTO(out_req, rc);
+
+/*
+ * We want disk allocation on the target to happen in offset order, so we'll
+ * follow Sedgewick's advice and stick to the dead simple shellsort -- it'll
+ * do fine for our small page arrays and doesn't require allocation.  It's
+ * an insertion sort that swaps elements that are strides apart, shrinking
+ * the stride down until it's 1 and the array is sorted (example below).
+ */
+static void sort_brw_pages(struct brw_page *array, int num)
+{
+        int stride, i, j;
+        struct brw_page tmp;
+
+        if (num == 1)
+                return;
+        for (stride = 1; stride < num ; stride = (stride * 3) + 1)
+                ;
+
+        do {
+                stride /= 3;
+                for (i = stride ; i < num ; i++) {
+                        tmp = array[i];
+                        j = i;
+                        while (j >= stride && array[j - stride].off > tmp.off) {
+                                array[j] = array[j - stride];
+                                j -= stride;
+                        }
+                        array[j] = tmp;
+                }
+        } while (stride > 1);
+}
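+/* For example, with num == 10 the stride builds up 1, 4, 13 and the sort
+ * then runs passes with strides 4 and 1; the final stride-1 pass is just a
+ * plain insertion sort over an almost-ordered array. */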
+
+/* Make sure the regions we're passing to elan don't violate its '4
+ * fragments' constraint.  The portals header is a fragment, all full
+ * PAGE_SIZE pages together count as one fragment, and each partial page
+ * counts as a fragment of its own.  I think.  See bug 934 and the example
+ * below. */
+static obd_count check_elan_limit(struct brw_page *pg, obd_count pages)
+{
+        int frags_left = 3;
+        int saw_whole_frag = 0;
+        int i;
+
+        for (i = 0 ; frags_left && i < pages ; pg++, i++) {
+                if (pg->count == PAGE_SIZE) {
+                        if (!saw_whole_frag) {
+                                saw_whole_frag = 1;
+                                frags_left--;
+                        }
+                } else {
+                        frags_left--;
                 }
-                obd_brw_set_add(set, desc);
         }
+        return i;
+}
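+/* For example, with pages { partial, full, full, partial, partial } the
+ * header takes one fragment, the leading partial a second, the two full
+ * pages share a third and the next partial takes the fourth, so
+ * check_elan_limit() returns 4 and the trailing partial is left for the
+ * next brw. */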
 
-        request->rq_flags |= PTL_RPC_FL_NO_RESEND;
-        request->rq_replen = lustre_msg_size(1, size);
-        rc = ptlrpc_queue_wait(request);
+static int osc_brw(int cmd, struct lustre_handle *conn,
+                   struct lov_stripe_md *md, obd_count page_count,
+                   struct brw_page *pga, struct obd_trans_info *oti)
+{
+        ENTRY;
 
-        /* XXX bug 937 here */
-        if (rc == -ETIMEDOUT && (request->rq_flags & PTL_RPC_FL_RESEND)) {
-                DEBUG_REQ(D_HA, request,  "BULK TIMEOUT");
-                ptlrpc_req_finished(request);
-                goto restart_bulk;
-        }
+        if (cmd == OBD_BRW_CHECK) {
+                /* The caller just wants to know if there's a chance that this
+                 * I/O can succeed */
+                struct obd_import *imp = class_conn2cliimp(conn);
 
-        if (rc) {
-                osc_ptl_ev_abort(desc);
-                GOTO(out_req, rc);
+                if (imp == NULL || imp->imp_invalid)
+                        RETURN(-EIO);
+                RETURN(0);
         }
 
-        EXIT;
- out_req:
-        ptlrpc_req_finished(request);
-        return rc;
-}
+        while (page_count) {
+                obd_count pages_per_brw;
+                int rc;
 
-#ifndef min_t
-#define min_t(a,b,c) ( b<c ) ? b : c
-#endif
+                if (page_count > OSC_BRW_MAX_IOV)
+                        pages_per_brw = OSC_BRW_MAX_IOV;
+                else
+                        pages_per_brw = page_count;
 
-#warning "FIXME: make values dynamic based on get_info at setup (bug 665)"
-#define OSC_BRW_MAX_SIZE 65536
-#define OSC_BRW_MAX_IOV min_t(int, PTL_MD_MAX_IOV, OSC_BRW_MAX_SIZE/PAGE_SIZE)
+                sort_brw_pages(pga, pages_per_brw);
+                pages_per_brw = check_elan_limit(pga, pages_per_brw);
 
-#warning "FIXME: make these values dynamic based on a get_info call at setup"
-#define OSC_BRW_MAX_SIZE 65536
-#define OSC_BRW_MAX_IOV min_t(int, PTL_MD_MAX_IOV, OSC_BRW_MAX_SIZE/PAGE_SIZE)
+                rc = osc_brw_internal(conn, md, pages_per_brw, pga, cmd);
 
-static int osc_brw(int cmd, struct lustre_handle *conn,
-                   struct lov_stripe_md *md, obd_count page_count,
-                   struct brw_page *pga, struct obd_brw_set *set,
-                   struct obd_trans_info *oti)
+                if (rc != 0)
+                        RETURN(rc);
+
+                page_count -= pages_per_brw;
+                pga += pages_per_brw;
+        }
+        RETURN(0);
+}
+
+static int osc_brw_async(int cmd, struct lustre_handle *conn,
+                         struct lov_stripe_md *md, obd_count page_count,
+                         struct brw_page *pga, struct ptlrpc_request_set *set,
+                         struct obd_trans_info *oti)
 {
         ENTRY;
 
+        if (cmd == OBD_BRW_CHECK) {
+                /* The caller just wants to know if there's a chance that this
+                 * I/O can succeed */
+                struct obd_import *imp = class_conn2cliimp(conn);
+
+                if (imp == NULL || imp->imp_invalid)
+                        RETURN(-EIO);
+                RETURN(0);
+        }
+
         while (page_count) {
                 obd_count pages_per_brw;
                 int rc;
@@ -844,11 +1111,10 @@ static int osc_brw(int cmd, struct lustre_handle *conn,
                 else
                         pages_per_brw = page_count;
 
-                if (cmd & OBD_BRW_WRITE)
-                        rc = osc_brw_write(conn, md, pages_per_brw, pga,
-                                           set, oti);
-                else
-                        rc = osc_brw_read(conn, md, pages_per_brw, pga, set);
+                sort_brw_pages(pga, pages_per_brw);
+                pages_per_brw = check_elan_limit(pga, pages_per_brw);
+
+                rc = async_internal(conn, md, pages_per_brw, pga, set, cmd);
 
                 if (rc != 0)
                         RETURN(rc);
@@ -865,16 +1131,18 @@ static int osc_brw(int cmd, struct lustre_handle *conn,
 static int sanosc_brw_read(struct lustre_handle *conn,
                            struct lov_stripe_md *lsm,
                            obd_count page_count,
-                           struct brw_page *pga,
-                           struct obd_brw_set *set)
+                           struct brw_page *pga)
 {
         struct ptlrpc_request *request = NULL;
         struct ost_body *body;
         struct niobuf_remote *nioptr;
         struct obd_ioobj *iooptr;
-        int rc, j, size[3] = {sizeof(*body)}, mapped = 0;
+        int rc, size[3] = {sizeof(*body)}, mapped = 0;
+        int swab;
         ENTRY;
 
+        /* XXX does not handle 'new' brw protocol */
+
         size[1] = sizeof(struct obd_ioobj);
         size[2] = page_count * sizeof(*nioptr);
 
@@ -883,20 +1151,23 @@ static int sanosc_brw_read(struct lustre_handle *conn,
         if (!request)
                 RETURN(-ENOMEM);
 
-        body = lustre_msg_buf(request->rq_reqmsg, 0);
-        iooptr = lustre_msg_buf(request->rq_reqmsg, 1);
-        nioptr = lustre_msg_buf(request->rq_reqmsg, 2);
-        ost_pack_ioo(iooptr, lsm, page_count);
+        body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
+        iooptr = lustre_msg_buf(request->rq_reqmsg, 1, sizeof (*iooptr));
+        nioptr = lustre_msg_buf(request->rq_reqmsg, 2,
+                                sizeof (*nioptr) * page_count);
 
-        obd_kmap_get(page_count, 0);
+        iooptr->ioo_id = lsm->lsm_object_id;
+        iooptr->ioo_gr = 0;
+        iooptr->ioo_type = S_IFREG;
+        iooptr->ioo_bufcnt = page_count;
 
         for (mapped = 0; mapped < page_count; mapped++, nioptr++) {
                 LASSERT(PageLocked(pga[mapped].pg));
                 LASSERT(mapped == 0 || pga[mapped].off > pga[mapped - 1].off);
 
-                kmap(pga[mapped].pg);
-                ost_pack_niobuf(nioptr, pga[mapped].off, pga[mapped].count,
-                                pga[mapped].flag, 0);
+                nioptr->offset = pga[mapped].off;
+                nioptr->len    = pga[mapped].count;
+                nioptr->flags  = pga[mapped].flag;
         }
 
         size[1] = page_count * sizeof(*nioptr);
@@ -904,25 +1175,25 @@ static int sanosc_brw_read(struct lustre_handle *conn,
 
         rc = ptlrpc_queue_wait(request);
         if (rc)
-                GOTO(out_unmap, rc);
-
-        nioptr = lustre_msg_buf(request->rq_repmsg, 1);
-        if (!nioptr)
-                GOTO(out_unmap, rc = -EINVAL);
+                GOTO(out_req, rc);
 
-        if (request->rq_repmsg->buflens[1] != size[1]) {
-                CERROR("buffer length wrong (%d vs. %d)\n",
-                       request->rq_repmsg->buflens[1], size[1]);
-                GOTO(out_unmap, rc = -EINVAL);
+        swab = lustre_msg_swabbed (request->rq_repmsg);
+        LASSERT_REPSWAB (request, 1);
+        nioptr = lustre_msg_buf(request->rq_repmsg, 1, size[1]);
+        if (!nioptr) {
+                /* nioptr missing or short */
+                GOTO(out_req, rc = -EPROTO);
         }
 
         /* actual read */
-        for (j = 0; j < page_count; j++, nioptr++) {
-                struct page *page = pga[j].pg;
+        for (mapped = 0; mapped < page_count; mapped++, nioptr++) {
+                struct page *page = pga[mapped].pg;
                 struct buffer_head *bh;
                 kdev_t dev;
 
-                ost_unpack_niobuf(nioptr, nioptr);
+                if (swab)
+                        lustre_swab_niobuf_remote (nioptr);
+
                 /* got san device associated */
                 LASSERT(class_conn2obd(conn));
                 dev = class_conn2obd(conn)->u.cli.cl_sandev;
@@ -970,35 +1241,26 @@ static int sanosc_brw_read(struct lustre_handle *conn,
                 if (!buffer_uptodate(bh)) {
                         /* I/O error */
                         rc = -EIO;
-                        goto out_unmap;
+                        goto out_req;
                 }
         }
 
 out_req:
         ptlrpc_req_finished(request);
         RETURN(rc);
-
-out_unmap:
-        /* Clean up on error. */
-        while (mapped-- > 0)
-                kunmap(pga[mapped].pg);
-
-        obd_kmap_put(page_count);
-
-        goto out_req;
 }
 
 static int sanosc_brw_write(struct lustre_handle *conn,
                             struct lov_stripe_md *lsm,
                             obd_count page_count,
-                            struct brw_page *pga,
-                            struct obd_brw_set *set)
+                            struct brw_page *pga)
 {
         struct ptlrpc_request *request = NULL;
         struct ost_body *body;
         struct niobuf_remote *nioptr;
         struct obd_ioobj *iooptr;
-        int rc, j, size[3] = {sizeof(*body)}, mapped = 0;
+        int rc, size[3] = {sizeof(*body)}, mapped = 0;
+        int swab;
         ENTRY;
 
         size[1] = sizeof(struct obd_ioobj);
@@ -1009,20 +1271,24 @@ static int sanosc_brw_write(struct lustre_handle *conn,
         if (!request)
                 RETURN(-ENOMEM);
 
-        body = lustre_msg_buf(request->rq_reqmsg, 0);
-        iooptr = lustre_msg_buf(request->rq_reqmsg, 1);
-        nioptr = lustre_msg_buf(request->rq_reqmsg, 2);
-        ost_pack_ioo(iooptr, lsm, page_count);
+        body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body));
+        iooptr = lustre_msg_buf(request->rq_reqmsg, 1, sizeof (*iooptr));
+        nioptr = lustre_msg_buf(request->rq_reqmsg, 2,
+                                sizeof (*nioptr) * page_count);
+
+        iooptr->ioo_id = lsm->lsm_object_id;
+        iooptr->ioo_gr = 0;
+        iooptr->ioo_type = S_IFREG;
+        iooptr->ioo_bufcnt = page_count;
 
-        /* map pages, and pack request */
-        obd_kmap_get(page_count, 0);
+        /* pack request */
         for (mapped = 0; mapped < page_count; mapped++, nioptr++) {
                 LASSERT(PageLocked(pga[mapped].pg));
                 LASSERT(mapped == 0 || pga[mapped].off > pga[mapped - 1].off);
 
-                kmap(pga[mapped].pg);
-                ost_pack_niobuf(nioptr, pga[mapped].off, pga[mapped].count,
-                                pga[mapped].flag, 0);
+                nioptr->offset = pga[mapped].off;
+                nioptr->len    = pga[mapped].count;
+                nioptr->flags  = pga[mapped].flag;
         }
 
         size[1] = page_count * sizeof(*nioptr);
@@ -1030,25 +1296,25 @@ static int sanosc_brw_write(struct lustre_handle *conn,
 
         rc = ptlrpc_queue_wait(request);
         if (rc)
-                GOTO(out_unmap, rc);
-
-        nioptr = lustre_msg_buf(request->rq_repmsg, 1);
-        if (!nioptr)
-                GOTO(out_unmap, rc = -EINVAL);
+                GOTO(out_req, rc);
 
-        if (request->rq_repmsg->buflens[1] != size[1]) {
-                CERROR("buffer length wrong (%d vs. %d)\n",
-                       request->rq_repmsg->buflens[1], size[1]);
-                GOTO(out_unmap, rc = -EINVAL);
+        swab = lustre_msg_swabbed (request->rq_repmsg);
+        LASSERT_REPSWAB (request, 1);
+        nioptr = lustre_msg_buf(request->rq_repmsg, 1, size[1]);
+        if (!nioptr) {
+                CERROR("absent/short niobuf array\n");
+                GOTO(out_req, rc = -EPROTO);
         }
 
         /* actual write */
-        for (j = 0; j < page_count; j++, nioptr++) {
-                struct page *page = pga[j].pg;
+        for (mapped = 0; mapped < page_count; mapped++, nioptr++) {
+                struct page *page = pga[mapped].pg;
                 struct buffer_head *bh;
                 kdev_t dev;
 
-                ost_unpack_niobuf(nioptr, nioptr);
+                if (swab)
+                        lustre_swab_niobuf_remote (nioptr);
+
                 /* got san device associated */
                 LASSERT(class_conn2obd(conn));
                 dev = class_conn2obd(conn)->u.cli.cl_sandev;
@@ -1089,28 +1355,18 @@ static int sanosc_brw_write(struct lustre_handle *conn,
                 if (!buffer_uptodate(bh) || test_bit(BH_Dirty, &bh->b_state)) {
                         /* I/O error */
                         rc = -EIO;
-                        goto out_unmap;
+                        goto out_req;
                 }
         }
 
 out_req:
         ptlrpc_req_finished(request);
         RETURN(rc);
-
-out_unmap:
-        /* Clean up on error. */
-        while (mapped-- > 0)
-                kunmap(pga[mapped].pg);
-
-        obd_kmap_put(page_count);
-
-        goto out_req;
 }
 
 static int sanosc_brw(int cmd, struct lustre_handle *conn,
                       struct lov_stripe_md *lsm, obd_count page_count,
-                      struct brw_page *pga, struct obd_brw_set *set,
-                      struct obd_trans_info *oti)
+                      struct brw_page *pga, struct obd_trans_info *oti)
 {
         ENTRY;
 
@@ -1124,10 +1380,9 @@ static int sanosc_brw(int cmd, struct lustre_handle *conn,
                         pages_per_brw = page_count;
 
                 if (cmd & OBD_BRW_WRITE)
-                        rc = sanosc_brw_write(conn, lsm, pages_per_brw,
-                                              pga, set);
+                        rc = sanosc_brw_write(conn, lsm, pages_per_brw, pga);
                 else
-                        rc = sanosc_brw_read(conn, lsm, pages_per_brw, pga,set);
+                        rc = sanosc_brw_read(conn, lsm, pages_per_brw, pga);
 
                 if (rc != 0)
                         RETURN(rc);
@@ -1152,20 +1407,17 @@ static int osc_enqueue(struct lustre_handle *connh, struct lov_stripe_md *lsm,
         int rc;
         ENTRY;
 
-        /* Filesystem locks are given a bit of special treatment: if
-         * this is not a file size lock (which has end == -1), we
-         * fixup the lock to start and end on page boundaries. */
-        if (extent->end != OBD_OBJECT_EOF) {
-                extent->start &= PAGE_MASK;
-                extent->end = (extent->end & PAGE_MASK) + PAGE_SIZE - 1;
-        }
+        /* Filesystem lock extents are extended to page boundaries so that
+         * dealing with the page cache is a little smoother.  */
+        extent->start -= extent->start & ~PAGE_MASK;
+        extent->end |= ~PAGE_MASK;
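+        /* e.g. assuming 4096-byte pages, a lock request on bytes
+         * [5000, 9000] is widened here to [4096, 12287] -- whole pages --
+         * before we go looking for a matching lock. */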
 
         /* Next, search for already existing extent locks that will cover us */
         rc = ldlm_lock_match(obddev->obd_namespace, 0, &res_id, type, extent,
                              sizeof(extent), mode, lockh);
         if (rc == 1)
                 /* We already have a lock, and it's referenced */
-                RETURN(ELDLM_LOCK_MATCHED);
+                RETURN(ELDLM_OK);
 
         /* If we're trying to read, we also search for an existing PW lock.  The
          * VFS and page cache already protect us locally, so lots of readers/
@@ -1189,14 +1441,52 @@ static int osc_enqueue(struct lustre_handle *connh, struct lov_stripe_md *lsm,
                         ldlm_lock_addref(lockh, LCK_PR);
                         ldlm_lock_decref(lockh, LCK_PW);
 
-                        RETURN(ELDLM_LOCK_MATCHED);
+                        RETURN(ELDLM_OK);
                 }
         }
 
         rc = ldlm_cli_enqueue(connh, NULL, obddev->obd_namespace, parent_lock,
                               res_id, type, extent, sizeof(extent), mode, flags,
-                              ldlm_completion_ast, callback, data, NULL,
-                              lockh);
+                              ldlm_completion_ast, callback, data, lockh);
+        RETURN(rc);
+}
+
+static int osc_match(struct lustre_handle *connh, struct lov_stripe_md *lsm,
+                       __u32 type, void *extentp, int extent_len, __u32 mode,
+                       int *flags, struct lustre_handle *lockh)
+{
+        struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} };
+        struct obd_device *obddev = class_conn2obd(connh);
+        struct ldlm_extent *extent = extentp;
+        int rc;
+        ENTRY;
+
+        /* Filesystem lock extents are extended to page boundaries so that
+         * dealing with the page cache is a little smoother */
+        extent->start -= extent->start & ~PAGE_MASK;
+        extent->end |= ~PAGE_MASK;
+
+        /* Next, search for already existing extent locks that will cover us */
+        rc = ldlm_lock_match(obddev->obd_namespace, *flags, &res_id, type,
+                             extent, sizeof(extent), mode, lockh);
+        if (rc)
+                RETURN(rc);
+
+        /* If we're trying to read, we also search for an existing PW lock.  The
+         * VFS and page cache already protect us locally, so lots of readers/
+         * writers can share a single PW lock. */
+        if (mode == LCK_PR) {
+                rc = ldlm_lock_match(obddev->obd_namespace, *flags, &res_id,
+                                     type, extent, sizeof(extent), LCK_PW,
+                                     lockh);
+                if (rc == 1) {
+                        /* FIXME: This is not incredibly elegant, but it might
+                         * be more elegant than adding another parameter to
+                         * lock_match.  I want a second opinion. */
+                        ldlm_lock_addref(lockh, LCK_PR);
+                        ldlm_lock_decref(lockh, LCK_PW);
+                }
+        }
         RETURN(rc);
 }
 
@@ -1211,16 +1501,18 @@ static int osc_cancel(struct lustre_handle *oconn, struct lov_stripe_md *md,
 }
 
 static int osc_cancel_unused(struct lustre_handle *connh,
-                             struct lov_stripe_md *lsm, int flags)
+                             struct lov_stripe_md *lsm, int flags, void *opaque)
 {
         struct obd_device *obddev = class_conn2obd(connh);
         struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} };
 
-        return ldlm_cli_cancel_unused(obddev->obd_namespace, &res_id, flags);
+        return ldlm_cli_cancel_unused(obddev->obd_namespace, &res_id, flags,
+                                      opaque);
 }
 
 static int osc_statfs(struct lustre_handle *conn, struct obd_statfs *osfs)
 {
+        struct obd_statfs *msfs;
         struct ptlrpc_request *request;
         int rc, size = sizeof(*osfs);
         ENTRY;
@@ -1238,7 +1530,14 @@ static int osc_statfs(struct lustre_handle *conn, struct obd_statfs *osfs)
                 GOTO(out, rc);
         }
 
-        obd_statfs_unpack(osfs, lustre_msg_buf(request->rq_repmsg, 0));
+        msfs = lustre_swab_repbuf (request, 0, sizeof (*msfs),
+                                   lustre_swab_obd_statfs);
+        if (msfs == NULL) {
+                CERROR ("Can't unpack obd_statfs\n");
+                GOTO (out, rc = -EPROTO);
+        }
+
+        memcpy (osfs, msfs, sizeof (*msfs));
 
         EXIT;
  out:
@@ -1299,55 +1598,6 @@ static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
         ENTRY;
 
         switch (cmd) {
-#if 0
-        case IOC_LDLM_TEST: {
-                err = ldlm_test(obddev, conn);
-                CERROR("-- done err %d\n", err);
-                GOTO(out, err);
-        }
-        case IOC_LDLM_REGRESS_START: {
-                unsigned int numthreads = 1;
-                unsigned int numheld = 10;
-                unsigned int numres = 10;
-                unsigned int numext = 10;
-                char *parse;
-
-                if (data->ioc_inllen1) {
-                        parse = data->ioc_inlbuf1;
-                        if (*parse != '\0') {
-                                while(isspace(*parse)) parse++;
-                                numthreads = simple_strtoul(parse, &parse, 0);
-                                while(isspace(*parse)) parse++;
-                        }
-                        if (*parse != '\0') {
-                                while(isspace(*parse)) parse++;
-                                numheld = simple_strtoul(parse, &parse, 0);
-                                while(isspace(*parse)) parse++;
-                        }
-                        if (*parse != '\0') {
-                                while(isspace(*parse)) parse++;
-                                numres = simple_strtoul(parse, &parse, 0);
-                                while(isspace(*parse)) parse++;
-                        }
-                        if (*parse != '\0') {
-                                while(isspace(*parse)) parse++;
-                                numext = simple_strtoul(parse, &parse, 0);
-                                while(isspace(*parse)) parse++;
-                        }
-                }
-
-                err = ldlm_regression_start(obddev, conn, numthreads,
-                                numheld, numres, numext);
-
-                CERROR("-- done err %d\n", err);
-                GOTO(out, err);
-        }
-        case IOC_LDLM_REGRESS_STOP: {
-                err = ldlm_regression_stop();
-                CERROR("-- done err %d\n", err);
-                GOTO(out, err);
-        }
-#endif
         case IOC_OSC_REGISTER_LOV: {
                 if (obddev->u.cli.cl_containing_lov)
                         GOTO(out, err = -EALREADY);
@@ -1390,7 +1640,7 @@ static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
                 err = copy_to_user((void *)uarg, buf, len);
                 if (err)
                         err = -EFAULT;
-                OBD_FREE(buf, len);
+                obd_ioctl_freedata(buf, len);
                 GOTO(out, err);
         }
         case LL_IOC_LOV_SETSTRIPE:
@@ -1401,6 +1651,14 @@ static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len,
         case LL_IOC_LOV_GETSTRIPE:
                 err = osc_getstripe(conn, karg, uarg);
                 GOTO(out, err);
+        case OBD_IOC_CLIENT_RECOVER:
+                err = ptlrpc_recover_import(obddev->u.cli.cl_import,
+                                            data->ioc_inlbuf1);
+                GOTO(out, err);
+        case IOC_OSC_SET_ACTIVE:
+                err = ptlrpc_set_import_active(obddev->u.cli.cl_import,
+                                               data->ioc_offset);
+                GOTO(out, err);
         default:
                 CERROR ("osc_ioctl(): unrecognised ioctl %#x\n", cmd);
                 GOTO(out, err = -ENOTTY);
@@ -1409,166 +1667,21 @@ out:
         return err;
 }
 
-static void set_osc_active(struct obd_import *imp, int active)
+static int osc_get_info(struct lustre_handle *conn, obd_count keylen,
+                        void *key, __u32 *vallen, void *val)
 {
-        struct obd_device *notify_obd;
-
-        LASSERT(imp->imp_obd);
-
-        notify_obd = imp->imp_obd->u.cli.cl_containing_lov;
-
-        if (notify_obd == NULL)
-                return;
-
-        /* How gross is _this_? */
-        if (!list_empty(&notify_obd->obd_exports)) {
-                int rc;
-                struct lustre_handle fakeconn;
-                struct obd_ioctl_data ioc_data = { 0 };
-                struct obd_export *exp =
-                        list_entry(notify_obd->obd_exports.next,
-                                   struct obd_export, exp_obd_chain);
-
-                fakeconn.addr = (__u64)(unsigned long)exp;
-                fakeconn.cookie = exp->exp_cookie;
-                ioc_data.ioc_inlbuf1 =
-                        (char *)&imp->imp_obd->u.cli.cl_target_uuid;
-                ioc_data.ioc_offset = active;
-                rc = obd_iocontrol(IOC_LOV_SET_OSC_ACTIVE, &fakeconn,
-                                   sizeof ioc_data, &ioc_data, NULL);
-                if (rc)
-                        CERROR("error disabling %s on LOV %p/%s: %d\n",
-                               imp->imp_obd->u.cli.cl_target_uuid.uuid,
-                               notify_obd, notify_obd->obd_uuid.uuid, rc);
-        } else {
-                CDEBUG(D_HA, "No exports for obd %p/%s, can't notify about "
-                       "%p\n", notify_obd, notify_obd->obd_uuid.uuid,
-                       imp->imp_obd->obd_uuid.uuid);
-        }
-}
-
-static int osc_recover(struct obd_import *imp, int phase)
-{
-        int rc;
-        unsigned long flags;
-        int msg_flags;
-        struct ptlrpc_request *req;
-        struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
         ENTRY;
+        if (!vallen || !val)
+                RETURN(-EFAULT);
 
-        CDEBUG(D_HA, "%s: entering phase: %d\n",
-               imp->imp_obd->obd_name, phase);
-        switch(phase) {
-
-            case PTLRPC_RECOVD_PHASE_PREPARE: {
-                if (imp->imp_flags & IMP_REPLAYABLE) {
-                        CDEBUG(D_HA, "failover OST\n");
-                        /* If we're a failover OSC/OST, just cancel unused
-                         * locks to simplify lock replay.
-                         */
-                        ldlm_cli_cancel_unused(ns, NULL, LDLM_FL_LOCAL_ONLY);
-                } else {
-                        CDEBUG(D_HA, "non-failover OST\n");
-                        /* Non-failover OSTs (LLNL scenario) disable the OSC
-                         * and invalidate local state.
-                         */
-                        ldlm_namespace_cleanup(ns, 1 /* no network ops */);
-                        ptlrpc_abort_inflight(imp, 0);
-                        set_osc_active(imp, 0 /* inactive */);
-                }
-                RETURN(0);
-            }
-
-        case PTLRPC_RECOVD_PHASE_RECOVER: {
-        reconnect:
-                imp->imp_flags &= ~IMP_INVALID;
-                rc = ptlrpc_reconnect_import(imp, OST_CONNECT, &req);
-
-                msg_flags = req->rq_repmsg
-                        ? lustre_msg_get_op_flags(req->rq_repmsg)
-                        : 0;
-
-                if (rc == -EBUSY && (msg_flags & MSG_CONNECT_RECOVERING))
-                        CERROR("reconnect denied by recovery; should retry\n");
-
-                if (rc) {
-                        if (phase != PTLRPC_RECOVD_PHASE_NOTCONN) {
-                                CERROR("can't reconnect, invalidating\n");
-                                ldlm_namespace_cleanup(ns, 1);
-                                ptlrpc_abort_inflight(imp, 0);
-                        }
-                        imp->imp_flags |= IMP_INVALID;
-                        ptlrpc_req_finished(req);
-                        RETURN(rc);
-                }
-
-                if (msg_flags & MSG_CONNECT_RECOVERING) {
-                        /* Replay if they want it. */
-                        DEBUG_REQ(D_HA, req, "OST wants replay");
-                        rc = ptlrpc_replay(imp);
-                        if (rc)
-                                GOTO(check_rc, rc);
-
-                        rc = ldlm_replay_locks(imp);
-                        if (rc)
-                                GOTO(check_rc, rc);
-
-                        rc = signal_completed_replay(imp);
-                        if (rc)
-                                GOTO(check_rc, rc);
-                } else if (msg_flags & MSG_CONNECT_RECONNECT) {
-                        DEBUG_REQ(D_HA, req, "reconnecting to MDS\n");
-                        /* Nothing else to do here. */
-                } else {
-                        DEBUG_REQ(D_HA, req, "evicted: invalidating\n");
-                        /* Otherwise, clean everything up. */
-                        ldlm_namespace_cleanup(ns, 1);
-                        ptlrpc_abort_inflight(imp, 0);
-                }
-
-                ptlrpc_req_finished(req);
-
-                spin_lock_irqsave(&imp->imp_lock, flags);
-                imp->imp_level = LUSTRE_CONN_FULL;
-                imp->imp_flags &= ~IMP_INVALID;
-                spin_unlock_irqrestore(&imp->imp_lock, flags);
-
-                /* Is this the right place?  Should we do this in _PREPARE
-                 * as well?  What about raising the level right away?
-                 */
-                ptlrpc_wake_delayed(imp);
-
-                rc = ptlrpc_resend(imp);
-                if (rc)
-                        GOTO(check_rc, rc);
-
-                set_osc_active(imp, 1 /* active */);
+        if (keylen > strlen("lock_to_stripe") &&
+            strcmp(key, "lock_to_stripe") == 0) {
+                __u32 *stripe = val;
+                *vallen = sizeof(*stripe);
+                *stripe = 0;
                 RETURN(0);
-
-        check_rc:
-                /* If we get disconnected in the middle, recovery has probably
-                 * failed.  Reconnect and find out.
-                 */
-                if (rc == -ENOTCONN)
-                        goto reconnect;
-                RETURN(rc);
-        }
-            case PTLRPC_RECOVD_PHASE_NOTCONN:
-                osc_recover(imp, PTLRPC_RECOVD_PHASE_PREPARE);
-                RETURN(osc_recover(imp, PTLRPC_RECOVD_PHASE_RECOVER));
-
-            default:
-                RETURN(-EINVAL);
         }
-}
-
-static int osc_connect(struct lustre_handle *conn, struct obd_device *obd,
-                       struct obd_uuid *cluuid, struct recovd_obd *recovd,
-                       ptlrpc_recovery_cb_t recover)
-{
-        struct obd_import *imp = &obd->u.cli.cl_import;
-        imp->imp_recover = osc_recover;
-        return client_obd_connect(conn, obd, cluuid, recovd, recover);
+        RETURN(-EINVAL);
 }
 
 struct obd_ops osc_obd_ops = {
@@ -1577,23 +1690,27 @@ struct obd_ops osc_obd_ops = {
         o_detach:       osc_detach,
         o_setup:        client_obd_setup,
         o_cleanup:      client_obd_cleanup,
-        o_connect:      osc_connect,
-        o_disconnect:   client_obd_disconnect,
+        o_connect:      client_import_connect,
+        o_disconnect:   client_import_disconnect,
         o_statfs:       osc_statfs,
         o_packmd:       osc_packmd,
         o_unpackmd:     osc_unpackmd,
         o_create:       osc_create,
         o_destroy:      osc_destroy,
         o_getattr:      osc_getattr,
+        o_getattr_async: osc_getattr_async,
         o_setattr:      osc_setattr,
         o_open:         osc_open,
         o_close:        osc_close,
         o_brw:          osc_brw,
+        o_brw_async:    osc_brw_async,
         o_punch:        osc_punch,
         o_enqueue:      osc_enqueue,
+        o_match:        osc_match,
         o_cancel:       osc_cancel,
         o_cancel_unused: osc_cancel_unused,
-        o_iocontrol:    osc_iocontrol
+        o_iocontrol:    osc_iocontrol,
+        o_get_info:     osc_get_info
 };
 
 struct obd_ops sanosc_obd_ops = {
@@ -1601,14 +1718,15 @@ struct obd_ops sanosc_obd_ops = {
         o_attach:       osc_attach,
         o_detach:       osc_detach,
         o_cleanup:      client_obd_cleanup,
-        o_connect:      osc_connect,
-        o_disconnect:   client_obd_disconnect,
+        o_connect:      client_import_connect,
+        o_disconnect:   client_import_disconnect,
         o_statfs:       osc_statfs,
         o_packmd:       osc_packmd,
         o_unpackmd:     osc_unpackmd,
         o_create:       osc_create,
         o_destroy:      osc_destroy,
         o_getattr:      osc_getattr,
+        o_getattr_async: osc_getattr_async,
         o_setattr:      osc_setattr,
         o_open:         osc_open,
         o_close:        osc_close,
@@ -1618,6 +1736,7 @@ struct obd_ops sanosc_obd_ops = {
 #endif
         o_punch:        osc_punch,
         o_enqueue:      osc_enqueue,
+        o_match:        osc_match,
         o_cancel:       osc_cancel,
         o_cancel_unused: osc_cancel_unused,
         o_iocontrol:    osc_iocontrol,
@@ -1629,7 +1748,8 @@ int __init osc_init(void)
         int rc;
         ENTRY;
 
-        LASSERT(sizeof(struct osc_obdo_data) <= FD_OSTDATA_SIZE);
+        LASSERT(sizeof(struct obd_client_handle) <= FD_OSTDATA_SIZE);
+        LASSERT(sizeof(struct obd_client_handle) <= OBD_INLINESZ);
 
         lprocfs_init_vars(&lvars);
 
index c158a0f..b2e51c3 100644 (file)
@@ -4,19 +4,10 @@
 # See the file COPYING in this distribution
 
 DEFS= 
+
 MODULE = ost
 modulefs_DATA = ost.o
 EXTRA_PROGRAMS = ost
-
-LINX=obd_pack.c target.c
-
-obd_pack.c: 
-       test -e obd_pack.c || ln -sf $(top_srcdir)/lib/obd_pack.c
-target.c: 
-       test -e target.c || ln -sf $(top_srcdir)/lib/target.c
-
-ost_SOURCES = ost_handler.c lproc_ost.c $(LINX)
-dist-hook:
-       list='$(LINX)'; for f in $$list; do rm -f $(distdir)/$$f; done
+ost_SOURCES = ost_handler.c lproc_ost.c
 
 include $(top_srcdir)/Rules
index 848336c..f14d82f 100644 (file)
 #include <linux/init.h>
 #include <linux/lprocfs_status.h>
 
-inline void oti_to_request(struct obd_trans_info *oti, struct ptlrpc_request *req)
+inline void oti_to_request(struct obd_trans_info *oti,
+                           struct ptlrpc_request *req)
 {
-        if (oti && req->rq_repmsg)
-                req->rq_repmsg->transno = HTON__u64(oti->oti_transno);
+        int i;
+        struct oti_req_ack_lock *ack_lock;
+
+        if(oti == NULL)
+                return;
+
+        if (req->rq_repmsg)
+                req->rq_repmsg->transno = oti->oti_transno;
+
+        /* XXX 4 == entries in oti_ack_locks??? */
+        for (ack_lock = oti->oti_ack_locks, i = 0; i < 4; i++, ack_lock++) {
+                if (!ack_lock->mode)
+                        break;
+                memcpy(&req->rq_ack_locks[i].lock, &ack_lock->lock,
+                       sizeof(req->rq_ack_locks[i].lock));
+                req->rq_ack_locks[i].mode = ack_lock->mode;
+        }
         EXIT;
 }
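
The rewritten oti_to_request() above copies up to four ack locks from the transaction info into the request and stops at the first unused slot; the hard-coded 4 is what the XXX comment flags. Here is a hedged sketch of the same copy with the bound derived from the array itself instead of a magic number; the structures are simplified stand-ins for oti_req_ack_lock and friends.

    #define DEMO_ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    struct demo_ack_lock   { int mode; unsigned long long handle; };
    struct demo_trans_info { struct demo_ack_lock oti_ack_locks[4]; };
    struct demo_request    { struct demo_ack_lock rq_ack_locks[4]; };

    void demo_oti_to_request(const struct demo_trans_info *oti,
                             struct demo_request *req)
    {
            unsigned i;

            for (i = 0; i < DEMO_ARRAY_SIZE(oti->oti_ack_locks); i++) {
                    if (!oti->oti_ack_locks[i].mode)
                            break;                   /* unused slots follow */
                    req->rq_ack_locks[i] = oti->oti_ack_locks[i];
            }
    }
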
 
 static int ost_destroy(struct ptlrpc_request *req, struct obd_trans_info *oti)
 {
-        struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
+        struct lustre_handle *conn = &req->rq_reqmsg->handle;
         struct ost_body *body;
         int rc, size = sizeof(*body);
         ENTRY;
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        body = lustre_swab_reqbuf (req, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        if (body == NULL)
+                RETURN (-EFAULT);
 
         rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
         if (rc)
@@ -72,14 +91,16 @@ static int ost_getattr(struct ptlrpc_request *req)
         int rc, size = sizeof(*body);
         ENTRY;
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        body = lustre_swab_reqbuf (req, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        if (body == NULL)
+                RETURN (-EFAULT);
 
         rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
         if (rc)
                 RETURN(rc);
 
-        repbody = lustre_msg_buf(req->rq_repmsg, 0);
-        /* FIXME: unpack only valid fields instead of memcpy, endianness */
+        repbody = lustre_msg_buf (req->rq_repmsg, 0, sizeof (*repbody));
         memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
         req->rq_status = obd_getattr(conn, &repbody->oa, NULL);
         RETURN(0);
@@ -96,23 +117,18 @@ static int ost_statfs(struct ptlrpc_request *req)
         if (rc)
                 RETURN(rc);
 
-        osfs = lustre_msg_buf(req->rq_repmsg, 0);
+        osfs = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*osfs));
         memset(osfs, 0, size);
 
-        rc = obd_statfs(conn, osfs);
-        if (rc) {
-                CERROR("ost: statfs failed: rc %d\n", rc);
-                req->rq_status = rc;
-                RETURN(rc);
-        }
-        obd_statfs_pack(osfs, osfs);
+        req->rq_status = obd_statfs(conn, osfs);
+        if (req->rq_status != 0)
+                CERROR("ost: statfs failed: rc %d\n", req->rq_status);
 
         RETURN(0);
 }
 
 static int ost_syncfs(struct ptlrpc_request *req)
 {
-        struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
         struct obd_statfs *osfs;
         int rc, size = sizeof(*osfs);
         ENTRY;
@@ -121,7 +137,7 @@ static int ost_syncfs(struct ptlrpc_request *req)
         if (rc)
                 RETURN(rc);
 
-        rc = obd_syncfs(conn);
+        rc = obd_syncfs(req->rq_export);
         if (rc) {
                 CERROR("ost: syncfs failed: rc %d\n", rc);
                 req->rq_status = rc;
@@ -135,19 +151,21 @@ static int ost_open(struct ptlrpc_request *req, struct obd_trans_info *oti)
 {
         struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
         struct ost_body *body, *repbody;
-        int rc, size = sizeof(*body);
+        int rc, size = sizeof(*repbody);
         ENTRY;
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        body = lustre_swab_reqbuf (req, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        if (body == NULL)
+                return (-EFAULT);
 
         rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
         if (rc)
                 RETURN(rc);
 
-        repbody = lustre_msg_buf(req->rq_repmsg, 0);
-        /* FIXME: unpack only valid fields instead of memcpy, endianness */
+        repbody = lustre_msg_buf (req->rq_repmsg, 0, sizeof (*repbody));
         memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
-        req->rq_status = obd_open(conn, &repbody->oa, NULL, oti);
+        req->rq_status = obd_open(conn, &repbody->oa, NULL, oti, NULL);
         RETURN(0);
 }
 
@@ -155,17 +173,19 @@ static int ost_close(struct ptlrpc_request *req, struct obd_trans_info *oti)
 {
         struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
         struct ost_body *body, *repbody;
-        int rc, size = sizeof(*body);
+        int rc, size = sizeof(*repbody);
         ENTRY;
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        body = lustre_swab_reqbuf (req, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        if (body == NULL)
+                RETURN (-EFAULT);
 
         rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
         if (rc)
                 RETURN(rc);
 
-        repbody = lustre_msg_buf(req->rq_repmsg, 0);
-        /* FIXME: unpack only valid fields instead of memcpy, endianness */
+        repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*repbody));
         memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
         req->rq_status = obd_close(conn, &repbody->oa, NULL, oti);
         RETURN(0);
@@ -175,17 +195,19 @@ static int ost_create(struct ptlrpc_request *req, struct obd_trans_info *oti)
 {
         struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
         struct ost_body *body, *repbody;
-        int rc, size = sizeof(*body);
+        int rc, size = sizeof(*repbody);
         ENTRY;
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        body = lustre_swab_reqbuf (req, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        if (body == NULL)
+                RETURN (-EFAULT);
 
         rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
         if (rc)
                 RETURN(rc);
 
-        repbody = lustre_msg_buf(req->rq_repmsg, 0);
-        /* FIXME: unpack only valid fields instead of memcpy, endianness */
+        repbody = lustre_msg_buf (req->rq_repmsg, 0, sizeof (*repbody));
         memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
         req->rq_status = obd_create(conn, &repbody->oa, NULL, oti);
         RETURN(0);
@@ -195,12 +217,15 @@ static int ost_punch(struct ptlrpc_request *req, struct obd_trans_info *oti)
 {
         struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
         struct ost_body *body, *repbody;
-        int rc, size = sizeof(*body);
+        int rc, size = sizeof(*repbody);
         ENTRY;
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        body = lustre_swab_reqbuf (req, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        if (body == NULL)
+                RETURN (-EFAULT);
 
-        if ((NTOH__u32(body->oa.o_valid) & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS))!=
+        if ((body->oa.o_valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) !=
             (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS))
                 RETURN(-EINVAL);
 
@@ -208,30 +233,32 @@ static int ost_punch(struct ptlrpc_request *req, struct obd_trans_info *oti)
         if (rc)
                 RETURN(rc);
 
-        repbody = lustre_msg_buf(req->rq_repmsg, 0);
-        /* FIXME: unpack only valid fields instead of memcpy, endianness */
+        repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*repbody));
         memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
-        req->rq_status = obd_punch(conn, &repbody->oa, NULL,
-                                   repbody->oa.o_size, repbody->oa.o_blocks, oti);
+        req->rq_status = obd_punch(conn, &repbody->oa, NULL, repbody->oa.o_size,
+                                   repbody->oa.o_blocks, oti);
         RETURN(0);
 }
 
 static int ost_setattr(struct ptlrpc_request *req, struct obd_trans_info *oti)
 {
-        struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
+        struct lustre_handle *conn = &req->rq_reqmsg->handle;
         struct ost_body *body, *repbody;
-        int rc, size = sizeof(*body);
+        int rc, size = sizeof(*repbody);
         ENTRY;
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
+        body = lustre_swab_reqbuf (req, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        if (body == NULL)
+                RETURN (-EFAULT);
 
         rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
         if (rc)
                 RETURN(rc);
 
-        repbody = lustre_msg_buf(req->rq_repmsg, 0);
-        /* FIXME: unpack only valid fields instead of memcpy, endianness */
+        repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*repbody));
         memcpy(&repbody->oa, &body->oa, sizeof(body->oa));
+
         req->rq_status = obd_setattr(conn, &repbody->oa, NULL, oti);
         RETURN(0);
 }
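
Each handler rewritten in the hunks above now follows the same unpack-or-fail shape: fetch buffer 0 from the request, byte-swap it when the peer has the opposite endianness, and fail with -EFAULT when the buffer is missing or short. A standalone sketch of that shape follows; the demo_ types and the use of __builtin_bswap64 are illustrative, not the lustre_swab_reqbuf() implementation.

    #include <stddef.h>
    #include <errno.h>

    struct demo_body { unsigned long long id; };

    struct demo_msg {
            int    swabbed;          /* peer endianness differs           */
            size_t buflen;           /* length of the single buffer       */
            void  *buf;
    };

    static void demo_swab_body(struct demo_body *b)
    {
            b->id = __builtin_bswap64(b->id);
    }

    static struct demo_body *demo_swab_reqbuf(struct demo_msg *msg)
    {
            if (msg->buf == NULL || msg->buflen < sizeof(struct demo_body))
                    return NULL;                     /* missing/short buffer */
            if (msg->swabbed)
                    demo_swab_body(msg->buf);
            return msg->buf;
    }

    int demo_handler(struct demo_msg *msg)
    {
            struct demo_body *body = demo_swab_reqbuf(msg);

            if (body == NULL)
                    return -EFAULT;                  /* matches the hunks above */
            /* ... act on the now host-endian body ... */
            return 0;
    }
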
@@ -245,128 +272,274 @@ static int ost_bulk_timeout(void *data)
         RETURN(1);
 }
 
-static int ost_brw_read(struct ptlrpc_request *req)
+static int get_per_page_niobufs (struct obd_ioobj *ioo, int nioo,
+                                 struct niobuf_remote *rnb, int nrnb,
+                                 struct niobuf_remote **pp_rnbp)
 {
-        struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
-        struct ptlrpc_bulk_desc *desc;
-        struct niobuf_remote *remote_nb;
-        struct niobuf_local *local_nb = NULL;
-        struct obd_ioobj *ioo;
-        struct ost_body *body;
-        struct l_wait_info lwi;
-        void *desc_priv = NULL;
-        void *end2;
-        int cmd, i, j, objcount, niocount, size = sizeof(*body);
-        int rc = 0;
+        /* Copy a remote niobuf, splitting it into page-sized chunks
+         * and setting ioo[i].ioo_bufcnt accordingly */
+        struct niobuf_remote *pp_rnb;
+        int   i;
+        int   j;
+        int   page;
+        int   rnbidx = 0;
+        int   npages = 0;
+
+        /* first count and check the number of pages required */
+        for (i = 0; i < nioo; i++)
+                for (j = 0; j < ioo->ioo_bufcnt; j++, rnbidx++) {
+                        obd_off offset = rnb[rnbidx].offset;
+                        obd_off p0 = offset >> PAGE_SHIFT;
+                        obd_off pn = (offset + rnb[rnbidx].len - 1)>>PAGE_SHIFT;
+
+                        LASSERT (rnbidx < nrnb);
+
+                        npages += (pn + 1 - p0);
+
+                        if (rnb[rnbidx].len == 0) {
+                                CERROR("zero len BRW: obj %d objid "LPX64
+                                       " buf %u\n", i, ioo[i].ioo_id, j);
+                                return (-EINVAL);
+                        }
+                        if (j > 0 &&
+                            rnb[rnbidx].offset <= rnb[rnbidx-1].offset) {
+                                CERROR("unordered BRW: obj %d objid "LPX64
+                                       " buf %u offset "LPX64" <= "LPX64"\n",
+                                       i, ioo[i].ioo_id, j, rnb[rnbidx].offset,
+                                       rnb[rnbidx-1].offset);
+                                return (-EINVAL);
+                        }
+                }
+
+        LASSERT (rnbidx == nrnb);
+
+        if (npages == nrnb) {       /* all niobufs are for single pages */
+                *pp_rnbp = rnb;
+                return (npages);
+        }
+
+        OBD_ALLOC (pp_rnb, sizeof (*pp_rnb) * npages);
+        if (pp_rnb == NULL)
+                return (-ENOMEM);
+
+        /* now do the actual split */
+        page = rnbidx = 0;
+        for (i = 0; i < nioo; i++) {
+                int  obj_pages = 0;
+
+                for (j = 0; j < ioo[i].ioo_bufcnt; j++, rnbidx++) {
+                        obd_off off = rnb[rnbidx].offset;
+                        int     nob = rnb[rnbidx].len;
+
+                        LASSERT (rnbidx < nrnb);
+                        do {
+                                obd_off  poff = off & (PAGE_SIZE - 1);
+                                int      pnob = (poff + nob > PAGE_SIZE) ?
+                                                PAGE_SIZE - poff : nob;
+
+                                LASSERT (page < npages);
+                                pp_rnb[page].len = pnob;
+                                pp_rnb[page].offset = off;
+                                pp_rnb[page].flags = rnb->flags;
+
+                                CDEBUG (D_PAGE, "   obj %d id "LPX64
+                                        "page %d(%d) "LPX64" for %d\n",
+                                        i, ioo[i].ioo_id, obj_pages, page,
+                                        pp_rnb[page].offset, pp_rnb[page].len);
+                                page++;
+                                obj_pages++;
+
+                                off += pnob;
+                                nob -= pnob;
+                        } while (nob > 0);
+                        LASSERT (nob == 0);
+                }
+                ioo[i].ioo_bufcnt = obj_pages;
+        }
+        LASSERT (page == npages);
+
+        *pp_rnbp = pp_rnb;
+        return (npages);
+}
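
The first pass of get_per_page_niobufs() above counts how many PAGE_SIZE chunks each remote niobuf spans: the page holding the first byte is offset >> PAGE_SHIFT, the page holding the last byte is (offset + len - 1) >> PAGE_SHIFT, and the span is their difference plus one. A small standalone check of that arithmetic, assuming 4 KiB pages for the example:

    #include <stdio.h>

    #define DEMO_PAGE_SHIFT 12               /* 4 KiB pages assumed here */

    static unsigned long pages_spanned(unsigned long long offset,
                                       unsigned long len)
    {
            unsigned long long first = offset >> DEMO_PAGE_SHIFT;
            unsigned long long last  = (offset + len - 1) >> DEMO_PAGE_SHIFT;

            return (unsigned long)(last - first + 1);
    }

    int main(void)
    {
            /* 6000 bytes starting 100 bytes before the first page boundary
             * span the tail of page 0, all of page 1, and the head of
             * page 2: three pages. */
            printf("%lu\n", pages_spanned(4096 - 100, 6000));   /* prints 3 */
            printf("%lu\n", pages_spanned(0, 4096));            /* prints 1 */
            return 0;
    }
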
+
+static void free_per_page_niobufs (int npages, struct niobuf_remote *pp_rnb,
+                                   struct niobuf_remote *rnb)
+{
+        if (pp_rnb == rnb)                      /* didn't allocate above */
+                return;
+
+        OBD_FREE (pp_rnb, sizeof (*pp_rnb) * npages);
+}
+
 #if CHECKSUM_BULK
-        __u64 cksum = 0;
+__u64 ost_checksum_bulk (struct ptlrpc_bulk_desc *desc)
+{
+        __u64             cksum = 0;
+        struct list_head *tmp;
+        char             *ptr;
+
+        list_for_each (tmp, &desc->bd_page_list) {
+                struct ptlrpc_bulk_page *bp;
+
+                bp = list_entry (tmp, struct ptlrpc_bulk_page, bp_link);
+                ptr = kmap (bp->bp_page);
+                ost_checksum (&cksum, ptr + bp->bp_pageoffset, bp->bp_buflen);
+                kunmap (bp->bp_page);
+        }
+        return (cksum);
+}
 #endif
-        ENTRY;
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
-        ioo = lustre_msg_buf(req->rq_reqmsg, 1);
-        remote_nb = lustre_msg_buf(req->rq_reqmsg, 2);
-        end2 = (char *)remote_nb + req->rq_reqmsg->buflens[2];
-        objcount = req->rq_reqmsg->buflens[1] / sizeof(*ioo);
-        niocount = req->rq_reqmsg->buflens[2] / sizeof(*remote_nb);
-        cmd = OBD_BRW_READ;
+static int ost_brw_read(struct ptlrpc_request *req)
+{
+        struct ptlrpc_bulk_desc *desc;
+        struct niobuf_remote    *remote_nb;
+        struct niobuf_remote    *pp_rnb;
+        struct niobuf_local     *local_nb;
+        struct obd_ioobj        *ioo;
+        struct ost_body         *body;
+        struct l_wait_info       lwi;
+        void                    *desc_priv = NULL;
+        int                      size[1] = { sizeof(*body) };
+        int                      comms_error = 0;
+        int                      niocount;
+        int                      npages;
+        int                      nob = 0;
+        int                      rc;
+        int                      i;
+        ENTRY;
 
         if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_READ_BULK))
-                GOTO(out, req->rq_status = -EIO);
+                GOTO(out, rc = -EIO);
 
-        /* Hmm, we don't return anything in this reply buffer?
-         * We should be returning per-page status codes and also
-         * per-object size, blocks count, mtime, ctime.  (bug 593) */
-        rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
-        if (rc)
-                GOTO(out, req->rq_status = rc);
-
-        for (i = 0; i < objcount; i++, ioo++) {
-                ost_unpack_ioo(ioo, ioo);
-                if ((void *)(remote_nb + ioo->ioo_bufcnt) > end2) {
-                        CERROR("BRW: objid "LPX64" count %u larger than %u\n",
-                               ioo->ioo_id, ioo->ioo_bufcnt,
-                               (int)(end2 - (void *)remote_nb));
-                        LBUG();
-                        GOTO(out, rc = -EINVAL);
-                }
-                for (j = 0; j < ioo->ioo_bufcnt; j++, remote_nb++) {
-                        ost_unpack_niobuf(remote_nb, remote_nb);
-                        if (remote_nb->len == 0) {
-                                CERROR("zero len BRW: objid "LPX64" buf %u\n",
-                                       ioo->ioo_id, j);
-                                GOTO(out, rc = -EINVAL);
-                        }
-                        if (j && remote_nb->offset <= (remote_nb - 1)->offset) {
-                                CERROR("unordered BRW: objid "LPX64
-                                       " buf %u offset "LPX64" <= "LPX64"\n",
-                                       ioo->ioo_id, j, remote_nb->offset,
-                                       (remote_nb - 1)->offset);
-                                GOTO(out, rc = -EINVAL);
-                        }
-                }
+        body = lustre_swab_reqbuf (req, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        if (body == NULL) {
+                CERROR ("Missing/short ost_body\n");
+                GOTO (out, rc = -EFAULT);
         }
 
-        OBD_ALLOC(local_nb, sizeof(*local_nb) * niocount);
-        if (local_nb == NULL)
-                GOTO(out, rc = -ENOMEM);
+        ioo = lustre_swab_reqbuf (req, 1, sizeof (*ioo),
+                                  lustre_swab_obd_ioobj);
+        if (ioo == NULL) {
+                CERROR ("Missing/short ioobj\n");
+                GOTO (out, rc = -EFAULT);
+        }
 
-        /* The unpackers move ioo and remote_nb, so reset them before using */
-        ioo = lustre_msg_buf(req->rq_reqmsg, 1);
-        remote_nb = lustre_msg_buf(req->rq_reqmsg, 2);
-        req->rq_status = obd_preprw(cmd, conn, objcount, ioo, niocount,
-                                    remote_nb, local_nb, &desc_priv, NULL);
+        niocount = ioo->ioo_bufcnt;
+        remote_nb = lustre_swab_reqbuf(req, 2, niocount * sizeof (*remote_nb),
+                                       lustre_swab_niobuf_remote);
+        if (remote_nb == NULL) {
+                CERROR ("Missing/short niobuf\n");
+                GOTO (out, rc = -EFAULT);
+        }
+        if (lustre_msg_swabbed (req->rq_reqmsg)) { /* swab remaining niobufs */
+                for (i = 1; i < niocount; i++)
+                        lustre_swab_niobuf_remote (&remote_nb[i]);
+        }
 
-        if (req->rq_status)
-                GOTO(out, req->rq_status);
+        rc = lustre_pack_msg(1, size, NULL, &req->rq_replen, &req->rq_repmsg);
+        if (rc)
+                GOTO(out, rc);
+
+        /* CAVEAT EMPTOR this sets ioo->ioo_bufcnt to # pages */
+        npages = get_per_page_niobufs (ioo, 1, remote_nb, niocount, &pp_rnb);
+        if (npages < 0)
+                GOTO(out, rc = npages);
+
+        OBD_ALLOC(local_nb, sizeof(*local_nb) * npages);
+        if (local_nb == NULL)
+                GOTO(out_pp_rnb, rc = -ENOMEM);
 
-        desc = ptlrpc_prep_bulk(req->rq_connection);
+        desc = ptlrpc_prep_bulk_exp (req, BULK_PUT_SOURCE, OST_BULK_PORTAL);
         if (desc == NULL)
                 GOTO(out_local, rc = -ENOMEM);
-        desc->bd_ptl_ev_hdlr = NULL;
-        desc->bd_portal = OST_BULK_PORTAL;
 
-        for (i = 0; i < niocount; i++) {
-                struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc);
+        rc = obd_preprw(OBD_BRW_READ, req->rq_export, 1, ioo, npages,
+                        pp_rnb, local_nb, &desc_priv, NULL);
+        if (rc != 0)
+                GOTO(out_bulk, rc);
 
-                if (bulk == NULL)
-                        GOTO(out_bulk, rc = -ENOMEM);
-                bulk->bp_xid = remote_nb[i].xid;
-                bulk->bp_buf = local_nb[i].addr;
-                bulk->bp_buflen = remote_nb[i].len;
-                if (body->oa.o_valid & NTOH__u32(OBD_MD_FLCKSUM))
-                        ost_checksum(&cksum, bulk->bp_buf, bulk->bp_buflen);
-        }
+        nob = 0;
+        for (i = 0; i < npages; i++) {
+                int page_rc = local_nb[i].rc;
 
-        rc = ptlrpc_bulk_put(desc);
-        if (rc)
-                GOTO(out_bulk, rc);
+                if (page_rc < 0) {              /* error */
+                        rc = page_rc;
+                        break;
+                }
 
-        lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout, desc);
-        rc = l_wait_event(desc->bd_waitq, desc->bd_flags & PTL_BULK_FL_SENT,
-                          &lwi);
-        if (rc) {
-                LASSERT(rc == -ETIMEDOUT);
-                GOTO(out_bulk, rc);
+                LASSERT (page_rc <= pp_rnb[i].len);
+                nob += page_rc;
+                if (page_rc != 0) {             /* some data! */
+                        LASSERT (local_nb[i].page != NULL);
+                        rc = ptlrpc_prep_bulk_page(desc, local_nb[i].page,
+                                                   pp_rnb[i].offset& ~PAGE_MASK,
+                                                   page_rc);
+                        if (rc != 0)
+                                break;
+                }
+
+                if (page_rc != pp_rnb[i].len) { /* short read */
+                        /* All subsequent pages should be 0 */
+                        while (++i < npages)
+                                LASSERT (local_nb[i].rc == 0);
+                        break;
+                }
         }
 
-        req->rq_status = obd_commitrw(cmd, conn, objcount, ioo, niocount,
-                                      local_nb, desc_priv, NULL);
+        if (rc == 0) {
+                rc = ptlrpc_bulk_put(desc);
+                if (rc == 0) {
+                        lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout,
+                                          desc);
+                        rc = l_wait_event(desc->bd_waitq,
+                                          ptlrpc_bulk_complete(desc), &lwi);
+                        if (rc) {
+                                LASSERT(rc == -ETIMEDOUT);
+                                CERROR ("timeout waiting for bulk PUT\n");
+                                ptlrpc_abort_bulk (desc);
+                        }
+                }
+                comms_error = rc != 0;
+        }
+
+        /* Must commit after prep above in all cases */
+        rc = obd_commitrw(OBD_BRW_READ, req->rq_export, 1, ioo, npages,
+                          local_nb, desc_priv, NULL);
 
-out_bulk:
-        ptlrpc_bulk_decref(desc);
-out_local:
-        OBD_FREE(local_nb, sizeof(*local_nb) * niocount);
-out:
-        if (rc)
-                ptlrpc_error(req->rq_svc, req);
-        else {
 #if CHECKSUM_BULK
-                body = lustre_msg_buf(req->rq_repmsg, 0);
-                body->oa.o_rdev = HTON__u64(cksum);
-                body->oa.o_valid |= HTON__u32(OBD_MD_FLCKSUM);
+        if (rc == 0) {
+                body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
+                body->oa.o_rdev = ost_checksum_bulk (desc);
+                body->oa.o_valid |= OBD_MD_FLCKSUM;
+        }
 #endif
-                ptlrpc_reply(req->rq_svc, req);
+
+ out_bulk:
+        ptlrpc_free_bulk (desc);
+ out_local:
+        OBD_FREE(local_nb, sizeof(*local_nb) * npages);
+ out_pp_rnb:
+        free_per_page_niobufs (npages, pp_rnb, remote_nb);
+ out:
+        LASSERT (rc <= 0);
+        if (rc == 0) {
+                req->rq_status = nob;
+                ptlrpc_reply(req);
+        } else if (!comms_error) {
+                /* only reply if comms OK */
+                req->rq_status = rc;
+                ptlrpc_error(req);
+        } else {
+                if (req->rq_repmsg != NULL) {
+                        /* reply out callback would free */
+                        OBD_FREE (req->rq_repmsg, req->rq_replen);
+                }
+                CERROR("bulk IO comms error: evicting %s@%s nid "LPU64"\n",
+                       req->rq_export->exp_client_uuid.uuid,
+                       req->rq_connection->c_remote_uuid.uuid,
+                       req->rq_connection->c_peer.peer_nid);
+                ptlrpc_fail_export(req->rq_export);
         }
 
         RETURN(rc);
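
The tail of ost_brw_read() above chooses one of three outcomes: a normal reply carrying the byte count, an error reply when the server-side I/O failed but the connection is healthy, or no reply plus eviction of the client when the bulk transfer itself broke down. A sketch of just that decision; the demo_ names stand in for ptlrpc_reply(), ptlrpc_error() and ptlrpc_fail_export().

    enum demo_outcome { DEMO_REPLY, DEMO_ERROR_REPLY, DEMO_EVICT };

    enum demo_outcome demo_brw_finish(int rc, int comms_error)
    {
            if (rc == 0)
                    return DEMO_REPLY;        /* rq_status carries bytes moved */
            if (!comms_error)
                    return DEMO_ERROR_REPLY;  /* rq_status carries rc          */
            return DEMO_EVICT;                /* no reply; fail the export     */
    }

Not replying on a comms error matters because the client will resend, and the write path below applies exactly the same rule.
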
@@ -374,117 +547,117 @@ out:
 
 static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
 {
-        struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
         struct ptlrpc_bulk_desc *desc;
-        struct niobuf_remote *remote_nb;
-        void *end2;
-        struct niobuf_local *local_nb = NULL;
-        struct obd_ioobj *ioo;
-        struct ost_body *body;
-        struct l_wait_info lwi;
-        void *desc_priv = NULL;
-        int cmd, i, j, objcount, niocount, size = sizeof(*body);
-        int rc = 0;
+        struct niobuf_remote    *remote_nb;
+        struct niobuf_remote    *pp_rnb;
+        struct niobuf_local     *local_nb;
+        struct obd_ioobj        *ioo;
+        struct ost_body         *body;
+        struct l_wait_info       lwi;
+        void                    *desc_priv = NULL;
+        __u32                   *rcs;
+        int                      size[2] = { sizeof (*body) };
+        int                      objcount, niocount, npages;
+        int                      comms_error = 0;
+        int                      rc, rc2, swab, i, j;
         ENTRY;
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
-        ioo = lustre_msg_buf(req->rq_reqmsg, 1);
-        remote_nb = lustre_msg_buf(req->rq_reqmsg, 2);
-        end2 = (void *)remote_nb + req->rq_reqmsg->buflens[2];
-        objcount = req->rq_reqmsg->buflens[1] / sizeof(*ioo);
-        niocount = req->rq_reqmsg->buflens[2] / sizeof(*remote_nb);
-        cmd = OBD_BRW_WRITE;
-
         if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK))
-                GOTO(out, req->rq_status = -EIO);
-
-        for (i = 0; i < objcount; i++, ioo++) {
-                ost_unpack_ioo(ioo, ioo);
-                if ((void *)(remote_nb + ioo->ioo_bufcnt) > end2) {
-                        CERROR("BRW: objid "LPX64" count %u larger than %u\n",
-                               ioo->ioo_id, ioo->ioo_bufcnt,
-                               (int)(end2 - (void *)remote_nb));
-                        LBUG();
-                        GOTO(out, rc = -EINVAL);
-                }
-                for (j = 0; j < ioo->ioo_bufcnt; j++, remote_nb++) {
-                        ost_unpack_niobuf(remote_nb, remote_nb);
-                        if (remote_nb->len == 0) {
-                                CERROR("zero len BRW: objid "LPX64" buf %u\n",
-                                       ioo->ioo_id, j);
-                                GOTO(out, rc = -EINVAL);
-                        }
-                        if (j && remote_nb->offset <= (remote_nb - 1)->offset) {
-                                CERROR("unordered BRW: objid "LPX64
-                                       " buf %u offset "LPX64" <= "LPX64"\n",
-                                       ioo->ioo_id, j, remote_nb->offset,
-                                       (remote_nb - 1)->offset);
-                                GOTO(out, rc = -EINVAL);
-                        }
+                GOTO(out, rc = -EIO);
+
+        swab = lustre_msg_swabbed (req->rq_reqmsg);
+        body = lustre_swab_reqbuf (req, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        if (body == NULL) {
+                CERROR ("Missing/short ost_body\n");
+                GOTO(out, rc = -EFAULT);
+        }
+
+        LASSERT_REQSWAB (req, 1);
+        objcount = req->rq_reqmsg->buflens[1] / sizeof(*ioo);
+        if (objcount == 0) {
+                CERROR ("Missing/short ioobj\n");
+                GOTO (out, rc = -EFAULT);
+        }
+        ioo = lustre_msg_buf (req->rq_reqmsg, 1, objcount * sizeof (*ioo));
+        LASSERT (ioo != NULL);
+        for (niocount = i = 0; i < objcount; i++) {
+                if (swab)
+                        lustre_swab_obd_ioobj (&ioo[i]);
+                if (ioo[i].ioo_bufcnt == 0) {
+                        CERROR ("ioo[%d] has zero bufcnt\n", i);
+                        GOTO (out, rc = -EFAULT);
                 }
+                niocount += ioo[i].ioo_bufcnt;
         }
 
-        OBD_ALLOC(local_nb, sizeof(*local_nb) * niocount);
-        if (local_nb == NULL)
-                GOTO(out, rc = -ENOMEM);
+        remote_nb = lustre_swab_reqbuf(req, 2, niocount * sizeof (*remote_nb),
+                                       lustre_swab_niobuf_remote);
+        if (remote_nb == NULL) {
+                CERROR ("Missing/short niobuf\n");
+                GOTO(out, rc = -EFAULT);
+        }
+        if (swab) {                             /* swab the remaining niobufs */
+                for (i = 1; i < niocount; i++)
+                        lustre_swab_niobuf_remote (&remote_nb[i]);
+        }
 
-        /* The unpackers move ioo and remote_nb, so reset them before using */
-        ioo = lustre_msg_buf(req->rq_reqmsg, 1);
-        remote_nb = lustre_msg_buf(req->rq_reqmsg, 2);
+        size[1] = niocount * sizeof (*rcs);
+        rc = lustre_pack_msg(2, size, NULL, &req->rq_replen,
+                             &req->rq_repmsg);
+        if (rc != 0)
+                GOTO (out, rc);
+        rcs = lustre_msg_buf (req->rq_repmsg, 1, niocount * sizeof (*rcs));
 
-        req->rq_status = obd_preprw(cmd, conn, objcount, ioo, niocount,
-                                    remote_nb, local_nb, &desc_priv, oti);
+        /* CAVEAT EMPTOR this sets ioo->ioo_bufcnt to # pages */
+        npages = get_per_page_niobufs(ioo, objcount,remote_nb,niocount,&pp_rnb);
+        if (npages < 0)
+                GOTO (out, rc = npages);
 
-        if (req->rq_status)
-                GOTO(out_local, rc = 0);
+        OBD_ALLOC(local_nb, sizeof(*local_nb) * npages);
+        if (local_nb == NULL)
+                GOTO(out_pp_rnb, rc = -ENOMEM);
 
-        desc = ptlrpc_prep_bulk(req->rq_connection);
+        desc = ptlrpc_prep_bulk_exp (req, BULK_GET_SINK, OST_BULK_PORTAL);
         if (desc == NULL)
                 GOTO(out_local, rc = -ENOMEM);
-        desc->bd_ptl_ev_hdlr = NULL;
-        desc->bd_portal = OSC_BULK_PORTAL;
 
-        for (i = 0; i < niocount; i++) {
-                struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc);
+        rc = obd_preprw(OBD_BRW_WRITE, req->rq_export, objcount, ioo,
+                        npages, pp_rnb, local_nb, &desc_priv, oti);
+        if (rc != 0)
+                GOTO (out_bulk, rc);
 
-                if (bulk == NULL)
-                        GOTO(out_bulk, rc = -ENOMEM);
-                bulk->bp_xid = remote_nb[i].xid;
-                bulk->bp_buf = local_nb[i].addr;
-                bulk->bp_buflen = remote_nb[i].len;
-        }
+        /* NB Having prepped, we must commit... */
 
-        rc = ptlrpc_bulk_get(desc);
-        if (rc)
-                GOTO(out_bulk, rc);
+        for (i = 0; i < npages; i++) {
+                rc = ptlrpc_prep_bulk_page(desc, local_nb[i].page,
+                                           pp_rnb[i].offset & (PAGE_SIZE - 1),
+                                           pp_rnb[i].len);
+                if (rc != 0)
+                        break;
+        }
 
-        lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout, desc);
-        rc = l_wait_event(desc->bd_waitq, desc->bd_flags & PTL_BULK_FL_RCVD,
-                          &lwi);
-        if (rc) {
-                LASSERT(rc == -ETIMEDOUT);
-                ptlrpc_abort_bulk(desc);
-                recovd_conn_fail(desc->bd_connection);
-                obd_commitrw(cmd, conn, objcount, ioo, niocount, local_nb,
-                             desc_priv, oti);
-                GOTO(out_bulk, rc);
+        if (rc == 0) {
+                rc = ptlrpc_bulk_get(desc);
+                if (rc == 0) {
+                        lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout,
+                                          desc);
+                        rc = l_wait_event(desc->bd_waitq,
+                                          ptlrpc_bulk_complete(desc), &lwi);
+                        if (rc) {
+                                LASSERT(rc == -ETIMEDOUT);
+                                CERROR ("timeout waiting for bulk GET\n");
+                                ptlrpc_abort_bulk (desc);
+                        }
+                }
+                comms_error = rc != 0;
         }
 
 #if CHECKSUM_BULK
-        if ((body->oa.o_valid & NTOH__u32(OBD_MD_FLCKSUM))) {
+        if (rc == 0 && (body->oa.o_valid & OBD_MD_FLCKSUM) != 0) {
                 static int cksum_counter;
-                __u64 client_cksum = NTOH__u64(body->oa.o_rdev);
-                __u64 cksum = 0;
-
-                for (i = 0; i < niocount; i++) {
-                        char *ptr = kmap(local_nb[i].page);
-                        int   off = local_nb[i].offset & (PAGE_SIZE - 1);
-                        int   len = local_nb[i].len;
-
-                        LASSERT(off + len <= PAGE_SIZE);
-                        ost_checksum(&cksum, ptr + off, len);
-                        kunmap(local_nb[i].page);
-                }
+                __u64 client_cksum = body->oa.o_rdev;
+                __u64 cksum = ost_checksum_bulk (desc);
 
                 if (client_cksum != cksum) {
                         CERROR("Bad checksum: client "LPX64", server "LPX64
@@ -501,59 +674,119 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                 }
         }
 #endif
-
-        req->rq_status = obd_commitrw(cmd, conn, objcount, ioo, niocount,
-                                      local_nb, desc_priv, oti);
+        /* Must commit after prep above in all cases */
+        rc2 = obd_commitrw(OBD_BRW_WRITE, req->rq_export, objcount, ioo,
+                           npages, local_nb, desc_priv, oti);
+
+        if (rc == 0) {
+                /* set per-requested niobuf return codes */
+                for (i = j = 0; i < niocount; i++) {
+                        int nob = remote_nb[i].len;
+
+                        rcs[i] = 0;
+                        do {
+                                LASSERT (j < npages);
+                                if (local_nb[j].rc < 0)
+                                        rcs[i] = local_nb[j].rc;
+                                nob -= pp_rnb[j].len;
+                                j++;
+                        } while (nob > 0);
+                        LASSERT (nob == 0);
+                }
+                LASSERT (j == npages);
+        }
+        if (rc == 0)
+                rc = rc2;
 
  out_bulk:
-        ptlrpc_bulk_decref(desc);
+        ptlrpc_free_bulk (desc);
  out_local:
-        OBD_FREE(local_nb, sizeof(*local_nb) * niocount);
+        OBD_FREE(local_nb, sizeof(*local_nb) * npages);
+ out_pp_rnb:
+        free_per_page_niobufs (npages, pp_rnb, remote_nb);
  out:
-        if (!rc)
-                /* Hmm, we don't return anything in this reply buffer?
-                 * We should be returning per-page status codes and also
-                 * per-object size, blocks count, mtime, ctime.  (bug 593) */
-                rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen,
-                                     &req->rq_repmsg);
-        if (rc)
-                ptlrpc_error(req->rq_svc, req);
-        else {
+        if (rc == 0) {
                 oti_to_request(oti, req);
-                rc = ptlrpc_reply(req->rq_svc, req);
+                rc = ptlrpc_reply(req);
+        } else if (!comms_error) {
+                /* Only reply if there was no comms problem with bulk */
+                req->rq_status = rc;
+                ptlrpc_error(req);
+        } else {
+                if (req->rq_repmsg != NULL) {
+                        /* reply out callback would free */
+                        OBD_FREE (req->rq_repmsg, req->rq_replen);
+                }
+                CERROR("bulk IO comms error: evicting %s@%s nid "LPU64"\n",
+                       req->rq_export->exp_client_uuid.uuid,
+                       req->rq_connection->c_remote_uuid.uuid,
+                       req->rq_connection->c_peer.peer_nid);
+                ptlrpc_fail_export(req->rq_export);
         }
         RETURN(rc);
 }
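
After the write commits, the loop above folds the per-page results back into one return code per client-supplied niobuf: pages are consumed until their lengths add up to the original niobuf length, and any failed page taints that niobuf's rc. A standalone sketch of the same fold; the flat arrays replace the niobuf structures purely for illustration.

    #include <stdio.h>

    void fold_page_rcs(const int *page_rc, const int *page_len, int npages,
                       const int *nb_len, int *nb_rc, int nnb)
    {
            int i, j = 0;

            for (i = 0; i < nnb; i++) {
                    int nob = nb_len[i];

                    nb_rc[i] = 0;
                    do {                         /* j walks the page-sized split */
                            if (page_rc[j] < 0)
                                    nb_rc[i] = page_rc[j];
                            nob -= page_len[j];
                            j++;
                    } while (nob > 0);
            }
            (void)npages;                        /* the original asserts j == npages */
    }

    int main(void)
    {
            /* one 8 KiB niobuf split into two 4 KiB pages, second page failed */
            int page_rc[]  = { 0, -5 };
            int page_len[] = { 4096, 4096 };
            int nb_len[]   = { 8192 };
            int nb_rc[1];

            fold_page_rcs(page_rc, page_len, 2, nb_len, nb_rc, 1);
            printf("%d\n", nb_rc[0]);            /* prints -5 */
            return 0;
    }
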
 
-static int ost_san_brw(struct ptlrpc_request *req, int alloc)
+static int ost_san_brw(struct ptlrpc_request *req, int cmd)
 {
-        struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg;
+        struct lustre_handle *conn = &req->rq_reqmsg->handle;
         struct niobuf_remote *remote_nb, *res_nb;
         struct obd_ioobj *ioo;
         struct ost_body *body;
-        int cmd, rc, i, j, objcount, niocount, size[2] = {sizeof(*body)};
-        void *end2;
+        int rc, i, j, objcount, niocount, size[2] = {sizeof(*body)};
+        int n;
+        int swab;
         ENTRY;
 
-        body = lustre_msg_buf(req->rq_reqmsg, 0);
-        ioo = lustre_msg_buf(req->rq_reqmsg, 1);
-        remote_nb = lustre_msg_buf(req->rq_reqmsg, 2);
-        end2 = (void *)remote_nb + req->rq_reqmsg->buflens[2];
+        /* XXX not set to use latest protocol */
+
+        swab = lustre_msg_swabbed (req->rq_reqmsg);
+        body = lustre_swab_reqbuf (req, 0, sizeof (*body),
+                                   lustre_swab_ost_body);
+        if (body == NULL) {
+                CERROR ("Missing/short ost_body\n");
+                GOTO (out, rc = -EFAULT);
+        }
+
+        ioo = lustre_swab_reqbuf(req, 1, sizeof (*ioo),
+                                 lustre_swab_obd_ioobj);
+        if (ioo == NULL) {
+                CERROR ("Missing/short ioobj\n");
+                GOTO (out, rc = -EFAULT);
+        }
         objcount = req->rq_reqmsg->buflens[1] / sizeof(*ioo);
-        niocount = req->rq_reqmsg->buflens[2] / sizeof(*remote_nb);
+        niocount = ioo[0].ioo_bufcnt;
+        for (i = 1; i < objcount; i++) {
+                if (swab)
+                        lustre_swab_obd_ioobj (&ioo[i]);
+                niocount += ioo[i].ioo_bufcnt;
+        }
 
-        cmd = alloc ? OBD_BRW_WRITE : OBD_BRW_READ;
+        remote_nb = lustre_swab_reqbuf(req, 2, niocount * sizeof (*remote_nb),
+                                       lustre_swab_niobuf_remote);
+        if (remote_nb == NULL) {
+                CERROR ("Missing/short niobuf\n");
+                GOTO (out, rc = -EFAULT);
+        }
+        if (swab) {                             /* swab the remaining niobufs */
+                for (i = 1; i < niocount; i++)
+                        lustre_swab_niobuf_remote (&remote_nb[i]);
+        }
 
-        for (i = 0; i < objcount; i++, ioo++) {
-                ost_unpack_ioo(ioo, ioo);
-                if ((void *)(remote_nb + ioo->ioo_bufcnt) > end2) {
-                        CERROR("BRW: objid "LPX64" count %u larger than %u\n",
-                               ioo->ioo_id, ioo->ioo_bufcnt,
-                               (int)(end2 - (void *)remote_nb));
-                        GOTO(out, rc = -EINVAL);
+        for (i = n = 0; i < objcount; i++) {
+                for (j = 0; j < ioo[i].ioo_bufcnt; j++, n++) {
+                        if (remote_nb[n].len == 0) {
+                                CERROR("zero len BRW: objid "LPX64" buf %u\n",
+                                       ioo[i].ioo_id, j);
+                                GOTO(out, rc = -EINVAL);
+                        }
+                        if (j && remote_nb[n].offset <= remote_nb[n-1].offset) {
+                                CERROR("unordered BRW: objid "LPX64
+                                       " buf %u offset "LPX64" <= "LPX64"\n",
+                                       ioo[i].ioo_id, j, remote_nb[n].offset,
+                                       remote_nb[n-1].offset);
+                                GOTO(out, rc = -EINVAL);
+                        }
                 }
-                for (j = 0; j < ioo->ioo_bufcnt; j++, remote_nb++)
-                        ost_unpack_niobuf(remote_nb, remote_nb);
         }
 
         size[1] = niocount * sizeof(*remote_nb);
@@ -561,33 +794,23 @@ static int ost_san_brw(struct ptlrpc_request *req, int alloc)
         if (rc)
                 GOTO(out, rc);
 
-        /* The unpackers move ioo and remote_nb, so reset them before using */
-        ioo = lustre_msg_buf(req->rq_reqmsg, 1);
-        remote_nb = lustre_msg_buf(req->rq_reqmsg, 2);
-
         req->rq_status = obd_san_preprw(cmd, conn, objcount, ioo,
                                         niocount, remote_nb);
 
-        if (req->rq_status) {
-                rc = 0;
-                goto out;
-        }
-
-        remote_nb = lustre_msg_buf(req->rq_repmsg, 1);
-        res_nb = lustre_msg_buf(req->rq_reqmsg, 2);
-        for (i = 0; i < niocount; i++, remote_nb++, res_nb++)
-                ost_pack_niobuf(remote_nb, res_nb->offset, res_nb->len,
-                                res_nb->flags, res_nb->xid);
+        if (req->rq_status)
+                GOTO (out, rc = 0);
 
+        res_nb = lustre_msg_buf(req->rq_repmsg, 1, size[1]);
+        memcpy (res_nb, remote_nb, size[1]);
         rc = 0;
-
 out:
         if (rc) {
                 OBD_FREE(req->rq_repmsg, req->rq_replen);
                 req->rq_repmsg = NULL;
-                ptlrpc_error(req->rq_svc, req);
+                req->rq_status = rc;
+                ptlrpc_error(req);
         } else
-                ptlrpc_reply(req->rq_svc, req);
+                ptlrpc_reply(req);
 
         return rc;
 }
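
Both the SAN path above and get_per_page_niobufs() reject a BRW whose fragments are zero length or whose offsets do not strictly increase within an object. A compact sketch of that validation over one object's fragments; the demo_niobuf type is a stand-in for struct niobuf_remote.

    #include <errno.h>

    struct demo_niobuf { unsigned long long offset; unsigned len; };

    int demo_validate_niobufs(const struct demo_niobuf *nb, int count)
    {
            int j;

            for (j = 0; j < count; j++) {
                    if (nb[j].len == 0)
                            return -EINVAL;          /* zero-length fragment */
                    if (j > 0 && nb[j].offset <= nb[j - 1].offset)
                            return -EINVAL;          /* unordered or overlapping */
            }
            return 0;
    }
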
@@ -601,6 +824,7 @@ static int filter_recovery_request(struct ptlrpc_request *req,
                *process = 1;
                RETURN(0);
 
+        case OBD_PING:
         case OST_CLOSE:
         case OST_CREATE:
         case OST_DESTROY:
@@ -617,24 +841,23 @@ static int filter_recovery_request(struct ptlrpc_request *req,
                 DEBUG_REQ(D_ERROR, req, "not permitted during recovery");
                 *process = 0;
                 /* XXX what should we set rq_status to here? */
-                RETURN(ptlrpc_error(req->rq_svc, req));
+                req->rq_status = -EAGAIN;
+                RETURN(ptlrpc_error(req));
         }
 }
 
+
+
 static int ost_handle(struct ptlrpc_request *req)
 {
         struct obd_trans_info trans_info = { 0, }, *oti = &trans_info;
-        int should_process, rc;
+        int should_process, fail = OBD_FAIL_OST_ALL_REPLY_NET, rc = 0;
         ENTRY;
 
-        rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen);
-        if (rc || OBD_FAIL_CHECK(OBD_FAIL_OST_HANDLE_UNPACK)) {
-                CERROR("lustre_ost: Invalid request\n");
-                GOTO(out, rc);
-        }
-
+        /* XXX identical to MDS */
         if (req->rq_reqmsg->opc != OST_CONNECT) {
                 struct obd_device *obd;
+                int abort_recovery, recovering;
 
                 if (req->rq_export == NULL) {
                         CERROR("lustre_ost: operation %d on unconnected OST\n",
@@ -645,31 +868,18 @@ static int ost_handle(struct ptlrpc_request *req)
 
                 obd = req->rq_export->exp_obd;
 
+                /* Check for aborted recovery. */
                 spin_lock_bh(&obd->obd_processing_task_lock);
-                if (obd->obd_flags & OBD_ABORT_RECOVERY)
-                        target_abort_recovery(obd);
+                abort_recovery = obd->obd_abort_recovery;
+                recovering = obd->obd_recovering;
                 spin_unlock_bh(&obd->obd_processing_task_lock);
-
-                if (obd->obd_flags & OBD_RECOVERING) {
+                if (abort_recovery) {
+                        target_abort_recovery(obd);
+                } else if (recovering) {
                         rc = filter_recovery_request(req, obd, &should_process);
                         if (rc || !should_process)
                                 RETURN(rc);
-                } else if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) {
-#if 0
-/* need to store this reply somewhere... */
-                        if (req->rq_xid == med->med_last_xid) {
-                                DEBUG_REQ(D_HA, req, "resending reply");
-                                OBD_ALLOC(req->rq_repmsg, med->med_last_replen);
-                                req->rq_replen = med->med_last_replen;
-                                memcpy(req->rq_repmsg, med->med_last_reply,
-                                       req->rq_replen);
-                                ptlrpc_reply(req->rq_svc, req);
-                                return 0;
-                        }
-                        DEBUG_REQ(D_HA, req, "no reply for resend, continuing");
-#endif
                 }
-
         } 
 
         if (strcmp(req->rq_obd->obd_type->typ_name, "ost") != 0)
@@ -731,13 +941,13 @@ static int ost_handle(struct ptlrpc_request *req)
         case OST_SAN_READ:
                 CDEBUG(D_INODE, "san read\n");
                 OBD_FAIL_RETURN(OBD_FAIL_OST_BRW_NET, 0);
-                rc = ost_san_brw(req, 0);
+                rc = ost_san_brw(req, OBD_BRW_READ);
                 /* ost_san_brw sends its own replies */
                 RETURN(rc);
         case OST_SAN_WRITE:
                 CDEBUG(D_INODE, "san write\n");
                 OBD_FAIL_RETURN(OBD_FAIL_OST_BRW_NET, 0);
-                rc = ost_san_brw(req, 1);
+                rc = ost_san_brw(req, OBD_BRW_WRITE);
                 /* ost_san_brw sends its own replies */
                 RETURN(rc);
         case OST_PUNCH:
@@ -755,11 +965,16 @@ static int ost_handle(struct ptlrpc_request *req)
                 OBD_FAIL_RETURN(OBD_FAIL_OST_SYNCFS_NET, 0);
                 rc = ost_syncfs(req);
                 break;
+        case OBD_PING:
+                DEBUG_REQ(D_INODE, req, "ping");
+                rc = target_handle_ping(req);
+                break;
         case LDLM_ENQUEUE:
                 CDEBUG(D_INODE, "enqueue\n");
                 OBD_FAIL_RETURN(OBD_FAIL_LDLM_ENQUEUE, 0);
                 rc = ldlm_handle_enqueue(req, ldlm_server_completion_ast,
                                          ldlm_server_blocking_ast);
+                fail = OBD_FAIL_OST_LDLM_REPLY_NET;
                 break;
         case LDLM_CONVERT:
                 CDEBUG(D_INODE, "convert\n");
@@ -775,12 +990,11 @@ static int ost_handle(struct ptlrpc_request *req)
         case LDLM_CP_CALLBACK:
                 CDEBUG(D_INODE, "callback\n");
                 CERROR("callbacks should not happen on OST\n");
-                LBUG();
-                OBD_FAIL_RETURN(OBD_FAIL_LDLM_BL_CALLBACK, 0);
-                break;
+                /* fall through */
         default:
+                CERROR("Unexpected opcode %d\n", req->rq_reqmsg->opc);
                 req->rq_status = -ENOTSUPP;
-                rc = ptlrpc_error(req->rq_svc, req);
+                rc = ptlrpc_error(req);
                 RETURN(rc);
         }
 
@@ -788,22 +1002,22 @@ static int ost_handle(struct ptlrpc_request *req)
         /* If we're DISCONNECTing, the export_data is already freed */
         if (!rc && req->rq_reqmsg->opc != OST_DISCONNECT) {
                 struct obd_device *obd  = req->rq_export->exp_obd;
-                if ((obd->obd_flags & OBD_NO_TRANSNO) == 0) {
+                if (!obd->obd_no_transno) {
                         req->rq_repmsg->last_committed =
-                                HTON__u64(obd->obd_last_committed);
+                                obd->obd_last_committed;
                 } else {
                         DEBUG_REQ(D_IOCTL, req,
                                   "not sending last_committed update");
                 }
                 CDEBUG(D_INFO, "last_committed "LPU64", xid "LPX64"\n",
-                       obd->obd_last_committed, HTON__u64(req->rq_xid));
+                       obd->obd_last_committed, req->rq_xid);
         }
 
 out:
         if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) {
                 struct obd_device *obd = req->rq_export->exp_obd;
 
-                if (obd && (obd->obd_flags & OBD_RECOVERING)) {
+                if (obd && obd->obd_recovering) {
                         DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply");
                         return target_queue_final_reply(req, rc);
                 }
@@ -811,21 +1025,10 @@ out:
                 rc = req->rq_status = -ENOTCONN;
         }
 
-        if (rc) {
-                CERROR("ost: processing error (opcode=%d): %d\n",
-                       req->rq_reqmsg->opc, rc);
-                ptlrpc_error(req->rq_svc, req);
-        } else {
-                CDEBUG(D_INODE, "sending reply\n");
-                if (req->rq_repmsg == NULL)
-                        CERROR("handler for opcode %d returned rc=0 without "
-                               "creating rq_repmsg; needs to return rc != 0!\n",
-                               req->rq_reqmsg->opc);
-                else
-                        oti_to_request(oti, req);
-                ptlrpc_reply(req->rq_svc, req);
-        }
+        if (!rc)
+                oti_to_request(oti, req);
 
+        target_send_reply(req, rc, fail);
         return 0;
 }
 
@@ -839,7 +1042,7 @@ static int ost_setup(struct obd_device *obddev, obd_count len, void *buf)
         ost->ost_service = ptlrpc_init_svc(OST_NEVENTS, OST_NBUFS,
                                            OST_BUFSIZE, OST_MAXREQSIZE,
                                            OST_REQUEST_PORTAL, OSC_REPLY_PORTAL,
-                                           ost_handle, "ost");
+                                           ost_handle, "ost", obddev);
         if (!ost->ost_service) {
                 CERROR("failed to start service\n");
                 GOTO(error_disc, err = -ENOMEM);
@@ -861,13 +1064,15 @@ error_disc:
         RETURN(err);
 }
 
-static int ost_cleanup(struct obd_device * obddev)
+static int ost_cleanup(struct obd_device *obddev, int force, int failover)
 {
         struct ost_obd *ost = &obddev->u.ost;
         int err = 0;
-
         ENTRY;
 
+        if (obddev->obd_recovering)
+                target_cancel_recovery_timer(obddev);
+
         ptlrpc_stop_all_threads(ost->ost_service);
         ptlrpc_unregister_service(ost->ost_service);
 
@@ -891,9 +1096,7 @@ int ost_detach(struct obd_device *dev)
  * connects directly to this module.
  */
 static int ost_connect(struct lustre_handle *conn,
-                       struct obd_device *obd, struct obd_uuid *cluuid,
-                       struct recovd_obd *recovd,
-                       ptlrpc_recovery_cb_t recover)
+                       struct obd_device *obd, struct obd_uuid *cluuid)
 {
         struct obd_export *exp;
         int rc;
@@ -907,6 +1110,7 @@ static int ost_connect(struct lustre_handle *conn,
                 RETURN(rc);
         exp = class_conn2export(conn);
         LASSERT(exp);
+        class_export_put(exp);
 
         RETURN(0);
 }
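
The class_export_put() added above pairs with the reference that class_conn2export() is presumed to take, so a connect path that only needed to confirm the export exists does not leak a count. A generic sketch of that lookup/put discipline, not the Lustre object model:

    #include <stdlib.h>

    struct demo_export { int refcount; };

    static struct demo_export *demo_conn2export(struct demo_export *e)
    {
            e->refcount++;                  /* lookup hands back a counted ref */
            return e;
    }

    static void demo_export_put(struct demo_export *e)
    {
            if (--e->refcount == 0)
                    free(e);
    }

    int demo_connect(struct demo_export *e)
    {
            struct demo_export *exp = demo_conn2export(e);

            /* the connect path only needed to confirm the export exists */
            demo_export_put(exp);
            return 0;
    }
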
diff --git a/lustre/portals/.cvsignore b/lustre/portals/.cvsignore
new file mode 100644 (file)
index 0000000..99ac885
--- /dev/null
@@ -0,0 +1,8 @@
+Kernelenv
+Makefile
+Makefile.in
+aclocal.m4
+autom4te.cache
+config.log
+config.status
+configure
diff --git a/lustre/portals/AUTHORS b/lustre/portals/AUTHORS
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lustre/portals/ChangeLog b/lustre/portals/ChangeLog
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lustre/portals/Kernelenv.in b/lustre/portals/Kernelenv.in
new file mode 100644 (file)
index 0000000..29a713f
--- /dev/null
@@ -0,0 +1 @@
+EXTRA_CFLAGS= -Ifs/lustre/include -Ifs/lustre/portals/include
diff --git a/lustre/portals/Kernelenv.mk b/lustre/portals/Kernelenv.mk
new file mode 100644 (file)
index 0000000..29a713f
--- /dev/null
@@ -0,0 +1 @@
+EXTRA_CFLAGS= -Ifs/lustre/include -Ifs/lustre/portals/include
diff --git a/lustre/portals/Makefile.am b/lustre/portals/Makefile.am
new file mode 100644 (file)
index 0000000..1a223f2
--- /dev/null
@@ -0,0 +1,12 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+EXTRA_DIST = Rules.linux archdep.m4 include 
+DIST_SUBDIRS = libcfs portals knals unals utils tests doc router
+if LIBLUSTRE
+SUBDIRS = portals unals utils
+else
+SUBDIRS = libcfs portals knals unals utils tests doc router
+endif
diff --git a/lustre/portals/Makefile.mk b/lustre/portals/Makefile.mk
new file mode 100644 (file)
index 0000000..be0e51a
--- /dev/null
@@ -0,0 +1,6 @@
+include fs/lustre/portals/Kernelenv
+
+obj-y += portals/
+obj-y += libcfs/
+obj-y += knals/
+obj-y += router/
diff --git a/lustre/portals/NEWS b/lustre/portals/NEWS
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lustre/portals/README b/lustre/portals/README
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lustre/portals/Rules.linux b/lustre/portals/Rules.linux
new file mode 100644 (file)
index 0000000..93943b7
--- /dev/null
@@ -0,0 +1,25 @@
+# included in Linux kernel directories
+# Rules for module building
+
+if LINUX25
+
+basename=$(shell echo $< | sed -e 's/\.c//g' | sed -e 's/-//g' | sed -e 's/\.o//g')
+AM_CPPFLAGS= -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -mpreferred-stack-boundary=2  -DKBUILD_MODNAME=$(MODULE) -DKBUILD_BASENAME=$(basename)
+
+$(MODULE).o: $($(MODULE)_OBJECTS)
+       $(LD) -m $(MOD_LINK) -r -o $(MODULE).o $($(MODULE)_OBJECTS)
+
+else
+
+$(MODULE).o: $($(MODULE)_OBJECTS)
+       $(LD) -m $(MOD_LINK) -r -o $(MODULE).o $($(MODULE)_OBJECTS)
+
+endif
+
+tags:
+       rm -f $(top_srcdir)/TAGS
+       rm -f $(top_srcdir)/tags
+       find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs etags -a
+       find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs etags -a
+       find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs ctags -a
+       find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs ctags -a
diff --git a/lustre/portals/archdep.m4 b/lustre/portals/archdep.m4
new file mode 100644 (file)
index 0000000..7cb00cf
--- /dev/null
@@ -0,0 +1,317 @@
+
+# -------- in kernel compilation? (2.5 only) -------------
+AC_ARG_ENABLE(inkernel, [ --enable-inkernel set up 2.5 kernel makefiles])
+AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes)
+echo "Makefile for in kernel build: $INKERNEL"
+
+# -------- liblustre compilation --------------
+AC_ARG_WITH(lib, [  --with-lib compile lustre library], host_cpu="lib")
+
+# -------- set linuxdir ------------
+
+AC_ARG_WITH(linux, [  --with-linux=[path] set path to Linux source (default=/usr/src/linux)],LINUX=$with_linux,LINUX=/usr/src/linux)
+AC_SUBST(LINUX)
+
+# --------- UML?  --------------------
+AC_MSG_CHECKING(if you are running user mode linux for $host_cpu ...)
+if test $host_cpu = "lib" ; then 
+        host_cpu="lib"
+       AC_MSG_RESULT(no building Lustre library)
+else
+  if test -e $LINUX/include/asm-um ; then
+    if test  X`ls -id $LINUX/include/asm/ | awk '{print $1}'` = X`ls -id $LINUX/include/asm-um | awk '{print $1}'` ; then
+       host_cpu="um";
+       AC_MSG_RESULT(yes)
+    else
+       AC_MSG_RESULT(no (asm doesn't point at asm-um))
+    fi
+
+  else 
+        AC_MSG_RESULT(no (asm-um missing))
+  fi
+fi
+
+# --------- Linux 25 ------------------
+
+AC_MSG_CHECKING(if you are running linux 2.5)
+if test -e $LINUX/include/linux/namei.h ; then
+        linux25="yes"
+        AC_MSG_RESULT(yes)
+else
+        linux25="no"
+        AC_MSG_RESULT(no)
+fi
+AM_CONDITIONAL(LINUX25, test x$linux25 = xyes)
+echo "Makefiles for in linux 2.5 build: $LINUX25"
+
+# -------  Makeflags ------------------
+
+AC_MSG_CHECKING(setting make flags system architecture: )
+case ${host_cpu} in
+       lib )
+       AC_MSG_RESULT($host_cpu)
+       KCFLAGS='-g -Wall '
+       KCPPFLAGS='-D__arch_lib__ '
+       libdir='${exec_prefix}/lib/lustre'
+        MOD_LINK=elf_i386
+;;
+       um )
+       AC_MSG_RESULT($host_cpu)
+       KCFLAGS='-g -Wall -pipe -Wno-trigraphs -Wstrict-prototypes -fno-strict-aliasing -fno-common '
+        case ${linux25} in
+                yes )
+                KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE  -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/include -I$(LINUX)/arch/um/kernel/tt/include -I$(LINUX)/arch/um/kernel/skas/include -O2 -nostdinc -iwithprefix include -DKBUILD_BASENAME=$(MODULE) -DKBUILD_MODNAME=$(MODULE) '
+        ;;
+                * )
+                KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE  -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/kernel/tt/include -I$(LINUX)/arch/um/include '
+        ;;
+        esac
+
+        MOD_LINK=elf_i386
+;;
+       i*86 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -pipe'
+        case ${linux25} in
+                yes )
+                KCPPFLAGS='-D__KERNEL__ -DMODULE -march=i686 -I$(LINUX)/include/asm-i386/mach-default -nostdinc -iwithprefix include '
+        ;;
+                * )
+                KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        ;;
+        esac
+        MOD_LINK=elf_i386
+;;
+
+       alphaev6 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-g -O2  -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6'
+        KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        MOD_LINK=elf64alpha
+;;
+
+       alphaev67 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-g -O2  -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6'
+        KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        MOD_LINK=elf64alpha
+;;
+
+       alpha* )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-g -O2  -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev5'
+        KCPPFLAGS='-D__KERNEL__ -DMODULE '
+        MOD_LINK=elf64alpha
+;;
+
+       ia64 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-gstabs -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -ffixed-r13 -mfixed-range=f10-f15,f32-f127 -falign-functions=32 -mb-step'
+       KCPPFLAGS='-D__KERNEL__ -DMODULE'
+        MOD_LINK=elf64_ia64
+;;
+
+       sparc64 )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -Wno-unused -m64 -pipe -mno-fpu -mcpu=ultrasparc -mcmodel=medlow -ffixed-g4 -fcall-used-g5 -fcall-used-g7 -Wno-sign-compare -Wa,--undeclared-regs'
+        KCPPFLAGS='-D__KERNEL__'
+        MOD_LINK=elf64_sparc
+
+;;
+
+       powerpc )
+       AC_MSG_RESULT($host_cpu)
+        KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -D__powerpc__ -fsigned-char -msoft-float -pipe -ffixed-r2 -Wno-uninitialized -mmultiple -mstring'
+        KCPPFLAGS='-D__KERNEL__'
+        MOD_LINK=elf32ppclinux
+;;
+
+        *)
+       AC_ERROR("Unknown Linux Platform: $host_cpu")
+;;
+esac
+
+# ----------- make dep run? ------------------
+
+if test $host_cpu != "lib" ; then 
+  AC_MSG_CHECKING(if make dep has been run in kernel source (host $host_cpu) )
+  if test -f $LINUX/include/linux/config.h ; then
+  AC_MSG_RESULT(yes)
+ else
+  AC_MSG_ERROR(** cannot find $LINUX/include/linux/config.h. Run make dep in $LINUX.)
+  fi
+fi
+
+# ------------ include paths ------------------
+
+if test $host_cpu != "lib" ; then 
+    KINCFLAGS="-I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include -I$LINUX/include"
+else
+    KINCFLAGS='-I$(top_srcdir)/include -I$(top_srcdir)/portals/include'
+fi
+CPPFLAGS="$KINCFLAGS $ARCHCPPFLAGS"
+
+if test $host_cpu != "lib" ; then 
+# ------------ autoconf.h ------------------
+  AC_MSG_CHECKING(if autoconf.h is in kernel source)
+  if test -f $LINUX/include/linux/autoconf.h ; then
+      AC_MSG_RESULT(yes)
+  else
+      AC_MSG_ERROR(** cannot find $LINUX/include/linux/autoconf.h. Run make config in $LINUX.)
+  fi
+
+# ------------ RELEASE and moduledir ------------------
+  AC_MSG_CHECKING(for Linux release)
+  
+  dnl We need to rid ourselves of the nasty [ ] quotes.
+  changequote(, )
+  dnl Get release from version.h
+  RELEASE="`sed -ne 's/.*UTS_RELEASE[ \"]*\([0-9.a-zA-Z_-]*\).*/\1/p' $LINUX/include/linux/version.h`"
+  changequote([, ])
+  
+  moduledir='$(libdir)/modules/'$RELEASE/kernel
+  AC_SUBST(moduledir)
+  
+  modulefsdir='$(moduledir)/fs/$(PACKAGE)'
+  AC_SUBST(modulefsdir)
+  
+  AC_MSG_RESULT($RELEASE)
+  AC_SUBST(RELEASE)
+
+# ---------- modversions? --------------------
+  AC_MSG_CHECKING(for MODVERSIONS)
+  if egrep -e 'MODVERSIONS.*1' $LINUX/include/linux/autoconf.h >/dev/null 2>&1;
+  then
+        MFLAGS="-DMODULE -DMODVERSIONS -include $LINUX/include/linux/modversions.h -DEXPORT_SYMTAB"
+        AC_MSG_RESULT(yes)
+  else
+        MFLAGS=
+        AC_MSG_RESULT(no)
+  fi
+fi
+
+# ---------- Portals flags --------------------
+
+#AC_PREFIX_DEFAULT([])
+#if test "x$prefix" = xNONE || test "x$prefix" = x; then
+#  usrprefix=/usr
+#else
+#  usrprefix='${prefix}'
+#fi
+#AC_SUBST(usrprefix)
+
+AC_MSG_CHECKING(if kernel has CPU affinity support)
+if test "$target_cpu" != ia64 ; then
+  enable_affinity_temp="-DCPU_AFFINITY=1"
+  AC_MSG_RESULT(yes)
+else
+  enable_affinity_temp=""
+  AC_MSG_RESULT(no)
+fi
+
+AC_MSG_CHECKING(if kernel has zero-copy TCP support)
+ZCCD="`grep -c zccd $LINUX/include/linux/skbuff.h`"
+if test "$ZCCD" != 0 ; then
+  enable_zerocopy_temp="-DSOCKNAL_ZC=1"
+  AC_MSG_RESULT(yes)
+else
+  enable_zerocopy_temp=""
+  AC_MSG_RESULT(no)
+fi
+
+AC_ARG_ENABLE(zerocopy, [  --enable-zerocopy enable socknal zerocopy],enable_zerocopy=$enable_zerocopy_temp, enable_zerocopy="")
+
+AC_ARG_ENABLE(affinity, [  --enable-affinity enable process/irq affinity],enable_affinity="-DCPU_AFFINITY=1", enable_affinity=$enable_affinity_temp)
+#####################################
+
+AC_MSG_CHECKING(if quadrics kernel headers are present)
+if test -d $LINUX/drivers/net/qsnet ; then
+  AC_MSG_RESULT(yes)
+  QSWNAL="qswnal"
+  with_quadrics="-I$LINUX/drivers/net/qsnet/include"
+  :
+elif test -d $LINUX/drivers/qsnet1 ; then
+  AC_MSG_RESULT(yes)
+  QSWNAL="qswnal"
+  with_quadrics="-I$LINUX/drivers/qsnet1/include -DPROPRIETARY_ELAN"
+  :
+elif test -d $LINUX/drivers/quadrics ; then
+  AC_MSG_RESULT(yes)
+  QSWNAL="qswnal"
+  with_quadrics="-I$LINUX/drivers/quadrics/include -DPROPRIETARY_ELAN"
+  :
+#elif test -d /usr/include/elan3 ; then
+#  AC_MSG_RESULT(yes)
+#  QSWNAL="qswnal"
+#  with_quadrics=""
+#  :
+else
+  AC_MSG_RESULT(no)
+  QSWNAL=""
+  with_quadrics=""
+  :
+fi
+AC_SUBST(with_quadrics)
+AC_SUBST(QSWNAL)
+
+# R. Read 5/02
+GMNAL=""
+echo "checking with-gm=" ${with_gm}
+if test "${with_gm+set}" = set; then
+  if test "${with_gm}" = yes; then
+    with_gm="-I/usr/local/gm/include"
+  else
+    with_gm=-I"$with_gm/include"
+  fi
+  GMNAL="gmnal"
+else
+# default case - no GM
+  with_gm=""
+fi
+AC_SUBST(with_gm)
+AC_SUBST(GMNAL)
+
+
+def_scamac=/opt/scali/include
+AC_ARG_WITH(scamac, [  --with-scamac=[yes/no/path] Path to ScaMAC includes (default=/opt/scali/include)], with_scamac=$withval, with_scamac=$def_scamac)
+AC_MSG_CHECKING(if ScaMAC headers are present)
+if test "$with_scamac" = yes; then
+  with_scamac=$def_scamac
+fi
+if test "$with_scamac" != no -a -f ${with_scamac}/scamac.h; then
+  AC_MSG_RESULT(yes)
+  SCIMACNAL="scimacnal"
+  with_scamac="-I${with_scamac} -I${with_scamac}/icm"
+else
+  AC_MSG_RESULT(no)
+  SCIMACNAL=""
+  with_scamac=""
+fi
+
+AC_SUBST(with_scamac)
+AC_SUBST(SCIMACNAL)
+
+CFLAGS="$KCFLAGS"
+CPPFLAGS="$KINCFLAGS $KCPPFLAGS $MFLAGS $enable_zerocopy $enable_affinity $with_quadrics $with_gm $with_scamac "
+
+AC_SUBST(MOD_LINK)
+AC_SUBST(LINUX25)
+AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib)
+
+# ---------- Red Hat 2.4.20 backports some 2.5 bits --------
+# This needs to run after we've defined the KCPPFLAGS
+
+AC_MSG_CHECKING(for kernel version)
+AC_TRY_LINK([#define __KERNEL__
+             #include <linux/sched.h>],
+            [struct task_struct p;
+             p.sighand = NULL;],
+            [RH_2_4_20=1],
+            [RH_2_4_20=0])
+
+if test $RH_2_4_20 = 1; then
+       AC_MSG_RESULT(redhat-2.4.20)
+       CPPFLAGS="$CPPFLAGS -DCONFIG_RH_2_4_20"
+else
+       AC_MSG_RESULT($RELEASE)
+fi 
diff --git a/lustre/portals/autogen.sh b/lustre/portals/autogen.sh
new file mode 100755 (executable)
index 0000000..9deed73
--- /dev/null
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+aclocal &&
+automake --add-missing &&
+${AUTOCONF:-autoconf}
diff --git a/lustre/portals/build.m4 b/lustre/portals/build.m4
new file mode 100644 (file)
index 0000000..025f243
--- /dev/null
@@ -0,0 +1,95 @@
+# ----------  other tests and settings ---------
+
+
+# ---------  unsigned long long sane? -------
+
+AC_CHECK_SIZEOF(unsigned long long, 0)
+echo "---> size SIZEOF $SIZEOF_unsigned_long_long"
+echo "---> size SIZEOF $ac_cv_sizeof_unsigned_long_long"
+if test $ac_cv_sizeof_unsigned_long_long != 8 ; then
+        AC_MSG_ERROR([** we assume that sizeof(long long) == 8.  Tell phil@clusterfs.com])
+fi
+
+# directories for binaries
+ac_default_prefix=
+bindir='${exec_prefix}/usr/bin'
+sbindir='${exec_prefix}/usr/sbin'
+includedir='${prefix}/usr/include'
+
+# Directories for documentation and demos.
+docdir='${prefix}/usr/share/doc/$(PACKAGE)'
+AC_SUBST(docdir)
+demodir='$(docdir)/demo'
+AC_SUBST(demodir)
+pkgexampledir='${prefix}/usr/lib/$(PACKAGE)/examples'
+AC_SUBST(pkgexampledir)
+pymoddir='${prefix}/usr/lib/${PACKAGE}/python/Lustre'
+AC_SUBST(pymoddir)
+modulenetdir='$(moduledir)/net/$(PACKAGE)'
+AC_SUBST(modulenetdir)
+
+
+# ----------  BAD gcc? ------------
+AC_PROG_RANLIB
+AC_PROG_CC
+AC_MSG_CHECKING(for buggy compiler)
+CC_VERSION=`$CC -v 2>&1 | grep "^gcc version"`
+bad_cc() {
+       echo
+       echo "   '$CC_VERSION'"
+       echo "  has been known to generate bad code, "
+       echo "  please get an updated compiler."
+       AC_MSG_ERROR(sorry)
+}
+TMP_VERSION=`echo $CC_VERSION | cut -c 1-16`
+if test "$TMP_VERSION" = "gcc version 2.95"; then
+        bad_cc
+fi
+case "$CC_VERSION" in 
+       # ost_pack_niobuf putting 64bit NTOH temporaries on the stack
+       # without "sub    $0xc,%esp" to protect the stack from being
+       # stomped on by interrupts (bug 606)
+       "gcc version 2.96 20000731 (Red Hat Linux 7.1 2.96-98)")
+               bad_cc
+               ;;
+       # mandrake's similar sub 0xc compiler bug
+       # http://marc.theaimsgroup.com/?l=linux-kernel&m=104748366226348&w=2
+       "gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)")
+               bad_cc
+               ;;
+       *)
+               AC_MSG_RESULT(no known problems)
+               ;;
+esac
+# end ------  BAD gcc? ------------
+
+# --------  Check for required packages  --------------
+
+# this doesn't seem to work on older autoconf
+# AC_CHECK_LIB(readline, readline,,)
+AC_ARG_ENABLE(readline,        [  --enable-readline  use readline library],,
+                       enable_readline="yes")
+if test "$enable_readline" = "yes" ; then
+   LIBREADLINE="-lreadline -lncurses"
+   HAVE_LIBREADLINE="-DHAVE_LIBREADLINE=1"
+else 
+   LIBREADLINE=""
+   HAVE_LIBREADLINE=""
+fi
+AC_SUBST(LIBREADLINE)
+AC_SUBST(HAVE_LIBREADLINE)
+
+AC_ARG_ENABLE(efence,  [  --enable-efence  use efence library],,
+                       enable_efence="no")
+if test "$enable_efence" = "yes" ; then
+   LIBEFENCE="-lefence"
+   HAVE_LIBEFENCE="-DHAVE_LIBEFENCE=1"
+else 
+   LIBEFENCE=""
+   HAVE_LIBEFENCE=""
+fi
+AC_SUBST(LIBEFENCE)
+AC_SUBST(HAVE_LIBEFENCE)
+
diff --git a/lustre/portals/configure.in b/lustre/portals/configure.in
new file mode 100644 (file)
index 0000000..31d3492
--- /dev/null
@@ -0,0 +1,34 @@
+# This version is here to make autoconf happy; the name is a file which is
+# "unique" to this directory so that configure knows where it should run.
+AC_INIT(knals/Makefile.am, 3.0)
+AC_CANONICAL_SYSTEM
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+# Automake variables.  Steal the version number from the IVERSION define in libcfs/module.c
+AM_INIT_AUTOMAKE(portals, builtin([esyscmd], [sed -ne '/.*define IVERSION /{ s/.*IVERSION //; p; }' libcfs/module.c]))
+# AM_MAINTAINER_MODE
+
+sinclude(build.m4)
+sinclude(archdep.m4)
+
+if test x$enable_inkernel = xyes ; then
+cp Kernelenv.mk Kernelenv.in
+cp Makefile.mk Makefile.in
+cp libcfs/Makefile.mk libcfs/Makefile.in
+cp portals/Makefile.mk portals/Makefile.in
+cp knals/Makefile.mk knals/Makefile.in
+cp knals/socknal/Makefile.mk knals/socknal/Makefile.in
+cp router/Makefile.mk router/Makefile.in
+fi
+
+AM_CONFIG_HEADER(include/config.h)
+
+AC_OUTPUT([Makefile Kernelenv libcfs/Makefile portals/Makefile \
+          unals/Makefile knals/Makefile router/Makefile \
+         knals/socknal/Makefile knals/gmnal/Makefile knals/qswnal/Makefile \
+         knals/scimacnal/Makefile knals/toenal/Makefile \
+          utils/Makefile tests/Makefile doc/Makefile ])
+
diff --git a/lustre/portals/doc/.cvsignore b/lustre/portals/doc/.cvsignore
new file mode 100644 (file)
index 0000000..827dca4
--- /dev/null
@@ -0,0 +1,4 @@
+Makefile
+Makefile.in
+*.eps
+*.pdf
diff --git a/lustre/portals/doc/Data-structures b/lustre/portals/doc/Data-structures
new file mode 100644 (file)
index 0000000..b5532b1
--- /dev/null
@@ -0,0 +1,65 @@
+In this document I will try to draw the data structures and how they
+interrelate in the Portals 3 reference implementation.  It is probably
+best shown with a drawing, so there may be an additional xfig or
+Postscript figure.
+
+
+MEMORY POOLS:
+------------
+
+First, a digression on memory allocation in the library.  As mentioned
+in the NAL Writer's Guide, the library does not link against any
+standard C libraries and as such is unable to dynamically allocate
+memory on its own.  It requires that the NAL implement a method
+for allocation that is appropriate for the protection domain in
+which the library lives.  This allocation callback is only invoked when
+a network interface is initialized, to allocate the Portals object pools.
+
+These pools are preallocated blocks of objects that the library
+can rapidly make active and manage with a minimum of overhead.
+This also cuts down on the overhead of setting up structures,
+since the NAL->malloc() callback does not need to be called
+for each object.
+
+The objects are maintained on a per-object-type singly linked free
+list and contain a pointer to the next free object.  This pointer
+is NULL if the object is not on the free list and non-NULL if it is.
+The special sentinel value 0xDEADBEEF marks the end of the free
+list, since a NULL next pointer on the last free object would make
+it indistinguishable from an object that is not free.
+
+When one of the lib_*_alloc() functions is called, the library
+returns the head of the free list and advances the head pointer
+to the next item on the list.  The special case of 0xDEADBEEF is
+checked and a NULL pointer is returned if there are no more
+objects of this type available.   The lib_*_free() functions
+are even simpler -- check to ensure that the object is not already
+free, set its next pointer to the current head and then set
+the head to be this newly freed object.
+
+Since C does not have templates, I did the next best thing and wrote
+the memory pool allocation code as a macro that expands based on the
+type of the argument.  The mk_alloc(T) macro expands to
+write the _lib_T_alloc() and lib_T_free() functions.
+It requires that the object have a pointer of the type T named
+"next_free".  There are also functions that map _lib_T_alloc()
+to lib_T_alloc() so that the library can add some extra
+functionality to the T constructor.
+
+
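+As a rough illustration, here is a minimal, self-contained user-space
+sketch of the scheme described above.  The names (obj_t, pool_t,
+pool_alloc() and so on) are made up for this example and are not the
+identifiers the library actually uses.
+
+        #include <stdlib.h>
+
+        #define FREE_LIST_END ((void *) 0xDEADBEEF)  /* end-of-list sentinel */
+
+        typedef struct obj {
+                struct obj *next_free;  /* NULL when allocated, non-NULL when free */
+                int         payload;
+        } obj_t;
+
+        typedef struct {
+                obj_t *head;            /* first free object, or FREE_LIST_END */
+                obj_t *block;           /* the one preallocated block */
+        } pool_t;
+
+        /* Carve a single preallocated block into a singly linked free list. */
+        static int pool_init(pool_t *p, int nobj)
+        {
+                int i;
+
+                p->block = malloc(nobj * sizeof(obj_t));
+                if (p->block == NULL)
+                        return -1;
+                for (i = 0; i < nobj - 1; i++)
+                        p->block[i].next_free = &p->block[i + 1];
+                p->block[nobj - 1].next_free = FREE_LIST_END;
+                p->head = p->block;
+                return 0;
+        }
+
+        static obj_t *pool_alloc(pool_t *p)
+        {
+                obj_t *o = p->head;
+
+                if ((void *) o == FREE_LIST_END)
+                        return NULL;            /* no more objects available */
+                p->head = o->next_free;
+                o->next_free = NULL;            /* mark as allocated */
+                return o;
+        }
+
+        static void pool_free(pool_t *p, obj_t *o)
+        {
+                if (o->next_free != NULL)       /* already free -- ignore */
+                        return;
+                o->next_free = p->head;         /* may be FREE_LIST_END */
+                p->head = o;
+        }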
+
+LINKED LISTS:
+------------
+
+Many of the active Portals objects are stored in doubly linked lists
+when they are active.  These are always implemented with the pointer
+to the next object and a pointer to the next pointer of the
+previous object.  This avoids the "dummy head" object or
+special cases for inserting at the beginning or end of the list.
+The pointer manipulations are a little hairy at times, but
+I hope that they are understandable.
+
+The actual linked list code is implemented as macros in <lib-p30.h>,
+although the object has to know about 
+
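+The idea is easier to see in code.  The sketch below is not the actual
+macro text from <lib-p30.h>; it is just a hand-written equivalent of the
+technique, with illustrative names.
+
+        typedef struct node {
+                struct node  *next;
+                struct node **prevp;    /* address of the previous node's next
+                                         * pointer, or of the list head itself */
+        } node_t;
+
+        /* Insert n at the head of the list whose head pointer is *headp. */
+        static void node_insert(node_t **headp, node_t *n)
+        {
+                n->next  = *headp;
+                n->prevp = headp;
+                if (*headp != NULL)
+                        (*headp)->prevp = &n->next;
+                *headp = n;
+        }
+
+        /* Remove n from whatever list it is on; no dummy head object and no
+         * special case for the first or last element is needed. */
+        static void node_remove(node_t *n)
+        {
+                *n->prevp = n->next;
+                if (n->next != NULL)
+                        n->next->prevp = n->prevp;
+        }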
+
diff --git a/lustre/portals/doc/Makefile.am b/lustre/portals/doc/Makefile.am
new file mode 100644 (file)
index 0000000..7c65e6c
--- /dev/null
@@ -0,0 +1,46 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+LYX2PDF = lyx --export pdf
+LYX2TXT = lyx --export text
+LYX2HTML = lyx --export html
+SUFFIXES = .lin .lyx .pdf .sgml .html .txt .fig .eps
+
+DOCS = portals3.pdf 
+IMAGES = file.eps flow_new.eps get.eps mpi.eps portals.eps put.eps
+LYXFILES= portals3.lyx
+
+MAINTAINERCLEANFILES =  $(IMAGES) $(DOCS) $(GENERATED)
+GENERATED = 
+EXTRA_DIST = $(DOCS) $(IMAGES) $(LYXFILES) 
+
+all: $(DOCS)
+
+# update date and version in document
+date := $(shell date +%x)
+tag := $(shell echo '$$Name:  $$' | sed -e 's/^\$$Na''me: *\$$$$/HEAD/; s/^\$$Na''me: \(.*\) \$$$$/\1/')
+addversion = sed -e 's|@T''AG@|$(tag)|g; s|@VER''SION@|$(VERSION)|g; s|@DA''TE@|$(date)|g'
+
+# Regenerate when the $(VERSION) or $Name:  $ changes.
+.INTERMEDIATE: $(GENERATED)
+$(GENERATED) : %.lyx: %.lin Makefile
+       $(addversion) $< > $@
+
+.lyx.pdf:
+       @$(LYX2PDF) $< || printf "\n*** Warning: not creating PDF docs; install lyx to rectify this\n"
+
+.lyx.txt:
+       @$(LYX2TXT) $< || printf "\n*** Warning: not creating text docs; install lyx to rectify this\n"
+.lyx.html:
+       @$(LYX2HTML) $< || printf "\n*** Warning: not creating HTML docs; install lyx to rectify this\n"
+.fig.eps:
+       -fig2dev -L eps $< > $@
+
+portals3.pdf portals3.txt portals3.html: $(IMAGES) portals3.lyx
+
+syncweb: portals3.pdf
+#      cp lustre.pdf /usr/src/www/content/lustre/docs/lustre.pdf
+#      ( cd /usr/src/www ; make lustre ; make synclustre )
+
diff --git a/lustre/portals/doc/Message-life-cycle b/lustre/portals/doc/Message-life-cycle
new file mode 100644 (file)
index 0000000..e8cc7e2
--- /dev/null
@@ -0,0 +1,118 @@
+This documents the life cycle of a message as it arrives and is handled by
+a basic async, packetized NAL.  There are four types of messages that have
+slightly different life cycles, so they are addressed independently.
+
+
+Put request
+-----------
+
+1.  The NAL notices that there is an incoming message header on the network
+and reads a ptl_hdr_t in from the wire.
+
+2.  It may store additional NAL specific data that provides context
+for this event in a void* that it will interpret in some fashion
+later.
+
+3.  The NAL calls lib_parse() with a pointer to the header and its
+private data structure.
+
+4.  The library decodes the header and may build a message state
+object that describes the event to be written and the ACK to be
+sent, if any.  It then calls nal->recv() with the private data
+that the NAL passed in, a pointer to the message state object
+and a translated user address.
+
+       The NAL will have been given a chance to pretranslate
+       all user addresses when the buffers are created.  This
+       process is described in the NAL-HOWTO.
+
+5.  The NAL should restore whatever context it required from the
+private data pointer, begin receiving the bytes and possibly store
+some extra state of its own.  It should return at this point.
+
+
+
+Get request
+-----------
+
+1.  As with a Put, the NAL notices the incoming message header and
+passes it to lib_parse().
+
+2.  The library decodes the header and calls nal->recv() with a
+zero byte length, offset and destination to instruct it to clean
+up the wire after reading the header.  The private data will
+be passed in as well, allowing the NAL to retrieve any state
+or context that it requires.
+
+3.  The library may build a message state object to possibly
+write an event log or invalidate a memory region.
+
+4.  The library will build a ptl_msg_t header that specifies the
+Portals protocol information for delivery at the remote end.
+
+5.  The library calls nal->send() with the pre-built header,
+the optional message state object, the four part address
+component, a translated user pointer + offset, and some
+other things.
+
+6.  The NAL should either put the header on the wire or copy it at
+this point (since it is on the stack and will not survive the
+return).  It should store some
+amount of state about its current position in the message and
+the destination address.
+
+7.  The NAL then returns to the library.
+
+
+Reply request
+-------------
+
+1.  Starting at "The library decodes the header..."
+
+2.  The library decodes the header and calls nal->recv()
+to bring in the rest of the message.  Flow continues in
+exactly the same fashion as with all other receives.
+
+
+Ack request
+-----------
+
+1.  The library decodes the header, builds the appropriate data
+structures for the event in a message state object and calls nal->recv()
+with a zero byte length, etc.
+
+
+Packet arrival
+--------------
+
+1.  The NAL should notice the arrival of a packet, retrieve whatever
+state it needs from the message ID or other NAL specific header data
+and place the data bytes directly into the user address that was
+given to nal->recv().
+
+       How this happens is outside the scope of the Portals library
+	and solely determined by the NAL...
+
+2.  If this is the last packet in a message, the NAL should retrieve
+the lib_msg_t *cookie that it was given in the call to nal->recv()
+and pass it to lib_finalize().  lib_finalize() may call nal->send()
+to send an ACK, nal->write() to record an entry in the event log,
+nal->invalidate() to unregister a region of memory or do nothing at all.
+
+3.  It should then clean up any remaining NAL specific state about
+the message and go back into the main loop.
+
+
+Outgoing packets
+----------------
+
+1.  When the NAL has pending output, it should put the packets on
+the wire wrapped with whatever implementation-specific wrappers it uses.
+
+2.  Once it has output all the packets of a message it should
+call lib_finalize() with the message state object that was
+handed to nal->send().  This allows the library to clean
+up its state regarding the message and write any pending event
+entries.
+
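+To tie the pieces above together, here is a very rough sketch of what
+the main loop of such a NAL might look like.  Everything prefixed with
+my_ is a hypothetical helper of this example; lib_parse() and
+lib_finalize() are the real library entry points, but their exact
+prototypes (and the include path) should be taken from the library
+headers, not from this sketch.
+
+        #include <portals/lib-p30.h>    /* assumed include path */
+
+        /* Hypothetical per-event descriptor produced by this NAL's wire
+         * and send machinery; nothing here is dictated by the library. */
+        typedef struct {
+                int         type;
+                ptl_hdr_t   hdr;        /* valid for MY_EV_NEW_HEADER */
+                void       *private;    /* NAL-specific context       */
+                lib_msg_t  *cookie;     /* valid for the completions  */
+        } my_event_t;
+
+        enum { MY_EV_NEW_HEADER, MY_EV_LAST_PACKET, MY_EV_SEND_DONE };
+
+        /* hypothetical: blocks until the wire or the send path has news */
+        extern void my_wait_for_event(my_event_t *ev);
+
+        void my_nal_event_loop(nal_cb_t *nal)
+        {
+                for (;;) {
+                        my_event_t ev;
+
+                        my_wait_for_event(&ev);
+
+                        switch (ev.type) {
+                        case MY_EV_NEW_HEADER:
+                                /* Put/Get/Reply/Ack, steps 1-3: a header has
+                                 * arrived; hand it to the library along with
+                                 * our own context as the private pointer. */
+                                lib_parse(nal, &ev.hdr, ev.private);
+                                break;
+                        case MY_EV_LAST_PACKET:
+                                /* "Packet arrival", step 2: the message that
+                                 * nal->recv() started is now complete. */
+                                lib_finalize(nal, ev.private, ev.cookie);
+                                break;
+                        case MY_EV_SEND_DONE:
+                                /* "Outgoing packets", step 2: every packet of
+                                 * an outgoing message has hit the wire. */
+                                lib_finalize(nal, ev.private, ev.cookie);
+                                break;
+                        }
+                }
+        }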
+
+
diff --git a/lustre/portals/doc/NAL-HOWTO b/lustre/portals/doc/NAL-HOWTO
new file mode 100644 (file)
index 0000000..ea38aed
--- /dev/null
@@ -0,0 +1,293 @@
+This document is a first attempt at describing how to write a NAL
+for the Portals 3 library.  It also defines the library architecture
+and the abstraction of protection domains.
+
+
+First, an overview of the architecture:
+
+    Application
+
+----|----+--------
+         |
+   API  === NAL        (User space)
+         |   
+---------+---|-----
+         |    
+   LIB  === NAL        (Library space)
+         |
+---------+---|-----
+          
+    Physical wire      (NIC space)
+          
+
+Application
+    API
+API-side NAL
+------------
+LIB-side NAL
+    LIB
+LIB-side NAL
+   wire
+
+Communication is through the indicated paths via well defined
+interfaces.  The API and LIB portions are written to be portable
+across platforms and do not depend on the network interface.
+
+Communication between the application and the API code is
+defined in the Portals 3 API specification.  This is the
+user-visible portion of the interface and should be the most
+stable.
+
+
+
+API-side NAL:
+------------
+
+The user space NAL needs to implement only a few functions
+that are stored in a nal_t data structure and called by the
+API-side library:
+
+       int forward( nal_t *nal,
+               int     index,
+               void    *args,
+               size_t  arg_len,
+               void    *ret,
+               size_t  ret_len
+       );
+
+Most of the data structures in the portals library are held in
+the LIB section of the code, so it is necessary to forward API
+calls across the protection domain to the library.  This is
+handled by the NAL's forward method.  Once the argument and return
+blocks are on the remote side the NAL should call lib_dispatch()
+to invoke the appropriate API function.
+
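+As an example, the degenerate case where the two halves share an address
+space needs no copying at all.  The sketch below is illustrative only:
+my_forward() and my_lib_nal are inventions of this example, the include
+paths are assumptions, and so is the use of 0 as the success value.
+
+        #include <portals/api.h>        /* nal_t -- assumed path        */
+        #include <portals/lib-p30.h>    /* nal_cb_t, lib_dispatch()     */
+
+        static nal_cb_t *my_lib_nal;    /* set up at interface init time */
+
+        static int my_forward(nal_t *nal, int index,
+                              void *args, size_t arg_len,
+                              void *ret, size_t ret_len)
+        {
+                /* Both blocks already live in the library's protection
+                 * domain, so they can be handed straight through: no copy
+                 * and no size negotiation is needed in this case. */
+                lib_dispatch(my_lib_nal, NULL, index, args, ret);
+
+                (void) nal; (void) arg_len; (void) ret_len;
+                return 0;               /* assumed to mean success */
+        }
+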
+       int validate( nal_t *nal,
+               void    *base,
+               size_t  extent,
+               void    **trans_base,
+               void    **trans_data
+       );
+
+The validate method provides a means for the NAL to prevalidate
+and possibly pretranslate user addresses into a form suitable
+for fast use by the network card or kernel module.  The trans_base
+pointer will be used by the library every time it needs to
+refer to the block of memory.  The trans_data result is a
+cookie that will be handed to the NAL along with the trans_base.
+
+The library never performs calculations on the trans_base value;
+it only computes offsets that are then handed to the NAL.
+
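+For a NAL that needs no translation at all (for instance one whose two
+halves share an address space), validate() can be close to a no-op.
+The sketch below assumes nal_t is available from the API headers and
+that 0 means success; both are assumptions of this example.
+
+        static int my_validate(nal_t *nal, void *base, size_t extent,
+                               void **trans_base, void **trans_data)
+        {
+                /* No pinning and no translation: the "translated" base is
+                 * just the user address, and no per-buffer cookie is
+                 * needed. */
+                *trans_base = base;
+                *trans_data = NULL;
+
+                (void) nal; (void) extent;
+                return 0;               /* assumed to mean success */
+        }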
+
+       int shutdown( nal_t *nal, int interface );
+
+Brings down the network interface.  The remote NAL side should
+call lib_fini() to bring down the library side of the network.
+
+       void yield( nal_t *nal );
+
+This allows the user application to gracefully give up the processor
+while busy waiting.  Performance critical applications may not
+want to take the time to call this function, so it should be an
+option to the PtlEQWait call.  Right now it is not implemented as such.
+
+Lastly, the NAL must implement a function named PTL_IFACE_*, where
+* is the name of the NAL such as PTL_IFACE_IP or PTL_IFACE_MYR.
+This initialization function is to set up communication with the
+library-side NAL, which should call lib_init() to bring up the
+network interface.
+
+
+
+LIB-side NAL:
+------------
+
+On the library side, the NAL has much more responsibility.  It
+is responsible for calling lib_dispatch() on behalf of the user, and
+it is also responsible for bringing packets off the wire and
+pushing bits out.  As on the user side, the methods are stored
+in a nal_cb_t structure that is defined on a per-network-interface
+basis.
+
+The calls to lib_dispatch() need to be examined.  The prototype:
+
+       void    lib_dispatch(
+                       nal_cb_t                *nal,
+                       void                    *private,
+                       int                     index,
+                       void                    *arg_block,
+                       void                    *ret_block
+       );
+
+has two complications.  The private field is a NAL-specific
+value that will be passed to any callbacks produced as a result
+of this API call.  Kernel module implementations may use this
+for task structures, or perhaps network card data.  It is ignored
+by the library.
+
+Secondly, the arg_block and ret_block must be in the same protection
+domain as the library.  The NAL's two halves must communicate the
+sizes and perform the copies.  After the call, the buffer pointed
+to by ret_block will be filled in and should be copied back to
+the user space.  How this is to be done is NAL specific.
+
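+For a kernel-module NAL, that usually means copying the blocks across
+the user/kernel boundary around the lib_dispatch() call.  The sketch
+below is schematic: my_request_t, my_dispatch_request() and the way the
+request reaches the kernel (an ioctl, a device write, ...) are all
+inventions of this example, and error handling is pared to the minimum.
+
+        #include <linux/slab.h>         /* kmalloc()/kfree()            */
+        #include <linux/errno.h>
+        #include <asm/uaccess.h>        /* copy_{from,to}_user()        */
+        #include <portals/lib-p30.h>    /* nal_cb_t, lib_dispatch()     */
+
+        /* hypothetical request as it reaches the kernel half of the NAL */
+        typedef struct {
+                int     index;          /* API call number              */
+                void   *uargs;          /* user-space argument block    */
+                size_t  arg_len;
+                void   *uret;           /* user-space return block      */
+                size_t  ret_len;
+        } my_request_t;
+
+        static int my_dispatch_request(nal_cb_t *nal, my_request_t *req)
+        {
+                void *args = kmalloc(req->arg_len, GFP_KERNEL);
+                void *ret  = kmalloc(req->ret_len, GFP_KERNEL);
+                int   rc   = -ENOMEM;
+
+                if (args == NULL || ret == NULL)
+                        goto out;
+
+                rc = -EFAULT;
+                if (copy_from_user(args, req->uargs, req->arg_len))
+                        goto out;
+
+                /* private == NULL: nothing for later callbacks to recover */
+                lib_dispatch(nal, NULL, req->index, args, ret);
+
+                if (copy_to_user(req->uret, ret, req->ret_len))
+                        goto out;
+                rc = 0;
+        out:
+                kfree(args);            /* kfree(NULL) is harmless */
+                kfree(ret);
+                return rc;
+        }
+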
+       int lib_parse(
+                       nal_cb_t                *nal,
+                       ptl_hdr_t               *hdr,
+                       void                    *private
+       );
+
+This is the only other entry point into the library from the NAL.
+When the NAL detects an incoming message on the wire it should read
+sizeof(ptl_hdr_t) bytes and pass a pointer to the header to
+lib_parse().  It may set private to be anything that it needs to
+tie the incoming message to callbacks that are made as a result
+of this event.
+
+The method calls are:
+
+       int     (*send)(
+                       nal_cb_t                *nal,
+                       void                    *private,
+                       lib_msg_t               *cookie,
+                       ptl_hdr_t               *hdr,
+                       int                     nid,
+                       int                     pid,
+                       int                     gid,
+                       int                     rid,
+                       user_ptr                trans_base,
+                       user_ptr                trans_data,
+                       size_t                  offset,
+                       size_t                  len
+       );
+
+This is a tricky function -- it must support async output
+of messages as well as properly synchronized event log writing.
+The private field is the same that was passed into lib_dispatch()
+or lib_parse() and may be used to tie this call to the event
+that initiated the entry to the library.
+
+The cookie is a pointer to a library private value that must
+be passed to lib_finalize() once the message has been completely
+sent.  It should not be examined by the NAL for any meaning.
+
+The four ID fields are passed in, although some implementations
+may not use all of them.
+
+The single base pointer has been replaced with the translated
+address that the API NAL generated in the api_nal->validate()
+call.  The trans_data is unchanged and the offset is in bytes.
+
+
+       int     (*recv)(
+                       nal_cb_t                *nal,
+                       void                    *private,
+                       lib_msg_t               *cookie,
+                       user_ptr                trans_base,
+                       user_ptr                trans_data,
+                       size_t                  offset,
+                       size_t                  mlen,
+                       size_t                  rlen
+       );
+
+This callback will only be called in response to lib_parse().
+The cookie, trans_base and trans_data are as discussed in send().
+The NAL should read mlen bytes from the wire, deposit them into
+trans_base + offset and then discard (rlen - mlen) bytes.
+Once the entire message has been received the NAL should call
+lib_finalize() with the lib_msg_t *cookie.
+
+The special argument combination base=NULL, data=NULL, offset=0, mlen=0,
+rlen=0 is used to indicate that the NAL should clean up the wire.  This could
+be implemented as a blocking call, although having it return as quickly
+as possible is desirable.
+
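+For a NAL that can afford to receive synchronously, the callback can be
+as simple as the sketch below.  my_wire_read() and my_wire_discard() are
+hypothetical helpers of this example, user_ptr is assumed to be a plain
+pointer in library space, and the lib_finalize() call shown assumes a
+prototype -- check the library headers for the real one.
+
+        /* hypothetical helpers: pull bytes off / drop bytes from the wire
+         * for the connection identified by the private pointer */
+        extern void my_wire_read(void *private, void *dest, size_t len);
+        extern void my_wire_discard(void *private, size_t len);
+
+        static int my_recv(nal_cb_t *nal, void *private, lib_msg_t *cookie,
+                           user_ptr trans_base, user_ptr trans_data,
+                           size_t offset, size_t mlen, size_t rlen)
+        {
+                char *dest = (char *) trans_base + offset;
+
+                if (mlen != 0)
+                        my_wire_read(private, dest, mlen);
+                if (rlen > mlen)
+                        my_wire_discard(private, rlen - mlen);
+
+                /* the whole message has now been consumed */
+                lib_finalize(nal, private, cookie);
+
+                (void) trans_data;
+                return 0;               /* assumed to mean success */
+        }
+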
+       int     (*write)(
+                       nal_cb_t                *nal,
+                       void                    *private,
+                       user_ptr                trans_addr,
+                       user_ptr                trans_data,
+                       size_t                  offset,
+
+                       void                    *src_addr,
+                       size_t                  len
+       );
+
+This is essentially a cross-protection domain memcpy().  The user address
+has been pretranslated by the api_nal->validate() call.
+
+       void    *(*malloc)(
+                       nal_cb_t                *nal,
+                       size_t                  len
+       );
+
+       void    (*free)(
+                       nal_cb_t                *nal,
+                       void                    *buf
+       );
+
+Since the NAL may be in a non-standard hosted environment, it cannot
+call malloc().  This allows the library-side NAL to implement the
+system-specific malloc().  In the current reference implementation the
+library only calls nal->malloc() when the network interface is
+initialized and then calls free() when it is brought down.  The library
+maintains its own pool of objects for allocation, so only one call to
+malloc() is made per object type.
+
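+In a Linux kernel module these two callbacks can simply wrap the kernel
+allocator; since the library only uses them at interface startup and
+shutdown, GFP_KERNEL is fine.  (Sketch only; my_malloc/my_free are
+illustrative names.)
+
+        #include <linux/slab.h>
+
+        static void *my_malloc(nal_cb_t *nal, size_t len)
+        {
+                (void) nal;
+                return kmalloc(len, GFP_KERNEL);
+        }
+
+        static void my_free(nal_cb_t *nal, void *buf)
+        {
+                (void) nal;
+                kfree(buf);
+        }
+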
+       void    (*invalidate)(
+                       nal_cb_t                *nal,
+                       user_ptr                trans_base,
+                       user_ptr                trans_data,
+                       size_t                  extent
+       );
+
+User addresses are validated/translated at the user-level API NAL
+method, which is likely to push them to this level.  Meanwhile,
+the library NAL will be notified when the library no longer
+needs the buffer.  Overlapped buffers are not detected by the
+library, so the NAL should ref count each page involved.
+
+Unfortunately we have a few bugs when the invalidate method is
+called.  It is still in progress...
+
+       void    (*printf)(
+                       nal_cb_t                *nal,
+                       const char              *fmt,
+                       ...
+       );
+
+As with malloc(), the library does not have any way to do printf()
+or printk() on its own.  It is not necessary for the NAL to implement
+this call, although omitting it will make debugging difficult.
+
+       void    (*cli)(
+                       nal_cb_t                *nal,
+                       unsigned long           *flags
+       );
+
+       void    (*sti)(
+                       nal_cb_t                *nal,
+                       unsigned long           *flags
+       );
+
+These are used by the library to mark critical sections.
+
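+In a kernel-module NAL one natural implementation is a single spinlock
+protecting the library state; the lock itself and its placement are
+choices of this sketch, not requirements of the library.
+
+        #include <linux/spinlock.h>
+
+        static spinlock_t my_lib_lock = SPIN_LOCK_UNLOCKED; /* 2.4-era initializer */
+
+        static void my_cli(nal_cb_t *nal, unsigned long *flags)
+        {
+                (void) nal;
+                spin_lock_irqsave(&my_lib_lock, *flags);
+        }
+
+        static void my_sti(nal_cb_t *nal, unsigned long *flags)
+        {
+                (void) nal;
+                spin_unlock_irqrestore(&my_lib_lock, *flags);
+        }
+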
+       int     (*gidrid2nidpid)(
+                       nal_cb_t                *nal,
+                       ptl_id_t                gid,
+                       ptl_id_t                rid,
+                       ptl_id_t                *nid,
+                       ptl_id_t                *pid
+       );
+
+
+       int     (*nidpid2gidrid)(
+                       nal_cb_t                *nal,
+                       ptl_id_t                nid,
+                       ptl_id_t                pid,
+                       ptl_id_t                *gid,
+                       ptl_id_t                *rid
+       );
+
+Rolf added these.  I haven't looked at how they have to work yet.
diff --git a/lustre/portals/doc/file.fig b/lustre/portals/doc/file.fig
new file mode 100644 (file)
index 0000000..914c294
--- /dev/null
@@ -0,0 +1,111 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 1200 750 1650 1050
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1650 1050 1650 750 1200 750 1200 1050 1650 1050
+4 1 0 100 0 0 10 0.0000 0 105 240 1425 952 FS0\001
+-6
+6 1200 2325 1650 2625
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1650 2625 1650 2325 1200 2325 1200 2625 1650 2625
+4 1 0 100 0 0 10 0.0000 0 105 240 1425 2527 FS3\001
+-6
+6 1200 1800 1650 2100
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1650 2100 1650 1800 1200 1800 1200 2100 1650 2100
+4 1 0 100 0 0 10 0.0000 0 105 240 1425 2002 FS2\001
+-6
+6 1200 1275 1650 1575
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1650 1575 1650 1275 1200 1275 1200 1575 1650 1575
+4 1 0 100 0 0 10 0.0000 0 105 240 1425 1477 FS1\001
+-6
+6 450 750 900 1200
+5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 750.000 450 1050 675 1125 900 1050
+1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 825 225 75 450 900 900 750
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        450 825 450 1050
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 1050 900 825
+-6
+6 450 2325 900 2775
+5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 2325.000 450 2625 675 2700 900 2625
+1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 2400 225 75 450 2475 900 2325
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        450 2400 450 2625
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 2625 900 2400
+-6
+6 450 1800 900 2250
+5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1800.000 450 2100 675 2175 900 2100
+1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1875 225 75 450 1950 900 1800
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        450 1875 450 2100
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 2100 900 1875
+-6
+6 450 1275 900 1725
+5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1275.000 450 1575 675 1650 900 1575
+1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1350 225 75 450 1425 900 1275
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        450 1350 450 1575
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 1575 900 1350
+-6
+6 2250 750 3450 2625
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 1200 3150 1200
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 1500 3150 1500
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 1800 3150 1800
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 2100 3150 2100
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2550 975 3150 975 3150 2625 2550 2625 2550 975
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2
+        2550 2400 3150 2400
+4 1 0 100 0 0 10 0.0000 0 135 1185 2850 900 Application Buffer\001
+-6
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 2400 2550 1350
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 1875 2550 1050
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 1425 2550 1950
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 900 2550 1650
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 900 1200 900
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 1425 1200 1425
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 1950 1200 1950
+2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2
+        900 2475 1200 2475
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 2025 2550 2250
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2
+       0 0 1.00 60.00 120.00
+       0 0 1.00 60.00 120.00
+        1650 2550 2550 2475
+2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5
+        1875 2850 1875 600 225 600 225 2850 1875 2850
+4 1 0 100 0 0 10 0.0000 0 105 1215 1050 525 Parallel File Server\001
diff --git a/lustre/portals/doc/flow_new.fig b/lustre/portals/doc/flow_new.fig
new file mode 100644 (file)
index 0000000..d828dea
--- /dev/null
@@ -0,0 +1,213 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 525 2175 1575 2925
+6 675 2287 1425 2812
+4 1 0 50 0 0 10 0.0000 4 105 255 1050 2437 MD\001
+4 1 0 50 0 0 10 0.0000 4 105 645 1050 2587 Exists and\001
+4 1 0 50 0 0 10 0.0000 4 135 555 1050 2737 Accepts?\001
+-6
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        1575 2550 1050 2175 525 2550 1050 2925 1575 2550
+-6
+6 3450 1275 4350 1725
+6 3600 1312 4200 1687
+4 1 0 100 0 0 10 0.0000 0 135 525 3900 1612 Message\001
+4 1 0 100 0 0 10 0.0000 0 105 465 3900 1462 Discard\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        3450 1275 4350 1275 4350 1725 3450 1725 3450 1275
+-6
+6 4650 1275 5550 1725
+6 4725 1312 5475 1687
+4 1 0 100 0 0 10 0.0000 0 135 735 5100 1612 Drop Count\001
+4 1 0 100 0 0 10 0.0000 0 105 630 5100 1462 Increment\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        4650 1275 5550 1275 5550 1725 4650 1725 4650 1275
+-6
+6 1350 525 2250 975
+6 1350 562 2250 937
+4 1 0 100 0 0 10 0.0000 0 135 795 1800 862 Match Entry\001
+4 1 0 100 0 0 10 0.0000 0 105 585 1800 712 Get Next\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1350 525 2250 525 2250 975 1350 975 1350 525
+-6
+6 525 1125 1575 1875
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        1575 1500 1050 1125 525 1500 1050 1875 1575 1500
+4 1 0 100 0 0 10 0.0000 0 105 465 1049 1552 Match?\001
+-6
+6 2340 1237 2940 1687
+6 2340 1237 2940 1687
+4 1 0 100 0 0 10 0.0000 0 105 345 2640 1387 More\001
+4 1 0 100 0 0 10 0.0000 0 105 405 2640 1537 Match\001
+4 1 0 100 0 0 10 0.0000 0 105 510 2640 1687 Entries?\001
+-6
+-6
+6 525 3225 1575 3975
+6 675 3375 1425 3750
+4 1 0 50 0 0 10 0.0000 4 105 255 1050 3525 MD\001
+4 1 0 50 0 0 10 0.0000 4 105 615 1050 3720 has room?\001
+-6
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
+        525 3600 1050 3225 1575 3600 1050 3975 525 3600
+-6
+6 3300 3375 4350 3825
+6 3300 3412 4350 3787
+4 1 0 50 0 0 10 0.0000 4 105 735 3825 3562 Unlink MD\001
+4 1 0 50 0 0 10 0.0000 4 135 945 3825 3712 & Match Entry\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        3300 3375 4350 3375 4350 3825 3300 3825 3300 3375
+-6
+6 1950 3225 3000 3975
+6 2250 3450 2700 3750
+4 1 0 50 0 0 10 0.0000 4 105 450 2475 3600 Unlink\001
+4 1 0 50 0 0 10 0.0000 4 105 315 2475 3750 full?\001
+-6
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        3000 3600 2475 3225 1950 3600 2475 3975 3000 3600
+-6
+6 3150 4500 4200 4950
+6 3150 4537 4200 4912
+4 1 0 50 0 0 10 0.0000 4 105 735 3675 4687 Unlink MD\001
+4 1 0 50 0 0 10 0.0000 4 135 945 3675 4837 & Match Entry\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        3150 4500 4200 4500 4200 4950 3150 4950 3150 4500
+-6
+6 600 4500 1500 4950
+6 675 4537 1425 4912
+4 1 0 50 0 0 10 0.0000 4 135 615 1050 4837 Operation\001
+4 1 0 50 0 0 10 0.0000 4 105 525 1050 4687 Perform\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        600 4500 1500 4500 1500 4950 600 4950 600 4500
+-6
+6 4650 4350 5700 5100
+6 4950 4537 5400 4912
+6 4950 4537 5400 4912
+4 1 0 50 0 0 10 0.0000 4 135 435 5175 4837 Queue?\001
+4 1 0 50 0 0 10 0.0000 4 105 360 5175 4687 Event\001
+-6
+-6
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        5700 4725 5175 4350 4650 4725 5175 5100 5700 4725
+-6
+6 6000 4500 6900 4950
+6 6225 4575 6675 4875
+4 1 0 50 0 0 10 0.0000 4 105 360 6450 4875 Event\001
+4 1 0 50 0 0 10 0.0000 4 105 435 6450 4725 Record\001
+-6
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        6000 4500 6900 4500 6900 4950 6000 4950 6000 4500
+-6
+6 1800 4350 2850 5100
+6 2100 4575 2550 4875
+4 1 0 50 0 0 10 0.0000 4 105 450 2325 4725 Unlink\001
+4 1 0 50 0 0 10 0.0000 4 105 450 2325 4875 thresh?\001
+-6
+2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5
+        2850 4725 2325 4350 1800 4725 2325 5100 2850 4725
+-6
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1050 1875 1050 2175
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1575 1500 2100 1500
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1050 450 1050 1125
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1350 750 1050 750
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1050 2925 1050 3225
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3150 1500 3450 1500
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        4350 1500 4650 1500
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
+        2100 1500 2625 1125 3150 1500 2625 1875 2100 1500
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1575 3600 1950 3600
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1050 3975 1050 4500
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3000 3600 3300 3600
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 4725 1800 4725
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        5700 4725 6000 4725
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        2850 4725 3150 4725
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        4200 4725 4650 4725
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        6900 4725 7950 4725
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        1575 2550 1650 2550 1800 2550 1800 2400 1800 1500
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5
+       0 0 1.00 60.00 120.00
+        2250 750 2475 750 2625 750 2625 900 2625 1125
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5
+       0 0 1.00 60.00 120.00
+        7500 4725 7500 1650 7500 1500 7350 1500 5550 1500
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        2475 3225 2475 2400 2475 2250 2325 2250 1800 2250
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        3825 3375 3825 2175 3825 2025 3675 2025 1800 2025
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8
+       0 0 1.00 60.00 120.00
+        2325 4350 2325 4275 2325 4125 2475 4125 4275 4125 4425 4125
+        4425 4275 4425 4725
+        0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8
+       0 0 1.00 60.00 120.00
+        5175 4350 5175 4275 5175 4125 5325 4125 7125 4125 7275 4125
+        7275 4275 7275 4725
+        0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000
+4 1 0 100 0 0 10 0.0000 0 75 150 1575 1425 no\001
+4 1 0 100 0 0 10 0.0000 0 135 360 825 525 Entry\001
+4 1 0 100 0 0 10 0.0000 0 75 150 1575 2475 no\001
+4 1 0 100 0 0 10 0.0000 0 105 195 1200 1950 yes\001
+4 1 0 100 0 0 10 0.0000 0 105 195 1200 3000 yes\001
+4 1 0 100 0 0 10 0.0000 0 105 195 2775 1050 yes\001
+4 1 0 100 0 0 10 0.0000 0 75 150 3225 1425 no\001
+4 1 0 100 0 0 10 0.0000 0 75 150 1650 3525 no\001
+4 1 0 100 0 0 10 0.0000 0 105 195 1200 4050 yes\001
+4 1 0 100 0 0 10 0.0000 0 105 195 3150 3525 yes\001
+4 1 0 100 0 0 10 0.0000 0 75 150 2625 3150 no\001
+4 1 0 100 0 0 10 0.0000 0 105 195 3000 4650 yes\001
+4 1 0 100 0 0 10 0.0000 0 105 195 5850 4650 yes\001
+4 1 0 100 0 0 10 0.0000 0 75 150 2475 4275 no\001
+4 1 0 100 0 0 10 0.0000 0 75 150 5325 4275 no\001
+4 1 0 50 0 0 10 0.0000 4 105 285 7800 4650 Exit\001
diff --git a/lustre/portals/doc/get.fig b/lustre/portals/doc/get.fig
new file mode 100644 (file)
index 0000000..28db949
--- /dev/null
@@ -0,0 +1,33 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 2775 900 3525 1200
+4 0 0 100 0 0 10 0.0000 0 105 720 2775 1200 Translation\001
+4 0 0 100 0 0 10 0.0000 0 105 405 2850 1050 Portal\001
+-6
+6 1350 1725 2175 2025
+4 0 0 100 0 0 10 0.0000 0 105 825 1350 2025 Transmission\001
+4 0 0 100 0 0 10 0.0000 0 105 285 1620 1875 Data\001
+-6
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        900 525 2700 750
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        2700 825 2700 1275
+2 1 0 1 0 7 100 0 -1 3.000 0 0 7 1 0 2
+       0 0 1.00 60.00 120.00
+        2700 1350 900 1950
+2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5
+        2400 300 3600 300 3600 2250 2400 2250 2400 300
+2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5
+        0 300 1200 300 1200 2250 0 2250 0 300
+4 1 0 100 0 0 10 0.0000 4 135 495 1800 825 Request\001
+4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001
+4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001
diff --git a/lustre/portals/doc/ieee.bst b/lustre/portals/doc/ieee.bst
new file mode 100644 (file)
index 0000000..4df7c50
--- /dev/null
@@ -0,0 +1,1112 @@
+% ---------------------------------------------------------------
+%
+% by Paolo.Ienne@di.epfl.ch
+%
+% ---------------------------------------------------------------
+%
+% no guarantee is given that the format corresponds perfectly to 
+% IEEE 8.5" x 11" Proceedings, but most features should be ok.
+%
+% ---------------------------------------------------------------
+%
+% `ieee' from BibTeX standard bibliography style `abbrv'
+% version 0.99a for BibTeX versions 0.99a or later, LaTeX version 2.09.
+% Copyright (C) 1985, all rights reserved.
+% Copying of this file is authorized only if either
+% (1) you make absolutely no changes to your copy, including name, or
+% (2) if you do make changes, you name it something other than
+% btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst.
+% This restriction helps ensure that all standard styles are identical.
+% The file btxbst.doc has the documentation for this style.
+
+ENTRY
+  { address
+    author
+    booktitle
+    chapter
+    edition
+    editor
+    howpublished
+    institution
+    journal
+    key
+    month
+    note
+    number
+    organization
+    pages
+    publisher
+    school
+    series
+    title
+    type
+    volume
+    year
+  }
+  {}
+  { label }
+
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }
+
+FUNCTION {init.state.consts}
+{ #0 'before.all :=
+  #1 'mid.sentence :=
+  #2 'after.sentence :=
+  #3 'after.block :=
+}
+
+STRINGS { s t }
+
+FUNCTION {output.nonnull}
+{ 's :=
+  output.state mid.sentence =
+    { ", " * write$ }
+    { output.state after.block =
+ { add.period$ write$
+   newline$
+   "\newblock " write$
+ }
+ { output.state before.all =
+     'write$
+     { add.period$ " " * write$ }
+   if$
+ }
+      if$
+      mid.sentence 'output.state :=
+    }
+  if$
+  s
+}
+
+FUNCTION {output}
+{ duplicate$ empty$
+    'pop$
+    'output.nonnull
+  if$
+}
+
+FUNCTION {output.check}
+{ 't :=
+  duplicate$ empty$
+    { pop$ "empty " t * " in " * cite$ * warning$ }
+    'output.nonnull
+  if$
+}
+
+FUNCTION {output.bibitem}
+{ newline$
+  "\bibitem{" write$
+  cite$ write$
+  "}" write$
+  newline$
+  ""
+  before.all 'output.state :=
+}
+
+FUNCTION {fin.entry}
+{ add.period$
+  write$
+  newline$
+}
+
+FUNCTION {new.block}
+{ output.state before.all =
+    'skip$
+    { after.block 'output.state := }
+  if$
+}
+
+FUNCTION {new.sentence}
+{ output.state after.block =
+    'skip$
+    { output.state before.all =
+ 'skip$
+ { after.sentence 'output.state := }
+      if$
+    }
+  if$
+}
+
+FUNCTION {not}
+{   { #0 }
+    { #1 }
+  if$
+}
+
+FUNCTION {and}
+{   'skip$
+    { pop$ #0 }
+  if$
+}
+
+FUNCTION {or}
+{   { pop$ #1 }
+    'skip$
+  if$
+}
+
+FUNCTION {new.block.checka}
+{ empty$
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.block.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.block
+  if$
+}
+
+FUNCTION {new.sentence.checka}
+{ empty$
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {new.sentence.checkb}
+{ empty$
+  swap$ empty$
+  and
+    'skip$
+    'new.sentence
+  if$
+}
+
+FUNCTION {field.or.null}
+{ duplicate$ empty$
+    { pop$ "" }
+    'skip$
+  if$
+}
+
+FUNCTION {emphasize}
+{ duplicate$ empty$
+    { pop$ "" }
+    { "{\em " swap$ * "}" * }
+  if$
+}
+
+INTEGERS { nameptr namesleft numnames }
+
+FUNCTION {format.names}
+{ 's :=
+  #1 'nameptr :=
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { s nameptr "{f.~}{vv~}{ll}{, jj}" format.name$ 't :=
+      nameptr #1 >
+ { namesleft #1 >
+     { ", " * t * }
+     { numnames #2 >
+  { "," * }
+  'skip$
+       if$
+       t "others" =
+  { " et~al." * }
+  { " and " * t * }
+       if$
+     }
+   if$
+ }
+ 't
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {format.authors}
+{ author empty$
+    { "" }
+    { author format.names }
+  if$
+}
+
+FUNCTION {format.editors}
+{ editor empty$
+    { "" }
+    { editor format.names
+      editor num.names$ #1 >
+ { ", editors" * }
+ { ", editor" * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.title}
+{ title empty$
+    { "" }
+    { title "t" change.case$ }
+  if$
+}
+
+FUNCTION {n.dashify}
+{ 't :=
+  ""
+    { t empty$ not }
+    { t #1 #1 substring$ "-" =
+ { t #1 #2 substring$ "--" = not
+     { "--" *
+       t #2 global.max$ substring$ 't :=
+     }
+     {   { t #1 #1 substring$ "-" = }
+  { "-" *
+    t #2 global.max$ substring$ 't :=
+  }
+       while$
+     }
+   if$
+ }
+ { t #1 #1 substring$ *
+   t #2 global.max$ substring$ 't :=
+ }
+      if$
+    }
+  while$
+}
+
+FUNCTION {format.date}
+{ year empty$
+    { month empty$
+ { "" }
+ { "there's a month but no year in " cite$ * warning$
+   month
+ }
+      if$
+    }
+    { month empty$
+ 'year
+ { month " " * year * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.btitle}
+{ title emphasize
+}
+
+FUNCTION {tie.or.space.connect}
+{ duplicate$ text.length$ #3 <
+    { "~" }
+    { " " }
+  if$
+  swap$ * *
+}
+
+FUNCTION {either.or.check}
+{ empty$
+    'pop$
+    { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+  if$
+}
+
+FUNCTION {format.bvolume}
+{ volume empty$
+    { "" }
+    { "volume" volume tie.or.space.connect
+      series empty$
+ 'skip$
+ { " of " * series emphasize * }
+      if$
+      "volume and number" number either.or.check
+    }
+  if$
+}
+
+FUNCTION {format.number.series}
+{ volume empty$
+    { number empty$
+ { series field.or.null }
+ { output.state mid.sentence =
+     { "number" }
+     { "Number" }
+   if$
+   number tie.or.space.connect
+   series empty$
+     { "there's a number but no series in " cite$ * warning$ }
+     { " in " * series * }
+   if$
+ }
+      if$
+    }
+    { "" }
+  if$
+}
+
+FUNCTION {format.edition}
+{ edition empty$
+    { "" }
+    { output.state mid.sentence =
+ { edition "l" change.case$ " edition" * }
+ { edition "t" change.case$ " edition" * }
+      if$
+    }
+  if$
+}
+
+INTEGERS { multiresult }
+
+FUNCTION {multi.page.check}
+{ 't :=
+  #0 'multiresult :=
+    { multiresult not
+      t empty$ not
+      and
+    }
+    { t #1 #1 substring$
+      duplicate$ "-" =
+      swap$ duplicate$ "," =
+      swap$ "+" =
+      or or
+ { #1 'multiresult := }
+ { t #2 global.max$ substring$ 't := }
+      if$
+    }
+  while$
+  multiresult
+}
+
+FUNCTION {format.pages}
+{ pages empty$
+    { "" }
+    { pages multi.page.check
+ { "pages" pages n.dashify tie.or.space.connect }
+ { "page" pages tie.or.space.connect }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.vol.num.pages}
+{ volume field.or.null
+  number empty$
+    'skip$
+    { "(" number * ")" * *
+      volume empty$
+ { "there's a number but no volume in " cite$ * warning$ }
+ 'skip$
+      if$
+    }
+  if$
+  pages empty$
+    'skip$
+    { duplicate$ empty$
+ { pop$ format.pages }
+ { ":" * pages n.dashify * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.chapter.pages}
+{ chapter empty$
+    'format.pages
+    { type empty$
+ { "chapter" }
+ { type "l" change.case$ }
+      if$
+      chapter tie.or.space.connect
+      pages empty$
+ 'skip$
+ { ", " * format.pages * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.in.ed.booktitle}
+{ booktitle empty$
+    { "" }
+    { editor empty$
+ { "In " booktitle emphasize * }
+ { "In " format.editors * ", " * booktitle emphasize * }
+      if$
+    }
+  if$
+}
+
+FUNCTION {empty.misc.check}
+{ author empty$ title empty$ howpublished empty$
+  month empty$ year empty$ note empty$
+  and and and and and
+  key empty$ not and
+    { "all relevant fields are empty in " cite$ * warning$ }
+    'skip$
+  if$
+}
+
+FUNCTION {format.thesis.type}
+{ type empty$
+    'skip$
+    { pop$
+      type "t" change.case$
+    }
+  if$
+}
+
+FUNCTION {format.tr.number}
+{ type empty$
+    { "Technical Report" }
+    'type
+  if$
+  number empty$
+    { "t" change.case$ }
+    { number tie.or.space.connect }
+  if$
+}
+
+FUNCTION {format.article.crossref}
+{ key empty$
+    { journal empty$
+ { "need key or journal for " cite$ * " to crossref " * crossref *
+   warning$
+   ""
+ }
+ { "In {\em " journal * "\/}" * }
+      if$
+    }
+    { "In " key * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {format.crossref.editor}
+{ editor #1 "{vv~}{ll}" format.name$
+  editor num.names$ duplicate$
+  #2 >
+    { pop$ " et~al." * }
+    { #2 <
+ 'skip$
+ { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+     { " et~al." * }
+     { " and " * editor #2 "{vv~}{ll}" format.name$ * }
+   if$
+ }
+      if$
+    }
+  if$
+}
+
+FUNCTION {format.book.crossref}
+{ volume empty$
+    { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+      "In "
+    }
+    { "Volume" volume tie.or.space.connect
+      " of " *
+    }
+  if$
+  editor empty$
+  editor field.or.null author field.or.null =
+  or
+    { key empty$
+ { series empty$
+     { "need editor, key, or series for " cite$ * " to crossref " *
+       crossref * warning$
+       "" *
+     }
+     { "{\em " * series * "\/}" * }
+   if$
+ }
+ { key * }
+      if$
+    }
+    { format.crossref.editor * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {format.incoll.inproc.crossref}
+{ editor empty$
+  editor field.or.null author field.or.null =
+  or
+    { key empty$
+ { booktitle empty$
+     { "need editor, key, or booktitle for " cite$ * " to crossref " *
+       crossref * warning$
+       ""
+     }
+     { "In {\em " booktitle * "\/}" * }
+   if$
+ }
+ { "In " key * }
+      if$
+    }
+    { "In " format.crossref.editor * }
+  if$
+  " \cite{" * crossref * "}" *
+}
+
+FUNCTION {article}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { journal emphasize "journal" output.check
+      format.vol.num.pages output
+      format.date "year" output.check
+    }
+    { format.article.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {book}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check }
+    { format.authors output.nonnull
+      crossref missing$
+ { "author and editor" editor either.or.check }
+ 'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {booklet}
+{ output.bibitem
+  format.authors output
+  new.block
+  format.title "title" output.check
+  howpublished address new.block.checkb
+  howpublished output
+  address output
+  format.date output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inbook}
+{ output.bibitem
+  author empty$
+    { format.editors "author and editor" output.check }
+    { format.authors output.nonnull
+      crossref missing$
+ { "author and editor" editor either.or.check }
+ 'skip$
+      if$
+    }
+  if$
+  new.block
+  format.btitle "title" output.check
+  crossref missing$
+    { format.bvolume output
+      format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.number.series output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+    }
+    { format.chapter.pages "chapter and pages" output.check
+      new.block
+      format.book.crossref output.nonnull
+    }
+  if$
+  format.edition output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {incollection}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.chapter.pages output
+      new.sentence
+      publisher "publisher" output.check
+      address output
+      format.edition output
+      format.date "year" output.check
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.chapter.pages output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {inproceedings}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  crossref missing$
+    { format.in.ed.booktitle "booktitle" output.check
+      format.bvolume output
+      format.number.series output
+      format.pages output
+      address empty$
+ { organization publisher new.sentence.checkb
+   organization output
+   publisher output
+   format.date "year" output.check
+ }
+ { address output.nonnull
+   format.date "year" output.check
+   new.sentence
+   organization output
+   publisher output
+ }
+      if$
+    }
+    { format.incoll.inproc.crossref output.nonnull
+      format.pages output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {conference} { inproceedings }
+
+FUNCTION {manual}
+{ output.bibitem
+  author empty$
+    { organization empty$
+ 'skip$
+ { organization output.nonnull
+   address output
+ }
+      if$
+    }
+    { format.authors output.nonnull }
+  if$
+  new.block
+  format.btitle "title" output.check
+  author empty$
+    { organization empty$
+ { address new.block.checka
+   address output
+ }
+ 'skip$
+      if$
+    }
+    { organization address new.block.checkb
+      organization output
+      address output
+    }
+  if$
+  format.edition output
+  format.date output
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {mastersthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  "Master's thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {misc}
+{ output.bibitem
+  format.authors output
+  title howpublished new.block.checkb
+  format.title output
+  howpublished new.block.checka
+  howpublished output
+  format.date output
+  new.block
+  note output
+  fin.entry
+  empty.misc.check
+}
+
+FUNCTION {phdthesis}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.btitle "title" output.check
+  new.block
+  "PhD thesis" format.thesis.type output.nonnull
+  school "school" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {proceedings}
+{ output.bibitem
+  editor empty$
+    { organization output }
+    { format.editors output.nonnull }
+  if$
+  new.block
+  format.btitle "title" output.check
+  format.bvolume output
+  format.number.series output
+  address empty$
+    { editor empty$
+ { publisher new.sentence.checka }
+ { organization publisher new.sentence.checkb
+   organization output
+ }
+      if$
+      publisher output
+      format.date "year" output.check
+    }
+    { address output.nonnull
+      format.date "year" output.check
+      new.sentence
+      editor empty$
+ 'skip$
+ { organization output }
+      if$
+      publisher output
+    }
+  if$
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {techreport}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  format.tr.number output.nonnull
+  institution "institution" output.check
+  address output
+  format.date "year" output.check
+  new.block
+  note output
+  fin.entry
+}
+
+FUNCTION {unpublished}
+{ output.bibitem
+  format.authors "author" output.check
+  new.block
+  format.title "title" output.check
+  new.block
+  note "note" output.check
+  format.date output
+  fin.entry
+}
+
+FUNCTION {default.type} { misc }
+
+MACRO {jan} {"Jan."}
+
+MACRO {feb} {"Feb."}
+
+MACRO {mar} {"Mar."}
+
+MACRO {apr} {"Apr."}
+
+MACRO {may} {"May"}
+
+MACRO {jun} {"June"}
+
+MACRO {jul} {"July"}
+
+MACRO {aug} {"Aug."}
+
+MACRO {sep} {"Sept."}
+
+MACRO {oct} {"Oct."}
+
+MACRO {nov} {"Nov."}
+
+MACRO {dec} {"Dec."}
+
+MACRO {acmcs} {"ACM Comput. Surv."}
+
+MACRO {acta} {"Acta Inf."}
+
+MACRO {cacm} {"Commun. ACM"}
+
+MACRO {ibmjrd} {"IBM J. Res. Dev."}
+
+MACRO {ibmsj} {"IBM Syst.~J."}
+
+MACRO {ieeese} {"IEEE Trans. Softw. Eng."}
+
+MACRO {ieeetc} {"IEEE Trans. Comput."}
+
+MACRO {ieeetcad}
+ {"IEEE Trans. Comput.-Aided Design Integrated Circuits"}
+
+MACRO {ipl} {"Inf. Process. Lett."}
+
+MACRO {jacm} {"J.~ACM"}
+
+MACRO {jcss} {"J.~Comput. Syst. Sci."}
+
+MACRO {scp} {"Sci. Comput. Programming"}
+
+MACRO {sicomp} {"SIAM J. Comput."}
+
+MACRO {tocs} {"ACM Trans. Comput. Syst."}
+
+MACRO {tods} {"ACM Trans. Database Syst."}
+
+MACRO {tog} {"ACM Trans. Gr."}
+
+MACRO {toms} {"ACM Trans. Math. Softw."}
+
+MACRO {toois} {"ACM Trans. Office Inf. Syst."}
+
+MACRO {toplas} {"ACM Trans. Prog. Lang. Syst."}
+
+MACRO {tcs} {"Theoretical Comput. Sci."}
+
+READ
+
+FUNCTION {sortify}
+{ purify$
+  "l" change.case$
+}
+
+INTEGERS { len }
+
+FUNCTION {chop.word}
+{ 's :=
+  'len :=
+  s #1 len substring$ =
+    { s len #1 + global.max$ substring$ }
+    's
+  if$
+}
+
+FUNCTION {sort.format.names}
+{ 's :=
+  #1 'nameptr :=
+  ""
+  s num.names$ 'numnames :=
+  numnames 'namesleft :=
+    { namesleft #0 > }
+    { nameptr #1 >
+ { "   " * }
+ 'skip$
+      if$
+      s nameptr "{vv{ } }{ll{ }}{  f{ }}{  jj{ }}" format.name$ 't :=
+      nameptr numnames = t "others" = and
+ { "et al" * }
+ { t sortify * }
+      if$
+      nameptr #1 + 'nameptr :=
+      namesleft #1 - 'namesleft :=
+    }
+  while$
+}
+
+FUNCTION {sort.format.title}
+{ 't :=
+  "A " #2
+    "An " #3
+      "The " #4 t chop.word
+    chop.word
+  chop.word
+  sortify
+  #1 global.max$ substring$
+}
+
+FUNCTION {author.sort}
+{ author empty$
+    { key empty$
+ { "to sort, need author or key in " cite$ * warning$
+   ""
+ }
+ { key sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.editor.sort}
+{ author empty$
+    { editor empty$
+ { key empty$
+     { "to sort, need author, editor, or key in " cite$ * warning$
+       ""
+     }
+     { key sortify }
+   if$
+ }
+ { editor sort.format.names }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {author.organization.sort}
+{ author empty$
+    { organization empty$
+ { key empty$
+     { "to sort, need author, organization, or key in " cite$ * warning$
+       ""
+     }
+     { key sortify }
+   if$
+ }
+ { "The " #4 organization chop.word sortify }
+      if$
+    }
+    { author sort.format.names }
+  if$
+}
+
+FUNCTION {editor.organization.sort}
+{ editor empty$
+    { organization empty$
+ { key empty$
+     { "to sort, need editor, organization, or key in " cite$ * warning$
+       ""
+     }
+     { key sortify }
+   if$
+ }
+ { "The " #4 organization chop.word sortify }
+      if$
+    }
+    { editor sort.format.names }
+  if$
+}
+
+FUNCTION {presort}
+{ type$ "book" =
+  type$ "inbook" =
+  or
+    'author.editor.sort
+    { type$ "proceedings" =
+ 'editor.organization.sort
+ { type$ "manual" =
+     'author.organization.sort
+     'author.sort
+   if$
+ }
+      if$
+    }
+  if$
+  "    "
+  *
+  year field.or.null sortify
+  *
+  "    "
+  *
+  title field.or.null
+  sort.format.title
+  *
+  #1 entry.max$ substring$
+  'sort.key$ :=
+}
+
+ITERATE {presort}
+
+SORT
+
+STRINGS { longest.label }
+
+INTEGERS { number.label longest.label.width }
+
+FUNCTION {initialize.longest.label}
+{ "" 'longest.label :=
+  #1 'number.label :=
+  #0 'longest.label.width :=
+}
+
+FUNCTION {longest.label.pass}
+{ number.label int.to.str$ 'label :=
+  number.label #1 + 'number.label :=
+  label width$ longest.label.width >
+    { label 'longest.label :=
+      label width$ 'longest.label.width :=
+    }
+    'skip$
+  if$
+}
+
+EXECUTE {initialize.longest.label}
+
+ITERATE {longest.label.pass}
+
+FUNCTION {begin.bib}
+{ preamble$ empty$
+    'skip$
+    { preamble$ write$ newline$ }
+  if$
+  "\begin{thebibliography}{"  longest.label  * 
+  "}\setlength{\itemsep}{-1ex}\small" * write$ newline$
+}
+
+EXECUTE {begin.bib}
+
+EXECUTE {init.state.consts}
+
+ITERATE {call.type$}
+
+FUNCTION {end.bib}
+{ newline$
+  "\end{thebibliography}" write$ newline$
+}
+
+EXECUTE {end.bib}
+
+% end of file ieee.bst
+% ---------------------------------------------------------------
diff --git a/lustre/portals/doc/mpi.fig b/lustre/portals/doc/mpi.fig
new file mode 100644 (file)
index 0000000..e1a91b5
--- /dev/null
@@ -0,0 +1,117 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 150 1650 900 2025
+4 1 0 100 0 0 10 0.0000 0 135 735 525 1800 Unexpected\001
+4 1 0 100 0 0 10 0.0000 0 135 585 525 1995 Messages\001
+-6
+6 150 150 900 525
+4 1 0 100 0 0 10 0.0000 0 135 615 525 300 Preposted\001
+4 1 0 100 0 0 10 0.0000 0 105 525 525 495 Receives\001
+-6
+6 2550 4125 3150 4725
+4 1 0 100 0 0 10 0.0000 0 135 600 2850 4275 Length=0\001
+4 1 0 100 0 0 10 0.0000 0 105 540 2850 4470 Truncate\001
+4 1 0 100 0 0 10 0.0000 0 105 480 2850 4665 No Ack\001
+-6
+6 1050 1575 1950 1875
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1050 1575 1950 1575 1950 1875 1050 1875 1050 1575
+4 1 0 100 0 0 10 0.0000 0 105 780 1500 1725 Match Short\001
+-6
+6 5400 1575 6300 2175
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        5400 1575 6300 1575 6300 2175 5400 2175 5400 1575
+4 1 0 100 0 0 10 0.0000 0 105 405 5850 1875 Buffer\001
+-6
+6 5400 2400 6300 3000
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        5400 2400 6300 2400 6300 3000 5400 3000 5400 2400
+4 1 0 100 0 0 10 0.0000 0 105 405 5850 2700 Buffer\001
+-6
+6 1050 2400 1950 2700
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1050 2400 1950 2400 1950 2700 1050 2700 1050 2400
+4 1 0 100 0 0 10 0.0000 0 105 780 1500 2550 Match Short\001
+-6
+6 1050 825 1950 1125
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1050 825 1950 825 1950 1125 1050 1125 1050 825
+4 1 0 100 0 0 10 0.0000 0 105 765 1500 975 Match None\001
+-6
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 1125 1500 1575
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 2025 4050 3375
+2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+        150 675 6600 675
+2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+        150 1350 6600 1350
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2400 4125 3300 4125 3300 4725 2400 4725 2400 4125
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 4500 4050 3675
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 1725 5400 1725
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 2550 5400 2550
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3225 2850 4050 3450
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 1800 1500 2400
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2400 825 3300 825 3300 1275 2400 1275 2400 825
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 2625 1500 4125
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1050 4125 1950 4125 1950 4425 1050 4425 1050 4125
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1500 300 1500 825
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1875 975 2400 975
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1875 1725 2400 1725
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1875 2550 2400 2550
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        1875 4275 2400 4275
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2400 1575 3300 1575 3300 2175 2400 2175 2400 1575
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2400 2400 3300 2400 3300 3000 2400 3000 2400 2400
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        4050 3300 5250 3300 5250 3750 4050 3750 4050 3300
+4 1 0 100 0 0 10 0.0000 0 105 885 1500 150 Match Entries\001
+4 1 0 100 0 0 10 0.0000 0 135 1290 2850 150 Memory Descriptors\001
+4 1 0 100 0 0 10 0.0000 0 135 1065 5850 150 Memory Regions\001
+4 1 0 100 0 0 10 0.0000 0 135 825 4500 150 Event Queues\001
+4 1 0 100 0 0 10 0.0000 0 105 585 525 1050 RcvMark\001
+4 1 0 100 0 0 10 0.0000 0 105 330 2850 1102 None\001
+4 1 0 100 0 0 10 0.0000 0 135 705 1500 4275 Match Any\001
+4 1 0 50 0 0 10 0.0000 0 150 810 2850 1725 max_offset=\001
+4 1 0 50 0 0 10 0.0000 0 150 840 2850 1875 n - short_len\001
+4 1 0 50 0 0 10 0.0000 0 150 810 2850 2550 max_offset=\001
+4 1 0 50 0 0 10 0.0000 0 150 840 2850 2700 n - short_len\001
+4 1 0 50 0 0 10 0.0000 0 105 405 2850 2100 unlink\001
+4 1 0 50 0 0 10 0.0000 0 105 405 2850 2925 unlink\001
+4 1 0 100 0 0 10 0.0000 0 135 930 4650 3675 Message Queue\001
+4 1 0 100 0 0 10 0.0000 0 135 735 4650 3525 Unexpected\001
diff --git a/lustre/portals/doc/portals.fig b/lustre/portals/doc/portals.fig
new file mode 100644 (file)
index 0000000..9b1271b
--- /dev/null
@@ -0,0 +1,68 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1350 900 1650 900 1650 1200 1350 1200 1350 900
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        1800 1350 2100 1350 2100 1650 1800 1650 1800 1350
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2250 1800 2550 1800 2550 2100 2250 2100 2250 1800
+2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2
+        4200 375 4200 2100
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        525 600 1125 600 1125 2100 525 2100 525 600
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        4425 1275 4875 1275 4875 1950 4425 1950 4425 1275
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        2550 1200 3150 1200 3150 1500 2550 1500 2550 1200
+2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3000 1425 4425 1425
+2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5
+        3600 825 3750 825 3750 1125 3600 1125 3600 825
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        2025 1425 2550 1425
+2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5
+        4425 750 4875 750 4875 1125 4425 1125 4425 750
+2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        3675 975 4425 975
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 2
+       0 0 1.00 60.00 120.00
+        825 1050 1350 1050
+        0.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        1500 1125 1500 1350 1500 1500 1650 1500 1800 1500
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5
+       0 0 1.00 60.00 120.00
+        1950 1575 1950 1800 1950 1950 2100 1950 2250 1950
+        0.000 1.000 1.000 1.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2
+        525 975 1125 975
+        0.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2
+        525 1125 1125 1125
+        0.000 0.000
+3 0 0 1 0 7 100 0 -1 0.000 0 1 0 7
+       0 0 1.00 60.00 120.00
+        3000 1275 3150 1275 3300 1275 3300 1125 3300 975 3450 975
+        3600 975
+        0.000 1.000 1.000 1.000 1.000 1.000 0.000
+4 0 0 100 0 0 10 0.0000 0 105 690 1275 750 Match List\001
+4 1 0 100 0 0 10 0.0000 0 105 780 825 525 Portal Table\001
+4 2 0 100 0 0 10 0.0000 0 135 825 4050 2025 Library Space\001
+4 0 0 100 0 0 10 0.0000 0 135 1110 4350 2175 Application Space\001
+4 1 0 100 0 0 10 0.0000 0 135 660 2850 1050 Descriptor\001
+4 1 0 100 0 0 10 0.0000 0 135 540 2850 825 Memory\001
+4 1 0 100 0 0 10 0.0000 0 135 765 3750 675 Event Queue\001
+4 1 0 100 0 0 10 0.0000 0 135 495 4650 675 Regions\001
+4 1 0 100 0 0 10 0.0000 0 135 540 4650 525 Memory\001
diff --git a/lustre/portals/doc/portals3.bib b/lustre/portals/doc/portals3.bib
new file mode 100644 (file)
index 0000000..323b99f
--- /dev/null
@@ -0,0 +1,124 @@
+@Article{           Cplant,
+    title       = { {M}assively {P}arallel {C}omputing with
+                    {C}ommodity {C}omponents },
+    author      = { Ron Brightwell and David S. Greenberg and Arthur
+                    B. Maccabe and Rolf Riesen },
+    journal     = { Parallel Computing },
+    volume      = { 26 },
+    month       = { February },
+    pages       = { 243--266 },
+    year        = { 2000 }
+}
+
+@Manual{     Portals,
+    organization = { Sandia National Laboratories },
+    title        = { {P}uma {P}ortals },
+    note         = { http://www.cs.sandia.gov/puma/portals },
+    year         = { 1997 }
+}
+
+@Techreport{      VIA,
+  title         = { {V}irtual {I}nterface {A}rchitecture
+                    {S}pecification {V}ersion 1.0 }, 
+  author        = { {Compaq, Microsoft, and Intel} },
+  institution   = { Compaq, Microsoft, and Intel },
+  month         = { December },
+  year          = { 1997 }
+}
+
+@Techreport{      ST,
+  title         = { {I}nformation {T}echnology - {S}cheduled
+                  {T}ransfer {P}rotocol - {W}orking {D}raft 2.0 },
+  author        = { {Task Group of Technical Committee T11} },
+  institution   = { Accredited Standards Committee NCITS },
+  month         = { July },
+  year          = { 1998 }
+}
+
+@Manual{     TFLOPS,
+    organization = { Sandia National Laboratories },
+    title        = { ASCI Red },
+    note         = { http://www.sandia.gov/ASCI/TFLOP },
+    year         = { 1996 }
+}
+
+@Techreport{      GM,
+  title         = { The {GM} {M}essage {P}assing {S}ystem },
+  author         = { {Myricom, Inc.} },
+  institution    = { {Myricom, Inc.} },
+  year          = { 1997 },
+}
+
+@Article{           MPIstandard,
+    title        = { {MPI}: {A} {M}essage-{P}assing {I}nterface standard },
+    author       = { {Message Passing Interface Forum} },
+    journal      = { The International Journal of Supercomputer Applications
+                     and High Performance Computing },
+    volume       = { 8 },
+    year         = { 1994 }
+}
+
+@Inproceedings{    PumaOS,
+    author       = "Lance Shuler and Chu Jong and Rolf Riesen and
+                    David van Dresser and Arthur B. Maccabe and
+                    Lee Ann Fisk and T. Mack Stallcup",
+    booktitle    = "Proceeding of the 1995 Intel Supercomputer
+                    User's Group Conference",
+    title        = "The {P}uma Operating System for Massively Parallel Computers",
+    organization = "Intel Supercomputer User's Group",
+    year         = 1995
+}
+
+@InProceedings{   SUNMOS,
+author          = "Arthur B. Maccabe and Kevin S. McCurley and Rolf Riesen and
+                   Stephen R. Wheat",
+title           = "{SUNMOS} for the {Intel} {Paragon}: A Brief User's Guide",
+booktitle       = "Proceedings of the {Intel} Supercomputer Users' Group. 1994
+                   Annual North America Users' Conference.",
+year            = 1994,
+pages           = "245--251",
+month           = "June",
+location        = "ftp.cs.sandia.gov /pub/sunmos/papers/ISUG94-1.ps"
+}
+
+@InProceedings {   PumaMPI,
+    title        = { Design and Implementation of {MPI} on {P}uma Portals },
+    author       = { Ron Brightwell and Lance Shuler },
+    booktitle    = { Proceedings of the Second MPI Developer's Conference },
+    pages        = { 18--25 },
+    month        = { July },
+    year         = { 1996 }
+}
+
+@Inproceedings{     FM2,
+    author       = { Mario Lauria and Scott Pakin and Andrew Chien },
+    title        = { {E}fficient {L}ayering for {H}igh {S}peed
+                     {C}ommunication: {F}ast {M}essages 2.x },
+    Booktitle    = { Proceedings of the IEEE International Symposium
+                     on High Performance Distributed Computing },
+    year         = { 1998 }
+}
+
+@Manual {          CraySHMEM,
+    title        = "SHMEM Technical Note for C, SG-2516 2.3",
+    organization = "Cray Research, Inc.",
+    month        = "October",
+    year         = 1994
+}
+
+@Manual {          MPI2,
+    title        = "{MPI}-2: {E}xtensions to the {M}essage-{P}assing {I}nterface",
+    organization = "Message Passing Interface Forum",
+    note         = "http://www.mpi-forum.org/docs/mpi-20-html/mpi2-report.html",
+    month        = "July",
+    year         = 1997
+}
+
+@InProceedings {   PMMPI,
+    title        = { {The Design and Implementation of Zero Copy MPI Using
+                       Commodity Hardware with a High Performance Network} },
+    author       = { Francis O'Carroll and  Hiroshi Tezuka and Atsushi Hori
+                     and Yutaka Ishikawa  },
+    booktitle    = { Proceedings of the ICS },
+    year         = { 1998 }
+}
diff --git a/lustre/portals/doc/portals3.lyx b/lustre/portals/doc/portals3.lyx
new file mode 100644 (file)
index 0000000..8429280
--- /dev/null
@@ -0,0 +1,15944 @@
+#LyX 1.2 created this file. For more info see http://www.lyx.org/
+\lyxformat 220
+\textclass report
+\begin_preamble
+\usepackage{fullpage}
+\renewenvironment{comment}%
+{\begin{quote}\textbf{Discussion}: \slshape}%
+{\end{quote}}
+\pagestyle{myheadings}
+\end_preamble
+\language american
+\inputencoding auto
+\fontscheme pslatex
+\graphics default
+\paperfontsize 10
+\spacing single 
+\papersize letterpaper
+\paperpackage a4
+\use_geometry 0
+\use_amsmath 0
+\use_natbib 0
+\use_numerical_citations 0
+\paperorientation portrait
+\secnumdepth 2
+\tocdepth 2
+\paragraph_separation indent
+\defskip medskip
+\quotes_language english
+\quotes_times 2
+\papercolumns 1
+\papersides 2
+\paperpagestyle headings
+
+\layout Title
+
+The Portals 3.2 Message Passing Interface 
+\newline 
+ Revision 1.1
+\layout Author
+
+Ron Brightwell
+\begin_inset Foot
+collapsed true
+
+\layout Standard
+
+R.
+ Brightwell and R.
+ Riesen are with the Scalable Computing Systems Department, Sandia National
+ Laboratories, P.O.
+ Box 5800, Albuquerque, NM\SpecialChar ~
+\SpecialChar ~
+87111-1110, bright@cs.sandia.gov, rolf@cs.sandia.gov.
+\end_inset 
+
+, Arthur B.
+ Maccabe
+\begin_inset Foot
+collapsed true
+
+\layout Standard
+
+A.
+ B.
+ Maccabe is with the Computer Science Department, University of New Mexico,
+ Albuquerque, NM\SpecialChar ~
+\SpecialChar ~
+87131-1386, maccabe@cs.unm.edu.
+\end_inset 
+
+, Rolf Riesen and Trammell Hudson
+\layout Abstract
+
+This report presents a specification for the Portals 3.2 message passing
+ interface.
+ Portals 3.2 is intended to allow scalable, high-performance network communicatio
+n between nodes of a parallel computing system.
+ Specifically, it is designed to support a parallel computing platform composed
+ of clusters of commodity workstations connected by a commodity system area
+ network fabric.
+ In addition, Portals 3.2 is well suited to massively parallel processing
+ and embedded systems.
+ Portals 3.2 represents an adaptation of the data movement layer developed
+ for massively parallel processing platforms, such as the 4500-node Intel
+ TeraFLOPS machine.
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+clearpage
+\backslash 
+pagenumbering{roman}
+\backslash 
+setcounter{page}{3}
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset LatexCommand \tableofcontents{}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+cleardoublepage
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset FloatList figure
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+cleardoublepage
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset FloatList table
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+cleardoublepage
+\end_inset 
+
+
+\layout Chapter*
+
+Summary of Changes for Revision 1.1
+\layout Enumerate
+
+Updated version number to 3.2 throughout the document.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sub:PtlGetId}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_SEGV
+\family default 
+ to error list for 
+\shape italic 
+PtlGetId
+\shape default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_ML_TOOLONG
+\family default 
+ to error list for 
+\shape italic 
+PtlMEAttach
+\shape default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:meunlink}
+
+\end_inset 
+
+: removed text referring to a list of associated memory descriptors.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:mdfree}
+
+\end_inset 
+
+: added text to describe unlinking a free-floating memory descriptor.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:types}
+
+\end_inset 
+
+: added entry for 
+\family typewriter 
+ptl_seq_t
+\family default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+:
+\begin_deeper 
+\layout Enumerate
+
+added definition of 
+\family typewriter 
+max_offset
+\family default 
+.
+\layout Enumerate
+
+added text to clarify 
+\family typewriter 
+PTL_MD_MANAGE_REMOTE
+\family default 
+.
+\end_deeper 
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+: modified text for 
+\family typewriter 
+unlink_op
+\family default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+: added text to clarify multiple calls to 
+\shape italic 
+PtlNIInit
+\shape default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+: added text to clarify 
+\family typewriter 
+unlink_nofit
+\family default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:receiving}
+
+\end_inset 
+
+: removed text indicating that an MD will reject a message if the associated
+ EQ is full.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:mdfree}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_MD_INUSE
+\family default 
+ error code and text to indicate that only MDs with no pending operations
+ can be unlinked.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:retcodes}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_MD_INUSE
+\family default 
+ return code.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:event-type}
+
+\end_inset 
+
+: added user id field, MD handle field, and NI specific failure field to
+ the 
+\family typewriter 
+ptl_event_t
+\family default 
+ structure.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:types}
+
+\end_inset 
+
+: added 
+\family typewriter 
+ptl_ni_fail_t
+\family default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:event-type}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_EVENT_UNLINK
+\family default 
+ event type.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:func}
+
+\end_inset 
+
+: removed 
+\shape slanted 
+PtlTransId
+\shape default 
+.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+, Section 
+\begin_inset LatexCommand \ref{sec:meinsert}
+
+\end_inset 
+
+, Section 
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+: listed allowable constants with relevant fields.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:func}
+
+\end_inset 
+
+: added 
+\shape italic 
+PtlMEAttachAny
+\shape default 
+ function.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:retcodes}
+
+\end_inset 
+
+: added 
+\family typewriter 
+PTL_PT_FULL
+\family default 
+ return code for 
+\shape italic 
+PtlMEAttachAny
+\shape default 
+.
+\layout Enumerate
+
+Table 
+\begin_inset LatexCommand \ref{tab:oconsts}
+
+\end_inset 
+
+: updated to reflect new event types.
+\layout Enumerate
+
+Section 
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+: added 
+\family typewriter 
+ptl_nid_t
+\family default 
+, 
+\family typewriter 
+ptl_pid_t
+\family default 
+, and 
+\family typewriter 
+ptl_uid_t
+\family default 
+.
+\layout Chapter*
+
+Summary of Changes for Version 3.1
+\layout Section*
+
+Thread Issues
+\layout Standard
+
+The most significant change to the interface from version 3.0 to 3.1 involves
+ the clarification of how the interface interacts with multi-threaded applicatio
+ns.
+ We adopted a generic thread model in which processes define an address
+ space and threads share the address space.
+ Consideration of the API in the light of threads led to several clarifications
+ throughout the document: 
+\layout Enumerate
+
+Glossary: 
+\begin_deeper 
+\layout Enumerate
+
+added a definition for 
+\emph on 
+thread
+\emph default 
+, 
+\layout Enumerate
+
+reworded the definition for 
+\emph on 
+process
+\emph default 
+.
+\end_deeper 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:apiover}
+
+\end_inset 
+
+: added section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:threads}
+
+\end_inset 
+
+ to describe the multi-threading model used by the Portals API.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ptlinit}
+
+\end_inset 
+
+: 
+\emph on 
+PtlInit
+\emph default 
+ must be called at least once and may be called any number of times.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ptlfini}
+
+\end_inset 
+
+: 
+\emph on 
+PtlFini
+\emph default 
+ should be called once as the process is terminating and not as each thread
+ terminates.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:pid}
+
+\end_inset 
+
+: Portals does not define thread ids.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+: network interfaces are associated with processes, not threads.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+: 
+\emph on 
+PtlNIInit
+\emph default 
+ must be called at least once and may be called any number of times.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:eqget}
+
+\end_inset 
+
+: 
+\emph on 
+PtlEQGet
+\emph default 
+ returns 
+\family typewriter 
+PTL_EQ_EMPTY
+\family default 
+ if a thread is blocked on 
+\emph on 
+PtlEQWait
+\emph default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:eqwait}
+
+\end_inset 
+
+: waiting threads are awakened in FIFO order.
+\layout Standard
+
+Two functions, 
+\emph on 
+PtlNIBarrier
+\emph default 
+ and 
+\emph on 
+PtlEQCount
+\emph default 
+, were removed from the API. 
+\emph on 
+PtlNIBarrier
+\emph default 
+ was defined to block the calling process until all of the processes in
+ the application group had invoked 
+\emph on 
+PtlNIBarrier
+\emph default 
+.
+ We now consider this functionality, along with the concept of groups (see
+ the discussion under 
+\begin_inset Quotes eld
+\end_inset 
+
+other changes
+\begin_inset Quotes erd
+\end_inset 
+
+), to be part of the runtime system, not part of the Portals API. 
+\emph on 
+PtlEQCount
+\emph default 
+ was defined to return the number of events in an event queue.
+ Because external operations may lead to new events being added and other
+ threads may remove events, the value returned by 
+\emph on 
+PtlEQCount
+\emph default 
+ would have to be a hint about the number of events in the event queue.
+\layout Section*
+
+Handling small, unexpected messages
+\layout Standard
+
+Another set of changes relates to handling small unexpected messages in
+ MPI.
+ In designing version 3.0, we assumed that each unexpected message would
+ be placed in a unique memory descriptor.
+ To avoid the need to process a long list of memory descriptors, we moved
+ the memory descriptors out of the match list and hung them off of a single
+ match list entry.
+ In this way, large unexpected messages would only encounter a single 
+\begin_inset Quotes eld
+\end_inset 
+
+short message
+\begin_inset Quotes erd
+\end_inset 
+
+ match list entry before encountering the 
+\begin_inset Quotes eld
+\end_inset 
+
+long message
+\begin_inset Quotes erd
+\end_inset 
+
+ match list entry.
+ Experience with this strategy identified resource management problems.
+ In particular, a long sequence of very short (or zero length) messages
+ could quickly exhaust the memory descriptors constructed for handling unexpecte
+d messages.
+ Our new strategy involves the use of several very large memory descriptors
+ for small unexpected messages.
+ Consecutive unexpected messages will be written into the first of these
+ memory descriptors until the memory descriptor fills up.
+ When the first of the 
+\begin_inset Quotes eld
+\end_inset 
+
+small memory
+\begin_inset Quotes erd
+\end_inset 
+
+ descriptors fills up, it will be unlinked and subsequent short messages
+ will be written into the next 
+\begin_inset Quotes eld
+\end_inset 
+
+short message
+\begin_inset Quotes erd
+\end_inset 
+
+ memory descriptor.
+ In this case, a 
+\begin_inset Quotes eld
+\end_inset 
+
+short message
+\begin_inset Quotes erd
+\end_inset 
+
+ memory descriptor will be declared full when it does not have sufficient
+ space for the largest small unexpected message.
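+\layout Standard
+
+The fragment below is a rough sketch of this strategy from the perspective
+ of an MPI implementation.
+ It is illustrative only: apart from 
+\family typewriter 
+max_offset
+\family default 
+, the field names, the option and unlink constants, and the exact 
+\shape italic 
+PtlMDAttach
+\shape default 
+ argument list are assumptions here; the normative definitions appear later
+ in this document.
+\layout LyX-Code
+
+/* Sketch only: one large memory descriptor absorbs consecutive small */
+\newline 
+/* unexpected messages and unlinks itself once too little room is left */
+\newline 
+/* for the largest small message. The buffer, match entry and event */
+\newline 
+/* queue handles used below are assumed to exist already. */
+\newline 
+ptl_md_t md;
+\newline 
+ptl_handle_md_t short_md;
+\newline 
+int rc;
+\newline 
+md.start = short_buf; /* one large slab of user memory */
+\newline 
+md.length = SHORT_BUF_BYTES;
+\newline 
+md.threshold = PTL_MD_THRESH_INF; /* assumed constant: no operation limit */
+\newline 
+md.max_offset = SHORT_BUF_BYTES - MAX_SMALL_MSG;
+\newline 
+md.options = PTL_MD_OP_PUT; /* assumed option: accept put operations */
+\newline 
+md.eventq = unexpected_eq;
+\newline 
+rc = PtlMDAttach(short_me, md, PTL_UNLINK, &short_md);
+\newline 
+/* Once the local offset exceeds md.max_offset, the descriptor and its */
+\newline 
+/* match entry are unlinked and the next posted descriptor takes over. */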
+\layout Standard
+
+This led to two significant changes.
+ First, each match list entry now has a single memory descriptor rather
+ than a list of memory descriptors.
+ Second, in addition to exceeding the operation threshold, a memory descriptor
+ can be unlinked when the local offset exceeds a specified value.
+ These changes have led to several updates in this document: 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{subsec:paddress}
+
+\end_inset 
+
+: 
+\begin_deeper 
+\layout Enumerate
+
+removed references to the memory descriptor list, 
+\layout Enumerate
+
+changed the portals address translation description to indicate that unlinking
+ a memory descriptor implies unlinking the associated match list entry--match
+ list entries can no longer be unlinked independently from the memory descriptor.
+\end_deeper 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+: 
+\begin_deeper 
+\layout Enumerate
+
+removed unlink from argument list, 
+\layout Enumerate
+
+removed description of 
+\family typewriter 
+ptl_unlink
+\family default 
+ type, 
+\layout Enumerate
+
+changed wording of the error condition when the Portal table index already
+ has an associated match list.
+\end_deeper 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:meinsert}
+
+\end_inset 
+
+: removed unlink from argument list.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+: added 
+\family typewriter 
+max_offset
+\family default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+: 
+\begin_deeper 
+\layout Enumerate
+
+added description of 
+\family typewriter 
+ptl_unlink
+\family default 
+ type, 
+\layout Enumerate
+
+removed reference to memory descriptor lists, 
+\layout Enumerate
+
+changed wording of the error condition when a match list entry already has
+ an associated memory descriptor, 
+\layout Enumerate
+
+changed the description of the 
+\family typewriter 
+unlink
+\family default 
+ argument.
+\end_deeper 
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+: removed 
+\family typewriter 
+PtlMDInsert
+\family default 
+ operation.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdbind}
+
+\end_inset 
+
+: removed references to memory descriptor list.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdfree}
+
+\end_inset 
+
+: removed reference to memory descriptor list.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:summary}
+
+\end_inset 
+
+: removed references to PtlMDInsert.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:semantics}
+
+\end_inset 
+
+: removed reference to memory descriptor list.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:exmpi}
+
+\end_inset 
+
+: revised the MPI example to reflect the changes to the interface.
+\layout Standard
+
+Several changes have been made to improve the general documentation of the
+ interface.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+: documented the special value 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+: documented the special value 
+\family typewriter 
+PTL_ID_ANY
+\family default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdbind}
+
+\end_inset 
+
+: documented the return value 
+\family typewriter 
+PTL_INV_EQ
+\family default 
+.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdupdate}
+
+\end_inset 
+
+: clarified the description of the 
+\emph on 
+PtlMDUpdate
+\emph default 
+ function.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:implvals}
+
+\end_inset 
+
+: introduced a new section to document the implementation defined values.
+\layout Enumerate
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:summary}
+
+\end_inset 
+
+: modified Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:oconsts}
+
+\end_inset 
+
+ to indicate where each constant is introduced and where it is used.
+\layout Section*
+
+Other changes
+\layout Subsection*
+
+Implementation defined limits (Section 
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+)
+\layout Standard
+
+The earlier version provided implementation defined limits for the maximum
+ number of match entries, the maximum number of memory descriptors, etc.
+ Rather than spanning the entire implementation, these limits are now associated
+ with individual network interfaces.
+\layout Subsection*
+
+Added User Ids (Section 
+\begin_inset LatexCommand \ref{sec:uid}
+
+\end_inset 
+
+)
+\layout Standard
+
+Group Ids had been used to simplify access control entries.
+ In particular, a process could allow access for all of the processes in
+ a group.
+ User Ids have been introduced to regain this functionality.
+\layout Subsection*
+
+Removed Group Ids and Rank Ids (Section 
+\begin_inset LatexCommand \ref{sec:pid}
+
+\end_inset 
+
+)
+\layout Standard
+
+The earlier version of Portals had two forms for addressing processes: <node
+ id, process id> and <group id, rank id>.
+ A process group was defined as the collection of processes created during
+ application launch.
+ Each process in the group was given a unique rank id in the range 0 to
+\begin_inset Formula $n-1$
+\end_inset 
+
+ where 
+\begin_inset Formula $n$
+\end_inset 
+
+ was the number of processes in the group.
+ We removed groups because they are better handled in the runtime system.
+\layout Subsection*
+
+Match lists (Section 
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+)
+\layout Standard
+
+It is no longer illegal to have an existing match entry when calling PtlMEAttach.
+ A position argument was added to the list of arguments supplied to 
+\emph on 
+PtlMEAttach
+\emph default 
+ to specify whether the new match entry is prepended or appended to the
+ existing list.
+ If there is no existing match list, the position argument is ignored.
+\layout Subsection*
+
+Unlinking Memory Descriptors (Section 
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+)
+\layout Standard
+
+Previously, a memory descriptor could be unlinked if the offset exceeded
+ a threshold upon the completion of an operation.
+ In this version, the unlinking is delayed until there is a matching operation
+ which requires more memory than is currently available in the descriptor.
+ In addition to changes in that section, this led to a revision of Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:flow}
+
+\end_inset 
+
+.
+\layout Subsection*
+
+Split Phase Operations and Events (Section 
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+)
+\layout Standard
+
+Previously, there were five types of events: 
+\family typewriter 
+PTL_EVENT_PUT
+\family default 
+, 
+\family typewriter 
+PTL_EVENT_GET
+\family default 
+, 
+\family typewriter 
+PTL_EVENT_REPLY
+\family default 
+, 
+\family typewriter 
+PTL_EVENT_SENT
+\family default 
+, and 
+\family typewriter 
+PTL_EVENT_ACK. 
+\family default 
+The first four of these reflected the completion of potentially long operations.
+ We have introduced new event types to reflect the fact that long operations
+ have a distinct starting point and a distinct completion point.
+ Moreover, the completion may be successful or unsuccessful.
+\layout Standard
+
+In addition to providing a mechanism for reporting failure to higher levels
+ of software, this split provides an opportunity for improved ordering
+ semantics.
+ Previously, if one process initiated two operations (e.g., two put operations)
+ on a remote process, these operations were guaranteed to complete in the
+ same order that they were initiated.
+ Now, we only guarantee that the initiation events are delivered in the
+ same order.
+ In particular, the operations do not need to complete in the order that
+ they were initiated.
+\layout Subsection*
+
+Well known process ids (Section 
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+)
+\layout Standard
+
+To support the notion of 
+\begin_inset Quotes eld
+\end_inset 
+
+well known process ids,
+\begin_inset Quotes erd
+\end_inset 
+
+ we added a process id argument to the arguments for PtlNIInit.
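+\layout Standard
+
+As a rough illustration of this change and of the per-interface limits
+ described earlier, the sketch below initializes a network interface while
+ asking for a well known process id and supplying a set of desired limits.
+ The limits structure, its field names, the interface constant and the exact 
+\shape italic 
+PtlNIInit
+\shape default 
+ argument list are assumptions here; the normative definition appears in
+ the section describing 
+\shape italic 
+PtlNIInit
+\shape default 
+.
+\layout LyX-Code
+
+/* Sketch only: request a well known process id and per-interface */
+\newline 
+/* limits when the interface is initialized. */
+\newline 
+ptl_ni_limits_t desired, actual; /* assumed type and field names */
+\newline 
+ptl_handle_ni_t ni;
+\newline 
+int rc;
+\newline 
+desired.max_match_entries = 1024;
+\newline 
+desired.max_memory_descriptors = 1024;
+\newline 
+desired.max_event_queues = 16;
+\newline 
+rc = PtlNIInit(PTL_IFACE_DEFAULT, MY_PID, &desired, &actual, &ni);
+\newline 
+/* MY_PID is a hypothetical well known process id agreed on by the */
+\newline 
+/* application; actual reports the limits that were actually granted. */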
+\layout Chapter*
+
+Glossary
+\layout Description
+
+API Application Programming Interface.
+ A definition of the functions and semantics provided by a library of functions.
+\layout Description
+
+Initiator A 
+\emph on 
+process
+\emph default 
+ that initiates a message operation.
+\layout Description
+
+Message An application-defined unit of data that is exchanged between 
+\emph on 
+processes
+\emph default 
+.
+\layout Description
+
+Message\SpecialChar ~
+Operation Either a put operation, which writes data, or a get operation,
+ which reads data.
+\layout Description
+
+Network A network provides point-to-point communication between 
+\emph on 
+nodes
+\emph default 
+.
+ Internally, a network may provide multiple routes between endpoints (to
+ improve fault tolerance or to improve performance characteristics); however,
+ multiple paths will not be exposed outside of the network.
+\layout Description
+
+Node A node is an endpoint in a 
+\emph on 
+network
+\emph default 
+.
+ Nodes provide processing capabilities and memory.
+ A node may provide multiple processors (an SMP node) or it may act as a
+\emph on 
+gateway
+\emph default 
+ between networks.
+\layout Description
+
+Process A context of execution.
+ A process defines a virtual memory (VM) context.
+ This context is not shared with other processes.
+ Several threads may share the VM context defined by a process.
+\layout Description
+
+Target A 
+\emph on 
+process
+\emph default 
+ that is acted upon by a message operation.
+\layout Description
+
+Thread A context of execution that shares a VM context with other threads.
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+cleardoublepage
+\layout Standard
+
+\backslash 
+setcounter{page}{1}
+\backslash 
+pagenumbering{arabic}
+\end_inset 
+
+
+\layout Chapter
+
+Introduction
+\begin_inset LatexCommand \label{sec:intro}
+
+\end_inset 
+
+
+\layout Section
+
+Overview
+\layout Standard
+
+This document describes an application programming interface for message
+ passing between nodes in a system area network.
+ The goal of this interface is to improve the scalability and performance
+ of network communication by defining the functions and semantics of message
+ passing required for scaling a parallel computing system to ten thousand
+ nodes.
+ This goal is achieved by providing an interface that will allow a quality
+ implementation to take advantage of the inherently scalable design of Portals.
+\layout Standard
+
+This document is divided into several sections: 
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:intro}
+
+\end_inset 
+
+---Introduction This section describes the purpose and scope of the Portals
+ API.
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:apiover}
+
+\end_inset 
+
+---An\SpecialChar ~
+Overview\SpecialChar ~
+of\SpecialChar ~
+the\SpecialChar ~
+Portals\SpecialChar ~
+3.2\SpecialChar ~
+API This section gives a brief overview of the
+ Portals API.
+ The goal is to introduce the key concepts and terminology used in the descripti
+on of the API.
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:api}
+
+\end_inset 
+
+---The\SpecialChar ~
+Portals\SpecialChar ~
+3.2\SpecialChar ~
+API This section describes the functions and semantics of
+ the Portals application programming interface.
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:semantics}
+
+\end_inset 
+
+---The\SpecialChar ~
+Semantics\SpecialChar ~
+of\SpecialChar ~
+Message\SpecialChar ~
+Transmission This section describes the semantics
+ of message transmission.
+ In particular, it describes the information transmitted in each type of
+ message and the processing of incoming messages.
+\layout Description
+
+Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:examples}
+
+\end_inset 
+
+---Examples This section presents several examples intended to illustrate
+ the use of the Portals API.
+\layout Section
+
+Purpose
+\layout Standard
+
+Existing message passing technologies available for commodity cluster networking
+ hardware do not meet the scalability goals required by the Cplant\SpecialChar ~
+
+\begin_inset LatexCommand \cite{Cplant}
+
+\end_inset 
+
+ project at Sandia National Laboratories.
+ The goal of the Cplant project is to construct a commodity cluster that
+ can scale to the order of ten thousand nodes.
+ This number greatly exceeds the capacity for which existing message passing
+ technologies have been designed and implemented.
+\layout Standard
+
+In addition to the scalability requirements of the network, these technologies
+ must also be able to support a scalable implementation of the Message Passing
+ Interface (MPI)\SpecialChar ~
+
+\begin_inset LatexCommand \cite{MPIstandard}
+
+\end_inset 
+
+ standard, which has become the 
+\shape italic 
+de facto
+\shape default 
+ standard for parallel scientific computing.
+ While MPI does not impose any scalability limitations, existing message
+ passing technologies do not provide the functionality needed to allow implement
+ations of MPI to meet the scalability requirements of Cplant.
+\layout Standard
+
+The following are properties of a network architecture that do not impose
+ any inherent scalability limitations: 
+\layout Itemize
+
+Connectionless - Many connection-oriented architectures, such as VIA\SpecialChar ~
+
+\begin_inset LatexCommand \cite{VIA}
+
+\end_inset 
+
+ and TCP/IP sockets, have limitations on the number of peer connections
+ that can be established.
+\layout Itemize
+
+Network independence - Many communication systems depend on the host processor
+ to perform operations in order for messages in the network to be consumed.
+ Message consumption from the network should not be dependent on host processor
+ activity, such as the operating system scheduler or user-level thread scheduler.
+\layout Itemize
+
+User-level flow control - Many communication systems manage flow control
+ internally to avoid depleting resources, which can significantly impact
+ performance as the number of communicating processes increases.
+\layout Itemize
+
+OS Bypass - High performance network communication should not involve memory
+ copies into or out of a kernel-managed protocol stack.
+\layout Standard
+
+The following are properties of a network architecture that do not impose
+ scalability limitations for an implementation of MPI:
+\layout Itemize
+
+Receiver-managed - Sender-managed message passing implementations require
+ a persistent block of memory to be available for every process, requiring
+ memory resources to increase with job size and requiring user-level flow
+ control mechanisms to manage these resources.
+\layout Itemize
+
+User-level Bypass - While OS Bypass is necessary for high-performance, it
+ alone is not sufficient to support the Progress Rule of MPI asynchronous
+ operations.
+\layout Itemize
+
+Unexpected messages - Few communication systems have support for receiving
+ messages for which there is no prior notification.
+ Support for these types of messages is necessary to avoid flow control
+ and protocol overhead.
+\layout Section
+
+Background
+\layout Standard
+
+Portals was originally designed for and implemented on the nCube machine
+ as part of the SUNMOS (Sandia/UNM OS)\SpecialChar ~
+
+\begin_inset LatexCommand \cite{SUNMOS}
+
+\end_inset 
+
+ and Puma\SpecialChar ~
+
+\begin_inset LatexCommand \cite{PumaOS}
+
+\end_inset 
+
+ lightweight kernel development projects.
+ Portals went through two design phases, the latter of which is used on
+ the 4500-node Intel TeraFLOPS machine\SpecialChar ~
+
+\begin_inset LatexCommand \cite{TFLOPS}
+
+\end_inset 
+
+.
+ Portals have been very successful in meeting the needs of such a large
+ machine, not only as a layer for a high-performance MPI implementation\SpecialChar ~
+
+\begin_inset LatexCommand \cite{PumaMPI}
+
+\end_inset 
+
+, but also for implementing the scalable run-time environment and parallel
+ I/O capabilities of the machine.
+\layout Standard
+
+The second generation Portals implementation was designed to take full advantage
+ of the hardware architecture of large MPP machines.
+ However, efforts to implement this same design on commodity cluster technology
+ identified several limitations, due to the differences in network hardware
+ as well as to shortcomings in the design of Portals.
+\layout Section
+
+Scalability
+\layout Standard
+
+The primary goal in the design of Portals is scalability.
+ Portals are designed specifically for an implementation capable of supporting
+ a parallel job running on tens of thousands of nodes.
+ Performance is critical only in terms of scalability.
+ That is, the level of message passing performance is characterized by how
+ far it allows an application to scale and not by how it performs in micro-bench
+marks (e.g., a two node bandwidth or latency test).
+\layout Standard
+
+The Portals API is designed to allow for scalability, not to guarantee it.
+ Portals cannot overcome the shortcomings of a poorly designed application
+ program.
+ Applications that have inherent scalability limitations, either through
+ design or implementation, will not be transformed by Portals into scalable
+ applications.
+ Scalability must be addressed at all levels.
+ Portals do not inhibit scalability, but do not guarantee it either.
+\layout Standard
+
+To support scalability, the Portals interface maintains a minimal amount
+ of state.
+ Portals provide reliable, ordered delivery of messages between pairs of
+ processes.
+ They are connectionless: a process is not required to explicitly establish
+ a point-to-point connection with another process in order to communicate.
+ Moreover, all buffers used in the transmission of messages are maintained
+ in user space.
+ The target process determines how to respond to incoming messages, and
+ messages for which there are no buffers are discarded.
+\layout Section
+
+Communication Model
+\layout Standard
+
+Portals combine the characteristics of both one-sided and two-sided communication.
+ They define a 
+\begin_inset Quotes eld
+\end_inset 
+
+matching put
+\begin_inset Quotes erd
+\end_inset 
+
+ operation and a 
+\begin_inset Quotes eld
+\end_inset 
+
+matching get
+\begin_inset Quotes erd
+\end_inset 
+
+ operation.
+ The destination of a put (or send) is not an explicit address; instead,
+ each message contains a set of match bits that allow the receiver to determine
+ where incoming messages should be placed.
+ This flexibility allows Portals to support both traditional one-sided operation
+s and two-sided send/receive operations.
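+\layout Standard
+
+For example, a sender performing a matching put names the target by process
+ id, portal table index and match bits rather than by a remote address,
+ as in the sketch below.
+ The argument order, the acknowledgement constant and the nid/pid field
+ names are illustrative assumptions; 
+\shape italic 
+PtlPut
+\shape default 
+ itself is specified later in this document.
+\layout LyX-Code
+
+/* Sketch only: send_md, pt_index and match_bits are assumed to exist. */
+\newline 
+ptl_process_id_t target;
+\newline 
+int rc;
+\newline 
+target.nid = peer_nid; /* node id of the target process */
+\newline 
+target.pid = peer_pid; /* process id on that node */
+\newline 
+rc = PtlPut(send_md, PTL_ACK_REQ, target, pt_index, 0, match_bits, 0, 0);
+\newline 
+/* No remote address is supplied: the match bits select a match list */
+\newline 
+/* entry, and hence a memory descriptor, at the target, and an */
+\newline 
+/* acknowledgement is returned if one was requested. */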
+\layout Standard
+
+Portals allows the target to determine whether incoming messages are acceptable.
+ A target process can choose to accept message operations from any specific
+ process or can choose to ignore message operations from any specific process.
+\layout Section
+
+Zero Copy, OS Bypass and Application Bypass
+\layout Standard
+
+In traditional system architectures, network packets arrive at the network
+ interface card (NIC), are passed through one or more protocol layers in
+ the operating system, and eventually copied into the address space of the
+ application.
+ As network bandwidth began to approach memory copy rates, reduction of
+ memory copies became a critical concern.
+ This concern led to the development of zero-copy message passing protocols
+ in which message copies are eliminated or pipelined to avoid the loss of
+ bandwidth.
+\layout Standard
+
+A typical zero-copy protocol has the NIC generate an interrupt for the CPU
+ when a message arrives from the network.
+ The interrupt handler then controls the transfer of the incoming message
+ into the address space of the appropriate application.
+ The interrupt latency, the time from the initiation of an interrupt until
+ the interrupt handler is running, is fairly significant.
+ To avoid this cost, some modern NICs have processors that can be programmed
+ to implement part of a message passing protocol.
+ Given a properly designed protocol, it is possible to program the NIC to
+ control the transfer of incoming messages, without needing to interrupt
+ the CPU.
+ Because this strategy does not need to involve the OS on every message
+ transfer, it is frequently called 
+\begin_inset Quotes eld
+\end_inset 
+
+OS Bypass.
+\begin_inset Quotes erd
+\end_inset 
+
+ ST\SpecialChar ~
+
+\begin_inset LatexCommand \cite{ST}
+
+\end_inset 
+
+, VIA\SpecialChar ~
+
+\begin_inset LatexCommand \cite{VIA}
+
+\end_inset 
+
+, FM\SpecialChar ~
+
+\begin_inset LatexCommand \cite{FM2}
+
+\end_inset 
+
+, GM\SpecialChar ~
+
+\begin_inset LatexCommand \cite{GM}
+
+\end_inset 
+
+, and Portals are examples of OS Bypass protocols.
+\layout Standard
+
+Many protocols that support OS Bypass still require that the application
+ actively participate in the protocol to ensure progress.
+ As an example, the long message protocol of PM requires that the application
+ receive and reply to a request to put or get a long message.
+ This complicates the runtime environment, requiring a thread to process
+ incoming requests, and significantly increases the latency required to
+ initiate a long message protocol.
+ The Portals message passing protocol does not require activity on the part
+ of the application to ensure progress.
+ We use the term 
+\begin_inset Quotes eld
+\end_inset 
+
+Application Bypass
+\begin_inset Quotes erd
+\end_inset 
+
+ to refer to this aspect of the Portals protocol.
+\layout Section
+
+Faults 
+\layout Standard
+
+Given the number of components that we are dealing with and the fact that
+ we are interested in supporting applications that run for very long times,
+ failures are inevitable.
+ The Portals API recognizes that the underlying transport may not be able
+ to successfully complete an operation once it has been initiated.
+ This is reflected in the fact that the Portals API reports three types
+ of events: events indicating the initiation of an operation, events indicating
+ the successful completion of an operation, and events indicating the unsuccessf
+ul completion of an operation.
+ Every initiation event is eventually followed by a successful completion
+ event or an unsuccessful completion event.
+\layout Standard
+
+Between the time an operation is started and the time that the operation
+ completes (successfully or unsuccessfully), any memory associated with
+ the operation should be considered volatile.
+ That is, the memory may be changed in unpredictable ways while the operation
+ is progressing.
+ Once the operation completes, the memory associated with the operation
+ will not be subject to further modification (from this operation).
+ Notice that unsuccessful operations may alter memory in an essentially
+ unpredictable fashion.
+\layout Chapter
+
+An Overview of the Portals API
+\begin_inset LatexCommand \label{sec:apiover}
+
+\end_inset 
+
+
+\layout Standard
+
+In this chapter, we give a conceptual overview of the Portals API.
+ The goal is to provide a context for understanding the detailed description
+ of the API presented in the next chapter.
+\layout Section
+
+Data Movement
+\begin_inset LatexCommand \label{sec:dmsemantics}
+
+\end_inset 
+
+
+\layout Standard
+
+A Portal represents an opening in the address space of a process.
+ Other processes can use a Portal to read (get) or write (put) the memory
+ associated with the portal.
+ Every data movement operation involves two processes, the 
+\series bold 
+initiator
+\series default 
+ and the 
+\series bold 
+target
+\series default 
+.
+ The initiator is the process that initiates the data movement operation.
+ The target is the process that responds to the operation by either accepting
+ the data for a put operation, or replying with the data for a get operation.
+\layout Standard
+
+In this discussion, activities attributed to a process may refer to activities
+ that are actually performed by the process or 
+\emph on 
+on behalf of the process
+\emph default 
+.
+ The inclusiveness of our terminology is important in the context of 
+\emph on 
+application bypass
+\emph default 
+.
+ In particular, when we note that the target sends a reply in the case of
+ a get operation, it is possible that the reply will be generated by another
+ component in the system, bypassing the application.
+\layout Standard
+
+Figures\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:put}
+
+\end_inset 
+
+ and 
+\begin_inset LatexCommand \ref{fig:get}
+
+\end_inset 
+
+ present graphical interpretations of the Portal data movement operations:
+ put and get.
+ In the case of a put operation, the initiator sends a put request message
+ containing the data to the target.
+ The target translates the Portal addressing information in the request
+ using its local Portal structures.
+ When the request has been processed, the target optionally sends an acknowledge
+ment message.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename put.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 218pt
+       lyxheight 119pt
+\end_inset 
+
+
+\layout Caption
+
+Portal Put (Send)
+\begin_inset LatexCommand \label{fig:put}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+In the case of a get operation, the initiator sends a get request to the
+ target.
+ As with the put operation, the target translates the Portal addressing
+ information in the request using its local Portal structures.
+ Once it has translated the Portal addressing information, the target sends
+ a reply that includes the requested data.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename get.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 218pt
+       lyxheight 119pt
+\end_inset 
+
+
+\layout Caption
+
+Portal Get
+\begin_inset LatexCommand \label{fig:get}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+We should note that Portal address translations are only performed on nodes
+ that respond to operations initiated by other nodes.
+ Acknowledgements and replies to get operations bypass the portals address
+ translation structures.
+\layout Section
+
+Portal Addressing
+\begin_inset LatexCommand \label{subsec:paddress}
+
+\end_inset 
+
+
+\layout Standard
+
+One-sided data movement models (e.g., shmem\SpecialChar ~
+
+\begin_inset LatexCommand \cite{CraySHMEM}
+
+\end_inset 
+
+, ST\SpecialChar ~
+
+\begin_inset LatexCommand \cite{ST}
+
+\end_inset 
+
+, MPI-2\SpecialChar ~
+
+\begin_inset LatexCommand \cite{MPI2}
+
+\end_inset 
+
+) typically use a triple to address memory on a remote node.
+ This triple consists of a process id, memory buffer id, and offset.
+ The process id identifies the target process, the memory buffer id specifies
+ the region of memory to be used for the operation, and the offset specifies
+ an offset within the memory buffer.
+\layout Standard
+
+In addition to the standard address components (process id, memory buffer
+ id, and offset), a Portal address includes a set of match bits.
+ This addressing model is appropriate for supporting one-sided operations
+ as well as traditional two-sided message passing operations.
+ Specifically, the Portals API provides the flexibility needed for an efficient
+ implementation of MPI-1, which defines two-sided operations with one-sided
+ completion semantics.
+\layout Standard
+
+Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:portals}
+
+\end_inset 
+
+ presents a graphical representation of the structures used by a target
+ in the interpretation of a Portal address.
+ The process id is used to route the message to the appropriate node and
+ is not reflected in this diagram.
+ The memory buffer id, called the 
+\series bold 
+portal id
+\series default 
+, is used as an index into the Portal table.
+ Each element of the Portal table identifies a match list.
+ Each element of the match list specifies two bit patterns: a set of 
+\begin_inset Quotes eld
+\end_inset 
+
+don't care
+\begin_inset Quotes erd
+\end_inset 
+
+ bits, and a set of 
+\begin_inset Quotes eld
+\end_inset 
+
+must match
+\begin_inset Quotes erd
+\end_inset 
+
+ bits.
+ In addition to the two sets of match bits, each match list element has
+ at most one memory descriptor.
+ Each memory descriptor identifies a memory region and an optional event
+ queue.
+ The memory region specifies the memory to be used in the operation and
+ the event queue is used to record information about these operations.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename portals.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 305pt
+       lyxheight 106pt
+\end_inset 
+
+
+\layout Caption
+
+Portal Addressing Structures
+\begin_inset LatexCommand \label{fig:portals}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:flow}
+
+\end_inset 
+
+ illustrates the steps involved in translating a Portal address, starting
+ from the first element in a match list.
+ If the match criteria specified in the match list entry are met and the
+ memory descriptor accepts the operation
+\begin_inset Foot
+collapsed true
+
+\layout Standard
+
+Memory descriptors can reject operations because a threshold has been exceeded
+ or because the memory region does not have sufficient space; see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+
+, the operation (put or get) is performed using the memory region specified
+ in the memory descriptor.
+ If the memory descriptor specifies that it is to be unlinked when a threshold
+ has been exceeded, the match list entry is removed from the match list
+ and the resources associated with the memory descriptor and match list
+ entry are reclaimed.
+ Finally, if there is an event queue specified in the memory descriptor,
+ the operation is logged in the event queue.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename flow_new.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 447pt
+       lyxheight 282pt
+\end_inset 
+
+
+\layout Caption
+
+Portals Address Translation
+\begin_inset LatexCommand \label{fig:flow}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+If the match criteria specified in the match list entry are not met, or
+ there is no memory descriptor associated with the match list entry, or
+ the memory descriptor associated with the match list entry rejects the
+ operation, the address translation continues with the next match list entry.
+ If the end of the match list has been reached, the address translation
+ is aborted and the incoming request is discarded.
+\layout Section
+
+Access Control
+\layout Standard
+
+A process can control access to its portals using an access control list.
+ Each entry in the access control list specifies a process id and a Portal
+ table index.
+ The access control list is actually an array of entries.
+ Each incoming request includes an index into the access control list (i.e.,
+ a 
+\begin_inset Quotes eld
+\end_inset 
+
+cookie
+\begin_inset Quotes erd
+\end_inset 
+
+ or hint).
+ If the id of the process issuing the request doesn't match the id specified
+ in the access control list entry or the Portal table index specified in
+ the request doesn't match the Portal table index specified in the access
+ control list entry, the request is rejected.
+ Process identifiers and Portal table indexes may include wild card values
+ to increase the flexibility of this mechanism.
+\layout Standard
+
+Two aspects of this design merit further discussion.
+ First, the model assumes that the information in a message header, the
+ sender's id in particular, is trustworthy.
+ In most contexts, we assume that the entity that constructs the header
+ is trustworthy; however, using cryptographic techniques, we could easily
+ devise a protocol that would ensure the authenticity of the sender.
+\layout Standard
+
+Second, because the access check is performed by the receiver, it is possible
+ that a malicious process will generate thousands of messages that will
+ be denied by the receiver.
+ This could saturate the network and/or the receiver, resulting in a 
+\emph on 
+denial of service
+\emph default 
+ attack.
+ Moving the check to the sender, using capabilities, would remove the potential
+ for this form of attack.
+ However, this solution introduces the complexities of capability management
+ (exchange of capabilities, revocation, protections, etc.).
+\layout Section
+
+Multi-threaded Applications
+\begin_inset LatexCommand \label{sec:threads}
+
+\end_inset 
+
+\layout Standard
+
+The Portals API supports a generic view of multi-threaded applications.
+ From the perspective of the Portals API, an application program is defined
+ by a set of processes.
+ Each process defines a unique address space.
+ The Portals API defines access to this address space from other processes
+ (using portals addressing and the data movement operations).
+ A process may have one or more 
+\emph on 
+threads
+\emph default 
+ executing in its address space.
+\layout Standard
+
+With the exception of 
+\emph on 
+PtlEQWait
+\emph default 
+, every function in the Portals API is non-blocking and atomic with respect
+ to both other threads and external operations that result from data movement
+ operations.
+ While individual operations are atomic, sequences of these operations may
+ be interleaved between different threads and with external operations.
+ The Portals API does not provide any mechanisms to control this interleaving.
+ It is expected that these mechanisms will be provided by the API used to
+ create threads.
+\layout Chapter
+
+The Portals API
+\begin_inset LatexCommand \label{sec:api}
+
+\end_inset 
+
+
+\layout Section
+
+Naming Conventions
+\begin_inset LatexCommand \label{sec:conv}
+
+\end_inset 
+
+
+\layout Standard
+
+The Portals API defines two types of entities: functions and types.
+ Function names always start with 
+\emph on 
+Ptl
+\emph default 
+ and use mixed upper and lower case.
+ When used in the body of this report, function names appear in italic face,
+ e.g., 
+\emph on 
+PtlInit
+\emph default 
+.
+ The functions associated with an object type will have names that start
+ with 
+\emph on 
+Ptl
+\emph default 
+, followed by the two letter object type code shown in Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:objcodes}
+
+\end_inset 
+
+.
+ As an example, the function 
+\emph on 
+PtlEQAlloc
+\emph default 
+ allocates resources for an event queue.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Object Type Codes
+\begin_inset LatexCommand \label{tab:objcodes}
+
+\end_inset 
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+\backslash 
+medskip
+\newline 
+  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="5" columns="3">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\emph on 
+xx
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Name 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Section 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+EQ 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Event Queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ MD 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Memory Descriptor 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ ME 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Match list Entry 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ NI 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Network Interface 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Type names use lower case with underscores to separate words.
+ Each type name starts with 
+\family typewriter 
+ptl
+\family default 
+_ and ends with 
+\family typewriter 
+_t
+\family default 
+.
+ When used in the body of this report, type names appear in a fixed font,
+ e.g., 
+\family typewriter 
+ptl_match_bits_t
+\family default 
+.
+\layout Standard
+
+Names for constants use upper case with underscores to separate words.
+ Each constant name starts with 
+\family typewriter 
+PTL_
+\family default 
+.
+ When used in the body of this report, constant names appear in a fixed font,
+ e.g., 
+\family typewriter 
+PTL_OK
+\family default 
+.
+\layout Section
+
+Base Types
+\layout Standard
+
+The Portals API defines a variety of base types.
+ These types represent a simple renaming of the base types provided by the
+ C programming language.
+ In most cases these new type names have been introduced to improve type
+ safety and to avoid issues arising from differences in representation sizes
+ (e.g., 16-bit or 32-bit integers).
+\layout Subsection
+
+Sizes
+\begin_inset LatexCommand \label{sec:size-t}
+
+\end_inset 
+
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_size_t
+\family default 
+ is an unsigned 64-bit integral type used for representing sizes.
+\layout Subsection
+
+Handles
+\begin_inset LatexCommand \label{sec:handle-type}
+
+\end_inset 
+
+\layout Standard
+
+Objects maintained by the API are accessed through handles.
+ Handle types have names of the form 
+\family typewriter 
+ptl_handle_
+\emph on 
+xx
+\emph default 
+_t
+\family default 
+, where 
+\emph on 
+xx
+\emph default 
+ is one of the two letter object type codes shown in Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:objcodes}
+
+\end_inset 
+
+.
+ For example, the type 
+\family typewriter 
+ptl_handle_ni_t
+\family default 
+ is used for network interface handles.
+\layout Standard
+
+Each type of object is given a unique handle type to enhance type checking.
+ The type, 
+\family typewriter 
+ptl_handle_any_t
+\family default 
+, can be used when a generic handle is needed.
+ Every handle value can be converted into a value of type 
+\family typewriter 
+ptl_handle_any_t
+\family default 
+ without loss of information.
+\layout Standard
+
+Handles are not simple values.
+ Every portals object is associated with a specific network interface and
+ an identifier for this interface (along with an object identifier) is part
+ of the handle for the object.
+\layout Standard
+
+The special value 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+, of type 
+\family typewriter 
+ptl_handle_eq_t
+\family default 
+, is used to indicate the absence of an event queue.
+ See sections 
+\begin_inset LatexCommand \ref{sec:mdfree}
+
+\end_inset 
+
+ and\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:mdupdate}
+
+\end_inset 
+
+ for uses of this value.
+\layout Subsection
+
+Indexes
+\begin_inset LatexCommand \label{sec:index-type}
+
+\end_inset 
+
+\layout Standard
+
+The types 
+\family typewriter 
+ptl_pt_index_t
+\family default 
+ and 
+\family typewriter 
+ptl_ac_index_t
+\family default 
+ are integral types used for representing Portal table indexes and access
+ control table indexes, respectively.
+ See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:niinit}
+
+\end_inset 
+
+ for limits on values of these types.
+\layout Subsection
+
+Match Bits
+\begin_inset LatexCommand \label{sec:mb-type}
+
+\end_inset 
+
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_match_bits_t
+\family default 
+ is capable of holding unsigned 64-bit integer values.
+\layout Subsection
+
+Network Interfaces
+\begin_inset LatexCommand \label{sec:ni-type}
+
+\end_inset 
+
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_interface_t
+\family default 
+ is an integral type used for identifying different network interfaces.
+ Users will need to consult the local documentation to determine appropriate
+ values for the interfaces available.
+ The special value 
+\family typewriter 
+PTL_IFACE_DEFAULT
+\family default 
+ identifies the default interface.
+\layout Subsection
+
+Identifiers
+\begin_inset LatexCommand \label{sec:id-type}
+
+\end_inset 
+
+
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_nid_t
+\family default 
+ is an integral type used for representing node ids
+\family typewriter 
+, ptl_pid_t
+\family default 
+ is an integral type for representing process ids, and 
+\family typewriter 
+ptl_uid_t 
+\family default 
+is an integral type for representing user ids.
+\layout Standard
+
+The special values 
+\family typewriter 
+PTL_PID_ANY
+\family default 
+, 
+\family typewriter 
+PTL_NID_ANY
+\family default 
+, and 
+\family typewriter 
+PTL_UID_ANY
+\family default 
+ match any process identifier, any node identifier, and any user identifier,
+ respectively.
+ See sections 
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+ and\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+ for uses of these values.
+\layout Subsection
+
+Status Registers
+\begin_inset LatexCommand \label{sec:stat-type}
+
+\end_inset 
+
+
+\layout Standard
+
+Each network interface maintains an array of status registers that can be
+ accessed using the 
+\family typewriter 
+PtlNIStatus
+\family default 
+ function (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:nistatus}
+
+\end_inset 
+
+).
+ The type 
+\family typewriter 
+ptl_sr_index_t
+\family default 
+ defines the types of indexes that can be used to access the status registers.
+ The only index defined for all implementations is 
+\family typewriter 
+PTL_SR_DROP_COUNT
+\family default 
+, which identifies the status register that counts the dropped requests for
+ the interface.
+ Other indexes (and registers) may be defined by the implementation.
+\layout Standard
+
+The type 
+\family typewriter 
+ptl_sr_value_t
+\family default 
+ defines the types of values held in status registers.
+ This is a signed integer type.
+ The size is implementation dependent, but must be at least 32 bits.
+\layout Section
+
+Initialization and Cleanup
+\begin_inset LatexCommand \label{sec:init}
+
+\end_inset 
+
+
+\layout Standard
+
+The Portals API includes a function, 
+\emph on 
+PtlInit
+\emph default 
+, to initialize the library and a function, 
+\emph on 
+PtlFini
+\emph default 
+, to clean up after the application is done using the library.
+\layout Subsection
+
+PtlInit
+\begin_inset LatexCommand \label{sec:ptlinit}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlInit( int *max_interfaces );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlInit
+\emph default 
+ function initializes the Portals library.
+ PtlInit must be called at least once by a process before any thread makes
+ a Portals function call, but may be safely called more than once.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_FAIL Indicates an error during initialization.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+max_interfaces
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="5in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+max_interfaces
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the maximum number of interfaces
+ that can be initialized.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlFini
+\begin_inset LatexCommand \label{sec:ptlfini}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+void PtlFini( void );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlFini
+\emph default 
+ function cleans up after the Portals library is no longer needed by a process.
+ After this function is called, calls to any of the functions defined by
+ the Portal API or use of the structures set up by the Portals API will
+ result in undefined behavior.
+ This function should be called once and only once during termination by
+ a process.
+ Typically, this function will be called in the exit sequence of a process.
+ Individual threads should not call PtlFini when they terminate.
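+\layout Standard
+
+The following fragment is an illustrative sketch and not part of the API
+ definition; the function name 
+\family typewriter 
+example_lifetime
+\family default 
+ is hypothetical, and error handling is reduced to checking for 
+\family typewriter 
+PTL_OK
+\family default 
+.
+\layout LyX-Code
+
+/* Illustrative only: assumes the Portals header provided by the
+\newline 
+   implementation has been included. */
+\newline 
+int example_lifetime( void )
+\newline 
+{
+\newline 
+    int max_interfaces;
+\newline 
+    int rc = PtlInit( &max_interfaces );  /* safe to call more than once */
+\newline 
+
+\newline 
+    if (rc != PTL_OK)
+\newline 
+        return rc;                        /* PTL_FAIL or PTL_SEGV */
+\newline 
+
+\newline 
+    /* ... use the Portals API ... */
+\newline 
+
+\newline 
+    PtlFini();      /* once and only once, during process termination */
+\newline 
+    return PTL_OK;
+\newline 
+}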
+\layout Section
+
+Network Interfaces
+\begin_inset LatexCommand \label{sec:ni}
+
+\end_inset 
+
+\layout Standard
+
+The Portals API supports the use of multiple network interfaces.
+ However, each interface is treated as an independent entity.
+ Combining interfaces (e.g., 
+\begin_inset Quotes eld
+\end_inset 
+
+bonding
+\begin_inset Quotes erd
+\end_inset 
+
+ to create a higher bandwidth connection) must be implemented by the application
+ or embedded in the underlying network.
+ Interfaces are treated as independent entities to make it easier to cache
+ information on individual network interface cards.
+\layout Standard
+
+Once initialized, each interface provides a Portal table, an access control
+ table, and a collection of status registers.
+ See Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+ for a discussion of updating Portal table entries using the 
+\emph on 
+PtlMEAttach
+\emph default 
+ function.
+ See Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ac}
+
+\end_inset 
+
+ for a discussion of the initialization and updating of entries in the access
+ control table.
+ See Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:nistatus}
+
+\end_inset 
+
+ for a discussion of the 
+\emph on 
+PtlNIStatus
+\emph default 
+ function which can be used to determine the value of a status register.
+\layout Standard
+
+Every other type of Portal object (e.g., memory descriptor, event queue, or
+ match list entry) is associated with a specific network interface.
+ The association to a network interface is established when the object is
+ created and is encoded in the handle for the object.
+\layout Standard
+
+Each network interface is initialized and shut down independently.
+ The initialization routine, 
+\emph on 
+PtlNIInit
+\emph default 
+, returns a handle for an interface object which is used in all subsequent
+ Portal operations.
+ The 
+\emph on 
+PtlNIFini
+\emph default 
+ function is used to shut down an interface and release any resources that
+ are associated with the interface.
+ Network interface handles are associated with processes, not threads.
+ All threads in a process share all of the network interface handles.
+\layout Standard
+
+The Portals API also defines the 
+\emph on 
+PtlNIStatus
+\emph default 
+ function to query the status registers for a network interface, the 
+\emph on 
+PtlNIDist
+\emph default 
+ function to determine the 
+\begin_inset Quotes eld
+\end_inset 
+
+distance
+\begin_inset Quotes erd
+\end_inset 
+
+ to another process, and the 
+\emph on 
+PtlNIHandle
+\emph default 
+ function to determine the network interface that an object is associated
+ with.
+\layout Subsection
+
+PtlNIInit
+\begin_inset LatexCommand \label{sec:niinit}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef struct {
+\newline 
+    int            max_match_entries;
+\newline 
+    int            max_mem_descriptors;
+\newline 
+    int            max_event_queues;
+\newline 
+    ptl_ac_index_t max_atable_index; 
+\newline 
+    ptl_pt_index_t max_ptable_index;
+\newline 
+} ptl_ni_limits_t;
+\newline 
+
+\newline 
+int PtlNIInit( ptl_interface_t  interface,
+\newline 
+               ptl_pid_t        pid,
+\newline 
+               ptl_ni_limits_t* desired,
+\newline 
+               ptl_ni_limits_t* actual,
+\newline 
+               ptl_handle_ni_t* handle );
+\layout Standard
+
+Values of type 
+\family typewriter 
+ptl_ni_limits_t
+\family default 
+ include the following members:
+\layout Description
+
+max_match_entries Maximum number of match entries that can be allocated
+ at any one time.
+\layout Description
+
+max_mem_descriptors Maximum number of memory descriptors that can be allocated
+ at any one time.
+\layout Description
+
+max_event_queues Maximum number of event queues that can be allocated at
+ any one time.
+\layout Description
+
+max_atable_index Largest access control table index for this interface,
+ valid indexes range from zero to 
+\family typewriter 
+max_atable_index
+\family default 
+, inclusive.
+\layout Description
+
+max_ptable_index Largest Portal table index for this interface, valid indexes
+ range from zero to 
+\family typewriter 
+max_ptable_index
+\family default 
+, inclusive.
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIInit
+\emph default 
+ function is used to initialize the Portals API for a network interface.
+ This function must be called at least once by a process before any thread
+ in that process performs any other operation that applies to the interface.
+ For subsequent calls to 
+\shape italic 
+PtlNIInit
+\shape default 
+ from within the same process (either by different threads or the same thread),
+ the desired limits will be ignored and the call will return the existing
+ NI handle.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INIT_DUP Indicates a duplicate initialization of 
+\family typewriter 
+interface
+\family default 
+.
+\layout Description
+
+PTL_INIT_INV Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to initialize the
+ interface.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+pid
+\family default 
+ is not a valid process id.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+actual 
+\family default 
+or
+\family typewriter 
+ handle
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="5" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the network interface to be initialized.
+  (See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ni-type}
+
+\end_inset 
+
+ for a discussion of  values used to identify network interfaces.)
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+pid
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the desired process id (for well known process ids).
+ The value 
+\family typewriter 
+PTL_PID_ANY
+\family default 
+ may be used to have the process id assigned by the underlying library.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+desired
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+If non-NULL, points to a structure that holds the desired limits.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+actual
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, the location pointed to by actual will hold the actual
+ limits.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a  handle for the interface.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+The use of desired is implementation dependent.
+ In particular, an implementation may choose to ignore this argument.
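+\layout Standard
+
+As an illustrative sketch (not a normative example), the fragment below
+ initializes the default interface, letting the library assign the process
+ id and passing NULL for the desired limits; the function name 
+\family typewriter 
+open_default_interface
+\family default 
+ is hypothetical.
+\layout LyX-Code
+
+/* Illustrative only: bring up the default interface. */
+\newline 
+int open_default_interface( ptl_handle_ni_t* ni )
+\newline 
+{
+\newline 
+    ptl_ni_limits_t actual;
+\newline 
+
+\newline 
+    return PtlNIInit( PTL_IFACE_DEFAULT, /* default network interface   */
+\newline 
+                      PTL_PID_ANY,       /* library assigns the pid     */
+\newline 
+                      NULL,              /* no desired limits requested */
+\newline 
+                      &actual,           /* actual limits returned here */
+\newline 
+                      ni );              /* interface handle on success */
+\newline 
+}
+\layout Standard
+
+A matching call to 
+\emph on 
+PtlNIFini
+\emph default 
+ releases the resources associated with the interface when it is no longer
+ needed.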
+\layout Subsection
+
+PtlNIFini
+\begin_inset LatexCommand \label{sec:nifini}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlNIFini( ptl_handle_ni_t interface );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIFini
+\emph default 
+ function is used to release the resources allocated for a network interface.
+ Once the 
+\emph on 
+PtlNIFini
+\emph default 
+ operation has been started, the results of pending API operations (e.g.,
+ operations initiated by another thread) for this interface are undefined.
+ Similarly, the effects of incoming operations (puts and gets) or return
+ values (acknowledgements and replies) for this interface are undefined.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+A handle for the interface to shut down.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlNIStatus
+\begin_inset LatexCommand \label{sec:nistatus}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlNIStatus( ptl_handle_ni_t interface,
+\newline 
+                 ptl_sr_index_t  status_register,
+\newline 
+                 ptl_sr_value_t* status );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIStatus
+\emph default 
+ function returns the value of a status register for the specified interface.
+ (See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:stat-type}
+
+\end_inset 
+
+ for more information on status register indexes and status register values.)
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_INV_SR_INDX Indicates that 
+\family typewriter 
+status_register
+\family default 
+ is not a valid status register.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+status
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="3" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+status_register
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+An index for the status register to read.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+status
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the current value of the status
+ register.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+The only status register that must be defined is a drop count register (
+\family typewriter 
+PTL_SR_DROP_COUNT
+\family default 
+).
+ Implementations may define additional status registers.
+ Identifiers for the indexes associated with these registers should start
+ with the prefix 
+\family typewriter 
+PTL_SR_
+\family default 
+.
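+\layout Standard
+
+For illustration only, the fragment below reads the drop count register;
+ the helper name 
+\family typewriter 
+dropped_requests
+\family default 
+ is hypothetical, and the interface handle is assumed to have been obtained
+ from 
+\emph on 
+PtlNIInit
+\emph default 
+.
+\layout LyX-Code
+
+/* Illustrative only: return the dropped request count, or -1 on error. */
+\newline 
+long dropped_requests( ptl_handle_ni_t ni )
+\newline 
+{
+\newline 
+    ptl_sr_value_t dropped;
+\newline 
+
+\newline 
+    if (PtlNIStatus( ni, PTL_SR_DROP_COUNT, &dropped ) != PTL_OK)
+\newline 
+        return -1;
+\newline 
+    return (long) dropped;
+\newline 
+}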
+\layout Subsection
+
+PtlNIDist
+\layout LyX-Code
+
+int PtlNIDist( ptl_handle_ni_t  interface,
+\newline 
+               ptl_process_id_t process,
+\newline 
+               unsigned long*   distance );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIDist
+\emph default 
+ function returns the distance to another process using the specified interface.
+ Distances are only defined relative to an interface.
+ Distance comparisons between different interfaces on the same process may
+ be meaningless.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+process
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+distance
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="3" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+process
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+An identifier for the process whose distance is being  requested.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+distance
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the  distance to the remote
+ process.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+This function should return a static measure of distance.
+ Examples include minimum latency, the inverse of available bandwidth, or
+ the number of switches between the two endpoints.
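+\layout Standard
+
+The fragment below is an illustrative sketch; the helper name 
+\family typewriter 
+distance_to
+\family default 
+ is hypothetical, and the peer id is assumed to have been obtained elsewhere
+ (for example, with 
+\emph on 
+PtlGetId
+\emph default 
+ on the remote process).
+\layout LyX-Code
+
+/* Illustrative only: query the static distance to 'peer' over 'ni'. */
+\newline 
+int distance_to( ptl_handle_ni_t ni, ptl_process_id_t peer,
+\newline 
+                 unsigned long* distance )
+\newline 
+{
+\newline 
+    /* On PTL_OK, *distance holds an interface-relative measure; it is
+\newline 
+       not meaningful to compare it across different interfaces. */
+\newline 
+    return PtlNIDist( ni, peer, distance );
+\newline 
+}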
+\layout Subsection
+
+PtlNIHandle
+\layout LyX-Code
+
+int PtlNIHandle( ptl_handle_any_t handle,
+\newline 
+                 ptl_handle_ni_t* interface );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlNIHandle
+\emph default 
+ function returns a handle for the network interface with which the object
+ identified by 
+\family typewriter 
+handle
+\family default 
+ is associated.
+ If the object identified by 
+\family typewriter 
+handle
+\family default 
+ is a network interface, this function returns the same value it is passed.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_HANDLE Indicates that 
+\family typewriter 
+handle
+\family default 
+ is not a valid handle.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the object.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a handle for the network interface
+ associated with 
+\family typewriter 
+handle
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+Every handle should encode the network interface and the object id relative
+ to this interface.
+ Both are presumably encoded using integer values.
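+\layout Standard
+
+As an illustrative sketch (the helper name 
+\family typewriter 
+object_interface
+\family default 
+ is hypothetical), the fragment below recovers the interface with which
+ an arbitrary Portals object is associated.
+\layout LyX-Code
+
+/* Illustrative only: report the interface a Portals object belongs to. */
+\newline 
+int object_interface( ptl_handle_any_t object, ptl_handle_ni_t* ni )
+\newline 
+{
+\newline 
+    int rc = PtlNIHandle( object, ni );
+\newline 
+
+\newline 
+    /* rc is PTL_OK on success, PTL_INV_HANDLE for an invalid handle, or
+\newline 
+       PTL_SEGV if 'ni' is not a legal address.  If 'object' is itself a
+\newline 
+       network interface handle, the same value is returned in '*ni'. */
+\newline 
+    return rc;
+\newline 
+}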
+\layout Section
+
+User Identification
+\begin_inset LatexCommand \label{sec:uid}
+
+\end_inset 
+
+
+\layout Standard
+
+Every process runs on behalf of a user.
+ The 
+\emph on 
+PtlGetUid
+\emph default 
+ function can be used to obtain the user id under which the calling process
+ is running.
+\layout Subsection
+
+PtlGetUid
+\layout LyX-Code
+
+int PtlGetUid( ptl_handle_ni_t   ni_handle,
+\newline 
+               ptl_uid_t*        uid );
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+ni_handle
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+uid
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="5in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ni_handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A network interface handle.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+uid
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the user id for the calling
+ process.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+Note that user identifiers are dependent on the network interface(s).
+ In particular, if a node has multiple interfaces, a process may have multiple
+ user identifiers.
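+\layout Standard
+
+The fragment below is an illustrative sketch (the helper name 
+\family typewriter 
+fetch_uid
+\family default 
+ is hypothetical); it obtains the user id of the calling process for one
+ interface.
+\layout LyX-Code
+
+/* Illustrative only: fetch the caller's user id on interface 'ni'. */
+\newline 
+int fetch_uid( ptl_handle_ni_t ni, ptl_uid_t* uid )
+\newline 
+{
+\newline 
+    /* On PTL_OK, *uid holds the user id of the calling process; note
+\newline 
+       that the uid may differ from one interface to another. */
+\newline 
+    return PtlGetUid( ni, uid );
+\newline 
+}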
+\layout Section
+
+Process Identification
+\begin_inset LatexCommand \label{sec:pid}
+
+\end_inset 
+
+
+\layout Standard
+
+Processes that use the Portals API can be identified using a node id and
+ process id.
+ Every node accessible through a network interface has a unique node identifier
+ and every process running on a node has a unique process identifier.
+ As such, any process in the computing system can be identified by its node
+ id and process id.
+\layout Standard
+
+The Portals API defines a type, 
+\family typewriter 
+ptl_process_id_t
+\family default 
+ for representing process ids and a function, 
+\emph on 
+PtlGetId
+\emph default 
+, which can be used to obtain the id of the current process.
+\layout Comment
+
+The portals API does not include thread identifiers.
+  Messages are delivered to processes (address spaces) not threads (contexts
+ of  execution).
+\layout Subsection
+
+The Process Id Type
+\begin_inset LatexCommand \label{sec:pid-type}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef struct {
+\newline 
+    ptl_nid_t       nid; /* node id */
+\newline 
+    ptl_pid_t       pid; /* process id */
+\newline 
+} ptl_process_id_t;
+\layout Standard
+\noindent 
+The 
+\family typewriter 
+ptl_process_id_t
+\family default 
+ type uses two identifiers to represent a process id: a node id and a process
+ id.
+\layout Subsection
+
+PtlGetId
+\begin_inset LatexCommand \label{sub:PtlGetId}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlGetId( ptl_handle_ni_t   ni_handle,
+\newline 
+              ptl_process_id_t* id );
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+ni_handle
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+id
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="5in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ni_handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A network interface handle.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+id
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the id for the calling process.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Comment
+
+Note that process identifiers are dependent on the network interface(s).
+ In particular, if a node has multiple interfaces, it may have multiple
+ node identifiers.
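+\layout Standard
+
+As an illustrative sketch (the helper name 
+\family typewriter 
+whoami
+\family default 
+ is hypothetical), the fragment below retrieves the id of the calling process
+ on a given interface.
+\layout LyX-Code
+
+/* Illustrative only: fill in the node id and process id of the caller. */
+\newline 
+int whoami( ptl_handle_ni_t ni, ptl_nid_t* nid, ptl_pid_t* pid )
+\newline 
+{
+\newline 
+    ptl_process_id_t self;
+\newline 
+    int rc = PtlGetId( ni, &self );
+\newline 
+
+\newline 
+    if (rc == PTL_OK) {
+\newline 
+        *nid = self.nid;   /* node id relative to this interface    */
+\newline 
+        *pid = self.pid;   /* process id relative to this interface */
+\newline 
+    }
+\newline 
+    return rc;
+\newline 
+}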
+\layout Section
+
+Match List Entries and Match Lists
+\begin_inset LatexCommand \label{sec:me}
+
+\end_inset 
+
+
+\layout Standard
+
+A match list is a chain of match list entries.
+ Each match list entry includes a memory descriptor and a set of match criteria.
+ The match criteria can be used to reject incoming requests based on process
+ id or the match bits provided in the request.
+ A match list is created using the 
+\emph on 
+PtlMEAttach
+\emph default 
+ or 
+\shape italic 
+PtlMEAttachAny
+\shape default 
+ functions, which create a match list consisting of a single match list
+ entry, attach the match list to the specified Portal index, and return
+ a handle for the match list entry.
+ Match entries can be dynamically inserted and removed from a match list
+ using the 
+\emph on 
+PtlMEInsert
+\emph default 
+ and 
+\emph on 
+PtlMEUnlink
+\emph default 
+ functions.
+\layout Subsection
+
+PtlMEAttach
+\begin_inset LatexCommand \label{sec:meattach}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef enum { PTL_RETAIN, PTL_UNLINK } ptl_unlink_t;
+\newline 
+
+\layout LyX-Code
+
+typedef enum { PTL_INS_BEFORE, PTL_INS_AFTER } ptl_ins_pos_t;
+\newline 
+
+\layout LyX-Code
+
+int PtlMEAttach( ptl_handle_ni_t  interface,
+\newline 
+                 ptl_pt_index_t   index,
+\newline 
+                 ptl_process_id_t matchid,
+\newline 
+                 ptl_match_bits_t match_bits,
+\newline 
+                 ptl_match_bits_t ignorebits,
+\newline 
+                 ptl_unlink_t     unlink,
+\newline 
+                 ptl_ins_pos_t    position,
+\newline 
+                 ptl_handle_me_t* handle );
+\layout Standard
+\noindent 
+Values of the type 
+\family typewriter 
+ptl_ins_pos_t
+\family default 
+ are used to control where a new item is inserted.
+ The value 
+\family typewriter 
+PTL_INS_BEFORE
+\family default 
+ is used to insert the new item before the current item or before the head
+ of the list.
+ The value 
+\family typewriter 
+PTL_INS_AFTER
+\family default 
+ is used to insert the new item after the current item or after the last
+ item in the list.
+\layout Standard
+
+The 
+\emph on 
+PtlMEAttach
+\emph default 
+ function creates a match list consisting of a single entry and attaches
+ this list to the Portal table for 
+\family typewriter 
+interface
+\family default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_PTINDEX Indicates that 
+\family typewriter 
+index
+\family default 
+ is not a valid Portal table index.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+matchid
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ match list entry.
+\layout Description
+
+PTL_ML_TOOLONG Indicates that the resulting match list is too long.
+ The maximum length for a match list is defined by the interface.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="7" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0.8in">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.75in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+index
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The Portal table index where the match list  should be attached.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+matchid
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Specifies the match criteria for the process id of the requestor.
+  The constants 
+\family typewriter 
+PTL_PID_ANY
+\family default 
+ and 
+\family typewriter 
+PTL_NID_ANY
+\family default 
+ can be used to  wildcard either of the ids in the 
+\family typewriter 
+ptl_process_id_t
+\family default 
+ structure.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+match_bits, ignorebits
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Specify the match criteria to apply  to the match bits in the incoming request.
+  The 
+\family typewriter 
+ignorebits
+\family default 
+ are used to mask out insignificant bits in the incoming match bits.
+  The resulting bits are then compared to the match list entry's match 
+ bits to determine if the incoming request meets the match criteria.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+unlink
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Indicates whether the match list entry should be unlinked when the last
+ memory descriptor associated with this match list entry is unlinked.
+ (Note, the check for unlinking a match entry only occurs when a memory
+ descriptor is unlinked.) 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+position
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Indicates whether the new match entry should be prepended or appended to
+ the existing match list.
+ If there is no existing list, this argument is ignored and the new match
+ entry becomes the only entry in the list.
+ Allowed constants: 
+\family typewriter 
+PTL_INS_BEFORE
+\family default 
+, 
+\family typewriter 
+PTL_INS_AFTER
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a  handle for the newly created
+ match list entry.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
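+\layout Standard
+
+The following fragment is an illustrative sketch rather than part of the
+ specification: it attaches a single wildcard match entry to an arbitrarily
+ chosen Portal table index.
+ The nid and pid member names of ptl_process_id_t, the previously obtained
+ interface handle ni, and the index and match bit values are assumptions
+ made for this example.
+\layout LyX-Code
+
+ptl_process_id_t anyproc;
+\newline 
+ptl_handle_me_t  me;
+\newline 
+int              rc;
+\newline 
+anyproc.nid = PTL_NID_ANY;       /* accept requests from any node       */
+\newline 
+anyproc.pid = PTL_PID_ANY;       /* ... and from any process id         */
+\newline 
+rc = PtlMEAttach( ni,            /* previously obtained ni handle       */
+\newline 
+                  4,             /* assumed Portal table index          */
+\newline 
+                  anyproc,       /* match requests from any process     */
+\newline 
+                  0x17,          /* example match bits                  */
+\newline 
+                  0,             /* no bits ignored: exact match        */
+\newline 
+                  PTL_UNLINK,    /* unlink when the last MD is unlinked */
+\newline 
+                  PTL_INS_AFTER, /* append if a list already exists     */
+\newline 
+                  &me );
+\newline 
+if (rc != PTL_OK) { /* handle the error, e.g., PTL_INV_PTINDEX */ }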
+\layout Subsection
+
+PtlMEAttachAny
+\begin_inset LatexCommand \label{sec:attachany}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMEAttachAny( ptl_handle_ni_t  interface,
+\newline 
+                    ptl_pt_index_t   *index,
+\newline 
+                    ptl_process_id_t matchid,
+\newline 
+                    ptl_match_bits_t match_bits,
+\newline 
+                    ptl_match_bits_t ignorebits,
+\newline 
+                    ptl_unlink_t     unlink,
+\newline 
+                    ptl_handle_me_t* handle );
+\layout Standard
+
+The 
+\emph on 
+PtlMEAttachAny
+\emph default 
+ function creates a match list consisting of a single entry and attaches
+ this list to an unused Portal table entry for 
+\family typewriter 
+interface
+\family default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+matchid
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ match list entry.
+\layout Description
+
+PTL_PT_FULL Indicates that there are no free entries in the Portal table.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="4" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0.8in">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.75in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+index
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the Portal index where the
+ match list  has been attached.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+matchid, match_bits, ignorebits, unlink
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+See the discussion for 
+\shape italic 
+PtlMEAttach
+\shape default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a  handle for the newly created
+ match list entry.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
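+\layout Standard
+
+A brief sketch, reusing the assumed handles and values from the previous
+ example: the only difference from PtlMEAttach is that the Portal table
+ index is chosen by the implementation and returned through the index
+ argument.
+\layout LyX-Code
+
+ptl_pt_index_t   index;
+\newline 
+ptl_handle_me_t  me;
+\newline 
+int rc = PtlMEAttachAny( ni, &index, anyproc, 0x17, 0, PTL_UNLINK, &me );
+\newline 
+if (rc == PTL_OK) {
+\newline 
+    /* 'index' now names the Portal table entry that was used */
+\newline 
+}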
+\layout Subsection
+
+PtlMEInsert
+\begin_inset LatexCommand \label{sec:meinsert}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMEInsert( ptl_handle_me_t  current,
+\newline 
+                 ptl_process_id_t matchid,
+\newline 
+                 ptl_match_bits_t match_bits,
+\newline 
+                 ptl_match_bits_t ignorebits,
+\newline 
+                 ptl_ins_pos_t    position,
+\newline 
+                 ptl_handle_me_t* handle );
+\layout Standard
+
+The 
+\emph on 
+PtlMEInsert
+\emph default 
+ function creates a new match list entry and inserts this entry into the
+ match list containing 
+\family typewriter 
+current
+\family default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+matchid
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_INV_ME Indicates that 
+\family typewriter 
+current
+\family default 
+ is not a valid match entry handle.
+\layout Description
+
+PTL_ML_TOOLONG Indicates that the resulting match list is too long.
+ The maximum length for a match list is defined by the interface.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ match entry.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="4" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0.8in">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+current
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for a match entry.
+  The new match entry will be inserted immediately before or immediately
+ after this match entry.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+matchid
+\family default 
+, 
+\family typewriter 
+match_bits
+\family default 
+, 
+\family typewriter 
+ignorebits
+\family default 
+,  
+\family typewriter 
+unlink
+\family default 
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+See the discussion  for 
+\emph on 
+PtlMEAttach
+\emph default 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+position
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Indicates whether the new match entry should be inserted before or after
+ the 
+\family typewriter 
+current
+\family default 
+ entry.
+ Allowed constants: 
+\family typewriter 
+PTL_INS_BEFORE
+\family default 
+, 
+\family typewriter 
+PTL_INS_AFTER
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+See the discussion for 
+\emph on 
+PtlMEAttach
+\emph default 
+.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
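+\layout Standard
+
+As a short sketch (the variables me, anyproc, and rc are assumed from the
+ PtlMEAttach example above), a second, more specific entry is placed ahead
+ of the existing one so that it is checked first.
+\layout LyX-Code
+
+ptl_handle_me_t second;
+\newline 
+/* a second entry with different match bits, considered before 'me' */
+\newline 
+rc = PtlMEInsert( me, anyproc, 0x18, 0, PTL_INS_BEFORE, &second );
+\newline 
+if (rc == PTL_ML_TOOLONG) { /* interface-defined list length exceeded */ }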
+\layout Subsection
+
+PtlMEUnlink
+\begin_inset LatexCommand \label{sec:meunlink}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMEUnlink( ptl_handle_me_t entry );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlMEUnlink
+\emph default 
+ function can be used to unlink a match entry from a match list.
+ This operation also releases any resources associated with the match entry
+ (including the associated memory descriptor).
+ It is an error to use the match entry handle after calling 
+\emph on 
+PtlMEUnlink
+\emph default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_ME Indicates that 
+\family typewriter 
+entry
+\family default 
+ is not a valid match entry handle.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+entry
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+A handle for the match entry to be unlinked.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
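+\layout Standard
+
+A minimal sketch, continuing the examples above: the entry is unlinked
+ and its handle must not be used again; any memory descriptor attached to
+ the entry is released as well.
+\layout LyX-Code
+
+rc = PtlMEUnlink( me );
+\newline 
+if (rc == PTL_OK) {
+\newline 
+    /* 'me' (and any MD attached to it) is no longer valid */
+\newline 
+}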
+\layout Section
+
+Memory Descriptors
+\begin_inset LatexCommand \label{sec:md}
+
+\end_inset 
+
+
+\layout Standard
+
+A memory descriptor contains information about a region of an application
+ process' memory and an event queue where information about the operations
+ performed on the memory descriptor are recorded.
+ The Portals API provides two operations to create memory descriptors: 
+\emph on 
+PtlMDAttach
+\emph default 
+, and 
+\emph on 
+PtlMDBind
+\emph default 
+; an operation to update a memory descriptor, 
+\emph on 
+PtlMDUpdate
+\emph default 
+; and an operation to unlink and release the resources associated with a
+ memory descriptor, 
+\emph on 
+PtlMDUnlink
+\emph default 
+.
+\layout Subsection
+
+The Memory Descriptor Type
+\begin_inset LatexCommand \label{sec:md-type}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef struct {
+\newline 
+    void*           start;
+\newline 
+    ptl_size_t      length;
+\newline 
+    int             threshold;
+\newline 
+    unsigned int    max_offset;
+\newline 
+    unsigned int    options;
+\newline 
+    void*           user_ptr;
+\newline 
+    ptl_handle_eq_t eventq;
+\newline 
+} ptl_md_t;
+\layout Standard
+\noindent 
+The 
+\family typewriter 
+ptl_md_t
+\family default 
+ type defines the application view of a memory descriptor.
+ Values of this type are used to initialize and update the memory descriptors.
+\layout Subsubsection
+
+Members
+\layout Description
+
+start,\SpecialChar ~
+length Specify the memory region associated with the memory descriptor.
+ The 
+\family typewriter 
+start
+\family default 
+ member specifies the starting address for the memory region and the 
+\family typewriter 
+length
+\family default 
+ member specifies the length of the region.
+ The 
+\family typewriter 
+start member
+\family default 
+ can be NULL provided that the 
+\family typewriter 
+length
+\family default 
+ member is zero.
+ (Zero length buffers are useful to record events.) There are no alignment
+ restrictions on the starting address or the length of the region, although
+ unaligned messages may be slower (i.e., lower bandwidth and/or longer latency)
+ on some implementations.
+\layout Description
+
+threshold Specifies the maximum number of operations that can be performed
+ on the memory descriptor.
+ An operation is any action that could possibly generate an event (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+ for the different types of events).
+ In the usual case, the threshold value is decremented for each operation
+ on the memory descriptor.
+ When the threshold value is zero, the memory descriptor is 
+\emph on 
+inactive
+\emph default 
+, and does not respond to operations.
+ A memory descriptor can have an initial threshold value of zero to allow
+ for manipulation of an inactive memory descriptor by the local process.
+ A threshold value of 
+\family typewriter 
+PTL_MD_THRESH_INF
+\family default 
+ indicates that there is no bound on the number of operations that may be
+ applied to a memory descriptor.
+ Note that local operations (e.g., 
+\emph on 
+PtlMDUpdate
+\emph default 
+) are not applied to the threshold count.
+\layout Description
+
+max_offset Specifies the maximum local offset of a memory descriptor.
+ When the local offset of a memory descriptor exceeds this maximum, the
+ memory descriptor becomes 
+\shape italic 
+inactive
+\shape default 
+ and does not respond to further operations.
+\layout Description
+
+options Specifies the behavior of the memory descriptor.
+ There are five options that can be selected: enable put operations (yes
+ or no), enable get operations (yes or no), offset management (local or
+ remote), message truncation (yes or no), and acknowledgement (yes or no).
+ Values for this argument can be constructed using a bitwise or of the following
+ values: 
+\begin_deeper 
+\begin_deeper 
+\layout Description
+
+PTL_MD_OP_PUT Specifies that the memory descriptor will respond to 
+\emph on 
+put
+\emph default 
+ operations.
+ By default, memory descriptors reject 
+\emph on 
+put
+\emph default 
+ operations.
+\layout Description
+
+PTL_MD_OP_GET Specifies that the memory descriptor will respond to 
+\emph on 
+get
+\emph default 
+ operations.
+ By default, memory descriptors reject 
+\emph on 
+get
+\emph default 
+ operations.
+\layout Description
+
+PTL_MD_MANAGE_REMOTE Specifies that the offset used in accessing the memory
+ region is provided by the incoming request.
+ By default, the offset is maintained locally.
+ When the offset is maintained locally, the offset is incremented by the
+ length of the request so that the next operation (put and/or get) will
+ access the next part of the memory region.
+\layout Description
+
+PTL_MD_TRUNCATE Specifies that the length provided in the incoming request
+ can be reduced to match the memory available in the region.
+ (The memory available in a memory region is determined by subtracting the
+ offset from the length of the memory region.) By default, if the length
+ in the incoming operation is greater than the amount of memory available,
+ the operation is rejected.
+\layout Description
+
+PTL_MD_ACK_DISABLE Specifies that an acknowledgement should 
+\emph on 
+not
+\emph default 
+ be sent for incoming 
+\emph on 
+put
+\emph default 
+ operations, even if requested.
+ By default, acknowledgements are sent for 
+\emph on 
+put
+\emph default 
+ operations that request an acknowledgement.
+ Acknowledgements are never sent for 
+\emph on 
+get
+\emph default 
+ operations.
+ The value sent in the reply serves as an implicit acknowledgement.
+\end_deeper 
+\layout Standard
+
+
+\series bold 
+Note
+\series default 
+: It is not considered an error to have a memory descriptor that does not
+ respond to either 
+\emph on 
+put
+\emph default 
+ or 
+\emph on 
+get
+\emph default 
+ operations: Every memory descriptor responds to 
+\emph on 
+reply
+\emph default 
+ operations.
+ Nor is it considered an error to have a memory descriptor that responds
+ to both 
+\emph on 
+put
+\emph default 
+ and 
+\emph on 
+get
+\emph default 
+ operations.
+\end_deeper 
+\layout Description
+
+user_ptr A user-specified value that is associated with the memory descriptor.
+ The value does not need to be a pointer, but must fit in the space used
+ by a pointer.
+ This value (along with other values) is recorded in events associated with
+ operations on this memory descriptor.
+\begin_inset Foot
+collapsed true
+
+\layout Standard
+
+Tying the memory descriptor to a user-defined value can be useful when multiple
+ memory descriptors share the same event queue or when the memory descriptor
+ needs to be associated with a data structure maintained by the application.
+ For example, an MPI implementation can set the 
+\family typewriter 
+user_ptr
+\family default 
+ argument to the value of an MPI Request.
+ This direct association allows for processing of memory descriptors by
+ the MPI implementation without a table lookup or a search for the appropriate
+ MPI Request.
+\end_inset 
+
+
+\layout Description
+
+eventq A handle for the event queue used to log the operations performed
+ on the memory region.
+ If this argument is 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+, operations performed on this memory descriptor are not logged.
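+\layout Standard
+
+The following sketch is illustrative rather than normative: it fills in
+ a ptl_md_t describing a receive buffer that accepts a single put operation
+ and truncates requests that are too long.
+ The buffer, the event queue handle eq (allocated elsewhere with PtlEQAlloc),
+ and the choice of max_offset are assumptions made for the example.
+\layout LyX-Code
+
+char     buffer[8192];           /* assumed application buffer          */
+\newline 
+ptl_md_t md;
+\newline 
+md.start      = buffer;
+\newline 
+md.length     = sizeof(buffer);
+\newline 
+md.threshold  = 1;               /* inactive after one operation        */
+\newline 
+md.max_offset = sizeof(buffer);  /* assumed: permit the whole region    */
+\newline 
+md.options    = PTL_MD_OP_PUT | PTL_MD_TRUNCATE;
+\newline 
+md.user_ptr   = NULL;            /* no per-descriptor user state        */
+\newline 
+md.eventq     = eq;              /* assumed event queue handle          */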
+\layout Subsection
+
+PtlMDAttach
+\begin_inset LatexCommand \label{sec:mdattach}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMDAttach( ptl_handle_me_t  match,
+\newline 
+                 ptl_md_t         mem_desc,
+\newline 
+                 ptl_unlink_t     unlink_op,
+\newline 
+                 ptl_unlink_t     unlink_nofit,
+\newline 
+                 ptl_handle_md_t* handle );
+\layout Standard
+\noindent 
+Values of the type 
+\family typewriter 
+ptl_unlink_t
+\family default 
+ are used to control whether an item is unlinked from a list.
+ The value 
+\family typewriter 
+PTL_UNLINK
+\family default 
+ enables unlinking.
+ The value 
+\family typewriter 
+PTL_RETAIN
+\family default 
+ disables unlinking.
+\layout Standard
+
+The 
+\emph on 
+PtlMDAttach
+\emph default 
+ operation is used to create a memory descriptor and attach it to a match
+ list entry.
+ An error code is returned if this match list entry already has an associated
+ memory descriptor.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INUSE Indicates that 
+\family typewriter 
+match
+\family default 
+ already has a memory descriptor attached.
+\layout Description
+
+PTL_INV_ME Indicates that 
+\family typewriter 
+match
+\family default 
+ is not a valid match entry handle.
+\layout Description
+
+PTL_ILL_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a legal memory descriptor.
+ This may happen because the memory region defined in 
+\family typewriter 
+mem_desc
+\family default 
+ is invalid or because the network interface associated with the 
+\family typewriter 
+eventq
+\family default 
+ in 
+\family typewriter 
+mem_desc
+\family default 
+ is not the same as the network interface associated with 
+\family typewriter 
+match
+\family default 
+.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ memory descriptor.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="5" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the match entry that the memory descriptor will be associated
+ with.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Provides initial values for the application visible parts of a memory descriptor.
+  Other than its use for initialization, there is no linkage between this
+ structure and the memory descriptor maintained by the API.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+unlink_op
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A flag to indicate whether the memory descriptor is  unlinked when it becomes
+ inactive, either because the operation threshold drops to zero or because
+ the maximum offset has been exceeded.
+  (Note, the check for unlinking a memory descriptor only occurs after
+ the completion of a successful operation.
+  If the threshold is set to zero during initialization or  using 
+\emph on 
+PtlMDUpdate
+\emph default 
+, the memory descriptor is 
+\series bold 
+not
+\series default 
+  unlinked.) 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+unlink_nofit
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A flag to indicate whether the memory descriptor is unlinked when the space
+ remaining in the memory descriptor is not sufficient for a matching operation.
+ If an incoming message arrives at a memory descriptor that does
+ not have sufficient space and the 
+\series bold 
+PTL_MD_TRUNCATE
+\series default 
+ operation is not specified, the memory descriptor will be unlinked.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a handle for the newly created
+ memory descriptor.
+  The 
+\family typewriter 
+handle
+\family default 
+ argument can be NULL, in which case the handle will not be returned.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
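+\layout Standard
+
+Continuing the sketches above (the match entry me and the descriptor md
+ are assumed from earlier examples), the descriptor is attached so that
+ it is unlinked once it becomes inactive or can no longer fit an incoming
+ request.
+\layout LyX-Code
+
+ptl_handle_md_t mdh;
+\newline 
+rc = PtlMDAttach( me,            /* match entry created earlier         */
+\newline 
+                  md,            /* initial descriptor values           */
+\newline 
+                  PTL_UNLINK,    /* unlink_op: unlink when inactive     */
+\newline 
+                  PTL_UNLINK,    /* unlink_nofit: unlink if no room     */
+\newline 
+                  &mdh );
+\newline 
+if (rc == PTL_INUSE) { /* 'me' already had a memory descriptor */ }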
+\layout Subsection
+
+PtlMDBind
+\begin_inset LatexCommand \label{sec:mdbind}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMDBind( ptl_handle_ni_t  interface,
+\newline 
+               ptl_md_t         mem_desc,
+\newline 
+               ptl_handle_md_t* handle );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlMDBind
+\emph default 
+ operation is used to create a 
+\begin_inset Quotes eld
+\end_inset 
+
+free floating
+\begin_inset Quotes erd
+\end_inset 
+
+ memory descriptor, i.e., a memory descriptor that is not associated with
+ a match list entry.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_ILL_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a legal memory descriptor.
+ This may happen because the memory region defined in 
+\family typewriter 
+mem_desc
+\family default 
+ is invalid or because the network interface associated with the 
+\family typewriter 
+eventq
+\family default 
+ in 
+\family typewriter 
+mem_desc
+\family default 
+ is not the same as the network interface, 
+\family typewriter 
+interface
+\family default 
+.
+\layout Description
+
+PTL_INV_EQ Indicates that the event queue associated with 
+\family typewriter 
+mem_desc
+\family default 
+ is not valid.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ memory descriptor.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+handle
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="3" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the network interface with which the memory descriptor will
+ be associated.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Provides initial values for the application visible parts of a memory descriptor.
+  Other than its use for initialization, there is no linkage between this
+ structure and the  memory descriptor maintained by the API.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a  handle for the newly created
+ memory descriptor.
+  The 
+\family typewriter 
+handle
+\family default 
+ argument must be a valid address and cannot be NULL.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
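+\layout Standard
+
+A sketch of creating a free floating descriptor for an outgoing buffer;
+ the buffer outbuf, its length outlen, the handles ni and eq, and the
+ threshold of two (assumed to cover a send event and an acknowledgement)
+ are example choices rather than requirements of the API.
+\layout LyX-Code
+
+ptl_md_t        src;
+\newline 
+ptl_handle_md_t srch;
+\newline 
+src.start      = outbuf;         /* assumed outgoing buffer             */
+\newline 
+src.length     = outlen;         /* assumed buffer length               */
+\newline 
+src.threshold  = 2;              /* e.g., one send plus one ack event   */
+\newline 
+src.max_offset = outlen;
+\newline 
+src.options    = 0;              /* no incoming put/get accepted        */
+\newline 
+src.user_ptr   = NULL;
+\newline 
+src.eventq     = eq;
+\newline 
+rc = PtlMDBind( ni, src, &srch ); /* srch identifies the floating MD    */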
+\layout Subsection
+
+PtlMDUnlink
+\begin_inset LatexCommand \label{sec:mdfree}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMDUnlink( ptl_handle_md_t mem_desc );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlMDUnlink
+\emph default 
+ function unlinks the memory descriptor from any match list entry it may
+ be linked to and releases the resources associated with a memory descriptor.
+ (This function does not free the memory region associated with the memory
+ descriptor.) This function also releases the resources associated with a
+ floating memory descriptor.
+ Only memory descriptors with no pending operations may be unlinked.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a valid memory descriptor handle.
+\layout Description
+
+PTL_MD_INUSE Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ has pending operations and cannot be unlinked.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the memory descriptor to be released.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
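+\layout Standard
+
+A brief sketch of releasing the descriptor created above; because
+ descriptors with pending operations cannot be unlinked, the PTL_MD_INUSE
+ case must be handled (here only noted in a comment).
+\layout LyX-Code
+
+rc = PtlMDUnlink( mdh );
+\newline 
+if (rc == PTL_MD_INUSE) {
+\newline 
+    /* operations still pending; drain the event queue and retry */
+\newline 
+}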
+\layout Subsection
+
+PtlMDUpdate
+\begin_inset LatexCommand \label{sec:mdupdate}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlMDUpdate( ptl_handle_md_t mem_desc,
+\newline 
+                 ptl_md_t*       old_md,
+\newline 
+                 ptl_md_t*       new_md,
+\newline 
+                 ptl_handle_eq_t testq );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlMDUpdate
+\emph default 
+ function provides a conditional, atomic update operation for memory descriptors.
+ The memory descriptor identified by 
+\family typewriter 
+mem_desc
+\family default 
+ is only updated if the event queue identified by 
+\family typewriter 
+testq
+\family default 
+ is empty.
+ The intent is to only enable updates to the memory descriptor when no new
+ messages have arrived since the last time the queue was checked.
+ See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:exmpi}
+
+\end_inset 
+
+ for an example of how this function can be used.
+\layout Standard
+
+If 
+\family typewriter 
+new_md
+\family default 
+ is not NULL, the memory descriptor identified by mem_desc will be updated
+ to reflect the values in the structure pointed to by 
+\family typewriter 
+new_md
+\family default 
+ if 
+\family typewriter 
+testq
+\family default 
+ has the value 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+ or if the event queue identified by 
+\family typewriter 
+testq
+\family default 
+ is empty.
+ If 
+\family typewriter 
+old_md
+\family default 
+ is not NULL, the current value of the memory descriptor identified by 
+\family typewriter 
+mem_desc
+\family default 
+ is recorded in the location identified by 
+\family typewriter 
+old_md
+\family default 
+.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_NOUPDATE Indicates that the update was not performed because 
+\family typewriter 
+testq
+\family default 
+ was not empty.
+\layout Description
+
+PTL_INV_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a valid memory descriptor handle.
+\layout Description
+
+PTL_ILL_MD Indicates that the value pointed to by 
+\family typewriter 
+new_md
+\family default 
+ is not a legal memory descriptor (e.g., the memory region specified by the
+ memory descriptor may be invalid).
+\layout Description
+
+PTL_INV_EQ Indicates that 
+\family typewriter 
+testq
+\family default 
+ is not a valid event queue handle.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+new_md
+\family default 
+ or 
+\family typewriter 
+old_md
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="4" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the memory descriptor to update.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+old_md
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+If 
+\family typewriter 
+old_md
+\family default 
+ is not the value 
+\family typewriter 
+NULL
+\family default 
+, the current value of the memory descriptor will be stored in the location
+ identified by 
+\family typewriter 
+old_md
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+new_md
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+If 
+\family typewriter 
+new_md
+\family default 
+ is not the value 
+\family typewriter 
+NULL
+\family default 
+, this argument provides the new values for the memory descriptor, if the
+ update is performed.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+testq
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for an event queue used to predicate the update.
+ If 
+\family typewriter 
+testq
+\family default 
+ is equal to 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+, the update is performed unconditionally.
+  Otherwise, the update is performed if and only if 
+\family typewriter 
+testq
+\family default 
+ is empty.
+  If the update is  not performed, the function returns the value 
+\family typewriter 
+PTL_NOUPDATE
+\family default 
+.
+  (Note, the 
+\family typewriter 
+testq
+\family default 
+ argument does not need to be the same as  the event queue associated with
+ the memory descriptor.)
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Standard
+
+The conditional update can be used to ensure that the memory descriptor
+ has not changed between the time it was examined and the time it is updated.
+ In particular, it is needed to support an MPI implementation where the
+ activity of searching an unexpected message queue and posting a receive
+ must be atomic.
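+\layout Standard
+
+The sketch below illustrates this conditional pattern using the descriptor
+ mdh and event queue eq assumed in the earlier examples; the particular
+ field changed (the threshold) is arbitrary.
+ The current state is read unconditionally, a modified copy is prepared,
+ and the update is applied only if no event has arrived on eq in the
+ meantime.
+\layout LyX-Code
+
+ptl_md_t current, wanted;
+\newline 
+rc = PtlMDUpdate( mdh, &current, NULL, PTL_EQ_NONE ); /* read only       */
+\newline 
+wanted = current;
+\newline 
+wanted.threshold = 1;            /* e.g., re-arm the descriptor         */
+\newline 
+rc = PtlMDUpdate( mdh, NULL, &wanted, eq );
+\newline 
+if (rc == PTL_NOUPDATE) {
+\newline 
+    /* an event arrived on eq first; process it and reconsider */
+\newline 
+}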
+\layout Section
+
+Events and Event Queues
+\begin_inset LatexCommand \label{sec:eq}
+
+\end_inset 
+
+
+\layout Standard
+
+Event queues are used to log operations performed on memory descriptors.
+ They can also be used to hold acknowledgements for completed 
+\emph on 
+put
+\emph default 
+ operations and to note when the data specified in a 
+\emph on 
+put
+\emph default 
+ operation has been sent (i.e., when it is safe to reuse the buffer that holds
+ this data).
+ Multiple memory descriptors can share a single event queue.
+\layout Standard
+
+In addition to the 
+\family typewriter 
+ptl_handle_eq_t
+\family default 
+ type, the Portals API defines two types associated with events: The 
+\family typewriter 
+
+\newline 
+ptl_event_kind_t
+\family default 
+ type defines the kinds of events that can be stored in an event queue.
+ The 
+\family typewriter 
+ptl_event_t
+\family default 
+ type defines a structure that holds the information associated with an
+ event.
+\layout Standard
+
+The Portals API also provides four functions for dealing with event queues:
+ The 
+\emph on 
+PtlEQAlloc
+\emph default 
+ function is used to allocate the API resources needed for an event queue,
+ the 
+\emph on 
+PtlEQFree
+\emph default 
+ function is used to release these resources, the 
+\emph on 
+PtlEQGet
+\emph default 
+ function can be used to get the next event from an event queue, and the
+\emph on 
+PtlEQWait
+\emph default 
+ function can be used to block a process (or thread) until an event queue
+ has at least one event.
+\layout Subsection
+
+Kinds of Events
+\begin_inset LatexCommand \label{sec:ek-type}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef enum { 
+\newline 
+    PTL_EVENT_GET_START, PTL_EVENT_GET_END, PTL_EVENT_GET_FAIL,
+\newline 
+    PTL_EVENT_PUT_START, PTL_EVENT_PUT_END, PTL_EVENT_PUT_FAIL,
+\newline 
+    PTL_EVENT_REPLY_START, PTL_EVENT_REPLY_END, PTL_EVENT_REPLY_FAIL,
+\newline 
+    PTL_EVENT_SEND_START, PTL_EVENT_SEND_END, PTL_EVENT_SEND_FAIL,
+\newline 
+    PTL_EVENT_ACK,
+\newline 
+    PTL_EVENT_UNLINK
+\newline 
+} ptl_event_kind_t;
+\layout Standard
+\noindent 
+The Portals API defines fourteen types of events that can be logged in an
+ event queue: 
+\layout Description
+
+PTL_EVENT_GET_START A remote 
+\emph on 
+get
+\emph default 
+ operation has been started on the memory descriptor.
+ The memory region associated with this descriptor should not be altered
+ until the corresponding END or FAIL event is logged.
+\layout Description
+
+PTL_EVENT_GET_END A previously initiated 
+\emph on 
+get
+\emph default 
+ operation completed successfully.
+ This event is logged after the reply has been sent by the local node.
+ As such, the process could free the memory descriptor once it sees this
+ event.
+\layout Description
+
+PTL_EVENT_GET_FAIL A previously initiated 
+\emph on 
+get
+\emph default 
+ operation completed unsuccessfully.
+ This event is logged after the reply has been sent by the local node.
+ As such, the process could free the memory descriptor once it sees this
+ event.
+\layout Description
+
+PTL_EVENT_PUT_START A remote 
+\emph on 
+put
+\emph default 
+ operation has been started on the memory descriptor.
+ The memory region associated with this descriptor should be considered
+ volatile until the corresponding END or FAIL event is logged.
+\layout Description
+
+PTL_EVENT_PUT_END A previously initiated 
+\emph on 
+put
+\emph default 
+ operation completed successfully.
+ The underlying layers will not alter the memory (on behalf of this operation)
+ once this event has been logged.
+\layout Description
+
+PTL_EVENT_PUT_FAIL A previously initiated 
+\emph on 
+put
+\emph default 
+ operation completed unsuccessfully.
+ The underlying layers will not alter the memory (on behalf of this operation)
+ once this event has been logged.
+\layout Description
+
+PTL_EVENT_REPLY_START A 
+\emph on 
+reply
+\emph default 
+ operation has been started on the memory descriptor.
+\layout Description
+
+PTL_EVENT_REPLY_END A previously initiated 
+\emph on 
+reply
+\emph default 
+ operation has completed successfully.
+ This event is logged after the data (if any) from the reply has been written
+ into the memory descriptor.
+\layout Description
+
+PTL_EVENT_REPLY_FAIL A previously initiated 
+\emph on 
+reply
+\emph default 
+ operation has completed unsuccessfully.
+ This event is logged after the data (if any) from the reply has been written
+ into the memory descriptor.
+\layout Description
+
+PTL_EVENT_ACK An 
+\emph on 
+acknowledgement
+\emph default 
+ was received.
+ This event is logged when the acknowledgement is received.
+\layout Description
+
+PTL_EVENT_SEND_START An outgoing 
+\emph on 
+send
+\emph default 
+ operation has been started.
+ The memory region associated with this descriptor should not be altered
+ until the corresponding END or FAIL event is logged.
+\layout Description
+
+PTL_EVENT_SEND_END A previously initiated 
+\emph on 
+send
+\emph default 
+ operation has completed successfully.
+ This event is logged after the entire buffer has been sent and it is safe
+ for the application to reuse the buffer.
+\layout Description
+
+PTL_EVENT_SEND_FAIL A previously initiated 
+\emph on 
+send
+\emph default 
+ operation has completed unsuccessfully.
+ The process can safely manipulate the memory or free the memory descriptor
+ once it sees this event.
+\layout Description
+
+PTL_EVENT_UNLINK A memory descriptor associated with this event queue has
+ been automatically unlinked.
+ This event is not generated when a memory descriptor is explicitly unlinked
+ by calling 
+\shape italic 
+PtlMDUnlink
+\shape default 
+.
+ This event does not decrement the threshold count.
+\layout Subsection
+
+Event Ordering
+\layout Standard
+
+The Portals API guarantees that when a process initiates two operations
+ on a remote process, the operations will be initiated on the remote process
+ in the same order that they were initiated on the original process.
+ As an example, if process A initiates two 
+\emph on 
+put
+\emph default 
+ operations, 
+\emph on 
+x
+\emph default 
+ and 
+\emph on 
+y
+\emph default 
+, on process B, the Portals API guarantees that process A will receive the
+\family typewriter 
+PTL_EVENT_SEND_START
+\family default 
+ events for 
+\emph on 
+x
+\emph default 
+ and 
+\emph on 
+y
+\emph default 
+ in the same order that process B receives the 
+\family typewriter 
+PTL_EVENT_PUT_START
+\family default 
+ events for 
+\emph on 
+x
+\emph default 
+ and 
+\emph on 
+y
+\emph default 
+.
+ Notice that the API does not guarantee that the start events will be delivered
+ in the same order that process A initiated the 
+\emph on 
+x
+\emph default 
+ and 
+\emph on 
+y
+\emph default 
+ operations.
+ If process A needs to ensure the ordering of these operations, it should
+ include code to wait for the initiation of 
+\emph on 
+x
+\emph default 
+ before it initiates 
+\emph on 
+y
+\emph default 
+.
+\layout Subsection
+
+Failure Notification
+\layout Standard
+
+Operations may fail to complete successfully; however, unless the node itself
+ fails, every operation that is started will eventually complete.
+ While an operation is in progress, the memory associated with the operation
+ should not be viewed (in the case of a put or a reply) or altered (in the
+ case of a send or get).
+ Operation completion, whether successful or unsuccessful, is final.
+ That is, when an operation completes, the memory associated with the operation
+ will no longer be read or altered by the operation.
+ A network interface can use the 
+\family typewriter 
+ptl_ni_fail_t
+\family default 
+ type to provide more specific information regarding the failure of the operation
+ and record this information in the 
+\family typewriter 
+ni_fail_type
+\family default 
+ field of the event.
+\layout Subsection
+
+The Event Type
+\begin_inset LatexCommand \label{sec:event-type}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef struct {
+\newline 
+    ptl_event_kind_t      type;
+\newline 
+    ptl_process_id_t      initiator;
+\newline 
+    ptl_uid_t             uid;
+\layout LyX-Code
+
+    ptl_pt_index_t        portal;
+\newline 
+    ptl_match_bits_t      match_bits;
+\newline 
+    ptl_size_t            rlength;
+\newline 
+    ptl_size_t            mlength;
+\newline 
+    ptl_size_t            offset; 
+\newline 
+    ptl_handle_md_t       md_handle;
+\newline 
+    ptl_md_t              mem_desc;
+\newline 
+    ptl_hdr_data_t        hdr_data;
+\newline 
+    ptl_seq_t             link;
+\newline 
+    ptl_ni_fail_t         ni_fail_type;
+\newline 
+    volatile ptl_seq_t    sequence;
+\newline 
+} ptl_event_t;
+\layout Standard
+\noindent 
+An event structure includes the following members: 
+\layout Description
+
+type Indicates the type of the event.
+\layout Description
+
+initiator The id of the initiator.
+\layout Description
+
+portal The Portal table index specified in the request.
+\layout Description
+
+match_bits A copy of the match bits specified in the request.
+ See section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+ for more information on match bits.
+\layout Description
+
+rlength The length (in bytes) specified in the request.
+\layout Description
+
+mlength The length (in bytes) of the data that was manipulated by the operation.
+ For truncated operations, the manipulated length will be the number of
+ bytes specified by the memory descriptor (possibly with an offset).
+ For all other operations, the manipulated length will be the length of
+ the requested operation.
+\layout Description
+
+offset Is the displacement (in bytes) into the memory region that the operation
+ used.
+ The offset can be determined by the operation (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:datamovement}
+
+\end_inset 
+
+) for a remote managed memory descriptor, or by the local memory descriptor
+ (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+).
+\layout Description
+
+md_handle Is the handle to the memory descriptor associated with the event.
+\layout Description
+
+mem_desc Is the state of the memory descriptor immediately after the event
+ has been processed.
+\layout Description
+
+hdr_data 64 bits of out-of-band user data (see Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+).
+\layout Description
+
+link The 
+\emph on 
+link
+\emph default 
+ member is used to link 
+\family typewriter 
+START
+\family default 
+ events with the 
+\family typewriter 
+END
+\family default 
+ or 
+\family typewriter 
+FAIL
+\family default 
+ event that signifies completion of the operation.
+ The 
+\emph on 
+link
+\emph default 
+ member will be the same for the two events associated with an operation.
+ The link member is also used to link an 
+\family typewriter 
+UNLINK
+\family default 
+ event with the event that caused the memory descriptor to be unlinked.
+\layout Description
+
+sequence The sequence number for this event.
+ Sequence numbers are unique to each event.
+\layout Comment
+
+The 
+\emph on 
+sequence
+\emph default 
+ member is the last member and is volatile to support SMP implementations.
+ When an event structure is filled in, the 
+\emph on 
+sequence
+\emph default 
+ member should be written after all other members have been updated.
+ Moreover, a memory barrier should be inserted between the updating of other
+ members and the updating of the 
+\emph on 
+sequence
+\emph default 
+ member.
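+\layout Comment
+
+The following fragment is an illustrative sketch only, not part of the API:
+ it shows how an application might react to an event obtained from PtlEQGet
+ or PtlEQWait by examining its type member.
+ The name ev (a pointer to a ptl_event_t) and the surrounding code are
+ hypothetical.
+\layout LyX-Code
+
+/* sketch: ev points to an event returned by PtlEQGet or PtlEQWait */
+\newline 
+switch (ev->type) {
+\newline 
+case PTL_EVENT_SEND_END:  /* local send completed for ev->md_handle      */
+\newline 
+    break;
+\newline 
+case PTL_EVENT_ACK:       /* target accepted ev->mlength bytes at offset */
+\newline 
+    break;
+\newline 
+default:
+\newline 
+    break;
+\newline 
+}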
+\layout Subsection
+
+PtlEQAlloc
+\begin_inset LatexCommand \label{sec:eqalloc}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlEQAlloc( ptl_handle_ni_t  interface,
+\newline 
+                ptl_size_t       count,
+\newline 
+                ptl_handle_eq_t* handle );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlEQAlloc
+\emph default 
+ function is used to allocate an event queue that can hold the requested
+ number of events.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_NOSPACE Indicates that there is insufficient memory to allocate the
+ event queue.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+handle
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="3" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the interface with which the event queue  will be associated.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+count
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The number of events that can be stored in the event queue.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+handle
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold a handle for the newly created
+ event queue.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
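+\layout Comment
+
+As a hedged usage sketch (not normative), assuming interface holds a valid
+ network interface handle (e.g.
+ one obtained from PtlNIInit), an event queue with room for 64 events could
+ be allocated as follows and later released with PtlEQFree.
+\layout LyX-Code
+
+ptl_handle_eq_t eq;
+\newline 
+int rc = PtlEQAlloc( interface, 64, &eq );
+\newline 
+if (rc != PTL_OK) {
+\newline 
+    /* e.g. PTL_NOSPACE: not enough memory for a 64-entry queue */
+\newline 
+}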
+\layout Subsection
+
+PtlEQFree
+\begin_inset LatexCommand \label{sec:eqfree}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlEQFree( ptl_handle_eq_t eventq );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlEQFree
+\emph default 
+ function releases the resources associated with an event queue.
+ It is up to the user to ensure that no memory descriptors are associated
+ with the event queue once it is freed.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_EQ Indicates that 
+\family typewriter 
+eventq
+\family default 
+ is not a valid event queue handle.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="1" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+eventq
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+A handle for the event queue to be released.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\layout Subsection
+
+PtlEQGet
+\begin_inset LatexCommand \label{sec:eqget}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlEQGet( ptl_handle_eq_t eventq,
+\newline 
+              ptl_event_t*    event );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlEQGet
+\emph default 
+ function is a nonblocking function that can be used to get the next event
+ in an event queue.
+ The event is removed from the queue.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at
+ least one event between this event and the last event obtained (using 
+\emph on 
+PtlEQGet
+\emph default 
+ or 
+\emph on 
+PtlEQWait
+\emph default 
+) from this event queue has been dropped due to limited space in the event
+ queue.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_EQ_EMPTY Indicates that 
+\family typewriter 
+eventq
+\family default 
+ is empty or another thread is waiting on 
+\emph on 
+PtlEQWait
+\emph default 
+.
+\layout Description
+
+PTL_INV_EQ Indicates that 
+\family typewriter 
+eventq
+\family default 
+ is not a valid event queue handle.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+event
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.5in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+eventq
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the event queue.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+event
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the  values associated with
+ the next event in the event queue.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
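+\layout Comment
+
+A minimal, non-normative polling sketch: drain the queue until PTL_EQ_EMPTY
+ is returned, treating PTL_EQ_DROPPED as success (an event is still returned)
+ while noting that earlier events were lost.
+ The handler process_event is a hypothetical application routine, and eq
+ is assumed to be a handle from PtlEQAlloc.
+\layout LyX-Code
+
+ptl_event_t ev;
+\newline 
+int         rc;
+\newline 
+while ((rc = PtlEQGet( eq, &ev )) != PTL_EQ_EMPTY) {
+\newline 
+    if (rc != PTL_OK && rc != PTL_EQ_DROPPED)
+\newline 
+        break;            /* PTL_INV_EQ, PTL_SEGV, or PTL_NOINIT */
+\newline 
+    /* on PTL_EQ_DROPPED, ev is still valid but some events were lost */
+\newline 
+    process_event( &ev );
+\newline 
+}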
+\layout Subsection
+
+PtlEQWait
+\begin_inset LatexCommand \label{sec:eqwait}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlEQWait( ptl_handle_eq_t eventq,
+\newline 
+               ptl_event_t*    event );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlEQWait
+\emph default 
+ function can be used to block the calling process (thread) until there
+ is an event in an event queue.
+ This function also returns the next event in the event queue and removes
+ this event from the queue.
+ This is the only blocking operation in the Portals 3.2 API.
+ In the event that multiple threads are waiting on the same event queue,
+ PtlEQWait is guaranteed to wake exactly one thread, but the order in which
+ they are awakened is not specified.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at
+ least one event between this event and the last event obtained (using 
+\emph on 
+PtlEQGet
+\emph default 
+ or 
+\emph on 
+PtlEQWait
+\emph default 
+) from this event queue has been dropped due to limited space in the event
+ queue.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_EQ Indicates that 
+\family typewriter 
+eventq
+\family default 
+ is not a valid event queue handle.
+\layout Description
+
+PTL_SEGV Indicates that 
+\family typewriter 
+event
+\family default 
+ is not a legal address.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+\noindent 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="2" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+eventq
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the event queue to wait on.
+  The calling process (thread) will be blocked until 
+\family typewriter 
+eventq
+\family default 
+ is not empty.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+event
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+output
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+On successful return, this location will hold the values associated with
+ the next event in the event queue.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
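+\layout Comment
+
+Under the same assumptions as the PtlEQGet sketch above, a blocking consumer
+ can simply loop on PtlEQWait; this is illustrative only.
+\layout LyX-Code
+
+ptl_event_t ev;
+\newline 
+for (;;) {
+\newline 
+    int rc = PtlEQWait( eq, &ev );   /* blocks until eq is non-empty */
+\newline 
+    if (rc != PTL_OK && rc != PTL_EQ_DROPPED)
+\newline 
+        break;
+\newline 
+    process_event( &ev );            /* hypothetical handler */
+\newline 
+}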
+\layout Section
+
+The Access Control Table
+\begin_inset LatexCommand \label{sec:ac}
+
+\end_inset 
+
+
+\layout Standard
+
+Processes can use the access control table to control which processes are
+ allowed to perform operations on Portal table entries.
+ Each communication interface has a Portal table and an access control table.
+ The access control table for the default interface contains an entry at
+ index zero that allows all processes with the same user id to communicate.
+ Entries in the access control table can be manipulated using the 
+\emph on 
+PtlACEntry
+\emph default 
+ function.
+\layout Subsection
+
+PtlACEntry
+\begin_inset LatexCommand \label{sec:acentry}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlACEntry( ptl_handle_ni_t  interface,
+\newline 
+                ptl_ac_index_t   index,
+\newline 
+                ptl_process_id_t matchid,
+\newline 
+                ptl_uid_t        user_id,
+\newline 
+                ptl_pt_index_t   portal );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlACEntry
+\emph default 
+ function can be used to update an entry in the access control table for
+ an interface.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_NI Indicates that 
+\family typewriter 
+interface
+\family default 
+ is not a valid network interface handle.
+\layout Description
+
+PTL_AC_INV_INDEX Indicates that 
+\family typewriter 
+index
+\family default 
+ is not a valid access control table index.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+matchid
+\family default 
+ is not a valid process identifier.
+\layout Description
+
+PTL_PT_INV_INDEX Indicates that 
+\family typewriter 
+portal
+\family default 
+ is not a valid Portal table index.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="5" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+interface
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the interface to use.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+index
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index of the entry in the access control table to update.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+matchid
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the process(es) that are allowed to  perform operations.
+ The constants 
+\family typewriter 
+PTL_PID_ANY
+\family default 
+ and 
+\family typewriter 
+PTL_NID_ANY
+\family default 
+ can be used to wildcard either of the ids in the 
+\family typewriter 
+ptl_process_id_t
+\family default 
+ structure.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+user_id
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the user that is allowed to  perform operations.
+ The value 
+\family typewriter 
+PTL_UID_ANY
+\family default 
+ can be used to wildcard the user.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Identifies the Portal index(es) that can be used.
+  The value 
+\family typewriter 
+PTL_PT_INDEX_ANY
+\family default 
+ can be used to wildcard the  Portal index.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
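+\layout Comment
+
+The following non-normative sketch installs an entry at index 1 that allows
+ any process with the user id uid (obtained elsewhere, e.g.
+ via PtlGetUid) to use any Portal table index.
+ The nid and pid member names follow the ptl_process_id_t definition referenced
+ above; the index values are arbitrary examples.
+\layout LyX-Code
+
+ptl_process_id_t any;
+\newline 
+int              rc;
+\newline 
+any.nid = PTL_NID_ANY;                /* wildcard the node id    */
+\newline 
+any.pid = PTL_PID_ANY;                /* wildcard the process id */
+\newline 
+rc = PtlACEntry( interface, 1, any, uid, PTL_PT_INDEX_ANY );
+\newline 
+if (rc != PTL_OK) {
+\newline 
+    /* e.g. PTL_AC_INV_INDEX or PTL_PT_INV_INDEX */
+\newline 
+}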
+\layout Section
+
+Data Movement Operations
+\begin_inset LatexCommand \label{sec:datamovement}
+
+\end_inset 
+
+
+\layout Standard
+
+The Portals API provides two data movement operations: 
+\emph on 
+PtlPut
+\emph default 
+ and 
+\emph on 
+PtlGet
+\emph default 
+.
+\layout Subsection
+
+PtlPut
+\begin_inset LatexCommand \label{sec:put}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+typedef enum { PTL_ACK_REQ, PTL_NOACK_REQ } ptl_ack_req_t;
+\newline 
+
+\newline 
+int PtlPut( ptl_handle_md_t  mem_desc,
+\newline 
+            ptl_ack_req_t    ack_req,
+\newline 
+            ptl_process_id_t target,
+\newline 
+            ptl_pt_index_t   portal,
+\newline 
+            ptl_ac_index_t   cookie,
+\newline 
+            ptl_match_bits_t match_bits,
+\newline 
+            ptl_size_t       offset,
+\newline 
+            ptl_hdr_data_t   hdr_data );
+\layout Standard
+\noindent 
+Values of the type 
+\family typewriter 
+ptl_ack_req_t
+\family default 
+ are used to control whether an acknowledgement should be sent when the
+ operation completes (i.e., when the data has been written to a memory descriptor
+ of the 
+\family typewriter 
+target
+\family default 
+ process).
+ The value 
+\family typewriter 
+PTL_ACK_REQ
+\family default 
+ requests an acknowledgement; the value 
+\family typewriter 
+PTL_NOACK_REQ
+\family default 
+ requests that no acknowledgement should be generated.
+\layout Standard
+
+The 
+\emph on 
+PtlPut
+\emph default 
+ function initiates an asynchronous put operation.
+ There are several events associated with a put operation: initiation of
+ the send on the local node (
+\family typewriter 
+PTL_EVENT_SEND_START
+\family default 
+), completion of the send on the local node (
+\family typewriter 
+PTL_EVENT_SEND_END
+\family default 
+ or 
+\family typewriter 
+PTL_EVENT_SEND_FAIL
+\family default 
+), and, when the send completes successfully, the receipt of an acknowledgement
+ (
+\family typewriter 
+PTL_EVENT_ACK
+\family default 
+) indicating that the operation was accepted by the target.
+ These events will be logged in the event queue associated with the memory
+ descriptor (
+\family typewriter 
+mem_desc
+\family default 
+) used in the put operation.
+ Using a memory descriptor that does not have an associated event queue
+ results in these events being discarded.
+ In this case, the application must have another mechanism (e.g., a higher
+ level protocol) for determining when it is safe to modify the memory region
+ associated with the memory descriptor.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a valid memory descriptor.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+target
+\family default 
+ is not a valid process id.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="8" columns="3">
+<features>
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the memory descriptor that describes the memory to be sent.
+  If the memory descriptor has an event queue  associated with it, it will
+ be used to record events when the  message has been sent (PTL_EVENT_SEND_START,
+ PTL_EVENT_SEND_END).
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ack_req
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+Controls whether an acknowledgement event is requested.
+  Acknowledgements are only sent when they are requested by the initiating
+ process 
+\series bold 
+and
+\series default 
+ the memory descriptor has an event queue 
+\series bold 
+and
+\series default 
+ the target memory descriptor enables them.
+ Allowed constants: 
+\family typewriter 
+PTL_ACK_REQ
+\family default 
+, 
+\family typewriter 
+PTL_NOACK_REQ
+\family default 
+.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+target
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A process id for the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index in the remote Portal table.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+cookie
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index into the access control table of the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match_bits
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The match bits to use for message selection at the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+offset
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The offset into the target memory descriptor (only used when the target
+ memory descriptor has the 
+\family typewriter 
+PTL_MD_MANAGE_REMOTE
+\family default 
+ option set).
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+hdr_data
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+64 bits of user data that can be included in the message header.
+  This data is written to an event queue entry at the target if an event
+ queue is present on the matching memory descriptor.
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
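+\layout Comment
+
+To illustrate (this is a sketch, not a normative example), assume md is
+ a memory descriptor handle created earlier and target is the ptl_process_id_t
+ of the receiver; the Portal index, access control index, match bits, offset,
+ and header data below are arbitrary values.
+\layout LyX-Code
+
+int rc = PtlPut( md,            /* local memory descriptor           */
+\newline 
+                 PTL_ACK_REQ,   /* request a PTL_EVENT_ACK           */
+\newline 
+                 target,        /* id of the target process          */
+\newline 
+                 4,             /* remote Portal table index         */
+\newline 
+                 0,             /* access control table index        */
+\newline 
+                 0x17,          /* match bits                        */
+\newline 
+                 0,             /* offset (remote-managed MDs only)  */
+\newline 
+                 0 );           /* hdr_data: 64 bits of user data    */
+\newline 
+if (rc != PTL_OK) {
+\newline 
+    /* PTL_INV_MD, PTL_INV_PROC, or PTL_NOINIT */
+\newline 
+}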
+\layout Subsection
+
+PtlGet
+\begin_inset LatexCommand \label{sec:get}
+
+\end_inset 
+
+
+\layout LyX-Code
+
+int PtlGet( ptl_handle_md_t  mem_desc,
+\newline 
+            ptl_process_id_t target,
+\newline 
+            ptl_pt_index_t   portal,
+\newline 
+            ptl_ac_index_t   cookie,
+\newline 
+            ptl_match_bits_t match_bits,
+\newline 
+            ptl_size_t       offset );
+\layout Standard
+\noindent 
+The 
+\emph on 
+PtlGet
+\emph default 
+ function initiates a remote read operation.
+ There are two event pairs associated with a get operation: when the data
+ is sent from the remote node, a 
+\family typewriter 
+PTL_EVENT_GET{START|END}
+\family default 
+ event pair is registered on the remote node; and when the data is returned
+ from the remote node, a 
+\family typewriter 
+PTL_EVENT_REPLY{START|END}
+\family default 
+ event pair is registered on the local node.
+\layout Subsubsection
+
+Return Codes
+\layout Description
+
+PTL_OK Indicates success.
+\layout Description
+
+PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+\layout Description
+
+PTL_INV_MD Indicates that 
+\family typewriter 
+mem_desc
+\family default 
+ is not a valid memory descriptor.
+\layout Description
+
+PTL_INV_PROC Indicates that 
+\family typewriter 
+target
+\family default 
+ is not a valid process id.
+\layout Subsubsection
+
+Arguments
+\layout Standard
+
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="6" columns="3">
+<features>
+<column alignment="right" valignment="top" width="0pt">
+<column alignment="center" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="4.7in">
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A handle for the memory descriptor that describes the memory into which
+ the requested data will be received.
+  The memory descriptor can have an event queue associated with it to record
+ events, such as when the message receive has started (
+\family typewriter 
+PTL_EVENT_REPLY
+\family default 
+_
+\family typewriter 
+START
+\family default 
+).
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+target
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+A process id for the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index in the remote Portal table.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+cookie
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The index into the access control table of the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match_bits
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The match bits to use for message selection at the target process.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+offset
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+input
+\end_inset 
+</cell>
+<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+The offset into the target memory descriptor (only used when the target
+ memory descriptor has the 
+\family typewriter 
+PTL_MD_MANAGE_REMOTE
+\family default 
+ option set).
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
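+\layout Comment
+
+Analogously, a hedged sketch of a remote read into the memory described
+ by md, under the same assumptions as the PtlPut example above; completion
+ is signalled by the PTL_EVENT_REPLY events on the event queue of md, if
+ it has one.
+\layout LyX-Code
+
+int rc = PtlGet( md,        /* local MD that receives the reply  */
+\newline 
+                 target,    /* id of the remote process          */
+\newline 
+                 4,         /* remote Portal table index         */
+\newline 
+                 0,         /* access control table index        */
+\newline 
+                 0x17,      /* match bits                        */
+\newline 
+                 0 );       /* offset (remote-managed MDs only)  */
+\newline 
+if (rc != PTL_OK) {
+\newline 
+    /* PTL_INV_MD, PTL_INV_PROC, or PTL_NOINIT */
+\newline 
+}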
+\layout Section
+
+Summary
+\layout Standard
+
+
+\begin_inset LatexCommand \label{sec:summary}
+
+\end_inset 
+
+ We conclude this section by summarizing the names introduced by the Portals
+ 3.2 API.
+ We start by summarizing the names of the types introduced by the API.
+ This is followed by a summary of the functions introduced by the API and
+ then by a summary of the function return codes.
+ Finally, we conclude with a summary of the other constant values introduced
+ by the API.
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:types}
+
+\end_inset 
+
+ presents a summary of the types defined by the Portals API.
+ The first column in this table gives the type name, the second column gives
+ a brief description of the type, the third column identifies the section
+ where the type is defined, and the fourth column lists the functions that
+ have arguments of this type.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Types Defined by the Portals 3.2 API
+\begin_inset LatexCommand \label{tab:types}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\noindent 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="25" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="2in">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="2.2in">
+<row bottomline="true">
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+ Name
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+ Meaning 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+ Sect
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+ Functions 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ac_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+indexes for an access control table 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:index-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlACEntry, PtlPut, PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ack_req_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+acknowledgement request types 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlPut
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+kinds of events
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlGet
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+information about events 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:event-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlEQGet
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_seq_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+event sequence number
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:event-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlEQGet, PtlEQWait
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_any_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for any object 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIHandle 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_eq_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for event queues 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlEQAlloc, PtlEQFree, PtlEQGet, PtlEQWait, PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_md_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for memory descriptors 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAlloc, PtlMDUnlink, PtlMDUpdate, PtlMEAttach, PtlMEAttachAny, PtlMEInsert,
+ PtlPut, PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_me_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for match entries 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMEUnlink 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_ni_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+handles for network interfaces 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit, PtlNIFini, PtlNIStatus, PtlNIDist, PtlEQAlloc, PtlACEntry, PtlPut,
+ PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_nid_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+node identifiers
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlGetId, PtlACEntry
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+process identifier
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlGetId, PtlACEntry
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_uid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+user identifier
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlGetUid, PtlACEntry
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ins_pos_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+insertion position (before or after) 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlMEInsert 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_interface_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+identifiers for network interfaces 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_match_bits_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+match (and ignore) bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:mb-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlPut, PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_md_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+memory descriptors 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach, PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ni_fail_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+network interface-specific failures
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlEQGet, PtlEQWait
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+process identifiers 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:pid-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlGetId, PtlNIDist, PtlMEAttach, PtlMEAttachAny, PtlACEntry, PtlPut, PtlGet
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+indexes for Portal tables 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:index-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlACEntry 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+sizes 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:size-t}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlEQAlloc, PtlPut, PtlGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_sr_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+indexes for status registers 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:stat-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIStatus 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_sr_value_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+values in status registers 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:stat-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIStatus 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_unlink_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+unlink options 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMDAttach 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:func}
+
+\end_inset 
+
+ presents a summary of the functions defined by the Portals API.
+ The first column in this table gives the name for the function, the second
+ column gives a brief description of the operation implemented by the function,
+ and the third column identifies the section where the function is defined.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Functions Defined by the Portals 3.2 API
+\begin_inset LatexCommand \label{tab:func}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="24" columns="3">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+Name 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Operation 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ Section 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlACEntry 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ update an entry in an access control table 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ac}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlEQAlloc 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ create an event queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlEQGet 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ get the next event from an event queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlEQFree 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ release the resources for an event queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlEQWait 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ wait for a new event in an event queue 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:eq}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlFini 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ shut down the Portals API 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:init}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlGet 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ perform a get operation 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:datamovement}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlGetId 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ get the id for the current process 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:pid}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlInit 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ initialize the Portals API 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:init}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMDAttach 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ create a memory descriptor and attach it to a match entry 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMDBind 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ create a free-floating memory descriptor 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:mdbind}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMDUnlink 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ remove a memory descriptor from a list and release its resources 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMDUpdate 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ update a memory descriptor 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMEAttach 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+create a match entry and attach it to a Portal table 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlMEAttachAny
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+create a match entry and attach it to a free Portal table entry
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:attachany}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMEInsert 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ create a match entry and insert it in a list 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlMEUnlink 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ remove a match entry from a list and release its resources 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:me}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIDist 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ get the distance to another process 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIFini 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ shut down a network interface 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIHandle 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ get the network interface handle for an object 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIInit 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ initialize a network interface 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlNIStatus 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ read a network interface status register 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ PtlPut 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ perform a put operation 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:datamovement}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:retcodes}
+
+\end_inset 
+
+ summarizes the return codes used by functions defined by the Portals API.
+ All of these constants are integer values.
+ The first column of this table gives the symbolic name for the constant,
+ the second column gives a brief description of the value, and the third
+ column identifies the functions that can return this value.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Function Return Codes for the Portals 3.2 API
+\begin_inset LatexCommand \label{tab:retcodes}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="27" columns="3">
+<features>
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="2.6in">
+<row bottomline="true">
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Name
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Meaning 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Functions
+\series default 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_AC_INV_INDEX
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid access control table index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlACEntry 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EQ_DROPPED
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+at least one event has been dropped 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlEQGet, PtlEQWait 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EQ_EMPTY
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+no events available in an event queue 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlEQGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_FAIL 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+error during initialization or cleanup 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlInit, PtlFini 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_ILL_MD
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+illegal memory descriptor values 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach, PtlMDBind, PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INIT_DUP 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+duplicate initialization of an interface 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INIT_INV
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initialization of an invalid interface 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INUSE
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+the ME already has an MD
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_ASIZE
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid access control table size 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_EQ
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid event queue handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDUpdate, PtlEQFree, PtlEQGet 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_HANDLE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIHandle 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_MD 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid memory descriptor handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDUnlink, PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_ME
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid match entry handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlMDAttach 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_NI 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid network interface handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIDist, PtlNIFini, PtlMDBind, PtlEQAlloc 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_PROC 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid process identifier 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit, PtlNIDist, PtlMEAttach, PtlMEInsert, PtlACEntry, PtlPut, PtlGet
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_PTINDEX
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid Portal table index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlMEAttach 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_REG 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid status register 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlNIStatus 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INV_SR_INDX 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+invalid status register index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlNIStatus 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_ML_TOOLONG 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match list too long 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlMEAttach, PtlMEInsert 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_INUSE
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+MD has pending operations
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlMDUnlink
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_NOINIT 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+uninitialized API 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+\emph on 
+all
+\emph default 
+, except PtlInit 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_NOSPACE
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+insufficient memory 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit, PtlMDAttach, PtlMDBind, PtlEQAlloc, PtlMEAttach, PtlMEInsert
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+PTL_NOUPDATE
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ no update was performed 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+ PtlMDUpdate 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_PT_FULL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+Portal table is full
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+PtlMEAttachAny
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_OK 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ success 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+\emph on 
+all
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_SEGV 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+addressing violation 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+\noindent 
+PtlNIInit, PtlNIStatus, PtlNIDist, PtlNIHandle, PtlMDBind, PtlMDUpdate,
+ PtlEQAlloc, PtlEQGet, PtlEQWait 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
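+\layout Standard
+
+As an illustration of how these return codes are used, the following
+ non-normative sketch drains an event queue with 
+\emph on 
+PtlEQGet
+\emph default 
+ and reacts to the codes listed in the table above.
+ The helper name, the assumption that the Portals declarations are already
+ in scope, and the handling of the returned event are choices made for this
+ example only.
+\layout LyX-Code
+
+/* Non-normative sketch: drain an event queue using the return codes
+\layout LyX-Code
+
+   summarized above.  Assumes the Portals declarations (p30.h) are in
+\layout LyX-Code
+
+   scope.  PTL_EQ_DROPPED is treated as still delivering an event. */
+\layout LyX-Code
+
+int drain_eq(ptl_handle_eq_t eq)
+\layout LyX-Code
+
+{
+\layout LyX-Code
+
+    ptl_event_t ev;
+\layout LyX-Code
+
+    int rc;
+\layout LyX-Code
+
+    while ((rc = PtlEQGet(eq, &ev)) != PTL_EQ_EMPTY) {
+\layout LyX-Code
+
+        if (rc != PTL_OK && rc != PTL_EQ_DROPPED)
+\layout LyX-Code
+
+            return rc;   /* e.g. PTL_INV_EQ or PTL_NOINIT */
+\layout LyX-Code
+
+        /* process ev here */
+\layout LyX-Code
+
+    }
+\layout LyX-Code
+
+    return PTL_OK;
+\layout LyX-Code
+
+}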
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:oconsts}
+
+\end_inset 
+
+ summarizes the remaining constant values introduced by the Portals API.
+ The first column in this table presents the symbolic name for the constant,
+ the second column gives a brief description of the value, the third column
+ identifies the base type for the value, and the last two columns identify
+ the sections in which the value is introduced and referenced.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Other Constants Defined by the Portals 3.2 API
+\begin_inset LatexCommand \label{tab:oconsts}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="36" columns="5">
+<features>
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Name
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Meaning 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Base type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Intr.
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Ref.
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_ACK_REQ
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+request an acknowledgement 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ack_req_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EQ_NONE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a NULL event queue handle 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_eq_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:handle-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:mdupdate}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_GET_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+get event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_GET_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+get event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_GET_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+get event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_PUT_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+put event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_PUT_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+put event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_PUT_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+put event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_REPLY_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+reply event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_REPLY_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+reply event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_REPLY_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+reply event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_ACK_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+acknowledgement event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_ACK_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+acknowledgement event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_ACK_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+acknowledgement event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_SEND_START
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+send event start
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_SEND_END
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+send event end
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_SEND_FAIL
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+send event fail
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_EVENT_UNLINK
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+unlink event
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_event_kind_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:ek-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_PID_ANY 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+wildcard for process id fields 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pid_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_NID_ANY
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+wildcard for node id fields
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_nid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_UID_ANY
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+wildcard for user id
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_uid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:id-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\begin_inset LatexCommand \ref{sec:meattach}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_IFACE_DEFAULT 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+default interface 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_interface_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:ni-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INS_AFTER 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+insert after 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ins_pos_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meinsert}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_INS_BEFORE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+insert before 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ins_pos_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:meinsert}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_ACK_DISABLE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to disable acknowledgements 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_MANAGE_REMOTE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to enable the use of remote offsets 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+, 
+\begin_inset LatexCommand \ref{sec:get}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_OP_GET 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to enable get operations 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_OP_PUT 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to enable put operations 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_THRESH_INF 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+infinite threshold for a memory descriptor 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_MD_TRUNCATE 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+a flag to enable truncation of a request 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:md-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_NOACK_REQ 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+request no acknowledgement 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ack_req_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:put}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_PT_INDEX_ANY 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+wildcard for Portal indexes 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:acentry}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_RETAIN 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+disable unlinking 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_unlink_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_SR_DROP_COUNT 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+index for the dropped count register 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_sr_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:stat-type}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:nistatus}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+PTL_UNLINK 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+enable unlinking 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_unlink_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\begin_inset LatexCommand \ref{sec:mdattach}
+
+\end_inset 
+
+
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
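+\layout Standard
+
+The event kind constants above are normally examined in the event returned
+ by 
+\emph on 
+PtlEQGet
+\emph default 
+ or 
+\emph on 
+PtlEQWait
+\emph default 
+.
+ The sketch below is illustrative only; in particular, the name of the kind
+ field (type) is an assumption, since the event structure is defined elsewhere
+ in this document.
+\layout LyX-Code
+
+/* Illustrative only: map an event kind to a short description.
+\layout LyX-Code
+
+   Assumes the Portals declarations (p30.h) are in scope; the field
+\layout LyX-Code
+
+   name 'type' is an assumption for this example. */
+\layout LyX-Code
+
+static const char *event_name(const ptl_event_t *ev)
+\layout LyX-Code
+
+{
+\layout LyX-Code
+
+    switch (ev->type) {
+\layout LyX-Code
+
+    case PTL_EVENT_PUT_END:   return "put completed at the target";
+\layout LyX-Code
+
+    case PTL_EVENT_GET_END:   return "get completed at the target";
+\layout LyX-Code
+
+    case PTL_EVENT_REPLY_END: return "reply arrived at the initiator";
+\layout LyX-Code
+
+    case PTL_EVENT_ACK_END:   return "acknowledgement arrived";
+\layout LyX-Code
+
+    case PTL_EVENT_SEND_END:  return "local send completed";
+\layout LyX-Code
+
+    case PTL_EVENT_UNLINK:    return "memory descriptor unlinked";
+\layout LyX-Code
+
+    default:                  return "other event kind";
+\layout LyX-Code
+
+    }
+\layout LyX-Code
+
+}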
+\layout Chapter
+
+The Semantics of Message Transmission
+\begin_inset LatexCommand \label{sec:semantics}
+
+\end_inset 
+
+
+\layout Standard
+
+The Portals API uses four types of messages: put requests, acknowledgements,
+ get requests, and replies.
+ In this section, we describe the information passed on the wire for each
+ type of message.
+ We also describe how this information is used to process incoming messages.
+\layout Section
+
+Sending Messages
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:put-wire}
+
+\end_inset 
+
+ summarizes the information that is transmitted for a put request.
+ The first column provides a descriptive name for the information, the second
+ column provides the type for this information, the third column identifies
+ the source of the information, and the fourth column provides additional
+ notes.
+ Most information that is transmitted is obtained directly from the 
+\emph on 
+PtlPut
+\emph default 
+ operation.
+ Notice that the handle for the memory descriptor used in the 
+\emph on 
+PtlPut
+\emph default 
+ operation is transmitted even though this value cannot be interpreted by
+ the target.
+Any value other than 
+\family typewriter 
+PTL_MD_NONE
+\family default 
+ is interpreted as a request for an acknowledgement.
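+\layout Standard
+
+As a concrete, non-normative illustration, the sketch below issues a put
+ request and selects between 
+\family typewriter 
+PTL_ACK_REQ
+\family default 
+ and 
+\family typewriter 
+PTL_NOACK_REQ
+\family default 
+.
+ The helper name and the exact prototype shown are assumptions made for this
+ example; the argument names follow the put request table.
+\layout LyX-Code
+
+/* Non-normative sketch: issue a put request and optionally ask for an
+\layout LyX-Code
+
+   acknowledgement.  Assumes the Portals declarations (p30.h) are in
+\layout LyX-Code
+
+   scope; the helper name and exact prototype are illustrative. */
+\layout LyX-Code
+
+int put_with_optional_ack(ptl_handle_md_t md, ptl_process_id_t target,
+\layout LyX-Code
+
+                          ptl_pt_index_t portal, ptl_ac_index_t cookie,
+\layout LyX-Code
+
+                          ptl_match_bits_t match_bits, ptl_size_t offset,
+\layout LyX-Code
+
+                          ptl_hdr_data_t hdr_data, int want_ack)
+\layout LyX-Code
+
+{
+\layout LyX-Code
+
+    ptl_ack_req_t ack = want_ack ? PTL_ACK_REQ : PTL_NOACK_REQ;
+\layout LyX-Code
+
+    return PtlPut(md, ack, target, portal, cookie, match_bits,
+\layout LyX-Code
+
+                  offset, hdr_data);
+\layout LyX-Code
+
+}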
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Information Passed in a Put Request
+\begin_inset LatexCommand \label{tab:put-wire}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="12" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Information 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+\emph on 
+PtlPut
+\emph default 
+ arg
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Notes 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+operation 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+indicates a put request 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+local information 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+user
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_uid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+local information
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+target
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+cookie 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ac_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+cookie
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_match_bits_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match_bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+offset
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_md_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+no ack if 
+\family typewriter 
+PTL_MD_NONE
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+length
+\family default 
+ member 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+data 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family roman 
+\emph on 
+bytes
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+start
+\family default 
+ and 
+\family typewriter 
+length
+\family default 
+ members 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
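+\layout Standard
+
+The encoding of this information on the wire is left to the network layer;
+ the following structure is only an illustrative sketch that collects the
+ fields listed above, using the types from the table.
+ The structure and its field names are not part of the API.
+\layout LyX-Code
+
+
+\size small 
+/* illustrative sketch only: a possible header for a put request */
+\newline 
+typedef struct {
+\newline 
+    int               operation;    /* indicates a put request */
+\newline 
+    ptl_process_id_t  initiator;    /* local information */
+\newline 
+    ptl_uid_t         user;         /* local information */
+\newline 
+    ptl_process_id_t  target;       /* target argument of PtlPut */
+\newline 
+    ptl_pt_index_t    portal;       /* portal argument */
+\newline 
+    ptl_ac_index_t    cookie;       /* cookie argument */
+\newline 
+    ptl_match_bits_t  match_bits;   /* match_bits argument */
+\newline 
+    ptl_size_t        offset;       /* offset argument */
+\newline 
+    ptl_handle_md_t   mem_desc;     /* no ack if PTL_MD_NONE */
+\newline 
+    ptl_size_t        length;       /* length member of the descriptor */
+\newline 
+} put_hdr_t;                        /* followed by length bytes of data */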
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:ack-wire}
+
+\end_inset 
+
+ summarizes the information transmitted in an acknowledgement.
+ Most of the information is simply echoed from the put request.
+ Notice that the initiator and target are obtained directly from the put
+ request, but are swapped in generating the acknowledgement.
+ The only new piece of information in the acknowledgement is the manipulated
+ length, which is determined as the put request is satisfied.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Information Passed in an Acknowledgement
+\begin_inset LatexCommand \label{tab:ack-wire}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="10" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Information
+\series default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Put Information 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\series bold 
+Notes 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+operation 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ indicates an acknowledgement 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_pt_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_match_bits_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ ptl_handle_md_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ requested length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ manipulated length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+ obtained from the operation 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
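+\layout Standard
+
+As with the put request, the encoding of an acknowledgement is left to the
+ network layer; the sketch below simply collects the fields listed above
+ and is not part of the API.
+\layout LyX-Code
+
+
+\size small 
+/* illustrative sketch only: a possible header for an acknowledgement */
+\newline 
+typedef struct {
+\newline 
+    int               operation;    /* indicates an acknowledgement */
+\newline 
+    ptl_process_id_t  initiator;    /* the target of the put request */
+\newline 
+    ptl_process_id_t  target;       /* the initiator of the put request */
+\newline 
+    ptl_pt_index_t    portal;       /* echoed from the put request */
+\newline 
+    ptl_match_bits_t  match_bits;   /* echoed */
+\newline 
+    ptl_size_t        offset;       /* echoed */
+\newline 
+    ptl_handle_md_t   mem_desc;     /* echoed */
+\newline 
+    ptl_size_t        rlength;      /* requested length, echoed */
+\newline 
+    ptl_size_t        mlength;      /* manipulated length */
+\newline 
+} ack_hdr_t;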
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:get-wire}
+
+\end_inset 
+
+ summarizes the information that is transmitted for a get request.
+ Like the information transmitted in a put request, most of the information
+ transmitted in a get request is obtained directly from the 
+\emph on 
+PtlGet
+\emph default 
+ operation.
+ Unlike put requests, get requests do not include the event queue handle.
+ In this case, the reply is generated whenever the operation succeeds and
+ the memory descriptor must not be unlinked until the reply is received.
+ As such, there is no advantage to explicitly sending the event queue handle.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Information Passed in a Get Request
+\begin_inset LatexCommand \label{tab:get-wire}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="11" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Information
+\series default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+\emph on 
+PtlGet
+\emph default 
+ argument
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Notes 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+operation 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+indicates a get operation 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+local information 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+user
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_uid_t
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+local information
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+portal
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+cookie 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_ac_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+cookie 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_match_bits_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+match_bits
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_md_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc
+\family default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+mem_desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+length
+\family default 
+ member 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+Table\SpecialChar ~
+
+\begin_inset LatexCommand \ref{tab:reply-wire}
+
+\end_inset 
+
+ summarizes the information transmitted in a reply.
+ Like an acknowledgement, most of the information is simply echoed from
+ the get request.
+ The initiator and target are obtained directly from the get request, but
+ are swapped in generating the reply.
+ The only new pieces of information in the reply are the manipulated length
+ and the data, which are determined as the get request is satisfied.
+\layout Standard
+
+
+\begin_inset Float table
+placement htbp
+wide false
+collapsed false
+
+\layout Caption
+
+Information Passed in a Reply
+\begin_inset LatexCommand \label{tab:reply-wire}
+
+\end_inset 
+
+
+\layout Standard
+
+
+\begin_inset ERT
+status Collapsed
+
+\layout Standard
+
+\backslash 
+medskip  
+\end_inset 
+
+
+\layout Standard
+\align center 
+
+\size small 
+
+\begin_inset  Tabular
+<lyxtabular version="3" rows="11" columns="4">
+<features firstHeadEmpty="true">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<column alignment="left" valignment="top" width="0pt">
+<row bottomline="true">
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Information
+\series default 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Type
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Get Information 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\series bold 
+Notes 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+operation 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+int
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+indicates a reply 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+target 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_process_id_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+initiator 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_pt_index_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+portal index 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" bottomline="true" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_match_bits_t 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+match bits 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+offset 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_handle_md_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+memory desc 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+requested length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+echo 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+manipulated length 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\family typewriter 
+ptl_size_t
+\family default 
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+obtained from the operation 
+\end_inset 
+</cell>
+</row>
+<row>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+data 
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+
+\emph on 
+bytes
+\end_inset 
+</cell>
+<cell alignment="left" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+\end_inset 
+</cell>
+<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text
+
+\layout Standard
+
+obtained from the operation 
+\end_inset 
+</cell>
+</row>
+</lyxtabular>
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Section
+
+Receiving Messages
+\begin_inset LatexCommand \label{sec:receiving}
+
+\end_inset 
+
+
+\layout Standard
+
+When an incoming message arrives on a network interface, the communication
+ system first checks that the target process identified in the request is
+ a valid process that has initialized the network interface (i.e., that the
+ target process has a valid Portal table).
+ If this test fails, the communication system discards the message and increment
+s the dropped message count for the interface.
+ The remainder of the processing depends on the type of the incoming message.
+ Put and get messages are subject to access control checks and translation
+ (searching a match list), while acknowledgement and reply messages bypass
+ the access control checks and the translation step.
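+\layout Standard
+
+The following fragment sketches this dispatch.
+ The header structure, the interface state structure, and the helper functions
+ are hypothetical and stand in for implementation specific code; only the
+ checks themselves are specified by this document.
+\layout LyX-Code
+
+
+\size small 
+/* illustrative sketch only: dispatching an incoming message */
+\newline 
+typedef struct {
+\newline 
+    int               operation;    /* OP_PUT, OP_GET, OP_ACK or OP_REPLY */
+\newline 
+    ptl_process_id_t  target;
+\newline 
+    /* remaining header fields omitted */
+\newline 
+} msg_hdr_t;
+\newline 
+
+\newline 
+void deliver_msg( ni_state_t *ni, msg_hdr_t *hdr )
+\newline 
+{
+\newline 
+    if( !valid_target(ni, hdr->target) ) {      /* no Portal table */
+\newline 
+        ni->dropped_count++;
+\newline 
+        return;
+\newline 
+    }
+\newline 
+    switch( hdr->operation ) {
+\newline 
+    case OP_ACK:   handle_ack(ni, hdr);     break;  /* no translation */
+\newline 
+    case OP_REPLY: handle_reply(ni, hdr);   break;  /* no translation */
+\newline 
+    case OP_PUT:
+\newline 
+    case OP_GET:   handle_request(ni, hdr); break;  /* checks and matching */
+\newline 
+    default:       ni->dropped_count++;     break;
+\newline 
+    }
+\newline 
+}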
+\layout Standard
+
+Acknowledgement messages include a handle for the memory descriptor used
+ in the original 
+\emph on 
+PtlPut
+\emph default 
+ operation.
+ This memory descriptor will identify the event queue where the event should
+ be recorded.
+ Upon receipt of an acknowledgement, the runtime system only needs to confirm
+ that the memory descriptor and event queue still exist and that there is
+ space for another event.
+ Should any of these conditions fail, the message is simply discarded
+ and the dropped message count for the interface is incremented.
+ Otherwise, the system builds an acknowledgement event from the information
+ in the acknowledgement message and adds it to the event queue.
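+\layout Standard
+
+A sketch of this check follows; the lookup helpers, the internal descriptor
+ and event queue types, and the event type name are assumptions, not part
+ of the specification.
+\layout LyX-Code
+
+
+\size small 
+/* illustrative sketch only: processing an incoming acknowledgement */
+\newline 
+void handle_ack( ni_state_t *ni, msg_hdr_t *hdr )
+\newline 
+{
+\newline 
+    md_t *md = lookup_md( ni, hdr->mem_desc );  /* handle from the put */
+\newline 
+
+\newline 
+    if( md == NULL || md->eventq == NULL || eq_full(md->eventq) ) {
+\newline 
+        ni->dropped_count++;                    /* quietly discard */
+\newline 
+        return;
+\newline 
+    }
+\newline 
+    record_event( md->eventq, PTL_EVENT_ACK, hdr );
+\newline 
+}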
+\layout Standard
+
+Reception of reply messages is also relatively straightforward.
+ Each reply message includes a handle for a memory descriptor.
+ If this descriptor exists, it is used to receive the message.
+ A reply message will be dropped if the memory descriptor identified in
+ the request doesn't exist.
+ In this case, the dropped message count for the interface is
+ incremented.
+ These are the only reasons for dropping reply messages.
+ Every memory descriptor accepts and truncates incoming reply messages,
+ eliminating the other potential reasons for rejecting a reply message.
+\layout Standard
+
+The critical step in processing an incoming put or get request involves
+ mapping the request to a memory descriptor.
+ This step starts by using the Portal index in the incoming request to identify
+ a list of match entries.
+ This list of match entries is searched in order until a match entry is
+ found whose match criteria matches the match bits in the incoming request
+ and whose memory descriptor accepts the request.
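+\layout Standard
+
+A sketch of this translation step is shown below.
+ The list structures are hypothetical, and msg_hdr_t and md_t are the
+ hypothetical types used in the earlier sketches; the matching rule (ignored
+ bits are masked out before the comparison) and the first-match-wins order
+ are the points being illustrated.
+\layout LyX-Code
+
+
+\size small 
+/* illustrative sketch only: searching a match list */
+\newline 
+typedef struct match_entry {
+\newline 
+    ptl_match_bits_t    match_bits;
+\newline 
+    ptl_match_bits_t    ignore_bits;
+\newline 
+    md_t               *md;          /* attached memory descriptor */
+\newline 
+    struct match_entry *next;
+\newline 
+} match_entry_t;
+\newline 
+
+\newline 
+md_t *translate( match_entry_t *list, msg_hdr_t *hdr )
+\newline 
+{
+\newline 
+    match_entry_t *me;
+\newline 
+
+\newline 
+    for( me = list; me != NULL; me = me->next ) {
+\newline 
+        if( ((me->match_bits ^ hdr->match_bits) & ~me->ignore_bits) == 0
+\newline 
+            && md_accepts(me->md, hdr) )
+\newline 
+            return me->md;           /* first acceptable entry wins */
+\newline 
+    }
+\newline 
+    return NULL;                     /* the request will be dropped */
+\newline 
+}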
+\layout Standard
+
+Because acknowledgement and reply messages are generated in response to requests
+ made by the process receiving these messages, the checks performed by the
+ runtime system for acknowledgements and replies are minimal.
+ In contrast, put and get messages are generated by remote processes and
+ the checks performed for these messages are more extensive.
+ Incoming put or get messages may be rejected because: 
+\layout Itemize
+
+the Portal index supplied in the request is not valid; 
+\layout Itemize
+
+the cookie supplied in the request is not a valid access control entry;
+\layout Itemize
+
+the access control entry identified by the cookie does not match the identifier
+ of the requesting process; 
+\layout Itemize
+
+the access control entry identified by the cookie does not
+ match the Portal index supplied in the request; or 
+\layout Itemize
+
+the match bits supplied in the request do not match any of the match entries
+ with a memory descriptor that accepts the request.
+\layout Standard
+
+In all cases, if the message is rejected, the incoming message is discarded
+ and the dropped message count for the interface is incremented.
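+\layout Standard
+
+The checks listed above might be organized as follows; the access control
+ table layout and the helper predicates are hypothetical.
+\layout LyX-Code
+
+
+\size small 
+/* illustrative sketch only: access control checks for put and get */
+\newline 
+int request_allowed( ni_state_t *ni, msg_hdr_t *hdr )
+\newline 
+{
+\newline 
+    acl_entry_t *ace;
+\newline 
+
+\newline 
+    if( hdr->portal >= ni->num_portals )
+\newline 
+        return 0;                    /* invalid Portal index */
+\newline 
+    if( hdr->cookie >= ni->num_acl_entries )
+\newline 
+        return 0;                    /* invalid access control entry */
+\newline 
+    ace = &ni->acl[hdr->cookie];
+\newline 
+    if( !id_matches(ace, hdr->initiator, hdr->user) )
+\newline 
+        return 0;                    /* wrong requesting process */
+\newline 
+    if( !portal_matches(ace, hdr->portal) )
+\newline 
+        return 0;                    /* entry covers a different Portal */
+\newline 
+    return 1;                        /* proceed to the match list search */
+\newline 
+}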
+\layout Standard
+
+A memory descriptor may reject an incoming request for either of the following
+ reasons (a sketch of this check follows the list): 
+\layout Itemize
+
+the 
+\family typewriter 
+PTL_MD_PUT
+\family default 
+ or 
+\family typewriter 
+PTL_MD_GET
+\family default 
+ option has not been enabled and the operation is put or get, respectively;
+\layout Itemize
+
+the length specified in the request is too long for the memory descriptor
+ and the 
+\family typewriter 
+PTL_MD_TRUNCATE
+\family default 
+ option has not been enabled.
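+\layout Standard
+
+A sketch of this acceptance test follows, assuming a remotely managed offset;
+ the internal md_t type is hypothetical, and the option names follow the
+ list above.
+\layout LyX-Code
+
+
+\size small 
+/* illustrative sketch only: can this descriptor accept the request? */
+\newline 
+int md_accepts( md_t *md, msg_hdr_t *hdr )
+\newline 
+{
+\newline 
+    if( hdr->operation == OP_PUT && !(md->options & PTL_MD_PUT) )
+\newline 
+        return 0;                    /* put operations not enabled */
+\newline 
+    if( hdr->operation == OP_GET && !(md->options & PTL_MD_GET) )
+\newline 
+        return 0;                    /* get operations not enabled */
+\newline 
+    if( hdr->offset + hdr->length > md->length
+\newline 
+        && !(md->options & PTL_MD_TRUNCATE) )
+\newline 
+        return 0;                    /* too long and truncation not enabled */
+\newline 
+    return 1;
+\newline 
+}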
+\layout Chapter
+
+Examples
+\begin_inset LatexCommand \label{sec:examples}
+
+\end_inset 
+
+
+\layout Comment
+
+The examples presented in this chapter have not been updated to reflect
+ the current API.
+\layout Standard
+
+In this section we present several examples to illustrate expected usage
+ patterns for the Portals 3.2 API.
+ The first example describes how to implement parallel servers using the
+ features of the Portals 3.2 API.
+ This example covers the access control list and the use of remote managed
+ offsets.
+ The second example presents an approach to dealing with dropped requests.
+ This example covers aspects of match lists and memory descriptors.
+ The final example covers message reception in MPI.
+ This example illustrates more sophisticated uses of matching and a procedure
+ to update a memory descriptor.
+\layout Section
+
+Parallel File Servers
+\begin_inset LatexCommand \label{sec:expfs}
+
+\end_inset 
+
+
+\layout Standard
+
+Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:file}
+
+\end_inset 
+
+ illustrates the logical structure of a parallel file server.
+ In this case, the parallel server consists of four servers that stripe
+ application data across four disks.
+ We would like to present applications with the illusion that the file server
+ is a single entity.
+ We will assume that all of the processes that constitute the parallel server
+ have the same user id.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename file.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 196pt
+       lyxheight 147pt
+\end_inset 
+
+
+\layout Caption
+
+Parallel File Server
+\begin_inset LatexCommand \label{fig:file}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+When an application establishes a connection to the parallel file server,
+ it will allocate a Portal and access control list entry for communicating
+ with the server.
+ The access control list entry will include the Portal and match any process
+ with the parallel file server's user id, so all of the file server processes
+ will have access to the Portal.
+ The Portal information and access control entry will be sent to the file
+ server at this time.
+ If the application and server need to have multiple, concurrent I/O operations,
+ they can use additional portals or match entries to keep the operations
+ from interfering with one another.
+\layout Standard
+
+When an application initiates an I/O operation, it first builds a memory
+ descriptor that describes the memory region involved in the operation.
+ This memory descriptor will enable the appropriate operation (put for read
+ operations and get for write operations) and enable the use of remote offsets
+ (this lets the servers decide where their data should be placed in the
+ memory region).
+ After creating the memory descriptor and linking it into the appropriate
+ Portal entry, the application sends a read or write request (using 
+\emph on 
+PtlPut
+\emph default 
+) to one of the file server processes.
+ The file server processes can then use put or get operations with the appropria
+te offsets to fill or retrieve the contents of the application's buffer.
+ To know when the operation has completed, the application can add an event
+ queue to the memory descriptor and add up the lengths of the remote operations
+ until the sum is the size of the requested I/O operation.
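+\layout Standard
+
+The client side of a read operation might look like the following sketch.
+ The Portal indexes, the access control index, and the flag names used for
+ acknowledgements and remote offsets are assumptions made for the example,
+ and the calls follow the conventions of the other examples in this chapter
+ rather than the current API.
+\layout LyX-Code
+
+
+\size small 
+#include <portals.h>
+\newline 
+
+\newline 
+/* illustrative values; a real client learns these from the server */
+\newline 
+#define FS_IO_PINDEX   4     /* Portal allocated for this connection */
+\newline 
+#define FS_REQ_PINDEX  5     /* Portal where the server takes requests */
+\newline 
+#define FS_ACL         1     /* access control entry at the server */
+\newline 
+
+\newline 
+int fs_read( ptl_handle_ni_t ni, ptl_process_id_t server, void *buf,
+\newline 
+                ptl_size_t len, ptl_handle_eq_t eventq, ptl_match_bits_t match )
+\newline 
+{
+\newline 
+    ptl_md_t        md;
+\newline 
+    ptl_handle_me_t me;
+\newline 
+    ptl_handle_md_t mdh, reqh;
+\newline 
+
+\newline 
+    /* the servers fill buf using put operations at remote offsets */
+\newline 
+    PtlMEAttach( FS_IO_PINDEX, server, match, 0, PTL_UNLINK, &me );
+\newline 
+    md.start = buf;
+\newline 
+    md.length = len;
+\newline 
+    md.threshold = PTL_MD_THRESH_INF;
+\newline 
+    md.options = PTL_MD_OP_PUT | PTL_MD_MANAGE_REMOTE;  /* flag name assumed */
+\newline 
+    md.user_ptr = NULL;
+\newline 
+    md.eventq = eventq;      /* sum event lengths until len bytes arrive */
+\newline 
+    PtlMDAttach( me, md, PTL_UNLINK, &mdh );
+\newline 
+
+\newline 
+    /* send the read request itself to one of the server processes; a real
+\newline 
+     * request record would carry the file, offset, match bits and length */
+\newline 
+    md.start = &len;
+\newline 
+    md.length = sizeof(len);
+\newline 
+    md.threshold = 1;
+\newline 
+    md.options = 0;
+\newline 
+    md.eventq = PTL_EQ_NONE;
+\newline 
+    PtlMDBind( ni, md, &reqh );
+\newline 
+    PtlPut( reqh, PTL_NOACK_REQ, server, FS_REQ_PINDEX, FS_ACL, match, 0 );
+\newline 
+    return 0;
+\newline 
+}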
+\layout Section
+
+Dealing with Dropped Requests
+\begin_inset LatexCommand \label{sec:exdrop}
+
+\end_inset 
+
+
+\layout Standard
+
+If a process does not anticipate unexpected requests, they will be discarded.
+ Applications using the Portals API can query the dropped count for the
+ interface to determine the number of requests that have been dropped (see
+ Section\SpecialChar ~
+
+\begin_inset LatexCommand \ref{sec:nistatus}
+
+\end_inset 
+
+).
+ While this approach minimizes resource consumption, it does not provide
+ information that might be critical in debugging the implementation of a
+ higher level protocol.
+\layout Standard
+
+To keep track of more information about dropped requests, we use a memory
+ descriptor that truncates each incoming request to zero bytes and logs
+ the 
+\begin_inset Quotes eld
+\end_inset 
+
+dropped
+\begin_inset Quotes erd
+\end_inset 
+
+ operations in an event queue.
+ Note that the operations are not dropped in the Portals sense, because
+ the operation succeeds.
+\layout Standard
+
+The following code fragment illustrates an implementation of this approach.
+ In this case, we assume that a thread is launched to execute the function
+\family typewriter 
+watch_drop
+\family default 
+.
+ This code starts by building an event queue to log truncated operations
+ and a memory descriptor to truncate the incoming requests.
+ This example only captures 
+\begin_inset Quotes eld
+\end_inset 
+
+dropped
+\begin_inset Quotes erd
+\end_inset 
+
+ requests for a single portal.
+ In a more realistic situation, the memory descriptor would be appended
+ to the match list for every portal.
+ We also assume that the thread is capable of keeping up with the 
+\begin_inset Quotes eld
+\end_inset 
+
+dropped
+\begin_inset Quotes erd
+\end_inset 
+
+ requests.
+ If this is not the case, we could use a finite threshold on the memory
+ descriptor to capture the first few dropped requests.
+\layout LyX-Code
+
+
+\size small 
+#include <stdio.h>
+\newline 
+#include <stdlib.h>
+\newline 
+#include <portals.h>
+\newline 
+
+\newline 
+#define DROP_SIZE 32       /* number of dropped requests to track */
+\newline 
+
+\newline 
+int watch_drop( ptl_handle_ni_t ni, ptl_pt_index_t index ) {
+\newline 
+    ptl_handle_eq_t drop_events;
+\newline 
+    ptl_event_t event;
+\newline 
+    ptl_handle_md_t drop_em;
+\newline 
+    ptl_md_t drop_desc;
+\newline 
+    ptl_process_id_t any_proc;
+\newline 
+    ptl_handle_me_t match_any;
+\newline 
+
+\newline 
+    /* create the event queue */
+\newline 
+    if( PtlEQAlloc(ni, DROP_SIZE, &drop_events) != PTL_OK ) {
+\newline 
+        fprintf( stderr, "Couldn't create the event queue
+\backslash 
+n" );
+\newline 
+        exit( 1 );
+\newline 
+    }
+\newline 
+
+\newline 
+    /* build a match entry */
+\newline 
+    any_proc.nid = PTL_ID_ANY;
+\newline 
+    any_proc.pid = PTL_ID_ANY;
+\newline 
+    PtlMEAttach( index, any_proc, 0, ~(ptl_match_bits_t)0, PTL_RETAIN,
+\newline 
+                        &match_any );
+\newline 
+
+\newline 
+    /* create the memory descriptor */
+\newline 
+    drop_desc.start = NULL;
+\newline 
+    drop_desc.length = 0;
+\newline 
+    drop_desc.threshold = PTL_MD_THRESH_INF;
+\newline 
+    drop_desc.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_TRUNCATE;
+\newline 
+    drop_desc.user_ptr = NULL;
+\newline 
+    drop_desc.eventq = drop_events;
+\newline 
+    if( PtlMDAttach(match_any, drop_desc, &drop_em) != PTL_OK ) {
+\newline 
+        fprintf( stderr, "Couldn't create the memory descriptor
+\backslash 
+n" );
+\newline 
+        exit( 1 );
+\newline 
+    }
+\newline 
+
+\newline 
+    /* watch for "dropped" requests */
+\newline 
+    while( 1 ) {
+\newline 
+        if( PtlEQWait( drop_events, &event ) != PTL_OK ) break;
+\newline 
+        fprintf( stderr, "Dropped request from gid = event.initiator.gid,
+ event.initiator.rid );
+\newline 
+    }
+\newline 
+}
+\layout Section
+
+Message Transmission in MPI
+\begin_inset LatexCommand \label{sec:exmpi}
+
+\end_inset 
+
+
+\layout Standard
+
+We conclude this section with a fairly extensive example that describes
+ an approach to implementing message transmission for MPI.
+ Like many MPI implementations, we distinguish two message transmission
+ protocols: a short message protocol and a long message protocol.
+ We use the constant 
+\family typewriter 
+MPI_LONG_LENGTH
+\family default 
+ to determine the size of a long message.
+\layout Standard
+
+For small messages, the sender simply sends the message and presumes that
+ the message will be received (i.e., the receiver has allocated a memory region
+ to receive the message body).
+ For large messages, the sender also sends the message, but does not presume
+ that the message body will be saved.
+ Instead, the sender builds a memory descriptor for the message and enables
+ get operations on this descriptor.
+ If the target does not save the body of the message, it will record an
+ event for the put operation.
+ When the process later issues a matching MPI receive, it will perform a
+ get operation to retrieve the body of the message.
+\layout Standard
+
+To facilitate receive side matching based on the protocol, we use the most
+ significant bit in the match bits to indicate the protocol: 1 for long
+ messages and 0 for short messages.
+\layout Standard
+
+The following code presents a function that implements the send side of
+ the protocol.
+ The global variable 
+\family typewriter 
+EndGet
+\family default 
+ is the last match entry attached to the Portal index used for posting long
+ messages.
+ This entry does not match any incoming requests (i.e., the memory descriptor
+ rejects all get operations) and is built during initialization of the MPI
+ library.
+ The other global variable, 
+\family typewriter 
+MPI_NI
+\family default 
+, is a handle for the network interface used by the MPI implementation.
+\layout LyX-Code
+
+
+\size small 
+extern ptl_handle_me_t EndGet;
+\newline 
+extern ptl_handle_ni_t MPI_NI;
+\newline 
+
+\newline 
+void MPIsend( void *buf, ptl_size_t len, void *data, ptl_handle_eq_t eventq,
+\newline 
+                    ptl_process_id_t target, ptl_match_bits_t match ) 
+\newline 
+{
+\newline 
+    ptl_handle_md_t send_handle;
+\newline 
+    ptl_md_t mem_desc;
+\newline 
+    ptl_ack_req_t want_ack;
+\newline 
+
+\newline 
+    mem_desc.start = buf;
+\newline 
+    mem_desc.length = len;
+\newline 
+    mem_desc.threshold = 1;
+\newline 
+    mem_desc.options = PTL_MD_OP_GET;
+\newline 
+    mem_desc.user_ptr = data;
+\newline 
+    mem_desc.eventq = eventq;
+\newline 
+
+\newline 
+    if( len >= MPI_LONG_LENGTH ) {
+\newline 
+        ptl_handle_me_t me_handle;
+\newline 
+
+\newline 
+        /* set the protocol bit (long message) before inserting the match
+ entry, so that the receiver's get will match */
+\newline 
+        match |= (ptl_match_bits_t)1 << 63;
+\newline 
+
+\newline 
+        /* add a match entry to the end of the get list */
+\newline 
+        PtlMEInsert( target, match, 0, PTL_UNLINK, PTL_INS_BEFORE, EndGet,
+ &me_handle );
+\newline 
+        PtlMDAttach( me_handle, mem_desc, PTL_UNLINK, NULL );
+\newline 
+
+\newline 
+        /* we want an ack for long messages */
+\newline 
+        want_ack = PTL_ACK_REQ;
+\newline 
+    } else {
+\newline 
+        /* we don't want an ack for short messages */
+\newline 
+        want_ack = PTL_NOACK_REQ;
+\newline 
+
+\newline 
+        /* set the protocol bit to indicate that this is a short message
+ */
+\newline 
+        match &= ~((ptl_match_bits_t)1 << 63);
+\newline 
+    }
+\newline 
+
+\newline 
+   /* create a memory descriptor and send it */
+\newline 
+   PtlMDBind( MPI_NI, mem_desc, &send_handle );
+\newline 
+   PtlPut( send_handle, want_ack, target, MPI_SEND_PINDEX, MPI_AINDEX, match,
+ 0 );
+\newline 
+}
+\layout Standard
+
+The 
+\emph on 
+MPIsend
+\emph default 
+ function returns as soon as the message has been scheduled for transmission.
+ The event queue argument, 
+\family typewriter 
+eventq
+\family default 
+, can be used to determine the disposition of the message.
+ Assuming that 
+\family typewriter 
+eventq
+\family default 
+ is not 
+\family typewriter 
+PTL_EQ_NONE
+\family default 
+, a 
+\family typewriter 
+PTL_EVENT_SENT
+\family default 
+ event will be recorded for each message as the message is transmitted.
+ For small messages, this is the only event that will be recorded in 
+\family typewriter 
+eventq
+\family default 
+.
+ In contrast, long messages include an explicit request for an acknowledgement.
+ If the 
+\family typewriter 
+target
+\family default 
+ process has posted a matching receive, the acknowledgement will be sent
+ as the message is received.
+ If a matching receive has not been posted, the message will be discarded
+ and no acknowledgement will be sent.
+ When the 
+\family typewriter 
+target
+\family default 
+ process later issues a matching receive, the receive will be translated
+ into a get operation and a 
+\family typewriter 
+PTL_EVENT_GET
+\family default 
+ event will be recorded in 
+\family typewriter 
+eventq
+\family default 
+.
+\layout Standard
+
+Figure\SpecialChar ~
+
+\begin_inset LatexCommand \ref{fig:mpi}
+
+\end_inset 
+
+ illustrates the organization of the match list used for receiving MPI messages.
+ The initial entries (not shown in this figure) would be used to match the
+ MPI receives that have been preposted by the application.
+ The preposted receives are followed by a match entry, 
+\emph on 
+RcvMark
+\emph default 
+, that marks the boundary between preposted receives and the memory descriptors
+ used for 
+\begin_inset Quotes eld
+\end_inset 
+
+unexpected
+\begin_inset Quotes erd
+\end_inset 
+
+ messages.
+ The 
+\emph on 
+RcvMark
+\emph default 
+ entry is followed by a small collection of match entries that match unexpected
+\begin_inset Quotes eld
+\end_inset 
+
+short
+\begin_inset Quotes erd
+\end_inset 
+
+ messages, i.e., messages that have a 0 in the most significant bit of their
+ match bits.
+ The memory descriptors associated with these match entries will append
+ the incoming message to the associated memory descriptor and record an
+ event in an event queue for unexpected messages.
+ The unexpected short message matching entries are followed by a match entry
+ that will match messages that were not matched by the preceding match entries,
+ i.e., the unexpected long messages.
+ The memory descriptor associated with this match entry truncates the message
+ body and records an event in the event queue for unexpected messages.
+ Note that all of the memory descriptors used for unexpected messages share
+ a common event queue.
+ This makes it possible to process the unexpected messages in the order
+ in which they arrived, regardless of which match entry accepted them.
+\layout Standard
+
+
+\begin_inset Float figure
+placement htbp
+wide false
+collapsed false
+
+\layout Standard
+\align center 
+
+\begin_inset Graphics FormatVersion 1
+       filename mpi.eps
+       display color
+       size_type 0
+       rotateOrigin center
+       lyxsize_type 1
+       lyxwidth 389pt
+       lyxheight 284pt
+\end_inset 
+
+
+\layout Caption
+
+Message Reception in MPI
+\begin_inset LatexCommand \label{fig:mpi}
+
+\end_inset 
+
+
+\end_inset 
+
+
+\layout Standard
+
+When the local MPI process posts an MPI receive, we must first search the
+ events in the unexpected message queue to see if a matching message has already
+ arrived.
+ If no matching message is found, a match entry for the receive is inserted
+ before the 
+\emph on 
+RcvMark
+\emph default 
+ entry--after the match entries for all of the previously posted receives
+ and before the match entries for the unexpected messages.
+ This ensures that preposted receives are matched in the order that they
+ were posted (a requirement of MPI).
+\layout Standard
+
+While this strategy respects the temporal semantics of MPI, it introduces
+ a race condition: a matching message might arrive after the events in the
+ unexpected message queue have been searched, but before the match entry
+ for the receive has been inserted in the match list.
+\layout Standard
+
+To avoid this race condition we start by setting the 
+\family typewriter 
+threshold
+\family default 
+ of the memory descriptor to 0, making the descriptor inactive.
+ We then insert the match entry into the match list and proceed to search
+ the events in the unexpected message queue.
+ A matching message that arrives as we are searching the unexpected message
+ queue will not be accepted by the memory descriptor and, if not matched
+ by an earlier match list element, will add an event to the unexpected message
+ queue.
+ After searching the events in the unexpected message queue, we update the
+ memory descriptor, setting the threshold to 1 to activate the memory descriptor.
+ This update is predicated on the condition that the unexpected message
+ queue is empty.
+ We repeat the process of searching the unexpected message queue until the
+ update succeeds.
+\layout Standard
+
+The following code fragment illustrates this approach.
+ Because events must be removed from the unexpected message queue to be
+ examined, this code fragment assumes the existence of a user managed event
+ list, 
+\family typewriter 
+Rcvd
+\family default 
+, for the events that have already been removed from the unexpected message
+ queue.
+ In an effort to keep the example focused on the basic protocol, we have
+ omitted the code that would be needed to manage the memory descriptors
+ used for unexpected short messages.
+ In particular, we simply leave messages in these descriptors until they
+ are received by the application.
+ In a robust implementation, we would introduce code to ensure that short
+ unexpected messages are removed from these memory descriptors so that they
+ can be re-used.
+\layout LyX-Code
+
+
+\size small 
+extern ptl_handle_eq_t UnexpQueue;
+\newline 
+extern ptl_handle_me_t RcvMark;
+\newline 
+extern ptl_handle_me_t ShortMatch;
+\newline 
+
+\newline 
+typedef struct event_list_tag {
+\newline 
+    ptl_event_t            event;
+\newline 
+    struct event_list_tag* next;
+\newline 
+} event_list;
+\newline 
+
+\newline 
+extern event_list Rcvd;
+\newline 
+
+\newline 
+void AppendRcvd( ptl_event_t event )
+\newline 
+{
+\newline 
+    /* append an event onto the Rcvd list */
+\newline 
+}
+\newline 
+
+\newline 
+int SearchRcvd( void *buf, ptl_size_t *len, ptl_process_id_t sender, ptl_match_bi
+ts_t match,
+\newline 
+                       ptl_match_bits_t ignore, ptl_event_t *event )
+\newline 
+{
+\newline 
+    /* Search the Rcvd event queue, looking for a message that matches the
+ requested message.
+\newline 
+     * If one is found, remove the event from the Rcvd list and return it.
+ */
+\newline 
+}
+\newline 
+
+\newline 
+typedef enum { RECEIVED, POSTED } receive_state;
+\newline 
+
+\newline 
+receive_state CopyMsg( void *buf, ptl_size_t *length, ptl_event_t event,
+ ptl_md_t md_buf )
+\newline 
+{
+\newline 
+    ptl_handle_md_t md_handle;
+\newline 
+
+\newline 
+    if( event.rlength >= MPI_LONG_LENGTH ) {
+\newline 
+        PtlMDBind( MPI_NI, md_buf, &md_handle );
+\newline 
+        PtlGet( event.initiator, MPI_GET_PINDEX, 0, event.match_bits, MPI_AINDEX,
+ md_handle );
+\newline 
+        return POSTED;
+\newline 
+    } else {
+\newline 
+        /* copy the message */
+\newline 
+        if( event.mlength < *length ) *length = event.mlength;
+\newline 
+        memcpy( buf, (char*)event.md_desc.start+event.offset, *length );
+\newline 
+        return RECEIVED;
+\newline 
+    }
+\newline 
+}
+\newline 
+
+\newline 
+receive_state MPIreceive( void *buf, ptl_size_t *len, void *MPI_data, ptl_handle
+_eq_t eventq, 
+\newline 
+                           ptl_process_id_t sender, ptl_match_bits_t match,
+ ptl_match_bits_t ignore )
+\newline 
+{
+\newline 
+    ptl_md_t md_buf;
+\newline 
+    ptl_handle_md_t md_handle;
+\newline 
+    ptl_handle_me_t me_handle;
+\newline 
+    ptl_event_t event;
+\newline 
+
+\newline 
+    /* build a memory descriptor for the receive */
+\newline 
+    md_buf.start = buf;
+\newline 
+    md_buf.length = *len;
+\newline 
+    md_buf.threshold = 0;     /* temporarily disabled */
+\newline 
+    md_buf.options = PTL_MD_OP_PUT;
+\newline 
+    md_buf.user_ptr = MPI_data;
+\newline 
+    md_buf.eventq = eventq;
+\newline 
+
+\newline 
+    /* see if we have already received the message */
+\newline 
+    if( SearchRcvd(buf, len, sender, match, ignore, &event) )
+\newline 
+         return CopyMsg( buf, len, event, md_buf );
+\newline 
+
+\newline 
+    /* create the match entry and attach the  memory descriptor */
+\newline 
+    PtlMEInsert(sender, match, ignore, PTL_UNLINK, PTL_INS_BEFORE, RcvMark,
+ &me_handle);
+\newline 
+    PtlMDAttach( me_handle, md_buf, PTL_UNLINK, &md_handle );
+\newline 
+
+\newline 
+    md_buf.threshold = 1;
+\newline 
+    do
+\newline 
+        if( PtlEQGet( UnexpQueue, &event ) != PTL_EQ_EMPTY ) {
+\newline 
+            if( MPIMatch(event, match, ignore, sender) ) {
+\newline 
+                return CopyMsg( buf, len, event, md_buf );
+\newline 
+            } else {
+\newline 
+                AppendRcvd( event );
+\newline 
+            }
+\newline 
+        }
+\newline 
+    while( PtlMDUpdate(md_handle, NULL, &md_buf, UnexpQueue) == PTL_NOUPDATE
+ );
+\newline 
+    return POSTED;
+\newline 
+}
+\layout Chapter*
+
+Acknowledgments
+\layout Standard
+
+Several people have contributed to the philosophy, design, and implementation
+ of the Portals message passing architecture as it has evolved.
+ We acknowledge the following people for their contributions: Al Audette,
+ Lee Ann Fisk, David Greenberg, Tramm Hudson, Gabi Istrail, Chu Jong, Mike
+ Levenhagen, Jim Otto, Mark Sears, Lance Shuler, Mack Stallcup, Jeff VanDyke,
+ Dave van Dresser, Lee Ward, and Stephen Wheat.
+\layout Standard
+
+
+\begin_inset LatexCommand \BibTeX[ieee]{portals3}
+
+\end_inset 
+
+
+\the_end
diff --git a/lustre/portals/doc/put.fig b/lustre/portals/doc/put.fig
new file mode 100644 (file)
index 0000000..5235b6d
--- /dev/null
@@ -0,0 +1,32 @@
+#FIG 3.2
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 1350 900 2175 1200
+4 0 0 100 0 0 10 0.0000 0 105 825 1350 1200 Transmission\001
+4 0 0 100 0 0 10 0.0000 0 105 285 1620 1050 Data\001
+-6
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        2700 1275 2700 1725
+2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2
+       0 0 1.00 60.00 120.00
+        900 525 2700 1200
+2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5
+        0 300 1200 300 1200 2250 0 2250 0 300
+2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5
+        2400 300 3600 300 3600 2250 2400 2250 2400 300
+2 1 1 1 0 7 100 0 -1 4.000 0 0 7 1 0 2
+       0 0 1.00 60.00 120.00
+        2699 1788 899 1938
+4 0 0 100 0 0 10 0.0000 0 105 720 2775 1650 Translation\001
+4 1 0 100 0 0 10 0.0000 0 135 555 1800 2025 Optional\001
+4 1 0 100 0 0 10 0.0000 0 135 1170 1800 2175 Acknowledgement\001
+4 0 0 100 0 0 10 0.0000 0 105 405 2850 1500 Portal\001
+4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001
+4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001
diff --git a/lustre/portals/include/.cvsignore b/lustre/portals/include/.cvsignore
new file mode 100644 (file)
index 0000000..d45f796
--- /dev/null
@@ -0,0 +1,4 @@
+config.h
+stamp-h
+stamp-h1
+stamp-h.in
diff --git a/lustre/portals/include/Makefile.am b/lustre/portals/include/Makefile.am
new file mode 100644 (file)
index 0000000..2cf7f99
--- /dev/null
@@ -0,0 +1,8 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+SUBDIRS = portals linux
+EXTRA_DIST = config.h.in
+include $(top_srcdir)/Rules
diff --git a/lustre/portals/include/config.h.in b/lustre/portals/include/config.h.in
new file mode 100644 (file)
index 0000000..b05d0c4
--- /dev/null
@@ -0,0 +1,11 @@
+/* ../include/config.h.in.  Generated automatically from configure.in by autoheader.  */
+
+/* Define if you have the readline library (-lreadline).  */
+#undef HAVE_LIBREADLINE
+
+/* Name of package */
+#undef PACKAGE
+
+/* Version number of package */
+#undef VERSION
+
diff --git a/lustre/portals/include/linux/Makefile.am b/lustre/portals/include/linux/Makefile.am
new file mode 100644 (file)
index 0000000..6a65cb5
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include $(top_srcdir)/Rules
+
+linuxincludedir = $(includedir)/linux
+
+linuxinclude_HEADERS=kp30.h portals_lib.h
diff --git a/lustre/portals/include/linux/kp30.h b/lustre/portals/include/linux/kp30.h
new file mode 100644 (file)
index 0000000..6d7f3f3
--- /dev/null
@@ -0,0 +1,943 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef _KP30_INCLUDED
+#define _KP30_INCLUDED
+
+
+#define PORTAL_DEBUG
+
+#ifndef offsetof
+# define offsetof(typ,memb)    ((int)((char *)&(((typ *)0)->memb)))
+#endif
+
+#define LOWEST_BIT_SET(x)      ((x) & ~((x) - 1))
+
+#ifndef CONFIG_SMP
+# define smp_processor_id() 0
+#endif
+
+/*
+ *  Debugging
+ */
+extern unsigned int portal_subsystem_debug;
+extern unsigned int portal_stack;
+extern unsigned int portal_debug;
+extern unsigned int portal_printk;
+/* Debugging subsystems  (8 bit ID)
+ *
+ * If you add debug subsystem #32, you need to send email to phil, because
+ * you're going to break kernel subsystem debug filtering. */
+#define S_UNDEFINED    (0 << 24)
+#define S_MDC          (1 << 24)
+#define S_MDS          (2 << 24)
+#define S_OSC          (3 << 24)
+#define S_OST          (4 << 24)
+#define S_CLASS        (5 << 24)
+#define S_OBDFS        (6 << 24) /* obsolete */
+#define S_LLITE        (7 << 24)
+#define S_RPC          (8 << 24)
+#define S_EXT2OBD      (9 << 24) /* obsolete */
+#define S_PORTALS     (10 << 24)
+#define S_SOCKNAL     (11 << 24)
+#define S_QSWNAL      (12 << 24)
+#define S_PINGER      (13 << 24)
+#define S_FILTER      (14 << 24)
+#define S_TRACE       (15 << 24) /* obsolete */
+#define S_ECHO        (16 << 24)
+#define S_LDLM        (17 << 24)
+#define S_LOV         (18 << 24)
+#define S_GMNAL       (19 << 24)
+#define S_PTLROUTER   (20 << 24)
+#define S_COBD        (21 << 24)
+#define S_PTLBD       (22 << 24)
+#define S_LOG         (23 << 24)
+
+/* If you change these values, please keep portals/linux/utils/debug.c
+ * up to date! */
+
+/* Debugging masks (24 bits, non-overlapping) */
+#define D_TRACE     (1 << 0) /* ENTRY/EXIT markers */
+#define D_INODE     (1 << 1)
+#define D_SUPER     (1 << 2)
+#define D_EXT2      (1 << 3) /* anything from ext2_debug */
+#define D_MALLOC    (1 << 4) /* print malloc, free information */
+#define D_CACHE     (1 << 5) /* cache-related items */
+#define D_INFO      (1 << 6) /* general information */
+#define D_IOCTL     (1 << 7) /* ioctl related information */
+#define D_BLOCKS    (1 << 8) /* ext2 block allocation */
+#define D_NET       (1 << 9) /* network communications */
+#define D_WARNING   (1 << 10)
+#define D_BUFFS     (1 << 11)
+#define D_OTHER     (1 << 12)
+#define D_DENTRY    (1 << 13)
+#define D_PORTALS   (1 << 14) /* ENTRY/EXIT markers */
+#define D_PAGE      (1 << 15) /* bulk page handling */
+#define D_DLMTRACE  (1 << 16)
+#define D_ERROR     (1 << 17) /* CERROR(...) == CDEBUG (D_ERROR, ...) */
+#define D_EMERG     (1 << 18) /* CEMERG(...) == CDEBUG (D_EMERG, ...) */
+#define D_HA        (1 << 19) /* recovery and failover */
+#define D_RPCTRACE  (1 << 20) /* for distributed debugging */
+#define D_VFSTRACE  (1 << 21)
+
+#ifndef __KERNEL__
+#define THREAD_SIZE 8192
+#endif
+#ifdef  __ia64__
+#define CDEBUG_STACK() (THREAD_SIZE -                                      \
+                        ((unsigned long)__builtin_dwarf_cfa() &            \
+                         (THREAD_SIZE - 1)))
+#else
+#define CDEBUG_STACK() (THREAD_SIZE -                                      \
+                        ((unsigned long)__builtin_frame_address(0) &       \
+                         (THREAD_SIZE - 1)))
+#endif
+
+#ifdef __KERNEL__
+#define CHECK_STACK(stack)                                                    \
+        do {                                                                  \
+                if ((stack) > 3*THREAD_SIZE/4 && (stack) > portal_stack) {    \
+                        portals_debug_msg(DEBUG_SUBSYSTEM, D_ERROR,           \
+                                          __FILE__, __FUNCTION__, __LINE__,   \
+                                          (stack),                            \
+                                          "maximum lustre stack %u\n",        \
+                                          portal_stack = (stack));            \
+                      /*panic("LBUG");*/                                      \
+                }                                                             \
+        } while (0)
+#else
+#define CHECK_STACK(stack) do { } while(0)
+#endif
+
+#if 1
+#define CDEBUG(mask, format, a...)                                            \
+do {                                                                          \
+        CHECK_STACK(CDEBUG_STACK());                                          \
+        if (!(mask) || ((mask) & (D_ERROR | D_EMERG)) ||                      \
+            (portal_debug & (mask) &&                                         \
+             portal_subsystem_debug & (1 << (DEBUG_SUBSYSTEM >> 24))))        \
+                portals_debug_msg(DEBUG_SUBSYSTEM, mask,                      \
+                                  __FILE__, __FUNCTION__, __LINE__,           \
+                                  CDEBUG_STACK(), format , ## a);             \
+} while (0)
+
+#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a)
+#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a)
+#define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a)
+
+#define GOTO(label, rc)                                                 \
+do {                                                                    \
+        long GOTO__ret = (long)(rc);                                    \
+        CDEBUG(D_TRACE,"Process leaving via %s (rc=%lu : %ld : %lx)\n", \
+               #label, (unsigned long)GOTO__ret, (signed long)GOTO__ret,\
+               (signed long)GOTO__ret);                                 \
+        goto label;                                                     \
+} while (0)
+
+#define RETURN(rc)                                                      \
+do {                                                                    \
+        typeof(rc) RETURN__ret = (rc);                                  \
+        CDEBUG(D_TRACE, "Process leaving (rc=%lu : %ld : %lx)\n",       \
+               (long)RETURN__ret, (long)RETURN__ret, (long)RETURN__ret);\
+        return RETURN__ret;                                             \
+} while (0)
+
+#define ENTRY                                                           \
+do {                                                                    \
+        CDEBUG(D_TRACE, "Process entered\n");                           \
+} while (0)
+
+#define EXIT                                                            \
+do {                                                                    \
+        CDEBUG(D_TRACE, "Process leaving\n");                           \
+} while(0)
+#else
+#define CDEBUG(mask, format, a...)      do { } while (0)
+#define CWARN(format, a...)             do { } while (0)
+#define CERROR(format, a...)            printk("<3>" format, ## a)
+#define CEMERG(format, a...)            printk("<0>" format, ## a)
+#define GOTO(label, rc)                 do { (void)(rc); goto label; } while (0)
+#define RETURN(rc)                      return (rc)
+#define ENTRY                           do { } while (0)
+#define EXIT                            do { } while (0)
+#endif
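+
+/* Illustrative usage sketch (not from the original sources): a source file
+ * normally defines DEBUG_SUBSYSTEM before including this header and then
+ * uses ENTRY/CDEBUG/RETURN.  The function below is hypothetical. */
+#if 0
+#define DEBUG_SUBSYSTEM S_SOCKNAL
+#include <linux/kp30.h>
+
+static int
+sketch_handle_packet (int nob)
+{
+        ENTRY;                          /* logs "Process entered" at D_TRACE */
+
+        if (nob < 0) {
+                CERROR ("illegal byte count %d\n", nob);
+                RETURN (-EINVAL);       /* logs the rc before returning it */
+        }
+
+        CDEBUG (D_NET, "handling %d bytes\n", nob);
+        RETURN (0);
+}
+#endif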
+
+
+#ifdef __KERNEL__
+# include <linux/vmalloc.h>
+# include <linux/time.h>
+# include <linux/slab.h>
+# include <linux/interrupt.h>
+# include <linux/highmem.h>
+# include <linux/module.h>
+# include <linux/version.h>
+# include <portals/lib-nal.h>
+# include <linux/smp_lock.h>
+# include <asm/atomic.h>
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#define schedule_work schedule_task
+#define prepare_work(wq,cb,cbdata)                                            \
+do {                                                                          \
+        INIT_TQUEUE((wq), 0, 0);                                              \
+        PREPARE_TQUEUE((wq), (cb), (cbdata));                                 \
+} while (0)
+
+#define ll_invalidate_inode_pages invalidate_inode_pages
+#define PageUptodate Page_Uptodate
+#define our_recalc_sigpending(current) recalc_sigpending(current)
+#define num_online_cpus() smp_num_cpus
+static inline void our_cond_resched(void)
+{
+        if (current->need_resched)
+               schedule ();
+}
+
+#else
+
+#define prepare_work(wq,cb,cbdata)                                            \
+do {                                                                          \
+        INIT_WORK((wq), (void *)(cb), (void *)(cbdata));                      \
+} while (0)
+#define ll_invalidate_inode_pages(inode) invalidate_inode_pages((inode)->i_mapping)
+#define wait_on_page wait_on_page_locked
+#define our_recalc_sigpending(current) recalc_sigpending()
+#define strtok(a,b) strpbrk(a, b)
+static inline void our_cond_resched(void)
+{
+        cond_resched();
+}
+#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) */
+
+#ifdef PORTAL_DEBUG
+extern void kportal_assertion_failed(char *expr,char *file,char *func,int line);
+#define LASSERT(e) ((e) ? 0 : kportal_assertion_failed( #e , __FILE__,  \
+                                                        __FUNCTION__, __LINE__))
+#else
+#define LASSERT(e)
+#endif
+
+#ifdef __arch_um__
+#define LBUG_WITH_LOC(file, func, line)                                 \
+do {                                                                    \
+        CEMERG("LBUG - trying to dump log to /tmp/lustre-log\n");       \
+        portals_debug_dumplog();                                        \
+        portals_run_lbug_upcall(file, func, line);                      \
+        panic("LBUG");                                                  \
+} while (0)
+#else
+#define LBUG_WITH_LOC(file, func, line)                                 \
+do {                                                                    \
+        CEMERG("LBUG\n");                                               \
+        portals_debug_dumplog();                                        \
+        portals_run_lbug_upcall(file, func, line);                      \
+        set_task_state(current, TASK_UNINTERRUPTIBLE);                  \
+        schedule();                                                     \
+} while (0)
+#endif /* __arch_um__ */
+
+#define LBUG() LBUG_WITH_LOC(__FILE__, __FUNCTION__, __LINE__)
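+
+/* Illustrative sketch (not from the original sources): LASSERT() checks an
+ * invariant and compiles away without PORTAL_DEBUG; LBUG() dumps the debug
+ * log and stops the offending thread.  The limit below is hypothetical. */
+#if 0
+static void
+sketch_check_frags (int niov, struct iovec *iov)
+{
+        LASSERT (iov != NULL);
+        if (niov <= 0 || niov > 16)     /* hypothetical sanity limit */
+                LBUG ();
+}
+#endif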
+
+/*
+ * Memory
+ */
+#ifdef PORTAL_DEBUG
+extern atomic_t portal_kmemory;
+
+# define portal_kmem_inc(ptr, size)                                           \
+do {                                                                          \
+        atomic_add(size, &portal_kmemory);                                    \
+} while (0)
+
+# define portal_kmem_dec(ptr, size) do {                                      \
+        atomic_sub(size, &portal_kmemory);                                    \
+} while (0)
+
+#else
+# define portal_kmem_inc(ptr, size) do {} while (0)
+# define portal_kmem_dec(ptr, size) do {} while (0)
+#endif /* PORTAL_DEBUG */
+
+#define PORTAL_VMALLOC_SIZE        16384
+
+#define PORTAL_ALLOC(ptr, size)                                           \
+do {                                                                      \
+        long s = size;                                                    \
+        LASSERT (!in_interrupt());                                        \
+        if (s > PORTAL_VMALLOC_SIZE)                                      \
+                (ptr) = vmalloc(s);                                       \
+        else                                                              \
+                (ptr) = kmalloc(s, GFP_NOFS);                             \
+        if ((ptr) == NULL)                                                \
+                CERROR("PORTALS: out of memory at %s:%d (tried to alloc"  \
+                       " '" #ptr "' = %ld)\n", __FILE__, __LINE__, s);    \
+        else {                                                            \
+                portal_kmem_inc((ptr), s);                                \
+                memset((ptr), 0, s);                                      \
+        }                                                                 \
+        CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %ld at %p (tot %d).\n",   \
+               s, (ptr), atomic_read (&portal_kmemory));                  \
+} while (0)
+
+#define PORTAL_FREE(ptr, size)                                          \
+do {                                                                    \
+        long s = (size);                                                \
+        if ((ptr) == NULL) {                                            \
+                CERROR("PORTALS: free NULL '" #ptr "' (%ld bytes) at "  \
+                       "%s:%d\n", s, __FILE__, __LINE__);               \
+                break;                                                  \
+        }                                                               \
+        if (s > PORTAL_VMALLOC_SIZE)                                    \
+                vfree(ptr);                                             \
+        else                                                            \
+                kfree(ptr);                                             \
+        portal_kmem_dec((ptr), s);                                      \
+        CDEBUG(D_MALLOC, "kfreed '" #ptr "': %ld at %p (tot %d).\n",    \
+               s, (ptr), atomic_read (&portal_kmemory));                \
+} while (0)
+
+#define PORTAL_SLAB_ALLOC(ptr, slab, size)                                \
+do {                                                                      \
+        long s = (size);                                                  \
+        LASSERT (!in_interrupt());                                        \
+        (ptr) = kmem_cache_alloc((slab), SLAB_KERNEL);                    \
+        if ((ptr) == NULL) {                                              \
+                CERROR("PORTALS: out of memory at %s:%d (tried to alloc"  \
+                       " '" #ptr "' from slab '" #slab "')\n", __FILE__,  \
+                       __LINE__);                                         \
+        } else {                                                          \
+                portal_kmem_inc((ptr), s);                                \
+                memset((ptr), 0, s);                                      \
+        }                                                                 \
+        CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %ld at %p (tot %d).\n",   \
+               s, (ptr), atomic_read (&portal_kmemory));                  \
+} while (0)
+
+#define PORTAL_SLAB_FREE(ptr, slab, size)                               \
+do {                                                                    \
+        long s = (size);                                                \
+        if ((ptr) == NULL) {                                            \
+                CERROR("PORTALS: free NULL '" #ptr "' (%ld bytes) at "  \
+                       "%s:%d\n", s, __FILE__, __LINE__);               \
+                break;                                                  \
+        }                                                               \
+        memset((ptr), 0x5a, s);                                         \
+        kmem_cache_free((slab), ptr);                                   \
+        portal_kmem_dec((ptr), s);                                      \
+        CDEBUG(D_MALLOC, "kfreed '" #ptr "': %ld at %p (tot %d).\n",    \
+               s, (ptr), atomic_read (&portal_kmemory));                \
+} while (0)
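+
+/* Illustrative sketch (not from the original sources): a typical
+ * PORTAL_ALLOC/PORTAL_FREE pairing.  The descriptor type is hypothetical;
+ * the macros choose kmalloc or vmalloc by size, zero the memory and keep
+ * the portal_kmemory counter up to date. */
+#if 0
+static int
+sketch_make_desc (struct sketch_desc **descp)
+{
+        struct sketch_desc *desc;
+
+        PORTAL_ALLOC (desc, sizeof (*desc));
+        if (desc == NULL)
+                return (-ENOMEM);
+
+        *descp = desc;
+        return (0);
+}
+
+static void
+sketch_free_desc (struct sketch_desc *desc)
+{
+        PORTAL_FREE (desc, sizeof (*desc));
+}
+#endif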
+
+/* ------------------------------------------------------------------- */
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+
+#define PORTAL_SYMBOL_REGISTER(x) inter_module_register(#x, THIS_MODULE, &x)
+#define PORTAL_SYMBOL_UNREGISTER(x) inter_module_unregister(#x)
+
+#define PORTAL_SYMBOL_GET(x) ((typeof(&x))inter_module_get(#x))
+#define PORTAL_SYMBOL_PUT(x) inter_module_put(#x)
+
+#define PORTAL_MODULE_USE       MOD_INC_USE_COUNT
+#define PORTAL_MODULE_UNUSE     MOD_DEC_USE_COUNT
+#else
+
+#define PORTAL_SYMBOL_REGISTER(x)
+#define PORTAL_SYMBOL_UNREGISTER(x)
+
+#define PORTAL_SYMBOL_GET(x) symbol_get(x)
+#define PORTAL_SYMBOL_PUT(x) symbol_put(x)
+
+#define PORTAL_MODULE_USE       try_module_get(THIS_MODULE)
+#define PORTAL_MODULE_UNUSE     module_put(THIS_MODULE)
+
+#endif
+
+/******************************************************************************/
+/* Kernel Portals Router interface */
+
+typedef void (*kpr_fwd_callback_t)(void *arg, int error); // completion callback
+
+/* space for routing targets to stash "stuff" in a forwarded packet */
+typedef union {
+        long long        _alignment;
+        void            *_space[16];            /* scale with CPU arch */
+} kprfd_scratch_t;
+
+/* Kernel Portals Routing Forwarded message Descriptor */
+typedef struct {
+        struct list_head     kprfd_list;        /* stash in queues (routing target can use) */
+        ptl_nid_t            kprfd_target_nid;  /* final destination NID */
+        ptl_nid_t            kprfd_gateway_nid; /* gateway NID */
+        int                  kprfd_nob;         /* # message bytes (including header) */
+        int                  kprfd_niov;        /* # message frags (including header) */
+        struct iovec        *kprfd_iov;         /* message fragments */
+        void                *kprfd_router_arg;  // originating NAL's router arg
+        kpr_fwd_callback_t   kprfd_callback;    /* completion callback */
+        void                *kprfd_callback_arg; /* completion callback arg */
+        kprfd_scratch_t      kprfd_scratch;    // scratchpad for routing targets
+} kpr_fwd_desc_t;
+
+typedef void  (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd);
+
+/* NAL's routing interface (Kernel Portals Routing Nal Interface) */
+typedef const struct {
+        int             kprni_nalid;    /* NAL's id */
+        void           *kprni_arg;      /* Arg to pass when calling into NAL */
+        kpr_fwd_t       kprni_fwd;      /* NAL's forwarding entrypoint */
+} kpr_nal_interface_t;
+
+/* Router's routing interface (Kernel Portals Routing Router Interface) */
+typedef const struct {
+        /* register the calling NAL with the router and get back the handle for
+         * subsequent calls */
+        int     (*kprri_register) (kpr_nal_interface_t *nal_interface,
+                                   void **router_arg);
+
+        /* ask the router to find a gateway that forwards to 'nid' and is a peer
+         * of the calling NAL */
+        int     (*kprri_lookup) (void *router_arg, ptl_nid_t nid,
+                                 ptl_nid_t *gateway_nid);
+
+        /* hand a packet over to the router for forwarding */
+        kpr_fwd_t kprri_fwd_start;
+
+        /* hand a packet back to the router for completion */
+        void    (*kprri_fwd_done) (void *router_arg, kpr_fwd_desc_t *fwd,
+                                   int error);
+
+        /* the calling NAL is shutting down */
+        void    (*kprri_shutdown) (void *router_arg);
+
+        /* deregister the calling NAL with the router */
+        void    (*kprri_deregister) (void *router_arg);
+
+} kpr_router_interface_t;
+
+/* Convenient struct for NAL to stash router interface/args */
+typedef struct {
+        kpr_router_interface_t  *kpr_interface;
+        void                    *kpr_arg;
+} kpr_router_t;
+
+/* Router's control interface (Kernel Portals Routing Control Interface) */
+typedef const struct {
+        int     (*kprci_add_route)(int gateway_nal, ptl_nid_t gateway_nid,
+                                   ptl_nid_t lo_nid, ptl_nid_t hi_nid);
+        int     (*kprci_del_route)(ptl_nid_t nid);
+        int     (*kprci_get_route)(int index, int *gateway_nal,
+                                   ptl_nid_t *gateway, ptl_nid_t *lo_nid,
+                                   ptl_nid_t *hi_nid);
+} kpr_control_interface_t;
+
+extern kpr_control_interface_t  kpr_control_interface;
+extern kpr_router_interface_t   kpr_router_interface;
+
+static inline int
+kpr_register (kpr_router_t *router, kpr_nal_interface_t *nalif)
+{
+        int    rc;
+
+        router->kpr_interface = PORTAL_SYMBOL_GET (kpr_router_interface);
+        if (router->kpr_interface == NULL)
+                return (-ENOENT);
+
+        rc = (router->kpr_interface)->kprri_register (nalif, &router->kpr_arg);
+        if (rc != 0)
+                router->kpr_interface = NULL;
+
+        PORTAL_SYMBOL_PUT (kpr_router_interface);
+        return (rc);
+}
+
+static inline int
+kpr_routing (kpr_router_t *router)
+{
+        return (router->kpr_interface != NULL);
+}
+
+static inline int
+kpr_lookup (kpr_router_t *router, ptl_nid_t nid, ptl_nid_t *gateway_nid)
+{
+        if (!kpr_routing (router))
+                return (-EHOSTUNREACH);
+
+        return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid,
+                                                    gateway_nid));
+}
+
+static inline void
+kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid,
+              int nob, int niov, struct iovec *iov,
+              kpr_fwd_callback_t callback, void *callback_arg)
+{
+        fwd->kprfd_target_nid   = nid;
+        fwd->kprfd_gateway_nid  = nid;
+        fwd->kprfd_nob          = nob;
+        fwd->kprfd_niov         = niov;
+        fwd->kprfd_iov          = iov;
+        fwd->kprfd_callback     = callback;
+        fwd->kprfd_callback_arg = callback_arg;
+}
+
+static inline void
+kpr_fwd_start (kpr_router_t *router, kpr_fwd_desc_t *fwd)
+{
+        if (!kpr_routing (router))
+                fwd->kprfd_callback (fwd->kprfd_callback_arg, -EHOSTUNREACH);
+        else
+                router->kpr_interface->kprri_fwd_start (router->kpr_arg, fwd);
+}
+
+static inline void
+kpr_fwd_done (kpr_router_t *router, kpr_fwd_desc_t *fwd, int error)
+{
+        LASSERT (kpr_routing (router));
+        router->kpr_interface->kprri_fwd_done (router->kpr_arg, fwd, error);
+}
+
+static inline void
+kpr_shutdown (kpr_router_t *router)
+{
+        if (kpr_routing (router))
+                router->kpr_interface->kprri_shutdown (router->kpr_arg);
+}
+
+static inline void
+kpr_deregister (kpr_router_t *router)
+{
+        if (!kpr_routing (router))
+                return;
+        router->kpr_interface->kprri_deregister (router->kpr_arg);
+        router->kpr_interface = NULL;
+}
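+
+/* Illustrative sketch (not from the original sources): how a NAL might use
+ * the routing helpers above.  The names are hypothetical; a real NAL embeds
+ * the forwarding descriptor in its receive descriptor so it survives until
+ * the completion callback runs. */
+#if 0
+static kpr_router_t sketch_router;
+
+static void
+sketch_fwd_done (void *arg, int error)
+{
+        /* forwarding finished (or failed); release the receive buffers here */
+}
+
+static int
+sketch_startup (kpr_nal_interface_t *nalif)
+{
+        /* returns -ENOENT if the router module is not loaded */
+        return kpr_register (&sketch_router, nalif);
+}
+
+static void
+sketch_forward (ptl_nid_t target_nid, int nob, int niov, struct iovec *iov)
+{
+        static kpr_fwd_desc_t fwd;      /* must outlive the call; see above */
+
+        kpr_fwd_init (&fwd, target_nid, nob, niov, iov,
+                      sketch_fwd_done, NULL);
+        kpr_fwd_start (&sketch_router, &fwd);   /* completes via the callback
+                                                 * with -EHOSTUNREACH if no
+                                                 * router is registered */
+}
+#endif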
+
+/******************************************************************************/
+
+#ifdef PORTALS_PROFILING
+#define prof_enum(FOO) PROF__##FOO
+enum {
+        prof_enum(our_recvmsg),
+        prof_enum(our_sendmsg),
+        prof_enum(socknal_recv),
+        prof_enum(lib_parse),
+        prof_enum(conn_list_walk),
+        prof_enum(memcpy),
+        prof_enum(lib_finalize),
+        prof_enum(pingcli_time),
+        prof_enum(gmnal_send),
+        prof_enum(gmnal_recv),
+        MAX_PROFS
+};
+
+struct prof_ent {
+        char *str;
+        /* hrmph.  wrap-tastic. */
+        u32       starts;
+        u32       finishes;
+        cycles_t  total_cycles;
+        cycles_t  start;
+        cycles_t  end;
+};
+
+extern struct prof_ent prof_ents[MAX_PROFS];
+
+#define PROF_START(FOO)                                         \
+        do {                                                    \
+                struct prof_ent *pe = &prof_ents[PROF__##FOO];  \
+                pe->starts++;                                   \
+                pe->start = get_cycles();                       \
+        } while (0)
+
+#define PROF_FINISH(FOO)                                        \
+        do {                                                    \
+                struct prof_ent *pe = &prof_ents[PROF__##FOO];  \
+                pe->finishes++;                                 \
+                pe->end = get_cycles();                         \
+                pe->total_cycles += (pe->end - pe->start);      \
+        } while (0)
+#else /* !PORTALS_PROFILING */
+#define PROF_START(FOO) do {} while(0)
+#define PROF_FINISH(FOO) do {} while(0)
+#endif /* PORTALS_PROFILING */
+
+/* debug.c */
+void portals_run_lbug_upcall(char * file, char *fn, int line);
+void portals_debug_dumplog(void);
+int portals_debug_init(unsigned long bufsize);
+int portals_debug_cleanup(void);
+int portals_debug_clear_buffer(void);
+int portals_debug_mark_buffer(char *text);
+int portals_debug_set_daemon(unsigned int cmd, unsigned int length,
+                char *file, unsigned int size);
+__s32 portals_debug_copy_to_user(char *buf, unsigned long len);
+#if (__GNUC__)
+/* Use the special GNU C __attribute__ hack to have the compiler check the
+ * printf style argument string against the actual argument count and
+ * types.
+ */
+#ifdef printf
+# warning printf has been defined as a macro...
+# undef printf
+#endif
+void portals_debug_msg (int subsys, int mask, char *file, char *fn, int line,
+                        unsigned long stack, const char *format, ...)
+        __attribute__ ((format (printf, 7, 8)));
+#else
+void portals_debug_msg (int subsys, int mask, char *file, char *fn,
+                        int line, unsigned long stack,
+                        const char *format, ...);
+#endif /* __GNUC__ */
+void portals_debug_set_level(unsigned int debug_level);
+
+# define fprintf(a, format, b...) CDEBUG(D_OTHER, format , ## b)
+# define printf(format, b...) CDEBUG(D_OTHER, format , ## b)
+# define time(a) CURRENT_TIME
+
+extern void kportal_daemonize (char *name);
+extern void kportal_blockallsigs (void);
+
+#else  /* !__KERNEL__ */
+# include <stdio.h>
+# include <stdlib.h>
+#ifndef __CYGWIN__
+# include <stdint.h>
+#endif
+# include <unistd.h>
+# include <time.h>
+# include <asm/types.h>
+# ifndef DEBUG_SUBSYSTEM
+#  define DEBUG_SUBSYSTEM S_UNDEFINED
+# endif
+# ifdef PORTAL_DEBUG
+#  undef NDEBUG
+#  include <assert.h>
+#  define LASSERT(e)     assert(e)
+# else
+#  define LASSERT(e)
+# endif
+# define printk(format, args...) printf (format, ## args)
+# define PORTAL_ALLOC(ptr, size) do { (ptr) = malloc(size); } while (0)
+# define PORTAL_FREE(a, b) do { free(a); } while (0)
+# define portals_debug_msg(subsys, mask, file, fn, line, stack, format, a...) \
+    printf ("%02x:%06x (@%lu %s:%s,l. %d %d %lu): " format,                    \
+            (subsys) >> 24, (mask), (long)time(0), file, fn, line,             \
+            getpid(), stack, ## a)
+#endif
+
+#ifndef CURRENT_TIME
+# define CURRENT_TIME time(0)
+#endif
+
+#include <linux/portals_lib.h>
+
+/*
+ * USER LEVEL STUFF BELOW
+ */
+
+#define PORTAL_IOCTL_VERSION 0x00010007
+#define PING_SYNC       0
+#define PING_ASYNC      1
+
+struct portal_ioctl_data {
+        __u32 ioc_len;
+        __u32 ioc_version;
+        __u64 ioc_nid;
+        __u64 ioc_nid2;
+        __u64 ioc_nid3;
+        __u32 ioc_count;
+        __u32 ioc_nal;
+        __u32 ioc_nal_cmd;
+        __u32 ioc_fd;
+        __u32 ioc_id;
+
+        __u32 ioc_flags;
+        __u32 ioc_size;
+
+        __u32 ioc_wait;
+        __u32 ioc_timeout;
+        __u32 ioc_misc;
+
+        __u32 ioc_inllen1;
+        char *ioc_inlbuf1;
+        __u32 ioc_inllen2;
+        char *ioc_inlbuf2;
+
+        __u32 ioc_plen1; /* buffers in userspace */
+        char *ioc_pbuf1;
+        __u32 ioc_plen2; /* buffers in userspace */
+        char *ioc_pbuf2;
+
+        char ioc_bulk[0];
+};
+
+struct portal_ioctl_hdr {
+        __u32 ioc_len;
+        __u32 ioc_version;
+};
+
+struct portals_debug_ioctl_data
+{
+        struct portal_ioctl_hdr hdr;
+        unsigned int subs;
+        unsigned int debug;
+};
+
+#define PORTAL_IOC_INIT(data)                           \
+do {                                                    \
+        memset(&data, 0, sizeof(data));                 \
+        data.ioc_version = PORTAL_IOCTL_VERSION;        \
+        data.ioc_len = sizeof(data);                    \
+} while (0)
+
+/* FIXME check conflict with lustre_lib.h */
+#define PTL_IOC_DEBUG_MASK             _IOWR('f', 250, long)
+
+static inline int portal_ioctl_packlen(struct portal_ioctl_data *data)
+{
+        int len = sizeof(*data);
+        len += size_round(data->ioc_inllen1);
+        len += size_round(data->ioc_inllen2);
+        return len;
+}
+
+static inline int portal_ioctl_is_invalid(struct portal_ioctl_data *data)
+{
+        if (data->ioc_len > (1<<30)) {
+                CERROR ("PORTALS ioctl: ioc_len larger than 1<<30\n");
+                return 1;
+        }
+        if (data->ioc_inllen1 > (1<<30)) {
+                CERROR ("PORTALS ioctl: ioc_inllen1 larger than 1<<30\n");
+                return 1;
+        }
+        if (data->ioc_inllen2 > (1<<30)) {
+                CERROR ("PORTALS ioctl: ioc_inllen2 larger than 1<<30\n");
+                return 1;
+        }
+        if (data->ioc_inlbuf1 && !data->ioc_inllen1) {
+                CERROR ("PORTALS ioctl: inlbuf1 pointer but 0 length\n");
+                return 1;
+        }
+        if (data->ioc_inlbuf2 && !data->ioc_inllen2) {
+                CERROR ("PORTALS ioctl: inlbuf2 pointer but 0 length\n");
+                return 1;
+        }
+        if (data->ioc_pbuf1 && !data->ioc_plen1) {
+                CERROR ("PORTALS ioctl: pbuf1 pointer but 0 length\n");
+                return 1;
+        }
+        if (data->ioc_pbuf2 && !data->ioc_plen2) {
+                CERROR ("PORTALS ioctl: pbuf2 pointer but 0 length\n");
+                return 1;
+        }
+        if (data->ioc_plen1 && !data->ioc_pbuf1) {
+                CERROR ("PORTALS ioctl: plen1 nonzero but no pbuf1 pointer\n");
+                return 1;
+        }
+        if (data->ioc_plen2 && !data->ioc_pbuf2) {
+                CERROR ("PORTALS ioctl: plen2 nonzero but no pbuf2 pointer\n");
+                return 1;
+        }
+        if (portal_ioctl_packlen(data) != data->ioc_len ) {
+                CERROR ("PORTALS ioctl: packlen != ioc_len\n");
+                return 1;
+        }
+        if (data->ioc_inllen1 &&
+            data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') {
+                CERROR ("PORTALS ioctl: inlbuf1 not 0 terminated\n");
+                return 1;
+        }
+        if (data->ioc_inllen2 &&
+            data->ioc_bulk[size_round(data->ioc_inllen1) +
+                           data->ioc_inllen2 - 1] != '\0') {
+                CERROR ("PORTALS ioctl: inlbuf2 not 0 terminated\n");
+                return 1;
+        }
+        return 0;
+}
+
+#ifndef __KERNEL__
+static inline int portal_ioctl_pack(struct portal_ioctl_data *data, char **pbuf,
+                                    int max)
+{
+        char *ptr;
+        struct portal_ioctl_data *overlay;
+        data->ioc_len = portal_ioctl_packlen(data);
+        data->ioc_version = PORTAL_IOCTL_VERSION;
+
+        if (*pbuf && portal_ioctl_packlen(data) > max)
+                return 1;
+        if (*pbuf == NULL) {
+                *pbuf = malloc(data->ioc_len);
+        }
+        if (!*pbuf)
+                return 1;
+        overlay = (struct portal_ioctl_data *)*pbuf;
+        memcpy(*pbuf, data, sizeof(*data));
+
+        ptr = overlay->ioc_bulk;
+        if (data->ioc_inlbuf1)
+                LOGL(data->ioc_inlbuf1, data->ioc_inllen1, ptr);
+        if (data->ioc_inlbuf2)
+                LOGL(data->ioc_inlbuf2, data->ioc_inllen2, ptr);
+        if (portal_ioctl_is_invalid(overlay))
+                return 1;
+
+        return 0;
+}
+#else
+#include <asm/uaccess.h>
+
+/* buffer MUST be at least the size of portal_ioctl_hdr */
+static inline int portal_ioctl_getdata(char *buf, char *end, void *arg)
+{
+        struct portal_ioctl_hdr *hdr;
+        struct portal_ioctl_data *data;
+        int err;
+        ENTRY;
+
+        hdr = (struct portal_ioctl_hdr *)buf;
+        data = (struct portal_ioctl_data *)buf;
+
+        err = copy_from_user(buf, (void *)arg, sizeof(*hdr));
+        if ( err ) {
+                EXIT;
+                return err;
+        }
+
+        if (hdr->ioc_version != PORTAL_IOCTL_VERSION) {
+                CERROR ("PORTALS: version mismatch kernel vs application\n");
+                return -EINVAL;
+        }
+
+        if (hdr->ioc_len + buf >= end) {
+                CERROR ("PORTALS: user buffer exceeds kernel buffer\n");
+                return -EINVAL;
+        }
+
+
+        if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) {
+                CERROR ("PORTALS: user buffer too small for ioctl\n");
+                return -EINVAL;
+        }
+
+        err = copy_from_user(buf, (void *)arg, hdr->ioc_len);
+        if ( err ) {
+                EXIT;
+                return err;
+        }
+
+        if (portal_ioctl_is_invalid(data)) {
+                CERROR ("PORTALS: ioctl not correctly formatted\n");
+                return -EINVAL;
+        }
+
+        if (data->ioc_inllen1) {
+                data->ioc_inlbuf1 = &data->ioc_bulk[0];
+        }
+
+        if (data->ioc_inllen2) {
+                data->ioc_inlbuf2 = &data->ioc_bulk[0] +
+                        size_round(data->ioc_inllen1);
+        }
+
+        EXIT;
+        return 0;
+}
+#endif
+
+/* ioctls for manipulating snapshots 30- */
+#define IOC_PORTAL_TYPE                   'e'
+#define IOC_PORTAL_MIN_NR                 30
+
+#define IOC_PORTAL_PING                    _IOWR('e', 30, long)
+#define IOC_PORTAL_GET_DEBUG               _IOWR('e', 31, long)
+#define IOC_PORTAL_CLEAR_DEBUG             _IOWR('e', 32, long)
+#define IOC_PORTAL_MARK_DEBUG              _IOWR('e', 33, long)
+#define IOC_PORTAL_PANIC                   _IOWR('e', 34, long)
+#define IOC_PORTAL_ADD_ROUTE               _IOWR('e', 35, long)
+#define IOC_PORTAL_DEL_ROUTE               _IOWR('e', 36, long)
+#define IOC_PORTAL_GET_ROUTE               _IOWR('e', 37, long)
+#define IOC_PORTAL_NAL_CMD                _IOWR('e', 38, long)
+#define IOC_PORTAL_GET_NID                 _IOWR('e', 39, long)
+#define IOC_PORTAL_FAIL_NID                _IOWR('e', 40, long)
+#define IOC_PORTAL_SET_DAEMON              _IOWR('e', 41, long)
+
+#define IOC_PORTAL_MAX_NR               41
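+
+/* Illustrative sketch (not from the original sources): how a userspace tool
+ * might issue one of these ioctls.  The "/dev/portals" path, the choice of
+ * IOC_PORTAL_GET_NID and the assumption that the handler writes the NID back
+ * into the caller's buffer are all hypothetical; headers such as <fcntl.h>
+ * and <sys/ioctl.h> are needed. */
+#if 0
+static int
+sketch_get_nid (int nal, __u64 *nid)
+{
+        struct portal_ioctl_data data;
+        char *buf = NULL;
+        int fd, rc;
+
+        PORTAL_IOC_INIT (data);
+        data.ioc_nal = nal;
+
+        /* adds the length/version header that portal_ioctl_getdata() checks */
+        if (portal_ioctl_pack (&data, &buf, 0) != 0)
+                return (-ENOMEM);
+
+        fd = open ("/dev/portals", O_RDWR);     /* assumed device node */
+        if (fd < 0) {
+                free (buf);
+                return (-errno);
+        }
+
+        rc = ioctl (fd, IOC_PORTAL_GET_NID, buf);
+        if (rc == 0)
+                *nid = ((struct portal_ioctl_data *)buf)->ioc_nid;
+
+        close (fd);
+        free (buf);
+        return (rc);
+}
+#endif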
+
+enum {
+        QSWNAL  =  1,
+        SOCKNAL,
+        GMNAL,
+        TOENAL,
+        TCPNAL,
+        SCIMACNAL,
+        NAL_ENUM_END_MARKER
+};
+
+#ifdef __KERNEL__
+extern ptl_handle_ni_t  kqswnal_ni;
+extern ptl_handle_ni_t  ksocknal_ni;
+extern ptl_handle_ni_t  ktoenal_ni;
+extern ptl_handle_ni_t  kgmnal_ni;
+extern ptl_handle_ni_t  kscimacnal_ni;
+#endif
+
+#define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1)
+
+#define NAL_CMD_REGISTER_PEER_FD     100
+#define NAL_CMD_CLOSE_CONNECTION     101
+#define NAL_CMD_REGISTER_MYNID       102
+#define NAL_CMD_PUSH_CONNECTION      103
+
+enum {
+        DEBUG_DAEMON_START       =  1,
+        DEBUG_DAEMON_STOP        =  2,
+        DEBUG_DAEMON_PAUSE       =  3,
+        DEBUG_DAEMON_CONTINUE    =  4,
+};
+
+/* XXX remove to lustre ASAP */
+struct lustre_peer {
+        ptl_nid_t       peer_nid;
+        ptl_handle_ni_t peer_ni;
+};
+
+/* module.c */
+typedef int (*nal_cmd_handler_t)(struct portal_ioctl_data *, void * private);
+int kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private);
+int kportal_nal_unregister(int nal);
+
+ptl_handle_ni_t *kportal_get_ni (int nal);
+void kportal_put_ni (int nal);
+
+#ifdef __CYGWIN__
+#ifndef BITS_PER_LONG
+#if (~0UL) == 0xffffffffUL
+#define BITS_PER_LONG 32
+#else
+#define BITS_PER_LONG 64
+#endif
+#endif
+#endif
+
+#if (BITS_PER_LONG == 32 || __WORDSIZE == 32)
+# define LPU64 "%Lu"
+# define LPD64 "%Ld"
+# define LPX64 "%#Lx"
+# define LPSZ  "%u"
+# define LPSSZ "%d"
+#endif
+#if (BITS_PER_LONG == 64 || __WORDSIZE == 64)
+# define LPU64 "%lu"
+# define LPD64 "%ld"
+# define LPX64 "%#lx"
+# define LPSZ  "%lu"
+# define LPSSZ "%ld"
+#endif
+#ifndef LPU64
+# error "No word size defined"
+#endif
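+
+/* Illustrative sketch (not from the original sources): the LP*64 format
+ * strings let 64-bit quantities such as NIDs print the same way on 32- and
+ * 64-bit builds. */
+#if 0
+static void
+sketch_log_peer (ptl_nid_t nid, __u64 nob)
+{
+        CDEBUG (D_NET, "peer "LPX64" has "LPU64" bytes queued\n", nid, nob);
+}
+#endif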
+
+#endif
diff --git a/lustre/portals/include/linux/portals_compat25.h b/lustre/portals/include/linux/portals_compat25.h
new file mode 100644 (file)
index 0000000..e28fbac
--- /dev/null
@@ -0,0 +1,13 @@
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) || defined(CONFIG_RH_2_4_20)
+# define SIGNAL_MASK_LOCK(task, flags)                              \
+  spin_lock_irqsave(&task->sighand->siglock, flags)
+# define SIGNAL_MASK_UNLOCK(task, flags)                            \
+  spin_unlock_irqrestore(&task->sighand->siglock, flags)
+# define RECALC_SIGPENDING         recalc_sigpending()
+#else
+# define SIGNAL_MASK_LOCK(task, flags)                              \
+  spin_lock_irqsave(&task->sigmask_lock, flags)
+# define SIGNAL_MASK_UNLOCK(task, flags)                            \
+  spin_unlock_irqrestore(&task->sigmask_lock, flags)
+# define RECALC_SIGPENDING         recalc_sigpending(current)
+#endif
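+
+/* Illustrative sketch (not from the original sources): these wrappers let the
+ * same code block signals on both 2.4 and 2.5 kernels, e.g. when setting up a
+ * kernel thread.  The function name is hypothetical. */
+#if 0
+static void
+sketch_blockallsigs (void)
+{
+        unsigned long flags;
+
+        SIGNAL_MASK_LOCK (current, flags);
+        sigfillset (&current->blocked);
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK (current, flags);
+}
+#endif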
diff --git a/lustre/portals/include/linux/portals_lib.h b/lustre/portals/include/linux/portals_lib.h
new file mode 100644 (file)
index 0000000..a528a80
--- /dev/null
@@ -0,0 +1,188 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines. 
+ *
+ */
+
+#ifndef _PORTALS_LIB_H
+#define _PORTALS_LIB_H
+
+#ifndef __KERNEL__
+# include <string.h>
+#else 
+# include <asm/types.h>
+#endif
+
+#undef MIN
+#define MIN(a,b) (((a)<(b)) ? (a): (b))
+#undef MAX
+#define MAX(a,b) (((a)>(b)) ? (a): (b))
+#define MKSTR(ptr) ((ptr) ? (ptr) : "")
+
+static inline int size_round (int val)
+{
+        return (val + 7) & (~0x7);
+}
+
+static inline int size_round0(int val)
+{
+        if (!val)
+                return 0;
+        return (val + 1 + 7) & (~0x7);
+}
+
+static inline size_t round_strlen(char *fset)
+{
+        return size_round(strlen(fset) + 1);
+}
+
+#ifdef __KERNEL__
+static inline char *strdup(const char *str)
+{
+        int len = strlen(str) + 1;
+        char *tmp = kmalloc(len, GFP_KERNEL);
+        if (tmp)
+                memcpy(tmp, str, len);
+
+        return tmp;
+}
+#endif
+
+#ifdef __KERNEL__
+# define NTOH__u32(var) le32_to_cpu(var)
+# define NTOH__u64(var) le64_to_cpu(var)
+# define HTON__u32(var) cpu_to_le32(var)
+# define HTON__u64(var) cpu_to_le64(var)
+#else
+# define expansion_u64(var) \
+    ({  __u64 ret; \
+       switch (sizeof(var)) {   \
+       case 8: (ret) = (var); break; \
+       case 4: (ret) = (__u32)(var); break; \
+       case 2: (ret) = (__u16)(var); break; \
+       case 1: (ret) = (__u8)(var); break; \
+       };       \
+       (ret);     \
+    })
+# define NTOH__u32(var) (var)
+# define NTOH__u64(var) (expansion_u64(var))
+# define HTON__u32(var) (var)
+# define HTON__u64(var) (expansion_u64(var))
+#endif
+
+/* 
+ * copy sizeof(type) bytes from pointer to var and move ptr forward.
+ * return -EFAULT from the enclosing function if the pointer goes beyond end
+ */
+#define UNLOGV(var,type,ptr,end)                \
+do {                                            \
+        var = *(type *)ptr;                     \
+        ptr += sizeof(type);                    \
+        if (ptr > end )                         \
+                return -EFAULT;                 \
+} while (0)
+
+/* the following two macros convert to little endian */
+/* type MUST be __u32 or __u64 */
+#define LUNLOGV(var,type,ptr,end)               \
+do {                                            \
+        var = NTOH##type(*(type *)ptr);         \
+        ptr += sizeof(type);                    \
+        if (ptr > end )                         \
+                return -EFAULT;                 \
+} while (0)
+
+/* now log values */
+#define LOGV(var,type,ptr)                      \
+do {                                            \
+        *((type *)ptr) = var;                   \
+        ptr += sizeof(type);                    \
+} while (0)
+
+/* and in network order */
+#define LLOGV(var,type,ptr)                     \
+do {                                            \
+        *((type *)ptr) = HTON##type(var);       \
+        ptr += sizeof(type);                    \
+} while (0)
+
+
+/* 
+ * set var to point at (type *)ptr, move ptr forward with sizeof(type)
+ * return from function with EFAULT if ptr goes beyond end
+ */
+#define UNLOGP(var,type,ptr,end)                \
+do {                                            \
+        var = (type *)ptr;                      \
+        ptr += sizeof(type);                    \
+        if (ptr > end )                         \
+                return -EFAULT;                 \
+} while (0)
+
+#define LOGP(var,type,ptr)                      \
+do {                                            \
+        memcpy(ptr, var, sizeof(type));         \
+        ptr += sizeof(type);                    \
+} while (0)
+
+/* 
+ * set var to point at (char *)ptr, move ptr forward by size_round(len);
+ * return from function with EFAULT if ptr goes beyond end
+ */
+#define UNLOGL(var,type,len,ptr,end)            \
+do {                                            \
+        var = (type *)ptr;                      \
+        ptr += size_round(len * sizeof(type));  \
+        if (ptr > end )                         \
+                return -EFAULT;                 \
+} while (0)
+
+#define UNLOGL0(var,type,len,ptr,end)                                   \
+do {                                                                    \
+        UNLOGL(var,type,len,ptr,end);                                   \
+        if ( *((char *)ptr - size_round(len) + len - 1) != '\0')        \
+                return -EFAULT;                                         \
+} while (0)
+
+#define LOGL(var,len,ptr)                                       \
+do {                                                            \
+        if (var)                                                \
+                memcpy((char *)ptr, (const char *)var, len);    \
+        ptr += size_round(len);                                 \
+} while (0)
+
+#define LOGU(var,len,ptr)                                       \
+do {                                                            \
+        if (var)                                                \
+                memcpy((char *)var, (const char *)ptr, len);    \
+        ptr += size_round(len);                                 \
+} while (0)
+
+#define LOGL0(var,len,ptr)                              \
+do {                                                    \
+        if (!len)                                       \
+                break;                                  \
+        memcpy((char *)ptr, (const char *)var, len);    \
+        *((char *)(ptr) + len) = 0;                     \
+        ptr += size_round(len + 1);                     \
+} while (0)
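+
+/* Illustrative sketch (not from the original sources): packing a
+ * length-prefixed string with the LOG* macros and reading it back with the
+ * UNLOG* ones.  The UNLOG* macros return -EFAULT from the enclosing function
+ * on overrun, so they may only be used in functions returning int. */
+#if 0
+static int
+sketch_pack_name (char *ptr, const char *name)
+{
+        __u32 len = strlen (name) + 1;
+
+        LOGV (len, __u32, ptr);         /* length first */
+        LOGL (name, len, ptr);          /* payload, rounded up to 8 bytes */
+        return (0);
+}
+
+static int
+sketch_unpack_name (char *ptr, char *end, char **name)
+{
+        __u32 len;
+
+        UNLOGV (len, __u32, ptr, end);          /* -EFAULT past 'end' */
+        UNLOGL0 (*name, char, len, ptr, end);   /* also checks NUL termination */
+        return (0);
+}
+#endif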
+
+#endif /* _PORTALS_LIB_H */
diff --git a/lustre/portals/include/portals/Makefile.am b/lustre/portals/include/portals/Makefile.am
new file mode 100644 (file)
index 0000000..c61b084
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+SUBDIRS = base
+include $(top_srcdir)/Rules
+
+pkginclude_HEADERS=api-support.h api.h arg-blocks.h defines.h errno.h internal.h lib-dispatch.h lib-nal.h lib-p30.h lib-types.h myrnal.h nal.h p30.h ppid.h ptlctl.h stringtab.h types.h nalids.h list.h bridge.h ipmap.h procbridge.h lltrace.h
+
diff --git a/lustre/portals/include/portals/api-support.h b/lustre/portals/include/portals/api-support.h
new file mode 100644 (file)
index 0000000..af4a2dc
--- /dev/null
@@ -0,0 +1,27 @@
+# define DEBUG_SUBSYSTEM S_PORTALS
+# define PORTAL_DEBUG
+
+#ifndef __KERNEL__
+# include <stdio.h>
+# include <stdlib.h>
+# include <unistd.h>
+# include <time.h>
+
+/* Lots of POSIX dependencies to support PtlEQWait_timeout */
+# include <signal.h>
+# include <setjmp.h>
+# include <time.h>
+#endif
+
+#include <portals/types.h>
+#include <linux/kp30.h>
+#include <portals/p30.h>
+
+#include <portals/internal.h>
+#include <portals/nal.h>
+#include <portals/arg-blocks.h>
+
+/* Hack for 2.4.18 macro name collision */
+#ifdef yield
+#undef yield
+#endif
diff --git a/lustre/portals/include/portals/api.h b/lustre/portals/include/portals/api.h
new file mode 100644 (file)
index 0000000..a83749b
--- /dev/null
@@ -0,0 +1,159 @@
+#ifndef P30_API_H
+#define P30_API_H
+
+#include <portals/types.h>
+
+#ifndef PTL_NO_WRAP
+int PtlInit(void);
+int PtlInitialized(void);
+void PtlFini(void);
+
+int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size_in,
+              ptl_ac_index_t acl_size_in, ptl_pid_t requested_pid,
+              ptl_handle_ni_t * interface_out);
+
+int PtlNIInitialized(ptl_interface_t);
+
+int PtlNIFini(ptl_handle_ni_t interface_in);
+
+#endif
+
+int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id);
+
+
+/*
+ * Network interfaces
+ */
+
+#ifndef PTL_NO_WRAP
+int PtlNIBarrier(ptl_handle_ni_t interface_in);
+#endif
+
+int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in,
+                ptl_sr_value_t * status_out);
+
+int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in,
+              unsigned long *distance_out);
+
+#ifndef PTL_NO_WRAP
+int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * interface_out);
+#endif
+
+
+/*
+ * PtlNIDebug: 
+ *
+ * This is not an official Portals 3 API call.  It is provided
+ * by the reference implementation to allow the maintainers an
+ * easy way to turn on and off debugging information in the
+ * library.  Do not use it in code that is intended for use with
+ * any version other than the portable reference library.
+ */
+unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in);
+
+/* 
+ * PtlNIFailNid
+ *
+ * Not an official Portals 3 API call.  It provides a way of simulating
+ * communications failures to all (nid == PTL_NID_ANY), or specific peers
+ * (via multiple calls), either until further notice (threshold == -1), or
+ * for a specific number of messages.  Passing a threshold of zero "heals"
+ * the given peer.
+ */
+int PtlFailNid (ptl_handle_ni_t ni, ptl_nid_t nid, unsigned int threshold);
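+
+/* Illustrative sketch (not from the original sources): using the two
+ * maintenance calls above in a test harness.  'ni' and 'peer' are assumed to
+ * come from the surrounding test code. */
+#if 0
+static void
+sketch_flaky_peer (ptl_handle_ni_t ni, ptl_nid_t peer)
+{
+        (void) PtlNIDebug (ni, ~0U);            /* enable all library debug output */
+
+        (void) PtlFailNid (ni, peer, 5);        /* drop the next 5 messages to/from peer */
+        /* ... exercise the recovery paths ... */
+        (void) PtlFailNid (ni, peer, 0);        /* a threshold of 0 heals the peer */
+}
+#endif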
+
+
+/*
+ * Match entries
+ */
+
+int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in,
+                ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in,
+                ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in,
+                ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out);
+
+int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in,
+                ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in,
+                ptl_unlink_t unlink_in, ptl_ins_pos_t position_in,
+                ptl_handle_me_t * handle_out);
+
+int PtlMEUnlink(ptl_handle_me_t current_in);
+
+int PtlMEUnlinkList(ptl_handle_me_t current_in);
+
+int PtlTblDump(ptl_handle_ni_t ni, int index_in);
+int PtlMEDump(ptl_handle_me_t current_in);
+
+
+
+/*
+ * Memory descriptors
+ */
+
+#ifndef PTL_NO_WRAP
+int PtlMDAttach(ptl_handle_me_t current_in, ptl_md_t md_in,
+                ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out);
+
+int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in,
+              ptl_handle_md_t * handle_out);
+
+int PtlMDUnlink(ptl_handle_md_t md_in);
+
+int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t * old_inout,
+                ptl_md_t * new_inout, ptl_handle_eq_t testq_in);
+
+#endif
+
+/* These should not be called by users */
+int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout,
+                         ptl_md_t * new_inout, ptl_handle_eq_t testq_in,
+                         ptl_seq_t sequence_in);
+
+
+
+
+/*
+ * Event queues
+ */
+#ifndef PTL_NO_WRAP
+
+/* These should be called by users */
+int PtlEQAlloc(ptl_handle_ni_t ni_in, ptl_size_t count_in,
+               int (*callback) (ptl_event_t * event),
+               ptl_handle_eq_t * handle_out);
+int PtlEQFree(ptl_handle_eq_t eventq_in);
+
+int PtlEQCount(ptl_handle_eq_t eventq_in, ptl_size_t * count_out);
+
+int PtlEQGet(ptl_handle_eq_t eventq_in, ptl_event_t * event_out);
+
+
+int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t * event_out);
+
+int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
+                      int timeout);
+#endif
+
+/*
+ * Access Control Table
+ */
+int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in,
+               ptl_process_id_t match_id_in, ptl_pt_index_t portal_in);
+
+
+/*
+ * Data movement
+ */
+
+int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in,
+           ptl_process_id_t target_in, ptl_pt_index_t portal_in,
+           ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in,
+           ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in);
+
+int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in,
+           ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in,
+           ptl_match_bits_t match_bits_in, ptl_size_t offset_in);
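+
+/* Illustrative sketch (not from the original sources): binding a memory
+ * descriptor and pushing it to a remote portal with PtlPut().  The ptl_md_t
+ * field names follow the receive example in doc/portals3.lyx; PTL_NOACK_REQ
+ * is assumed to come from <portals/types.h>, and the portal index, cookie
+ * and match bits are arbitrary. */
+#if 0
+static int
+sketch_send (ptl_handle_ni_t ni, ptl_process_id_t target,
+             void *buf, ptl_size_t len, ptl_handle_eq_t eq)
+{
+        ptl_md_t        md;
+        ptl_handle_md_t md_h;
+        int             rc;
+
+        md.start     = buf;
+        md.length    = len;
+        md.threshold = 1;               /* allow a single operation on this MD */
+        md.options   = 0;
+        md.user_ptr  = NULL;
+        md.eventq    = eq;
+
+        rc = PtlMDBind (ni, md, &md_h);
+        if (rc != PTL_OK)
+                return (rc);
+
+        return PtlPut (md_h, PTL_NOACK_REQ, target, 4 /* portal */,
+                       0 /* ac cookie */, 0x11 /* match bits */,
+                       0 /* offset */, 0 /* hdr data */);
+}
+#endif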
+
+
+
+#endif
diff --git a/lustre/portals/include/portals/arg-blocks.h b/lustre/portals/include/portals/arg-blocks.h
new file mode 100644 (file)
index 0000000..3c3b154
--- /dev/null
@@ -0,0 +1,265 @@
+#ifndef PTL_BLOCKS_H
+#define PTL_BLOCKS_H
+
+/*
+ * blocks.h
+ *
+ * Argument block types for the Portals 3.0 library
+ * Generated by idl
+ *
+ */
+
+#include <portals/types.h>
+
+/* put LIB_MAX_DISPATCH last here  -- these must match the
+   assignments to the dispatch table in lib-p30/dispatch.c */
+#define PTL_GETID     1
+#define PTL_NISTATUS  2
+#define PTL_NIDIST    3
+#define PTL_NIDEBUG   4
+#define PTL_MEATTACH  5
+#define PTL_MEINSERT  6
+// #define PTL_MEPREPEND 7
+#define PTL_MEUNLINK  8
+#define PTL_TBLDUMP   9 
+#define PTL_MEDUMP   10
+#define PTL_MDATTACH 11
+// #define PTL_MDINSERT 12
+#define PTL_MDBIND   13
+#define PTL_MDUPDATE 14
+#define PTL_MDUNLINK 15
+#define PTL_EQALLOC  16
+#define PTL_EQFREE   17
+#define PTL_ACENTRY  18
+#define PTL_PUT      19 
+#define PTL_GET      20
+#define PTL_FAILNID  21
+#define LIB_MAX_DISPATCH 21
+
+typedef struct PtlFailNid_in {
+       ptl_handle_ni_t interface;
+       ptl_nid_t       nid;
+       unsigned int    threshold;
+} PtlFailNid_in;
+
+typedef struct PtlFailNid_out {
+       int             rc;
+} PtlFailNid_out;
+
+typedef struct PtlGetId_in {
+        ptl_handle_ni_t handle_in;
+} PtlGetId_in;
+
+typedef struct PtlGetId_out {
+        int rc;
+        ptl_process_id_t id_out;
+} PtlGetId_out;
+
+typedef struct PtlNIStatus_in {
+        ptl_handle_ni_t interface_in;
+        ptl_sr_index_t register_in;
+} PtlNIStatus_in;
+
+typedef struct PtlNIStatus_out {
+        int rc;
+        ptl_sr_value_t status_out;
+} PtlNIStatus_out;
+
+
+typedef struct PtlNIDist_in {
+        ptl_handle_ni_t interface_in;
+        ptl_process_id_t process_in;
+} PtlNIDist_in;
+
+typedef struct PtlNIDist_out {
+        int rc;
+        unsigned long distance_out;
+} PtlNIDist_out;
+
+
+typedef struct PtlNIDebug_in {
+        unsigned int mask_in;
+} PtlNIDebug_in;
+
+typedef struct PtlNIDebug_out {
+        unsigned int rc;
+} PtlNIDebug_out;
+
+
+typedef struct PtlMEAttach_in {
+        ptl_handle_ni_t interface_in;
+        ptl_pt_index_t index_in;
+        ptl_ins_pos_t position_in;
+        ptl_process_id_t match_id_in;
+        ptl_match_bits_t match_bits_in;
+        ptl_match_bits_t ignore_bits_in;
+        ptl_unlink_t unlink_in;
+} PtlMEAttach_in;
+
+typedef struct PtlMEAttach_out {
+        int rc;
+        ptl_handle_me_t handle_out;
+} PtlMEAttach_out;
+
+
+typedef struct PtlMEInsert_in {
+        ptl_handle_me_t current_in;
+        ptl_process_id_t match_id_in;
+        ptl_match_bits_t match_bits_in;
+        ptl_match_bits_t ignore_bits_in;
+        ptl_unlink_t unlink_in;
+        ptl_ins_pos_t position_in;
+} PtlMEInsert_in;
+
+typedef struct PtlMEInsert_out {
+        int rc;
+        ptl_handle_me_t handle_out;
+} PtlMEInsert_out;
+
+typedef struct PtlMEUnlink_in {
+        ptl_handle_me_t current_in;
+        ptl_unlink_t unlink_in;
+} PtlMEUnlink_in;
+
+typedef struct PtlMEUnlink_out {
+        int rc;
+} PtlMEUnlink_out;
+
+
+typedef struct PtlTblDump_in {
+        int index_in;
+} PtlTblDump_in;
+
+typedef struct PtlTblDump_out {
+        int rc;
+} PtlTblDump_out;
+
+
+typedef struct PtlMEDump_in {
+        ptl_handle_me_t current_in;
+} PtlMEDump_in;
+
+typedef struct PtlMEDump_out {
+        int rc;
+} PtlMEDump_out;
+
+
+typedef struct PtlMDAttach_in {
+        ptl_handle_me_t me_in;
+        ptl_handle_eq_t eq_in;
+        ptl_md_t md_in;
+        ptl_unlink_t unlink_in;
+} PtlMDAttach_in;
+
+typedef struct PtlMDAttach_out {
+        int rc;
+        ptl_handle_md_t handle_out;
+} PtlMDAttach_out;
+
+
+typedef struct PtlMDBind_in {
+        ptl_handle_ni_t ni_in;
+        ptl_handle_eq_t eq_in;
+        ptl_md_t md_in;
+} PtlMDBind_in;
+
+typedef struct PtlMDBind_out {
+        int rc;
+        ptl_handle_md_t handle_out;
+} PtlMDBind_out;
+
+
+typedef struct PtlMDUpdate_internal_in {
+        ptl_handle_md_t md_in;
+        ptl_handle_eq_t testq_in;
+        ptl_seq_t sequence_in;
+
+        ptl_md_t old_inout;
+        int old_inout_valid;
+        ptl_md_t new_inout;
+        int new_inout_valid;
+} PtlMDUpdate_internal_in;
+
+typedef struct PtlMDUpdate_internal_out {
+        int rc;
+        ptl_md_t old_inout;
+        ptl_md_t new_inout;
+} PtlMDUpdate_internal_out;
+
+
+typedef struct PtlMDUnlink_in {
+        ptl_handle_md_t md_in;
+} PtlMDUnlink_in;
+
+typedef struct PtlMDUnlink_out {
+        int rc;
+        ptl_md_t status_out;
+} PtlMDUnlink_out;
+
+
+typedef struct PtlEQAlloc_in {
+        ptl_handle_ni_t ni_in;
+        ptl_size_t count_in;
+        void *base_in;
+        int len_in;
+        int (*callback_in) (ptl_event_t * event);
+} PtlEQAlloc_in;
+
+typedef struct PtlEQAlloc_out {
+        int rc;
+        ptl_handle_eq_t handle_out;
+} PtlEQAlloc_out;
+
+
+typedef struct PtlEQFree_in {
+        ptl_handle_eq_t eventq_in;
+} PtlEQFree_in;
+
+typedef struct PtlEQFree_out {
+        int rc;
+} PtlEQFree_out;
+
+
+typedef struct PtlACEntry_in {
+        ptl_handle_ni_t ni_in;
+        ptl_ac_index_t index_in;
+        ptl_process_id_t match_id_in;
+        ptl_pt_index_t portal_in;
+} PtlACEntry_in;
+
+typedef struct PtlACEntry_out {
+        int rc;
+} PtlACEntry_out;
+
+
+typedef struct PtlPut_in {
+        ptl_handle_md_t md_in;
+        ptl_ack_req_t ack_req_in;
+        ptl_process_id_t target_in;
+        ptl_pt_index_t portal_in;
+        ptl_ac_index_t cookie_in;
+        ptl_match_bits_t match_bits_in;
+        ptl_size_t offset_in;
+        ptl_hdr_data_t hdr_data_in;
+} PtlPut_in;
+
+typedef struct PtlPut_out {
+        int rc;
+} PtlPut_out;
+
+
+typedef struct PtlGet_in {
+        ptl_handle_md_t md_in;
+        ptl_process_id_t target_in;
+        ptl_pt_index_t portal_in;
+        ptl_ac_index_t cookie_in;
+        ptl_match_bits_t match_bits_in;
+        ptl_size_t offset_in;
+} PtlGet_in;
+
+typedef struct PtlGet_out {
+        int rc;
+} PtlGet_out;
+
+
+#endif
diff --git a/lustre/portals/include/portals/defines.h b/lustre/portals/include/portals/defines.h
new file mode 100644 (file)
index 0000000..785ce73
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+**
+** This file contains definitions that are used throughout the cplant code.
+*/
+
+#ifndef CPLANT_H
+#define CPLANT_H
+
+#define TITLE(fname,zmig)
+
+
+/*
+** TRUE and FALSE
+*/
+#undef TRUE
+#define TRUE           (1)
+#undef FALSE
+#define FALSE          (0)
+
+
+/*
+** Return codes from functions
+*/
+#undef OK
+#define OK             (0)
+#undef ERROR
+#define ERROR          (-1)
+
+
+
+/*
+** A max() macro that works on all arithmetic types.
+*/
+#ifndef MAX
+#define MAX(a, b)      ((a) > (b) ? (a) : (b))
+#endif /* MAX */
+
+#ifndef MIN
+#define MIN(a, b)      ((a) < (b) ? (a) : (b))
+#endif /* MIN */
+
+/*
+** The rest is from the old qkdefs.h
+*/
+
+#ifndef __linux__
+#define __inline__
+#endif
+
+#ifndef NULL
+#define NULL ((void *)0)
+#endif
+
+#ifndef __osf__
+#define PRIVATE static
+#define PUBLIC
+#endif
+
+#ifndef __osf__
+typedef unsigned char           uchar;
+#endif
+
+typedef char                    CHAR;
+typedef unsigned char           UCHAR;
+typedef char                    INT8;
+typedef unsigned char           UINT8;
+typedef short int               INT16;
+typedef unsigned short int      UINT16;
+typedef int                     INT32;
+typedef unsigned int            UINT32;
+typedef long                    LONG32;
+typedef unsigned long           ULONG32;
+
+/* long may be 32 or 64, so we can't really append the size to the definition */
+typedef long                    LONG;
+typedef unsigned long           ULONG;
+
+#ifdef __alpha__
+typedef long int_t;
+#ifndef __osf__
+typedef unsigned long uint_t;
+#endif
+#endif
+
+#ifdef __i386__
+typedef int int_t;
+typedef unsigned int uint_t;
+#endif
+
+typedef float                   FLOAT32;
+typedef double                  FLOAT64;
+typedef void                    VOID;
+typedef INT32                   BOOLEAN;
+typedef void (*FCN_PTR)(void);
+
+#ifndef off64_t
+
+#if defined (__alpha__) || defined (__ia64__)
+typedef long                     off64_t;
+#else
+typedef long long                off64_t;
+#endif
+
+#endif
+
+/*
+** Process related typedefs
+*/
+typedef UINT16 PID_TYPE;  /* Type of Local process ID */
+typedef UINT16 NID_TYPE;  /* Type of Physical node ID */
+typedef UINT16 GID_TYPE;  /* Type of Group ID */
+typedef UINT16 RANK_TYPE; /* Type of Logical rank/process within a group */
+
+
+
+#endif /* CPLANT_H */
diff --git a/lustre/portals/include/portals/errno.h b/lustre/portals/include/portals/errno.h
new file mode 100644 (file)
index 0000000..817936a
--- /dev/null
@@ -0,0 +1,61 @@
+#ifndef _P30_ERRNO_H_
+#define _P30_ERRNO_H_
+
+/*
+ * include/portals/errno.h
+ *
+ * Shared error number lists
+ */
+
+/* If you change these, you must update the string table in api-errno.c */
+typedef enum {
+        PTL_OK              = 0,
+        PTL_SEGV            = 1,
+
+        PTL_NOSPACE         = 2,
+        PTL_INUSE           = 3,
+        PTL_VAL_FAILED      = 4,
+
+        PTL_NAL_FAILED      = 5,
+        PTL_NOINIT          = 6,
+        PTL_INIT_DUP        = 7,
+        PTL_INIT_INV        = 8,
+        PTL_AC_INV_INDEX    = 9,
+
+        PTL_INV_ASIZE       = 10,
+        PTL_INV_HANDLE      = 11,
+        PTL_INV_MD          = 12,
+        PTL_INV_ME          = 13,
+        PTL_INV_NI          = 14,
+/* If you change these, you must update the string table in api-errno.c */
+        PTL_ILL_MD          = 15,
+        PTL_INV_PROC        = 16,
+        PTL_INV_PSIZE       = 17,
+        PTL_INV_PTINDEX     = 18,
+        PTL_INV_REG         = 19,
+
+        PTL_INV_SR_INDX     = 20,
+        PTL_ML_TOOLONG      = 21,
+        PTL_ADDR_UNKNOWN    = 22,
+        PTL_INV_EQ          = 23,
+        PTL_EQ_DROPPED      = 24,
+
+        PTL_EQ_EMPTY        = 25,
+        PTL_NOUPDATE        = 26,
+        PTL_FAIL            = 27,
+        PTL_NOT_IMPLEMENTED = 28,
+        PTL_NO_ACK          = 29,
+
+        PTL_IOV_TOO_MANY    = 30,
+        PTL_IOV_TOO_SMALL   = 31,
+
+        PTL_EQ_INUSE        = 32,
+        PTL_MD_INUSE        = 33,
+
+        PTL_MAX_ERRNO       = 33
+} ptl_err_t;
+/* If you change these, you must update the string table in api-errno.c */
+
+extern const char *ptl_err_str[];
+
+#endif
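
A hypothetical helper, not part of this patch, showing how the error table above is meant to be consumed once api-errno.c has filled in ptl_err_str[]:

        #include <portals/errno.h>

        /* Sketch only: translate a ptl_err_t into its registered string,
         * guarding against values outside the table. */
        static const char *ptl_strerror(ptl_err_t rc)
        {
                if ((int)rc < PTL_OK || rc > PTL_MAX_ERRNO)
                        return "unknown portals error";
                return ptl_err_str[rc];
        }
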
diff --git a/lustre/portals/include/portals/internal.h b/lustre/portals/include/portals/internal.h
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/lustre/portals/include/portals/lib-dispatch.h b/lustre/portals/include/portals/lib-dispatch.h
new file mode 100644 (file)
index 0000000..f87ff83
--- /dev/null
@@ -0,0 +1,45 @@
+#ifndef PTL_DISPATCH_H
+#define PTL_DISPATCH_H
+
+/*
+ * include/dispatch.h
+ *
+ * Dispatch table header and externs for remote side
+ * operations
+ *
+ * Generated by idl
+ *
+ */
+
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+extern int do_PtlGetId(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlNIStatus(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlNIDist(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlNIDebug(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMEAttach(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMEInsert(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMEPrepend(nal_cb_t * nal, void *private, void *args,
+                           void *ret);
+extern int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlTblDump(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMEDump(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlMDAttach(nal_cb_t * nal, void *private, void *args,
+                                   void *ret);
+extern int do_PtlMDBind(nal_cb_t * nal, void *private, void *args,
+                                 void *ret);
+extern int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *args,
+                                   void *ret);
+extern int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *args,
+                                   void *ret);
+extern int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *args,
+                                  void *ret);
+extern int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *args,
+                                 void *ret);
+extern int do_PtlPut(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlGet(nal_cb_t * nal, void *private, void *args, void *ret);
+extern int do_PtlFailNid (nal_cb_t *nal, void *private, void *args, void *ret);
+
+extern char *dispatch_name(int index);
+#endif
diff --git a/lustre/portals/include/portals/lib-nal.h b/lustre/portals/include/portals/lib-nal.h
new file mode 100644 (file)
index 0000000..4052c0c
--- /dev/null
@@ -0,0 +1,102 @@
+#ifndef _LIB_NAL_H_
+#define _LIB_NAL_H_
+
+/*
+ * nal.h
+ *
+ * Library side headers that define the abstraction layer's
+ * responsibilities and interfaces
+ */
+
+#include <portals/lib-types.h>
+
+struct nal_cb_t {
+       /*
+        * Per interface portal table, access control table
+        * and NAL private data field;
+        */
+       lib_ni_t ni;
+       void *nal_data;
+       /*
+        * send:  Sends a preformatted header and user data to a
+        * specified remote process.
+        * Can overwrite iov.
+        */
+       int (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, 
+                       ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
+                       unsigned int niov, struct iovec *iov, size_t mlen);
+
+       /* as send, but with a set of page fragments (NULL if not supported) */
+       int (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, 
+                             ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
+                             unsigned int niov, ptl_kiov_t *iov, size_t mlen);
+       /*
+        * recv: Receives an incoming message from a remote process
+        * Type of iov depends on options.  Can overwrite iov.
+        */
+       int (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+                       unsigned int niov, struct iovec *iov, size_t mlen, 
+                       size_t rlen);
+
+       /* as recv, but with a set of page fragments (NULL if not supported) */
+       int (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie,
+                             unsigned int niov, ptl_kiov_t *iov, size_t mlen, 
+                             size_t rlen);
+       /*
+        * read: Reads a block of data from a specified user address
+        */
+       int (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr,
+                       user_ptr src_addr, size_t len);
+
+       /*
+        * write: Writes a block of data into a specified user address
+        */
+       int (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr,
+                        void *src_addr, size_t len);
+
+       /*
+        * callback: Calls an event callback
+        */
+       int (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq,
+                        ptl_event_t *ev);
+
+       /*
+        *  malloc: Acquire a block of memory in a system independent
+        * fashion.
+        */
+       void *(*cb_malloc) (nal_cb_t * nal, size_t len);
+
+       void (*cb_free) (nal_cb_t * nal, void *buf, size_t len);
+
+       /*
+        * (un)map: Tell the NAL about some memory it will access.
+        * *addrkey passed to cb_unmap() is what cb_map() set it to.
+        * type of *iov depends on options.
+        * Set to NULL if not required.
+        */
+       int (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, 
+                      void **addrkey);
+       void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, 
+                         void **addrkey);
+
+       /* as (un)map, but with a set of page fragments */
+       int (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, 
+                            void **addrkey);
+       void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, 
+                         void **addrkey);
+
+       void (*cb_printf) (nal_cb_t * nal, const char *fmt, ...);
+
+       /* Turn interrupts off (begin of protected area) */
+       void (*cb_cli) (nal_cb_t * nal, unsigned long *flags);
+
+       /* Turn interrupts on (end of protected area) */
+       void (*cb_sti) (nal_cb_t * nal, unsigned long *flags);
+
+       /*
+        * Calculate a network "distance" to given node
+        */
+       int (*cb_dist) (nal_cb_t * nal, ptl_nid_t nid, unsigned long *dist);
+};
+
+#endif
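
For orientation, a hedged sketch (not from this commit) of the two interrupt-protection hooks a kernel NAL could supply for cb_cli/cb_sti; mynal_lock is a hypothetical NAL-private spinlock, and SPIN_LOCK_UNLOCKED is the 2.4-era initializer this tree targets:

        #include <linux/spinlock.h>
        #include <portals/lib-nal.h>

        static spinlock_t mynal_lock = SPIN_LOCK_UNLOCKED;

        /* cb_cli: begin the protected region -- take the lock with interrupts off */
        static void mynal_cb_cli(nal_cb_t *nal, unsigned long *flags)
        {
                spin_lock_irqsave(&mynal_lock, *flags);
        }

        /* cb_sti: end the protected region -- restore the saved interrupt state */
        static void mynal_cb_sti(nal_cb_t *nal, unsigned long *flags)
        {
                spin_unlock_irqrestore(&mynal_lock, *flags);
        }

The library's state_lock()/state_unlock() macros in lib-p30.h call straight through to these hooks.
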
diff --git a/lustre/portals/include/portals/lib-p30.h b/lustre/portals/include/portals/lib-p30.h
new file mode 100644 (file)
index 0000000..b623b93
--- /dev/null
@@ -0,0 +1,385 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib-p30.h
+ *
+ * Top level include for library side routines
+ */
+
+#ifndef _LIB_P30_H_
+#define _LIB_P30_H_
+
+#ifdef __KERNEL__
+# include <asm/page.h>
+# include <linux/string.h>
+#else
+# include <portals/list.h>
+# include <string.h>
+#endif
+#include <portals/types.h>
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/errno.h>
+#include <portals/lib-types.h>
+#include <portals/lib-nal.h>
+#include <portals/lib-dispatch.h>
+
+static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh)
+{
+        return (wh->wh_interface_cookie == PTL_WIRE_HANDLE_NONE.wh_interface_cookie &&
+                wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie);
+}
+
+#ifdef __KERNEL__
+#define state_lock(nal,flagsp)                          \
+do {                                                    \
+        CDEBUG(D_PORTALS, "taking state lock\n");       \
+        nal->cb_cli(nal, flagsp);                       \
+} while (0)
+
+#define state_unlock(nal,flagsp)                        \
+{                                                       \
+        CDEBUG(D_PORTALS, "releasing state lock\n");    \
+        nal->cb_sti(nal, flagsp);                       \
+}
+#else
+/* not needed in user space until we thread there */
+#define state_lock(nal,flagsp)                          \
+do {                                                    \
+        CDEBUG(D_PORTALS, "taking state lock\n");       \
+        CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp);      \
+} while (0)
+
+#define state_unlock(nal,flagsp)                        \
+{                                                       \
+        CDEBUG(D_PORTALS, "releasing state lock\n");    \
+        CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp);      \
+}
+#endif /* __KERNEL__ */
+
+#ifndef PTL_USE_SLAB_CACHE
+
+#define MAX_MES         2048
+#define MAX_MDS         2048
+#define MAX_MSGS        2048    /* Outstanding messages */
+#define MAX_EQS         512
+
+extern int lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int nobj, int objsize);
+extern void lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl);
+
+static inline void *
+lib_freelist_alloc (lib_freelist_t *fl)
+{
+        /* ALWAYS called with statelock held */
+        lib_freeobj_t *o;
+
+        if (list_empty (&fl->fl_list))
+                return (NULL);
+        
+        o = list_entry (fl->fl_list.next, lib_freeobj_t, fo_list);
+        list_del (&o->fo_list);
+        return ((void *)&o->fo_contents);
+}
+
+static inline void
+lib_freelist_free (lib_freelist_t *fl, void *obj)
+{
+        /* ALWAYS called with statelock held */
+        lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents);
+        
+        list_add (&o->fo_list, &fl->fl_list);
+}
+
+
+static inline lib_eq_t *
+lib_eq_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        unsigned long  flags;
+        lib_eq_t      *eq;
+        
+        state_lock (nal, &flags);
+        eq = (lib_eq_t *)lib_freelist_alloc (&nal->ni.ni_free_eqs);
+        state_unlock (nal, &flags);
+
+        return (eq);
+}
+
+static inline void
+lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
+{
+        /* ALWAYS called with statelock held */
+        lib_freelist_free (&nal->ni.ni_free_eqs, eq);
+}
+
+static inline lib_md_t *
+lib_md_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        unsigned long  flags;
+        lib_md_t      *md;
+        
+        state_lock (nal, &flags);
+        md = (lib_md_t *)lib_freelist_alloc (&nal->ni.ni_free_mds);
+        state_unlock (nal, &flags);
+
+        return (md);
+}
+
+static inline void
+lib_md_free (nal_cb_t *nal, lib_md_t *md)
+{
+        /* ALWAYS called with statelock held */
+        lib_freelist_free (&nal->ni.ni_free_mds, md);
+}
+
+static inline lib_me_t *
+lib_me_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        unsigned long  flags;
+        lib_me_t      *me;
+        
+        state_lock (nal, &flags);
+        me = (lib_me_t *)lib_freelist_alloc (&nal->ni.ni_free_mes);
+        state_unlock (nal, &flags);
+        
+        return (me);
+}
+
+static inline void
+lib_me_free (nal_cb_t *nal, lib_me_t *me)
+{
+        /* ALWAYS called with statelock held */
+        lib_freelist_free (&nal->ni.ni_free_mes, me);
+}
+
+static inline lib_msg_t *
+lib_msg_alloc (nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs));
+}
+
+static inline void
+lib_msg_free (nal_cb_t *nal, lib_msg_t *msg)
+{
+        /* ALWAYS called with statelock held */
+        lib_freelist_free (&nal->ni.ni_free_msgs, msg);
+}
+
+#else
+
+extern kmem_cache_t *ptl_md_slab;
+extern kmem_cache_t *ptl_msg_slab;
+extern kmem_cache_t *ptl_me_slab;
+extern kmem_cache_t *ptl_eq_slab;
+extern atomic_t      md_in_use_count;
+extern atomic_t      msg_in_use_count;
+extern atomic_t      me_in_use_count;
+extern atomic_t      eq_in_use_count;
+
+static inline lib_eq_t *
+lib_eq_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        lib_eq_t *eq = kmem_cache_alloc(ptl_eq_slab, GFP_NOFS);
+
+        if (eq == NULL)
+                return (NULL);
+
+        atomic_inc (&eq_in_use_count);
+        return (eq);
+}
+
+static inline void
+lib_eq_free (nal_cb_t *nal, lib_eq_t *eq)
+{
+        /* ALWAYS called with statelock held */
+        atomic_dec (&eq_in_use_count);
+        kmem_cache_free(ptl_eq_slab, eq);
+}
+
+static inline lib_md_t *
+lib_md_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        lib_md_t *md = kmem_cache_alloc(ptl_md_slab, GFP_NOFS);
+
+        if (md == NULL)
+                return (NULL);
+
+        atomic_inc (&md_in_use_count);
+        return (md);
+}
+
+static inline void 
+lib_md_free (nal_cb_t *nal, lib_md_t *md)
+{
+        /* ALWAYS called with statelock held */
+        atomic_dec (&md_in_use_count);
+        kmem_cache_free(ptl_md_slab, md); 
+}
+
+static inline lib_me_t *
+lib_me_alloc (nal_cb_t *nal)
+{
+        /* NEVER called with statelock held */
+        lib_me_t *me = kmem_cache_alloc(ptl_me_slab, GFP_NOFS);
+
+        if (me == NULL)
+                return (NULL);
+
+        atomic_inc (&me_in_use_count);
+        return (me);
+}
+
+static inline void 
+lib_me_free(nal_cb_t *nal, lib_me_t *me)
+{
+        /* ALWAYS called with statelock held */
+        atomic_dec (&me_in_use_count);
+        kmem_cache_free(ptl_me_slab, me);
+}
+
+static inline lib_msg_t *
+lib_msg_alloc(nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        lib_msg_t *msg = kmem_cache_alloc(ptl_msg_slab, GFP_ATOMIC);
+
+        if (msg == NULL)
+                return (NULL);
+        
+        atomic_inc (&msg_in_use_count);
+        return (msg);
+}
+
+static inline void 
+lib_msg_free(nal_cb_t *nal, lib_msg_t *msg)
+{
+        /* ALWAYS called with statelock held */
+        atomic_dec (&msg_in_use_count);
+        kmem_cache_free(ptl_msg_slab, msg); 
+}
+#endif
+
+extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type);
+extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type);
+extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh);
+
+static inline void
+ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq)
+{
+        handle->cookie = eq->eq_lh.lh_cookie;
+}
+
+static inline lib_eq_t *
+ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, 
+                                              PTL_COOKIE_TYPE_EQ);
+        if (lh == NULL)
+                return (NULL);
+
+        return (lh_entry (lh, lib_eq_t, eq_lh));
+}
+
+static inline void
+ptl_md2handle (ptl_handle_md_t *handle, lib_md_t *md)
+{
+        handle->cookie = md->md_lh.lh_cookie;
+}
+
+static inline lib_md_t *
+ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie,
+                                              PTL_COOKIE_TYPE_MD);
+        if (lh == NULL)
+                return (NULL);
+
+        return (lh_entry (lh, lib_md_t, md_lh));
+}
+
+static inline lib_md_t *
+ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        lib_handle_t *lh;
+        
+        if (wh->wh_interface_cookie != nal->ni.ni_interface_cookie)
+                return (NULL);
+        
+        lh = lib_lookup_cookie (nal, wh->wh_object_cookie,
+                                PTL_COOKIE_TYPE_MD);
+        if (lh == NULL)
+                return (NULL);
+
+        return (lh_entry (lh, lib_md_t, md_lh));
+}
+
+static inline void
+ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me)
+{
+        handle->cookie = me->me_lh.lh_cookie;
+}
+
+static inline lib_me_t *
+ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal)
+{
+        /* ALWAYS called with statelock held */
+        lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie,
+                                              PTL_COOKIE_TYPE_ME);
+        if (lh == NULL)
+                return (NULL);
+
+        return (lh_entry (lh, lib_me_t, me_lh));
+}
+
+extern int lib_init(nal_cb_t * cb, ptl_nid_t nid, ptl_pid_t pid, int gsize,
+                    ptl_pt_index_t tbl_size, ptl_ac_index_t ac_size);
+extern int lib_fini(nal_cb_t * cb);
+extern void lib_dispatch(nal_cb_t * cb, void *private, int index,
+                         void *arg_block, void *ret_block);
+extern char *dispatch_name(int index);
+
+/*
+ * When the NAL detects an incoming message, it should call
+ * lib_parse() to decode it.  The NAL callbacks will be handed
+ * the private cookie as a way for the NAL to maintain state
+ * about which transaction is being processed.  An extra parameter,
+ * lib_cookie will contain the necessary information for
+ * finalizing the message.
+ *
+ * After it has finished handling the message, it should
+ * call lib_finalize() with the lib_cookie parameter.
+ * Callbacks will be made to write events, send acks or
+ * replies and so on.
+ */
+extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private);
+extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg);
+extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr);
+
+extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
+extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len);
+extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *dest, ptl_size_t len);
+
+extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov);
+extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len);
+extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len);
+extern void lib_assert_wire_constants (void);
+
+extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+                      ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
+extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                     lib_md_t *md, ptl_size_t offset, ptl_size_t len);
+
+extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in,
+                               ptl_md_t * md_out);
+extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in);
+extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in);
+#endif
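
To make the lib_parse()/lib_finalize() comment above concrete, here is a hypothetical NAL receive-path fragment; mynal_recv_hdr(), mynal_recv_done() and the conn cookie are illustrative names only:

        #include <portals/lib-p30.h>

        /* Header has arrived: let the library decode and route the message.
         * 'conn' is returned to the NAL as the private cookie in cb_recv(). */
        static void mynal_recv_hdr(nal_cb_t *nal, void *conn, ptl_hdr_t *hdr)
        {
                lib_parse(nal, hdr, conn);
        }

        /* Payload has been moved by cb_recv(): finalizing the lib_msg_t cookie
         * makes the library write events and send any ack or reply. */
        static void mynal_recv_done(nal_cb_t *nal, void *conn, lib_msg_t *cookie)
        {
                lib_finalize(nal, conn, cookie);
        }
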
diff --git a/lustre/portals/include/portals/lib-types.h b/lustre/portals/include/portals/lib-types.h
new file mode 100644 (file)
index 0000000..47c0dd2
--- /dev/null
@@ -0,0 +1,282 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * p30/lib-types.h
+ *
+ * Types used by the library side routines that do not need to be
+ * exposed to the user application
+ */
+
+#ifndef _LIB_TYPES_H_
+#define _LIB_TYPES_H_
+
+#include <portals/types.h>
+#ifdef __KERNEL__
+# define PTL_USE_SLAB_CACHE
+# include <linux/uio.h>
+# include <linux/smp_lock.h>
+# include <linux/types.h>
+#else
+# include <sys/types.h>
+#endif
+
+/* struct nal_cb_t is defined in lib-nal.h */
+typedef struct nal_cb_t nal_cb_t;
+
+typedef char *user_ptr;
+typedef struct lib_msg_t lib_msg_t;
+typedef struct lib_ptl_t lib_ptl_t;
+typedef struct lib_ac_t lib_ac_t;
+typedef struct lib_me_t lib_me_t;
+typedef struct lib_md_t lib_md_t;
+typedef struct lib_eq_t lib_eq_t;
+
+#define WIRE_ATTR      __attribute__((packed))
+
+/* The wire handle's interface cookie only matches one network interface in
+ * one epoch (i.e. new cookie when the interface restarts or the node
+ * reboots).  The object cookie only matches one object on that interface
+ * during that object's lifetime (i.e. no cookie re-use). */
+typedef struct {
+        __u64 wh_interface_cookie;
+        __u64 wh_object_cookie;
+} WIRE_ATTR ptl_handle_wire_t;
+
+/* byte-flip insensitive! */
+#define PTL_WIRE_HANDLE_NONE \
+((const ptl_handle_wire_t) {.wh_interface_cookie = -1, .wh_object_cookie = -1})
+
+typedef enum {
+        PTL_MSG_ACK = 0,
+        PTL_MSG_PUT,
+        PTL_MSG_GET,
+        PTL_MSG_REPLY,
+        PTL_MSG_HELLO,
+} ptl_msg_type_t;
+
+/* Each of these structs should start with an odd number of
+ * __u32, or the compiler could add its own padding and confuse
+ * everyone.
+ *
+ * Also, "length" needs to be at offset 28 of each struct.
+ */
+typedef struct ptl_ack {
+        ptl_size_t mlength;
+        ptl_handle_wire_t dst_wmd;
+        ptl_match_bits_t match_bits;
+        ptl_size_t length;                      /* common length (0 for acks) moving out RSN */
+} WIRE_ATTR ptl_ack_t;
+
+typedef struct ptl_put {
+        ptl_pt_index_t ptl_index;
+        ptl_handle_wire_t ack_wmd;
+        ptl_match_bits_t match_bits;
+        ptl_size_t length;                      /* common length moving out RSN */
+        ptl_size_t offset;
+        ptl_hdr_data_t hdr_data;
+} WIRE_ATTR ptl_put_t;
+
+typedef struct ptl_get {
+        ptl_pt_index_t ptl_index;
+        ptl_handle_wire_t return_wmd;
+        ptl_match_bits_t match_bits;
+        ptl_size_t length;                      /* common length (0 for gets) moving out RSN */
+        ptl_size_t src_offset;
+        ptl_size_t return_offset;               /* unused: going RSN */
+        ptl_size_t sink_length;
+} WIRE_ATTR ptl_get_t;
+
+typedef struct ptl_reply {
+        __u32 unused1;                          /* unused fields going RSN */
+        ptl_handle_wire_t dst_wmd;
+        ptl_size_t dst_offset;                  /* unused: going RSN */
+        __u32 unused2;
+        ptl_size_t length;                      /* common length moving out RSN */
+} WIRE_ATTR ptl_reply_t;
+
+typedef struct {
+        ptl_nid_t dest_nid;
+        ptl_nid_t src_nid;
+        ptl_pid_t dest_pid;
+        ptl_pid_t src_pid;
+        __u32 type; /* ptl_msg_type_t */
+        union {
+                ptl_ack_t ack;
+                ptl_put_t put;
+                ptl_get_t get;
+                ptl_reply_t reply;
+        } msg;
+} WIRE_ATTR ptl_hdr_t;
+
+/* All length fields in individual unions at same offset */
+/* LASSERT for same in lib-move.c */
+#define PTL_HDR_LENGTH(h) ((h)->msg.ack.length)
+
+/* A HELLO message contains the portals magic number and protocol version
+ * code in the header's dest_nid, the peer's NID in the src_nid, and
+ * PTL_MSG_HELLO in the type field.  All other fields are zero (including
+ * PTL_HDR_LENGTH; i.e. no payload).
+ * This is for use by byte-stream NALs (e.g. TCP/IP) to check the peer is
+ * running the same protocol and to find out its NID, so that hosts with
+ * multiple IP interfaces can have a single NID. These NALs should exchange
+ * HELLO messages when a connection is first established. */
+typedef struct {
+        __u32  magic;                          /* PORTALS_PROTO_MAGIC */
+        __u16   version_major;                  /* increment on incompatible change */
+        __u16   version_minor;                  /* increment on compatible change */
+} WIRE_ATTR ptl_magicversion_t;
+
+#define PORTALS_PROTO_MAGIC                0xeebc0ded
+
+#define PORTALS_PROTO_VERSION_MAJOR        0
+#define PORTALS_PROTO_VERSION_MINOR        1
+
+typedef struct {
+        long recv_count, recv_length, send_count, send_length, drop_count,
+            drop_length, msgs_alloc, msgs_max;
+} lib_counters_t;
+
+/* temporary expedient: limit number of entries in discontiguous MDs */
+#if PTL_LARGE_MTU
+# define PTL_MD_MAX_IOV        64
+#else
+# define PTL_MD_MAX_IOV 16
+#endif
+
+struct lib_msg_t {
+        struct list_head  msg_list;
+        int               send_ack;
+        lib_md_t         *md;
+        ptl_nid_t         nid;
+        ptl_pid_t         pid;
+        ptl_event_t       ev;
+        ptl_handle_wire_t ack_wmd;
+        union {
+                struct iovec  iov[PTL_MD_MAX_IOV];
+                ptl_kiov_t    kiov[PTL_MD_MAX_IOV];
+        } msg_iov;
+};
+
+struct lib_ptl_t {
+        ptl_pt_index_t size;
+        struct list_head *tbl;
+};
+
+struct lib_ac_t {
+        int next_free;
+};
+
+typedef struct {
+        struct list_head  lh_hash_chain;
+        __u64             lh_cookie;
+} lib_handle_t;
+
+#define lh_entry(ptr, type, member) \
+       ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+struct lib_eq_t {
+        struct list_head  eq_list;
+        lib_handle_t      eq_lh;
+        ptl_seq_t         sequence;
+        ptl_size_t        size;
+        ptl_event_t      *base;
+        int               eq_refcount;
+        int (*event_callback) (ptl_event_t * event);
+        void             *eq_addrkey;
+};
+
+struct lib_me_t {
+        struct list_head  me_list;
+        lib_handle_t      me_lh;
+        ptl_process_id_t  match_id;
+        ptl_match_bits_t  match_bits, ignore_bits;
+        ptl_unlink_t      unlink;
+        lib_md_t         *md;
+};
+
+struct lib_md_t {
+        struct list_head  md_list;
+        lib_handle_t      md_lh;
+        lib_me_t         *me;
+        user_ptr          start;
+        ptl_size_t        offset;
+        ptl_size_t        length;
+        ptl_size_t        max_size;
+        int               threshold;
+        int               pending;
+        ptl_unlink_t      unlink;
+        unsigned int      options;
+        unsigned int      md_flags;
+        void             *user_ptr;
+        lib_eq_t         *eq;
+        void             *md_addrkey;
+        unsigned int      md_niov;                /* # frags */
+        union {
+                struct iovec  iov[PTL_MD_MAX_IOV];
+                ptl_kiov_t    kiov[PTL_MD_MAX_IOV];
+        } md_iov;
+};
+
+#define PTL_MD_FLAG_UNLINK            (1 << 0)
+#define PTL_MD_FLAG_AUTO_UNLINKED     (1 << 1)
+
+#ifndef PTL_USE_SLAB_CACHE
+typedef struct
+{
+        void             *fl_objs;             /* single contiguous array of objects */
+        int                fl_nobjs;            /* the number of them */
+        int                fl_objsize;          /* the size (including overhead) of each of them */
+        struct list_head   fl_list;             /* where they are enqueued */
+} lib_freelist_t;
+
+typedef struct
+{
+        struct list_head   fo_list;             /* enqueue on fl_list */
+        void              *fo_contents;         /* aligned contents */
+} lib_freeobj_t;
+#endif
+
+typedef struct {
+        /* info about peers we are trying to fail */
+        struct list_head  tp_list;             /* stash in ni.ni_test_peers */
+        ptl_nid_t         tp_nid;              /* matching nid */
+        unsigned int      tp_threshold;        /* # failures to simulate */
+} lib_test_peer_t;
+
+#define PTL_COOKIE_TYPE_MD    1
+#define PTL_COOKIE_TYPE_ME    2
+#define PTL_COOKIE_TYPE_EQ    3
+#define PTL_COOKIE_TYPES      4
+/* PTL_COOKIE_TYPES must be a power of 2, so the cookie type can be
+ * extracted by masking with (PTL_COOKIE_TYPES - 1) */
+
+typedef struct {
+        int up;
+        int refcnt;
+        ptl_nid_t nid;
+        ptl_pid_t pid;
+        int num_nodes;
+        unsigned int debug;
+        lib_ptl_t tbl;
+        lib_ac_t ac;
+        lib_counters_t counters;
+
+        int               ni_lh_hash_size;      /* size of lib handle hash table */
+        struct list_head *ni_lh_hash_table;     /* all extant lib handles, this interface */
+        __u64             ni_next_object_cookie; /* cookie generator */
+        __u64             ni_interface_cookie;  /* uniquely identifies this ni in this epoch */
+        
+        struct list_head ni_test_peers;
+        
+#ifndef PTL_USE_SLAB_CACHE
+        lib_freelist_t   ni_free_mes;
+        lib_freelist_t   ni_free_msgs;
+        lib_freelist_t   ni_free_mds;
+        lib_freelist_t   ni_free_eqs;
+#endif
+        struct list_head ni_active_msgs;
+        struct list_head ni_active_mds;
+        struct list_head ni_active_eqs;
+} lib_ni_t;
+
+#endif
diff --git a/lustre/portals/include/portals/list.h b/lustre/portals/include/portals/list.h
new file mode 100644 (file)
index 0000000..2b63312
--- /dev/null
@@ -0,0 +1,245 @@
+#ifndef _LINUX_LIST_H
+#define _LINUX_LIST_H
+
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+#define prefetch(a) ((void)a)
+
+struct list_head {
+       struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+       struct list_head name = LIST_HEAD_INIT(name)
+
+#define INIT_LIST_HEAD(ptr) do { \
+       (ptr)->next = (ptr); (ptr)->prev = (ptr); \
+} while (0)
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_add(struct list_head * new,
+                             struct list_head * prev,
+                             struct list_head * next)
+{
+       next->prev = new;
+       new->next = next;
+       new->prev = prev;
+       prev->next = new;
+}
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+       __list_add(new, head, head->next);
+}
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+       __list_add(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+       next->prev = prev;
+       prev->next = next;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is in an undefined state.
+ */
+static inline void list_del(struct list_head *entry)
+{
+       __list_del(entry->prev, entry->next);
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+       __list_del(entry->prev, entry->next);
+       INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+       __list_del(list->prev, list->next);
+       list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ */
+static inline void list_move_tail(struct list_head *list,
+                                 struct list_head *head)
+{
+       __list_del(list->prev, list->next);
+       list_add_tail(list, head);
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static inline int list_empty(struct list_head *head)
+{
+       return head->next == head;
+}
+
+static inline void __list_splice(struct list_head *list,
+                                struct list_head *head)
+{
+       struct list_head *first = list->next;
+       struct list_head *last = list->prev;
+       struct list_head *at = head->next;
+
+       first->prev = head;
+       head->next = first;
+
+       last->next = at;
+       at->prev = last;
+}
+
+/**
+ * list_splice - join two lists
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice(struct list_head *list, struct list_head *head)
+{
+       if (!list_empty(list))
+               __list_splice(list, head);
+}
+
+/**
+ * list_splice_init - join two lists and reinitialise the emptied list.
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * The list at @list is reinitialised
+ */
+static inline void list_splice_init(struct list_head *list,
+                                   struct list_head *head)
+{
+       if (!list_empty(list)) {
+               __list_splice(list, head);
+               INIT_LIST_HEAD(list);
+       }
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr:       the &struct list_head pointer.
+ * @type:      the type of the struct this is embedded in.
+ * @member:    the name of the list_struct within the struct.
+ */
+#define list_entry(ptr, type, member) \
+       ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+/**
+ * list_for_each       -       iterate over a list
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @head:      the head for your list.
+ */
+#define list_for_each(pos, head) \
+       for (pos = (head)->next, prefetch(pos->next); pos != (head); \
+               pos = pos->next, prefetch(pos->next))
+
+/**
+ * list_for_each_prev  -       iterate over a list in reverse order
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @head:      the head for your list.
+ */
+#define list_for_each_prev(pos, head) \
+       for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \
+               pos = pos->prev, prefetch(pos->prev))
+
+/**
+ * list_for_each_safe  -       iterate over a list safe against removal of list entry
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @n:         another &struct list_head to use as temporary storage
+ * @head:      the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+       for (pos = (head)->next, n = pos->next; pos != (head); \
+               pos = n, n = pos->next)
+
+#endif
+
+#ifndef list_for_each_entry
+/**
+ * list_for_each_entry  -       iterate over list of given type
+ * @pos:        the type * to use as a loop counter.
+ * @head:       the head for your list.
+ * @member:     the name of the list_struct within the struct.
+ */
+#define list_for_each_entry(pos, head, member)                         \
+        for (pos = list_entry((head)->next, typeof(*pos), member),     \
+                    prefetch(pos->member.next);                        \
+            &pos->member != (head);                                    \
+            pos = list_entry(pos->member.next, typeof(*pos), member),  \
+            prefetch(pos->member.next))
+#endif
+
+#ifndef list_for_each_entry_safe
+/**
+ * list_for_each_entry_safe  -       iterate over list of given type safe against removal of list entry
+ * @pos:        the type * to use as a loop counter.
+ * @n:          another type * to use as temporary storage
+ * @head:       the head for your list.
+ * @member:     the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member)                 \
+        for (pos = list_entry((head)->next, typeof(*pos), member),     \
+               n = list_entry(pos->member.next, typeof(*pos), member); \
+            &pos->member != (head);                                    \
+            pos = n, n = list_entry(n->member.next, typeof(*n), member))
+#endif
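
A small usage sketch of the primitives above; struct item and the pending list are invented for illustration:

        #include <portals/list.h>

        struct item {
                struct list_head i_list;        /* linkage onto 'pending' */
                int              i_value;
        };

        static LIST_HEAD(pending);

        /* unlink every entry whose value matches; the _safe variant allows
         * list_del() inside the loop */
        static void drop_matching(int value)
        {
                struct item *pos, *n;

                list_for_each_entry_safe(pos, n, &pending, i_list) {
                        if (pos->i_value == value)
                                list_del(&pos->i_list);
                }
        }
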
diff --git a/lustre/portals/include/portals/lltrace.h b/lustre/portals/include/portals/lltrace.h
new file mode 100644 (file)
index 0000000..7d1b304
--- /dev/null
@@ -0,0 +1,175 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Compile with:
+ * cc -I../../portals/include -o fio fio.c -L../../portals/linux/utils -lptlctl 
+ */
+#ifndef __LTRACE_H_
+#define __LTRACE_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <portals/types.h>
+#include <portals/ptlctl.h>
+#include <linux/kp30.h>
+#include <linux/limits.h>
+#include <asm/page.h>
+#include <linux/version.h>
+
+static inline int ltrace_write_file(char* fname)
+{
+        char* argv[3];
+
+        argv[0] = "debug_kernel";
+        argv[1] = fname;
+        argv[2] = "1";
+        
+        fprintf(stderr, "[ptlctl] %s %s %s\n", argv[0], argv[1], argv[2]);
+        
+        return jt_dbg_debug_kernel(3, argv);
+}
+
+static inline int ltrace_clear()
+{
+        char* argv[1];
+        
+        argv[0] = "clear";
+        
+        fprintf(stderr, "[ptlctl] %s\n", argv[0]);
+        
+        return jt_dbg_clear_debug_buf(1, argv);
+}
+
+static inline int ltrace_mark(int indent_level, char* text)
+{
+        char* argv[2];
+        char mark_buf[PATH_MAX];
+        
+        snprintf(mark_buf, PATH_MAX, "====%d=%s", indent_level, text);
+        
+        argv[0] = "mark";
+        argv[1] = mark_buf;
+        return jt_dbg_mark_debug_buf(2, argv);
+}
+
+static inline int ltrace_applymasks()
+{
+        char* argv[2];
+        argv[0] = "list";
+        argv[1] = "applymasks";
+        
+        fprintf(stderr, "[ptlctl] %s %s\n", argv[0], argv[1]);
+        
+        return jt_dbg_list(2, argv);
+}
+
+
+static inline int ltrace_filter(char* subsys_or_mask)
+{
+        char* argv[2];
+        argv[0] = "filter";
+        argv[1] = subsys_or_mask;
+        return jt_dbg_filter(2, argv);
+}
+
+static inline int ltrace_show(char* subsys_or_mask)
+{
+        char* argv[2];
+        argv[0] = "show";
+        argv[1] = subsys_or_mask;
+        return jt_dbg_show(2, argv);
+}
+
+static inline int ltrace_start()
+{
+        int rc = 0;
+        dbg_initialize(0, NULL);
+#ifdef PORTALS_DEV_ID
+        rc = register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH);
+#endif
+        ltrace_filter("class"); 
+        ltrace_filter("socknal");
+        ltrace_filter("qswnal"); 
+        ltrace_filter("gmnal");  
+        ltrace_filter("portals");  
+        
+        ltrace_show("all_types");  
+        ltrace_filter("trace");  
+        ltrace_filter("malloc"); 
+        ltrace_filter("net"); 
+        ltrace_filter("page"); 
+        ltrace_filter("other"); 
+        ltrace_filter("info"); 
+        ltrace_applymasks();
+
+        return rc;
+}
+
+
+static inline void ltrace_stop()
+{
+#ifdef PORTALS_DEV_ID
+        unregister_ioc_dev(PORTALS_DEV_ID);
+#endif
+}
+
+static inline int not_uml()
+{
+  /* Return Values:
+   *   0 when run under UML
+   *   1 when run on host
+   *   (stat failures other than ENOENT are reported and treated as host)
+   */
+       struct stat buf;
+       int rc = stat("/dev/ubd", &buf);
+       rc = ((rc<0) && (errno == ENOENT)) ? 1 : rc;
+       if (rc<0) {
+         fprintf(stderr, "Cannot stat /dev/ubd: %s\n", strerror(errno));
+         rc = 1; /* Assume host */
+       }
+       return rc;
+}
+
+#define LTRACE_MAX_NOB   256
+static inline void ltrace_add_processnames(char* fname)
+{
+        char cmdbuf[LTRACE_MAX_NOB];
+        struct timeval tv;
+        struct timezone tz;
+        int nob;
+        int underuml = !not_uml();
+        
+        gettimeofday(&tv, &tz);
+
+        nob = snprintf(cmdbuf, LTRACE_MAX_NOB, "ps --no-headers -eo \"");
+
+        /* Careful - these format strings need to match the CDEBUG
+         * formats in portals/linux/debug.c EXACTLY
+         */
+        nob += snprintf(cmdbuf+nob, LTRACE_MAX_NOB, "%02x:%06x:%d:%lu.%06lu ",
+                        S_RPC >> 24, D_VFSTRACE, 0, tv.tv_sec, tv.tv_usec);
+
+        if (underuml && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))) {
+                nob += snprintf (cmdbuf+nob, LTRACE_MAX_NOB,
+                                 "(%s:%d:%s() %d | %d+%lu): ",
+                                 "lltrace.h", __LINE__, __FUNCTION__, 0, 0, 0L);
+        }
+        else {
+                nob += snprintf (cmdbuf+nob, LTRACE_MAX_NOB,
+                                 "(%s:%d:%s() %d+%lu): ",
+                                 "lltrace.h", __LINE__, __FUNCTION__, 0, 0L);
+        }
+         
+        nob += snprintf(cmdbuf+nob, LTRACE_MAX_NOB, " %%p %%c\" >> %s", fname);
+        system(cmdbuf);
+}
+
+#endif
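
One plausible way a test program could drive these helpers; the flow and output path are illustrative, not taken from this patch:

        #include <portals/lltrace.h>

        int main(void)
        {
                int rc = ltrace_start();        /* set masks, register the ioctl dev */

                if (rc == 0) {
                        ltrace_mark(0, "test start");
                        /* ... run the workload being traced ... */
                        ltrace_write_file("/tmp/ptltrace.log");
                }
                ltrace_stop();
                return rc;
        }
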
diff --git a/lustre/portals/include/portals/myrnal.h b/lustre/portals/include/portals/myrnal.h
new file mode 100644 (file)
index 0000000..12b1925
--- /dev/null
@@ -0,0 +1,26 @@
+/*
+*/
+
+#ifndef MYRNAL_H
+#define MYRNAL_H
+
+#define MAX_ARGS_LEN            (256)
+#define MAX_RET_LEN             (128)
+#define MYRNAL_MAX_ACL_SIZE     (64)
+#define MYRNAL_MAX_PTL_SIZE     (64)
+
+#define P3CMD                   (100)
+#define P3SYSCALL               (200)
+#define P3REGISTER              (300)
+
+enum { PTL_MLOCKALL };
+
+typedef struct {
+       void *args;
+       size_t args_len;
+       void *ret;
+       size_t ret_len;
+       int p3cmd;
+} myrnal_forward_t;
+
+#endif                         /* MYRNAL_H */
diff --git a/lustre/portals/include/portals/nal.h b/lustre/portals/include/portals/nal.h
new file mode 100644 (file)
index 0000000..88be63c
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+*/
+#ifndef _NAL_H_
+#define _NAL_H_
+
+/*
+ * p30/nal.h
+ *
+ * The API side NAL declarations
+ */
+
+#include <portals/types.h>
+
+#ifdef yield
+#undef yield
+#endif
+
+typedef struct nal_t nal_t;
+
+struct nal_t {
+       ptl_ni_t ni;
+       int refct;
+       void *nal_data;
+       int *timeout;           /* for libp30api users */
+       int (*forward) (nal_t * nal, int index, /* Function ID */
+                       void *args, size_t arg_len, void *ret, size_t ret_len);
+
+       int (*shutdown) (nal_t * nal, int interface);
+
+       int (*validate) (nal_t * nal, void *base, size_t extent);
+
+       void (*yield) (nal_t * nal);
+
+       void (*lock) (nal_t * nal, unsigned long *flags);
+
+       void (*unlock) (nal_t * nal, unsigned long *flags);
+};
+
+typedef nal_t *(ptl_interface_t) (int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
+extern nal_t *PTL_IFACE_IP(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
+extern nal_t *PTL_IFACE_MYR(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid);
+
+extern nal_t *ptl_hndl2nal(ptl_handle_any_t * any);
+
+#ifndef PTL_IFACE_DEFAULT
+#define PTL_IFACE_DEFAULT (PTL_IFACE_IP)
+#endif
+
+#endif
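
Tying this back to the PtlGet_in/PtlGet_out arg blocks earlier in this patch (assumed here to come from <portals/arg-blocks.h>), the API side marshals calls through forward(); a hedged sketch follows, with the dispatch index passed in by the caller since the real index constants live elsewhere in the tree:

        #include <portals/nal.h>
        #include <portals/arg-blocks.h>

        /* Hypothetical wrapper: ship a PtlGet request to the library side and
         * hand back the status returned in the ret block. */
        static int forward_get(nal_t *nal, int get_index, PtlGet_in *args)
        {
                PtlGet_out ret;

                nal->forward(nal, get_index, args, sizeof(*args), &ret, sizeof(ret));
                return ret.rc;
        }
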
diff --git a/lustre/portals/include/portals/nalids.h b/lustre/portals/include/portals/nalids.h
new file mode 100644 (file)
index 0000000..1b837b4
--- /dev/null
@@ -0,0 +1,4 @@
+#define PTL_IFACE_TCP 1
+#define PTL_IFACE_ER 2
+#define PTL_IFACE_SS 3
+#define PTL_IFACE_MAX 4
diff --git a/lustre/portals/include/portals/p30.h b/lustre/portals/include/portals/p30.h
new file mode 100644 (file)
index 0000000..a4ea39b
--- /dev/null
@@ -0,0 +1,72 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef _P30_H_
+#define _P30_H_
+
+/*
+ * p30.h
+ *
+ * User application interface file
+ */
+
+#if defined (__KERNEL__)
+#include <linux/uio.h>
+#include <linux/types.h>
+#else
+#include <sys/types.h>
+#include <sys/uio.h>
+#endif
+
+#include <portals/types.h>
+#include <portals/nal.h>
+#include <portals/api.h>
+#include <portals/errno.h>
+#include <portals/nalids.h>
+
+extern int __p30_initialized;  /* for libraries & test codes  */
+extern int __p30_myr_initialized;      /*   that don't know if p30    */
+extern int __p30_ip_initialized;       /*   had been initialized yet  */
+extern ptl_handle_ni_t __myr_ni_handle, __ip_ni_handle;
+
+extern int __p30_myr_timeout;  /* in seconds, for PtlNIBarrier,     */
+extern int __p30_ip_timeout;   /* PtlReduce_all, & PtlBroadcast_all */
+
+/*
+ * Debugging flags reserved for the Portals reference library.
+ * These are not part of the API as described in the SAND report
+ * but are for the use of the maintainers of the reference implementation.
+ *
+ * It is not expected that the real implementations will export
+ * this functionality.
+ */
+#define PTL_DEBUG_NONE          0ul
+#define PTL_DEBUG_ALL           (0x0FFFul)     /* Only the Portals flags */
+
+#define __bit(x)                ((unsigned long) 1<<(x))
+#define PTL_DEBUG_PUT           __bit(0)
+#define PTL_DEBUG_GET           __bit(1)
+#define PTL_DEBUG_REPLY         __bit(2)
+#define PTL_DEBUG_ACK           __bit(3)
+#define PTL_DEBUG_DROP          __bit(4)
+#define PTL_DEBUG_REQUEST       __bit(5)
+#define PTL_DEBUG_DELIVERY      __bit(6)
+#define PTL_DEBUG_UNLINK        __bit(7)
+#define PTL_DEBUG_THRESHOLD     __bit(8)
+#define PTL_DEBUG_API           __bit(9)
+
+/*
+ * These eight are reserved for the NAL to define
+ * It should probably give them better names...
+ */
+#define PTL_DEBUG_NI_ALL        (0xF000ul)     /* Only the NAL flags */
+#define PTL_DEBUG_NI0           __bit(24)
+#define PTL_DEBUG_NI1           __bit(25)
+#define PTL_DEBUG_NI2           __bit(26)
+#define PTL_DEBUG_NI3           __bit(27)
+#define PTL_DEBUG_NI4           __bit(28)
+#define PTL_DEBUG_NI5           __bit(29)
+#define PTL_DEBUG_NI6           __bit(30)
+#define PTL_DEBUG_NI7           __bit(31)
+
+#endif
diff --git a/lustre/portals/include/portals/ppid.h b/lustre/portals/include/portals/ppid.h
new file mode 100644 (file)
index 0000000..4727599
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ */
+
+#ifndef _INCppidh_
+#define _INCppidh_
+
+#include "defines.h"
+// #include "idtypes.h"
+
+
+#define MAX_PPID         1000    /* This needs to fit into 16 bits, so the
+                                    maximum value is 65535.  Having it "large"
+                                    can help with debugging process accounting,
+                                    but there are reasons for keeping it
+                                    somewhat smaller than the maximum -- e.g.
+                                    arrays indexed by ppid need storage
+                                    proportional to it.  */
+                                 
+#define MAX_GID          1000    /* this needs to fit into 16 bits... */
+
+#define MAX_FIXED_PPID   100
+#define MAX_FIXED_GID    100
+#define PPID_FLOATING    MAX_FIXED_PPID+1   /* Floating area starts here */
+#define GID_FLOATING     MAX_FIXED_GID+1    /* Floating area starts here */
+#define NUM_PTL_TASKS    MAX_FIXED_PPID+80  /* Maximum no. portals tasks */
+
+#define PPID_AUTO        0
+
+/* Minimum PPID is 1 */
+#define PPID_BEBOPD      1            /* bebopd */
+#define  GID_BEBOPD      1            /* bebopd */
+
+#define PPID_PCT         2            /* pct */
+#define  GID_PCT         2            /* pct */
+
+#define PPID_FYOD        3            /* fyod */
+#define  GID_FYOD        3            /* fyod */
+
+#define PPID_GDBWRAP     11           /* portals proxy for gdb */
+#define  GID_GDBWRAP     11           /* portals proxy for gdb */
+
+#define PPID_TEST        15           /* for portals tests */
+#define  GID_TEST        15
+
+#define  GID_YOD         5            /* yod */
+#define  GID_PINGD       6            /* pingd */
+#define  GID_BT          7            /* bt */
+#define  GID_PTLTEST     8            /* ptltest */
+#define  GID_CGDB        9            /* cgdb */
+#define  GID_TVDSVR     10            /* start-tvdsvr */
+
+#endif /* _INCppidh_ */
diff --git a/lustre/portals/include/portals/ptlctl.h b/lustre/portals/include/portals/ptlctl.h
new file mode 100644 (file)
index 0000000..dc02780
--- /dev/null
@@ -0,0 +1,75 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * header for libptlctl.a
+ */
+#ifndef _PTLCTL_H_
+#define _PTLCTL_H_
+
+#define PORTALS_DEV_ID 0
+#define PORTALS_DEV_PATH "/dev/portals"
+#define OBD_DEV_ID 1
+#define OBD_DEV_PATH "/dev/obd"
+
+int ptl_name2nal(char *str);
+int ptl_parse_nid (ptl_nid_t *nidp, char *str);
+char * ptl_nid2str (char *buffer, ptl_nid_t nid);
+
+int ptl_initialize(int argc, char **argv);
+int jt_ptl_network(int argc, char **argv);
+int jt_ptl_connect(int argc, char **argv);
+int jt_ptl_disconnect(int argc, char **argv);
+int jt_ptl_push_connection(int argc, char **argv);
+int jt_ptl_ping(int argc, char **argv);
+int jt_ptl_shownid(int argc, char **argv);
+int jt_ptl_mynid(int argc, char **argv);
+int jt_ptl_add_uuid(int argc, char **argv);
+int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility  */
+int jt_ptl_close_uuid(int argc, char **argv);
+int jt_ptl_del_uuid(int argc, char **argv);
+int jt_ptl_rxmem (int argc, char **argv);
+int jt_ptl_txmem (int argc, char **argv);
+int jt_ptl_nagle (int argc, char **argv);
+int jt_ptl_add_route (int argc, char **argv);
+int jt_ptl_del_route (int argc, char **argv);
+int jt_ptl_print_routes (int argc, char **argv);
+int jt_ptl_fail_nid (int argc, char **argv);
+
+int dbg_initialize(int argc, char **argv);
+int jt_dbg_filter(int argc, char **argv);
+int jt_dbg_show(int argc, char **argv);
+int jt_dbg_list(int argc, char **argv);
+int jt_dbg_debug_kernel(int argc, char **argv);
+int jt_dbg_debug_daemon(int argc, char **argv);
+int jt_dbg_debug_file(int argc, char **argv);
+int jt_dbg_clear_debug_buf(int argc, char **argv);
+int jt_dbg_mark_debug_buf(int argc, char **argv);
+int jt_dbg_modules(int argc, char **argv);
+int jt_dbg_panic(int argc, char **argv);
+
+/* l_ioctl.c */
+int register_ioc_dev(int dev_id, const char * dev_name);
+void unregister_ioc_dev(int dev_id);
+int set_ioctl_dump(char * file);
+int l_ioctl(int dev_id, int opc, void *buf);
+int parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *));
+int jt_ioc_dump(int argc, char **argv);
+
+#endif
diff --git a/lustre/portals/include/portals/stringtab.h b/lustre/portals/include/portals/stringtab.h
new file mode 100644 (file)
index 0000000..c9683f7
--- /dev/null
@@ -0,0 +1,5 @@
+/*
+*/
+/*
+ * stringtab.h
+ */
diff --git a/lustre/portals/include/portals/types.h b/lustre/portals/include/portals/types.h
new file mode 100644 (file)
index 0000000..d4038b6
--- /dev/null
@@ -0,0 +1,157 @@
+#ifndef _P30_TYPES_H_
+#define _P30_TYPES_H_
+
+#ifdef __linux__
+#include <asm/types.h>
+#include <asm/timex.h>
+#else
+#include <sys/types.h>
+typedef u_int32_t __u32;
+typedef u_int64_t __u64;
+typedef unsigned long long cycles_t;
+static inline cycles_t get_cycles(void) { return 0; }
+#endif
+
+typedef __u64 ptl_nid_t;
+typedef __u32 ptl_pid_t;
+typedef __u32 ptl_pt_index_t;
+typedef __u32 ptl_ac_index_t;
+typedef __u64 ptl_match_bits_t;
+typedef __u64 ptl_hdr_data_t;
+typedef __u32 ptl_size_t;
+
+typedef struct {
+        unsigned long nal_idx;                 /* which network interface */
+        __u64         cookie;                  /* which thing on that interface */
+} ptl_handle_any_t;
+
+typedef ptl_handle_any_t ptl_handle_ni_t;
+typedef ptl_handle_any_t ptl_handle_eq_t;
+typedef ptl_handle_any_t ptl_handle_md_t;
+typedef ptl_handle_any_t ptl_handle_me_t;
+
+#define PTL_HANDLE_NONE \
+((const ptl_handle_any_t){.nal_idx = -1, .cookie = -1})
+#define PTL_EQ_NONE PTL_HANDLE_NONE
+
+static inline int PtlHandleEqual (ptl_handle_any_t h1, ptl_handle_any_t h2)
+{
+       return (h1.nal_idx == h2.nal_idx && h1.cookie == h2.cookie);
+}
+
+#define PTL_NID_ANY      ((ptl_nid_t) -1)
+#define PTL_PID_ANY      ((ptl_pid_t) -1)
+
+typedef struct {
+        ptl_nid_t nid;
+        ptl_pid_t pid;   /* node id / process id */
+} ptl_process_id_t;
+
+typedef enum {
+        PTL_RETAIN = 0,
+        PTL_UNLINK
+} ptl_unlink_t;
+
+typedef enum {
+        PTL_INS_BEFORE,
+        PTL_INS_AFTER
+} ptl_ins_pos_t;
+
+typedef struct {
+       struct page     *kiov_page;
+       unsigned int     kiov_len;
+       unsigned int     kiov_offset;
+} ptl_kiov_t;
+
+typedef struct {
+        void            *start;
+        ptl_size_t       length;
+        int              threshold;
+        int              max_size;
+        unsigned int     options;
+        void            *user_ptr;
+        ptl_handle_eq_t  eventq;
+       unsigned int     niov;
+} ptl_md_t;
+
+/* Options for the MD structure */
+#define PTL_MD_OP_PUT           (1 << 0)
+#define PTL_MD_OP_GET           (1 << 1)
+#define PTL_MD_MANAGE_REMOTE    (1 << 2)
+#define PTL_MD_AUTO_UNLINK      (1 << 3)
+#define PTL_MD_TRUNCATE         (1 << 4)
+#define PTL_MD_ACK_DISABLE      (1 << 5)
+#define PTL_MD_IOV             (1 << 6)
+#define PTL_MD_MAX_SIZE                (1 << 7)
+#define PTL_MD_KIOV             (1 << 8)
+
+#define PTL_MD_THRESH_INF       (-1)
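+
+/*
+ * Example (sketch only): a caller describing a plain receive buffer that
+ * accepts PUTs without generating ACKs might fill in an MD roughly as
+ *
+ *        ptl_md_t md;
+ *
+ *        md.start     = buffer;
+ *        md.length    = buflen;
+ *        md.threshold = PTL_MD_THRESH_INF;
+ *        md.max_size  = 0;
+ *        md.options   = PTL_MD_OP_PUT | PTL_MD_ACK_DISABLE;
+ *        md.user_ptr  = NULL;
+ *        md.eventq    = eq;
+ *        md.niov      = 0;
+ *
+ * where buffer, buflen and eq are whatever the caller owns.
+ */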
+
+typedef enum {
+        PTL_EVENT_GET,
+        PTL_EVENT_PUT,
+        PTL_EVENT_REPLY,
+        PTL_EVENT_ACK,
+        PTL_EVENT_SENT
+} ptl_event_kind_t;
+
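+/* Event sequence numbers: wrap-safe "newer than" test via signed difference */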
+#define PTL_SEQ_BASETYPE       long
+typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t;
+#define PTL_SEQ_GT(a,b)        (((signed PTL_SEQ_BASETYPE)((a) - (b))) > 0)
+
+typedef struct {
+        ptl_event_kind_t type;
+        ptl_process_id_t initiator;
+        ptl_pt_index_t portal;
+        ptl_match_bits_t match_bits;
+        ptl_size_t rlength, mlength, offset;
+        ptl_handle_me_t unlinked_me;
+        ptl_md_t mem_desc;
+        ptl_hdr_data_t hdr_data;
+        cycles_t  arrival_time;
+        volatile ptl_seq_t sequence;
+} ptl_event_t;
+
+
+typedef enum {
+        PTL_ACK_REQ,
+        PTL_NOACK_REQ
+} ptl_ack_req_t;
+
+
+typedef struct {
+        volatile ptl_seq_t sequence;
+        ptl_size_t size;
+        ptl_event_t *base;
+        ptl_handle_any_t cb_eq_handle;
+} ptl_eq_t;
+
+typedef struct {
+        ptl_eq_t *eq;
+} ptl_ni_t;
+
+
+typedef struct {
+        int max_match_entries;    /* max number of match entries */
+        int max_mem_descriptors;  /* max number of memory descriptors */
+        int max_event_queues;     /* max number of event queues */
+        int max_atable_index;     /* maximum access control list table index */
+        int max_ptable_index;     /* maximum portals table index */
+} ptl_ni_limits_t;
+
+/*
+ * Status registers
+ */
+typedef enum {
+        PTL_SR_DROP_COUNT,
+        PTL_SR_DROP_LENGTH,
+        PTL_SR_RECV_COUNT,
+        PTL_SR_RECV_LENGTH,
+        PTL_SR_SEND_COUNT,
+        PTL_SR_SEND_LENGTH,
+        PTL_SR_MSGS_MAX,
+} ptl_sr_index_t;
+
+typedef int ptl_sr_value_t;
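+
+/*
+ * Sketch (assuming the standard Portals 3 status call, PtlNIStatus): each
+ * counter is read back individually, e.g.
+ *
+ *        ptl_sr_value_t dropped;
+ *
+ *        PtlNIStatus(ni, PTL_SR_DROP_COUNT, &dropped);
+ */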
+
+#endif
diff --git a/lustre/portals/knals/.cvsignore b/lustre/portals/knals/.cvsignore
new file mode 100644 (file)
index 0000000..282522d
--- /dev/null
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
diff --git a/lustre/portals/knals/Makefile.am b/lustre/portals/knals/Makefile.am
new file mode 100644 (file)
index 0000000..fed2785
--- /dev/null
@@ -0,0 +1,7 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+DIST_SUBDIRS= socknal toenal qswnal gmnal scimacnal 
+SUBDIRS= socknal toenal        @QSWNAL@ @GMNAL@ @SCIMACNAL@
diff --git a/lustre/portals/knals/Makefile.mk b/lustre/portals/knals/Makefile.mk
new file mode 100644 (file)
index 0000000..ce40a60
--- /dev/null
@@ -0,0 +1,4 @@
+include ../Kernelenv
+
+obj-y = socknal/
+# more coming...
\ No newline at end of file
diff --git a/lustre/portals/knals/gmnal/.cvsignore b/lustre/portals/knals/gmnal/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lustre/portals/knals/gmnal/Makefile.am b/lustre/portals/knals/gmnal/Makefile.am
new file mode 100644 (file)
index 0000000..1dc6f4e
--- /dev/null
@@ -0,0 +1,13 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = kgmnal
+modulenet_DATA = kgmnal.o
+EXTRA_PROGRAMS = kgmnal
+
+DEFS =
+kgmnal_SOURCES = gmnal.c gmnal_cb.c gmnal.h
diff --git a/lustre/portals/knals/gmnal/gm-1.5.2.1-exports.patch b/lustre/portals/knals/gmnal/gm-1.5.2.1-exports.patch
new file mode 100644 (file)
index 0000000..23c80d9
--- /dev/null
@@ -0,0 +1,43 @@
+diff -ru gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c
+--- gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c        Mon Jul  1 10:35:09 2002
++++ gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c    Thu Sep 19 14:19:38 2002
+@@ -30,6 +30,8 @@
+  *
+  ************************************************************************/
++#define EXPORT_SYMTAB
++
+ #include <linux/config.h>
+ #include <linux/module.h>
+@@ -4075,6 +4077,28 @@
+   return 0;
+ }
++EXPORT_SYMBOL(gm_blocking_receive_no_spin);
++EXPORT_SYMBOL(gm_close);
++EXPORT_SYMBOL(gm_dma_free);
++EXPORT_SYMBOL(gm_dma_malloc);
++EXPORT_SYMBOL(gm_drop_sends);
++EXPORT_SYMBOL(gm_finalize);
++EXPORT_SYMBOL(gm_get_node_id);
++EXPORT_SYMBOL(gm_init);
++EXPORT_SYMBOL(gm_initialize_alarm);
++EXPORT_SYMBOL(gm_max_node_id_in_use);
++EXPORT_SYMBOL(gm_min_size_for_length);
++EXPORT_SYMBOL(gm_num_receive_tokens);
++EXPORT_SYMBOL(gm_num_send_tokens);
++EXPORT_SYMBOL(gm_open);
++EXPORT_SYMBOL(gm_provide_receive_buffer);
++EXPORT_SYMBOL(gm_resume_sending);
++EXPORT_SYMBOL(gm_send_with_callback);
++EXPORT_SYMBOL(gm_set_acceptable_sizes);
++EXPORT_SYMBOL(gm_set_alarm);
++EXPORT_SYMBOL(gm_unknown);
++
++
+ /*
+   This file uses GM standard indentation.
+Only in gm-1.5.2.1_Linux-cfs/drivers/linux/gm: gm_arch.c~
+Only in gm-1.5.2.1_Linux-cfs/: trace
diff --git a/lustre/portals/knals/gmnal/gmnal.c b/lustre/portals/knals/gmnal/gmnal.c
new file mode 100644 (file)
index 0000000..ceeea2a
--- /dev/null
@@ -0,0 +1,284 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Based on ksocknal and qswnal
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Robert Read  <rread@datarithm.net>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "gmnal.h"
+
+ptl_handle_ni_t kgmnal_ni;
+nal_t  kgmnal_api;
+
+kgmnal_data_t kgmnal_data;
+int gmnal_debug = 0;
+
+kpr_nal_interface_t kqswnal_router_interface = {
+        kprni_nalid:        GMNAL,
+        kprni_arg:        NULL,
+        kprni_fwd:          kgmnal_fwd_packet,
+};
+
+static int kgmnal_forward(nal_t   *nal,
+                          int     id,
+                          void    *args,  size_t args_len,
+                          void    *ret,   size_t ret_len)
+{
+        kgmnal_data_t *k = nal->nal_data;
+        nal_cb_t      *nal_cb = k->kgm_cb;
+
+        LASSERT (nal == &kgmnal_api);
+        LASSERT (k == &kgmnal_data);
+        LASSERT (nal_cb == &kgmnal_lib);
+
+        lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */
+        return PTL_OK;
+}
+
+static void kgmnal_lock(nal_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *k = nal->nal_data;
+        nal_cb_t      *nal_cb = k->kgm_cb;
+
+
+        LASSERT (nal == &kgmnal_api);
+        LASSERT (k == &kgmnal_data);
+        LASSERT (nal_cb == &kgmnal_lib);
+
+        nal_cb->cb_cli(nal_cb,flags);
+}
+
+static void kgmnal_unlock(nal_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *k = nal->nal_data;
+        nal_cb_t      *nal_cb = k->kgm_cb;
+
+
+        LASSERT (nal == &kgmnal_api);
+        LASSERT (k == &kgmnal_data);
+        LASSERT (nal_cb == &kgmnal_lib);
+
+        nal_cb->cb_sti(nal_cb,flags);
+}
+
+static int kgmnal_shutdown(nal_t *nal, int ni)
+{
+        LASSERT (nal == &kgmnal_api);
+        return 0;
+}
+
+static void kgmnal_yield( nal_t *nal )
+{
+        LASSERT (nal == &kgmnal_api);
+
+        if (current->need_resched)
+                schedule();
+        return;
+}
+
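+/* Allocate a receive descriptor and link it onto the NAL's receive list.
+ * Returns NULL if the allocation fails; ndx is currently unused. */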
+kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *data,int ndx)
+{
+        kgmnal_rx_t *conn;
+
+        PORTAL_ALLOC(conn, sizeof(kgmnal_rx_t));
+        /* Check for out of mem here */
+        if (conn==NULL) {
+                printk("kgm_add_recv: memory alloc failed\n");
+                return NULL;
+        }
+
+        list_add(&conn->krx_item,(struct list_head *)&data->kgm_list);
+        //        conn->ndx=ndx;
+        //        conn->len=conn->ptlhdr_copied=0;
+        //        conn->loopback=0;
+        return conn;
+}
+
+static nal_t *kgmnal_init(int interface, ptl_pt_index_t ptl_size,
+                          ptl_ac_index_t  ac_size, ptl_pid_t requested_pid)
+{
+        unsigned int nnids;
+
+        gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids);
+
+        CDEBUG(D_NET, "calling lib_init with nid 0x%Lx of %d\n",
+               kgmnal_data.kgm_nid, nnids);
+        lib_init(&kgmnal_lib, kgmnal_data.kgm_nid, 0, nnids,ptl_size, ac_size);
+        return &kgmnal_api;
+}
+
+static void __exit
+kgmnal_finalize(void)
+{
+        struct list_head *tmp;
+
+        PORTAL_SYMBOL_UNREGISTER (kgmnal_ni);
+        PtlNIFini(kgmnal_ni);
+        lib_fini(&kgmnal_api);
+
+        if (kgmnal_data.kgm_port) {
+                gm_close(kgmnal_data.kgm_port);
+        }
+
+        /* FIXME: free dma buffers */
+        /* FIXME: kill receiver thread */
+
+        PORTAL_FREE (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS);
+
+        list_for_each(tmp, &kgmnal_data.kgm_list) {
+                kgmnal_rx_t *conn;
+                conn = list_entry(tmp, kgmnal_rx_t, krx_item);
+                CDEBUG(D_IOCTL, "freeing conn %p\n",conn);
+                tmp = tmp->next;
+                list_del(&conn->krx_item);
+                PORTAL_FREE(conn, sizeof(*conn));
+        }
+
+        CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory));
+
+        return;
+}
+
+static int __init
+kgmnal_initialize(void)
+{
+        int rc;
+        int ntok;
+        unsigned long sizemask;
+        unsigned int nid;
+
+        CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory));
+
+        kgmnal_api.forward = kgmnal_forward;
+        kgmnal_api.shutdown = kgmnal_shutdown;
+        kgmnal_api.yield = kgmnal_yield;
+        kgmnal_api.validate = NULL;         /* our api validate is a NOOP */
+        kgmnal_api.lock= kgmnal_lock;
+        kgmnal_api.unlock= kgmnal_unlock;
+        kgmnal_api.nal_data = &kgmnal_data;
+
+        kgmnal_lib.nal_data = &kgmnal_data;
+
+        memset(&kgmnal_data, 0, sizeof(kgmnal_data));
+
+        INIT_LIST_HEAD(&kgmnal_data.kgm_list);
+        kgmnal_data.kgm_cb = &kgmnal_lib;
+
+        /* Allocate transmit descriptors */
+        PORTAL_ALLOC (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS);
+        if (kgmnal_data.kgm_trans==NULL) {
+                printk("kgmnal: init: failed to allocate transmit "
+                       "descriptors\n");
+                return -1;
+        }
+        memset(kgmnal_data.kgm_trans,-1,sizeof(kgmnal_tx_t)*(TXMSGS));
+
+        spin_lock_init(&kgmnal_data.kgm_dispatch_lock);
+        spin_lock_init(&kgmnal_data.kgm_update_lock);
+        spin_lock_init(&kgmnal_data.kgm_send_lock);
+
+        /* Do the receiver and xmtr allocation */
+
+        rc = gm_init();
+        if (rc != GM_SUCCESS) {
+                CERROR("gm_init failed: %d\n", rc);
+                return -1;
+        }
+
+        rc = gm_open(&kgmnal_data.kgm_port, 0 , KGM_PORT_NUM, KGM_HOSTNAME,
+                     GM_API_VERSION_1_1);
+        if (rc != GM_SUCCESS) {
+                gm_finalize();
+                kgmnal_data.kgm_port = NULL;
+                CERROR("gm_open failed: %d\n", rc);
+                return -1;
+        }
+        gm_get_node_id(kgmnal_data.kgm_port, &nid);
+        kgmnal_data.kgm_nid = nid;
+        /* Allocate 2 different sizes of buffers. For now, use half
+           the tokens for each. */
+        ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2;
+        CDEBUG(D_NET, "gmnal_init: creating %d large %d byte recv buffers\n",
+               ntok, MSG_LEN_LARGE);
+        while (ntok-- > 0) {
+                void * buffer = gm_dma_malloc(kgmnal_data.kgm_port,
+                                              MSG_LEN_LARGE);
+                if (buffer == NULL) {
+                        CERROR("gm_dma_malloc failed\n");
+                        return (-ENOMEM);
+                }
+                CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d "
+                       "pri %d\n ", kgmnal_data.kgm_port, buffer,
+                       MSG_LEN_LARGE, MSG_SIZE_LARGE, GM_LOW_PRIORITY);
+
+                gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer,
+                                          MSG_SIZE_LARGE, GM_LOW_PRIORITY);
+        }
+
+        ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2;
+        CDEBUG(D_NET, "gmnal_init: creating %d small %d byte recv buffers\n",
+               ntok, MSG_LEN_SMALL);
+        while (ntok-- > 0) {
+                void * buffer = gm_dma_malloc(kgmnal_data.kgm_port,
+                                              MSG_LEN_SMALL);
+                if (buffer == NULL) {
+                        CERROR("gm_dma_malloc failed\n");
+                        return (-ENOMEM);
+                }
+                CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d "
+                       "pri %d\n ", kgmnal_data.kgm_port, buffer,
+                       MSG_LEN_SMALL, MSG_SIZE_SMALL, GM_LOW_PRIORITY);
+
+                gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer,
+                                          MSG_SIZE_SMALL, GM_LOW_PRIORITY);
+
+        }
+        sizemask = (1 << MSG_SIZE_LARGE) | (1 << MSG_SIZE_SMALL);
+        CDEBUG(D_NET, "gm_set_acceptable_sizes port %p pri %d mask 0x%x\n",
+                        kgmnal_data.kgm_port, GM_LOW_PRIORITY, sizemask);
+        gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_LOW_PRIORITY,
+                                sizemask);
+        gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_HIGH_PRIORITY, 0);
+
+        /* Initialize Network Interface */
+        rc = PtlNIInit(kgmnal_init, 32, 4, 0, &kgmnal_ni);
+        if (rc) {
+                CERROR("PtlNIInit failed %d\n", rc);
+                return (-ENOMEM);
+        }
+
+        /* Start receiver thread */
+        kernel_thread(kgmnal_recv_thread, &kgmnal_data, 0);
+
+        PORTAL_SYMBOL_REGISTER(kgmnal_ni);
+
+        kgmnal_data.kgm_init = 1;
+
+        return 0;
+}
+
+MODULE_AUTHOR("Robert Read <rread@datarithm.net>");
+MODULE_DESCRIPTION("Kernel Myrinet GM NAL v0.1");
+MODULE_LICENSE("GPL");
+
+module_init (kgmnal_initialize);
+module_exit (kgmnal_finalize);
+
+EXPORT_SYMBOL (kgmnal_ni);
diff --git a/lustre/portals/knals/gmnal/gmnal.h b/lustre/portals/knals/gmnal/gmnal.h
new file mode 100644 (file)
index 0000000..47e8c3c
--- /dev/null
@@ -0,0 +1,101 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef _GMNAL_H
+#define _GMNAL_H
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/locks.h>
+#include <linux/unistd.h>
+#include <linux/init.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_GMNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#include <gm.h>
+
+
+/*
+ *  Myrinet GM NAL
+ */
+#define NPAGES_LARGE            16
+#define NPAGES_SMALL            1
+#define MSG_LEN_LARGE            (NPAGES_LARGE * PAGE_SIZE)
+#define MSG_LEN_SMALL            (NPAGES_SMALL * PAGE_SIZE)
+#define MSG_SIZE_LARGE           (gm_min_size_for_length(MSG_LEN_LARGE))
+#define MSG_SIZE_SMALL           (gm_min_size_for_length(MSG_LEN_SMALL))
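+/* e.g. with 4kB pages this gives 64kB "large" and 4kB "small" receive
+ * buffers; MSG_SIZE_* are the corresponding GM size classes */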
+
+#define TXMSGS                  64 /* Number of Transmit Messages */
+#define ENVELOPES               8  /* Number of outstanding receive msgs */
+
+#define KGM_PORT_NUM 3
+#define KGM_HOSTNAME "kgmnal"
+
+
+typedef struct {
+        char *krx_buffer;
+        unsigned long   krx_len;
+        unsigned int   krx_size;
+        unsigned int   krx_priority;
+        struct list_head krx_item;
+}  kgmnal_rx_t;
+
+
+typedef struct {
+        nal_cb_t  *ktx_nal;
+        void      *ktx_private;
+        lib_msg_t *ktx_cookie;
+        char      *ktx_buffer;
+        size_t     ktx_len;
+        unsigned long ktx_size;
+        int        ktx_ndx;
+        unsigned int ktx_priority;
+        unsigned int ktx_tgt_node;
+        unsigned int ktx_tgt_port_id;
+}  kgmnal_tx_t;
+
+
+typedef struct {
+        char              kgm_init;
+        char              kgm_shuttingdown;
+        struct gm_port   *kgm_port;
+        struct list_head  kgm_list;
+        ptl_nid_t         kgm_nid;
+        nal_cb_t         *kgm_cb;
+        struct kgm_trans *kgm_trans;
+        struct tq_struct  kgm_ready_tq;
+        spinlock_t        kgm_dispatch_lock;
+        spinlock_t        kgm_update_lock;
+        spinlock_t        kgm_send_lock;
+}  kgmnal_data_t;
+
+int kgm_init(kgmnal_data_t *kgm_data);
+int kgmnal_recv_thread(void *);
+int gm_return_mynid(void);
+void kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+
+extern kgmnal_data_t      kgmnal_data;
+extern nal_t              kgmnal_api;
+extern nal_cb_t           kgmnal_lib;
+
+#endif  /* _GMNAL_H */
+
diff --git a/lustre/portals/knals/gmnal/gmnal_cb.c b/lustre/portals/knals/gmnal/gmnal_cb.c
new file mode 100644 (file)
index 0000000..3d4c86d
--- /dev/null
@@ -0,0 +1,517 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Based on ksocknal and qswnal
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *  Author: Robert Read  <rread@datarithm.net>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* TODO
+ * preallocate send buffers, store on list
+ * put receive buffers on queue, handle with receive threads
+ * use routing
+ */
+
+#include "gmnal.h"
+
+extern kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *,int);
+
+static kgmnal_tx_t *
+get_trans(void)
+{
+        kgmnal_tx_t *t;
+        PORTAL_ALLOC(t, (sizeof(kgmnal_tx_t)));
+        return t;
+}
+
+static void
+put_trans(kgmnal_tx_t *t)
+{
+        PORTAL_FREE(t, sizeof(kgmnal_tx_t));
+}
+
+int
+kgmnal_ispeer (ptl_nid_t nid)
+{
+   unsigned int gmnid = (unsigned int)nid;
+   unsigned int nnids;
+
+   gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids);
+
+   return ((ptl_nid_t)gmnid == nid &&/* didn't lose high bits on conversion ? */
+           gmnid < nnids); /* it's in this machine */
+}
+
+/*
+ *  LIB functions follow
+ *
+ */
+static int
+kgmnal_read (nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
+             size_t len)
+{
+        CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+static int
+kgmnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
+             size_t len)
+{
+        CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+static void *
+kgmnal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+        return buf;
+}
+
+static void
+kgmnal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+static void
+kgmnal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+        va_list                ap;
+        char msg[256];
+
+        if (portal_debug & D_NET) {
+                va_start( ap, fmt );
+                vsnprintf( msg, sizeof(msg), fmt, ap );
+                va_end( ap );
+
+                printk("CPUId: %d %s",smp_processor_id(), msg);
+        }
+}
+
+
+static void
+kgmnal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *data= nal->nal_data;
+
+        spin_lock_irqsave(&data->kgm_dispatch_lock,*flags);
+}
+
+
+static void
+kgmnal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *data= nal->nal_data;
+
+        spin_unlock_irqrestore(&data->kgm_dispatch_lock,*flags);
+}
+
+
+static int
+kgmnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* network distance doesn't mean much for this nal */
+        if ( nal->ni.nid == nid ) {
+                *dist = 0;
+        } else {
+                *dist = 1;
+        }
+
+        return 0;
+}
+
+/* FIXME rmr: add routing code here */
+static void
+kgmnal_tx_done(kgmnal_tx_t  *trans, int error)
+{
+        lib_finalize(trans->ktx_nal, trans->ktx_private, trans->ktx_cookie);
+
+        gm_dma_free(kgmnal_data.kgm_port, trans->ktx_buffer);
+
+        trans->ktx_buffer = NULL;
+        trans->ktx_len = 0;
+
+        put_trans(trans);
+}
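+
+/* Human-readable names for the GM send status codes we expect to handle;
+ * anything else falls back to "Unknown error" via get_error() */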
+static char * gm_error_strings[GM_NUM_STATUS_CODES] = {
+        [GM_SUCCESS] = "GM_SUCCESS",
+        [GM_SEND_TIMED_OUT] = "GM_SEND_TIMED_OUT",
+        [GM_SEND_REJECTED] = "GM_SEND_REJECTED",
+        [GM_SEND_TARGET_PORT_CLOSED] = "GM_SEND_TARGET_PORT_CLOSED",
+        [GM_SEND_TARGET_NODE_UNREACHABLE] = "GM_SEND_TARGET_NODE_UNREACHABLE",
+        [GM_SEND_DROPPED] = "GM_SEND_DROPPED",
+        [GM_SEND_PORT_CLOSED] = "GM_SEND_PORT_CLOSED",
+};
+
+inline char * get_error(int status)
+{
+        if (status >= 0 && status < GM_NUM_STATUS_CODES &&
+            gm_error_strings[status] != NULL)
+                return gm_error_strings[status];
+        else
+                return "Unknown error";
+}
+
+static void
+kgmnal_errhandler(struct gm_port *p, void *context, gm_status_t status)
+{
+        CDEBUG(D_NET,"error callback: ktx %p status %d\n", context, status);
+}
+
+static void
+kgmnal_txhandler(struct gm_port *p, void *context, gm_status_t status)
+{
+        kgmnal_tx_t *ktx = (kgmnal_tx_t *)context;
+        int err = 0;
+
+        LASSERT (p != NULL);
+        LASSERT (ktx != NULL);
+
+        CDEBUG(D_NET,"ktx %p status %d nid 0x%x pid %d\n", ktx, status,
+                ktx->ktx_tgt_node, ktx->ktx_tgt_port_id);
+
+        switch((int)status) {
+        case GM_SUCCESS:        /* normal */
+                break;
+        case GM_SEND_TIMED_OUT: /* application error */
+        case GM_SEND_REJECTED:  /* size of msg unacceptable */
+        case GM_SEND_TARGET_PORT_CLOSED:
+                CERROR("%s (%d):\n", get_error(status), status);
+                gm_resume_sending(kgmnal_data.kgm_port, ktx->ktx_priority,
+                                  ktx->ktx_tgt_node, ktx->ktx_tgt_port_id,
+                                  kgmnal_errhandler, NULL);
+                err = -EIO;
+                break;
+        case GM_SEND_TARGET_NODE_UNREACHABLE:
+        case GM_SEND_PORT_CLOSED:
+                CERROR("%s (%d):\n", get_error(status), status);
+                gm_drop_sends(kgmnal_data.kgm_port, ktx->ktx_priority,
+                              ktx->ktx_tgt_node, ktx->ktx_tgt_port_id,
+                              kgmnal_errhandler, NULL);
+                err = -EIO;
+                break;
+        case GM_SEND_DROPPED:
+                CERROR("%s (%d):\n", get_error(status), status);
+                err = -EIO;
+                break;
+        default:
+                CERROR("Unknown status: %d\n", status);
+                err = -EIO;
+                break;
+        }
+
+        kgmnal_tx_done(ktx, err);
+}
+
+/*
+ */
+
+static int
+kgmnal_send(nal_cb_t        *nal,
+           void            *private,
+           lib_msg_t       *cookie,
+           ptl_hdr_t       *hdr,
+           int              type,
+           ptl_nid_t        nid,
+           ptl_pid_t        pid,
+           int              options,
+           unsigned int     niov,
+           lib_md_iov_t    *iov,
+           size_t           len)
+{
+        /*
+         * The ipnal assumes this is the same 'private' that was passed to
+         * lib_dispatch(), so we make the same assumption here.
+         */
+        kgmnal_tx_t *ktx=NULL;
+        int rc=0;
+        void * buf;
+        int buf_len = sizeof(ptl_hdr_t) + len;
+        int buf_size = 0;
+
+        LASSERT ((options & PTL_MD_KIOV) == 0);
+        
+        PROF_START(gmnal_send);
+
+
+        CDEBUG(D_NET, "sending %d bytes from %p to nid: 0x%Lx pid %d\n",
+               len, iov, nid, KGM_PORT_NUM);
+
+        /* ensure there is an available tx handle */
+
+        /* save transaction info to trans for later finalize and cleanup */
+        ktx = get_trans();
+        if (ktx == NULL) {
+                rc = -ENOMEM;
+                goto send_exit;
+        }
+
+        /* GM doesn't support vectored sends, so allocate a single buffer
+           to coalesce the header and data.  The buffer must also be
+           DMA-able, i.e. allocated from or registered with GM. */
+
+        if (buf_len <= MSG_LEN_SMALL) {
+                buf_size = MSG_SIZE_SMALL;
+        } else if (buf_len <= MSG_LEN_LARGE) {
+                buf_size = MSG_SIZE_LARGE;
+        } else {
+                printk("kgmnal: request exceeds TX MTU size (%d).\n",
+                       MSG_LEN_LARGE);
+                rc = -1;
+                goto send_exit;
+        }
+
+        buf = gm_dma_malloc(kgmnal_data.kgm_port, buf_len);
+        if (buf == NULL) {
+                rc = -ENOMEM;
+                goto send_exit;
+        }
+        memcpy(buf, hdr, sizeof(ptl_hdr_t));
+
+        if (len != 0)
+                lib_copy_iov2buf(((char *)buf) + sizeof (ptl_hdr_t), 
+                                 options, niov, iov, len);
+
+        ktx->ktx_nal = nal;
+        ktx->ktx_private = private;
+        ktx->ktx_cookie = cookie;
+        ktx->ktx_len = buf_len;
+        ktx->ktx_size = buf_size;
+        ktx->ktx_buffer = buf;
+        ktx->ktx_priority = GM_LOW_PRIORITY;
+        ktx->ktx_tgt_node = nid;
+        ktx->ktx_tgt_port_id = KGM_PORT_NUM;
+
+        CDEBUG(D_NET, "gm_send %d bytes (size %d) from %p to nid: 0x%Lx "
+               "pid %d pri %d\n", buf_len, buf_size, iov, nid, KGM_PORT_NUM,
+               GM_LOW_PRIORITY);
+
+        gm_send_with_callback(kgmnal_data.kgm_port, buf, buf_size,
+                              buf_len, GM_LOW_PRIORITY,
+                              nid, KGM_PORT_NUM,
+                              kgmnal_txhandler, ktx);
+
+        PROF_FINISH(gmnal_send);
+ send_exit:
+        return rc;
+}
+void
+kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        CERROR ("forwarding not implemented\n");
+}
+
+void
+kqswnal_fwd_callback (void *arg, int error)
+{
+        CERROR ("forwarding not implemented\n");
+}
+
+
+static inline void
+kgmnal_requeue_rx(kgmnal_rx_t *krx)
+{
+        gm_provide_receive_buffer(kgmnal_data.kgm_port, krx->krx_buffer,
+                                  krx->krx_size, krx->krx_priority);
+}
+
+/* Process a received portals packet */
+
+/* Receive Interrupt Handler */
+static void kgmnal_rx(kgmnal_data_t *kgm, unsigned long len, unsigned int size,
+                      void * buf, unsigned int pri)
+{
+        ptl_hdr_t  *hdr = buf;
+        kgmnal_rx_t krx;
+
+        CDEBUG(D_NET,"buf %p, len %ld\n", buf, len);
+
+        if ( len < sizeof( ptl_hdr_t ) ) {
+                /* XXX what's this for? */
+                if (kgm->kgm_shuttingdown)
+                        return;
+                CERROR("kgmnal: did not receive complete portal header, "
+                       "len= %ld", len);
+                gm_provide_receive_buffer(kgm->kgm_port, buf, size, pri);
+                return;
+        }
+
+        /* might want to use separate threads to handle receive */
+        krx.krx_buffer = buf;
+        krx.krx_len = len;
+        krx.krx_size = size;
+        krx.krx_priority = pri;
+
+        if ( hdr->dest_nid == kgmnal_lib.ni.nid ) {
+                PROF_START(lib_parse);
+                lib_parse(&kgmnal_lib, (ptl_hdr_t *)krx.krx_buffer, &krx);
+                PROF_FINISH(lib_parse);
+        } else if (kgmnal_ispeer(hdr->dest_nid)) {
+                /* should have gone direct to peer */
+                CERROR("dropping packet from 0x%llx to 0x%llx: target is "
+                       "a peer", hdr->src_nid, hdr->dest_nid);
+                kgmnal_requeue_rx(&krx);
+        } else {
+                /* forward to gateway */
+                CERROR("forwarding not implemented yet");
+                kgmnal_requeue_rx(&krx);
+        }
+
+        return;
+}
+
+
+static int kgmnal_recv(nal_cb_t     *nal,
+                      void         *private,
+                      lib_msg_t    *cookie,
+                      int           options,
+                      unsigned int  niov,
+                      lib_md_iov_t *iov,
+                      size_t        mlen,
+                      size_t        rlen)
+{
+        kgmnal_rx_t *krx = private;
+
+        LASSERT ((options & PTL_MD_KIOV) == 0);
+
+        CDEBUG(D_NET,"mlen=%d, rlen=%d\n", mlen, rlen);
+
+        /* What was actually received must be >= what sender claims to
+         * have sent.  This is an LASSERT, since lib-move doesn't
+         * check cb return code yet. */
+        LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen);
+        LASSERT (mlen <= rlen);
+
+        PROF_START(gmnal_recv);
+
+        if(mlen != 0) {
+                PROF_START(memcpy);
+                lib_copy_buf2iov (options, niov, iov, 
+                                  krx->krx_buffer + sizeof (ptl_hdr_t), mlen);
+                PROF_FINISH(memcpy);
+        }
+
+        PROF_START(lib_finalize);
+        lib_finalize(nal, private, cookie);
+        PROF_FINISH(lib_finalize);
+
+        kgmnal_requeue_rx(krx);
+
+        PROF_FINISH(gmnal_recv);
+
+        return rlen;
+}
+
+
+static void kgmnal_shutdown(void * none)
+{
+        CERROR("called\n");
+        return;
+}
+
+/*
+ * Set terminate and use alarm to wake up the recv thread.
+ */
+static void  recv_shutdown(kgmnal_data_t *kgm)
+{
+        gm_alarm_t alarm;
+
+        kgm->kgm_shuttingdown = 1;
+        gm_initialize_alarm(&alarm);
+        gm_set_alarm(kgm->kgm_port, &alarm, 1, kgmnal_shutdown, NULL);
+}
+
+int kgmnal_end(kgmnal_data_t *kgm)
+{
+
+        /* wait for sends to finish ? */
+        /* remove receive buffers */
+        /* shutdown receive thread */
+
+        recv_shutdown(kgm);
+
+        return 0;
+}
+
+/* Used only for the spinner */
+int kgmnal_recv_thread(void *arg)
+{
+        kgmnal_data_t *kgm = arg;
+
+        LASSERT(kgm != NULL);
+
+        kportal_daemonize("kgmnal_rx");
+        
+        while(1) {
+                gm_recv_event_t *e;
+                int priority = GM_LOW_PRIORITY;
+                if (kgm->kgm_shuttingdown)
+                        break;
+
+                e = gm_blocking_receive_no_spin(kgm->kgm_port);
+                if (e == NULL) {
+                        CERROR("gm_blocking_receive returned NULL\n");
+                        break;
+                }
+
+                switch(gm_ntohc(e->recv.type)) {
+                case GM_HIGH_RECV_EVENT:
+                        priority = GM_HIGH_PRIORITY;
+                        /* fall through */
+                case GM_RECV_EVENT:
+                        kgmnal_rx(kgm, gm_ntohl(e->recv.length),
+                                  gm_ntohc(e->recv.size),
+                                  gm_ntohp(e->recv.buffer), priority);
+                        break;
+                case GM_ALARM_EVENT:
+                        CERROR("received alarm");
+                        gm_unknown(kgm->kgm_port, e);
+                        break;
+                case GM_BAD_SEND_DETECTED_EVENT: /* ?? */
+                        CERROR("received bad send!\n");
+                        break;
+                default:
+                        gm_unknown(kgm->kgm_port, e);
+                }
+        }
+
+        CERROR("shutting down.\n");
+        return 0;
+}
+
+nal_cb_t kgmnal_lib = {
+        nal_data: &kgmnal_data,                /* NAL private data */
+        cb_send: kgmnal_send,
+        cb_recv: kgmnal_recv,
+        cb_read: kgmnal_read,
+        cb_write: kgmnal_write,
+        cb_malloc: kgmnal_malloc,
+        cb_free: kgmnal_free,
+        cb_printf: kgmnal_printf,
+        cb_cli: kgmnal_cli,
+        cb_sti: kgmnal_sti,
+        cb_dist: kgmnal_dist
+};
diff --git a/lustre/portals/knals/qswnal/.cvsignore b/lustre/portals/knals/qswnal/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lustre/portals/knals/qswnal/Makefile.am b/lustre/portals/knals/qswnal/Makefile.am
new file mode 100644 (file)
index 0000000..3eb4dd5
--- /dev/null
@@ -0,0 +1,17 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = kqswnal
+modulenet_DATA = kqswnal.o
+EXTRA_PROGRAMS = kqswnal
+
+
+#CFLAGS:= @KCFLAGS@ 
+#CPPFLAGS:=@KCPPFLAGS@
+DEFS =
+CPPFLAGS=@CPPFLAGS@ @with_quadrics@
+kqswnal_SOURCES = qswnal.c qswnal_cb.c qswnal.h
diff --git a/lustre/portals/knals/qswnal/qswnal.c b/lustre/portals/knals/qswnal/qswnal.c
new file mode 100644 (file)
index 0000000..1a8fb74
--- /dev/null
@@ -0,0 +1,608 @@
+/*
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * W. Marcus Miller - Based on ksocknal
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "qswnal.h"
+
+ptl_handle_ni_t                kqswnal_ni;
+nal_t                  kqswnal_api;
+kqswnal_data_t         kqswnal_data;
+
+kpr_nal_interface_t kqswnal_router_interface = {
+       kprni_nalid:    QSWNAL,
+       kprni_arg:      NULL,
+       kprni_fwd:      kqswnal_fwd_packet,
+};
+
+
+static int
+kqswnal_forward(nal_t   *nal,
+               int     id,
+               void    *args,  size_t args_len,
+               void    *ret,   size_t ret_len)
+{
+       kqswnal_data_t *k = nal->nal_data;
+       nal_cb_t       *nal_cb = k->kqn_cb;
+
+       LASSERT (nal == &kqswnal_api);
+       LASSERT (k == &kqswnal_data);
+       LASSERT (nal_cb == &kqswnal_lib);
+
+       lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */
+       return (PTL_OK);
+}
+
+static void
+kqswnal_lock (nal_t *nal, unsigned long *flags)
+{
+       kqswnal_data_t *k = nal->nal_data;
+       nal_cb_t       *nal_cb = k->kqn_cb;
+
+       LASSERT (nal == &kqswnal_api);
+       LASSERT (k == &kqswnal_data);
+       LASSERT (nal_cb == &kqswnal_lib);
+
+       nal_cb->cb_cli(nal_cb,flags);
+}
+
+static void
+kqswnal_unlock(nal_t *nal, unsigned long *flags)
+{
+       kqswnal_data_t *k = nal->nal_data;
+       nal_cb_t       *nal_cb = k->kqn_cb;
+
+       LASSERT (nal == &kqswnal_api);
+       LASSERT (k == &kqswnal_data);
+       LASSERT (nal_cb == &kqswnal_lib);
+
+       nal_cb->cb_sti(nal_cb,flags);
+}
+
+static int
+kqswnal_shutdown(nal_t *nal, int ni)
+{
+       CDEBUG (D_NET, "shutdown\n");
+
+       LASSERT (nal == &kqswnal_api);
+       return (0);
+}
+
+static void
+kqswnal_yield( nal_t *nal )
+{
+       CDEBUG (D_NET, "yield\n");
+
+       if (current->need_resched)
+               schedule();
+       return;
+}
+
+static nal_t *
+kqswnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
+            ptl_pid_t requested_pid)
+{
+       ptl_nid_t mynid = kqswnal_elanid2nid (kqswnal_data.kqn_elanid);
+       int       nnids = kqswnal_data.kqn_nnodes;
+
+        CDEBUG(D_NET, "calling lib_init with nid "LPX64" of %d\n", mynid, nnids);
+
+       lib_init(&kqswnal_lib, mynid, 0, nnids, ptl_size, ac_size);
+
+       return (&kqswnal_api);
+}
+
+int
+kqswnal_cmd (struct portal_ioctl_data *data, void *private)
+{
+       LASSERT (data != NULL);
+       
+       switch (data->ioc_nal_cmd) {
+       case NAL_CMD_REGISTER_MYNID:
+               CDEBUG (D_IOCTL, "setting NID offset to "LPX64" (was "LPX64")\n",
+                       data->ioc_nid - kqswnal_data.kqn_elanid,
+                       kqswnal_data.kqn_nid_offset);
+               kqswnal_data.kqn_nid_offset =
+                       data->ioc_nid - kqswnal_data.kqn_elanid;
+               kqswnal_lib.ni.nid = data->ioc_nid;
+               return (0);
+               
+       default:
+               return (-EINVAL);
+       }
+}
+
+void __exit
+kqswnal_finalise (void)
+{
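+	/* Tear down only what was actually set up: each case falls through
+	 * to undo the earlier initialisation stages as well */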
+       switch (kqswnal_data.kqn_init)
+       {
+       default:
+               LASSERT (0);
+
+       case KQN_INIT_ALL:
+               PORTAL_SYMBOL_UNREGISTER (kqswnal_ni);
+               /* fall through */
+
+       case KQN_INIT_PTL:
+               PtlNIFini (kqswnal_ni);
+               lib_fini (&kqswnal_lib);
+               /* fall through */
+
+       case KQN_INIT_DATA:
+               break;
+
+       case KQN_INIT_NOTHING:
+               return;
+       }
+
+       /**********************************************************************/
+	/* Make the router stop calling into me and fail any further call-ins */
+       kpr_shutdown (&kqswnal_data.kqn_router);
+
+       /**********************************************************************/
+       /* flag threads to terminate, wake them and wait for them to die */
+
+       kqswnal_data.kqn_shuttingdown = 1;
+       wake_up_all (&kqswnal_data.kqn_sched_waitq);
+
+       while (atomic_read (&kqswnal_data.kqn_nthreads) != 0) {
+               CDEBUG(D_NET, "waiting for %d threads to terminate\n",
+                      atomic_read (&kqswnal_data.kqn_nthreads));
+               set_current_state (TASK_UNINTERRUPTIBLE);
+               schedule_timeout (HZ);
+       }
+
+       /**********************************************************************/
+       /* close elan comms */
+
+       if (kqswnal_data.kqn_eprx_small != NULL)
+               ep_remove_large_rcvr (kqswnal_data.kqn_eprx_small);
+
+       if (kqswnal_data.kqn_eprx_large != NULL)
+               ep_remove_large_rcvr (kqswnal_data.kqn_eprx_large);
+
+       if (kqswnal_data.kqn_eptx != NULL)
+               ep_free_large_xmtr (kqswnal_data.kqn_eptx);
+
+       /**********************************************************************/
+       /* No more threads.  No more portals, router or comms callbacks!
+        * I control the horizontals and the verticals...
+        */
+
+       /**********************************************************************/
+       /* Complete any blocked forwarding packets with error
+        */
+
+       while (!list_empty (&kqswnal_data.kqn_idletxd_fwdq))
+       {
+               kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next,
+                                                 kpr_fwd_desc_t, kprfd_list);
+               list_del (&fwd->kprfd_list);
+               kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH);
+       }
+
+       while (!list_empty (&kqswnal_data.kqn_delayedfwds))
+       {
+               kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_delayedfwds.next,
+                                                 kpr_fwd_desc_t, kprfd_list);
+               list_del (&fwd->kprfd_list);
+               kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH);
+       }
+
+       /**********************************************************************/
+       /* Wait for router to complete any packets I sent her
+        */
+
+       kpr_deregister (&kqswnal_data.kqn_router);
+
+
+       /**********************************************************************/
+       /* Unmap message buffers and free all descriptors and buffers
+        */
+
+       if (kqswnal_data.kqn_eprxdmahandle != NULL)
+       {
+               elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+                                 kqswnal_data.kqn_eprxdmahandle, 0,
+                                 KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
+                                 KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE);
+
+               elan3_dma_release(kqswnal_data.kqn_epdev->DmaState,
+                                 kqswnal_data.kqn_eprxdmahandle);
+       }
+
+       if (kqswnal_data.kqn_eptxdmahandle != NULL)
+       {
+               elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+                                 kqswnal_data.kqn_eptxdmahandle, 0,
+                                 KQSW_NTXMSGPAGES * (KQSW_NTXMSGS +
+                                                     KQSW_NNBLK_TXMSGS));
+
+               elan3_dma_release(kqswnal_data.kqn_epdev->DmaState,
+                                 kqswnal_data.kqn_eptxdmahandle);
+       }
+
+       if (kqswnal_data.kqn_txds != NULL)
+       {
+               int   i;
+
+               for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++)
+               {
+                       kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
+
+                       if (ktx->ktx_buffer != NULL)
+                               PORTAL_FREE(ktx->ktx_buffer,
+                                           KQSW_TX_BUFFER_SIZE);
+               }
+
+               PORTAL_FREE(kqswnal_data.kqn_txds,
+                           sizeof (kqswnal_tx_t) * (KQSW_NTXMSGS +
+                                                    KQSW_NNBLK_TXMSGS));
+       }
+
+       if (kqswnal_data.kqn_rxds != NULL)
+       {
+               int   i;
+               int   j;
+
+               for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+               {
+                       kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+                       for (j = 0; j < krx->krx_npages; j++)
+                               if (krx->krx_pages[j] != NULL)
+                                       __free_page (krx->krx_pages[j]);
+               }
+
+               PORTAL_FREE(kqswnal_data.kqn_rxds,
+                           sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL +
+                                                   KQSW_NRXMSGS_LARGE));
+       }
+
+       /* resets flags, pointers to NULL etc */
+       memset(&kqswnal_data, 0, sizeof (kqswnal_data));
+
+       CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory));
+
+       printk (KERN_INFO "Routing QSW NAL unloaded (final mem %d)\n",
+                atomic_read(&portal_kmemory));
+}
+
+static int __init
+kqswnal_initialise (void)
+{
+       ELAN3_DMA_REQUEST dmareq;
+       int               rc;
+       int               i;
+       int               elan_page_idx;
+       int               pkmem = atomic_read(&portal_kmemory);
+
+       LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING);
+
+       CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory));
+
+       kqswnal_api.forward  = kqswnal_forward;
+       kqswnal_api.shutdown = kqswnal_shutdown;
+       kqswnal_api.yield    = kqswnal_yield;
+       kqswnal_api.validate = NULL;            /* our api validate is a NOOP */
+       kqswnal_api.lock     = kqswnal_lock;
+       kqswnal_api.unlock   = kqswnal_unlock;
+       kqswnal_api.nal_data = &kqswnal_data;
+
+       kqswnal_lib.nal_data = &kqswnal_data;
+
+       /* ensure all pointers NULL etc */
+       memset (&kqswnal_data, 0, sizeof (kqswnal_data));
+
+       kqswnal_data.kqn_cb = &kqswnal_lib;
+
+       INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds);
+       spin_lock_init (&kqswnal_data.kqn_idletxd_lock);
+       init_waitqueue_head (&kqswnal_data.kqn_idletxd_waitq);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_idletxd_fwdq);
+
+       INIT_LIST_HEAD (&kqswnal_data.kqn_delayedfwds);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_delayedtxds);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds);
+
+       spin_lock_init (&kqswnal_data.kqn_sched_lock);
+       init_waitqueue_head (&kqswnal_data.kqn_sched_waitq);
+
+       spin_lock_init (&kqswnal_data.kqn_statelock);
+
+       /* pointers/lists/locks initialised */
+       kqswnal_data.kqn_init = KQN_INIT_DATA;
+
+       /**********************************************************************/
+       /* Find the first Elan device */
+
+       kqswnal_data.kqn_epdev = ep_device (0);
+       if (kqswnal_data.kqn_epdev == NULL)
+       {
+               CERROR ("Can't get elan device 0\n");
+               return (-ENOMEM);
+       }
+
+       kqswnal_data.kqn_nid_offset = 0;
+       kqswnal_data.kqn_nnodes     = ep_numnodes (kqswnal_data.kqn_epdev);
+       kqswnal_data.kqn_elanid     = ep_nodeid (kqswnal_data.kqn_epdev);
+       
+       /**********************************************************************/
+       /* Get the transmitter */
+
+       kqswnal_data.kqn_eptx = ep_alloc_large_xmtr (kqswnal_data.kqn_epdev);
+       if (kqswnal_data.kqn_eptx == NULL)
+       {
+               CERROR ("Can't allocate transmitter\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Get the receivers */
+
+       kqswnal_data.kqn_eprx_small = ep_install_large_rcvr (kqswnal_data.kqn_epdev,
+                                                            EP_SVC_LARGE_PORTALS_SMALL,
+                                                            KQSW_EP_ENVELOPES_SMALL);
+       if (kqswnal_data.kqn_eprx_small == NULL)
+       {
+               CERROR ("Can't install small msg receiver\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       kqswnal_data.kqn_eprx_large = ep_install_large_rcvr (kqswnal_data.kqn_epdev,
+                                                            EP_SVC_LARGE_PORTALS_LARGE,
+                                                            KQSW_EP_ENVELOPES_LARGE);
+       if (kqswnal_data.kqn_eprx_large == NULL)
+       {
+               CERROR ("Can't install large msg receiver\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Reserve Elan address space for transmit buffers */
+
+        dmareq.Waitfn   = DDI_DMA_SLEEP;
+        dmareq.ElanAddr = (E3_Addr) 0;
+        dmareq.Attr     = PTE_LOAD_LITTLE_ENDIAN;
+        dmareq.Perm     = ELAN_PERM_REMOTEREAD;
+
+       rc = elan3_dma_reserve(kqswnal_data.kqn_epdev->DmaState,
+                             KQSW_NTXMSGPAGES*(KQSW_NTXMSGS+KQSW_NNBLK_TXMSGS),
+                             &dmareq, &kqswnal_data.kqn_eptxdmahandle);
+       if (rc != DDI_SUCCESS)
+       {
+               CERROR ("Can't reserve rx dma space\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Reserve Elan address space for receive buffers */
+
+        dmareq.Waitfn   = DDI_DMA_SLEEP;
+        dmareq.ElanAddr = (E3_Addr) 0;
+        dmareq.Attr     = PTE_LOAD_LITTLE_ENDIAN;
+        dmareq.Perm     = ELAN_PERM_REMOTEWRITE;
+
+       rc = elan3_dma_reserve (kqswnal_data.kqn_epdev->DmaState,
+                               KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
+                               KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE,
+                               &dmareq, &kqswnal_data.kqn_eprxdmahandle);
+       if (rc != DDI_SUCCESS)
+       {
+               CERROR ("Can't reserve rx dma space\n");
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /**********************************************************************/
+       /* Allocate/Initialise transmit descriptors */
+
+       PORTAL_ALLOC(kqswnal_data.kqn_txds,
+                    sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+       if (kqswnal_data.kqn_txds == NULL)
+       {
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       /* clear flags, null pointers etc */
+       memset(kqswnal_data.kqn_txds, 0,
+              sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+       for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++)
+       {
+               int           premapped_pages;
+               kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
+               int           basepage = i * KQSW_NTXMSGPAGES;
+
+               PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
+               if (ktx->ktx_buffer == NULL)
+               {
+                       kqswnal_finalise ();
+                       return (-ENOMEM);
+               }
+
+               /* Map pre-allocated buffer NOW, to save latency on transmit */
+               premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer,
+                                                       KQSW_TX_BUFFER_SIZE);
+
+               elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+                                      kqswnal_data.kqn_eptxdmahandle,
+                                      ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE,
+                                      basepage, &ktx->ktx_ebuffer);
+
+               ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */
+               ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */
+
+               if (i < KQSW_NTXMSGS)
+                       ktx->ktx_idle = &kqswnal_data.kqn_idletxds;
+               else
+                       ktx->ktx_idle = &kqswnal_data.kqn_nblk_idletxds;
+
+               list_add_tail (&ktx->ktx_list, ktx->ktx_idle);
+       }
+
+       /**********************************************************************/
+       /* Allocate/Initialise receive descriptors */
+
+       PORTAL_ALLOC (kqswnal_data.kqn_rxds,
+                     sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE));
+       if (kqswnal_data.kqn_rxds == NULL)
+       {
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       memset(kqswnal_data.kqn_rxds, 0, /* clear flags, null pointers etc */
+              sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL+KQSW_NRXMSGS_LARGE));
+
+       elan_page_idx = 0;
+       for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+       {
+               E3_Addr       elanaddr;
+               int           j;
+               kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+               if (i < KQSW_NRXMSGS_SMALL)
+               {
+                       krx->krx_npages = KQSW_NRXMSGPAGES_SMALL;
+                       krx->krx_eprx   = kqswnal_data.kqn_eprx_small;
+               }
+               else
+               {
+                       krx->krx_npages = KQSW_NRXMSGPAGES_LARGE;
+                       krx->krx_eprx   = kqswnal_data.kqn_eprx_large;
+               }
+
+               LASSERT (krx->krx_npages > 0);
+               for (j = 0; j < krx->krx_npages; j++)
+               {
+                       krx->krx_pages[j] = alloc_page(GFP_KERNEL);
+                       if (krx->krx_pages[j] == NULL)
+                       {
+                               kqswnal_finalise ();
+                               return (-ENOMEM);
+                       }
+
+                       LASSERT(page_address(krx->krx_pages[j]) != NULL);
+
+                       elan3_dvma_kaddr_load(kqswnal_data.kqn_epdev->DmaState,
+                                             kqswnal_data.kqn_eprxdmahandle,
+                                             page_address(krx->krx_pages[j]),
+                                             PAGE_SIZE, elan_page_idx,
+                                             &elanaddr);
+                       elan_page_idx++;
+
+                       if (j == 0)
+                               krx->krx_elanaddr = elanaddr;
+
+			/* NB we assume the pages map to contiguous Elan addresses */
+                       LASSERT (elanaddr == krx->krx_elanaddr + j * PAGE_SIZE);
+               }
+       }
+       LASSERT (elan_page_idx ==
+                (KQSW_NRXMSGS_SMALL * KQSW_NRXMSGPAGES_SMALL) +
+                (KQSW_NRXMSGS_LARGE * KQSW_NRXMSGPAGES_LARGE));
+
+       /**********************************************************************/
+       /* Network interface ready to initialise */
+
+        rc = PtlNIInit(kqswnal_init, 32, 4, 0, &kqswnal_ni);
+        if (rc != 0)
+       {
+               CERROR ("PtlNIInit failed %d\n", rc);
+               kqswnal_finalise ();
+               return (-ENOMEM);
+       }
+
+       kqswnal_data.kqn_init = KQN_INIT_PTL;
+
+       /**********************************************************************/
+       /* Queue receives, now that it's OK to run their completion callbacks */
+
+       for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+       {
+               kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+               /* NB this enqueue can allocate/sleep (attr == 0) */
+               rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx,
+                                     krx->krx_elanaddr,
+                                     krx->krx_npages * PAGE_SIZE, 0);
+               if (rc != 0)
+               {
+                       CERROR ("failed ep_queue_receive %d\n", rc);
+                       kqswnal_finalise ();
+                       return (-ENOMEM);
+               }
+       }
+
+       /**********************************************************************/
+       /* Spawn scheduling threads */
+       for (i = 0; i < smp_num_cpus; i++)
+       {
+               rc = kqswnal_thread_start (kqswnal_scheduler, NULL);
+               if (rc != 0)
+               {
+                       CERROR ("failed to spawn scheduling thread: %d\n", rc);
+                       kqswnal_finalise ();
+                       return (rc);
+               }
+       }
+
+       /**********************************************************************/
+       /* Connect to the router */
+	rc = kpr_register (&kqswnal_data.kqn_router, &kqswnal_router_interface);
+	if (rc != 0)
+		CDEBUG(D_NET, "Can't initialise routing interface (rc = %d): not routing\n", rc);
+
+       rc = kportal_nal_register (QSWNAL, &kqswnal_cmd, NULL);
+       if (rc != 0) {
+               CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+               kqswnal_finalise ();
+               return (rc);
+       }
+
+       PORTAL_SYMBOL_REGISTER(kqswnal_ni);
+       kqswnal_data.kqn_init = KQN_INIT_ALL;
+
+       printk(KERN_INFO "Routing QSW NAL loaded on node %d of %d "
+              "(Routing %s, initial mem %d)\n", 
+              kqswnal_data.kqn_elanid, kqswnal_data.kqn_nnodes,
+              kpr_routing (&kqswnal_data.kqn_router) ? "enabled" : "disabled",
+              pkmem);
+
+       return (0);
+}
+
+
+MODULE_AUTHOR("W. Marcus Miller <marcusm@llnl.gov>");
+MODULE_DESCRIPTION("Kernel Quadrics Switch NAL v1.00");
+MODULE_LICENSE("GPL");
+
+module_init (kqswnal_initialise);
+module_exit (kqswnal_finalise);
+
+EXPORT_SYMBOL (kqswnal_ni);
diff --git a/lustre/portals/knals/qswnal/qswnal.h b/lustre/portals/knals/qswnal/qswnal.h
new file mode 100644 (file)
index 0000000..88ab74f
--- /dev/null
@@ -0,0 +1,270 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines. 
+ *
+ */
+
+#ifndef _QSWNAL_H
+#define _QSWNAL_H
+#define EXPORT_SYMTAB
+
+#ifdef PROPRIETARY_ELAN
+# include <qsw/kernel.h>
+#else
+# include <qsnet/kernel.h>
+#endif
+
+#undef printf                                   /* nasty QSW #define */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <elan3/elanregs.h>
+#include <elan3/elandev.h>
+#include <elan3/elanvp.h>
+#include <elan3/elan3mmu.h>
+#include <elan3/elanctxt.h>
+#include <elan3/elandebug.h>
+#include <elan3/urom_addrs.h>
+#include <elan3/busops.h>
+#include <elan3/kcomm.h>
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/locks.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_QSWNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#define KQSW_CHECKSUM  0
+#if KQSW_CHECKSUM
+typedef unsigned long kqsw_csum_t;
+#define KQSW_CSUM_SIZE (2 * sizeof (kqsw_csum_t))
+#else
+#define KQSW_CSUM_SIZE 0
+#endif
+#define KQSW_HDR_SIZE  (sizeof (ptl_hdr_t) + KQSW_CSUM_SIZE)
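+/* Editorial sizing note (illustrative, not from the original source): with
+ * KQSW_CHECKSUM enabled on a 64-bit kernel, KQSW_CSUM_SIZE is
+ * 2 * sizeof(unsigned long) = 16 bytes (one checksum for the header, one for
+ * the payload), so KQSW_HDR_SIZE grows by 16 bytes; with checksums disabled
+ * it is just sizeof(ptl_hdr_t). */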
+
+/*
+ *  Elan NAL
+ */
+#define EP_SVC_LARGE_PORTALS_SMALL     (0x10)  /* Portals over elan port number (small payloads) */
+#define EP_SVC_LARGE_PORTALS_LARGE     (0x11)  /* Portals over elan port number (large payloads) */
+/* NB small/large message sizes are GLOBAL constants */
+
+/*
+ * Performance Tuning defines
+ * NB no mention of PAGE_SIZE for interoperability
+ */
+#if PTL_LARGE_MTU
+# define KQSW_MAXPAYLOAD               (256<<10) /* biggest message this NAL will cope with */
+#else
+# define KQSW_MAXPAYLOAD               (64<<10) /* biggest message this NAL will cope with */
+#endif
+
+#define KQSW_SMALLPAYLOAD              ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */
+
+#define KQSW_TX_MAXCONTIG              (1<<10) /* largest payload that gets made contiguous on transmit */
+
+#define KQSW_NTXMSGS                   8       /* # normal transmit messages */
+#define KQSW_NNBLK_TXMSGS              128     /* # reserved transmit messages if can't block */
+
+#define KQSW_NRXMSGS_LARGE             64      /* # large receive buffers */
+#define KQSW_EP_ENVELOPES_LARGE        128     /* # large ep envelopes */
+
+#define KQSW_NRXMSGS_SMALL             256     /* # small receive buffers */
+#define KQSW_EP_ENVELOPES_SMALL                2048    /* # small ep envelopes */
+
+#define KQSW_RESCHED                   100     /* # busy loops that forces scheduler to yield */
+
+/*
+ * derived constants
+ */
+
+#define KQSW_TX_BUFFER_SIZE    (KQSW_HDR_SIZE + KQSW_TX_MAXCONTIG)
+/* The pre-allocated tx buffer (hdr + small payload) */
+
+#define KQSW_NTXMSGPAGES       (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(KQSW_MAXPAYLOAD) + 1)
+/* Reserve elan address space for pre-allocated and pre-mapped transmit
+ * buffer and a full payload too.  Extra pages allow for page alignment */
+
+#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD))
+/* receive hdr/payload always contiguous and page aligned */
+#define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE)
+
+#define KQSW_NRXMSGPAGES_LARGE (btopr(KQSW_HDR_SIZE + KQSW_MAXPAYLOAD))
+/* receive hdr/payload always contiguous and page aligned */
+#define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE)
+/* biggest complete packet we can receive (or transmit) */
+
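+/* Editorial sizing example (illustrative, assuming 4K pages, checksums off,
+ * the 64K MTU and sizeof(ptl_hdr_t) well under a page): small receive
+ * buffers take btopr(4K) = 1 page, large ones btopr(hdr + 64K) = 17 pages,
+ * and each tx descriptor reserves btopr(hdr + 1K) + 1 + btopr(64K) + 1 = 19
+ * pages of Elan address space. */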
+
+typedef struct 
+{
+        struct list_head krx_list;              /* enqueue -> thread */
+        EP_RCVR                *krx_eprx;              /* port to post receives to */
+        EP_RXD          *krx_rxd;               /* receive descriptor (for repost) */
+        E3_Addr          krx_elanaddr;          /* Elan address of buffer (contiguous in elan vm) */
+        int              krx_npages;            /* # pages in receive buffer */
+        int              krx_nob;               /* Number Of Bytes received into buffer */
+        kpr_fwd_desc_t   krx_fwd;               /* embedded forwarding descriptor */
+        struct page     *krx_pages[KQSW_NRXMSGPAGES_LARGE]; /* pages allocated */
+        struct iovec     krx_iov[KQSW_NRXMSGPAGES_LARGE]; /* iovec for forwarding */
+}  kqswnal_rx_t;
+
+typedef struct
+{
+        struct list_head  ktx_list;             /* enqueue idle/delayed */
+        struct list_head *ktx_idle;             /* where to put when idle */
+        char              ktx_state;            /* What I'm doing */
+        uint32_t          ktx_basepage;         /* page offset in reserved elan tx vaddrs for mapping pages */
+        int               ktx_npages;           /* pages reserved for mapping messages */
+        int               ktx_nmappedpages;     /* # pages mapped for current message */
+        EP_IOVEC         ktx_iov[EP_MAXFRAG];  /* msg frags (elan vaddrs) */
+        int               ktx_niov;             /* # message frags */
+        int               ktx_port;             /* destination ep port */
+        ptl_nid_t         ktx_nid;              /* destination node */
+        void             *ktx_args[2];          /* completion passthru */
+        E3_Addr                  ktx_ebuffer;          /* elan address of ktx_buffer */
+        char             *ktx_buffer;           /* pre-allocated contiguous buffer for hdr + small payloads */
+} kqswnal_tx_t;
+
+#define KTX_IDLE       0                       /* MUST BE ZERO (so zeroed ktx is idle) */
+#define KTX_SENDING    1                       /* local send */
+#define KTX_FORWARDING 2                       /* routing a packet */
+
+typedef struct
+{
+        char               kqn_init;            /* what's been initialised */
+        char               kqn_shuttingdown;    /* I'm trying to shut down */
+        atomic_t           kqn_nthreads;        /* # threads still running */
+
+        kqswnal_rx_t      *kqn_rxds;            /* all the receive descriptors */
+        kqswnal_tx_t      *kqn_txds;            /* all the transmit descriptors */
+
+        struct list_head   kqn_idletxds;        /* transmit descriptors free to use */
+        struct list_head   kqn_nblk_idletxds;   /* reserve of tx descriptors for non-blocking senders */
+        spinlock_t         kqn_idletxd_lock;    /* serialise idle txd access */
+        wait_queue_head_t  kqn_idletxd_waitq;   /* sender blocks here waiting for idle txd */
+        struct list_head   kqn_idletxd_fwdq;    /* forwarded packets block here waiting for idle txd */
+        
+        spinlock_t         kqn_sched_lock;      /* serialise packet schedulers */
+        wait_queue_head_t  kqn_sched_waitq;     /* scheduler blocks here */
+
+        struct list_head   kqn_readyrxds;       /* rxds full of data */
+        struct list_head   kqn_delayedfwds;     /* delayed forwards */
+        struct list_head   kqn_delayedtxds;     /* delayed transmits */
+
+        spinlock_t         kqn_statelock;       /* cb_cli/cb_sti */
+        nal_cb_t          *kqn_cb;              /* -> kqswnal_lib */
+       EP_DEV            *kqn_epdev;           /* elan device */
+       EP_XMTR           *kqn_eptx;            /* elan transmitter */
+       EP_RCVR           *kqn_eprx_small;      /* elan receiver (small messages) */
+        EP_RCVR                  *kqn_eprx_large;      /* elan receiver (large messages) */
+       ELAN3_DMA_HANDLE  *kqn_eptxdmahandle;   /* elan reserved tx vaddrs */
+       ELAN3_DMA_HANDLE  *kqn_eprxdmahandle;   /* elan reserved rx vaddrs */
+        kpr_router_t       kqn_router;          /* connection to Kernel Portals Router module */
+
+        ptl_nid_t          kqn_nid_offset;      /* this cluster's NID offset */
+        int                kqn_nnodes;          /* this cluster's size */
+        int                kqn_elanid;          /* this node's elan ID */
+}  kqswnal_data_t;
+
+/* kqn_init state */
+#define KQN_INIT_NOTHING       0               /* MUST BE ZERO so zeroed state is initialised OK */
+#define KQN_INIT_DATA          1
+#define KQN_INIT_PTL           2
+#define KQN_INIT_ALL           3
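+/* Editorial note: kqn_init records how far initialisation has progressed
+ * (NOTHING -> DATA -> PTL -> ALL); kqswnal_initialise() advances it and
+ * calls kqswnal_finalise() on any failure, which presumably tears down only
+ * the state that was actually set up. */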
+
+extern nal_cb_t        kqswnal_lib;
+extern nal_t           kqswnal_api;
+extern kqswnal_data_t  kqswnal_data;
+
+extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg);
+extern void kqswnal_rxhandler(EP_RXD *rxd);
+extern int kqswnal_scheduler (void *);
+extern void kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+
+static inline ptl_nid_t
+kqswnal_elanid2nid (int elanid) 
+{
+        return (kqswnal_data.kqn_nid_offset + elanid);
+}
+
+static inline int
+kqswnal_nid2elanid (ptl_nid_t nid) 
+{
+        /* not in this cluster? */
+        if (nid < kqswnal_data.kqn_nid_offset ||
+            nid >= kqswnal_data.kqn_nid_offset + kqswnal_data.kqn_nnodes)
+                return (-1);
+        
+        return (nid - kqswnal_data.kqn_nid_offset);
+}
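+/* Editorial example (hypothetical values): with kqn_nid_offset == 0x1000
+ * and kqn_nnodes == 64, kqswnal_elanid2nid(3) == 0x1003, while
+ * kqswnal_nid2elanid(0x2000) == -1 because that NID lies outside this
+ * cluster and must be reached via the router. */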
+
+static inline void
+kqswnal_requeue_rx (kqswnal_rx_t *krx)
+{
+        ep_requeue_receive (krx->krx_rxd, kqswnal_rxhandler, krx,
+                            krx->krx_elanaddr, krx->krx_npages * PAGE_SIZE);
+}
+
+static inline int
+kqswnal_pages_spanned (void *base, int nob)
+{
+        unsigned long first_page = ((unsigned long)base) >> PAGE_SHIFT;
+        unsigned long last_page  = (((unsigned long)base) + (nob - 1)) >> PAGE_SHIFT;
+
+        LASSERT (last_page >= first_page);      /* can't wrap address space */
+        return (last_page - first_page + 1);
+}
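+/* Editorial example (assuming 4K pages): a 100-byte buffer that starts 60
+ * bytes before a page boundary spans 2 pages (first_page and last_page
+ * differ by one), while a buffer wholly within one page spans 1. */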
+
+#if KQSW_CHECKSUM
+static inline kqsw_csum_t kqsw_csum (kqsw_csum_t sum, void *base, int nob)
+{
+        unsigned char *ptr = (unsigned char *)base;
+        
+        while (nob-- > 0)
+                sum += *ptr++;
+        
+        return (sum);
+}
+#endif
+
+#endif /* _QSWNAL_H */
diff --git a/lustre/portals/knals/qswnal/qswnal_cb.c b/lustre/portals/knals/qswnal/qswnal_cb.c
new file mode 100644 (file)
index 0000000..3b47a25
--- /dev/null
@@ -0,0 +1,1239 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * W. Marcus Miller - Based on ksocknal
+ *
+ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "qswnal.h"
+
+atomic_t kqswnal_packets_launched;
+atomic_t kqswnal_packets_transmitted;
+atomic_t kqswnal_packets_received;
+
+
+/*
+ *  LIB functions follow
+ *
+ */
+static int
+kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
+             size_t len)
+{
+        CDEBUG (D_NET, LPX64": reading "LPSZ" bytes from %p -> %p\n",
+                nal->ni.nid, len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+
+        return (0);
+}
+
+static int
+kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
+              size_t len)
+{
+        CDEBUG (D_NET, LPX64": writing "LPSZ" bytes from %p -> %p\n",
+                nal->ni.nid, len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+
+        return (0);
+}
+
+static void *
+kqswnal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+        return (buf);
+}
+
+static void
+kqswnal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+static void
+kqswnal_printf (nal_cb_t * nal, const char *fmt, ...)
+{
+        va_list ap;
+        char msg[256];
+
+        va_start (ap, fmt);
+        vsnprintf (msg, sizeof (msg), fmt, ap);        /* sprintf safely */
+        va_end (ap);
+
+        msg[sizeof (msg) - 1] = 0;                /* ensure terminated */
+
+        CDEBUG (D_NET, "%s", msg);
+}
+
+
+static void
+kqswnal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        kqswnal_data_t *data= nal->nal_data;
+
+        spin_lock_irqsave(&data->kqn_statelock, *flags);
+}
+
+
+static void
+kqswnal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        kqswnal_data_t *data= nal->nal_data;
+
+        spin_unlock_irqrestore(&data->kqn_statelock, *flags);
+}
+
+
+static int
+kqswnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        if (nid == nal->ni.nid)
+                *dist = 0;                      /* it's me */
+        else if (kqswnal_nid2elanid (nid) >= 0)
+                *dist = 1;                      /* it's my peer */
+        else
+                *dist = 2;                      /* via router */
+        return (0);
+}
+
+void
+kqswnal_unmap_tx (kqswnal_tx_t *ktx)
+{
+        if (ktx->ktx_nmappedpages == 0)
+                return;
+
+        CDEBUG (D_NET, "%p[%d] unloading pages %d for %d\n",
+                ktx, ktx->ktx_niov, ktx->ktx_basepage, ktx->ktx_nmappedpages);
+
+        LASSERT (ktx->ktx_nmappedpages <= ktx->ktx_npages);
+        LASSERT (ktx->ktx_basepage + ktx->ktx_nmappedpages <=
+                 kqswnal_data.kqn_eptxdmahandle->NumDvmaPages);
+
+        elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+                          kqswnal_data.kqn_eptxdmahandle,
+                          ktx->ktx_basepage, ktx->ktx_nmappedpages);
+        ktx->ktx_nmappedpages = 0;
+}
+
+int
+kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
+{
+        int       nfrags    = ktx->ktx_niov;
+        const int maxfrags  = sizeof (ktx->ktx_iov)/sizeof (ktx->ktx_iov[0]);
+        int       nmapped   = ktx->ktx_nmappedpages;
+        int       maxmapped = ktx->ktx_npages;
+        uint32_t  basepage  = ktx->ktx_basepage + nmapped;
+        char     *ptr;
+        
+        LASSERT (nmapped <= maxmapped);
+        LASSERT (nfrags <= maxfrags);
+        LASSERT (niov > 0);
+        LASSERT (nob > 0);
+        
+        do {
+                int  fraglen = kiov->kiov_len;
+
+                /* nob exactly spans the iovs */
+                LASSERT (fraglen <= nob);
+                /* each frag fits in a page */
+                LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE);
+
+                nmapped++;
+                if (nmapped > maxmapped) {
+                        CERROR("Can't map message in %d pages (max %d)\n",
+                               nmapped, maxmapped);
+                        return (-EMSGSIZE);
+                }
+
+                if (nfrags == maxfrags) {
+                        CERROR("Message too fragmented in Elan VM (max %d frags)\n",
+                               maxfrags);
+                        return (-EMSGSIZE);
+                }
+
+                /* XXX this is really crap, but we'll have to kmap until
+                 * EKC has a page (rather than vaddr) mapping interface */
+
+                ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+
+                CDEBUG(D_NET,
+                       "%p[%d] loading %p for %d, page %d, %d total\n",
+                        ktx, nfrags, ptr, fraglen, basepage, nmapped);
+
+                elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+                                       kqswnal_data.kqn_eptxdmahandle,
+                                       ptr, fraglen,
+                                       basepage, &ktx->ktx_iov[nfrags].Base);
+
+                kunmap (kiov->kiov_page);
+                
+                /* keep in loop for failure case */
+                ktx->ktx_nmappedpages = nmapped;
+
+                if (nfrags > 0 &&                /* previous frag mapped */
+                    ktx->ktx_iov[nfrags].Base == /* contiguous with this one */
+                    (ktx->ktx_iov[nfrags-1].Base + ktx->ktx_iov[nfrags-1].Len))
+                        /* just extend previous */
+                        ktx->ktx_iov[nfrags - 1].Len += fraglen;
+                else {
+                        ktx->ktx_iov[nfrags].Len = fraglen;
+                        nfrags++;                /* new frag */
+                }
+
+                basepage++;
+                kiov++;
+                niov--;
+                nob -= fraglen;
+
+                /* iov must not run out before end of data */
+                LASSERT (nob == 0 || niov > 0);
+
+        } while (nob > 0);
+
+        ktx->ktx_niov = nfrags;
+        CDEBUG (D_NET, "%p got %d frags over %d pages\n",
+                ktx, ktx->ktx_niov, ktx->ktx_nmappedpages);
+
+        return (0);
+}
+
+int
+kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
+{
+        int       nfrags    = ktx->ktx_niov;
+        const int maxfrags  = sizeof (ktx->ktx_iov)/sizeof (ktx->ktx_iov[0]);
+        int       nmapped   = ktx->ktx_nmappedpages;
+        int       maxmapped = ktx->ktx_npages;
+        uint32_t  basepage  = ktx->ktx_basepage + nmapped;
+
+        LASSERT (nmapped <= maxmapped);
+        LASSERT (nfrags <= maxfrags);
+        LASSERT (niov > 0);
+        LASSERT (nob > 0);
+
+        do {
+                int  fraglen = iov->iov_len;
+                long npages  = kqswnal_pages_spanned (iov->iov_base, fraglen);
+
+                /* nob exactly spans the iovs */
+                LASSERT (fraglen <= nob);
+                
+                nmapped += npages;
+                if (nmapped > maxmapped) {
+                        CERROR("Can't map message in %d pages (max %d)\n",
+                               nmapped, maxmapped);
+                        return (-EMSGSIZE);
+                }
+
+                if (nfrags == maxfrags) {
+                        CERROR("Message too fragmented in Elan VM (max %d frags)\n",
+                               maxfrags);
+                        return (-EMSGSIZE);
+                }
+
+                CDEBUG(D_NET,
+                       "%p[%d] loading %p for %d, pages %d for %ld, %d total\n",
+                        ktx, nfrags, iov->iov_base, fraglen, basepage, npages,
+                        nmapped);
+
+                elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+                                       kqswnal_data.kqn_eptxdmahandle,
+                                       iov->iov_base, fraglen,
+                                       basepage, &ktx->ktx_iov[nfrags].Base);
+                /* keep in loop for failure case */
+                ktx->ktx_nmappedpages = nmapped;
+
+                if (nfrags > 0 &&                /* previous frag mapped */
+                    ktx->ktx_iov[nfrags].Base == /* contiguous with this one */
+                    (ktx->ktx_iov[nfrags-1].Base + ktx->ktx_iov[nfrags-1].Len))
+                        /* just extend previous */
+                        ktx->ktx_iov[nfrags - 1].Len += fraglen;
+                else {
+                        ktx->ktx_iov[nfrags].Len = fraglen;
+                        nfrags++;                /* new frag */
+                }
+
+                basepage += npages;
+                iov++;
+                niov--;
+                nob -= fraglen;
+
+                /* iov must not run out before end of data */
+                LASSERT (nob == 0 || niov > 0);
+
+        } while (nob > 0);
+
+        ktx->ktx_niov = nfrags;
+        CDEBUG (D_NET, "%p got %d frags over %d pages\n",
+                ktx, ktx->ktx_niov, ktx->ktx_nmappedpages);
+
+        return (0);
+}
+
+void
+kqswnal_put_idle_tx (kqswnal_tx_t *ktx)
+{
+        kpr_fwd_desc_t   *fwd = NULL;
+        struct list_head *idle = ktx->ktx_idle;
+        unsigned long     flags;
+
+        kqswnal_unmap_tx (ktx);                /* release temporary mappings */
+        ktx->ktx_state = KTX_IDLE;
+
+        spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
+
+        list_add (&ktx->ktx_list, idle);
+
+        /* reserved for non-blocking tx */
+        if (idle == &kqswnal_data.kqn_nblk_idletxds) {
+                spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+                return;
+        }
+
+        /* anything blocking for a tx descriptor? */
+        if (!list_empty(&kqswnal_data.kqn_idletxd_fwdq)) /* forwarded packet? */
+        {
+                CDEBUG(D_NET,"wakeup fwd\n");
+
+                fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next,
+                                  kpr_fwd_desc_t, kprfd_list);
+                list_del (&fwd->kprfd_list);
+        }
+
+        if (waitqueue_active (&kqswnal_data.kqn_idletxd_waitq))  /* process? */
+        {
+                /* local sender waiting for tx desc */
+                CDEBUG(D_NET,"wakeup process\n");
+                wake_up (&kqswnal_data.kqn_idletxd_waitq);
+        }
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+
+        if (fwd == NULL)
+                return;
+
+        /* schedule packet for forwarding again */
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        list_add_tail (&fwd->kprfd_list, &kqswnal_data.kqn_delayedfwds);
+        if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
+                wake_up (&kqswnal_data.kqn_sched_waitq);
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+}
+
+kqswnal_tx_t *
+kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block)
+{
+        unsigned long  flags;
+        kqswnal_tx_t  *ktx = NULL;
+
+        for (;;) {
+                spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
+
+                /* "normal" descriptor is free */
+                if (!list_empty (&kqswnal_data.kqn_idletxds)) {
+                        ktx = list_entry (kqswnal_data.kqn_idletxds.next,
+                                          kqswnal_tx_t, ktx_list);
+                        list_del (&ktx->ktx_list);
+                        break;
+                }
+
+                /* "normal" descriptor pool is empty */
+
+                if (fwd != NULL) { /* forwarded packet => queue for idle txd */
+                        CDEBUG (D_NET, "blocked fwd [%p]\n", fwd);
+                        list_add_tail (&fwd->kprfd_list,
+                                       &kqswnal_data.kqn_idletxd_fwdq);
+                        break;
+                }
+
+                /* doing a local transmit */
+                if (!may_block) {
+                        if (list_empty (&kqswnal_data.kqn_nblk_idletxds)) {
+                                CERROR ("intr tx desc pool exhausted\n");
+                                break;
+                        }
+
+                        ktx = list_entry (kqswnal_data.kqn_nblk_idletxds.next,
+                                          kqswnal_tx_t, ktx_list);
+                        list_del (&ktx->ktx_list);
+                        break;
+                }
+
+                /* block for idle tx */
+
+                spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+
+                CDEBUG (D_NET, "blocking for tx desc\n");
+                wait_event (kqswnal_data.kqn_idletxd_waitq,
+                            !list_empty (&kqswnal_data.kqn_idletxds));
+        }
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+
+        /* Idle descs can't have any mapped (as opposed to pre-mapped) pages */
+        LASSERT (ktx == NULL || ktx->ktx_nmappedpages == 0);
+        return (ktx);
+}
+
+void
+kqswnal_tx_done (kqswnal_tx_t *ktx, int error)
+{
+        switch (ktx->ktx_state) {
+        case KTX_FORWARDING:       /* router asked me to forward this packet */
+                kpr_fwd_done (&kqswnal_data.kqn_router,
+                              (kpr_fwd_desc_t *)ktx->ktx_args[0], error);
+                break;
+
+        case KTX_SENDING:          /* packet sourced locally */
+                lib_finalize (&kqswnal_lib, ktx->ktx_args[0],
+                              (lib_msg_t *)ktx->ktx_args[1]);
+                break;
+
+        default:
+                LASSERT (0);
+        }
+
+        kqswnal_put_idle_tx (ktx);
+}
+
+static void
+kqswnal_txhandler(EP_TXD *txd, void *arg, int status)
+{
+        kqswnal_tx_t      *ktx = (kqswnal_tx_t *)arg;
+
+        LASSERT (txd != NULL);
+        LASSERT (ktx != NULL);
+
+        CDEBUG(D_NET, "txd %p, arg %p status %d\n", txd, arg, status);
+
+        if (status == EP_SUCCESS)
+                atomic_inc (&kqswnal_packets_transmitted);
+
+        if (status != EP_SUCCESS)
+        {
+                CERROR ("kqswnal: Transmit failed with %d\n", status);
+                status = -EIO;
+        }
+
+        kqswnal_tx_done (ktx, status);
+}
+
+int
+kqswnal_launch (kqswnal_tx_t *ktx)
+{
+        /* Don't block for transmit descriptor if we're in interrupt context */
+        int   attr = in_interrupt() ? (EP_NO_SLEEP | EP_NO_ALLOC) : 0;
+        int   dest = kqswnal_nid2elanid (ktx->ktx_nid);
+        unsigned long flags;
+        int   rc;
+        
+        LASSERT (dest >= 0);                    /* must be a peer */
+        rc = ep_transmit_large(kqswnal_data.kqn_eptx, dest,
+                               ktx->ktx_port, attr, kqswnal_txhandler,
+                               ktx, ktx->ktx_iov, ktx->ktx_niov);
+        if (rc == 0)
+                atomic_inc (&kqswnal_packets_launched);
+
+        if (rc != ENOMEM)
+                return (rc);
+
+        /* can't allocate ep txd => queue for later */
+
+        LASSERT (in_interrupt());      /* not called by thread (not looping) */
+
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_delayedtxds);
+        if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
+                wake_up (&kqswnal_data.kqn_sched_waitq);
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+
+        return (0);
+}
+
+
+static char *
+hdr_type_string (ptl_hdr_t *hdr)
+{
+        switch (hdr->type) {
+        case PTL_MSG_ACK:
+                return ("ACK");
+        case PTL_MSG_PUT:
+                return ("PUT");
+        case PTL_MSG_GET:
+                return ("GET");
+        case PTL_MSG_REPLY:
+                return ("REPLY");
+        default:
+                return ("<UNKNOWN>");
+        }
+}
+
+static void
+kqswnal_cerror_hdr(ptl_hdr_t * hdr)
+{
+        char *type_str = hdr_type_string (hdr);
+
+        CERROR("P3 Header at %p of type %s\n", hdr, type_str);
+        CERROR("    From nid/pid "LPU64"/%u\n", NTOH__u64(hdr->src_nid),
+               NTOH__u32(hdr->src_pid));
+        CERROR("    To nid/pid "LPU64"/%u\n", NTOH__u64(hdr->dest_nid),
+               NTOH__u32(hdr->dest_pid));
+
+        switch (NTOH__u32(hdr->type)) {
+        case PTL_MSG_PUT:
+                CERROR("    Ptl index %d, ack md "LPX64"."LPX64", "
+                       "match bits "LPX64"\n",
+                       NTOH__u32 (hdr->msg.put.ptl_index),
+                       hdr->msg.put.ack_wmd.wh_interface_cookie,
+                       hdr->msg.put.ack_wmd.wh_object_cookie,
+                       NTOH__u64 (hdr->msg.put.match_bits));
+                CERROR("    Length %d, offset %d, hdr data "LPX64"\n",
+                       NTOH__u32(PTL_HDR_LENGTH(hdr)),
+                       NTOH__u32(hdr->msg.put.offset),
+                       hdr->msg.put.hdr_data);
+                break;
+
+        case PTL_MSG_GET:
+                CERROR("    Ptl index %d, return md "LPX64"."LPX64", "
+                       "match bits "LPX64"\n",
+                       NTOH__u32 (hdr->msg.get.ptl_index),
+                       hdr->msg.get.return_wmd.wh_interface_cookie,
+                       hdr->msg.get.return_wmd.wh_object_cookie,
+                       hdr->msg.get.match_bits);
+                CERROR("    Length %d, src offset %d\n",
+                       NTOH__u32 (hdr->msg.get.sink_length),
+                       NTOH__u32 (hdr->msg.get.src_offset));
+                break;
+
+        case PTL_MSG_ACK:
+                CERROR("    dst md "LPX64"."LPX64", manipulated length %d\n",
+                       hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                       hdr->msg.ack.dst_wmd.wh_object_cookie,
+                       NTOH__u32 (hdr->msg.ack.mlength));
+                break;
+
+        case PTL_MSG_REPLY:
+                CERROR("    dst md "LPX64"."LPX64", length %d\n",
+                       hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                       hdr->msg.reply.dst_wmd.wh_object_cookie,
+                       NTOH__u32 (PTL_HDR_LENGTH(hdr)));
+        }
+
+}                               /* end of kqswnal_cerror_hdr() */
+
+static int
+kqswnal_sendmsg (nal_cb_t     *nal,
+                 void         *private,
+                 lib_msg_t    *cookie,
+                 ptl_hdr_t    *hdr,
+                 int           type,
+                 ptl_nid_t     nid,
+                 ptl_pid_t     pid,
+                 unsigned int  payload_niov,
+                 struct iovec *payload_iov,
+                 ptl_kiov_t   *payload_kiov,
+                 size_t        payload_nob)
+{
+        kqswnal_tx_t      *ktx;
+        int                rc;
+        ptl_nid_t          gatewaynid;
+#if KQSW_CHECKSUM
+        int                i;
+        kqsw_csum_t        csum;
+        int                sumnob;
+#endif
+        
+        /* NB, the return code from this procedure is ignored.
+         * If we can't send, we must still complete with lib_finalize().
+         * We'll have to wait for 3.2 to return an error event.
+         */
+
+        CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64
+               " pid %u\n", payload_nob, payload_niov, nid, pid);
+
+        LASSERT (payload_nob == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        /* It must be OK to kmap() if required */
+        LASSERT (payload_kiov == NULL || !in_interrupt ());
+        /* payload is either all vaddrs or all pages */
+        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+        
+        if (payload_nob > KQSW_MAXPAYLOAD) {
+                CERROR ("request exceeds MTU size "LPSZ" (max %u).\n",
+                        payload_nob, KQSW_MAXPAYLOAD);
+                lib_finalize (&kqswnal_lib, private, cookie);
+                return (-1);
+        }
+
+        if (kqswnal_nid2elanid (nid) < 0) {     /* Can't send direct: find gateway? */
+                rc = kpr_lookup (&kqswnal_data.kqn_router, nid, &gatewaynid);
+                if (rc != 0) {
+                        CERROR("Can't route to "LPX64": router error %d\n",
+                               nid, rc);
+                        lib_finalize (&kqswnal_lib, private, cookie);
+                        return (-1);
+                }
+                if (kqswnal_nid2elanid (gatewaynid) < 0) {
+                        CERROR("Bad gateway "LPX64" for "LPX64"\n",
+                               gatewaynid, nid);
+                        lib_finalize (&kqswnal_lib, private, cookie);
+                        return (-1);
+                }
+                nid = gatewaynid;
+        }
+
+        /* I may not block for a transmit descriptor if I might block the
+         * receiver, or an interrupt handler. */
+        ktx = kqswnal_get_idle_tx(NULL, !(type == PTL_MSG_ACK ||
+                                          type == PTL_MSG_REPLY ||
+                                          in_interrupt()));
+        if (ktx == NULL) {
+                kqswnal_cerror_hdr (hdr);
+                lib_finalize (&kqswnal_lib, private, cookie);
+                return (-1);
+        }
+
+        memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */
+
+#if KQSW_CHECKSUM
+        csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr));
+        memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum));
+        for (csum = 0, i = 0, sumnob = payload_nob; sumnob > 0; i++) {
+                if (payload_kiov != NULL) {
+                        ptl_kiov_t *kiov = &payload_kiov[i];
+                        char       *addr = ((char *)kmap (kiov->kiov_page)) +
+                                           kiov->kiov_offset;
+                        
+                        csum = kqsw_csum (csum, addr, MIN (sumnob, kiov->kiov_len));
+                        kunmap (kiov->kiov_page);
+                        sumnob -= kiov->kiov_len;
+                } else {
+                        struct iovec *iov = &payload_iov[i];
+
+                        csum = kqsw_csum (csum, iov->iov_base, MIN (sumnob, iov->iov_len));
+                        sumnob -= iov->iov_len;
+                }
+        }
+        memcpy(ktx->ktx_buffer +sizeof(*hdr) +sizeof(csum), &csum,sizeof(csum));
+#endif
+
+        /* Set up first frag from pre-mapped buffer (it's at least the
+         * portals header) */
+        ktx->ktx_iov[0].Base = ktx->ktx_ebuffer;
+        ktx->ktx_iov[0].Len = KQSW_HDR_SIZE;
+        ktx->ktx_niov = 1;
+
+        if (payload_nob > 0) { /* got some payload (something more to do) */
+                /* make a single contiguous message? */
+                if (payload_nob <= KQSW_TX_MAXCONTIG) {
+                        /* copy payload to ktx_buffer, immediately after hdr */
+                        if (payload_kiov != NULL)
+                                lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
+                                                   payload_niov, payload_kiov, payload_nob);
+                        else
+                                lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
+                                                  payload_niov, payload_iov, payload_nob);
+                        /* first frag includes payload */
+                        ktx->ktx_iov[0].Len += payload_nob;
+                } else {
+                        if (payload_kiov != NULL)
+                                rc = kqswnal_map_tx_kiov (ktx, payload_nob, 
+                                                          payload_niov, payload_kiov);
+                        else
+                                rc = kqswnal_map_tx_iov (ktx, payload_nob,
+                                                         payload_niov, payload_iov);
+                        if (rc != 0) {
+                                kqswnal_put_idle_tx (ktx);
+                                lib_finalize (&kqswnal_lib, private, cookie);
+                                return (-1);
+                        }
+                } 
+        }
+
+        ktx->ktx_port    = (payload_nob <= KQSW_SMALLPAYLOAD) ?
+                        EP_SVC_LARGE_PORTALS_SMALL : EP_SVC_LARGE_PORTALS_LARGE;
+        ktx->ktx_nid     = nid;
+        ktx->ktx_state   = KTX_SENDING;   /* => lib_finalize() on completion */
+        ktx->ktx_args[0] = private;
+        ktx->ktx_args[1] = cookie;
+
+        rc = kqswnal_launch (ktx);
+        if (rc != 0) {                    /* failed? */
+                CERROR ("Failed to send packet to "LPX64": %d\n", nid, rc);
+                lib_finalize (&kqswnal_lib, private, cookie);
+                return (-1);
+        }
+
+        CDEBUG(D_NET, "sent "LPSZ" bytes to "LPX64"\n", payload_nob, nid);
+        return (0);
+}
+
+static int
+kqswnal_send (nal_cb_t     *nal,
+              void         *private,
+              lib_msg_t    *cookie,
+              ptl_hdr_t    *hdr,
+              int           type,
+              ptl_nid_t     nid,
+              ptl_pid_t     pid,
+              unsigned int  payload_niov,
+              struct iovec *payload_iov,
+              size_t        payload_nob)
+{
+        return (kqswnal_sendmsg (nal, private, cookie, hdr, type, nid, pid,
+                                 payload_niov, payload_iov, NULL, payload_nob));
+}
+
+static int
+kqswnal_send_pages (nal_cb_t     *nal,
+                    void         *private,
+                    lib_msg_t    *cookie,
+                    ptl_hdr_t    *hdr,
+                    int           type,
+                    ptl_nid_t     nid,
+                    ptl_pid_t     pid,
+                    unsigned int  payload_niov,
+                    ptl_kiov_t   *payload_kiov,
+                    size_t        payload_nob)
+{
+        return (kqswnal_sendmsg (nal, private, cookie, hdr, type, nid, pid,
+                                 payload_niov, NULL, payload_kiov, payload_nob));
+}
+
+int kqswnal_fwd_copy_contig = 0;
+
+void
+kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        int             rc;
+        kqswnal_tx_t   *ktx;
+        struct iovec   *iov = fwd->kprfd_iov;
+        int             niov = fwd->kprfd_niov;
+        int             nob = fwd->kprfd_nob;
+        ptl_nid_t       nid = fwd->kprfd_gateway_nid;
+
+#if KQSW_CHECKSUM
+        CERROR ("checksums for forwarded packets not implemented\n");
+        LBUG ();
+#endif
+        /* The router wants this NAL to forward a packet */
+        CDEBUG (D_NET, "forwarding [%p] to "LPX64", %d frags %d bytes\n",
+                fwd, nid, niov, nob);
+
+        LASSERT (niov > 0);
+        
+        ktx = kqswnal_get_idle_tx (fwd, FALSE);
+        if (ktx == NULL)        /* can't get txd right now */
+                return;         /* fwd will be scheduled when tx desc freed */
+
+        if (nid == kqswnal_lib.ni.nid)          /* gateway is me */
+                nid = fwd->kprfd_target_nid;    /* target is final dest */
+
+        if (kqswnal_nid2elanid (nid) < 0) {
+                CERROR("Can't forward [%p] to "LPX64": not a peer\n", fwd, nid);
+                rc = -EHOSTUNREACH;
+                goto failed;
+        }
+
+        if (nob > KQSW_NRXMSGBYTES_LARGE) {
+                CERROR ("Can't forward [%p] to "LPX64
+                        ": size %d bigger than max packet size %ld\n",
+                        fwd, nid, nob, (long)KQSW_NRXMSGBYTES_LARGE);
+                rc = -EMSGSIZE;
+                goto failed;
+        }
+
+        if ((kqswnal_fwd_copy_contig || niov > 1) &&
+            nob <= KQSW_TX_BUFFER_SIZE) 
+        {
+                /* send from ktx's pre-allocated/mapped contiguous buffer? */
+                lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, nob);
+                ktx->ktx_iov[0].Base = ktx->ktx_ebuffer; /* already mapped */
+                ktx->ktx_iov[0].Len = nob;
+                ktx->ktx_niov = 1;
+        }
+        else
+        {
+                /* zero copy */
+                ktx->ktx_niov = 0;        /* no frags mapped yet */
+                rc = kqswnal_map_tx_iov (ktx, nob, niov, iov);
+                if (rc != 0)
+                        goto failed;
+        }
+
+        ktx->ktx_port    = (nob <= (sizeof (ptl_hdr_t) + KQSW_SMALLPAYLOAD)) ?
+                        EP_SVC_LARGE_PORTALS_SMALL : EP_SVC_LARGE_PORTALS_LARGE;
+        ktx->ktx_nid     = nid;
+        ktx->ktx_state   = KTX_FORWARDING; /* kpr_put_packet() on completion */
+        ktx->ktx_args[0] = fwd;
+
+        rc = kqswnal_launch (ktx);
+        if (rc == 0)
+                return;
+
+ failed:
+        LASSERT (rc != 0);
+        CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc);
+
+        kqswnal_put_idle_tx (ktx);
+        /* complete now (with failure) */
+        kpr_fwd_done (&kqswnal_data.kqn_router, fwd, rc);
+}
+
+void
+kqswnal_fwd_callback (void *arg, int error)
+{
+        kqswnal_rx_t *krx = (kqswnal_rx_t *)arg;
+
+        /* The router has finished forwarding this packet */
+
+        if (error != 0)
+        {
+                ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]);
+
+                CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n",
+                       NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),error);
+        }
+
+        kqswnal_requeue_rx (krx);
+}
+
+void
+kqswnal_rx (kqswnal_rx_t *krx)
+{
+        ptl_hdr_t      *hdr = (ptl_hdr_t *) page_address (krx->krx_pages[0]);
+        ptl_nid_t       dest_nid = NTOH__u64 (hdr->dest_nid);
+        int             nob;
+        int             niov;
+
+        if (dest_nid == kqswnal_lib.ni.nid) { /* It's for me :) */
+                /* NB krx requeued when lib_parse() calls back kqswnal_recv */
+                lib_parse (&kqswnal_lib, hdr, krx);
+                return;
+        }
+
+#if KQSW_CHECKSUM
+        CERROR ("checksums for forwarded packets not implemented\n");
+        LBUG ();
+#endif
+        if (kqswnal_nid2elanid (dest_nid) >= 0)  /* should have gone direct to peer */
+        {
+                CERROR("dropping packet from "LPX64" for "LPX64
+                       ": target is peer\n", NTOH__u64(hdr->src_nid), dest_nid);
+                kqswnal_requeue_rx (krx);
+                return;
+        }
+
+        /* NB forwarding may destroy iov; rebuild every time */
+        for (nob = krx->krx_nob, niov = 0; nob > 0; nob -= PAGE_SIZE, niov++)
+        {
+                LASSERT (niov < krx->krx_npages);
+                krx->krx_iov[niov].iov_base= page_address(krx->krx_pages[niov]);
+                krx->krx_iov[niov].iov_len = MIN(PAGE_SIZE, nob);
+        }
+
+        kpr_fwd_init (&krx->krx_fwd, dest_nid,
+                      krx->krx_nob, niov, krx->krx_iov,
+                      kqswnal_fwd_callback, krx);
+
+        kpr_fwd_start (&kqswnal_data.kqn_router, &krx->krx_fwd);
+}
+
+/* Receive Interrupt Handler: posts to schedulers */
+void 
+kqswnal_rxhandler(EP_RXD *rxd)
+{
+        unsigned long flags;
+        int           nob    = ep_rxd_len (rxd);
+        int           status = ep_rxd_status (rxd);
+        kqswnal_rx_t *krx    = (kqswnal_rx_t *)ep_rxd_arg (rxd);
+
+        CDEBUG(D_NET, "kqswnal_rxhandler: rxd %p, krx %p, nob %d, status %d\n",
+               rxd, krx, nob, status);
+
+        LASSERT (krx != NULL);
+
+        krx->krx_rxd = rxd;
+        krx->krx_nob = nob;
+
+        /* must receive a whole header to be able to parse */
+        if (status != EP_SUCCESS || nob < sizeof (ptl_hdr_t))
+        {
+                /* receives complete with failure when receiver is removed */
+                if (kqswnal_data.kqn_shuttingdown)
+                        return;
+
+                CERROR("receive status failed with status %d nob %d\n",
+                       ep_rxd_status(rxd), nob);
+                kqswnal_requeue_rx (krx);
+                return;
+        }
+
+        atomic_inc (&kqswnal_packets_received);
+
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        list_add_tail (&krx->krx_list, &kqswnal_data.kqn_readyrxds);
+        if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
+                wake_up (&kqswnal_data.kqn_sched_waitq);
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+}
+
+#if KQSW_CHECKSUM
+void
+kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr)
+{
+        ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]);
+
+        CERROR ("%s checksum mismatch %p: dnid "LPX64", snid "LPX64
+                ", dpid %d, spid %d, type %d\n",
+                ishdr ? "Header" : "Payload", krx,
+                NTOH__u64(hdr->dest_nid), NTOH__u64(hdr->src_nid),
+                NTOH__u32(hdr->dest_pid), NTOH__u32(hdr->src_pid),
+                NTOH__u32(hdr->type));
+
+        switch (NTOH__u32 (hdr->type))
+        {
+        case PTL_MSG_ACK:
+                CERROR("ACK: mlen %d dmd "LPX64"."LPX64" match "LPX64
+                       " len %u\n",
+                       NTOH__u32(hdr->msg.ack.mlength),
+                       hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                       hdr->msg.ack.dst_wmd.wh_object_cookie,
+                       NTOH__u64(hdr->msg.ack.match_bits),
+                       NTOH__u32(hdr->msg.ack.length));
+                break;
+        case PTL_MSG_PUT:
+                CERROR("PUT: ptl %d amd "LPX64"."LPX64" match "LPX64
+                       " len %u off %u data "LPX64"\n",
+                       NTOH__u32(hdr->msg.put.ptl_index),
+                       hdr->msg.put.ack_wmd.wh_interface_cookie,
+                       hdr->msg.put.ack_wmd.wh_object_cookie,
+                       NTOH__u64(hdr->msg.put.match_bits),
+                       NTOH__u32(hdr->msg.put.length),
+                       NTOH__u32(hdr->msg.put.offset),
+                       hdr->msg.put.hdr_data);
+                break;
+        case PTL_MSG_GET:
+                CERROR ("GET: <>\n");
+                break;
+        case PTL_MSG_REPLY:
+                CERROR ("REPLY: <>\n");
+                break;
+        default:
+                CERROR ("TYPE?: <>\n");
+        }
+}
+#endif
+
+static int
+kqswnal_recvmsg (nal_cb_t     *nal,
+                 void         *private,
+                 lib_msg_t    *cookie,
+                 unsigned int  niov,
+                 struct iovec *iov,
+                 ptl_kiov_t   *kiov,
+                 size_t        mlen,
+                 size_t        rlen)
+{
+        kqswnal_rx_t *krx = (kqswnal_rx_t *)private;
+        int           page;
+        char         *page_ptr;
+        int           page_nob;
+        char         *iov_ptr;
+        int           iov_nob;
+        int           frag;
+#if KQSW_CHECKSUM
+        kqsw_csum_t   senders_csum;
+        kqsw_csum_t   payload_csum = 0;
+        kqsw_csum_t   hdr_csum = kqsw_csum(0, page_address(krx->krx_pages[0]),
+                                           sizeof(ptl_hdr_t));
+        size_t        csum_len = mlen;
+        int           csum_frags = 0;
+        int           csum_nob = 0;
+        static atomic_t csum_counter;
+        int           csum_verbose = (atomic_read(&csum_counter)%1000001) == 0;
+
+        atomic_inc (&csum_counter);
+
+        memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) +
+                                sizeof (ptl_hdr_t), sizeof (kqsw_csum_t));
+        if (senders_csum != hdr_csum)
+                kqswnal_csum_error (krx, 1);
+#endif
+        CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen);
+
+        /* What was actually received must be >= payload.
+         * This is an LASSERT, as lib_finalize() doesn't have a completion status. */
+        LASSERT (krx->krx_nob >= KQSW_HDR_SIZE + mlen);
+        LASSERT (mlen <= rlen);
+
+        /* It must be OK to kmap() if required */
+        LASSERT (kiov == NULL || !in_interrupt ());
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
+        
+        if (mlen != 0)
+        {
+                page     = 0;
+                page_ptr = ((char *) page_address(krx->krx_pages[0])) +
+                        KQSW_HDR_SIZE;
+                page_nob = PAGE_SIZE - KQSW_HDR_SIZE;
+
+                LASSERT (niov > 0);
+                if (kiov != NULL) {
+                        iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                        iov_nob = kiov->kiov_len;
+                } else {
+                        iov_ptr = iov->iov_base;
+                        iov_nob = iov->iov_len;
+                }
+
+                for (;;)
+                {
+                        /* We expect the iov to exactly match mlen */
+                        LASSERT (iov_nob <= mlen);
+                        
+                        frag = MIN (page_nob, iov_nob);
+                        memcpy (iov_ptr, page_ptr, frag);
+#if KQSW_CHECKSUM
+                        payload_csum = kqsw_csum (payload_csum, iov_ptr, frag);
+                        csum_nob += frag;
+                        csum_frags++;
+#endif
+                        mlen -= frag;
+                        if (mlen == 0)
+                                break;
+
+                        page_nob -= frag;
+                        if (page_nob != 0)
+                                page_ptr += frag;
+                        else
+                        {
+                                page++;
+                                LASSERT (page < krx->krx_npages);
+                                page_ptr = page_address(krx->krx_pages[page]);
+                                page_nob = PAGE_SIZE;
+                        }
+
+                        iov_nob -= frag;
+                        if (iov_nob != 0)
+                                iov_ptr += frag;
+                        else if (kiov != NULL) {
+                                kunmap (kiov->kiov_page);
+                                kiov++;
+                                niov--;
+                                LASSERT (niov > 0);
+                                iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                                iov_nob = kiov->kiov_len;
+                        } else {
+                                iov++;
+                                niov--;
+                                LASSERT (niov > 0);
+                                iov_ptr = iov->iov_base;
+                                iov_nob = iov->iov_len;
+                        }
+                }
+
+                if (kiov != NULL)
+                        kunmap (kiov->kiov_page);
+        }
+
+#if KQSW_CHECKSUM
+        memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) +
+                sizeof(ptl_hdr_t) + sizeof(kqsw_csum_t), sizeof(kqsw_csum_t));
+
+        if (csum_len != rlen)
+                CERROR("Unable to checksum data in user's buffer\n");
+        else if (senders_csum != payload_csum)
+                kqswnal_csum_error (krx, 0);
+
+        if (csum_verbose)
+                CERROR("hdr csum %lx, payload_csum %lx, csum_frags %d, "
+                       "csum_nob %d\n",
+                        hdr_csum, payload_csum, csum_frags, csum_nob);
+#endif
+        lib_finalize(nal, private, cookie);
+
+        kqswnal_requeue_rx (krx);
+
+        return (rlen);
+}
+
+static int
+kqswnal_recv(nal_cb_t     *nal,
+             void         *private,
+             lib_msg_t    *cookie,
+             unsigned int  niov,
+             struct iovec *iov,
+             size_t        mlen,
+             size_t        rlen)
+{
+        return (kqswnal_recvmsg (nal, private, cookie, niov, iov, NULL, mlen, rlen));
+}
+
+static int
+kqswnal_recv_pages (nal_cb_t     *nal,
+                    void         *private,
+                    lib_msg_t    *cookie,
+                    unsigned int  niov,
+                    ptl_kiov_t   *kiov,
+                    size_t        mlen,
+                    size_t        rlen)
+{
+        return (kqswnal_recvmsg (nal, private, cookie, niov, NULL, kiov, mlen, rlen));
+}
+
+int
+kqswnal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long    pid = kernel_thread (fn, arg, 0);
+
+        if (pid < 0)
+                return ((int)pid);
+
+        atomic_inc (&kqswnal_data.kqn_nthreads);
+        return (0);
+}
+
+void
+kqswnal_thread_fini (void)
+{
+        atomic_dec (&kqswnal_data.kqn_nthreads);
+}
+
+int
+kqswnal_scheduler (void *arg)
+{
+        kqswnal_rx_t    *krx;
+        kqswnal_tx_t    *ktx;
+        kpr_fwd_desc_t  *fwd;
+        unsigned long    flags;
+        int              rc;
+        int              counter = 0;
+        int              did_something;
+
+        kportal_daemonize ("kqswnal_sched");
+        kportal_blockallsigs ();
+        
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        while (!kqswnal_data.kqn_shuttingdown)
+        {
+                did_something = FALSE;
+
+                if (!list_empty (&kqswnal_data.kqn_readyrxds))
+                {
+                        krx = list_entry(kqswnal_data.kqn_readyrxds.next,
+                                         kqswnal_rx_t, krx_list);
+                        list_del (&krx->krx_list);
+                        spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
+                                               flags);
+
+                        kqswnal_rx (krx);
+
+                        did_something = TRUE;
+                        spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags);
+                }
+
+                if (!list_empty (&kqswnal_data.kqn_delayedtxds))
+                {
+                        ktx = list_entry(kqswnal_data.kqn_delayedtxds.next,
+                                         kqswnal_tx_t, ktx_list);
+                        list_del (&ktx->ktx_list);
+                        spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
+                                               flags);
+
+                        rc = kqswnal_launch (ktx);
+                        if (rc != 0)          /* failed: ktx_nid down? */
+                        {
+                                CERROR("Failed delayed transmit to "LPX64
+                                       ": %d\n", ktx->ktx_nid, rc);
+                                kqswnal_tx_done (ktx, rc);
+                        }
+
+                        did_something = TRUE;
+                        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+                }
+
+                if (!list_empty (&kqswnal_data.kqn_delayedfwds))
+                {
+                        fwd = list_entry (kqswnal_data.kqn_delayedfwds.next, kpr_fwd_desc_t, kprfd_list);
+                        list_del (&fwd->kprfd_list);
+                        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+
+                        kqswnal_fwd_packet (NULL, fwd);
+
+                        did_something = TRUE;
+                        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+                }
+
+                /* nothing to do or hogging CPU */
+                if (!did_something || counter++ == KQSW_RESCHED) {
+                        spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
+                                               flags);
+
+                        counter = 0;
+
+                        if (!did_something) {
+                                rc = wait_event_interruptible (kqswnal_data.kqn_sched_waitq,
+                                                               kqswnal_data.kqn_shuttingdown ||
+                                                               !list_empty(&kqswnal_data.kqn_readyrxds) ||
+                                                               !list_empty(&kqswnal_data.kqn_delayedtxds) ||
+                                                               !list_empty(&kqswnal_data.kqn_delayedfwds));
+                                LASSERT (rc == 0);
+                        } else if (current->need_resched)
+                                schedule ();
+
+                        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+                }
+        }
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+
+        kqswnal_thread_fini ();
+        return (0);
+}
+
+nal_cb_t kqswnal_lib =
+{
+        nal_data:       &kqswnal_data,         /* NAL private data */
+        cb_send:        kqswnal_send,
+        cb_send_pages:  kqswnal_send_pages,
+        cb_recv:        kqswnal_recv,
+        cb_recv_pages:  kqswnal_recv_pages,
+        cb_read:        kqswnal_read,
+        cb_write:       kqswnal_write,
+        cb_malloc:      kqswnal_malloc,
+        cb_free:        kqswnal_free,
+        cb_printf:      kqswnal_printf,
+        cb_cli:         kqswnal_cli,
+        cb_sti:         kqswnal_sti,
+        cb_dist:        kqswnal_dist
+};
diff --git a/lustre/portals/knals/scimacnal/.cvsignore b/lustre/portals/knals/scimacnal/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lustre/portals/knals/scimacnal/Makefile.am b/lustre/portals/knals/scimacnal/Makefile.am
new file mode 100644 (file)
index 0000000..6da31f0
--- /dev/null
@@ -0,0 +1,11 @@
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = kscimacnal
+modulenet_DATA = kscimacnal.o
+EXTRA_PROGRAMS = kscimacnal
+
+DEFS =
+kscimacnal_SOURCES = scimacnal.c scimacnal_cb.c scimacnal.h
diff --git a/lustre/portals/knals/scimacnal/README.scimacnal b/lustre/portals/knals/scimacnal/README.scimacnal
new file mode 100644 (file)
index 0000000..d4c6a49
--- /dev/null
@@ -0,0 +1,14 @@
+
+scimacnal - A NAL for the Scali ScaMAC midlayer.
+
+The ScaMAC midlayer is a simplified API to the SCI high performance
+interconnect.
+
+In order to use this NAL you'll need to tune scimac to use larger buffers.
+See scimac.conf in this directory for an example.
+
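+The NAL refuses to load if scimac's MTU is below 64k; the scimac.conf
+bundled here sets, among other things:
+
+  scimac_max_no_ebufs = 8
+  scimac_max_ebuf_size = 65536
+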
+Overall performance and stability aren't great, but this can largely be
+attributed to the scimac driver, which apparently still needs some
+development work.
+
+TODO:
+Routing isn't yet implemented.
diff --git a/lustre/portals/knals/scimacnal/scimac.conf b/lustre/portals/knals/scimacnal/scimac.conf
new file mode 100644 (file)
index 0000000..bfb6d02
--- /dev/null
@@ -0,0 +1,35 @@
+#  Configuration file for the scimac driver - lustre friendly settings
+#
+
+#  The maximal number of message headers to use in the system.
+scimac_max_no_hdrs = 32
+
+#  The maximal number of eager buffers to use in the system.
+scimac_max_no_ebufs = 8
+
+#  The maximal size in bytes of each eager buffer.
+scimac_max_ebuf_size = 65536
+
+#  Enable use of a kernel thread to defer reception of packets.
+#  Default is to use a tasklet (sw interrupt).
+scimac_use_ulevel_recv = 1
+
+#  The maximal number of packets queued for transfer per path at any one time. 
+scimac_max_send_queuelen = 2000
+
+#  The packet retransmit time in milliseconds.
+#  The time that elapses after a send attempt before the packet is resent.
+scimac_pkt_rexmit_time = 200
+
+#  The packet's maximal retransmit time in milliseconds.
+#  The total time during which sending a packet is retried before it is dropped.
+scimac_max_rexmit_time = 5000
+
+#  The lowest valid node identifier in the system.
+scimac_min_nodeid_number = 0x100
+
+#  The largest valid node identifier in the system.
+scimac_max_nodeid_number = 0xff00
+
+#  The incremental nodeid step in the system.
+scimac_nodeid_increment = 0x100
diff --git a/lustre/portals/knals/scimacnal/scimacnal.c b/lustre/portals/knals/scimacnal/scimacnal.c
new file mode 100644 (file)
index 0000000..1066d69
--- /dev/null
@@ -0,0 +1,219 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:cindent:
+ *
+ * Copyright (C) 2003 High Performance Computing Center North (HPC2N)
+ *   Author: Niklas Edmundsson <nikke@hpc2n.umu.se>
+ *
+ * Based on gmnal, which is based on ksocknal and qswnal
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+#include "scimacnal.h"
+
+ptl_handle_ni_t kscimacnal_ni;
+nal_t  kscimacnal_api;
+
+kscimacnal_data_t kscimacnal_data;
+
+kpr_nal_interface_t kscimacnal_router_interface = {
+        kprni_nalid:    SCIMACNAL,
+        kprni_arg:      NULL,
+        kprni_fwd:      kscimacnal_fwd_packet,
+};
+
+
+static int kscimacnal_forward(nal_t   *nal,
+                          int     id,
+                          void    *args,  size_t args_len,
+                          void    *ret,   size_t ret_len)
+{
+        kscimacnal_data_t *ksci = nal->nal_data;
+        nal_cb_t      *nal_cb = ksci->ksci_cb;
+
+        LASSERT (nal == &kscimacnal_api);
+        LASSERT (ksci == &kscimacnal_data);
+        LASSERT (nal_cb == &kscimacnal_lib);
+
+        lib_dispatch(nal_cb, ksci, id, args, ret); /* nal needs ksci */
+        return PTL_OK;
+}
+
+
+static void kscimacnal_lock(nal_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *ksci = nal->nal_data;
+        nal_cb_t      *nal_cb = ksci->ksci_cb;
+
+
+        LASSERT (nal == &kscimacnal_api);
+        LASSERT (ksci == &kscimacnal_data);
+        LASSERT (nal_cb == &kscimacnal_lib);
+
+        nal_cb->cb_cli(nal_cb,flags);
+}
+
+
+static void kscimacnal_unlock(nal_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *ksci = nal->nal_data;
+        nal_cb_t      *nal_cb = ksci->ksci_cb;
+
+
+        LASSERT (nal == &kscimacnal_api);
+        LASSERT (ksci == &kscimacnal_data);
+        LASSERT (nal_cb == &kscimacnal_lib);
+
+        nal_cb->cb_sti(nal_cb,flags);
+}
+
+
+static int kscimacnal_shutdown(nal_t *nal, int ni)
+{
+        LASSERT (nal == &kscimacnal_api);
+        return 0;
+}
+
+
+static void kscimacnal_yield( nal_t *nal )
+{
+        LASSERT (nal == &kscimacnal_api);
+
+        if (current->need_resched) 
+                schedule();
+        return;
+}
+
+
+static nal_t *kscimacnal_init(int interface, ptl_pt_index_t  ptl_size,
+                ptl_ac_index_t  ac_size, ptl_pid_t requested_pid)
+{
+        int     nnids = 512; /* FIXME: Need a ScaMAC function to get #nodes */
+
+        CDEBUG(D_NET, "calling lib_init with nid 0x%Lx nnids %d\n", kscimacnal_data.ksci_nid, nnids);
+        lib_init(&kscimacnal_lib, kscimacnal_data.ksci_nid, 0, nnids,ptl_size, ac_size); 
+        return &kscimacnal_api;
+}
+
+
+/* Called by kernel at module unload time */
+static void __exit 
+kscimacnal_finalize(void)
+{
+        /* FIXME: How should the shutdown procedure really look? */
+        kscimacnal_data.ksci_shuttingdown=1;
+
+        PORTAL_SYMBOL_UNREGISTER(kscimacnal_ni);
+
+        PtlNIFini(kscimacnal_ni);
+        lib_fini(&kscimacnal_lib);
+
+        mac_finish(kscimacnal_data.ksci_machandle);
+
+        CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory));
+
+        return;
+}
+
+
+/* Called by kernel at module insertion time */
+static int __init
+kscimacnal_initialize(void)
+{
+        int rc;
+        unsigned long     nid=0;
+        mac_handle_t    *machandle = NULL;
+
+
+        CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory));
+
+        kscimacnal_api.forward = kscimacnal_forward;
+        kscimacnal_api.shutdown = kscimacnal_shutdown;
+        kscimacnal_api.yield = kscimacnal_yield;
+        kscimacnal_api.validate = NULL;         /* our api validate is a NOOP */
+        kscimacnal_api.lock= kscimacnal_lock;
+        kscimacnal_api.unlock= kscimacnal_unlock;
+        kscimacnal_api.nal_data = &kscimacnal_data;
+
+        kscimacnal_lib.nal_data = &kscimacnal_data;
+
+        memset(&kscimacnal_data, 0, sizeof(kscimacnal_data));
+
+        kscimacnal_data.ksci_cb = &kscimacnal_lib;
+
+        /* We're not using this directly, but the cli/sti callbacks do... ??? */
+        spin_lock_init(&kscimacnal_data.ksci_dispatch_lock);
+
+        /* FIXME: We only support one adapter for now */
+        machandle = mac_init(0, MAC_SAPID_LUSTRE, kscimacnal_rx,
+                        &kscimacnal_data);
+
+        if(!machandle) {
+                CERROR("mac_init() failed\n");
+                return -1;
+        }
+
+        kscimacnal_data.ksci_machandle = machandle;
+
+        /* Make sure the scimac MTU is tuned */
+        if(mac_get_mtusize(machandle) < SCIMACNAL_MTU) {
+                CERROR("scimac mtu of %ld is smaller than SCIMACNAL MTU of %d\n",
+                                mac_get_mtusize(machandle), SCIMACNAL_MTU);
+                CERROR("Consult README.scimacnal for more information\n");
+                mac_finish(machandle);
+                return -1;
+        }
+
+        /* Get the node ID */
+        /* mac_get_physaddrlen() is a function instead of define, sigh */
+        LASSERT(mac_get_physaddrlen(machandle) <= sizeof(nid));
+        if(mac_get_physaddr(machandle, (mac_physaddr_t *) &nid)) {
+                CERROR("mac_get_physaddr() failed\n");
+                mac_finish(machandle);
+                return -1;
+        }
+        nid = ntohl(nid);
+        kscimacnal_data.ksci_nid = nid;
+
+
+        /* Initialize Network Interface */
+        /* FIXME: What do the magic numbers mean? Documentation anyone? */
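+        /* Judging from kscimacnal_init()'s signature above, these are
+         * presumably ptl_size (32), ac_size (4) and requested_pid (0) */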
+        rc = PtlNIInit(kscimacnal_init, 32, 4, 0, &kscimacnal_ni);
+        if (rc) {
+                CERROR("PtlNIInit failed %d\n", rc);
+                mac_finish(machandle);
+                return (-ENOMEM);
+        }
+
+        PORTAL_SYMBOL_REGISTER(kscimacnal_ni);
+
+        /* We're done now, it's OK for the RX callback to do stuff */
+        kscimacnal_data.ksci_init = 1;
+
+        return 0;
+}
+
+
+MODULE_AUTHOR("Niklas Edmundsson <nikke@hpc2n.umu.se>");
+MODULE_DESCRIPTION("Kernel Scali ScaMAC SCI NAL v0.0");
+MODULE_LICENSE("GPL");
+
+module_init (kscimacnal_initialize);
+module_exit (kscimacnal_finalize);
+
+EXPORT_SYMBOL(kscimacnal_ni);
diff --git a/lustre/portals/knals/scimacnal/scimacnal.h b/lustre/portals/knals/scimacnal/scimacnal.h
new file mode 100644 (file)
index 0000000..1ff180e
--- /dev/null
@@ -0,0 +1,85 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:cindent:
+ *
+ * Copyright (C) 2003 High Performance Computing Center North (HPC2N)
+ *   Author: Niklas Edmundsson <nikke@hpc2n.umu.se>
+ */
+
+
+#ifndef _SCIMACNAL_H
+#define _SCIMACNAL_H
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/locks.h>
+#include <linux/unistd.h>
+#include <linux/init.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+#include <asm/page.h>            /* For PAGE_SIZE */
+
+#define DEBUG_SUBSYSTEM S_UNDEFINED
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#include <scamac.h>
+
+#ifndef MAC_SAPID_LUSTRE
+#define MAC_SAPID_LUSTRE MAC_SAPID_TEST1
+#endif /* MAC_SAPID_LUSTRE */
+
+#define SCIMACNAL_MTU 65536
+/* FIXME: What is really the MTU of lustre? */
+#if PTL_MD_MAX_IOV*PAGE_SIZE > SCIMACNAL_MTU
+#error Max MTU of ScaMAC is 64k, PTL_MD_MAX_IOV*PAGE_SIZE is bigger.
+#endif
+
+typedef struct {
+        mac_handle_t    *handle;
+        mac_mblk_t      *msg;
+        mac_msg_type_t   type;
+        void            *userdata;
+}  kscimacnal_rx_t;
+
+
+typedef struct {
+        nal_cb_t        *ktx_nal;
+        void            *ktx_private;
+        lib_msg_t       *ktx_cookie;
+        ptl_hdr_t       ktx_hdr;
+}  kscimacnal_tx_t;
+
+
+typedef struct {
+        char              ksci_init;
+        char              ksci_shuttingdown;
+        ptl_nid_t         ksci_nid;
+        nal_cb_t         *ksci_cb;
+        spinlock_t        ksci_dispatch_lock;
+        mac_handle_t     *ksci_machandle;
+}  kscimacnal_data_t;
+
+extern kscimacnal_data_t   kscimacnal_data;
+extern nal_t            kscimacnal_api;
+extern nal_cb_t         kscimacnal_lib;
+
+void kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+void kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type, void *userdata);
+
+
+#endif  /* _SCIMACNAL_H */
diff --git a/lustre/portals/knals/scimacnal/scimacnal_cb.c b/lustre/portals/knals/scimacnal/scimacnal_cb.c
new file mode 100644 (file)
index 0000000..7e4a2e8
--- /dev/null
@@ -0,0 +1,468 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:cindent:
+ *
+ * Copyright (C) 2003 High Performance Computing Center North (HPC2N)
+ *   Author: Niklas Edmundsson <nikke@hpc2n.umu.se>
+
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "scimacnal.h"
+
+static int 
+kscimacnal_read (nal_cb_t *nal, void *private,
+                void *dst_addr, user_ptr src_addr, size_t len)
+{
+        CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+
+static int 
+kscimacnal_write(nal_cb_t *nal, void *private,
+                user_ptr dst_addr, void *src_addr, size_t len)
+{
+        CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+
+static void *
+kscimacnal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+        return buf;
+}
+
+
+static void 
+kscimacnal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+
+static void 
+kscimacnal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+        va_list         ap;
+        char msg[256]; 
+
+        if (portal_debug & D_NET) {
+                va_start( ap, fmt );
+                vsnprintf( msg, sizeof(msg), fmt, ap );
+                va_end( ap );
+
+                printk("CPUId: %d %s",smp_processor_id(), msg);
+        }
+}
+
+
+static void 
+kscimacnal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *data= nal->nal_data;
+
+        spin_lock_irqsave(&data->ksci_dispatch_lock,*flags);
+}
+
+
+static void 
+kscimacnal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        kscimacnal_data_t *data= nal->nal_data; 
+
+        spin_unlock_irqrestore(&data->ksci_dispatch_lock,*flags);
+}
+
+
+static int 
+kscimacnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* FIXME: Network distance has a meaning, but there is no easy
+         * way to figure it out (it depends on routing) */
+
+        if ( nal->ni.nid == nid ) {
+                *dist = 0;
+        } else {
+                *dist = 1;
+        }
+
+        return 0;
+}
+
+
+static
+char * get_mac_error(mac_status_t status) 
+{
+        switch(status) {
+                case MAC_MSG_STAT_OK:
+                        return "MAC_MSG_STAT_OK";
+                case MAC_MSG_STAT_FREED:
+                        return "MAC_MSG_STAT_FREED";
+                case MAC_MSG_STAT_ABORTED:
+                        return "MAC_MSG_STAT_ABORTED";
+                case MAC_MSG_STAT_TIMEDOUT:
+                        return "MAC_MSG_STAT_TIMEDOUT";
+                case MAC_MSG_STAT_NODEUNREACH:
+                        return "MAC_MSG_STAT_NODEUNREACH";
+                case MAC_MSG_STAT_NETDOWN:
+                        return "MAC_MSG_STAT_NETDOWN";
+                case MAC_MSG_STAT_RESET:
+                        return "MAC_MSG_STAT_RESET";
+                case MAC_MSG_STAT_INITFAILED:
+                        return "MAC_MSG_STAT_INITFAILED";
+                case MAC_MSG_STAT_SYNCFAILED:
+                        return "MAC_MSG_STAT_SYNCFAILED";
+                case MAC_MSG_STAT_BADPROTO:
+                        return "MAC_MSG_STAT_BADPROTO";
+                case MAC_MSG_STAT_NOBUFSPACE:
+                        return "MAC_MSG_STAT_NOBUFSPACE";
+                case MAC_MSG_STAT_CONGESTION:
+                        return "MAC_MSG_STAT_CONGESTION";
+                case MAC_MSG_STAT_OTHER:
+                        return "MAC_MSG_STAT_OTHER";
+                default:
+                        return "Unknown error";
+        }
+}
+
+
+/* FIXME add routing code here ? */
+
+/* Called by ScaMac when transmission is complete  (ie. message is released) */
+static void 
+kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context)
+{
+        kscimacnal_tx_t *ktx = (kscimacnal_tx_t *)context;
+        int err=0;
+        
+        LASSERT (ktx != NULL);
+
+        /* Euh, there is no feedback when transmission fails?! */
+        switch(status) {
+                case MAC_MSG_STAT_OK:        /* normal */
+                        break;
+                default:
+                        CERROR("%s (%d):\n", get_mac_error(status), status);
+                        err = -EIO;
+                        break;
+        }
+
+        lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie);
+
+        PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+}
+
+
+/* Called by portals when it wants to send a message.
+ * Since ScaMAC has its own TX thread we don't bother setting up our own. */
+static int 
+kscimacnal_send(nal_cb_t        *nal,
+           void            *private,
+           lib_msg_t       *cookie,
+           ptl_hdr_t       *hdr,
+           int              type, 
+           ptl_nid_t        nid,
+           ptl_pid_t        pid,
+           unsigned int     payload_niov,
+           struct iovec    *payload_iov,
+           size_t           payload_len)
+{
+        kscimacnal_tx_t    *ktx=NULL;
+        kscimacnal_data_t  *ksci = nal->nal_data;
+        int              rc=0;
+        int              buf_len = sizeof(ptl_hdr_t) + payload_len;
+        mac_mblk_t      *msg=NULL, *lastblk, *newblk;
+        unsigned long   physaddr;
+        
+
+        CDEBUG(D_NET, "sending %d bytes from %p to nid 0x%Lx niov: %d\n",
+               payload_len, payload_iov, nid, payload_niov);
+
+        LASSERT(ksci != NULL);
+
+        LASSERT(hdr != NULL);
+
+        /* Do real check if we can send this */
+        if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) {
+                CERROR("kscimacnal: request exceeds TX MTU size (%ld).\n",
+                                mac_get_mtusize(ksci->ksci_machandle));
+                return -EINVAL;
+        }
+
+
+        /* save transaction info for later finalize and cleanup */
+        PORTAL_ALLOC(ktx, (sizeof(kscimacnal_tx_t)));
+        if (!ktx) {
+                return -ENOMEM;
+        }
+
+        /* *SIGH* hdr is a stack variable in the calling function, so we
+         * need to copy it to a buffer. Zerocopy magic (or is it just
+         * deferred memcpy?) is annoying sometimes.  */
+        memcpy(&ktx->ktx_hdr, hdr, sizeof(ptl_hdr_t));
+
+        /* First, put the header in the main message mblk */
+        msg = mac_alloc_mblk(&ktx->ktx_hdr, sizeof(ptl_hdr_t),
+                        kscimacnal_txrelease, ktx);
+        if (!msg) {
+                PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+                return -ENOMEM;
+        }
+        mac_put_mblk(msg, sizeof(ptl_hdr_t));
+        lastblk=msg;
+
+        /* Allocate additional mblks for each iov as needed.
+         * Essentially lib_copy_iov2buf with a twist or two */
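+        /* The finished message is a chain of mblks, roughly
+         *     [ ptl_hdr_t ] -> [ iov[0] ] -> [ iov[1] ] -> ...
+         * with one extra mblk per payload fragment */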
+        while (payload_len > 0)
+        {
+                ptl_size_t nob;
+
+                LASSERT (payload_niov > 0);
+
+                nob = MIN (payload_iov->iov_len, payload_len);
+
+                /* We don't need a callback on the additional mblks, since
+                 * all release callbacks seem to be called when the entire
+                 * message has been sent */
+                newblk=mac_alloc_mblk(payload_iov->iov_base, nob, NULL, NULL);
+                if(!newblk) {
+                        mac_free_msg(msg);
+                        PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+                        return -ENOMEM;
+                }
+                mac_put_mblk(newblk, nob);
+                mac_link_mblk(lastblk, newblk);
+                lastblk=newblk;
+
+                payload_len -= nob;
+                payload_niov--;
+                payload_iov++;
+        }
+
+        ktx->ktx_nal = nal;
+        ktx->ktx_private = private;
+        ktx->ktx_cookie = cookie;
+
+        CDEBUG(D_NET, "mac_send %d bytes to nid: 0x%Lx\n", buf_len, nid);
+
+        physaddr = htonl(nid);
+
+        if((rc=mac_send(ksci->ksci_machandle, msg,
+                                        (mac_physaddr_t *) &physaddr))) {
+                CERROR("kscimacnal: mac_send() failed, rc=%d\n", rc);
+                mac_free_msg(msg);
+                PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t)));
+                return rc;
+        }
+
+        return 0;
+}
+
+
+void
+kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        CERROR ("forwarding not implemented\n");
+}
+
+
+/* Process a received portals packet */
+/* Called by the ScaMac RX thread when a packet is received */
+void
+kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type,
+                void *userdata)
+{
+        ptl_hdr_t       *hdr = NULL;
+        kscimacnal_rx_t     krx; 
+        mac_size_t       size;
+        kscimacnal_data_t  *ksci = userdata;
+
+        LASSERT(ksci != NULL);
+
+        if ( !ksci->ksci_init || ksci->ksci_shuttingdown || 
+                    type == MAC_MSG_TYPE_CTRL || type == MAC_MSG_TYPE_OTHER ) {
+                /* We're not interested in messages not for us, ignore */
+                mac_free_msg(msg);
+                return;
+        }
+
+        size = mac_msg_size(msg);
+
+        CDEBUG(D_NET,"msg %p type %d, size %ld bytes (%ld mblks)\n", 
+                        msg, type, size, mac_msg_mblks(msg));
+
+        if( size < sizeof( ptl_hdr_t ) ) {
+                /* Runt message: complain unless we're shutting down, and
+                 * free it in either case so it isn't leaked */
+                if (!ksci->ksci_shuttingdown)
+                        CERROR("kscimacnal: did not receive a complete portals"
+                               " header, size = %ld\n", size);
+                mac_free_msg(msg);
+                return;
+        }
+
+        /* Provide everything we know */
+        krx.handle = handle;
+        krx.msg = msg;
+        krx.type = type;
+        krx.userdata = userdata;
+
+        /* mac_msg_next returns the next mblk with unread data */
+        hdr = mac_get_mblk(mac_msg_next(msg), sizeof(ptl_hdr_t) );
+
+        if(!hdr) {
+                CERROR("kscimacnal: no data block in message %p\n", msg);
+                mac_free_msg(msg);
+                return;
+        }
+
+        if ( hdr->dest_nid == kscimacnal_lib.ni.nid ) {
+                PROF_START(lib_parse);
+                /* sets wanted_len, iovs etc and calls our callback */
+                lib_parse(&kscimacnal_lib, hdr, &krx);
+                PROF_FINISH(lib_parse);
+#if 0 /* FIXME: Is it possible to detect this? */
+        } else if (kgmnal_ispeer(hdr->dest_nid)) {
+                /* should have gone direct to peer */
+                CERROR("dropping packet from 0x%llx to 0x%llx:"
+                                "target is a peer\n",
+                                hdr->src_nid, hdr->dest_nid);
+                kgmnal_requeue_rx(&krx);
+#endif /* if 0 FIXME */
+        } else {
+                /* forward to gateway */
+                CERROR("forwarding not implemented, mynid=0x%llx dest=0x%llx\n",
+                                kscimacnal_lib.ni.nid, hdr->dest_nid);
+        }
+
+        mac_free_msg(msg);
+
+        CDEBUG(D_NET, "msg %p: Done\n", msg);
+}
+
+
+/* Called by portals to process a received packet */
+static int kscimacnal_recv(nal_cb_t     *nal, 
+                      void         *private, 
+                      lib_msg_t    *cookie, 
+                      unsigned int  niov, 
+                      struct iovec *iov, 
+                      size_t        mlen, 
+                      size_t        rlen)
+{
+        kscimacnal_rx_t    *krx = private;
+        mac_mblk_t      *mblk;
+        void            *src;
+        mac_size_t       pkt_len;
+        ptl_size_t       iovused=0;
+
+        LASSERT (krx != NULL);
+        LASSERT (krx->msg != NULL);
+
+        CDEBUG(D_NET,"msg %p: mlen=%d, rlen=%d, niov=%d\n",
+                        krx->msg, mlen, rlen, niov);
+
+        /* What was actually received must be >= what sender claims to have
+         * sent.  This is an LASSERT, since lib-move doesn't check cb return
+         * code yet. Also, rlen seems to be negative when mlen==0 so don't
+         * assert on that.
+         */
+        LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen);
+        LASSERT (mlen==0 || mlen <= rlen);
+
+        PROF_START(memcpy);
+
+        /* mac_msg_next returns the next mblk with unread data (i.e. it can
+         * be the same mblk) */
+        while (mlen != 0 && (mblk = mac_msg_next(krx->msg))) {
+                pkt_len = mac_mblk_len(mblk);
+                src = mac_get_mblk(mblk, pkt_len); /* Next unread block */
+
+                CDEBUG(D_NET,"msg %p: mblk: %p pkt_len: %ld  src: %p\n",
+                                krx->msg, mblk, pkt_len, src);
+
+                LASSERT(src != NULL);
+
+                /* Essentially lib_copy_buf2iov but with continuation support,
+                 * we "gracefully" thrash the argument vars ;) */
+                while (pkt_len > 0) {
+                        ptl_size_t nob;
+
+                        LASSERT (niov > 0);
+
+                        LASSERT(iovused < iov->iov_len);
+
+                        nob = MIN (iov->iov_len-iovused, pkt_len);
+                        CDEBUG(D_NET, "iovbase: %p iovlen: %d src: %p  nob: %d "
+                                        "iovused: %d\n",
+                                        iov->iov_base, iov->iov_len,
+                                        src, nob, iovused);
+
+                        memcpy (iov->iov_base+iovused, src, nob);
+                        pkt_len -= nob;
+                        src += nob;
+
+                        if(nob+iovused < iov->iov_len) {
+                                /* We didn't use all of the iov */
+                                iovused+=nob;
+                        }
+                        else {
+                                niov--;
+                                iov++;
+                                iovused=0;
+                        }
+                }
+        }
+        PROF_FINISH(memcpy);
+
+        CDEBUG(D_NET, "Calling lib_finalize.\n");
+
+        PROF_START(lib_finalize);
+        lib_finalize(nal, private, cookie);
+        PROF_FINISH(lib_finalize);
+
+        CDEBUG(D_NET, "Done.\n");
+
+        return rlen;
+}
+
+
+nal_cb_t kscimacnal_lib = {
+        nal_data:       &kscimacnal_data,               /* NAL private data */
+        cb_send:         kscimacnal_send,
+        cb_send_pages:   NULL,                  /* Ignore for now */
+        cb_recv:         kscimacnal_recv,
+        cb_recv_pages:   NULL,
+        cb_read:         kscimacnal_read,
+        cb_write:        kscimacnal_write,
+        cb_malloc:       kscimacnal_malloc,
+        cb_free:         kscimacnal_free,
+        cb_printf:       kscimacnal_printf,
+        cb_cli:          kscimacnal_cli,
+        cb_sti:          kscimacnal_sti,
+        cb_dist:         kscimacnal_dist
+};
diff --git a/lustre/portals/knals/socknal/.cvsignore b/lustre/portals/knals/socknal/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lustre/portals/knals/socknal/Makefile.am b/lustre/portals/knals/socknal/Makefile.am
new file mode 100644 (file)
index 0000000..437d7fc
--- /dev/null
@@ -0,0 +1,13 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = ksocknal
+modulenet_DATA = ksocknal.o
+EXTRA_PROGRAMS = ksocknal
+
+DEFS =
+ksocknal_SOURCES = socknal.c socknal_cb.c socknal.h
diff --git a/lustre/portals/knals/socknal/Makefile.mk b/lustre/portals/knals/socknal/Makefile.mk
new file mode 100644 (file)
index 0000000..46edf01
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Kernelenv
+
+obj-y += ksocknal.o
+ksocknal-objs    := socknal.o socknal_cb.o
+
diff --git a/lustre/portals/knals/socknal/socknal.c b/lustre/portals/knals/socknal/socknal.c
new file mode 100644 (file)
index 0000000..91d971c
--- /dev/null
@@ -0,0 +1,860 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socknal.h"
+
+ptl_handle_ni_t         ksocknal_ni;
+static nal_t            ksocknal_api;
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+ksock_nal_data_t ksocknal_data;
+#else
+static ksock_nal_data_t ksocknal_data;
+#endif
+
+kpr_nal_interface_t ksocknal_router_interface = {
+        kprni_nalid:      SOCKNAL,
+        kprni_arg:        &ksocknal_data,
+        kprni_fwd:        ksocknal_fwd_packet,
+};
+
+
+int
+ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len,
+                       void *ret, size_t ret_len)
+{
+        ksock_nal_data_t *k;
+        nal_cb_t *nal_cb;
+
+        k = nal->nal_data;
+        nal_cb = k->ksnd_nal_cb;
+
+        lib_dispatch(nal_cb, k, id, args, ret); /* ksocknal_send needs k */
+        return PTL_OK;
+}
+
+int
+ksocknal_api_shutdown(nal_t *nal, int ni)
+{
+        CDEBUG (D_NET, "closing all connections\n");
+
+        return ksocknal_close_sock(0);          /* close all sockets */
+}
+
+void
+ksocknal_api_yield(nal_t *nal)
+{
+        our_cond_resched();
+        return;
+}
+
+void
+ksocknal_api_lock(nal_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *k;
+        nal_cb_t *nal_cb;
+
+        k = nal->nal_data;
+        nal_cb = k->ksnd_nal_cb;
+        nal_cb->cb_cli(nal_cb,flags);
+}
+
+void
+ksocknal_api_unlock(nal_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *k;
+        nal_cb_t *nal_cb;
+
+        k = nal->nal_data;
+        nal_cb = k->ksnd_nal_cb;
+        nal_cb->cb_sti(nal_cb,flags);
+}
+
+nal_t *
+ksocknal_init(int interface, ptl_pt_index_t ptl_size,
+              ptl_ac_index_t ac_size, ptl_pid_t requested_pid)
+{
+        CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", (ptl_nid_t)0);
+        lib_init(&ksocknal_lib, (ptl_nid_t)0, 0, 10, ptl_size, ac_size);
+        return (&ksocknal_api);
+}
+
+/*
+ *  EXTRA functions follow
+ */
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#define SOCKET_I(inode) (&(inode)->u.socket_i)
+#endif
+static __inline__ struct socket *
+socki_lookup(struct inode *inode)
+{
+        return SOCKET_I(inode);
+}
+
+int
+ksocknal_set_mynid(ptl_nid_t nid)
+{
+        lib_ni_t *ni = &ksocknal_lib.ni;
+
+        /* FIXME: we have to do this because we call lib_init() at module
+         * insertion time, which is before we have 'mynid' available.  lib_init
+         * sets the NAL's nid, which it uses to tell other nodes where packets
+         * are coming from.  This is not a very graceful solution to this
+         * problem. */
+
+        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
+               nid, ni->nid);
+
+        ni->nid = nid;
+        return (0);
+}
+
+void
+ksocknal_bind_irq (unsigned int irq, int cpu)
+{
+#if (defined(CONFIG_SMP) && CPU_AFFINITY)
+        char  cmdline[64];
+        char *argv[] = {"/bin/sh",
+                        "-c",
+                        cmdline,
+                        NULL};
+        char *envp[] = {"HOME=/",
+                        "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+                        NULL};
+
+        snprintf (cmdline, sizeof (cmdline),
+                  "echo %x > /proc/irq/%u/smp_affinity", 1 << cpu, irq);
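+        /* e.g. for irq 24 and cpu 1 this builds
+         *      echo 2 > /proc/irq/24/smp_affinity
+         * (smp_affinity takes a hex CPU bitmask, hence 1 << cpu and %x) */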
+
+        printk (KERN_INFO "Binding irq %u to CPU %d with cmd: %s\n",
+                irq, cpu, cmdline);
+
+        /* FIXME: Find a better method of setting IRQ affinity...
+         */
+
+        call_usermodehelper (argv[0], argv, envp);
+#endif
+}
+
+int
+ksocknal_add_sock (ptl_nid_t nid, int fd, int bind_irq)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        struct file       *file = NULL;
+        struct socket     *sock = NULL;
+        ksock_sched_t     *sched = NULL;
+        unsigned int       irq = 0;
+        struct net_device *dev = NULL;
+        int                ret;
+        int                idx;
+        ENTRY;
+
+        LASSERT (!in_interrupt());
+
+        file = fget(fd);
+        if (file == NULL)
+                RETURN(-EINVAL);
+
+        ret = -EINVAL;
+        sock = socki_lookup(file->f_dentry->d_inode);
+        if (sock == NULL)
+                GOTO(error, ret);
+
+        ret = -ENOMEM;
+        PORTAL_ALLOC(conn, sizeof(*conn));
+        if (!conn)
+                GOTO(error, ret);
+
+        sock->sk->allocation = GFP_NOFS;    /* don't call into the fs for allocs */
+
+        conn->ksnc_file = file;
+        conn->ksnc_sock = sock;
+        conn->ksnc_saved_data_ready = sock->sk->data_ready;
+        conn->ksnc_saved_write_space = sock->sk->write_space;
+        conn->ksnc_peernid = nid;
+        atomic_set (&conn->ksnc_refcount, 1);    /* 1 ref for socklist */
+
+        conn->ksnc_rx_ready = 0;
+        conn->ksnc_rx_scheduled = 0;
+        ksocknal_new_packet (conn, 0);
+
+        INIT_LIST_HEAD (&conn->ksnc_tx_queue);
+        conn->ksnc_tx_ready = 0;
+        conn->ksnc_tx_scheduled = 0;
+
+#warning check it is OK to dereference sk->dst_cache->dev like this...
+        lock_sock (conn->ksnc_sock->sk);
+
+        if (conn->ksnc_sock->sk->dst_cache != NULL) {
+                dev = conn->ksnc_sock->sk->dst_cache->dev;
+                if (dev != NULL) {
+                        irq = dev->irq;
+                        if (irq >= NR_IRQS) {
+                                CERROR ("Unexpected IRQ %x\n", irq);
+                                irq = 0;
+                        }
+                }
+        }
+
+        release_sock (conn->ksnc_sock->sk);
+
+        write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags);
+
+        if (irq == 0 ||
+            ksocknal_data.ksnd_irq_info[irq] == SOCKNAL_IRQ_UNASSIGNED) {
+                /* This is a software NIC, or we haven't associated it with
+                 * a CPU yet */
+
+                /* Choose the CPU with the fewest connections */
+                sched = ksocknal_data.ksnd_schedulers;
+                for (idx = 1; idx < SOCKNAL_N_SCHED; idx++)
+                        if (sched->kss_nconns >
+                            ksocknal_data.ksnd_schedulers[idx].kss_nconns)
+                                sched = &ksocknal_data.ksnd_schedulers[idx];
+
+                if (irq != 0) {                 /* Hardware NIC */
+                        /* Remember which scheduler we chose */
+                        idx = sched - ksocknal_data.ksnd_schedulers;
+
+                        LASSERT (idx < SOCKNAL_IRQ_SCHED_MASK);
+
+                        if (bind_irq)       /* remember if we will bind below */
+                                idx |= SOCKNAL_IRQ_BOUND;
+
+                        ksocknal_data.ksnd_irq_info[irq] = idx;
+                }
+        } else { 
+                /* This is a hardware NIC, associated with a CPU */
+                idx = ksocknal_data.ksnd_irq_info[irq];
+
+                /* Don't bind again if we've bound already */
+                if ((idx & SOCKNAL_IRQ_BOUND) != 0)
+                        bind_irq = 0;
+                
+                sched = &ksocknal_data.ksnd_schedulers[idx & SOCKNAL_IRQ_SCHED_MASK];
+        }
+
+        sched->kss_nconns++;
+        conn->ksnc_scheduler = sched;
+
+        list_add(&conn->ksnc_list, &ksocknal_data.ksnd_socklist);
+
+        write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags);
+
+        if (bind_irq &&                         /* irq binding required */
+            irq != 0)                           /* hardware NIC */
+                ksocknal_bind_irq (irq, sched - ksocknal_data.ksnd_schedulers);
+
+        /* NOW it's safe to get called back when socket is ready... */
+        sock->sk->user_data = conn;
+        sock->sk->data_ready = ksocknal_data_ready;
+        sock->sk->write_space = ksocknal_write_space;
+
+        /* ...which I call right now to get things going */
+        ksocknal_data_ready (sock->sk, 0);
+        ksocknal_write_space (sock->sk);
+
+        CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n",
+               conn, conn->ksnc_peernid);
+
+        /* Can't unload while connection active */
+        PORTAL_MODULE_USE;
+        RETURN(0);
+
+error:
+        fput(file);
+        return (ret);
+}
+
+/* Passing in a zero nid will close all connections */
+int
+ksocknal_close_sock(ptl_nid_t nid)
+{
+        long               flags;
+        ksock_conn_t      *conn;
+        LIST_HEAD         (death_row);
+        struct list_head  *tmp;
+
+        LASSERT (!in_interrupt());
+        write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags);
+
+        if (nid == 0) {                         /* close ALL connections */
+                /* insert 'death row' into the socket list... */
+                list_add (&death_row, &ksocknal_data.ksnd_socklist);
+                /* ...extract and reinitialise the socket list itself... */
+                list_del_init (&ksocknal_data.ksnd_socklist);
+                /* ...and voila, death row is the proud owner of all conns */
+        } else list_for_each (tmp, &ksocknal_data.ksnd_socklist) {
+
+                conn = list_entry (tmp, ksock_conn_t, ksnc_list);
+
+                if (conn->ksnc_peernid == nid) {
+                        list_del (&conn->ksnc_list);
+                        list_add (&conn->ksnc_list, &death_row);
+                        break;
+                }
+        }
+
+        write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags);
+
+        if (nid && list_empty (&death_row))
+                return (-ENOENT);
+
+        while (!list_empty (&death_row)) {
+                conn = list_entry (death_row.next, ksock_conn_t, ksnc_list);
+                list_del (&conn->ksnc_list);
+
+                /* NB I _have_ to restore the callback, rather than storing
+                 * a noop, since the socket could survive past this module
+                 * being unloaded!! */
+                conn->ksnc_sock->sk->data_ready = conn->ksnc_saved_data_ready;
+                conn->ksnc_sock->sk->write_space = conn->ksnc_saved_write_space;
+
+                /* OK; no more callbacks, but they could be in progress now,
+                 * so wait for them to complete... */
+                write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags);
+
+                /* ...however if I get the lock before a callback gets it,
+                 * this will make them no-ops
+                 */
+                conn->ksnc_sock->sk->user_data = NULL;
+
+                /* And drop the scheduler's connection count while I've got
+                 * the exclusive lock */
+                conn->ksnc_scheduler->kss_nconns--;
+
+                write_unlock_irqrestore(&ksocknal_data.ksnd_socklist_lock,
+                                        flags);
+
+                ksocknal_put_conn (conn);       /* drop ref for ksnd_socklist */
+        }
+
+        return (0);
+}
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+struct tcp_opt *sock2tcp_opt(struct sock *sk)
+{
+        return &(sk->tp_pinfo.af_tcp);
+}
+#else
+struct tcp_opt *sock2tcp_opt(struct sock *sk)
+{
+        struct tcp_sock *s = (struct tcp_sock *)sk;
+        return &s->tcp;
+}
+#endif
+
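+/* Flush anything Nagle is holding on this connection: briefly force
+ * TCP_NODELAY (which pushes any pending frames) and then restore the
+ * socket's original nonagle setting */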
+void
+ksocknal_push_conn (ksock_conn_t *conn)
+{
+        struct sock    *sk = conn->ksnc_sock->sk;
+        struct tcp_opt *tp = sock2tcp_opt(sk);
+        int             nonagle;
+        int             val = 1;
+        int             rc;
+        mm_segment_t    oldmm;
+
+        lock_sock (sk);
+        nonagle = tp->nonagle;
+        tp->nonagle = 1;
+        release_sock (sk);
+
+        oldmm = get_fs ();
+        set_fs (KERNEL_DS);
+
+        rc = sk->prot->setsockopt (sk, SOL_TCP, TCP_NODELAY,
+                                   (char *)&val, sizeof (val));
+        LASSERT (rc == 0);
+
+        set_fs (oldmm);
+
+        lock_sock (sk);
+        tp->nonagle = nonagle;
+        release_sock (sk);
+}
+
+/* Passing in a zero nid pushes all connections */
+int
+ksocknal_push_sock (ptl_nid_t nid)
+{
+        ksock_conn_t      *conn;
+        struct list_head  *tmp;
+        int                index;
+        int                i;
+
+        if (nid != 0) {
+                conn = ksocknal_get_conn (nid);
+
+                if (conn == NULL)
+                        return (-ENOENT);
+
+                ksocknal_push_conn (conn);
+                ksocknal_put_conn (conn);
+
+                return (0);
+        }
+
+        /* NB we can't remove connections from the socket list so we have to
+         * cope with them being removed from under us...
+         */
+        for (index = 0; ; index++) {
+                read_lock (&ksocknal_data.ksnd_socklist_lock);
+
+                i = 0;
+                conn = NULL;
+
+                list_for_each (tmp, &ksocknal_data.ksnd_socklist) {
+                        if (i++ == index) {
+                                conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+                                atomic_inc (&conn->ksnc_refcount); // take a ref
+                                break;
+                        }
+                }
+
+                read_unlock (&ksocknal_data.ksnd_socklist_lock);
+
+                if (conn == NULL)
+                        break;
+
+                ksocknal_push_conn (conn);
+                ksocknal_put_conn (conn);
+        }
+
+        return (0);
+}
+
+ksock_conn_t *
+ksocknal_get_conn (ptl_nid_t nid)
+{
+        struct list_head *tmp;
+        ksock_conn_t     *conn;
+
+        PROF_START(conn_list_walk);
+
+        read_lock (&ksocknal_data.ksnd_socklist_lock);
+
+        list_for_each(tmp, &ksocknal_data.ksnd_socklist) {
+
+                conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+                if (conn->ksnc_peernid == nid) {
+                        /* caller is referencing */
+                        atomic_inc (&conn->ksnc_refcount);
+
+                        read_unlock (&ksocknal_data.ksnd_socklist_lock);
+
+                        CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n",
+                               conn, nid, atomic_read (&conn->ksnc_refcount));
+
+                        PROF_FINISH(conn_list_walk);
+                        return (conn);
+                }
+        }
+
+        read_unlock (&ksocknal_data.ksnd_socklist_lock);
+
+        CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n",
+               nid);
+        PROF_FINISH(conn_list_walk);
+        return (NULL);
+}
+
+void
+ksocknal_close_conn (ksock_conn_t *conn)
+{
+        CDEBUG (D_NET, "connection [%p] closed \n", conn);
+
+        fput (conn->ksnc_file);
+        PORTAL_FREE (conn, sizeof (*conn));
+
+        /* One less connection keeping us hanging on */
+        PORTAL_MODULE_UNUSE;
+}
+
+void
+_ksocknal_put_conn (ksock_conn_t *conn)
+{
+        unsigned long flags;
+
+        CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn);
+
+        /* "But what is the black spot, captain?" I asked.
+         * "That's a summons, mate..." */
+
+        LASSERT (atomic_read (&conn->ksnc_refcount) == 0);
+        LASSERT (conn->ksnc_sock->sk->data_ready != ksocknal_data_ready);
+        LASSERT (conn->ksnc_sock->sk->write_space != ksocknal_write_space);
+        LASSERT (conn->ksnc_sock->sk->user_data == NULL);
+        LASSERT (!conn->ksnc_rx_scheduled);
+
+        if (!in_interrupt()) {
+                ksocknal_close_conn (conn);
+                return;
+        }
+
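+        /* We're in interrupt context, so we can't fput() the file here;
+         * hand the connection to the reaper thread to close instead */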
+        spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+
+        list_add (&conn->ksnc_list, &ksocknal_data.ksnd_reaper_list);
+        wake_up (&ksocknal_data.ksnd_reaper_waitq);
+
+        spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+}
+
+int
+ksocknal_cmd(struct portal_ioctl_data * data, void * private)
+{
+        int rc = -EINVAL;
+
+        LASSERT (data != NULL);
+
+        switch(data->ioc_nal_cmd) {
+        case NAL_CMD_REGISTER_PEER_FD: {
+                rc = ksocknal_add_sock(data->ioc_nid, data->ioc_fd,
+                                       data->ioc_flags);
+                break;
+        }
+        case NAL_CMD_CLOSE_CONNECTION: {
+                rc = ksocknal_close_sock(data->ioc_nid);
+                break;
+        }
+        case NAL_CMD_REGISTER_MYNID: {
+                rc = ksocknal_set_mynid (data->ioc_nid);
+                break;
+        }
+        case NAL_CMD_PUSH_CONNECTION: {
+                rc = ksocknal_push_sock (data->ioc_nid);
+                break;
+        }
+        }
+
+        return rc;
+}
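+
+/* Rough illustration only (kept as a comment): the command interface above
+ * is expected to be driven via the portals ioctl plumbing, roughly like
+ * this, where 'peer_nid' and 'sock_fd' are placeholders:
+ *
+ *        struct portal_ioctl_data data;
+ *
+ *        memset (&data, 0, sizeof (data));
+ *        data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD;
+ *        data.ioc_nid     = peer_nid;       // peer to associate with the fd
+ *        data.ioc_fd      = sock_fd;        // already-connected TCP socket
+ *        data.ioc_flags   = 1;              // bind_irq
+ *        rc = ksocknal_cmd (&data, NULL);
+ */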
+
+void
+ksocknal_free_buffers (void)
+{
+        if (ksocknal_data.ksnd_fmbs != NULL) {
+                ksock_fmb_t *fmb = (ksock_fmb_t *)ksocknal_data.ksnd_fmbs;
+                int          i;
+                int          j;
+
+                for (i = 0;
+                     i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS);
+                     i++, fmb++)
+                        for (j = 0; j < fmb->fmb_npages; j++)
+                                if (fmb->fmb_pages[j] != NULL)
+                                        __free_page (fmb->fmb_pages[j]);
+
+                PORTAL_FREE (ksocknal_data.ksnd_fmbs,
+                             sizeof (ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS +
+                                                     SOCKNAL_LARGE_FWD_NMSGS));
+        }
+
+        if (ksocknal_data.ksnd_ltxs != NULL)
+                PORTAL_FREE (ksocknal_data.ksnd_ltxs,
+                             sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS +
+                                                     SOCKNAL_NNBLK_LTXS));
+
+        if (ksocknal_data.ksnd_schedulers != NULL)
+                PORTAL_FREE (ksocknal_data.ksnd_schedulers,
+                             sizeof (ksock_sched_t) * SOCKNAL_N_SCHED);
+}
+
+void __exit
+ksocknal_module_fini (void)
+{
+        int   i;
+
+        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        switch (ksocknal_data.ksnd_init) {
+        default:
+                LASSERT (0);
+
+        case SOCKNAL_INIT_ALL:
+                kportal_nal_unregister(SOCKNAL);
+                PORTAL_SYMBOL_UNREGISTER (ksocknal_ni);
+                /* fall through */
+
+        case SOCKNAL_INIT_PTL:
+                PtlNIFini(ksocknal_ni);
+                lib_fini(&ksocknal_lib);
+                /* fall through */
+
+        case SOCKNAL_INIT_DATA:
+                /* Module refcount only gets to zero when all connections
+                 * have been closed so all lists must be empty */
+                LASSERT (list_empty (&ksocknal_data.ksnd_socklist));
+                LASSERT (list_empty (&ksocknal_data.ksnd_reaper_list));
+                LASSERT (list_empty (&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns));
+                LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns));
+
+                if (ksocknal_data.ksnd_schedulers != NULL)
+                        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+                                ksock_sched_t *kss =
+                                        &ksocknal_data.ksnd_schedulers[i];
+
+                                LASSERT (list_empty (&kss->kss_tx_conns));
+                                LASSERT (list_empty (&kss->kss_rx_conns));
+                                LASSERT (kss->kss_nconns == 0);
+                        }
+
+                /* stop router calling me */
+                kpr_shutdown (&ksocknal_data.ksnd_router);
+
+                /* flag threads to terminate; wake and wait for them to die */
+                ksocknal_data.ksnd_shuttingdown = 1;
+                wake_up_all (&ksocknal_data.ksnd_reaper_waitq);
+
+                for (i = 0; i < SOCKNAL_N_SCHED; i++)
+                       wake_up_all(&ksocknal_data.ksnd_schedulers[i].kss_waitq);
+
+                while (atomic_read (&ksocknal_data.ksnd_nthreads) != 0) {
                        CDEBUG (D_NET, "waiting for %d threads to terminate\n",
+                                atomic_read (&ksocknal_data.ksnd_nthreads));
+                        set_current_state (TASK_UNINTERRUPTIBLE);
+                        schedule_timeout (HZ);
+                }
+
+                kpr_deregister (&ksocknal_data.ksnd_router);
+
+                ksocknal_free_buffers();
+                /* fall through */
+
+        case SOCKNAL_INIT_NOTHING:
+                break;
+        }
+
+        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n",
+               atomic_read(&portal_kmemory));
+}
+
+
+int __init
+ksocknal_module_init (void)
+{
+        int   pkmem = atomic_read(&portal_kmemory);
+        int   rc;
+        int   i;
+        int   j;
+
+        /* packet descriptor must fit in a router descriptor's scratchpad */
+        LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t));
+
+        LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+
+        ksocknal_api.forward  = ksocknal_api_forward;
+        ksocknal_api.shutdown = ksocknal_api_shutdown;
+        ksocknal_api.yield    = ksocknal_api_yield;
+        ksocknal_api.validate = NULL;           /* our api validate is a NOOP */
+        ksocknal_api.lock     = ksocknal_api_lock;
+        ksocknal_api.unlock   = ksocknal_api_unlock;
+        ksocknal_api.nal_data = &ksocknal_data;
+
+        ksocknal_lib.nal_data = &ksocknal_data;
+
+        memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */
+
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_socklist);
+        rwlock_init(&ksocknal_data.ksnd_socklist_lock);
+
+        ksocknal_data.ksnd_nal_cb = &ksocknal_lib;
+        spin_lock_init (&ksocknal_data.ksnd_nal_cb_lock);
+
+        spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns);
+
+        spin_lock_init(&ksocknal_data.ksnd_large_fmp.fmp_lock);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_idle_fmbs);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns);
+
+        spin_lock_init(&ksocknal_data.ksnd_idle_ltx_lock);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_nblk_ltx_list);
+        INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_ltx_list);
+        init_waitqueue_head(&ksocknal_data.ksnd_idle_ltx_waitq);
+
+        spin_lock_init (&ksocknal_data.ksnd_reaper_lock);
+        INIT_LIST_HEAD (&ksocknal_data.ksnd_reaper_list);
+        init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq);
+
+        memset (&ksocknal_data.ksnd_irq_info, SOCKNAL_IRQ_UNASSIGNED,
+                sizeof (ksocknal_data.ksnd_irq_info));
+
+        /* flag lists/ptrs/locks initialised */
+        ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
+
+        PORTAL_ALLOC(ksocknal_data.ksnd_schedulers,
+                     sizeof(ksock_sched_t) * SOCKNAL_N_SCHED);
+        if (ksocknal_data.ksnd_schedulers == NULL)
+                RETURN(-ENOMEM);
+
+        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+                ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i];
+
+                spin_lock_init (&kss->kss_lock);
+                INIT_LIST_HEAD (&kss->kss_rx_conns);
+                INIT_LIST_HEAD (&kss->kss_tx_conns);
+#if SOCKNAL_ZC
+                INIT_LIST_HEAD (&kss->kss_zctxdone_list);
+#endif
+                init_waitqueue_head (&kss->kss_waitq);
+        }
+
+        CERROR ("ltx "LPSZ", total "LPSZ"\n", sizeof (ksock_ltx_t),
+                sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+
+        PORTAL_ALLOC(ksocknal_data.ksnd_ltxs,
+                     sizeof(ksock_ltx_t) * (SOCKNAL_NLTXS +SOCKNAL_NNBLK_LTXS));
+        if (ksocknal_data.ksnd_ltxs == NULL) {
+                ksocknal_module_fini ();
+                return (-ENOMEM);
+        }
+
+        /* Deterministic bugs please */
+        memset (ksocknal_data.ksnd_ltxs, 0xeb,
+                sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+
+        for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++) {
+                ksock_ltx_t *ltx = &((ksock_ltx_t *)ksocknal_data.ksnd_ltxs)[i];
+
+                ltx->ltx_idle = i < SOCKNAL_NLTXS ?
+                                &ksocknal_data.ksnd_idle_ltx_list :
+                                &ksocknal_data.ksnd_idle_nblk_ltx_list;
+                list_add (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+        }
+
+        rc = PtlNIInit(ksocknal_init, 32, 4, 0, &ksocknal_ni);
+        if (rc != 0) {
+                CERROR("ksocknal: PtlNIInit failed: error %d\n", rc);
+                ksocknal_module_fini ();
+                RETURN (rc);
+        }
+        PtlNIDebug(ksocknal_ni, ~0);
+
+        ksocknal_data.ksnd_init = SOCKNAL_INIT_PTL; // flag PtlNIInit() called
+
+        for (i = 0; i < SOCKNAL_N_SCHED; i++) {
+                rc = ksocknal_thread_start (ksocknal_scheduler,
+                                            &ksocknal_data.ksnd_schedulers[i]);
+                if (rc != 0) {
+                        CERROR("Can't spawn socknal scheduler[%d]: %d\n",
+                               i, rc);
+                        ksocknal_module_fini ();
+                        RETURN (rc);
+                }
+        }
+
+        rc = ksocknal_thread_start (ksocknal_reaper, NULL);
+        if (rc != 0) {
+                CERROR("Can't spawn socknal reaper: %d\n", rc);
+                ksocknal_module_fini ();
+                RETURN (rc);
+        }
+
+        rc = kpr_register(&ksocknal_data.ksnd_router,
+                          &ksocknal_router_interface);
+        if (rc != 0) {
+                CDEBUG(D_NET, "Can't initialise routing interface "
+                       "(rc = %d): not routing\n", rc);
+        } else {
+                /* Only allocate forwarding buffers if I'm on a gateway */
+
+                PORTAL_ALLOC(ksocknal_data.ksnd_fmbs,
+                             sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS +
+                                                    SOCKNAL_LARGE_FWD_NMSGS));
+                if (ksocknal_data.ksnd_fmbs == NULL) {
+                        ksocknal_module_fini ();
+                        RETURN(-ENOMEM);
+                }
+
+                /* NULL out buffer pointers etc */
+                memset(ksocknal_data.ksnd_fmbs, 0,
+                       sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS +
+                                              SOCKNAL_LARGE_FWD_NMSGS));
+
+                for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS +
+                                 SOCKNAL_LARGE_FWD_NMSGS); i++) {
+                        ksock_fmb_t *fmb =
+                                &((ksock_fmb_t *)ksocknal_data.ksnd_fmbs)[i];
+
+                        if (i < SOCKNAL_SMALL_FWD_NMSGS) {
+                                fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES;
+                                fmb->fmb_pool = &ksocknal_data.ksnd_small_fmp;
+                        } else {
+                                fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES;
+                                fmb->fmb_pool = &ksocknal_data.ksnd_large_fmp;
+                        }
+
+                        LASSERT (fmb->fmb_npages > 0);
+                        for (j = 0; j < fmb->fmb_npages; j++) {
+                                fmb->fmb_pages[j] = alloc_page(GFP_KERNEL);
+
+                                if (fmb->fmb_pages[j] == NULL) {
+                                        ksocknal_module_fini ();
+                                        return (-ENOMEM);
+                                }
+
+                                LASSERT(page_address (fmb->fmb_pages[j]) !=
+                                        NULL);
+                        }
+
+                        list_add(&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs);
+                }
+        }
+
+        rc = kportal_nal_register(SOCKNAL, &ksocknal_cmd, NULL);
+        if (rc != 0) {
+                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+                ksocknal_module_fini ();
+                return (rc);
+        }
+
+        PORTAL_SYMBOL_REGISTER(ksocknal_ni);
+
+        /* flag everything initialised */
+        ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+        printk(KERN_INFO "Routing socket NAL loaded (Routing %s, initial "
+               "mem %d)\n",
+               kpr_routing (&ksocknal_data.ksnd_router) ?
+               "enabled" : "disabled", pkmem);
+
+        return (0);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01");
+MODULE_LICENSE("GPL");
+
+module_init(ksocknal_module_init);
+module_exit(ksocknal_module_fini);
+
+EXPORT_SYMBOL (ksocknal_ni);
diff --git a/lustre/portals/knals/socknal/socknal.h b/lustre/portals/knals/socknal/socknal.h
new file mode 100644 (file)
index 0000000..86cdeb0
--- /dev/null
@@ -0,0 +1,292 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_PORTAL_ALLOC
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_SOCKNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#define SOCKNAL_N_SCHED num_online_cpus()       /* # socknal schedulers */
+
+#if PTL_LARGE_MTU
+# define SOCKNAL_MAX_FWD_PAYLOAD (256<<10)      /* biggest payload I can forward */
+#else
+# define SOCKNAL_MAX_FWD_PAYLOAD (64<<10)       /* biggest payload I can forward */
+#endif
+
+#define SOCKNAL_NLTXS           128             /* # normal transmit messages */
+#define SOCKNAL_NNBLK_LTXS      128             /* # transmit messages reserved if can't block */
+
+#define SOCKNAL_SMALL_FWD_NMSGS 128             /* # small messages I can be forwarding at any time */
+#define SOCKNAL_LARGE_FWD_NMSGS 64              /* # large messages I can be forwarding at any time */
+
+#define SOCKNAL_SMALL_FWD_PAGES 1               /* # pages in a small message fwd buffer */
+
+#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT)
+                                               /* # pages in a large message fwd buffer */
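+
+/* A rough worked example, assuming 4 KB pages and a portals header smaller
+ * than one page: the default 64 KB payload gives
+ * PAGE_ALIGN(sizeof (ptl_hdr_t) + (64<<10)) = 68 KB, i.e. 17 pages per large
+ * forwarding buffer; with PTL_LARGE_MTU the same sum rounds up to 65 pages. */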
+
+#define SOCKNAL_RESCHED         100             /* # scheduler loops before reschedule */
+
+#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sndbuf*8)/10)
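+
+/* A small worked example: with a 64 KB socket send buffer,
+ * SOCKNAL_TX_LOW_WATER(sk) = (65536*8)/10 = 52428 bytes, i.e. roughly 80% of
+ * the buffer; ksocknal_write_space() (not in this hunk) presumably compares
+ * the free send space against this before rescheduling the connection. */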
+
+typedef struct                                  /* pool of forwarding buffers */
+{
+        spinlock_t        fmp_lock;             /* serialise */
+        struct list_head  fmp_idle_fmbs;        /* buffers waiting for a connection */
+        struct list_head  fmp_blocked_conns;    /* connections waiting for a buffer */
+} ksock_fmb_pool_t;
+
+
+typedef struct                                  /* per scheduler state */
+{
+        spinlock_t        kss_lock;             /* serialise */
+        struct list_head  kss_rx_conns;         /* conn waiting to be read */
+        struct list_head  kss_tx_conns;         /* conn waiting to be written */
+#if SOCKNAL_ZC
+        struct list_head  kss_zctxdone_list;    /* completed ZC transmits */
+#endif
+        wait_queue_head_t kss_waitq;            /* where scheduler sleeps */
+        int               kss_nconns;           /* # connections assigned to this scheduler */
+} ksock_sched_t;
+
+typedef struct {
+        int               ksnd_init;            /* initialisation state */
+        
+        struct list_head  ksnd_socklist;        /* all my connections */
+        rwlock_t          ksnd_socklist_lock;   /* stabilise add/find/remove */
+
+        nal_cb_t         *ksnd_nal_cb;
+        spinlock_t        ksnd_nal_cb_lock;     /* lib cli/sti lock */
+
+        atomic_t          ksnd_nthreads;        /* # live threads */
+        int               ksnd_shuttingdown;    /* tell threads to exit */
+        ksock_sched_t    *ksnd_schedulers;      /* scheduler state */
+        
+        kpr_router_t      ksnd_router;          /* THE router */
+
+        void             *ksnd_fmbs;            /* all the pre-allocated FMBs */
+        ksock_fmb_pool_t  ksnd_small_fmp;       /* small message forwarding buffers */
+        ksock_fmb_pool_t  ksnd_large_fmp;       /* large message forwarding buffers */
+
+        void             *ksnd_ltxs;            /* all the pre-allocated LTXs */
+        spinlock_t        ksnd_idle_ltx_lock;   /* serialise ltx alloc/free */
+        struct list_head  ksnd_idle_ltx_list;   /* where to get an idle LTX */
+        struct list_head  ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */
+        wait_queue_head_t ksnd_idle_ltx_waitq;  /* where to block for an idle LTX */
+
+        struct list_head  ksnd_reaper_list;     /* conn waiting to be reaped */
+        wait_queue_head_t ksnd_reaper_waitq;    /* reaper sleeps here */
+        spinlock_t        ksnd_reaper_lock;     /* serialise */
+        unsigned char     ksnd_irq_info[NR_IRQS]; /* irq->scheduler lookup */
+} ksock_nal_data_t;
+
+#define SOCKNAL_INIT_NOTHING    0
+#define SOCKNAL_INIT_DATA       1
+#define SOCKNAL_INIT_PTL        2
+#define SOCKNAL_INIT_ALL        3
+
+#define SOCKNAL_IRQ_BOUND       0x80            /* flag we _did_ bind already */
+#define SOCKNAL_IRQ_SCHED_MASK  0x7f            /* we assume < 127 CPUs */
+#define SOCKNAL_IRQ_UNASSIGNED  0xff            /* flag unassigned */
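+
+/* A minimal sketch of how these flags appear to combine (the table is filled
+ * in elsewhere, not in this hunk): an assigned entry would look like
+ *
+ *      ksnd_irq_info[irq] = SOCKNAL_IRQ_BOUND | (sched_idx & SOCKNAL_IRQ_SCHED_MASK);
+ *      sched_idx          = ksnd_irq_info[irq] & SOCKNAL_IRQ_SCHED_MASK;
+ *
+ * while 0xff (SOCKNAL_IRQ_UNASSIGNED) marks an interrupt for which no
+ * scheduler has been chosen yet. */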
+
+/* A packet just assembled for transmission is represented by 1 or more
+ * struct iovec fragments and 0 or more ptl_kiov_t fragments.  Forwarded
+ * messages, or messages from an MD with PTL_MD_KIOV _not_ set, have 0
+ * ptl_kiov_t fragments.  Messages from an MD with PTL_MD_KIOV set have 1
+ * struct iovec fragment (the header) and up to PTL_MD_MAX_IOV ptl_kiov_t
+ * fragments.
+ *
+ * On the receive side, initially 1 struct iovec fragment is posted for
+ * receive (the header).  Once the header has been received, if the message
+ * requires forwarding or will be received into mapped memory, up to
+ * PTL_MD_MAX_IOV struct iovec fragments describe the target memory.
+ * Otherwise up to PTL_MD_MAX_IOV ptl_kiov_t fragments are used.
+ */
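+
+/* A concrete illustration of the above: a locally sourced message from an MD
+ * with PTL_MD_KIOV set and three ptl_kiov_t page fragments of payload is
+ * described by tx_niov = 1 (the mapped ptl_hdr_t) plus tx_nkiov = 3, whereas
+ * a three-fragment iovec payload gives tx_niov = 4 and tx_nkiov = 0. */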
+
+typedef struct                                  /* transmit packet */
+{
+        struct list_head        tx_list;        /* queue on conn for transmission etc */
+        char                    tx_isfwd;       /* forwarding / sourced here */
+        int                     tx_nob;         /* # packet bytes */
+        int                     tx_niov;        /* # packet iovec frags */
+        struct iovec           *tx_iov;         /* packet iovec frags */
+        int                     tx_nkiov;       /* # packet page frags */
+        ptl_kiov_t             *tx_kiov;        /* packet page frags */
+#if SOCKNAL_ZC        
+        ksock_sched_t          *tx_sched;       /* who to wake on callback */
+        zccd_t                  tx_zccd;        /* zero copy callback descriptor */
+#endif
+} ksock_tx_t;
+
+#define KSOCK_ZCCD_2_TX(ptr)   list_entry (ptr, ksock_tx_t, tx_zccd)
+/* network zero copy callback descriptor embedded in ksock_tx_t */
+
+/* space for the tx frag descriptors: hdr is always 1 iovec
+ * and payload is PTL_MD_MAX of either type. */
+typedef struct
+{
+        struct iovec            hdr;
+        union {
+                struct iovec    iov[PTL_MD_MAX_IOV];
+                ptl_kiov_t      kiov[PTL_MD_MAX_IOV];
+        }                       payload;
+} ksock_txiovspace_t;
+
+typedef struct                                  /* locally transmitted packet */
+{
+        ksock_tx_t              ltx_tx;         /* send info */
+        struct list_head       *ltx_idle;       /* where to put when idle */
+        void                   *ltx_private;    /* lib_finalize() callback arg */
+        void                   *ltx_cookie;     /* lib_finalize() callback arg */
+        ksock_txiovspace_t      ltx_iov_space;  /* where to stash frag descriptors */
+        ptl_hdr_t               ltx_hdr;        /* buffer for packet header */
+} ksock_ltx_t;
+
+#define KSOCK_TX_2_KPR_FWD_DESC(ptr)    list_entry ((kprfd_scratch_t *)ptr, kpr_fwd_desc_t, kprfd_scratch)
+/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */
+
+#define KSOCK_TX_2_KSOCK_LTX(ptr)       list_entry (ptr, ksock_ltx_t, ltx_tx)
+/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */
+
+/* NB list_entry() is used here as a convenient macro for calculating a
+ * pointer to a struct from the address of a member.
+ */
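+
+/* Sketch of the idiom, for reference: list_entry() behaves like
+ * container_of(), roughly
+ *
+ *      #define list_entry(ptr, type, member) \
+ *              ((type *)((char *)(ptr) - offsetof(type, member)))
+ *
+ * so KSOCK_TX_2_KSOCK_LTX(tx) simply backs up by offsetof(ksock_ltx_t, ltx_tx)
+ * to recover the enclosing ksock_ltx_t. */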
+
+typedef struct                                  /* Kernel portals Socket Forwarding message buffer */
+{                                               /* (socknal->router) */
+        struct list_head        fmb_list;       /* queue idle */
+        kpr_fwd_desc_t          fmb_fwd;        /* router's descriptor */
+        int                     fmb_npages;     /* # pages allocated */
+        ksock_fmb_pool_t       *fmb_pool;       /* owning pool */
+        struct page            *fmb_pages[SOCKNAL_LARGE_FWD_PAGES];
+        struct iovec            fmb_iov[SOCKNAL_LARGE_FWD_PAGES];
+} ksock_fmb_t;
+
+/* space for the rx frag descriptors; we either read a single contiguous
+ * header, or PTL_MD_MAX_IOV frags of payload of either type. */
+typedef union {
+        struct iovec    iov[PTL_MD_MAX_IOV];
+        ptl_kiov_t      kiov[PTL_MD_MAX_IOV];
+} ksock_rxiovspace_t;
+
+#define SOCKNAL_RX_HEADER       1               /* reading header */
+#define SOCKNAL_RX_BODY         2               /* reading body (to deliver here) */
+#define SOCKNAL_RX_BODY_FWD     3               /* reading body (to forward) */
+#define SOCKNAL_RX_SLOP         4               /* skipping body */
+#define SOCKNAL_RX_GET_FMB      5               /* scheduled for forwarding */
+#define SOCKNAL_RX_FMB_SLEEP    6               /* blocked waiting for a fwd desc */
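+
+/* A rough sketch of how the receive path in socknal_cb.c drives these states:
+ *
+ *   HEADER --(deliver here)--------> BODY or SLOP -----------> back to HEADER
+ *   HEADER --(needs forwarding)----> GET_FMB --(no idle fmb)-> FMB_SLEEP
+ *   FMB_SLEEP --(fmb freed)--------> GET_FMB --(fmb claimed)-> BODY_FWD
+ *   BODY_FWD --(payload read, kpr_fwd_start())---------------> back to HEADER
+ */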
+
+typedef struct 
+{ 
+        struct list_head    ksnc_list;          /* stash on global socket list */
+        struct file        *ksnc_file;          /* socket filp */
+        struct socket      *ksnc_sock;          /* actual socket */
+        void               *ksnc_saved_data_ready; /* socket's original data_ready() callback */
+        void               *ksnc_saved_write_space; /* socket's original write_space() callback */
+        ptl_nid_t           ksnc_peernid;       /* who's on the other end */
+        atomic_t            ksnc_refcount;      /* # users */
+        ksock_sched_t     *ksnc_scheduler;     /* who schedules this connection */
+        
+        /* READER */
+        struct list_head    ksnc_rx_list;       /* where I enq waiting input or a forwarding descriptor */
+        volatile int        ksnc_rx_ready;      /* data ready to read */
+        int                 ksnc_rx_scheduled;  /* being progressed */
+        int                 ksnc_rx_state;      /* what is being read */
+        int                 ksnc_rx_nob_left;   /* # bytes to next hdr/body  */
+        int                 ksnc_rx_nob_wanted; /* bytes actually wanted */
+        int                 ksnc_rx_niov;       /* # iovec frags */
+        struct iovec       *ksnc_rx_iov;        /* the iovec frags */
+        int                 ksnc_rx_nkiov;      /* # page frags */
+        ptl_kiov_t         *ksnc_rx_kiov;       /* the page frags */
+        ksock_rxiovspace_t  ksnc_rx_iov_space;  /* space for frag descriptors */
+        void               *ksnc_cookie;        /* rx lib_finalize passthru arg */
+        ptl_hdr_t           ksnc_hdr;           /* where I read headers into */
+
+        /* WRITER */
+        struct list_head    ksnc_tx_list;       /* where I enq waiting for output space */
+        struct list_head    ksnc_tx_queue;      /* packets waiting to be sent */
+        volatile int        ksnc_tx_ready;      /* write space */
+        int                 ksnc_tx_scheduled;  /* being progressed */
+
+} ksock_conn_t;
+
+extern int ksocknal_add_sock (ptl_nid_t nid, int fd, int client);
+extern int ksocknal_close_sock(ptl_nid_t nid);
+extern int ksocknal_set_mynid(ptl_nid_t nid);
+extern int ksocknal_push_sock(ptl_nid_t nid);
+extern ksock_conn_t *ksocknal_get_conn (ptl_nid_t nid);
+extern void _ksocknal_put_conn (ksock_conn_t *conn);
+extern void ksocknal_close_conn (ksock_conn_t *conn);
+
+static inline void
+ksocknal_put_conn (ksock_conn_t *conn)
+{
+        CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n",
+                conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount));
+
+        if (atomic_dec_and_test (&conn->ksnc_refcount))
+                _ksocknal_put_conn (conn);
+}
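+
+/* NB ksnc_refcount follows the usual get/put pattern: for example,
+ * ksocknal_launch_packet() in socknal_cb.c takes an extra reference when it
+ * queues a connection on a scheduler, and ksocknal_process_transmit() drops
+ * it with ksocknal_put_conn() once the connection is descheduled; the final
+ * put above hands the connection to _ksocknal_put_conn() for teardown. */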
+
+extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg);
+extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
+extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+extern int ksocknal_scheduler (void *arg);
+extern int ksocknal_reaper (void *arg);
+extern void ksocknal_data_ready(struct sock *sk, int n);
+extern void ksocknal_write_space(struct sock *sk);
+
+
+extern nal_cb_t         ksocknal_lib;
+extern ksock_nal_data_t ksocknal_data;
diff --git a/lustre/portals/knals/socknal/socknal_cb.c b/lustre/portals/knals/socknal/socknal_cb.c
new file mode 100644 (file)
index 0000000..6147d8a
--- /dev/null
@@ -0,0 +1,1613 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socknal.h"
+
+atomic_t   ksocknal_packets_received;
+atomic_t   ksocknal_packets_launched;
+atomic_t   ksocknal_packets_being_sent;
+
+#if SOCKNAL_ZC
+int        ksocknal_do_zc = 1;
+int        ksocknal_zc_min_frag = 2048;
+#endif
+
+/*
+ *  LIB functions follow
+ *
+ */
+int
+ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr,
+              user_ptr src_addr, size_t len)
+{
+        CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr);
+
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+int
+ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr,
+               void *src_addr, size_t len)
+{
+        CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr);
+
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+int
+ksocknal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq,
+                         ptl_event_t *ev)
+{
+        CDEBUG(D_NET, LPX64": callback eq %p ev %p\n",
+               nal->ni.nid, eq, ev);
+
+        if (eq->event_callback != NULL)
+                eq->event_callback(ev);
+
+        return 0;
+}
+
+void *
+ksocknal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+
+        if (buf != NULL)
+                memset(buf, 0, len);
+
+        return (buf);
+}
+
+void
+ksocknal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+void
+ksocknal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+        va_list ap;
+        char msg[256];
+
+        va_start (ap, fmt);
+        vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */
+        va_end (ap);
+
+        msg[sizeof (msg) - 1] = 0;              /* ensure terminated */
+
+        CDEBUG (D_NET, "%s", msg);
+}
+
+void
+ksocknal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+
+        spin_lock(&data->ksnd_nal_cb_lock);
+}
+
+void
+ksocknal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data;
+        data = nal->nal_data;
+
+        spin_unlock(&data->ksnd_nal_cb_lock);
+}
+
+int
+ksocknal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* I would guess that if ksocknal_get_conn(nid) == NULL,
+           and we're not routing, then 'nid' is very distant :) */
+        if ( nal->ni.nid == nid ) {
+                *dist = 0;
+        } else {
+                *dist = 1;
+        }
+
+        return 0;
+}
+
+ksock_ltx_t *
+ksocknal_get_ltx (int may_block)
+{
+        long             flags;
+        ksock_ltx_t *ltx = NULL;
+
+        for (;;) {
+                spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+
+                if (!list_empty (&ksocknal_data.ksnd_idle_ltx_list)) {
+                        ltx = list_entry(ksocknal_data.ksnd_idle_ltx_list.next,
+                                         ksock_ltx_t, ltx_tx.tx_list);
+                        list_del (&ltx->ltx_tx.tx_list);
+                        break;
+                }
+
+                if (!may_block) {
+                        if (!list_empty(&ksocknal_data.ksnd_idle_nblk_ltx_list)) {
+                                ltx = list_entry(ksocknal_data.ksnd_idle_nblk_ltx_list.next,
+                                                 ksock_ltx_t, ltx_tx.tx_list);
+                                list_del (&ltx->ltx_tx.tx_list);
+                        }
+                        break;
+                }
+
+                spin_unlock_irqrestore(&ksocknal_data.ksnd_idle_ltx_lock,
+                                       flags);
+
+                wait_event (ksocknal_data.ksnd_idle_ltx_waitq,
+                            !list_empty (&ksocknal_data.ksnd_idle_ltx_list));
+        }
+
+        spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+
+        return (ltx);
+}
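+
+/* NB the two idle lists split the SOCKNAL_NLTXS blockable descriptors from
+ * the SOCKNAL_NNBLK_LTXS reserve: callers that must not sleep (ACKs, REPLYs,
+ * anything in interrupt context -- see ksocknal_setup_hdr() below) pass
+ * may_block = 0 and dip into the reserve list rather than waiting on
+ * ksnd_idle_ltx_waitq. */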
+
+#if SOCKNAL_ZC
+struct page *
+ksocknal_kvaddr_to_page (unsigned long vaddr)
+{
+        struct page *page;
+
+        if (vaddr >= VMALLOC_START &&
+            vaddr < VMALLOC_END)
+                page = vmalloc_to_page ((void *)vaddr);
+#if CONFIG_HIGHMEM
+        else if (vaddr >= PKMAP_BASE &&
+                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
+                page = vmalloc_to_page ((void *)vaddr);
+                /* in 2.4 ^ just walks the page tables */
+#endif
+        else
+                page = virt_to_page (vaddr);
+
+        if (page == NULL ||
+            !VALID_PAGE (page))
+                return (NULL);
+
+        return (page);
+}
+#endif
+
+int
+ksocknal_send_iov (struct socket *sock, ksock_tx_t *tx, int more)
+{
+        struct iovec  *iov = tx->tx_iov;
+        int            fragsize = iov->iov_len;
+        unsigned long  vaddr = (unsigned long)iov->iov_base;
+#if SOCKNAL_ZC
+        int            offset = vaddr & (PAGE_SIZE - 1);
+        int            zcsize = MIN (fragsize, PAGE_SIZE - offset);
+        struct page   *page;
+#endif
+        int            rc;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only send 1 frag at a time. */
+        LASSERT (fragsize <= tx->tx_nob);
+        LASSERT (tx->tx_niov > 0);
+        more |= (tx->tx_niov > 1);
+        
+#if SOCKNAL_ZC
+        if (ksocknal_do_zc &&
+            (sock->sk->route_caps & NETIF_F_SG) &&
+            (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
+            zcsize >= ksocknal_zc_min_frag &&
+            (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) {
+                
+                CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n",
+                       (void *)vaddr, page, page_address(page), offset, zcsize);
+
+                more |= (zcsize < fragsize);
+
+                rc = tcp_sendpage_zccd(sock, page, offset, zcsize, 
+                                       more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT,
+                                       &tx->tx_zccd);
+        } else
+#endif
+        {
+                /* NB don't pass tx's iov; sendmsg may or may not update it */
+                struct iovec fragiov = { .iov_base = (void *)vaddr,
+                                         .iov_len  = fragsize};
+                struct msghdr msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &fragiov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT
+                };
+                mm_segment_t oldmm = get_fs();
+                
+                set_fs (KERNEL_DS);
+                rc = sock->sk->prot->sendmsg(sock->sk, &msg, fragsize);
+                set_fs (oldmm);
+        } 
+
+        if (rc <= 0)
+                return (rc);
+
+        tx->tx_nob -= rc;
+
+        if (rc < fragsize) {
+                /* didn't send whole frag */
+                iov->iov_base = (void *)(vaddr + rc);
+                iov->iov_len  = fragsize - rc;
+                return (-EAGAIN);
+        }
+
+        /* everything went */
+        LASSERT (rc == fragsize);
+        tx->tx_iov++;
+        tx->tx_niov--;
+        return (1);
+}
+
+int
+ksocknal_send_kiov (struct socket *sock, ksock_tx_t *tx, int more)
+{
+        ptl_kiov_t    *kiov = tx->tx_kiov;
+        int            fragsize = kiov->kiov_len;
+        struct page   *page = kiov->kiov_page;
+        int            offset = kiov->kiov_offset;
+        int            rc;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only send 1 frag at a time. */
+        LASSERT (fragsize <= tx->tx_nob);
+        LASSERT (offset + fragsize <= PAGE_SIZE);
+        LASSERT (tx->tx_nkiov > 0);
+        more |= (tx->tx_nkiov > 1);
+
+#if SOCKNAL_ZC
+        if (ksocknal_do_zc &&
+            (sock->sk->route_caps & NETIF_F_SG) &&
+            (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
+            fragsize >= ksocknal_zc_min_frag) {
+
+                CDEBUG(D_NET, "page %p + offset %x for %d\n",
+                               page, offset, fragsize);
+
+                rc = tcp_sendpage_zccd(sock, page, offset, fragsize,
+                                       more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT,
+                                       &tx->tx_zccd);
+        } else
+#endif
+        {
+                char *addr = ((char *)kmap (page)) + offset;
+                struct iovec fragiov = {.iov_base = addr,
+                                        .iov_len  = fragsize};
+                struct msghdr msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &fragiov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT
+                };
+                mm_segment_t  oldmm = get_fs();
+                
+                set_fs (KERNEL_DS);
+                rc = sock->sk->prot->sendmsg(sock->sk, &msg, fragsize);
+                set_fs (oldmm);
+                kunmap (page);
+        }
+
+        if (rc <= 0)
+                return (rc);
+
+        tx->tx_nob -= rc;
+
+        if (rc < fragsize) {
+                /* didn't send whole frag */
+                kiov->kiov_offset = offset + rc;
+                kiov->kiov_len    = fragsize - rc;
+                return (-EAGAIN);
+        }
+
+        /* everything went */
+        LASSERT (rc == fragsize);
+        tx->tx_kiov++;
+        tx->tx_nkiov--;
+        return (1);
+}
+
+int
+ksocknal_sendmsg (struct socket *sock, ksock_tx_t *tx, int more)
+{
+        int    rc;
+        int    sent_some = 0;
+        ENTRY;
+        
+        LASSERT (!in_interrupt());
+
+        for (;;) {
+                if (tx->tx_niov != 0)
+                        rc = ksocknal_send_iov (sock, tx, more || tx->tx_nkiov != 0);
+                else
+                        rc = ksocknal_send_kiov (sock, tx, more);
+
+                /* Interpret a zero rc the same as -EAGAIN (Adaptec TOE) */
+                if (rc <= 0)                    /* error or partial send */
+                        RETURN ((sent_some || rc == -EAGAIN) ? 0 : rc);
+                
+                if (tx->tx_nob == 0)            /* sent everything */
+                        RETURN (0);
+
+                sent_some = 1;
+        }
+}
+
+int
+ksocknal_recv_iov (ksock_conn_t *conn)
+{
+        struct iovec *iov = conn->ksnc_rx_iov;
+        int           fragsize  = iov->iov_len;
+        unsigned long vaddr = (unsigned long)iov->iov_base;
+        struct iovec  fragiov = { .iov_base = (void *)vaddr,
+                                  .iov_len  = fragsize};
+        struct msghdr msg = {
+                .msg_name       = NULL,
+                .msg_namelen    = 0,
+                .msg_iov        = &fragiov,
+                .msg_iovlen     = 1,
+                .msg_control    = NULL,
+                .msg_controllen = 0,
+                .msg_flags      = 0
+        };
+        mm_segment_t oldmm = get_fs();
+        int          rc;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only receive 1 frag at a time. */
+        LASSERT (conn->ksnc_rx_niov > 0);
+        LASSERT (fragsize <= conn->ksnc_rx_nob_wanted);
+        
+        set_fs (KERNEL_DS);
+        rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT);
+        /* NB this is just a boolean............................^ */
+        set_fs (oldmm);
+
+        if (rc <= 0)
+                return (rc);
+
+        conn->ksnc_rx_nob_wanted -= rc;
+        conn->ksnc_rx_nob_left -= rc;
+                
+        if (rc < fragsize) {
+                iov->iov_base = (void *)(vaddr + rc);
+                iov->iov_len = fragsize - rc;
+                return (-EAGAIN);
+        }
+
+        LASSERT (rc == fragsize);
+        conn->ksnc_rx_iov++;
+        conn->ksnc_rx_niov--;
+        return (1);
+}
+
+int
+ksocknal_recv_kiov (ksock_conn_t *conn)
+{
+        ptl_kiov_t   *kiov = conn->ksnc_rx_kiov;
+        struct page  *page = kiov->kiov_page;
+        int           offset = kiov->kiov_offset;
+        int           fragsize = kiov->kiov_len;
+        unsigned long vaddr = ((unsigned long)kmap (page)) + offset;
+        struct iovec  fragiov = { .iov_base = (void *)vaddr,
+                                  .iov_len  = fragsize};
+        struct msghdr msg = {
+                .msg_name       = NULL,
+                .msg_namelen    = 0,
+                .msg_iov        = &fragiov,
+                .msg_iovlen     = 1,
+                .msg_control    = NULL,
+                .msg_controllen = 0,
+                .msg_flags      = 0
+        };
+        mm_segment_t oldmm = get_fs();
+        int          rc;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only receive 1 frag at a time. */
+        LASSERT (fragsize <= conn->ksnc_rx_nob_wanted);
+        LASSERT (conn->ksnc_rx_nkiov > 0);
+        LASSERT (offset + fragsize <= PAGE_SIZE);
+        
+        set_fs (KERNEL_DS);
+        rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT);
+        /* NB this is just a boolean............................^ */
+        set_fs (oldmm);
+        kunmap (page);
+        
+        if (rc <= 0)
+                return (rc);
+        
+        conn->ksnc_rx_nob_wanted -= rc;
+        conn->ksnc_rx_nob_left -= rc;
+                
+        if (rc < fragsize) {
+                kiov->kiov_offset = offset + rc;
+                kiov->kiov_len = fragsize - rc;
+                return (-EAGAIN);
+        }
+
+        LASSERT (rc == fragsize);
+        conn->ksnc_rx_kiov++;
+        conn->ksnc_rx_nkiov--;
+        return (1);
+}
+
+int
+ksocknal_recvmsg (ksock_conn_t *conn) 
+{
+        int    rc;
+        int    got_some = 0;
+        ENTRY;
+        
+        LASSERT (!in_interrupt ());
+
+        for (;;) {
+                LASSERT (conn->ksnc_rx_nob_wanted > 0);
+                
+                if (conn->ksnc_rx_niov != 0)
+                        rc = ksocknal_recv_iov (conn);
+                else
+                        rc = ksocknal_recv_kiov (conn);
+
+                /* CAVEAT EMPTOR: we return...
+                 * <= 0 for error (0 == EOF) and > 0 for success (unlike sendmsg()) */
+
+                if (rc <= 0)                    /* error/EOF or partial receive */
+                        RETURN ((got_some || rc == -EAGAIN) ? 1 : rc);
+                
+                if (conn->ksnc_rx_nob_wanted == 0)
+                        RETURN (1);
+
+                got_some = 1;   /* made progress; don't report a later -EAGAIN as failure */
+        }
+}
+
+#if SOCKNAL_ZC
+void
+ksocknal_zc_callback (zccd_t *zcd)
+{
+        ksock_tx_t    *tx = KSOCK_ZCCD_2_TX(zcd);
+        ksock_sched_t *sched = tx->tx_sched;
+        unsigned long  flags;
+        ENTRY;
+
+        /* Schedule tx for cleanup (can't do it now due to lock conflicts) */
+
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list);
+        if (waitqueue_active (&sched->kss_waitq))
+                wake_up (&sched->kss_waitq);
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+        EXIT;
+}
+#endif
+
+void
+ksocknal_tx_done (ksock_tx_t *tx)
+{
+        long           flags;
+        ksock_ltx_t   *ltx;
+        ENTRY;
+
+        atomic_dec (&ksocknal_packets_being_sent);
+
+        if (tx->tx_isfwd) {             /* was a forwarded packet? */
+                kpr_fwd_done (&ksocknal_data.ksnd_router,
+                              KSOCK_TX_2_KPR_FWD_DESC (tx), 0);
+                EXIT;
+                return;
+        }
+
+        /* local send */
+        ltx = KSOCK_TX_2_KSOCK_LTX (tx);
+
+        lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie);
+
+        spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+
+        list_add_tail (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+
+        /* normal tx desc => wakeup anyone blocking for one */
+        if (ltx->ltx_idle == &ksocknal_data.ksnd_idle_ltx_list &&
+            waitqueue_active (&ksocknal_data.ksnd_idle_ltx_waitq))
+                wake_up (&ksocknal_data.ksnd_idle_ltx_waitq);
+
+        spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags);
+        EXIT;
+}
+
+void
+ksocknal_process_transmit (ksock_sched_t *sched, long *irq_flags)
+{
+        ksock_conn_t *conn;
+        ksock_tx_t *tx;
+        int         rc;
+
+        LASSERT (!list_empty (&sched->kss_tx_conns));
+        conn = list_entry(sched->kss_tx_conns.next, ksock_conn_t, ksnc_tx_list);
+        list_del (&conn->ksnc_tx_list);
+
+        LASSERT (conn->ksnc_tx_scheduled);
+        LASSERT (conn->ksnc_tx_ready);
+        LASSERT (!list_empty (&conn->ksnc_tx_queue));
+        tx = list_entry (conn->ksnc_tx_queue.next, ksock_tx_t, tx_list);
+        /* assume transmit will complete now, so dequeue while I've got lock */
+        list_del (&tx->tx_list);
+
+        spin_unlock_irqrestore (&sched->kss_lock, *irq_flags);
+
+        LASSERT (tx->tx_nob > 0);
+
+        conn->ksnc_tx_ready = 0;/* write_space may race with me and set ready */
+        mb();                   /* => clear BEFORE trying to write */
+
+        rc = ksocknal_sendmsg (conn->ksnc_sock, tx, 
+                               !list_empty (&conn->ksnc_tx_queue)); /* more to come? */
+
+        CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc);
+
+        if (rc != 0) {
+#warning FIXME: handle socket errors properly
+                CERROR("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc);
+                /* kid on for now the whole packet went.
+                 * NB when we handle the error better, we'll still need to
+                 * block for zccd completion.
+                 */
+                tx->tx_nob = 0;
+        }
+
+        if (tx->tx_nob == 0)                    /* nothing left to send */
+        {
+                /* everything went; assume more can go, so prevent write_space locking */
+                conn->ksnc_tx_ready = 1;
+
+                ksocknal_put_conn (conn);       /* release packet's ref */
+                atomic_inc (&ksocknal_packets_being_sent);
+#if SOCKNAL_ZC
+                if (atomic_read (&tx->tx_zccd.zccd_count) != 1) {
+                        /* zccd skbufs are still in-flight.  Release my
+                         * initial ref on zccd, so callback can occur */
+                        zccd_put (&tx->tx_zccd);
+                } else
+#endif
+                        ksocknal_tx_done (tx);
+
+                spin_lock_irqsave (&sched->kss_lock, *irq_flags);
+        } else {
+                spin_lock_irqsave (&sched->kss_lock, *irq_flags);
+
+                /* back onto HEAD of tx_queue */
+                list_add (&tx->tx_list, &conn->ksnc_tx_queue);
+        }
+
+        if (!conn->ksnc_tx_ready ||             /* no space to write now */
+            list_empty (&conn->ksnc_tx_queue)) {/* nothing to write */
+                conn->ksnc_tx_scheduled = 0;    /* not being scheduled */
+                ksocknal_put_conn (conn);       /* release scheduler's ref */
+        } else                                 /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns);
+}
+
+void
+ksocknal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        unsigned long  flags;
+        ksock_sched_t *sched = conn->ksnc_scheduler;
+
+        /* Ensure the frags we've been given EXACTLY match the number of
+         * bytes we want to send.  Many TCP/IP stacks disregard any total
+         * size parameters passed to them and just look at the frags. 
+         *
+         * We always expect at least 1 mapped fragment containing the
+         * complete portals header.
+         */
+        LASSERT (lib_iov_nob (tx->tx_niov, tx->tx_iov) +
+                 lib_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob);
+        LASSERT (tx->tx_niov >= 1);
+        LASSERT (tx->tx_iov[0].iov_len >= sizeof (ptl_hdr_t));
+        
+        CDEBUG (D_NET, "type %d, nob %d niov %d nkiov %d\n",
+                ((ptl_hdr_t *)tx->tx_iov[0].iov_base)->type, tx->tx_nob, 
+                tx->tx_niov, tx->tx_nkiov);
+
+#if SOCKNAL_ZC
+        zccd_init (&tx->tx_zccd, ksocknal_zc_callback);
+        /* NB this sets 1 ref on zccd, so the callback can only occur
+         * after I've released this ref */
+        tx->tx_sched = sched;
+#endif
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
+
+        if (conn->ksnc_tx_ready &&              /* able to send */
+            !conn->ksnc_tx_scheduled) {          /* not scheduled to send */
+                list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns);
+                conn->ksnc_tx_scheduled = 1;
+                atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */
+                if (waitqueue_active (&sched->kss_waitq))
+                        wake_up (&sched->kss_waitq);
+        }
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+        atomic_inc (&ksocknal_packets_launched);
+}
+
+ksock_conn_t *
+ksocknal_send_target (ptl_nid_t nid) 
+{
+        ptl_nid_t     gatewaynid;
+        ksock_conn_t *conn;
+        int           rc;
+
+        if ((conn = ksocknal_get_conn (nid)) == NULL) {
+                /* It's not a peer; try to find a gateway */
+                rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, &gatewaynid);
+                if (rc != 0) {
+                        CERROR("Can't route to "LPX64": router error %d\n",
+                               nid, rc);
+                        return (NULL);
+                }
+
+                if ((conn = ksocknal_get_conn (gatewaynid)) == NULL) {
+                        CERROR ("Can't route to "LPX64": gateway "LPX64
+                                " is not a peer\n", nid, gatewaynid);
+                        return (NULL);
+                }
+        }
+
+        return (conn);
+}
+
+ksock_ltx_t *
+ksocknal_setup_hdr (nal_cb_t *nal, void *private, lib_msg_t *cookie, 
+                    ptl_hdr_t *hdr, int type)
+{
+        ksock_ltx_t  *ltx;
+
+        /* I may not block for a transmit descriptor if I might block the
+         * receiver, or an interrupt handler. */
+        ltx = ksocknal_get_ltx (!(type == PTL_MSG_ACK ||
+                                  type == PTL_MSG_REPLY ||
+                                  in_interrupt ()));
+        if (ltx == NULL) {
+                CERROR ("Can't allocate tx desc\n");
+                return (NULL);
+        }
+
+        /* Init local send packet (storage for hdr, finalize() args) */
+        ltx->ltx_hdr = *hdr;
+        ltx->ltx_private = private;
+        ltx->ltx_cookie = cookie;
+        
+        /* Init common ltx_tx */
+        ltx->ltx_tx.tx_isfwd = 0;
+        ltx->ltx_tx.tx_nob = sizeof (*hdr);
+
+        /* We always have 1 mapped frag for the header */
+        ltx->ltx_tx.tx_niov = 1;
+        ltx->ltx_tx.tx_iov = &ltx->ltx_iov_space.hdr;
+        ltx->ltx_tx.tx_iov[0].iov_base = &ltx->ltx_hdr;
+        ltx->ltx_tx.tx_iov[0].iov_len = sizeof (ltx->ltx_hdr);
+
+        ltx->ltx_tx.tx_kiov  = NULL;
+        ltx->ltx_tx.tx_nkiov = 0;
+
+        return (ltx);
+}
+
+int
+ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie,
+               ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+               unsigned int payload_niov, struct iovec *payload_iov,
+               size_t payload_len)
+{
+        ksock_ltx_t  *ltx;
+        ksock_conn_t *conn;
+
+        /* NB 'private' is different depending on what we're sending.
+         * Just ignore it until we can rely on it
+         *
+         * Also, the return code from this procedure is ignored.
+         * If we can't send, we must still complete with lib_finalize().
+         * We'll have to wait for 3.2 to return an error event.
+         */
+
+        CDEBUG(D_NET,
+               "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64
+               " pid %d\n", payload_len, payload_niov, nid, pid);
+
+        conn = ksocknal_send_target (nid);
+        if (conn == NULL) {
+                lib_finalize (&ksocknal_lib, private, cookie);
+                return (-1);
+        }
+
+        ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type);
+        if (ltx == NULL) {
+                ksocknal_put_conn (conn);
+                lib_finalize (&ksocknal_lib, private, cookie);
+                return (-1);
+        }
+
+        /* append the payload_iovs to the one pointing at the header */
+        LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        memcpy (ltx->ltx_tx.tx_iov + 1, payload_iov,
+                payload_niov * sizeof (*payload_iov));
+        ltx->ltx_tx.tx_niov = 1 + payload_niov;
+        ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len;
+
+        ksocknal_launch_packet (conn, &ltx->ltx_tx);
+        return (0);
+}
+
+int
+ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, 
+                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+                     unsigned int payload_niov, ptl_kiov_t *payload_iov, size_t payload_len)
+{
+        ksock_ltx_t *ltx;
+        ksock_conn_t *conn;
+        
+        /* NB 'private' is different depending on what we're sending.
+         * Just ignore it until we can rely on it */
+
+        CDEBUG(D_NET,
+               "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64" pid %d\n",
+               payload_len, payload_niov, nid, pid);
+
+        conn = ksocknal_send_target (nid);
+        if (conn == NULL)
+                return (-1);
+
+        ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type);
+        if (ltx == NULL) {
+                ksocknal_put_conn (conn);
+                return (-1);
+        }
+
+        LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+        
+        ltx->ltx_tx.tx_kiov = ltx->ltx_iov_space.payload.kiov;
+        memcpy (ltx->ltx_tx.tx_kiov, payload_iov, 
+                payload_niov * sizeof (*payload_iov));
+        ltx->ltx_tx.tx_nkiov = payload_niov;
+        ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len;
+
+        ksocknal_launch_packet (conn, &ltx->ltx_tx);
+        return (0);
+}
+
+void
+ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        ksock_conn_t *conn;
+        ptl_nid_t     nid = fwd->kprfd_gateway_nid;
+        ksock_tx_t   *tx  = (ksock_tx_t *)&fwd->kprfd_scratch;
+
+        CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd,
+                fwd->kprfd_gateway_nid, fwd->kprfd_target_nid);
+
+        /* I'm the gateway; must be the last hop */
+        if (nid == ksocknal_lib.ni.nid)
+                nid = fwd->kprfd_target_nid;
+
+        conn = ksocknal_get_conn (nid);
+        if (conn == NULL) {
+                CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid);
+                kpr_fwd_done (&ksocknal_data.ksnd_router, fwd, -EHOSTUNREACH);
+                return;
+        }
+
+        /* This forward has now got a ref on conn */
+
+        tx->tx_isfwd = 1;                   /* This is a forwarding packet */
+        tx->tx_nob   = fwd->kprfd_nob;
+        tx->tx_niov  = fwd->kprfd_niov;
+        tx->tx_iov   = fwd->kprfd_iov;
+        tx->tx_nkiov = 0;
+        tx->tx_kiov  = NULL;
+        
+        ksocknal_launch_packet (conn, tx);
+}
+
+int
+ksocknal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long    pid = kernel_thread (fn, arg, 0);
+
+        if (pid < 0)
+                return ((int)pid);
+
+        atomic_inc (&ksocknal_data.ksnd_nthreads);
+        return (0);
+}
+
+void
+ksocknal_thread_fini (void)
+{
+        atomic_dec (&ksocknal_data.ksnd_nthreads);
+}
+
+void
+ksocknal_fmb_callback (void *arg, int error)
+{
+        ksock_fmb_t       *fmb = (ksock_fmb_t *)arg;
+        ksock_fmb_pool_t  *fmp = fmb->fmb_pool;
+        ptl_hdr_t         *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]);
+        ksock_conn_t      *conn = NULL;
+        ksock_sched_t     *sched;
+        long               flags;
+
+        if (error != 0)
+                CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n",
+                       NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),
+                       error);
+        else
+                CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": OK\n",
+                        NTOH__u64 (hdr->src_nid), NTOH__u64 (hdr->dest_nid));
+
+        spin_lock_irqsave (&fmp->fmp_lock, flags);
+
+        list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs);
+
+        if (!list_empty (&fmp->fmp_blocked_conns)) {
+                conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next,
+                                   ksock_conn_t, ksnc_rx_list);
+                list_del (&conn->ksnc_rx_list);
+        }
+
+        spin_unlock_irqrestore (&fmp->fmp_lock, flags);
+
+        if (conn == NULL)
+                return;
+
+        CDEBUG (D_NET, "Scheduling conn %p\n", conn);
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP);
+
+        conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;
+
+        sched = conn->ksnc_scheduler;
+
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns);
+
+        if (waitqueue_active (&sched->kss_waitq))
+                wake_up (&sched->kss_waitq);
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+}
+
+ksock_fmb_t *
+ksocknal_get_idle_fmb (ksock_conn_t *conn)
+{
+        int               payload_nob = conn->ksnc_rx_nob_left;
+        int               packet_nob = sizeof (ptl_hdr_t) + payload_nob;
+        long              flags;
+        ksock_fmb_pool_t *pool;
+        ksock_fmb_t      *fmb;
+
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
+        LASSERT (ksocknal_data.ksnd_fmbs != NULL);
+
+        if (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE)
+                pool = &ksocknal_data.ksnd_small_fmp;
+        else
+                pool = &ksocknal_data.ksnd_large_fmp;
+
+        spin_lock_irqsave (&pool->fmp_lock, flags);
+
+        if (!list_empty (&pool->fmp_idle_fmbs)) {
+                fmb = list_entry(pool->fmp_idle_fmbs.next,
+                                 ksock_fmb_t, fmb_list);
+                list_del (&fmb->fmb_list);
+                spin_unlock_irqrestore (&pool->fmp_lock, flags);
+
+                return (fmb);
+        }
+
+        /* deschedule until fmb free */
+
+        conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP;
+
+        list_add_tail (&conn->ksnc_rx_list,
+                       &pool->fmp_blocked_conns);
+
+        spin_unlock_irqrestore (&pool->fmp_lock, flags);
+        return (NULL);
+}
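+
+/* Worked size example, assuming 4 KB pages: a forwarded packet whose header
+ * plus payload fits in SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE = 4096 bytes draws
+ * from the single-page ksnd_small_fmp pool; anything bigger, up to
+ * SOCKNAL_MAX_FWD_PAYLOAD, comes from the multi-page ksnd_large_fmp pool. */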
+
+
+int
+ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb)
+{
+        int payload_nob = conn->ksnc_rx_nob_left;
+        int packet_nob = sizeof (ptl_hdr_t) + payload_nob;
+        ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid);
+        int niov;                               /* at least the header */
+        int nob;
+
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
+        LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left);
+        LASSERT (payload_nob >= 0);
+        LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE);
+        LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE);
+
+        /* Got a forwarding buffer; copy the header we just read into the
+         * forwarding buffer.  If there's payload, start reading it
+         * into the buffer, otherwise the forwarding buffer can be kicked
+         * off immediately.
+         *
+         * NB fmb->fmb_iov spans the WHOLE packet.
+         *    conn->ksnc_rx_iov spans just the payload.
+         */
+
+        fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]);
+
+        /* copy header */
+        memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t));
+
+        if (payload_nob == 0) {         /* got complete packet already */
+                atomic_inc (&ksocknal_packets_received);
+
+                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n",
+                        conn, NTOH__u64 (conn->ksnc_hdr.src_nid),
+                        dest_nid, packet_nob);
+
+                fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t);
+
+                kpr_fwd_init (&fmb->fmb_fwd, dest_nid,
+                              packet_nob, 1, fmb->fmb_iov,
+                              ksocknal_fmb_callback, fmb);
+
+                /* forward it now */
+                kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd);
+
+                ksocknal_new_packet (conn, 0);  /* on to next packet */
+                return (1);
+        }
+
+        niov = 1;
+        if (packet_nob <= PAGE_SIZE) {  /* whole packet fits in first page */
+                fmb->fmb_iov[0].iov_len = packet_nob;
+        } else {
+                fmb->fmb_iov[0].iov_len = PAGE_SIZE;
+                nob = packet_nob - PAGE_SIZE;
+
+                do {
+                        LASSERT (niov < fmb->fmb_npages);
+                        fmb->fmb_iov[niov].iov_base =
+                                page_address (fmb->fmb_pages[niov]);
+                        fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob);
+                        nob -= PAGE_SIZE;
+                        niov++;
+                } while (nob > 0);
+        }
+
+        kpr_fwd_init (&fmb->fmb_fwd, dest_nid,
+                      packet_nob, niov, fmb->fmb_iov,
+                      ksocknal_fmb_callback, fmb);
+
+        /* stash router's descriptor ready for call to kpr_fwd_start */
+        conn->ksnc_cookie = &fmb->fmb_fwd;
+
+        conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */
+
+        /* payload is desc's iov-ed buffer, but skipping the hdr */
+        LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) /
+                 sizeof (struct iovec));
+
+        conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+        conn->ksnc_rx_iov[0].iov_base =
+                (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) +
+                         sizeof (ptl_hdr_t));
+        conn->ksnc_rx_iov[0].iov_len =
+                fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t);
+
+        if (niov > 1)
+                memcpy(&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1],
+                       (niov - 1) * sizeof (struct iovec));
+
+        conn->ksnc_rx_niov = niov;
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn,
+                NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, payload_nob);
+        return (0);
+}
+
+void
+ksocknal_fwd_parse (ksock_conn_t *conn)
+{
+        ksock_conn_t *conn2;
+        ptl_nid_t     dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid);
+        int           body_len = NTOH__u32 (PTL_HDR_LENGTH(&conn->ksnc_hdr));
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn,
+                NTOH__u64 (conn->ksnc_hdr.src_nid),
+                dest_nid, conn->ksnc_rx_nob_left);
+
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER);
+        LASSERT (conn->ksnc_rx_scheduled);
+
+        if (body_len < 0) {                 /* length corrupt (overflow) */
+                CERROR("dropping packet from "LPX64" for "LPX64": packet "
+                       "size %d illegal\n", NTOH__u64 (conn->ksnc_hdr.src_nid),
+                       dest_nid, body_len);
+                ksocknal_new_packet (conn, 0);          /* on to new packet */
+                return;
+        }
+
+        if (ksocknal_data.ksnd_fmbs == NULL) {        /* not forwarding */
+                CERROR("dropping packet from "LPX64" for "LPX64": not "
+                       "forwarding\n", NTOH__u64 (conn->ksnc_hdr.src_nid),
+                       dest_nid);
+                /* on to new packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        if (body_len > SOCKNAL_MAX_FWD_PAYLOAD) {      /* too big to forward */
+                CERROR ("dropping packet from "LPX64" for "LPX64
+                        ": packet size %d too big\n",
+                        NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, body_len);
+                /* on to new packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        /* should have gone direct */
+        conn2 = ksocknal_get_conn (dest_nid);
+        if (conn2 != NULL) {
+                CERROR ("dropping packet from "LPX64" for "LPX64
+                        ": target is a peer\n",
+                        NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid);
+                ksocknal_put_conn (conn2);  /* drop ref from get above */
+
+                /* on to next packet (skip this one's body) */
+                ksocknal_new_packet (conn, body_len);
+                return;
+        }
+
+        conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;       /* Getting FMB now */
+        conn->ksnc_rx_nob_left = body_len;              /* stash packet size */
+        conn->ksnc_rx_nob_wanted = body_len;            /* (no slop) */
+}
+
+int
+ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+{
+        static char ksocknal_slop_buffer[4096];
+
+        int   nob;
+        int   niov;
+        int   skipped;
+
+        if (nob_to_skip == 0) {         /* right at next packet boundary now */
+                conn->ksnc_rx_state = SOCKNAL_RX_HEADER;
+                conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t);
+
+                conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+                conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr;
+                conn->ksnc_rx_iov[0].iov_len  = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_niov = 1;
+
+                conn->ksnc_rx_kiov = NULL;
+                conn->ksnc_rx_nkiov = 0;
+                return (1);
+        }
+
+        /* Set up to skip as much as possible now.  If there's more left
+         * (we ran out of iov entries) we'll get called again */
+
+        conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+        conn->ksnc_rx_nob_left = nob_to_skip;
+        conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+        skipped = 0;
+        niov = 0;
+
+        do {
+                nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer));
+
+                conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer;
+                conn->ksnc_rx_iov[niov].iov_len  = nob;
+                niov++;
+                skipped += nob;
+                nob_to_skip -= nob;
+
+        } while (nob_to_skip != 0 &&    /* mustn't overflow conn's rx iov */
+                 niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec));
+
+        conn->ksnc_rx_niov = niov;
+        conn->ksnc_rx_kiov = NULL;
+        conn->ksnc_rx_nkiov = 0;
+        conn->ksnc_rx_nob_wanted = skipped;
+        return (0);
+}
+
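+/* One pass of the per-connection receive state machine, called by the
+ * scheduler with kss_lock held.  Dequeue the first connection waiting for
+ * input, read as much as the socket will give, then act on ksnc_rx_state:
+ * parse a completed header (delivering locally or forwarding), finalize a
+ * completed body, skip slop, or hand a forwarded body to the router.  The
+ * connection is requeued if more data may be ready. */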
+void
+ksocknal_process_receive (ksock_sched_t *sched, long *irq_flags)
+{
+        ksock_conn_t *conn;
+        ksock_fmb_t  *fmb;
+        int           rc;
+
+        /* NB: sched->kss_lock held */
+
+        LASSERT (!list_empty (&sched->kss_rx_conns));
+        conn = list_entry(sched->kss_rx_conns.next, ksock_conn_t, ksnc_rx_list);
+        list_del (&conn->ksnc_rx_list);
+
+        spin_unlock_irqrestore (&sched->kss_lock, *irq_flags);
+
+        CDEBUG(D_NET, "sched %p conn %p\n", sched, conn);
+        LASSERT (atomic_read (&conn->ksnc_refcount) > 0);
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_ready);
+
+        /* doesn't need a forwarding buffer */
+        if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB)
+                goto try_read;
+
+ get_fmb:
+        fmb = ksocknal_get_idle_fmb (conn);
+        if (fmb == NULL) {      /* conn descheduled waiting for idle fmb */
+                spin_lock_irqsave (&sched->kss_lock, *irq_flags);
+                return;
+        }
+
+        if (ksocknal_init_fmb (conn, fmb)) /* packet forwarded ? */
+                goto out;               /* come back later for next packet */
+
+ try_read:
+        /* NB: sched lock NOT held */
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+
+        LASSERT (conn->ksnc_rx_nob_wanted > 0);
+
+        conn->ksnc_rx_ready = 0;/* data ready may race with me and set ready */
+        mb();                   /* => clear BEFORE trying to read */
+
+        rc = ksocknal_recvmsg(conn);
+
+        if (rc == 0)
+                goto out;
+        if (rc < 0) {
+#warning FIXME: handle socket errors properly
+                CERROR ("Error socknal read %p: %d\n", conn, rc);
+                goto out;
+        }
+
+        if (conn->ksnc_rx_nob_wanted != 0)      /* short read */
+                goto out;                       /* try again later */
+
+        /* got all I wanted, assume there's more - prevent data_ready locking */
+        conn->ksnc_rx_ready = 1;
+
+        switch (conn->ksnc_rx_state) {
+        case SOCKNAL_RX_HEADER:
+                /* It's not for me */
+                if (conn->ksnc_hdr.type != PTL_MSG_HELLO &&
+                    NTOH__u64(conn->ksnc_hdr.dest_nid) != ksocknal_lib.ni.nid) {
+                        ksocknal_fwd_parse (conn);
+                        switch (conn->ksnc_rx_state) {
+                        case SOCKNAL_RX_HEADER: /* skipped (zero payload) */
+                                goto out;       /* => come back later */
+                        case SOCKNAL_RX_SLOP:   /* skipping packet's body */
+                                goto try_read;  /* => go read it */
+                        case SOCKNAL_RX_GET_FMB: /* forwarding */
+                                goto get_fmb;   /* => go get a fwd msg buffer */
+                        default:
+                                LBUG ();
+                        }
+                        /* Not Reached */
+                }
+
+                PROF_START(lib_parse);
+                /* sets wanted_len, iovs etc */
+                lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn);
+                PROF_FINISH(lib_parse);
+
+                if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */
+                        conn->ksnc_rx_state = SOCKNAL_RX_BODY;
+                        goto try_read;          /* go read the payload */
+                }
+                /* Fall through (completed packet for me) */
+
+        case SOCKNAL_RX_BODY:
+                atomic_inc (&ksocknal_packets_received);
+                /* packet is done now */
+                lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie);
+                /* Fall through */
+
+        case SOCKNAL_RX_SLOP:
+                /* starting new packet? */
+                if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left))
+                        goto out;       /* come back later */
+                goto try_read;          /* try to finish reading slop now */
+
+        case SOCKNAL_RX_BODY_FWD:
+                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n",
+                        conn, NTOH__u64 (conn->ksnc_hdr.src_nid),
+                        NTOH__u64 (conn->ksnc_hdr.dest_nid),
+                        conn->ksnc_rx_nob_left);
+
+                atomic_inc (&ksocknal_packets_received);
+
+                /* ksocknal_init_fmb() put router desc. in conn->ksnc_cookie */
+                kpr_fwd_start (&ksocknal_data.ksnd_router,
+                               (kpr_fwd_desc_t *)conn->ksnc_cookie);
+
+                /* no slop in forwarded packets */
+                LASSERT (conn->ksnc_rx_nob_left == 0);
+
+                ksocknal_new_packet (conn, 0);  /* on to next packet */
+                goto out;                       /* (later) */
+
+        default:
+                break;
+        }
+
+        /* Not Reached */
+        LBUG ();
+
+ out:
+        spin_lock_irqsave (&sched->kss_lock, *irq_flags);
+
+        /* no data there to read? */
+        if (!conn->ksnc_rx_ready) {
+                /* let socket callback schedule again */
+                conn->ksnc_rx_scheduled = 0;
+                ksocknal_put_conn (conn);       /* release scheduler's ref */
+        } else                              /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns);
+}
+
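+/* lib_parse() callback to receive a message's payload: stash the lib_msg_t
+ * cookie and point the connection's rx iovs directly at the caller's
+ * fragments so the scheduler reads the body straight into them.
+ * ksocknal_recv_pages() below is the page (kiov) flavour. */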
+int
+ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg,
+               unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen)
+{
+        ksock_conn_t *conn = (ksock_conn_t *)private;
+
+        LASSERT (mlen <= rlen);
+        LASSERT (niov <= PTL_MD_MAX_IOV);
+        
+        conn->ksnc_cookie = msg;
+        conn->ksnc_rx_nob_wanted = mlen;
+        conn->ksnc_rx_nob_left   = rlen;
+
+        conn->ksnc_rx_nkiov = 0;
+        conn->ksnc_rx_kiov = NULL;
+        conn->ksnc_rx_niov = niov;
+        conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
+        memcpy (conn->ksnc_rx_iov, iov, niov * sizeof (*iov));
+
+        LASSERT (mlen == 
+                 lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
+                 lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
+
+        return (rlen);
+}
+
+int
+ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg,
+                     unsigned int niov, ptl_kiov_t *kiov, size_t mlen, size_t rlen)
+{
+        ksock_conn_t *conn = (ksock_conn_t *)private;
+
+        LASSERT (mlen <= rlen);
+        LASSERT (niov <= PTL_MD_MAX_IOV);
+        
+        conn->ksnc_cookie = msg;
+        conn->ksnc_rx_nob_wanted = mlen;
+        conn->ksnc_rx_nob_left   = rlen;
+
+        conn->ksnc_rx_niov = 0;
+        conn->ksnc_rx_iov  = NULL;
+        conn->ksnc_rx_nkiov = niov;
+        conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
+        memcpy (conn->ksnc_rx_kiov, kiov, niov * sizeof (*kiov));
+
+        LASSERT (mlen == 
+                 lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
+                 lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
+
+        return (rlen);
+}
+
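+/* Scheduler thread: drain this scheduler's receive and transmit queues (and
+ * zero-copy completions when SOCKNAL_ZC), dropping the CPU every
+ * SOCKNAL_RESCHED loops and sleeping on kss_waitq when there is nothing to
+ * do. */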
+int ksocknal_scheduler (void *arg)
+{
+        ksock_sched_t     *sched = (ksock_sched_t *)arg;
+        unsigned long      flags;
+        int                rc;
+        int                nloops = 0;
+        int                id = sched - ksocknal_data.ksnd_schedulers;
+        char               name[16];
+#if (CONFIG_SMP && CPU_AFFINITY)
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        int                cpu = cpu_logical_map(id % num_online_cpus());
+#else
+#warning "Take care of architecure specific logical APIC map"
+        int cpu = 1;    /* Have to change later. */
+#endif /* LINUX_VERSION_CODE */
+        
+        set_cpus_allowed (current, 1 << cpu);
+        id = cpu;
+#endif /* CONFIG_SMP && CPU_AFFINITY */
+
+        snprintf (name, sizeof (name),"ksocknald[%d]", id);
+        kportal_daemonize (name);
+        kportal_blockallsigs ();
+        
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        while (!ksocknal_data.ksnd_shuttingdown) {
+                int did_something = 0;
+
+                /* Ensure I progress everything semi-fairly */
+
+                if (!list_empty (&sched->kss_rx_conns)) {
+                        did_something = 1;
+                        /* drops & regains kss_lock */
+                        ksocknal_process_receive (sched, &flags);
+                }
+
+                if (!list_empty (&sched->kss_tx_conns)) {
+                        did_something = 1;
+                        /* drops and regains kss_lock */
+                        ksocknal_process_transmit (sched, &flags);
+                }
+#if SOCKNAL_ZC
+                if (!list_empty (&sched->kss_zctxdone_list)) {
+                        ksock_tx_t *tx =
+                                list_entry(sched->kss_zctxdone_list.next,
+                                           ksock_tx_t, tx_list);
+                        did_something = 1;
+
+                        list_del (&tx->tx_list);
+                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+                        ksocknal_tx_done (tx);
+
+                        spin_lock_irqsave (&sched->kss_lock, flags);
+                }
+#endif
+                if (!did_something ||           /* nothing to do */
+                    ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */
+                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+
+                        nloops = 0;
+
+                        if (!did_something) {   /* wait for something to do */
+#if SOCKNAL_ZC
+                                rc = wait_event_interruptible (sched->kss_waitq,
+                                                               ksocknal_data.ksnd_shuttingdown ||
+                                                               !list_empty(&sched->kss_rx_conns) ||
+                                                               !list_empty(&sched->kss_tx_conns) ||
+                                                               !list_empty(&sched->kss_zctxdone_list));
+#else
+                                rc = wait_event_interruptible (sched->kss_waitq,
+                                                               ksocknal_data.ksnd_shuttingdown ||
+                                                               !list_empty(&sched->kss_rx_conns) ||
+                                                               !list_empty(&sched->kss_tx_conns));
+#endif
+                                LASSERT (rc == 0);
+                        } else
+                               our_cond_resched();
+
+                        spin_lock_irqsave (&sched->kss_lock, flags);
+                }
+        }
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+        ksocknal_thread_fini ();
+        return (0);
+}
+
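+/* Socket ->data_ready callback: mark the connection as having data to read
+ * and, if it isn't already being progressed, queue it on its scheduler's
+ * rx list (taking an extra ref for the scheduler) and wake the scheduler. */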
+void
+ksocknal_data_ready (struct sock *sk, int n)
+{
+        unsigned long  flags;
+        ksock_conn_t  *conn;
+        ksock_sched_t *sched;
+        ENTRY;
+
+        /* interleave correctly with closing sockets... */
+        read_lock (&ksocknal_data.ksnd_socklist_lock);
+
+        conn = sk->user_data;
+        if (conn == NULL) {             /* raced with ksocknal_close_sock */
+                LASSERT (sk->data_ready != &ksocknal_data_ready);
+                sk->data_ready (sk, n);
+        } else if (!conn->ksnc_rx_ready) {        /* new news */
+                /* Set ASAP in case of concurrent calls to me */
+                conn->ksnc_rx_ready = 1;
+
+                sched = conn->ksnc_scheduler;
+
+                spin_lock_irqsave (&sched->kss_lock, flags);
+
+                /* Set again (process_receive may have cleared while I blocked for the lock) */
+                conn->ksnc_rx_ready = 1;
+
+                if (!conn->ksnc_rx_scheduled) {  /* not being progressed */
+                        list_add_tail(&conn->ksnc_rx_list,
+                                      &sched->kss_rx_conns);
+                        conn->ksnc_rx_scheduled = 1;
+                        /* extra ref for scheduler */
+                        atomic_inc (&conn->ksnc_refcount);
+
+                        if (waitqueue_active (&sched->kss_waitq))
+                                wake_up (&sched->kss_waitq);
+                }
+
+                spin_unlock_irqrestore (&sched->kss_lock, flags);
+        }
+
+        read_unlock (&ksocknal_data.ksnd_socklist_lock);
+
+        EXIT;
+}
+
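+/* Socket ->write_space callback: once the socket has drained to its low-water
+ * mark, mark the connection ready to transmit and schedule it if it has
+ * packets queued and isn't being progressed already. */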
+void
+ksocknal_write_space (struct sock *sk)
+{
+        unsigned long  flags;
+        ksock_conn_t  *conn;
+        ksock_sched_t *sched;
+
+        /* interleave correctly with closing sockets... */
+        read_lock (&ksocknal_data.ksnd_socklist_lock);
+
+        conn = sk->user_data;
+
+        CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
+               sk, tcp_wspace(sk), SOCKNAL_TX_LOW_WATER(sk), conn,
+               (conn == NULL) ? "" : (test_bit (0, &conn->ksnc_tx_ready) ?
+                                      " ready" : " blocked"),
+               (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ?
+                                      " scheduled" : " idle"),
+               (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ?
+                                      " empty" : " queued"));
+
+        if (conn == NULL) {             /* raced with ksocknal_close_sock */
+                LASSERT (sk->write_space != &ksocknal_write_space);
+                sk->write_space (sk);
+        } else if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */
+                clear_bit (SOCK_NOSPACE, &sk->socket->flags);
+
+                if (!conn->ksnc_tx_ready) {      /* new news */
+                        /* Set ASAP in case of concurrent calls to me */
+                        conn->ksnc_tx_ready = 1;
+
+                        sched = conn->ksnc_scheduler;
+
+                        spin_lock_irqsave (&sched->kss_lock, flags);
+
+                        /* Set again (process_transmit may have
+                           cleared while I blocked for the lock) */
+                        conn->ksnc_tx_ready = 1;
+
+                        if (!conn->ksnc_tx_scheduled &&   /* not being progressed */
+                            !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */
+                                list_add_tail (&conn->ksnc_tx_list,
+                                               &sched->kss_tx_conns);
+                                conn->ksnc_tx_scheduled = 1;
+                                /* extra ref for scheduler */
+                                atomic_inc (&conn->ksnc_refcount);
+
+                                if (waitqueue_active (&sched->kss_waitq))
+                                        wake_up (&sched->kss_waitq);
+                        }
+
+                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+                }
+        }
+
+        read_unlock (&ksocknal_data.ksnd_socklist_lock);
+}
+
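+/* Reaper thread: pull connections off ksnd_reaper_list and close them,
+ * sleeping until more arrive or the NAL shuts down. */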
+int
+ksocknal_reaper (void *arg)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        int                rc;
+        
+        kportal_daemonize ("ksocknal_reaper");
+        kportal_blockallsigs ();
+
+        while (!ksocknal_data.ksnd_shuttingdown) {
+                spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+
+                if (list_empty (&ksocknal_data.ksnd_reaper_list)) {
+                        conn = NULL;
+                } else {
+                        conn = list_entry (ksocknal_data.ksnd_reaper_list.next,
+                                           ksock_conn_t, ksnc_list);
+                        list_del (&conn->ksnc_list);
+                }
+
+                spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+
+                if (conn != NULL)
+                        ksocknal_close_conn (conn);
+                else {
+                        rc = wait_event_interruptible (ksocknal_data.ksnd_reaper_waitq,
+                                                       ksocknal_data.ksnd_shuttingdown ||
+                                                       !list_empty(&ksocknal_data.ksnd_reaper_list));
+                        LASSERT (rc == 0);
+                }
+        }
+
+        ksocknal_thread_fini ();
+        return (0);
+}
+
+nal_cb_t ksocknal_lib = {
+        nal_data:       &ksocknal_data,                /* NAL private data */
+        cb_send:         ksocknal_send,
+        cb_send_pages:   ksocknal_send_pages,
+        cb_recv:         ksocknal_recv,
+        cb_recv_pages:   ksocknal_recv_pages,
+        cb_read:         ksocknal_read,
+        cb_write:        ksocknal_write,
+        cb_callback:     ksocknal_callback,
+        cb_malloc:       ksocknal_malloc,
+        cb_free:         ksocknal_free,
+        cb_printf:       ksocknal_printf,
+        cb_cli:          ksocknal_cli,
+        cb_sti:          ksocknal_sti,
+        cb_dist:         ksocknal_dist
+};
diff --git a/lustre/portals/knals/toenal/.cvsignore b/lustre/portals/knals/toenal/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lustre/portals/knals/toenal/Makefile.am b/lustre/portals/knals/toenal/Makefile.am
new file mode 100644 (file)
index 0000000..9bfff64
--- /dev/null
@@ -0,0 +1,13 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = ktoenal
+modulenet_DATA = ktoenal.o
+EXTRA_PROGRAMS = ktoenal
+
+DEFS =
+ktoenal_SOURCES = toenal.c toenal_cb.c toenal.h
diff --git a/lustre/portals/knals/toenal/toenal.c b/lustre/portals/knals/toenal/toenal.c
new file mode 100644 (file)
index 0000000..1f5dc38
--- /dev/null
@@ -0,0 +1,629 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Kedar Sovani <kedar@calsoftinc.com>
+ *   Author: Amey Inamdar <amey@calsoftinc.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include <linux/poll.h>
+#include "toenal.h"
+
+ptl_handle_ni_t         ktoenal_ni;
+static nal_t            ktoenal_api;
+static ksock_nal_data_t ktoenal_data;
+
+/*
+ksocknal_interface_t ktoenal_interface = {
+        ksni_add_sock:         ktoenal_add_sock,
+        ksni_close_sock:       ktoenal_close_sock,
+        ksni_set_mynid:                ktoenal_set_mynid,
+};
+*/
+
+kpr_nal_interface_t ktoenal_router_interface = {
+        kprni_nalid:   TOENAL,
+        kprni_arg:     &ktoenal_data,
+        kprni_fwd:     ktoenal_fwd_packet,
+};
+
+
+int
+ktoenal_api_forward(nal_t *nal, int id, void *args, size_t args_len,
+                       void *ret, size_t ret_len)
+{
+        ksock_nal_data_t *k;
+        nal_cb_t *nal_cb;
+
+        k = nal->nal_data;
+        nal_cb = k->ksnd_nal_cb;
+
+        lib_dispatch(nal_cb, k, id, args, ret); /* ktoenal_send needs k */
+        return PTL_OK;
+}
+
+int
+ktoenal_api_shutdown(nal_t *nal, int ni)
+{
+       CDEBUG (D_NET, "closing all connections\n");
+
+        return ktoenal_close_sock(0);          /* close all sockets */
+}
+
+void
+ktoenal_api_yield(nal_t *nal)
+{
+        our_cond_resched();
+        return;
+}
+
+void
+ktoenal_api_lock(nal_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *k;
+        nal_cb_t *nal_cb;
+
+        k = nal->nal_data;
+        nal_cb = k->ksnd_nal_cb;
+        nal_cb->cb_cli(nal_cb,flags);
+}
+
+void
+ktoenal_api_unlock(nal_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *k;
+        nal_cb_t *nal_cb;
+
+        k = nal->nal_data;
+        nal_cb = k->ksnd_nal_cb;
+        nal_cb->cb_sti(nal_cb,flags);
+}
+
+nal_t *
+ktoenal_init(int interface, ptl_pt_index_t ptl_size,
+              ptl_ac_index_t ac_size, ptl_pid_t requested_pid)
+{
+        CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n",
+               ktoenal_data.ksnd_mynid);
+        lib_init(&ktoenal_lib, ktoenal_data.ksnd_mynid, 0, 10, ptl_size,
+                 ac_size);
+        return (&ktoenal_api);
+}
+
+/*
+ *  EXTRA functions follow
+ */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#define SOCKET_I(inode) (&(inode)->u.socket_i)
+#endif
+static __inline__ struct socket *
+socki_lookup(struct inode *inode)
+{
+        return SOCKET_I(inode);
+}
+
+int
+ktoenal_set_mynid(ptl_nid_t nid)
+{
+        lib_ni_t *ni = &ktoenal_lib.ni;
+
+        /* FIXME: we have to do this because we call lib_init() at module
+         * insertion time, which is before we have 'mynid' available.  lib_init
+         * sets the NAL's nid, which it uses to tell other nodes where packets
+         * are coming from.  This is not a very graceful solution to this
+         * problem. */
+
+        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", nid, ni->nid);
+
+        ktoenal_data.ksnd_mynid = nid;
+        ni->nid = nid;
+        return (0);
+}
+
+int
+ktoenal_add_sock (ptl_nid_t nid, int fd)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        struct file       *file = NULL;
+        struct socket     *sock = NULL;
+        int                ret;
+        ENTRY;
+
+        file = fget(fd);
+        if (file == NULL)
+                RETURN(-EINVAL);
+
+        ret = -EINVAL;
+        sock = socki_lookup(file->f_dentry->d_inode);
+        if (sock == NULL)
+                GOTO(error, ret);
+
+        ret = -ENOMEM;
+        PORTAL_ALLOC(conn, sizeof(*conn));
+        if (!conn)
+                GOTO(error, ret);
+
+        memset (conn, 0, sizeof (*conn));       /* zero for consistency */
+        file->f_flags |= O_NONBLOCK;  /* does this conflict with anything? */
+        conn->ksnc_file = file;
+        conn->ksnc_sock = sock;
+        conn->ksnc_peernid = nid;
+        atomic_set (&conn->ksnc_refcount, 1);    /* 1 ref for socklist */
+
+        conn->ksnc_rx_ready = 0;
+        conn->ksnc_rx_scheduled = 0;
+        ktoenal_new_packet (conn, 0);
+
+        INIT_LIST_HEAD (&conn->ksnc_tx_queue);
+        conn->ksnc_tx_ready = 0;
+        conn->ksnc_tx_scheduled = 0;
+
+        LASSERT (!in_interrupt());
+        write_lock_irqsave (&ktoenal_data.ksnd_socklist_lock, flags);
+
+        list_add(&conn->ksnc_list, &ktoenal_data.ksnd_socklist);
+        write_unlock_irqrestore (&ktoenal_data.ksnd_socklist_lock, flags);
+
+        ktoenal_data_ready(conn);
+        ktoenal_write_space(conn);
+
+        ktoenal_data.ksnd_slistchange = 1;
+        wake_up_process(ktoenal_data.ksnd_pollthread_tsk);
+        /* Schedule pollthread so that it will poll
+         * for newly created socket
+         */
+
+
+        CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n",
+               conn, conn->ksnc_peernid);
+
+        /* Can't unload while connection active */
+        PORTAL_MODULE_USE;
+        RETURN(0);
+
+error:
+        fput(file);
+        return (ret);
+}
+
+/* Passing in a zero nid will close all connections */
+int
+ktoenal_close_sock(ptl_nid_t nid)
+{
+        long               flags;
+        ksock_conn_t      *conn;
+        LIST_HEAD         (death_row);
+        struct list_head  *tmp;
+
+        LASSERT (!in_interrupt());
+        write_lock_irqsave (&ktoenal_data.ksnd_socklist_lock, flags);
+
+        if (nid == 0)                           /* close ALL connections */
+        {
+                /* insert 'death row' into the socket list... */
+                list_add (&death_row, &ktoenal_data.ksnd_socklist);
+                /* ...extract and reinitialise the socket list itself... */
+                list_del_init (&ktoenal_data.ksnd_socklist);
+                /* ...and voila, death row is the proud owner of all conns */
+        } else list_for_each (tmp, &ktoenal_data.ksnd_socklist) {
+
+                conn = list_entry (tmp, ksock_conn_t, ksnc_list);
+
+                if (conn->ksnc_peernid == nid)
+                {
+                        list_del (&conn->ksnc_list);
+                        list_add (&conn->ksnc_list, &death_row);
+                        break;
+                }
+        }
+
+
+        write_unlock_irqrestore (&ktoenal_data.ksnd_socklist_lock, flags);
+
+        if (list_empty (&death_row))
+                return (-ENOENT);
+
+        do {
+                conn = list_entry (death_row.next, ksock_conn_t, ksnc_list);
+                list_del (&conn->ksnc_list);
+                ktoenal_put_conn (conn);       /* drop ref for ksnd_socklist */
+        } while (!list_empty (&death_row));
+
+        ktoenal_data.ksnd_slistchange = 1;
+        wake_up_process(ktoenal_data.ksnd_pollthread_tsk);
+
+        return (0);
+}
+
+
+ksock_conn_t *
+ktoenal_get_conn (ptl_nid_t nid)
+{
+        struct list_head *tmp;
+        ksock_conn_t     *conn;
+
+        PROF_START(conn_list_walk);
+
+        read_lock (&ktoenal_data.ksnd_socklist_lock);
+
+        list_for_each(tmp, &ktoenal_data.ksnd_socklist) {
+
+                conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+
+                if (conn->ksnc_peernid == nid)
+                {
+                        /* caller is referencing */
+                        atomic_inc (&conn->ksnc_refcount);
+
+                        read_unlock (&ktoenal_data.ksnd_socklist_lock);
+
+                        CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n",
+                               conn, nid, atomic_read (&conn->ksnc_refcount));
+
+                        PROF_FINISH(conn_list_walk);
+                        return (conn);
+                }
+        }
+
+        read_unlock (&ktoenal_data.ksnd_socklist_lock);
+
+        CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n", nid);
+        PROF_FINISH(conn_list_walk);
+        return (NULL);
+}
+
+void
+ktoenal_close_conn (ksock_conn_t *conn)
+{
+        CDEBUG (D_NET, "connection [%p] closed \n", conn);
+
+        fput (conn->ksnc_file);
+        PORTAL_FREE (conn, sizeof (*conn));
+        /* One less connection keeping us hanging on */
+        PORTAL_MODULE_UNUSE;
+}
+
+void
+_ktoenal_put_conn (ksock_conn_t *conn)
+{
+        unsigned long flags;
+
+        CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn);
+
+        /* "But what is the black spot, captain?" I asked.
+         * "That's a summons, mate..." */
+
+        LASSERT (atomic_read (&conn->ksnc_refcount) == 0);
+        LASSERT (!conn->ksnc_rx_scheduled);
+
+        if (!in_interrupt())
+        {
+                ktoenal_close_conn (conn);
+                return;
+        }
+
+        spin_lock_irqsave (&ktoenal_data.ksnd_reaper_lock, flags);
+
+        list_add (&conn->ksnc_list, &ktoenal_data.ksnd_reaper_list);
+        wake_up (&ktoenal_data.ksnd_reaper_waitq);
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_reaper_lock, flags);
+}
+
+void
+ktoenal_free_buffers (void)
+{
+        if (ktoenal_data.ksnd_fmbs != NULL)
+        {
+                ksock_fmb_t *fmb = (ksock_fmb_t *)ktoenal_data.ksnd_fmbs;
+                int          i;
+                int          j;
+
+                for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++, fmb++)
+                        for (j = 0; j < fmb->fmb_npages; j++)
+                                if (fmb->fmb_pages[j] != NULL)
+                                        __free_page (fmb->fmb_pages[j]);
+
+                PORTAL_FREE (ktoenal_data.ksnd_fmbs,
+                             sizeof (ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS));
+        }
+
+        if (ktoenal_data.ksnd_ltxs != NULL)
+                PORTAL_FREE (ktoenal_data.ksnd_ltxs,
+                             sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+}
+
+int
+ktoenal_cmd(struct portal_ioctl_data * data, void * private)
+{
+        int rc = -EINVAL;
+
+        LASSERT (data != NULL);
+
+        switch(data->ioc_nal_cmd) {
+        case NAL_CMD_REGISTER_PEER_FD: {
+                rc = ktoenal_add_sock(data->ioc_nid, data->ioc_fd);
+                break;
+        }
+        case NAL_CMD_CLOSE_CONNECTION: {
+                rc = ktoenal_close_sock(data->ioc_nid);
+                break;
+        }
+        case NAL_CMD_REGISTER_MYNID: {
+                rc = ktoenal_set_mynid (data->ioc_nid);
+                break;
+        }
+        }
+
+        return rc;
+}
+
+
+void __exit
+ktoenal_module_fini (void)
+{
+        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        switch (ktoenal_data.ksnd_init)
+        {
+        default:
+                LASSERT (0);
+
+        case SOCKNAL_INIT_ALL:
+                kportal_nal_unregister(TOENAL);
+                PORTAL_SYMBOL_UNREGISTER (ktoenal_ni);
+                /* fall through */
+
+        case SOCKNAL_INIT_PTL:
+                PtlNIFini(ktoenal_ni);
+                lib_fini(&ktoenal_lib);
+                /* fall through */
+
+        case SOCKNAL_INIT_DATA:
+                /* Module refcount only gets to zero when all connections
+                 * have been closed so all lists must be empty */
+                LASSERT (list_empty (&ktoenal_data.ksnd_socklist));
+                LASSERT (list_empty (&ktoenal_data.ksnd_reaper_list));
+                LASSERT (list_empty (&ktoenal_data.ksnd_rx_conns));
+                LASSERT (list_empty (&ktoenal_data.ksnd_tx_conns));
+                LASSERT (list_empty (&ktoenal_data.ksnd_small_fmp.fmp_blocked_conns));
+                LASSERT (list_empty (&ktoenal_data.ksnd_large_fmp.fmp_blocked_conns));
+
+                kpr_shutdown (&ktoenal_data.ksnd_router); /* stop router calling me */
+
+                /* flag threads to terminate; wake and wait for them to die */
+                ktoenal_data.ksnd_shuttingdown = 1;
+                wake_up_all (&ktoenal_data.ksnd_reaper_waitq);
+                wake_up_all (&ktoenal_data.ksnd_sched_waitq);
+                wake_up_process(ktoenal_data.ksnd_pollthread_tsk);
+
+                while (atomic_read (&ktoenal_data.ksnd_nthreads) != 0)
+                {
+                        CDEBUG (D_NET, "waitinf for %d threads to terminate\n",
+                                atomic_read (&ktoenal_data.ksnd_nthreads));
+                        set_current_state (TASK_UNINTERRUPTIBLE);
+                        schedule_timeout (HZ);
+                }
+
+                kpr_deregister (&ktoenal_data.ksnd_router);
+
+                ktoenal_free_buffers();
+                /* fall through */
+
+        case SOCKNAL_INIT_NOTHING:
+                break;
+        }
+
+        CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
+               atomic_read (&portal_kmemory));
+
+        printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n",
+               atomic_read(&portal_kmemory));
+}
+
+int __init
+ktoenal_module_init (void)
+{
+        int   pkmem = atomic_read(&portal_kmemory);
+        int   rc;
+        int   i;
+        int   j;
+
+        /* packet descriptor must fit in a router descriptor's scratchpad */
+        LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t));
+
+        LASSERT (ktoenal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+
+        ktoenal_api.forward  = ktoenal_api_forward;
+        ktoenal_api.shutdown = ktoenal_api_shutdown;
+        ktoenal_api.yield    = ktoenal_api_yield;
+        ktoenal_api.validate = NULL;           /* our api validate is a NOOP */
+        ktoenal_api.lock     = ktoenal_api_lock;
+        ktoenal_api.unlock   = ktoenal_api_unlock;
+        ktoenal_api.nal_data = &ktoenal_data;
+
+        ktoenal_lib.nal_data = &ktoenal_data;
+
+        memset (&ktoenal_data, 0, sizeof (ktoenal_data)); /* zero pointers */
+
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_socklist);
+        rwlock_init(&ktoenal_data.ksnd_socklist_lock);
+
+        ktoenal_data.ksnd_nal_cb = &ktoenal_lib;
+        spin_lock_init (&ktoenal_data.ksnd_nal_cb_lock);
+
+        spin_lock_init (&ktoenal_data.ksnd_sched_lock);
+
+        init_waitqueue_head (&ktoenal_data.ksnd_sched_waitq);
+
+        INIT_LIST_HEAD (&ktoenal_data.ksnd_rx_conns);
+        INIT_LIST_HEAD (&ktoenal_data.ksnd_tx_conns);
+
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_small_fmp.fmp_idle_fmbs);
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_small_fmp.fmp_blocked_conns);
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_large_fmp.fmp_idle_fmbs);
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_large_fmp.fmp_blocked_conns);
+
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_idle_nblk_ltx_list);
+        INIT_LIST_HEAD(&ktoenal_data.ksnd_idle_ltx_list);
+        init_waitqueue_head(&ktoenal_data.ksnd_idle_ltx_waitq);
+
+        INIT_LIST_HEAD (&ktoenal_data.ksnd_reaper_list);
+        init_waitqueue_head(&ktoenal_data.ksnd_reaper_waitq);
+        spin_lock_init (&ktoenal_data.ksnd_reaper_lock);
+
+        ktoenal_data.ksnd_init = SOCKNAL_INIT_DATA; /* flag lists/ptrs/locks initialised */
+
+        PORTAL_ALLOC(ktoenal_data.ksnd_fmbs,
+                     sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS));
+        if (ktoenal_data.ksnd_fmbs == NULL)
+                RETURN(-ENOMEM);
+
+        /* NULL out buffer pointers etc */
+        memset(ktoenal_data.ksnd_fmbs, 0,
+               sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS));
+
+        for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++)
+        {
+                ksock_fmb_t *fmb = &((ksock_fmb_t *)ktoenal_data.ksnd_fmbs)[i];
+
+                if (i < SOCKNAL_SMALL_FWD_NMSGS)
+                {
+                        fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES;
+                        fmb->fmb_pool = &ktoenal_data.ksnd_small_fmp;
+                }
+                else
+                {
+                        fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES;
+                        fmb->fmb_pool = &ktoenal_data.ksnd_large_fmp;
+                }
+
+                LASSERT (fmb->fmb_npages > 0);
+                for (j = 0; j < fmb->fmb_npages; j++)
+                {
+                        fmb->fmb_pages[j] = alloc_page(GFP_KERNEL);
+
+                        if (fmb->fmb_pages[j] == NULL)
+                        {
+                                ktoenal_module_fini ();
+                                return (-ENOMEM);
+                        }
+
+                        LASSERT (page_address (fmb->fmb_pages[j]) != NULL);
+                }
+
+                list_add (&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs);
+        }
+
+        PORTAL_ALLOC(ktoenal_data.ksnd_ltxs,
+                     sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+        if (ktoenal_data.ksnd_ltxs == NULL)
+        {
+                ktoenal_module_fini ();
+                return (-ENOMEM);
+        }
+
+        /* Deterministic bugs please */
+        memset (ktoenal_data.ksnd_ltxs, 0xeb,
+                sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS));
+
+        for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++)
+        {
+                ksock_ltx_t *ltx = &((ksock_ltx_t *)ktoenal_data.ksnd_ltxs)[i];
+
+                ltx->ltx_idle = i < SOCKNAL_NLTXS ?
+                                &ktoenal_data.ksnd_idle_ltx_list :
+                                &ktoenal_data.ksnd_idle_nblk_ltx_list;
+                list_add (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+        }
+
+        rc = PtlNIInit(ktoenal_init, 32, 4, 0, &ktoenal_ni);
+        if (rc != 0)
+        {
+                CERROR("ktoenal: PtlNIInit failed: error %d\n", rc);
+                ktoenal_module_fini ();
+                RETURN (rc);
+        }
+        PtlNIDebug(ktoenal_ni, ~0);
+
+        ktoenal_data.ksnd_init = SOCKNAL_INIT_PTL; /* flag PtlNIInit() called */
+
+        ktoenal_data.ksnd_slistchange = 1;
+        for (i = 0; i < TOENAL_N_SCHED; i++)
+        {
+                rc = ktoenal_thread_start (ktoenal_scheduler, NULL);
+                if (rc != 0)
+                {
+                        CERROR("Can't spawn socknal scheduler[%d]: %d\n", i, rc);
+                        ktoenal_module_fini ();
+                        RETURN (rc);
+                }
+        }
+
+        rc = ktoenal_thread_start (ktoenal_reaper, NULL);
+        if (rc != 0)
+        {
+                CERROR("Can't spawn socknal reaper: %d\n", rc);
+                ktoenal_module_fini ();
+                RETURN (rc);
+        }
+
+        rc = ktoenal_thread_start (ktoenal_pollthread, NULL);
+        if (rc != 0)
+        {
+                CERROR("Can't spawn socknal pollthread: %d\n", rc);
+                ktoenal_module_fini ();
+                RETURN (rc);
+        }
+
+        rc = kpr_register(&ktoenal_data.ksnd_router,
+                  &ktoenal_router_interface);
+        if (rc != 0)
+                CDEBUG (D_NET, "Can't initialise routing interface (rc = %d): not routing\n", rc);
+
+        rc = kportal_nal_register(TOENAL, &ktoenal_cmd, NULL);
+        if (rc != 0)
+                CDEBUG(D_NET, "Can't initialise command interface (rc = %d)\n",
+                       rc);
+
+        PORTAL_SYMBOL_REGISTER(ktoenal_ni);
+
+        /* flag everything initialised */
+        ktoenal_data.ksnd_init = SOCKNAL_INIT_ALL;
+
+        printk(KERN_INFO "Routing TOE NAL loaded (Routing %s, initial mem %d)\n",
+               kpr_routing(&ktoenal_data.ksnd_router) ? "enabled" : "disabled",
+               pkmem);
+
+        return (0);
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01");
+MODULE_LICENSE("GPL");
+
+module_init(ktoenal_module_init);
+module_exit(ktoenal_module_fini);
+
+EXPORT_SYMBOL (ktoenal_ni);
diff --git a/lustre/portals/knals/toenal/toenal.h b/lustre/portals/knals/toenal/toenal.h
new file mode 100644 (file)
index 0000000..f793d3b
--- /dev/null
@@ -0,0 +1,236 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Kedar Sovani <kedar@calsoftinc.com>
+ *   Author: Amey Inamdar <amey@calsoftinc.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_PORTAL_ALLOC
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <net/tcp.h>
+#include <linux/uio.h>
+#include <linux/sched.h> 
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_SOCKNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+#define SOCKNAL_MAX_FWD_PAYLOAD (64<<10)        /* biggest payload I can forward */
+
+#define SOCKNAL_NLTXS           128             /* # normal transmit messages */
+#define SOCKNAL_NNBLK_LTXS     128             /* # transmit messages reserved if can't block */
+
+#define SOCKNAL_SMALL_FWD_NMSGS        128             /* # small messages I can be forwarding at any time */
+#define SOCKNAL_LARGE_FWD_NMSGS 32              /* # large messages I can be forwarding at any time */
+
+#define SOCKNAL_SMALL_FWD_PAGES        1               /* # pages in a small message fwd buffer */
+
+#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT)
+                                               /* # pages in a large message fwd buffer */
+
+#define SOCKNAL_RESCHED         100             /* # scheduler loops before reschedule */
+
+#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sndbuf*8)/10)
+
+#define TOENAL_N_SCHED 1
+
+typedef struct                                  /* pool of forwarding buffers */
+{
+        struct list_head  fmp_idle_fmbs;        /* buffers waiting for a connection */
+        struct list_head  fmp_blocked_conns;    /* connections waiting for a buffer */
+} ksock_fmb_pool_t;
+
+typedef struct {
+        int               ksnd_init;            /* initialisation state */
+        
+        struct list_head  ksnd_socklist;        /* all my connections */
+        rwlock_t          ksnd_socklist_lock;   /* stabilise add/find/remove */
+
+
+        ptl_nid_t         ksnd_mynid;
+        nal_cb_t         *ksnd_nal_cb;
+        spinlock_t        ksnd_nal_cb_lock;     /* lib cli/sti lock */
+
+        atomic_t          ksnd_nthreads;        /* # live threads */
+        int               ksnd_shuttingdown;    /* tell threads to exit */
+        
+        kpr_router_t      ksnd_router;          /* THE router */
+
+        spinlock_t        ksnd_sched_lock;      /* serialise packet scheduling */
+        wait_queue_head_t ksnd_sched_waitq;     /* where scheduler(s) wait */
+
+        struct list_head  ksnd_rx_conns;        /* conn waiting to be read */
+        struct list_head  ksnd_tx_conns;        /* conn waiting to be written */
+        
+        void             *ksnd_fmbs;            /* all the pre-allocated FMBs */
+        ksock_fmb_pool_t  ksnd_small_fmp;       /* small message forwarding buffers */
+        ksock_fmb_pool_t  ksnd_large_fmp;       /* large message forwarding buffers */
+
+        void             *ksnd_ltxs;            /* all the pre-allocated LTXs */
+        struct list_head  ksnd_idle_ltx_list;   /* where to get an idle LTX */
+        struct list_head  ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */
+        wait_queue_head_t ksnd_idle_ltx_waitq;  /* where to block for an idle LTX */
+
+        struct list_head  ksnd_reaper_list;     /* conn waiting to be reaped */
+        wait_queue_head_t ksnd_reaper_waitq;    /* reaper sleeps here */
+        spinlock_t        ksnd_reaper_lock;     /* serialise */
+        
+        struct task_struct *ksnd_pollthread_tsk;/* task_struct for the poll thread */
+        poll_table          ksnd_pwait;         /* poll wait table for the socket */
+        int                 ksnd_slistchange;   /* informs the pollthread that
+                                                 * the socklist has changed */  
+} ksock_nal_data_t;
+
+#define SOCKNAL_INIT_NOTHING    0
+#define SOCKNAL_INIT_DATA       1
+#define SOCKNAL_INIT_PTL        2
+#define SOCKNAL_INIT_ALL        3
+
+typedef struct                                  /* transmit packet */
+{
+        struct list_head        tx_list;       /* queue on conn for transmission etc */
+        char                    tx_isfwd;      /* forwarding / sourced here */
+        int                     tx_nob;        /* # packet bytes */
+        int                     tx_niov;       /* # packet frags */
+        struct iovec           *tx_iov;        /* packet frags */
+} ksock_tx_t;
+
+typedef struct                                  /* locally transmitted packet */
+{
+        ksock_tx_t              ltx_tx;         /* send info */
+        struct list_head       *ltx_idle;       /* where to put when idle */
+        void                   *ltx_private;    /* lib_finalize() callback arg */
+        void                   *ltx_cookie;     /* lib_finalize() callback arg */
+        struct iovec            ltx_iov[1 + PTL_MD_MAX_IOV]; /* msg frags */
+        ptl_hdr_t               ltx_hdr;        /* buffer for packet header */
+} ksock_ltx_t;
+
+#define KSOCK_TX_2_KPR_FWD_DESC(ptr)    list_entry (ptr, kpr_fwd_desc_t, kprfd_scratch)
+/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */
+
+#define KSOCK_TX_2_KSOCK_LTX(ptr)       list_entry (ptr, ksock_ltx_t, ltx_tx)
+/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */
+
+/* NB list_entry() is used here as a convenient macro for calculating a
+ * pointer to a struct from the address of one of its members.
+ */
+
+typedef struct                                  /* Kernel portals Socket Forwarding message buffer */
+{                                               /* (socknal->router) */
+        struct list_head        fmb_list;       /* queue idle */
+        kpr_fwd_desc_t          fmb_fwd;        /* router's descriptor */
+        int                     fmb_npages;     /* # pages allocated */
+        ksock_fmb_pool_t       *fmb_pool;       /* owning pool */
+        struct page            *fmb_pages[SOCKNAL_LARGE_FWD_PAGES];
+        struct iovec            fmb_iov[SOCKNAL_LARGE_FWD_PAGES];
+} ksock_fmb_t;
+
+#define SOCKNAL_RX_HEADER       1               /* reading header */
+#define SOCKNAL_RX_BODY         2               /* reading body (to deliver here) */
+#define SOCKNAL_RX_BODY_FWD     3               /* reading body (to forward) */
+#define SOCKNAL_RX_SLOP         4               /* skipping body */
+#define SOCKNAL_RX_GET_FMB      5               /* scheduled for forwarding */
+#define SOCKNAL_RX_FMB_SLEEP    6               /* blocked waiting for a fwd desc */
+
+typedef struct 
+{ 
+        struct list_head    ksnc_list;          /* stash on global socket list */
+        struct file        *ksnc_file;          /* socket filp */
+        struct socket      *ksnc_sock;          /* socket */
+        ptl_nid_t           ksnc_peernid;       /* who's on the other end */
+        atomic_t            ksnc_refcount;      /* # users */
+        
+        /* READER */
+        struct list_head    ksnc_rx_list;       /* where I enq waiting input or a forwarding descriptor */
+        unsigned long       ksnc_rx_ready;      /* data ready to read */
+        int                 ksnc_rx_scheduled;  /* being progressed */
+        int                 ksnc_rx_state;      /* what is being read */
+        int                 ksnc_rx_nob_left;   /* # bytes to next hdr/body  */
+        int                 ksnc_rx_nob_wanted; /* bytes actually wanted */
+        int                 ksnc_rx_niov;       /* # frags */
+        struct iovec        ksnc_rx_iov[1 + PTL_MD_MAX_IOV]; /* the frags */
+
+        void               *ksnc_cookie;        /* rx lib_finalize passthru arg */
+        ptl_hdr_t           ksnc_hdr;           /* where I read headers into */
+
+        /* WRITER */
+        struct list_head    ksnc_tx_list;       /* where I enq waiting for output space */
+        struct list_head    ksnc_tx_queue;      /* packets waiting to be sent */
+        unsigned long       ksnc_tx_ready;      /* write space */
+        int                 ksnc_tx_scheduled;  /* being progressed */
+        
+} ksock_conn_t;
+
+extern int ktoenal_add_sock (ptl_nid_t nid, int fd);
+extern int ktoenal_close_sock(ptl_nid_t nid);
+extern int ktoenal_set_mynid(ptl_nid_t nid);
+extern int ktoenal_push_sock(ptl_nid_t nid);
+extern ksock_conn_t *ktoenal_get_conn (ptl_nid_t nid);
+extern void _ktoenal_put_conn (ksock_conn_t *conn);
+extern void ktoenal_close_conn (ksock_conn_t *conn);
+
+static inline void
+ktoenal_put_conn (ksock_conn_t *conn)
+{
+        CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", 
+                conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount));
+        
+        if (atomic_dec_and_test (&conn->ksnc_refcount))
+                _ktoenal_put_conn (conn);
+}
+
+extern int ktoenal_thread_start (int (*fn)(void *arg), void *arg);
+extern int ktoenal_new_packet (ksock_conn_t *conn, int skip);
+extern void ktoenal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+extern int ktoenal_scheduler (void *arg);
+extern int ktoenal_reaper (void *arg);
+extern int ktoenal_pollthread (void *arg);
+extern void ktoenal_data_ready(ksock_conn_t *conn);
+extern void ktoenal_write_space(ksock_conn_t *conn);
+
+
+extern nal_cb_t         ktoenal_lib;
+extern ksock_nal_data_t ktoenal_data;
diff --git a/lustre/portals/knals/toenal/toenal_cb.c b/lustre/portals/knals/toenal/toenal_cb.c
new file mode 100644 (file)
index 0000000..ec37f6f
--- /dev/null
@@ -0,0 +1,1219 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *   Author: Kedar Sovani <kedar@calsoftinc.com>
+ *   Author: Amey Inamdar <amey@calsoftinc.com>
+ *   
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/poll.h>
+#include "toenal.h"
+
+atomic_t   ktoenal_packets_received;
+long       ktoenal_packets_launched;
+long       ktoenal_packets_transmitted;
+
+/*
+ *  LIB functions follow
+ *
+ */
+int
+ktoenal_read(nal_cb_t *nal, void *private, void *dst_addr,
+              user_ptr src_addr, size_t len)
+{
+        CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr);
+
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+int
+ktoenal_write(nal_cb_t *nal, void *private, user_ptr dst_addr,
+               void *src_addr, size_t len)
+{
+        CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n",
+               nal->ni.nid, (long)len, src_addr, dst_addr);
+
+        memcpy( dst_addr, src_addr, len );
+        return 0;
+}
+
+int 
+ktoenal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq,
+                        ptl_event_t *ev)
+{
+        CDEBUG(D_NET, LPX64": callback eq %p ev %p\n",
+               nal->ni.nid, eq, ev);
+
+        if (eq->event_callback != NULL) 
+                eq->event_callback(ev);
+
+        return 0;
+}
+
+void *
+ktoenal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+
+        if (buf != NULL)
+                memset(buf, 0, len);
+
+        return (buf);
+}
+
+void
+ktoenal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+void
+ktoenal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+       va_list ap;
+       char msg[256];
+
+       va_start (ap, fmt);
+       vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */
+       va_end (ap);
+
+       msg[sizeof (msg) - 1] = 0;              /* ensure terminated */
+
+        CDEBUG (D_NET, "%s", msg);
+}
+
+void
+ktoenal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+
+        spin_lock(&data->ksnd_nal_cb_lock);
+}
+
+void
+ktoenal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data;
+        data = nal->nal_data;
+
+        spin_unlock(&data->ksnd_nal_cb_lock);
+}
+
+int
+ktoenal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* I would guess that if ktoenal_get_conn(nid) == NULL,
+           and we're not routing, then 'nid' is very distant :) */
+        if ( nal->ni.nid == nid ) {
+                *dist = 0;
+        } else {
+                *dist = 1;
+        }
+
+        return 0;
+}
+
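+/* Grab an idle local-transmit descriptor.  Callers that may block wait for
+ * one on the regular idle list; non-blocking callers may fall back to the
+ * reserved non-blocking list and can get NULL if that is empty too. */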
+ksock_ltx_t *
+ktoenal_get_ltx (int may_block)
+{
+        long        flags;
+        ksock_ltx_t *ltx = NULL;
+        
+        for (;;)
+        {
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+        
+                if (!list_empty (&ktoenal_data.ksnd_idle_ltx_list))
+                {
+                        ltx = list_entry (ktoenal_data.ksnd_idle_ltx_list.next, ksock_ltx_t, ltx_tx.tx_list);
+                        list_del (&ltx->ltx_tx.tx_list);
+                        break;
+                }
+
+                if (!may_block)
+                {
+                        if (!list_empty (&ktoenal_data.ksnd_idle_nblk_ltx_list))
+                        {
+                                ltx = list_entry (ktoenal_data.ksnd_idle_nblk_ltx_list.next, 
+                                                  ksock_ltx_t, ltx_tx.tx_list);
+                                list_del (&ltx->ltx_tx.tx_list);
+                        }
+                        break;
+                }
+                
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+                
+                wait_event (ktoenal_data.ksnd_idle_ltx_waitq,
+                            !list_empty (&ktoenal_data.ksnd_idle_ltx_list));
+        }
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+
+        return (ltx);
+}
+
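+/* Push 'iov' at the socket via its writev() file operation, under KERNEL_DS
+ * since the iov points at kernel memory.  On a (partial) send the iov is
+ * advanced past the bytes written so the caller can retry with the rest. */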
+int
+ktoenal_sendmsg (struct file *sock, struct iovec *iov, int niov, int nob, int flags)
+{
+        /* NB This procedure "consumes" iov (actually we do, tcp_sendmsg doesn't)
+         */
+        mm_segment_t oldmm;
+        int           rc;
+
+        LASSERT (niov > 0);
+        LASSERT (nob > 0);
+        
+        oldmm = get_fs();
+        set_fs (KERNEL_DS);
+
+#ifdef PORTAL_DEBUG
+        {
+                int total_nob;
+                int i;
+                
+                for (i = total_nob = 0; i < niov; i++)
+                        total_nob += iov[i].iov_len;
+                
+                LASSERT (nob == total_nob);
+        }
+#endif        
+        LASSERT (!in_interrupt());
+       
+        rc = sock->f_op->writev(sock, iov, niov, NULL);
+
+        set_fs (oldmm);
+
+        if (rc > 0)                             /* sent something? */
+        {
+                nob = rc;                       /* consume iov */
+                for (;;)
+                {
+                        LASSERT (niov > 0);
+                        
+                        if (iov->iov_len >= nob)
+                        {
+                                iov->iov_len -= nob;
+                                iov->iov_base = (void *)(((unsigned long)iov->iov_base) + nob);
+                                break;
+                        }
+                        nob -= iov->iov_len;
+                        iov->iov_len = 0;
+                        iov++;
+                        niov--;
+                }
+        }
+
+        return (rc);
+}
+
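+/* Read at most 'toread' bytes into 'iov' with f_op->readv().  The last iov
+ * entry used is temporarily trimmed so no more than 'toread' bytes are
+ * requested, then restored before returning. */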
+int
+ktoenal_recvmsg(struct file *sock, struct iovec *iov, int niov, int toread)
+{
+        /* NB This procedure "consumes" iov (actually tcp_recvmsg does)
+         */
+        mm_segment_t oldmm;
+        int ret, i, len = 0, origlen = 0;
+        
+        PROF_START(our_recvmsg);
+        for(i = 0; i < niov; i++) {
+                len += iov[i].iov_len;
+                if(len >= toread)
+                        break;
+        }
+
+        if(len >= toread) {
+                origlen = iov[i].iov_len;
+                iov[i].iov_len -= (len - toread);
+        }
+        else {  /* i == niov */
+                i = niov - 1;
+        }
+
+        oldmm = get_fs();
+        set_fs(KERNEL_DS);
+
+        ret = sock->f_op->readv(sock, iov, i + 1, NULL);
+        
+        set_fs(oldmm);
+
+        if(origlen)
+                iov[i].iov_len = origlen;
+
+        PROF_FINISH(our_recvmsg);
+        return ret;
+}
+
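+/* Send the tx at the head of the conn's queue.  Called with the scheduler
+ * lock held; the lock is dropped around the actual send and re-acquired
+ * before returning.  A complete send finalizes (or kpr_fwd_done()s) the
+ * packet; a partial send puts the tx back at the head of the queue. */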
+void
+ktoenal_process_transmit (ksock_conn_t *conn, long *irq_flags)
+{
+        ksock_tx_t *tx = list_entry (conn->ksnc_tx_queue.next, ksock_tx_t, tx_list);
+        int         rc;
+        
+        LASSERT (conn->ksnc_tx_scheduled);
+        LASSERT (conn->ksnc_tx_ready);
+        LASSERT (!list_empty (&conn->ksnc_tx_queue));
+
+        /* assume transmit will complete now, so dequeue while I've got the lock */
+        list_del (&tx->tx_list);
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+
+        LASSERT (tx->tx_nob > 0);
+
+        conn->ksnc_tx_ready = 0;                /* write_space may race with me and set ready */
+        mb();                                   /* => clear BEFORE trying to write */
+
+        rc = ktoenal_sendmsg (conn->ksnc_file,
+                               tx->tx_iov, tx->tx_niov, tx->tx_nob,
+                               list_empty (&conn->ksnc_tx_queue) ? 
+                               MSG_DONTWAIT : (MSG_DONTWAIT | MSG_MORE));
+
+        CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc);
+
+        if (rc < 0)                             /* error */
+        {
+                if (rc == -EAGAIN)              /* socket full => */
+                        rc = 0;                 /* nothing sent */
+                else
+                {
+#warning FIXME: handle socket errors properly
+                        CERROR ("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc);
+                        rc = tx->tx_nob;        /* pretend for now the whole packet went */
+                }
+        }
+
+        if (rc == tx->tx_nob)                   /* everything went */
+        {
+                conn->ksnc_tx_ready = 1;        /* assume more can go (ASAP) */
+                ktoenal_put_conn (conn);       /* release packet's ref */
+
+                if (tx->tx_isfwd)               /* was a forwarded packet? */
+                {
+                        kpr_fwd_done (&ktoenal_data.ksnd_router,
+                                      KSOCK_TX_2_KPR_FWD_DESC (tx), 0);
+
+                        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                }
+                else                            /* local send */
+                {
+                        ksock_ltx_t *ltx = KSOCK_TX_2_KSOCK_LTX (tx);
+
+                        lib_finalize (&ktoenal_lib, ltx->ltx_private, ltx->ltx_cookie);
+
+                        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                        
+                        list_add (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+
+                        /* normal tx desc => wakeup anyone blocking for one */
+                        if (ltx->ltx_idle == &ktoenal_data.ksnd_idle_ltx_list &&
+                            waitqueue_active (&ktoenal_data.ksnd_idle_ltx_waitq))
+                                wake_up (&ktoenal_data.ksnd_idle_ltx_waitq);
+                }
+                ktoenal_packets_transmitted++;
+        }
+        else
+        {
+                tx->tx_nob -= rc;
+
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+
+                /* back onto HEAD of tx_queue */
+                list_add (&tx->tx_list, &conn->ksnc_tx_queue);
+        }
+
+        if (!conn->ksnc_tx_ready ||             /* no space to write now */
+            list_empty (&conn->ksnc_tx_queue))  /* nothing to write */
+        {
+                conn->ksnc_tx_scheduled = 0;    /* not being scheduled */
+                ktoenal_put_conn (conn);       /* release scheduler's ref */
+        }
+        else                                    /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_tx_list, &ktoenal_data.ksnd_tx_conns);
+}
+
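+/* Truncate the tx iov to exactly tx_nob bytes (the socket write path pays
+ * no attention to the requested length), queue the tx on the conn and, if
+ * the socket is writable and the conn isn't already scheduled, hand the
+ * conn to the scheduler threads. */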
+void
+ktoenal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        long          flags;
+        int           nob = tx->tx_nob;
+        struct iovec *iov = tx->tx_iov;
+        int           niov = 1;
+        
+        LASSERT (nob >= sizeof (ptl_hdr_t));
+
+        /* Truncate iov to exactly match total packet length
+         * since socket sendmsg pays no attention to requested length.
+         */
+        for (;;)
+        {
+                LASSERT (niov <= tx->tx_niov);
+                LASSERT (iov->iov_len >= 0);
+                
+                if (iov->iov_len >= nob)
+                {
+                        iov->iov_len = nob;
+                        break;
+                }
+                nob -= iov->iov_len;
+                iov++;
+                niov++;
+        }
+        tx->tx_niov = niov;
+        
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+        list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
+
+        if (conn->ksnc_tx_ready &&              /* able to send */
+            !conn->ksnc_tx_scheduled)           /* not scheduled to send */
+        {
+                list_add_tail (&conn->ksnc_tx_list, &ktoenal_data.ksnd_tx_conns);
+                conn->ksnc_tx_scheduled = 1;
+                atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */
+                if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq))
+                        wake_up (&ktoenal_data.ksnd_sched_waitq);
+        }
+
+        ktoenal_packets_launched++;
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+}
+
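+/* NAL send callback: build a local tx descriptor with the portals header
+ * in iov[0] and the payload iovs after it, then launch it on a connection.
+ * If 'nid' is not a direct peer, the portals router is asked for a gateway. */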
+int
+ktoenal_send(nal_cb_t *nal, void *private, lib_msg_t *cookie,
+              ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+              unsigned int payload_niov, struct iovec *payload_iov, size_t payload_len)
+{
+        ptl_nid_t     gatewaynid;
+        ksock_conn_t *conn;
+        ksock_ltx_t  *ltx;
+        int           rc;
+        int           i;
+
+        /* By this point, as it happens, we have absolutely no idea what
+         * 'private' is.  It might be ksock_nal_data or it might be ksock_conn.
+         * Ha ha, isn't that a funny joke?
+         *
+         * FIXME: this is not the right way to fix this; the right way is to
+         * always pass in the same kind of structure.  This is hard right now.
+         * To revisit this issue, set a breakpoint in here and watch for when
+         * it's called from lib_finalize.  I think this occurs when we send a
+         * packet as a side-effect of another packet, such as when an ACK has
+         * been requested. -phil */
+
+        CDEBUG(D_NET, "sending %d bytes from [%d](%p,%d)... to nid: "
+               LPX64" pid %d\n", (int)payload_len, payload_niov,
+               payload_niov > 0 ? payload_iov[0].iov_base : NULL,
+               (int)(payload_niov > 0 ? payload_iov[0].iov_len : 0), nid, pid);
+
+        if ((conn = ktoenal_get_conn (nid)) == NULL)
+        {
+                /* It's not a peer; try to find a gateway */
+                rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, &gatewaynid);
+                if (rc != 0)
+                {
+                        CERROR ("Can't route to "LPX64": router error %d\n", nid, rc);
+                        return (-1);
+                }
+
+                if ((conn = ktoenal_get_conn (gatewaynid)) == NULL)
+                {
+                        CERROR ("Can't route to "LPX64": gateway "LPX64" is not a peer\n", 
+                                nid, gatewaynid);
+                        return (-1);
+                }
+        }
+
+        /* This transmit has now got a ref on conn */
+
+        /* I may not block for a transmit descriptor if I might block the
+         * receiver, or an interrupt handler. */
+        ltx = ktoenal_get_ltx (!(type == PTL_MSG_ACK ||
+                                 type == PTL_MSG_REPLY ||
+                                 in_interrupt ()));
+        if (ltx == NULL)
+        {
+                CERROR ("Can't allocate tx desc\n");
+                ktoenal_put_conn (conn);
+                return (-1);
+        }
+        
+        /* Init common (to sends and forwards) packet part */
+        ltx->ltx_tx.tx_isfwd = 0;
+        ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len;
+        ltx->ltx_tx.tx_niov = 1 + payload_niov;
+        ltx->ltx_tx.tx_iov = ltx->ltx_iov;
+
+        /* Init local send packet (storage for hdr, finalize() args, iov) */
+        ltx->ltx_hdr = *hdr;
+        ltx->ltx_private = private;
+        ltx->ltx_cookie = cookie;
+
+        ltx->ltx_iov[0].iov_base = &ltx->ltx_hdr;
+        ltx->ltx_iov[0].iov_len = sizeof (ltx->ltx_hdr);
+
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+
+        for (i = 0; i < payload_niov; i++)
+        {
+                ltx->ltx_iov[1 + i].iov_base = payload_iov[i].iov_base;
+                ltx->ltx_iov[1 + i].iov_len  = payload_iov[i].iov_len;
+        }
+
+        ktoenal_launch_packet (conn, &ltx->ltx_tx);
+        return (0);
+}
+
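+/* Router upcall to transmit a packet being forwarded.  The tx descriptor
+ * lives in the router descriptor's scratch space; if this node is the
+ * gateway (last hop) the packet is sent directly to its target nid. */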
+void
+ktoenal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+        ksock_conn_t *conn;
+        ptl_nid_t     nid = fwd->kprfd_gateway_nid;
+        ksock_tx_t   *tx  = (ksock_tx_t *)&fwd->kprfd_scratch;
+
+        CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64")\n", fwd, 
+                fwd->kprfd_gateway_nid, fwd->kprfd_target_nid);
+
+        if (nid == ktoenal_lib.ni.nid)         /* I'm the gateway; must be the last hop */
+                nid = fwd->kprfd_target_nid;
+        
+        conn = ktoenal_get_conn (nid);
+        if (conn == NULL)
+        {
+                CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid);
+                kpr_fwd_done (&ktoenal_data.ksnd_router, fwd, -EHOSTUNREACH);
+                return;
+        }
+
+        /* This forward has now got a ref on conn */
+
+        tx->tx_isfwd = 1;                       /* This is a forwarding packet */
+        tx->tx_nob   = fwd->kprfd_nob;
+        tx->tx_niov  = fwd->kprfd_niov;
+        tx->tx_iov   = fwd->kprfd_iov;
+
+        ktoenal_launch_packet (conn, tx);
+}
+
+int
+ktoenal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long    pid = kernel_thread (fn, arg, 0);
+
+        if (pid < 0)
+                return ((int)pid);
+
+        atomic_inc (&ktoenal_data.ksnd_nthreads);
+        return (0);
+}
+
+void
+ktoenal_thread_fini (void)
+{
+        atomic_dec (&ktoenal_data.ksnd_nthreads);
+}
+
+void
+ktoenal_fmb_callback (void *arg, int error)
+{
+        ksock_fmb_t       *fmb = (ksock_fmb_t *)arg;
+        ptl_hdr_t         *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]);
+        ksock_conn_t      *conn;
+        long               flags;
+
+        CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": %d\n", 
+                hdr->src_nid, hdr->dest_nid, error);
+
+        if (error != 0)
+                CERROR ("Failed to route packet from "LPX64" to "LPX64": %d\n", 
+                        hdr->src_nid, hdr->dest_nid, error);
+
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+        
+        list_add (&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs);
+
+        if (!list_empty (&fmb->fmb_pool->fmp_blocked_conns))
+        {
+                conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next, ksock_conn_t, ksnc_rx_list);
+                list_del (&conn->ksnc_rx_list);
+
+                CDEBUG (D_NET, "Scheduling conn %p\n", conn);
+                LASSERT (conn->ksnc_rx_scheduled);
+                LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP);
+
+                conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;
+                list_add_tail (&conn->ksnc_rx_list, &ktoenal_data.ksnd_rx_conns);
+
+                if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq))
+                        wake_up (&ktoenal_data.ksnd_sched_waitq);
+        }
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+}
+
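+/* Pick an idle forwarding buffer from the small or large pool, depending
+ * on the total packet size.  If the pool is empty the conn is parked on
+ * the pool's blocked list and NULL is returned; ktoenal_fmb_callback()
+ * reschedules the conn when a buffer is freed. */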
+ksock_fmb_t *
+ktoenal_get_idle_fmb (ksock_conn_t *conn)
+{
+        /* NB called with sched lock held */
+        int               payload_nob = conn->ksnc_rx_nob_left;
+        int               packet_nob = sizeof (ptl_hdr_t) + payload_nob;
+        ksock_fmb_pool_t *pool;
+        ksock_fmb_t      *fmb;
+        
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
+
+        if (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE)
+                pool = &ktoenal_data.ksnd_small_fmp;
+        else
+                pool = &ktoenal_data.ksnd_large_fmp;
+        
+        if (!list_empty (&pool->fmp_idle_fmbs))
+        {
+                fmb = list_entry (pool->fmp_idle_fmbs.next, ksock_fmb_t, fmb_list);
+                list_del (&fmb->fmb_list);
+                return (fmb);
+        }
+
+        /* deschedule until fmb free */
+
+        conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP;
+
+        list_add_tail (&conn->ksnc_rx_list,
+                       &pool->fmp_blocked_conns);
+        return (NULL);
+}
+
+
+int
+ktoenal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb)
+{
+        int payload_nob = conn->ksnc_rx_nob_left;
+        int packet_nob = sizeof (ptl_hdr_t) + payload_nob;
+        int niov;                               /* at least the header */
+        int nob;
+        
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
+        LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left);
+        LASSERT (payload_nob >= 0);
+        LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE);
+        LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE);
+        
+        /* Got a forwarding buffer; copy the header we just read into the
+         * forwarding buffer.  If there's payload, start reading it
+         * into the buffer, otherwise the forwarding buffer can be kicked
+         * off immediately.
+         *
+         * NB fmb->fmb_iov spans the WHOLE packet.
+         *    conn->ksnc_rx_iov spans just the payload.
+         */
+
+        fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]);
+                
+        memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t)); /* copy header */
+
+        if (payload_nob == 0)                   /* got complete packet already */
+        {
+                atomic_inc (&ktoenal_packets_received);
+
+                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n", conn,
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, packet_nob);
+
+                fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t);
+
+                kpr_fwd_init (&fmb->fmb_fwd, conn->ksnc_hdr.dest_nid, 
+                              packet_nob, 1, fmb->fmb_iov, 
+                              ktoenal_fmb_callback, fmb);
+
+                kpr_fwd_start (&ktoenal_data.ksnd_router, &fmb->fmb_fwd); /* forward it now */
+
+                ktoenal_new_packet (conn, 0);  /* on to next packet */
+                return (1);
+        }
+
+        niov = 1;
+        if (packet_nob <= PAGE_SIZE)            /* whole packet fits in first page */
+                fmb->fmb_iov[0].iov_len = packet_nob;
+        else
+        {
+                fmb->fmb_iov[0].iov_len = PAGE_SIZE;
+                nob = packet_nob - PAGE_SIZE;
+                
+                do
+                {
+                        LASSERT (niov < fmb->fmb_npages);
+                        fmb->fmb_iov[niov].iov_base = page_address (fmb->fmb_pages[niov]);
+                        fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob);
+                        nob -= PAGE_SIZE;
+                        niov++;
+                } while (nob > 0);
+        }
+
+        kpr_fwd_init (&fmb->fmb_fwd, conn->ksnc_hdr.dest_nid, 
+                      packet_nob, niov, fmb->fmb_iov, 
+                      ktoenal_fmb_callback, fmb);
+
+        /* stash router's descriptor ready for call to kpr_fwd_start */        
+        conn->ksnc_cookie = &fmb->fmb_fwd;
+
+        conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */
+
+        /* payload is desc's iov-ed buffer, but skipping the hdr */
+        LASSERT (niov <= sizeof (conn->ksnc_rx_iov) / sizeof (conn->ksnc_rx_iov[0]));
+
+        conn->ksnc_rx_iov[0].iov_base = (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) + sizeof (ptl_hdr_t));
+        conn->ksnc_rx_iov[0].iov_len = fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t);
+
+        if (niov > 1)
+                memcpy (&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1], (niov - 1) * sizeof (struct iovec));
+
+        conn->ksnc_rx_niov = niov;
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn,
+                conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, payload_nob);
+        return (0);
+}
+
+void
+ktoenal_fwd_parse (ksock_conn_t *conn)
+{
+        ksock_conn_t *conn2;
+        int           body_len;
+
+        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn,
+                conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, conn->ksnc_rx_nob_left);
+
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER);
+        LASSERT (conn->ksnc_rx_scheduled);
+
+        switch (conn->ksnc_hdr.type)
+        {
+        case PTL_MSG_GET:
+        case PTL_MSG_ACK:
+                body_len = 0;
+                break;
+        case PTL_MSG_PUT:
+                body_len = conn->ksnc_hdr.msg.put.length;
+                break;
+        case PTL_MSG_REPLY:
+                body_len = conn->ksnc_hdr.msg.reply.length;
+                break;
+        default:
+                /* Unrecognised packet type */
+                CERROR ("Unrecognised packet type %d from "LPX64" for "LPX64"\n",
+                        conn->ksnc_hdr.type, conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid);
+                /* Ignore this header and go back to reading a new packet. */
+                ktoenal_new_packet (conn, 0);
+                return;
+        }
+
+        if (body_len < 0)                               /* length corrupt */
+        {
+                CERROR ("dropping packet from "LPX64" for "LPX64": packet size %d illegal\n",
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, body_len);
+                ktoenal_new_packet (conn, 0);          /* on to new packet */
+                return;
+        }
+
+        if (body_len > SOCKNAL_MAX_FWD_PAYLOAD)         /* too big to forward */
+        {
+                CERROR ("dropping packet from "LPX64" for "LPX64": packet size %d too big\n",
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, body_len);
+                ktoenal_new_packet (conn, body_len);    /* on to new packet (skip this one's body) */
+                return;
+        }
+
+        conn2 = ktoenal_get_conn (conn->ksnc_hdr.dest_nid); /* should have gone direct */
+        if (conn2 != NULL)
+        {
+                CERROR ("dropping packet from "LPX64" for "LPX64": target is a peer\n",
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid);
+                ktoenal_put_conn (conn2);          /* drop ref from get above */
+
+                ktoenal_new_packet (conn, body_len);  /* on to next packet (skip this one's body) */
+                return;
+        }
+
+        conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;       /* Getting FMB now */
+        conn->ksnc_rx_nob_left = body_len;              /* stash packet size */
+        conn->ksnc_rx_nob_wanted = body_len;            /* (no slop) */
+}
+
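+/* Reset the conn's rx state for the next packet.  With nothing to skip we
+ * read the next portals header into ksnc_hdr; otherwise the rx iov is
+ * pointed at a static slop buffer so the unwanted body can be discarded.
+ * Returns non-zero when positioned at a packet boundary (header next). */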
+int
+ktoenal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+{
+        static char ktoenal_slop_buffer[4096];
+
+        int   nob;
+        int   niov;
+        int   skipped;
+
+        if (nob_to_skip == 0)                   /* right at next packet boundary now */
+        {
+                conn->ksnc_rx_state = SOCKNAL_RX_HEADER;
+                conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t);
+
+                conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr;
+                conn->ksnc_rx_iov[0].iov_len  = sizeof (ptl_hdr_t);
+                conn->ksnc_rx_niov = 1;
+                return (1);
+        }
+
+        /* set up to skip as much as possible now */
+        /* if there's more left (ran out of iov entries) we'll get called again */
+
+        conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+        conn->ksnc_rx_nob_left = nob_to_skip;
+        skipped = 0;
+        niov = 0;
+
+        do
+        {
+                nob = MIN (nob_to_skip, sizeof (ktoenal_slop_buffer));
+
+                conn->ksnc_rx_iov[niov].iov_base = ktoenal_slop_buffer;
+                conn->ksnc_rx_iov[niov].iov_len  = nob;
+                niov++;
+                skipped += nob;
+                nob_to_skip -= nob;
+
+        } while (nob_to_skip != 0 &&            /* mustn't overflow conn's rx iov */
+                 niov < sizeof (conn->ksnc_rx_iov)/sizeof (conn->ksnc_rx_iov[0]));
+
+        conn->ksnc_rx_niov = niov;
+        conn->ksnc_rx_nob_wanted = skipped;
+        return (0);
+}
+
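+/* Advance the conn's receive state machine: header -> body (or slop, or a
+ * forwarding buffer) -> next header.  Called with the scheduler lock held;
+ * the lock is dropped around socket reads and re-acquired before returning. */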
+void
+ktoenal_process_receive (ksock_conn_t *conn, long *irq_flags)
+{
+        ksock_fmb_t *fmb;
+        int          len;
+        LASSERT (atomic_read (&conn->ksnc_refcount) > 0);
+        LASSERT (conn->ksnc_rx_scheduled);
+        LASSERT (conn->ksnc_rx_ready);
+
+        /* NB: sched lock held */
+        CDEBUG(D_NET, "conn %p\n", conn);
+
+        if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB)     /* doesn't need a forwarding buffer */
+        {
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                goto try_read;
+        }
+
+ get_fmb:
+        /* NB: sched lock held */
+        fmb = ktoenal_get_idle_fmb (conn);
+        if (fmb == NULL)                        /* conn descheduled waiting for idle fmb */
+                return;
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+        
+        if (ktoenal_init_fmb (conn, fmb)) /* packet forwarded ? */
+                goto out;               /* come back later for next packet */
+
+ try_read:
+        /* NB: sched lock NOT held */
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+
+        LASSERT (conn->ksnc_rx_niov > 0);
+        LASSERT (conn->ksnc_rx_nob_wanted > 0);
+
+        conn->ksnc_rx_ready = 0;                /* data ready may race with me and set ready */
+        mb();                                   /* => clear BEFORE trying to read */
+
+        /* NB ktoenal_recvmsg "consumes" the iov passed to it */
+        len = ktoenal_recvmsg(conn->ksnc_file,
+                               conn->ksnc_rx_iov, conn->ksnc_rx_niov,
+                               conn->ksnc_rx_nob_wanted);
+        CDEBUG (D_NET, "%p read(%d) %d\n", conn, conn->ksnc_rx_nob_wanted, len);
+
+        if (len <= 0)                           /* nothing ready (EAGAIN) or EOF or error */
+        {
+                if (len != -EAGAIN &&           /* ! nothing to read now */
+                    len != 0)                   /* ! nothing to read ever */
+                {
+#warning FIXME: handle socket errors properly
+                        CERROR ("Error socknal read(%d) %p: %d\n",
+                                conn->ksnc_rx_nob_wanted, conn, len);
+                }
+                goto out;                       /* come back when there's data ready */
+        }
+
+        LASSERT (len <= conn->ksnc_rx_nob_wanted);
+        conn->ksnc_rx_nob_wanted -= len;
+        conn->ksnc_rx_nob_left -= len;
+
+        if (conn->ksnc_rx_nob_wanted != 0)      /* short read */
+                goto out;                       /* try again later */
+
+        conn->ksnc_rx_ready = 1;                /* assume there's more to be had */
+
+        switch (conn->ksnc_rx_state)
+        {
+        case SOCKNAL_RX_HEADER:
+                if (conn->ksnc_hdr.dest_nid != ktoenal_lib.ni.nid) /* It's not for me */
+                {
+                        ktoenal_fwd_parse (conn);
+                        switch (conn->ksnc_rx_state)
+                        {
+                        case SOCKNAL_RX_HEADER: /* skipped this packet (zero payload) */
+                                goto out;       /* => come back later */
+                        case SOCKNAL_RX_SLOP:   /* skipping this packet's body */
+                                goto try_read;  /* => go read it */
+                        case SOCKNAL_RX_GET_FMB: /* forwarding */
+                                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                                goto get_fmb;   /* => go get a fwd msg buffer */
+                        default:
+                                break;
+                        }
+                        /* Not Reached */
+                        LBUG ();
+                }
+
+                PROF_START(lib_parse);
+                lib_parse(&ktoenal_lib, &conn->ksnc_hdr, conn); /* sets wanted_len, iovs etc */
+                PROF_FINISH(lib_parse);
+
+                if (conn->ksnc_rx_nob_wanted != 0) /* need to get some payload? */
+                {
+                        conn->ksnc_rx_state = SOCKNAL_RX_BODY;
+                        goto try_read;          /* go read the payload */
+                }
+                /* Fall through (completed packet for me) */
+
+        case SOCKNAL_RX_BODY:
+                atomic_inc (&ktoenal_packets_received);
+                lib_finalize(&ktoenal_lib, NULL, conn->ksnc_cookie); /* packet is done now */
+                /* Fall through */
+
+        case SOCKNAL_RX_SLOP:
+                if (ktoenal_new_packet (conn, conn->ksnc_rx_nob_left)) /* starting new packet? */
+                        goto out;               /* come back later */
+                goto try_read;                  /* try to finish reading slop now */
+
+        case SOCKNAL_RX_BODY_FWD:
+                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", conn,
+                        conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, conn->ksnc_rx_nob_left);
+
+                atomic_inc (&ktoenal_packets_received);
+
+                /* ktoenal_init_fmb() stashed router descriptor in conn->ksnc_cookie */
+                kpr_fwd_start (&ktoenal_data.ksnd_router, (kpr_fwd_desc_t *)conn->ksnc_cookie);
+
+                LASSERT (conn->ksnc_rx_nob_left == 0); /* no slop in forwarded packets */
+
+                ktoenal_new_packet (conn, 0);  /* on to next packet */
+                goto out;                       /* (later) */
+
+        default:
+                break;
+        }
+
+        /* Not Reached */
+        LBUG ();
+
+ out:
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+
+        if (!conn->ksnc_rx_ready)               /* no data there to read? */
+        {
+                conn->ksnc_rx_scheduled = 0;    /* let socket callback schedule again */
+                ktoenal_put_conn (conn);       /* release scheduler's ref */
+        }
+        else                                    /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_rx_list, &ktoenal_data.ksnd_rx_conns);
+}
+
+int
+ktoenal_recv(nal_cb_t *nal, void *private, lib_msg_t *msg,
+             unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen)
+{
+        ksock_conn_t *conn = (ksock_conn_t *)private;
+        int           i;
+
+        conn->ksnc_cookie = msg;
+
+        LASSERT (niov <= PTL_MD_MAX_IOV);
+        for (i = 0; i < niov; i++)
+        {
+                conn->ksnc_rx_iov[i].iov_len = iov[i].iov_len;
+                conn->ksnc_rx_iov[i].iov_base = iov[i].iov_base;
+        }
+
+        conn->ksnc_rx_niov       = niov;
+        conn->ksnc_rx_nob_wanted = mlen;
+        conn->ksnc_rx_nob_left   = rlen;
+
+        return (rlen);
+}
+
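+/* Scheduler thread: service conns queued for receive and transmit work
+ * semi-fairly, sleeping when both lists are empty and yielding the CPU
+ * every SOCKNAL_RESCHED loops. */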
+int
+ktoenal_scheduler (void *arg)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        int                rc;
+        int                nloops = 0;
+
+        kportal_daemonize ("ktoenal_sched");
+        kportal_blockallsigs ();
+        
+        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+
+        while (!ktoenal_data.ksnd_shuttingdown)
+        {
+                int did_something = 0;
+
+                /* Ensure I progress everything semi-fairly */
+
+                if (!list_empty (&ktoenal_data.ksnd_rx_conns))
+                {
+                        did_something = 1;
+                        conn = list_entry (ktoenal_data.ksnd_rx_conns.next,
+                                           ksock_conn_t, ksnc_rx_list);
+                        list_del (&conn->ksnc_rx_list);
+
+                        ktoenal_process_receive (conn, &flags); /* drops & regains ksnd_sched_lock */
+                }
+
+                if (!list_empty (&ktoenal_data.ksnd_tx_conns))
+                {
+                        did_something = 1;
+                        conn = list_entry (ktoenal_data.ksnd_tx_conns.next,
+                                           ksock_conn_t, ksnc_tx_list);
+
+                        list_del (&conn->ksnc_tx_list);
+                        ktoenal_process_transmit (conn, &flags); /* drops and regains ksnd_sched_lock */
+                }
+
+                if (!did_something ||           /* nothing to do */
+                    ++nloops == SOCKNAL_RESCHED) /* hogging CPU? */
+                {
+                        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+
+                        nloops = 0;
+
+                        if (!did_something) {   /* wait for something to do */
+                                rc = wait_event_interruptible (ktoenal_data.ksnd_sched_waitq,
+                                                               ktoenal_data.ksnd_shuttingdown ||
+                                                               !list_empty (&ktoenal_data.ksnd_rx_conns) ||
+                                                               !list_empty (&ktoenal_data.ksnd_tx_conns));
+                                LASSERT (rc == 0);
+                        } else 
+                                our_cond_resched();
+
+                        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+                }
+        }
+
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+        ktoenal_thread_fini ();
+        return (0);
+}
+
+
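+/* Reaper thread: close connections queued on ksnd_reaper_list, sleeping
+ * until more arrive or shutdown. */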
+int
+ktoenal_reaper (void *arg)
+{
+        unsigned long      flags;
+        ksock_conn_t      *conn;
+        int                rc;
+        
+        kportal_daemonize ("ktoenal_reaper");
+        kportal_blockallsigs ();
+
+        while (!ktoenal_data.ksnd_shuttingdown)
+        {
+                spin_lock_irqsave (&ktoenal_data.ksnd_reaper_lock, flags);
+
+                if (list_empty (&ktoenal_data.ksnd_reaper_list))
+                        conn = NULL;
+                else
+                {
+                        conn = list_entry (ktoenal_data.ksnd_reaper_list.next,
+                                           ksock_conn_t, ksnc_list);
+                        list_del (&conn->ksnc_list);
+                }
+
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_reaper_lock, flags);
+
+                if (conn != NULL)
+                        ktoenal_close_conn (conn);
+                else {
+                        rc = wait_event_interruptible (ktoenal_data.ksnd_reaper_waitq,
+                                                       ktoenal_data.ksnd_shuttingdown ||
+                                                       !list_empty(&ktoenal_data.ksnd_reaper_list));
+                        LASSERT (rc == 0);
+                }
+        }
+
+        ktoenal_thread_fini ();
+        return (0);
+}
+
+#define POLLREAD        (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)
+#define POLLWRITE       (POLLOUT | POLLWRNORM | POLLWRBAND)
+
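+/* Poll thread: walk the socket list, polling each file for readable or
+ * writable state and feeding the results to ktoenal_data_ready() and
+ * ktoenal_write_space().  The poll wait table is torn down and rebuilt
+ * whenever the socket list changes (ksnd_slistchange). */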
+int
+ktoenal_pollthread(void *arg)
+{
+        unsigned int mask;
+        struct list_head *tmp;
+        ksock_conn_t *conn;
+        
+        /* Save the task struct for waking it up */
+        ktoenal_data.ksnd_pollthread_tsk = current; 
+        
+        kportal_daemonize ("ktoenal_pollthread");
+        kportal_blockallsigs ();
+        
+        poll_initwait(&ktoenal_data.ksnd_pwait);
+        
+        while(!ktoenal_data.ksnd_shuttingdown) {
+                
+                set_current_state(TASK_INTERRUPTIBLE);
+                
+                read_lock (&ktoenal_data.ksnd_socklist_lock);
+                list_for_each(tmp, &ktoenal_data.ksnd_socklist) {
+                        
+                        conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+                        atomic_inc(&conn->ksnc_refcount);
+                        read_unlock (&ktoenal_data.ksnd_socklist_lock);
+                        
+                        mask = conn->ksnc_file->f_op->poll(conn->ksnc_file,
+                                  ktoenal_data.ksnd_slistchange ? 
+                                  &ktoenal_data.ksnd_pwait : NULL);
+                         
+                        if (mask & POLLREAD)
+                                ktoenal_data_ready(conn);
+
+                        if (mask & POLLWRITE)
+                                ktoenal_write_space(conn);
+
+                        if (mask & (POLLERR | POLLHUP)) {
+                                /* Do error processing */
+                        }
+                        
+                        read_lock (&ktoenal_data.ksnd_socklist_lock);
+                        if(atomic_dec_and_test(&conn->ksnc_refcount))
+                                _ktoenal_put_conn(conn);
+                }
+                ktoenal_data.ksnd_slistchange = 0;
+                read_unlock (&ktoenal_data.ksnd_socklist_lock);
+                
+                schedule_timeout(MAX_SCHEDULE_TIMEOUT);
+                if(ktoenal_data.ksnd_slistchange) {
+                        poll_freewait(&ktoenal_data.ksnd_pwait); 
+                        poll_initwait(&ktoenal_data.ksnd_pwait);
+                }
+         }
+        poll_freewait(&ktoenal_data.ksnd_pwait);
+        ktoenal_thread_fini();
+        return (0);
+}
+
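+/* Called from the poll thread when a socket has data to read: mark the
+ * conn rx-ready and, if it isn't already being progressed, queue it for
+ * the scheduler (taking an extra ref) and wake a scheduler thread. */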
+void
+ktoenal_data_ready (ksock_conn_t *conn)
+{
+        unsigned long  flags;
+        ENTRY;
+
+        if (!test_and_set_bit (0, &conn->ksnc_rx_ready)) { 
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+
+                if (!conn->ksnc_rx_scheduled) {  /* not being progressed */
+                        list_add_tail (&conn->ksnc_rx_list, 
+                                        &ktoenal_data.ksnd_rx_conns);
+                        conn->ksnc_rx_scheduled = 1;
+                        /* extra ref for scheduler */
+                        atomic_inc (&conn->ksnc_refcount);
+
+                        /* This is done to avoid the effects of a sequence
+                         * of events in which the rx_ready is lost
+                         */
+                        conn->ksnc_rx_ready=1;
+                          
+                        if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq))
+                                wake_up (&ktoenal_data.ksnd_sched_waitq);
+                }
+
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+        }
+
+        EXIT;
+}
+
+void
+ktoenal_write_space (ksock_conn_t *conn)
+{
+        unsigned long  flags;
+
+        CDEBUG (D_NET, "conn %p%s%s%s\n",
+                         conn,
+                        (conn == NULL) ? "" : (test_bit (0, &conn->ksnc_tx_ready) ? " ready" : " blocked"),
+                        (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? " scheduled" : " idle"),
+                        (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ? " empty" : " queued"));
+
+
+        if (!test_and_set_bit (0, &conn->ksnc_tx_ready)) {
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+
+                if (!list_empty (&conn->ksnc_tx_queue) && /* packets to send */
+                                !conn->ksnc_tx_scheduled) { /* not being progressed */
+
+                        list_add_tail (&conn->ksnc_tx_list, 
+                                        &ktoenal_data.ksnd_tx_conns);
+                        conn->ksnc_tx_scheduled = 1;
+                        /* extra ref for scheduler */
+                        atomic_inc (&conn->ksnc_refcount);
+
+                        if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq))
+                                wake_up (&ktoenal_data.ksnd_sched_waitq);
+                }
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+        }
+}
+
+nal_cb_t ktoenal_lib = {
+        nal_data:       &ktoenal_data,                /* NAL private data */
+        cb_send:         ktoenal_send,
+        cb_recv:         ktoenal_recv,
+        cb_read:         ktoenal_read,
+        cb_write:        ktoenal_write,
+        cb_callback:     ktoenal_callback,
+        cb_malloc:       ktoenal_malloc,
+        cb_free:         ktoenal_free,
+        cb_printf:       ktoenal_printf,
+        cb_cli:          ktoenal_cli,
+        cb_sti:          ktoenal_sti,
+        cb_dist:         ktoenal_dist
+};
diff --git a/lustre/portals/libcfs/.cvsignore b/lustre/portals/libcfs/.cvsignore
new file mode 100644 (file)
index 0000000..67d1a3d
--- /dev/null
@@ -0,0 +1,4 @@
+.deps
+Makefile
+Makefile.in
+link-stamp
diff --git a/lustre/portals/libcfs/Makefile.am b/lustre/portals/libcfs/Makefile.am
new file mode 100644 (file)
index 0000000..20d7fbd
--- /dev/null
@@ -0,0 +1,29 @@
+# Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+
+MODULE = portals
+modulenet_DATA = portals.o
+EXTRA_PROGRAMS = portals
+
+LIBLINKS := lib-dispatch.c lib-eq.c lib-init.c lib-md.c lib-me.c lib-move.c lib-msg.c lib-ni.c lib-pid.c
+APILINKS := api-eq.c api-errno.c api-init.c api-me.c api-ni.c api-wrap.c
+LINKS = $(APILINKS) $(LIBLINKS) 
+DISTCLEANFILES = $(LINKS) link-stamp *.orig *.rej
+
+$(LINKS): link-stamp
+link-stamp:
+       -list='$(LIBLINKS)'; for f in $$list; do echo $$f ; ln -sf $(srcdir)/../portals/$$f .; done
+       -list='$(APILINKS)'; for f in $$list; do echo $$f ; ln -sf $(srcdir)/../portals/$$f .; done
+       echo timestamp > link-stamp
+
+DEFS =
+portals_SOURCES = $(LINKS) module.c proc.c debug.c
+
+# Don't distribute any patched files.
+dist-hook:
+       list='$(EXT2C)'; for f in $$list; do rm -f $(distdir)/$$f; done
+
+include ../Rules.linux
diff --git a/lustre/portals/libcfs/Makefile.mk b/lustre/portals/libcfs/Makefile.mk
new file mode 100644 (file)
index 0000000..3196ea2
--- /dev/null
@@ -0,0 +1,9 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include fs/lustre/portals/Kernelenv
+
+obj-y += libcfs.o
+libcfs-objs   := module.o proc.o debug.o
\ No newline at end of file
diff --git a/lustre/portals/libcfs/debug.c b/lustre/portals/libcfs/debug.c
new file mode 100644 (file)
index 0000000..8d26dbb
--- /dev/null
@@ -0,0 +1,830 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/notifier.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <linux/interrupt.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/completion.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+#include <linux/miscdevice.h>
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <linux/kp30.h>
+
+#define DEBUG_OVERFLOW 1024
+static char *debug_buf = NULL;
+static unsigned long debug_size = 0;
+static atomic_t debug_off_a = ATOMIC_INIT(0);
+static int debug_wrapped;
+wait_queue_head_t debug_ctlwq;
+#define DAEMON_SND_SIZE      (64 << 10)
+
+/*
+ * Used by the daemon to keep track of the offset into debug_buf for the next
+ * write to the file.  Usually the daemon writes out the buffer from
+ * debug_daemon_next_write up to debug_off.
+ *  Variable usage:
+ *      Reader - portals_debug_msg()
+ *      Writer - portals_debug_daemon()
+ *               portals_debug_daemon_start() during daemon init time
+ *               portals_debug_daemon_continue() to reset to debug_off
+ *               portals_debug_clear_buffer() to reset to debug_off for clear
+ *      Note that *_start(), *_continue() and *_clear_buffer() should be serialized.
+ */
+static atomic_t   debug_daemon_next_write;
+
+/*
+ * A debug_daemon can be in the following states:
+ *      stopped - the stopped state means there is no debug_daemon running;
+ *                accordingly, it must also be in the paused state.
+ *                A daemon is in the !stopped && !paused state after
+ *                "lctl debug_daemon start" creates the debug_daemon successfully.
+ *                Variable Usage
+ *                      Reader - portals_debug_daemon()
+ *                               portals_debug_set_daemon() routines
+ *                      Writer - portals_debug_set_daemon() routines
+ *                               portals_debug_daemon() on IO error
+ *      paused -  a debug_daemon changes from !paused to paused
+ *                when "lctl debug_daemon pause" is issued;
+ *                "lctl debug_daemon continue" gets a daemon into !paused mode.
+ *                      Reader - portals_debug_set_daemon() routines
+ *                               portals_debug_msg()
+ *                      Writer - portals_debug_set_daemon() on init
+ *                               portals_debug_daemon()
+ *
+ *        Daemon  state diagram.
+ *                      (stopped, paused)
+ *                              |  <-- debug_daemon start
+ *                              V
+ *                      (!stopped, !paused)
+ *                              |  <-- debug_daemon pause
+ *                              V
+ *                      (!stopped, paused)
+ *                              |  <-- debug_daemon continue
+ *                              V
+ *                      (!stopped, !paused)
+ *                              |  <-- debug_daemon stop
+ *                              V
+ *                      (stopped, paused)
+ *      Overlapped - this is the state in which CDEBUG is too fast for the daemon
+ *                   to write out the debug buffer; that is, debug_off
+ *                   overtakes debug_daemon_next_write.
+ *                     Reader - portals_debug_msg()
+ *                     Writer - portals_debug_msg()
+ */
+
+/*
+ * Description of Trace Daemon Synchronization
+ *
+ * Three categories of code synchronize with each other:
+ * 1.   lctl, portals_debug_set_daemon(), the user debug control code,
+ *      as well as portals_debug_clear_buffer()
+ * 2.   CDEBUG, portals_debug_msg(), the routine that puts debug messages
+ * 3.   Daemon, portals_debug_daemon(), which writes out the debug log file
+ *
+ *
+ * Three different controls are used for synchronization:
+ *
+ * 1.   debug_daemon_semaphore
+ *      This semaphore serializes multiple lctl controls manipulating the
+ *      debug daemon state.  It acts as the gatekeeper, allowing only one
+ *      user control thread at any given time to access the debug daemon
+ *      state, and keeps the other user control requests waiting until the
+ *      current control request has been serviced.
+ *
+ * 2.   wait_queue_head_t lctl (paired with lctl_event flag)
+ *      The lctl event is the event between portals_debug_set_daemon() and
+ *      portals_debug_daemon().  It tells portals_debug_daemon() to flush
+ *      data out to the file; portals_debug_daemon() in turn uses the lctl
+ *      event as a signal channel to wake up portals_debug_set_daemon()
+ *      once the flush operation is done.
+ *
+ *      Producer :
+ *              portals_debug_daemon() uses it to wake up the
+ *              portals_debug_set_daemon() pause and stop routines
+ *      Consumer :
+ *              the portals_debug_set_daemon() stop and pause operations
+ *              wait and sleep on the event
+ *
+ * 3.   wait_queue_head_t daemon (paired with daemon_event flag)
+ *      This is an event channel used to wake up portals_debug_daemon().
+ *      The daemon wakes up to run whenever an event is posted and handles
+ *      two types of operations: 1. writing data out to the debug file,
+ *      2. flushing the file and terminating, based on the lctl event.
+ *      File operation -
+ *              The daemon is normally asleep.  It is woken up through the
+ *              daemon event whenever CDEBUG pushes data across a 64K
+ *              boundary.
+ *      File flush and termination -
+ *              On portals_debug_daemon_stop/pause() operations, the lctl
+ *              control wakes the daemon up through the daemon event.
+ *
+ *      We can't replace the daemon event with sleep_on() and wake_up(),
+ *      because portals_debug_daemon() must catch the wakeup posted by
+ *      portals_debug_daemon_stop/pause(); otherwise stop and pause could
+ *      get stuck waiting on the lctl event.
+ *
+ *      Producer :
+ *           a. portals_debug_daemon_pause() and portals_debug_daemon_stop()
+ *              use the event to wake up portals_debug_daemon()
+ *           b. portals_debug_msg() uses the event to wake up
+ *              portals_debug_daemon() whenever the data output crosses a
+ *              64K byte boundary.
+ *      Consumer :
+ *              portals_debug_daemon() wakes up upon the daemon event.
+ *
+ * Sequence for portals_debug_daemon_stop() operation
+ *
+ * _Portals_debug_daemon_stop()_          _Daemon_
+ *                                      Wait_event(daemon) or running
+ *      Paused = 1;
+ *      Wakeup_event (daemon)
+ *      Wait_event(lctl)
+ *                                      Set force_flush flag if lctl event
+ *                                      Flush data
+ *                                      Wakeup_event (lctl)
+ *                                      Wait_event(daemon)
+ *      Stopped = 1;
+ *      Wakeup_event (daemon)
+ *      Wait_event(lctl)
+ *                                      Exit daemon loop if (Stopped)
+ *                                      Wakeup_event (lctl)
+ *                                      Exit
+ *      Return to user application
+ *
+ *
+ * _Portals_debug_msg()_                  _Daemon_
+ *                                      Wait_event(daemon) or running
+ *      If (WriteStart<64K<WriteEnd)
+ *         Wakeup_event(daemon)
+ *                                      Do file IO
+ *                                      Wait_event(daemon)
+ */
+struct debug_daemon_state {
+        unsigned long overlapped;
+        unsigned long stopped;
+        atomic_t paused;
+        unsigned long   lctl_event;     /* event for lctl */
+        wait_queue_head_t lctl;
+        unsigned long   daemon_event;   /* event for daemon */
+        wait_queue_head_t daemon;
+};
+static struct debug_daemon_state debug_daemon_state;
+static DECLARE_MUTEX(debug_daemon_semaphore);
+
+static loff_t daemon_file_size_limit;
+char debug_daemon_file_path[1024] = "";
+
+spinlock_t portals_debug_lock = SPIN_LOCK_UNLOCKED;
+char debug_file_path[1024] = "/tmp/lustre-log";
+char debug_file_name[1024];
+int handled_panic; /* to avoid recursive calls to notifiers */
+char portals_upcall[1024] = "/usr/lib/lustre/portals_upcall";
+
+
+int portals_do_debug_dumplog(void *arg)
+{
+        struct file *file;
+        void *journal_info;
+        int rc;
+        mm_segment_t oldfs;
+        unsigned long debug_off;
+
+        kportal_daemonize("");
+
+        reparent_to_init();
+        journal_info = current->journal_info;
+        current->journal_info = NULL;
+        sprintf(debug_file_name, "%s.%ld", debug_file_path, CURRENT_TIME);
+        file = filp_open(debug_file_name, O_CREAT|O_TRUNC|O_RDWR, 0644);
+
+        if (!file || IS_ERR(file)) {
+                CERROR("cannot open %s for dumping: %ld\n", debug_file_name,
+                       PTR_ERR(file));
+                GOTO(out, PTR_ERR(file));
+        } else {
+                printk(KERN_ALERT "dumping log to %s ... writing ...\n",
+                       debug_file_name);
+        }
+
+        debug_off = atomic_read(&debug_off_a);
+        oldfs = get_fs();
+        set_fs(get_ds());
+        if (debug_wrapped) {
+                rc = file->f_op->write(file, debug_buf + debug_off + 1,
+                                       debug_size-debug_off-1, &file->f_pos);
+                rc += file->f_op->write(file, debug_buf, debug_off + 1,
+                                        &file->f_pos);
+        } else {
+                rc = file->f_op->write(file, debug_buf, debug_off,&file->f_pos);
+        }
+        printk("wrote %d bytes\n", rc);
+        set_fs(oldfs);
+
+        rc = file->f_op->fsync(file, file->f_dentry, 1);
+        if (rc)
+                CERROR("sync returns %d\n", rc);
+        filp_close(file, 0);
+out:
+        current->journal_info = journal_info;
+        wake_up(&debug_ctlwq);
+        return 0;
+}
+
+int portals_debug_daemon(void *arg)
+{
+        struct file *file;
+        void *journal_info;
+        mm_segment_t oldfs;
+        unsigned long force_flush = 0;
+        unsigned long size, off, flags;
+        int rc;
+
+        kportal_daemonize("ldebug_daemon");
+        reparent_to_init();
+        journal_info = current->journal_info;
+        current->journal_info = NULL;
+
+        file = filp_open(debug_daemon_file_path,
+                         O_CREAT|O_TRUNC|O_RDWR|O_LARGEFILE, 0644);
+
+        if (!file || IS_ERR(file)) {
+                CERROR("cannot open %s for logging", debug_daemon_file_path);
+                GOTO(out1, PTR_ERR(file));
+        } else {
+                printk(KERN_ALERT "daemon dumping log to %s ... writing ...\n",
+                       debug_daemon_file_path);
+        }
+
+        debug_daemon_state.overlapped = 0;
+        debug_daemon_state.stopped = 0;
+
+        spin_lock_irqsave(&portals_debug_lock, flags);
+        off = atomic_read(&debug_off_a) + 1;
+        if (debug_wrapped)
+                off = (off >= debug_size)? 0 : off;
+        else
+                off = 0;
+        atomic_set(&debug_daemon_next_write, off);
+        atomic_set(&debug_daemon_state.paused, 0);
+        spin_unlock_irqrestore(&portals_debug_lock, flags);
+
+        oldfs = get_fs();
+        set_fs(KERNEL_DS);
+        while (1) {
+                unsigned long ending;
+                unsigned long start, tail;
+                long delta;
+
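+                /* Flush the window between the daemon's last write position
+                 * ('start') and the current producer offset ('ending').  When
+                 * the producer has wrapped (delta < 0), write out the tail of
+                 * the buffer first, then continue from offset 0.  Nothing is
+                 * written until DAEMON_SND_SIZE bytes are pending unless a
+                 * flush was forced or the buffer wrapped. */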
+                debug_daemon_state.daemon_event = 0;
+
+                ending = atomic_read(&debug_off_a);
+                start = atomic_read(&debug_daemon_next_write);
+
+                /* lctl cleared lctl_event => it wants us to flush and signal back */
+                force_flush = !debug_daemon_state.lctl_event;
+
+                delta = ending - start;
+                tail = debug_size - start;
+                size = (delta >= 0) ? delta : tail;
+                while (size && (force_flush || (delta < 0) ||
+                                (size >= DAEMON_SND_SIZE))) {
+                        if (daemon_file_size_limit) {
+                               int ssize = daemon_file_size_limit - file->f_pos;
+                               if (size > ssize)
+                                        size = ssize;
+                        }
+
+                        rc = file->f_op->write(file, debug_buf+start,
+                                               size, &file->f_pos);
+                        if (rc < 0) {
+                                printk(KERN_ALERT
+                                           "Debug_daemon write error %d\n", rc);
+                                goto out;
+                        }
+                        start += rc;
+                        delta = ending - start;
+                        tail = debug_size - start;
+                        if (tail == 0)
+                                start = 0;
+                        if (delta >= 0)
+                                size = delta;
+                        else
+                                size = (tail == 0) ? ending : tail;
+                        if (daemon_file_size_limit == file->f_pos) {
+                                // file wrapped around
+                                file->f_pos = 0;
+                        }
+                }
+                atomic_set(&debug_daemon_next_write, start);
+                if (force_flush) {
+                        rc = file->f_op->fsync(file, file->f_dentry, 1);
+                        if (rc < 0) {
+                                printk(KERN_ALERT
+                                       "Debug_daemon sync error %d\n", rc);
+                                goto out;
+                        }
+                        if (debug_daemon_state.stopped)
+                               break;           
+                        debug_daemon_state.lctl_event = 1;
+                        wake_up(&debug_daemon_state.lctl);
+                }
+                wait_event(debug_daemon_state.daemon, 
+                           debug_daemon_state.daemon_event);
+        }
+out:
+        atomic_set(&debug_daemon_state.paused, 1);
+        debug_daemon_state.stopped = 1;
+        set_fs(oldfs);
+        filp_close(file, 0);
+        current->journal_info = journal_info;
+out1:
+        debug_daemon_state.lctl_event = 1;
+        wake_up(&debug_daemon_state.lctl);
+        return 0;
+}
+
+void portals_debug_print(void)
+{
+        unsigned long dumplen = 64 * 1024;
+        char *start1, *start2;
+        char *end1, *end2;
+        unsigned long debug_off = atomic_read(&debug_off_a);
+
+        start1 = debug_buf + debug_off - dumplen;
+        if (start1 < debug_buf) {
+                start1 += debug_size;
+                end1 = debug_buf + debug_size - 1;
+                start2 = debug_buf;
+                end2 = debug_buf + debug_off;
+        } else {
+                end1 = debug_buf + debug_off;
+                start2 = debug_buf + debug_off;
+                end2 = debug_buf + debug_off;
+        }
+
+        while (start1 < end1) {
+                /* %.*s: print at most 'count' bytes; the chunk is not
+                 * necessarily NUL-terminated at the 1024-byte boundary */
+                int count = MIN(1024, end1 - start1);
+                printk("%.*s", count, start1);
+                start1 += 1024;
+        }
+        while (start2 < end2) {
+                int count = MIN(1024, end2 - start2);
+                printk("%.*s", count, start2);
+                start2 += 1024;
+        }
+}
+
+void portals_debug_dumplog(void)
+{
+        int rc;
+        ENTRY;
+
+        init_waitqueue_head(&debug_ctlwq);
+
+        rc = kernel_thread(portals_do_debug_dumplog,
+                           NULL, CLONE_VM | CLONE_FS | CLONE_FILES);
+        if (rc < 0) {
+                printk(KERN_ERR "cannot start dump thread\n");
+                return;
+        }
+        sleep_on(&debug_ctlwq);
+}
+
+int portals_debug_daemon_start(char *file, unsigned int size)
+{
+        int rc;
+
+        if (!debug_daemon_state.stopped)
+                return -EALREADY;
+
+        if (file != NULL)
+                strncpy(debug_daemon_file_path, file, 1024);
+
+        init_waitqueue_head(&debug_daemon_state.lctl);
+        init_waitqueue_head(&debug_daemon_state.daemon);
+
+        daemon_file_size_limit = size << 20;
+
+        debug_daemon_state.lctl_event = 0;
+        rc = kernel_thread(portals_debug_daemon, NULL, 0);
+        if (rc < 0) {
+                printk(KERN_ERR "cannot start debug daemon thread\n");
+                debug_daemon_file_path[0] = '\0';
+                return rc;
+        }
+        wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event);
+        return 0;
+}
+
+int portals_debug_daemon_pause(void)
+{
+        if (atomic_read(&debug_daemon_state.paused))
+                return -EALREADY;
+
+        atomic_set(&debug_daemon_state.paused, 1);
+        debug_daemon_state.lctl_event = 0;
+        debug_daemon_state.daemon_event = 1;
+        wake_up(&debug_daemon_state.daemon);
+        wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event);
+        return 0;
+}
+
+int portals_debug_daemon_continue(void)
+{
+        if (!atomic_read(&debug_daemon_state.paused))
+                return -EINVAL;
+        if (debug_daemon_state.stopped)
+                return -EINVAL;
+
+        debug_daemon_state.overlapped = 0;
+        atomic_set(&debug_daemon_next_write, atomic_read(&debug_off_a));
+        atomic_set(&debug_daemon_state.paused, 0);
+        return 0;
+}
+
+int portals_debug_daemon_stop(void)
+{
+        if (debug_daemon_state.stopped)
+                return -EALREADY;
+
+        if (!atomic_read(&debug_daemon_state.paused))
+                portals_debug_daemon_pause();
+
+        debug_daemon_state.lctl_event = 0;
+        debug_daemon_state.stopped = 1;
+
+        debug_daemon_state.daemon_event = 1;
+        wake_up(&debug_daemon_state.daemon);
+        wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event);
+
+        debug_daemon_file_path[0] = '\0';
+        return 0;
+}
+
+int portals_debug_set_daemon(unsigned int cmd, unsigned int length,
+                             char *filename, unsigned int size)
+{
+        int rc = -EINVAL;
+
+        down(&debug_daemon_semaphore);
+        switch (cmd) {
+                case DEBUG_DAEMON_START:
+                        if (length && (filename[length -1] != '\0')) {
+                                CERROR("Invalid filename for debug_daemon\n");
+                                rc = -EINVAL;
+                                break;
+                        }
+                        rc = portals_debug_daemon_start(filename, size);
+                        break;
+                case DEBUG_DAEMON_STOP:
+                        rc = portals_debug_daemon_stop();
+                        break;
+                case DEBUG_DAEMON_PAUSE:
+                        rc = portals_debug_daemon_pause();
+                        break;
+                case DEBUG_DAEMON_CONTINUE:
+                        rc = portals_debug_daemon_continue();
+                        break;
+                default:
+                        CERROR("unknown set_daemon cmd\n");
+        }
+        up(&debug_daemon_semaphore);
+        return rc;
+}
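
For orientation, the four DEBUG_DAEMON_* commands above map directly onto the start/pause/continue/stop helpers. A fragment-style sketch of the expected call order from kernel context; the wrapper name debug_daemon_demo and the log path are hypothetical and not part of this patch, and real callers go through portals_debug_set_daemon(), which serializes on debug_daemon_semaphore:

static int debug_daemon_demo(void)
{
        int rc;

        /* size is in MB: portals_debug_daemon_start() shifts it by 20 */
        rc = portals_debug_daemon_start("/tmp/portals-debug.log", 10);
        if (rc != 0)
                return rc;
        portals_debug_daemon_pause();    /* force a flush, wait for the daemon's ack */
        portals_debug_daemon_continue(); /* resume writing from the current offset */
        return portals_debug_daemon_stop();
}

User space reaches the same four operations through IOC_PORTAL_SET_DAEMON in kportal_ioctl() further down in this patch.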
+
+static int panic_dumplog(struct notifier_block *self, unsigned long unused1,
+                         void *unused2)
+{
+        if (handled_panic)
+                return 0;
+        else
+                handled_panic = 1;
+
+        if (in_interrupt()) {
+                portals_debug_print();
+                return 0;
+        }
+
+        while (current->lock_depth >= 0)
+                unlock_kernel();
+        portals_debug_dumplog();
+        return 0;
+}
+
+static struct notifier_block lustre_panic_notifier = {
+        notifier_call :     panic_dumplog,
+        next :              NULL,
+        priority :          10000
+};
+
+int portals_debug_init(unsigned long bufsize)
+{
+        unsigned long debug_off = atomic_read(&debug_off_a);
+        if (debug_buf != NULL)
+                return -EALREADY;
+
+        atomic_set(&debug_daemon_state.paused, 1);
+        debug_daemon_state.stopped = 1;
+
+        debug_buf = vmalloc(bufsize + DEBUG_OVERFLOW);
+        if (debug_buf == NULL)
+                return -ENOMEM;
+        memset(debug_buf, 0, debug_size);
+        debug_wrapped = 0;
+
+        printk(KERN_INFO "Portals: allocated %lu byte debug buffer at %p.\n",
+               bufsize, debug_buf);
+        atomic_set(&debug_off_a, debug_off);
+        notifier_chain_register(&panic_notifier_list, &lustre_panic_notifier);
+        debug_size = bufsize;
+
+        return 0;
+}
+
+int portals_debug_cleanup(void)
+{
+        notifier_chain_unregister(&panic_notifier_list, &lustre_panic_notifier);
+        if (debug_buf == NULL)
+                return -EINVAL;
+
+        down(&debug_daemon_semaphore);
+        portals_debug_daemon_stop();
+
+        vfree(debug_buf);
+        atomic_set(&debug_off_a, 0);
+        up(&debug_daemon_semaphore);
+
+        return 0;
+}
+
+int portals_debug_clear_buffer(void)
+{
+        unsigned long flags;
+        unsigned long state;
+
+        if (debug_buf == NULL)
+                return -EINVAL;
+
+        down(&debug_daemon_semaphore);
+        state = atomic_read(&debug_daemon_state.paused);
+        if (!state)
+                portals_debug_daemon_pause();
+        spin_lock_irqsave(&portals_debug_lock, flags);
+        atomic_set(&debug_off_a, 0);
+        debug_wrapped = 0;
+        atomic_set(&debug_daemon_next_write, 0);
+        debug_daemon_state.overlapped = 0;
+        spin_unlock_irqrestore(&portals_debug_lock, flags);
+
+        if (!state)
+                atomic_set(&debug_daemon_state.paused, 0);
+        up(&debug_daemon_semaphore);
+
+        return 0;
+}
+
+/* Debug markers, although printed by S_PORTALS,
+ * should not be marked as such.
+ */
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_UNDEFINED
+int portals_debug_mark_buffer(char *text)
+{
+        if (debug_buf == NULL)
+                return -EINVAL;
+
+        CDEBUG(0, "*******************************************************************************\n");
+        CDEBUG(0, "DEBUG MARKER: %s\n", text);
+        CDEBUG(0, "*******************************************************************************\n");
+
+        return 0;
+}
+#undef DEBUG_SUBSYSTEM
+#define DEBUG_SUBSYSTEM S_PORTALS
+
+__s32 portals_debug_copy_to_user(char *buf, unsigned long len)
+{
+        int rc;
+        unsigned long debug_off;
+        unsigned long flags;
+
+        if (len < debug_size)
+                return -ENOSPC;
+
+        debug_off = atomic_read(&debug_off_a);
+        spin_lock_irqsave(&portals_debug_lock, flags);
+        if (debug_wrapped) {
+                /* All of this juggling with the 1s is to keep the trailing nul
+                 * (which falls at debug_buf + debug_off) at the end of what we
+                 * copy into user space */
+                copy_to_user(buf, debug_buf + debug_off + 1,
+                             debug_size - debug_off - 1);
+                copy_to_user(buf + debug_size - debug_off - 1,
+                             debug_buf, debug_off + 1);
+                rc = debug_size;
+        } else {
+                copy_to_user(buf, debug_buf, debug_off);
+                rc = debug_off;
+        }
+        spin_unlock_irqrestore(&portals_debug_lock, flags);
+
+        return rc;
+}
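
The wrapped-buffer copy above linearizes the ring so the oldest bytes come first and the trailing NUL (which sits at debug_buf + debug_off) lands at the very end. A standalone userspace sketch of the same two-copy arithmetic, not from this patch, with copy_to_user replaced by memcpy:

#include <stdio.h>
#include <string.h>

static size_t linearize(const char *ring, size_t size, size_t off,
                        int wrapped, char *out)
{
        if (!wrapped) {
                memcpy(out, ring, off);
                return off;
        }
        /* tail first: everything after the trailing NUL at ring[off] */
        memcpy(out, ring + off + 1, size - off - 1);
        /* then the head, up to and including that NUL */
        memcpy(out + size - off - 1, ring, off + 1);
        return size;
}

int main(void)
{
        /* newest data ends at off=3; oldest data starts at off+1 */
        char ring[10] = { 'G', 'H', 'I', '\0', 'A', 'B', 'C', 'D', 'E', 'F' };
        char out[16] = { 0 };
        size_t n = linearize(ring, sizeof(ring), 3, 1, out);

        printf("%zu bytes: %s\n", n, out);   /* 10 bytes: ABCDEFGHI */
        return 0;
}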
+
+/* FIXME: I'm not very smart; someone smarter should make this better. */
+void
+portals_debug_msg (int subsys, int mask, char *file, char *fn, int line,
+                   unsigned long stack, const char *format, ...)
+{
+        va_list       ap;
+        unsigned long flags;
+        int           max_nob;
+        int           prefix_nob;
+        int           msg_nob;
+        struct timeval tv;
+        unsigned long base_offset;
+        unsigned long debug_off;
+
+        if (debug_buf == NULL) {
+                printk("portals_debug_msg: debug_buf is NULL!\n");
+                return;
+        }
+
+        spin_lock_irqsave(&portals_debug_lock, flags);
+        debug_off = atomic_read(&debug_off_a);
+        if (!atomic_read(&debug_daemon_state.paused)) {
+                unsigned long available;
+                long delta;
+                long v = atomic_read(&debug_daemon_next_write);
+
+                delta = debug_off - v;
+                available = (delta>=0) ? debug_size-delta : -delta;
+                // Check if we still have enough debug buffer for CDEBUG
+                if (available < DAEMON_SND_SIZE) {
+                        /* Drop CDEBUG packets until enough debug_buffer is
+                         * available */
+                        if (debug_daemon_state.overlapped)
+                                 goto out;
+                        /* If this is the first time, leave a marker in the
+                         * output */
+                        debug_daemon_state.overlapped = 1;
+                        /* ap is initialised by va_start() below */
+                        format = "DEBUG MARKER: Debug buffer overlapped\n";
+                } else  /* More space just became available */
+                        debug_daemon_state.overlapped = 0;
+        }
+
+        max_nob = debug_size - debug_off + DEBUG_OVERFLOW;
+        if (max_nob <= 0) {
+                spin_unlock_irqrestore(&portals_debug_lock, flags);
+                printk("logic error in portals_debug_msg: <0 bytes to write\n");
+                return;
+        }
+
+        /* NB since we pass a non-zero sized buffer (at least) on the first
+         * print, we can be assured that by the end of all the snprinting,
+         * we _do_ have a terminated buffer, even if our message got truncated.
+         */
+
+        do_gettimeofday(&tv);
+
+        prefix_nob = snprintf(debug_buf + debug_off, max_nob,
+                              "%02x:%06x:%d:%lu.%06lu ",
+                              subsys >> 24, mask, smp_processor_id(),
+                              tv.tv_sec, tv.tv_usec);
+        max_nob -= prefix_nob;
+
+#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20))
+        msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob,
+                           "(%s:%d:%s() %d | %d+%lu): ",
+                           file, line, fn, current->pid,
+                           current->thread.extern_pid, stack);
+#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob,
+                           "(%s:%d:%s() %d | %d+%lu): ",
+                           file, line, fn, current->pid,
+                           current->thread.mode.tt.extern_pid, stack);
+#else
+        msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob,
+                           "(%s:%d:%s() %d+%lu): ",
+                           file, line, fn, current->pid, stack);
+#endif
+        max_nob -= msg_nob;
+
+        va_start(ap, format);
+        msg_nob += vsnprintf(debug_buf + debug_off + prefix_nob + msg_nob,
+                            max_nob, format, ap);
+        max_nob -= msg_nob;
+        va_end(ap);
+
+        /* Print to console, while msg is contiguous in debug_buf */
+        /* NB safely terminated see above */
+        if ((mask & D_EMERG) != 0)
+                printk(KERN_EMERG "%s", debug_buf + debug_off + prefix_nob);
+        else if ((mask & D_ERROR) != 0)
+                printk(KERN_ERR   "%s", debug_buf + debug_off + prefix_nob);
+        else if (portal_printk)
+                printk("<%d>%s", portal_printk, debug_buf+debug_off+prefix_nob);
+        base_offset = debug_off & 0xFFFF;
+
+        debug_off += prefix_nob + msg_nob;
+        if (debug_off > debug_size) {
+                memcpy(debug_buf, debug_buf + debug_size,
+                       debug_off - debug_size + 1);
+                debug_off -= debug_size;
+                debug_wrapped = 1;
+        }
+
+        atomic_set(&debug_off_a, debug_off);
+        if (!atomic_read(&debug_daemon_state.paused) &&
+            ((base_offset+prefix_nob+msg_nob) >= DAEMON_SND_SIZE)) {
+                debug_daemon_state.daemon_event = 1;
+                wake_up(&debug_daemon_state.daemon);
+        }
+out:
+        spin_unlock_irqrestore(&portals_debug_lock, flags);
+}
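
portals_debug_msg() always formats a message contiguously, relying on the DEBUG_OVERFLOW slack that portals_debug_init() allocates past debug_size, then copies any spill back to the front of the buffer. A standalone sketch of that spill-and-wrap step; the sizes and strings are made up and the helper is not part of this patch:

#include <stdio.h>
#include <string.h>

#define SIZE     16   /* stands in for debug_size */
#define OVERFLOW  8   /* stands in for DEBUG_OVERFLOW */

static char buf[SIZE + OVERFLOW];
static size_t off;
static int wrapped;

static void append(const char *msg)
{
        size_t len = strlen(msg);

        memcpy(buf + off, msg, len + 1);        /* contiguous write, incl. NUL */
        off += len;
        if (off > SIZE) {
                /* copy the spilled bytes (and the NUL) back to the front */
                memcpy(buf, buf + SIZE, off - SIZE + 1);
                off -= SIZE;
                wrapped = 1;
        }
}

int main(void)
{
        append("0123456789");
        append("abcdefghij");   /* runs past SIZE, so it wraps */
        printf("off=%zu wrapped=%d head=\"%s\"\n", off, wrapped, buf);
        return 0;
}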
+
+void portals_debug_set_level(unsigned int debug_level)
+{
+        printk("Setting portals debug level to %08x\n", debug_level);
+        portal_debug = debug_level;
+}
+
+void portals_run_lbug_upcall(char * file, char *fn, int line)
+{
+        char *argv[6];
+        char *envp[3];
+        char buf[32];
+        int rc;
+
+        ENTRY;
+        snprintf (buf, sizeof buf, "%d", line);
+
+        argv[0] = portals_upcall;
+        argv[1] = "LBUG";
+        argv[2] = file;
+        argv[3] = fn;
+        argv[4] = buf;
+        argv[5] = NULL;
+
+        envp[0] = "HOME=/";
+        envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+        envp[2] = NULL;
+
+        rc = call_usermodehelper(argv[0], argv, envp);
+        if (rc < 0) {
+                CERROR("Error invoking lbug upcall %s %s %s %s %s: %d; check "
+                       "/proc/sys/portals/upcall\n",                
+                       argv[0], argv[1], argv[2], argv[3], argv[4], rc);
+                
+        } else {
+                CERROR("Invoked upcall %s %s %s %s %s\n",
+                       argv[0], argv[1], argv[2], argv[3], argv[4]);
+        }
+}
+
+
+EXPORT_SYMBOL(portals_debug_dumplog);
+EXPORT_SYMBOL(portals_debug_msg);
+EXPORT_SYMBOL(portals_debug_set_level);
+EXPORT_SYMBOL(portals_run_lbug_upcall);
diff --git a/lustre/portals/libcfs/module.c b/lustre/portals/libcfs/module.c
new file mode 100644 (file)
index 0000000..5e3fcb5
--- /dev/null
@@ -0,0 +1,574 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+#include <linux/miscdevice.h>
+
+#include <portals/lib-p30.h>
+#include <portals/p30.h>
+#include <linux/kp30.h>
+#include <linux/portals_compat25.h>
+
+#define PORTAL_MINOR 240
+
+extern void (kping_client)(struct portal_ioctl_data *);
+
+struct nal_cmd_handler {
+        nal_cmd_handler_t nch_handler;
+        void * nch_private;
+};
+
+static struct nal_cmd_handler nal_cmd[NAL_MAX_NR + 1];
+struct semaphore nal_cmd_sem;
+
+#ifdef PORTAL_DEBUG
+void
+kportal_assertion_failed (char *expr, char *file, char *func, int line)
+{
+        portals_debug_msg(0, D_EMERG, file, func, line, CDEBUG_STACK(),
+                          "ASSERTION(%s) failed\n", expr);
+        LBUG_WITH_LOC(file, func, line);
+}
+#endif
+
+void
+kportal_daemonize (char *str) 
+{
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,63))
+        daemonize(str);
+#else
+        daemonize();
+        snprintf (current->comm, sizeof (current->comm), "%s", str);
+#endif
+}
+
+void
+kportal_blockallsigs ()
+{
+        unsigned long  flags;
+
+        SIGNAL_MASK_LOCK(current, flags);
+        sigfillset(&current->blocked);
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
+}
+
+/* called when opening /dev/device */
+static int kportal_psdev_open(struct inode * inode, struct file * file)
+{
+        ENTRY;
+
+        if (!inode)
+                RETURN(-EINVAL);
+        PORTAL_MODULE_USE;
+        RETURN(0);
+}
+
+/* called when closing /dev/device */
+static int kportal_psdev_release(struct inode * inode, struct file * file)
+{
+        ENTRY;
+
+        if (!inode)
+                RETURN(-EINVAL);
+
+        PORTAL_MODULE_UNUSE;
+        RETURN(0);
+}
+
+static inline void freedata(void *data, int len)
+{
+        PORTAL_FREE(data, len);
+}
+
+static int
+kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid,
+                  ptl_nid_t hi_nid)
+{
+        int rc;
+        kpr_control_interface_t *ci;
+
+        ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET (kpr_control_interface);
+        if (ci == NULL)
+                return (-ENODEV);
+
+        rc = ci->kprci_add_route (gateway_nalid, gateway_nid, lo_nid, hi_nid);
+
+        PORTAL_SYMBOL_PUT(kpr_control_interface);
+        return (rc);
+}
+
+static int
+kportal_del_route(ptl_nid_t target)
+{
+        int rc;
+        kpr_control_interface_t *ci;
+
+        ci = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface);
+        if (ci == NULL)
+                return (-ENODEV);
+
+        rc = ci->kprci_del_route (target);
+
+        PORTAL_SYMBOL_PUT(kpr_control_interface);
+        return (rc);
+}
+
+static int
+kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp,
+                  ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp)
+{
+        int       gateway_nalid;
+        ptl_nid_t gateway_nid;
+        ptl_nid_t lo_nid;
+        ptl_nid_t hi_nid;
+        int       rc;
+        kpr_control_interface_t *ci;
+
+        ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET(kpr_control_interface);
+        if (ci == NULL)
+                return (-ENODEV);
+
+        rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, &lo_nid,
+                                 &hi_nid);
+
+        if (rc == 0) {
+                CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64"\n",
+                       index, gateway_nalid, gateway_nid, lo_nid, hi_nid);
+
+                *gateway_nalidp = (__u32)gateway_nalid;
+                *gateway_nidp   = gateway_nid;
+                *lo_nidp        = lo_nid;
+                *hi_nidp        = hi_nid;
+        }
+
+        PORTAL_SYMBOL_PUT (kpr_control_interface);
+        return (rc);
+}
+
+static int
+kportal_nal_cmd(int nal, struct portal_ioctl_data *data)
+{
+        int rc = -EINVAL;
+
+        ENTRY;
+
+        down(&nal_cmd_sem);
+        if (nal > 0 && nal <= NAL_MAX_NR && nal_cmd[nal].nch_handler) {
+                CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, data->ioc_nal_cmd);
+                rc = nal_cmd[nal].nch_handler(data, nal_cmd[nal].nch_private);
+        }
+        up(&nal_cmd_sem);
+        RETURN(rc);
+}
+
+ptl_handle_ni_t *
+kportal_get_ni (int nal)
+{
+
+        switch (nal)
+        {
+        case QSWNAL:
+                return (PORTAL_SYMBOL_GET(kqswnal_ni));
+        case SOCKNAL:
+                return (PORTAL_SYMBOL_GET(ksocknal_ni));
+        case TOENAL:
+                return  (PORTAL_SYMBOL_GET(ktoenal_ni));
+        case GMNAL:
+                return  (PORTAL_SYMBOL_GET(kgmnal_ni));
+        case TCPNAL:
+                /* userspace NAL */
+                return (NULL);
+        case SCIMACNAL:
+                return  (PORTAL_SYMBOL_GET(kscimacnal_ni));
+        default:
+                /* A warning to a naive caller */
+                CERROR ("unknown nal: %d\n", nal);
+                return (NULL);
+        }
+}
+
+void
+kportal_put_ni (int nal)
+{
+
+        switch (nal)
+        {
+        case QSWNAL:
+                PORTAL_SYMBOL_PUT(kqswnal_ni);
+                break;
+        case SOCKNAL:
+                PORTAL_SYMBOL_PUT(ksocknal_ni);
+                break;
+        case TOENAL:
+                PORTAL_SYMBOL_PUT(ktoenal_ni);
+                break;
+        case GMNAL:
+                PORTAL_SYMBOL_PUT(kgmnal_ni);
+                break;
+        case TCPNAL:
+                /* A lesson to a malicious caller */
+                LBUG ();
+        case SCIMACNAL:
+                PORTAL_SYMBOL_PUT(kscimacnal_ni);
+                break;
+        default:
+                CERROR ("unknown nal: %d\n", nal);
+        }
+}
+
+int
+kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private)
+{
+        int rc = 0;
+
+        CDEBUG(D_IOCTL, "Register NAL %d, handler: %p\n", nal, handler);
+
+        if (nal > 0  && nal <= NAL_MAX_NR) {
+                down(&nal_cmd_sem);
+                if (nal_cmd[nal].nch_handler != NULL)
+                        rc = -EBUSY;
+                else {
+                        nal_cmd[nal].nch_handler = handler;
+                        nal_cmd[nal].nch_private = private;
+                }
+                up(&nal_cmd_sem);
+        }
+        return rc;
+}
+
+int
+kportal_nal_unregister(int nal)
+{
+        int rc = 0;
+
+        CDEBUG(D_IOCTL, "Unregister NAL %d\n", nal);
+
+        if (nal > 0  && nal <= NAL_MAX_NR) {
+                down(&nal_cmd_sem);
+                nal_cmd[nal].nch_handler = NULL;
+                nal_cmd[nal].nch_private = NULL;
+                up(&nal_cmd_sem);
+        }
+        return rc;
+}
+
+
+static int kportal_ioctl(struct inode *inode, struct file *file,
+                         unsigned int cmd, unsigned long arg)
+{
+        int err = 0;
+        char buf[1024];
+        struct portal_ioctl_data *data;
+
+        ENTRY;
+
+        if ( _IOC_TYPE(cmd) != IOC_PORTAL_TYPE ||
+             _IOC_NR(cmd) < IOC_PORTAL_MIN_NR  ||
+             _IOC_NR(cmd) > IOC_PORTAL_MAX_NR ) {
+                CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n",
+                                _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
+                RETURN(-EINVAL);
+        }
+
+        if (portal_ioctl_getdata(buf, buf + 800, (void *)arg)) {
+                CERROR("PORTALS ioctl: data error\n");
+                RETURN(-EINVAL);
+        }
+
+        data = (struct portal_ioctl_data *)buf;
+
+        switch (cmd) {
+        case IOC_PORTAL_SET_DAEMON: 
+                RETURN (portals_debug_set_daemon ( 
+                                        (unsigned int) data->ioc_count,
+                                        (unsigned int) data->ioc_inllen1,
+                                        (char *) data->ioc_inlbuf1,
+                                        (unsigned int) data->ioc_misc)); 
+        case IOC_PORTAL_GET_DEBUG: {
+                __s32 size = portals_debug_copy_to_user(data->ioc_pbuf1,
+                                                        data->ioc_plen1);
+
+                if (size < 0)
+                        RETURN(size);
+
+                data->ioc_size = size;
+                if (copy_to_user((char *)arg, data, sizeof(*data)))
+                        err = -EFAULT;
+                RETURN(err);
+        }
+        case IOC_PORTAL_CLEAR_DEBUG:
+                portals_debug_clear_buffer();
+                RETURN(0);
+        case IOC_PORTAL_PANIC:
+                if (!capable (CAP_SYS_BOOT))
+                        RETURN (-EPERM);
+                panic("debugctl-invoked panic");
+                RETURN(0);
+        case IOC_PORTAL_MARK_DEBUG:
+                if (data->ioc_inlbuf1 == NULL ||
+                    data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0')
+                        RETURN(-EINVAL);
+                portals_debug_mark_buffer(data->ioc_inlbuf1);
+                RETURN(0);
+        case IOC_PORTAL_PING: {
+                void (*ping)(struct portal_ioctl_data *);
+
+                CDEBUG(D_IOCTL, "doing %d pings to nid "LPU64"\n",
+                       data->ioc_count, data->ioc_nid);
+                ping = PORTAL_SYMBOL_GET(kping_client);
+                if (!ping)
+                        CERROR("PORTAL_SYMBOL_GET failed\n");
+                else {
+                        ping(data);
+                        PORTAL_SYMBOL_PUT(kping_client);
+                }
+                RETURN(0);
+        }
+
+        case IOC_PORTAL_ADD_ROUTE:
+                CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n",
+                       data->ioc_nal, data->ioc_nid, data->ioc_nid2,
+                       data->ioc_nid3);
+                err = kportal_add_route(data->ioc_nal, data->ioc_nid,
+                                        MIN (data->ioc_nid2, data->ioc_nid3),
+                                        MAX (data->ioc_nid2, data->ioc_nid3));
+                break;
+
+        case IOC_PORTAL_DEL_ROUTE:
+                CDEBUG (D_IOCTL, "Removing route to "LPU64"\n", data->ioc_nid);
+                err = kportal_del_route (data->ioc_nid);
+                break;
+
+        case IOC_PORTAL_GET_ROUTE:
+                CDEBUG (D_IOCTL, "Getting route [%d]\n", data->ioc_count);
+                err = kportal_get_route(data->ioc_count, &data->ioc_nal,
+                                        &data->ioc_nid, &data->ioc_nid2,
+                                        &data->ioc_nid3);
+                if (err == 0)
+                        if (copy_to_user((char *)arg, data, sizeof (*data)))
+                                err = -EFAULT;
+                break;
+
+        case IOC_PORTAL_GET_NID: {
+                const ptl_handle_ni_t *nip;
+                ptl_process_id_t       pid;
+
+                CDEBUG (D_IOCTL, "Getting nid [%d]\n", data->ioc_nal);
+
+                nip = kportal_get_ni (data->ioc_nal);
+                if (nip == NULL)
+                        RETURN (-EINVAL);
+
+                err = PtlGetId (*nip, &pid);
+                LASSERT (err == PTL_OK);
+                kportal_put_ni (data->ioc_nal);
+
+                data->ioc_nid = pid.nid;
+                if (copy_to_user ((char *)arg, data, sizeof (*data)))
+                        err = -EFAULT;
+                break;
+        }
+
+        case IOC_PORTAL_NAL_CMD:
+                CDEBUG (D_IOCTL, "nal command nal %d cmd %d\n", data->ioc_nal,
+                        data->ioc_nal_cmd);
+                err = kportal_nal_cmd(data->ioc_nal, data);
+                if (err == 0)
+                        if (copy_to_user((char *)arg, data, sizeof (*data)))
+                                err = -EFAULT;
+                break;
+
+        case IOC_PORTAL_FAIL_NID: {
+                const ptl_handle_ni_t *nip;
+
+                CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n",
+                        data->ioc_nal, data->ioc_nid, data->ioc_count);
+
+                nip = kportal_get_ni (data->ioc_nal);
+                if (nip == NULL)
+                        RETURN (-EINVAL);
+
+                err = PtlFailNid (*nip, data->ioc_nid, data->ioc_count);
+                break;
+        }
+
+        default:
+                err = -EINVAL;
+                break;
+        }
+
+        RETURN(err);
+}
+
+
+static struct file_operations portalsdev_fops = {
+        ioctl:   kportal_ioctl,
+        open:    kportal_psdev_open,
+        release: kportal_psdev_release
+};
+
+
+static struct miscdevice portal_dev = {
+        PORTAL_MINOR,
+        "portals",
+        &portalsdev_fops
+};
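
The miscdevice above registers minor 240, which the %post scriptlet in portals.spec.in later in this patch exposes as /dev/portals (char 10,240). A minimal userspace sketch of opening it; the real IOC_PORTAL_* ioctls are packed via portal_ioctl_getdata-style buffers (used by ptlctl), which this sketch does not attempt:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/dev/portals", O_RDWR);

        if (fd < 0) {
                perror("open /dev/portals");
                return 1;
        }
        /* IOC_PORTAL_* ioctls would be issued here */
        close(fd);
        return 0;
}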
+
+extern int insert_proc(void);
+extern void remove_proc(void);
+MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
+MODULE_DESCRIPTION("Portals v3.1");
+MODULE_LICENSE("GPL");
+
+static int init_kportals_module(void)
+{
+        int rc;
+
+        rc = portals_debug_init(5 * 1024 * 1024);
+        if (rc < 0) {
+                printk(KERN_ERR "portals_debug_init: %d\n", rc);
+                return (rc);
+        }
+
+        sema_init(&nal_cmd_sem, 1);
+
+        rc = misc_register(&portal_dev);
+        if (rc) {
+                CERROR("misc_register: error %d\n", rc);
+                goto cleanup_debug;
+        }
+
+        rc = PtlInit();
+        if (rc) {
+                CERROR("PtlInit: error %d\n", rc);
+                goto cleanup_deregister;
+        }
+
+        rc = insert_proc();
+        if (rc) {
+                CERROR("insert_proc: error %d\n", rc);
+                goto cleanup_fini;
+        }
+
+        CDEBUG (D_OTHER, "portals setup OK\n");
+        return (0);
+
+ cleanup_fini:
+        PtlFini();
+ cleanup_deregister:
+        misc_deregister(&portal_dev);
+ cleanup_debug:
+        portals_debug_cleanup();
+        return rc;
+}
+
+static void exit_kportals_module(void)
+{
+        int rc;
+
+        remove_proc();
+        PtlFini();
+
+        CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n",
+               atomic_read(&portal_kmemory));
+
+
+        rc = misc_deregister(&portal_dev);
+        if (rc)
+                CERROR("misc_deregister error %d\n", rc);
+
+        if (atomic_read(&portal_kmemory) != 0)
+                CERROR("Portals memory leaked: %d bytes\n",
+                       atomic_read(&portal_kmemory));
+
+        rc = portals_debug_cleanup();
+        if (rc)
+                printk(KERN_ERR "portals_debug_cleanup: %d\n", rc);
+}
+
+EXPORT_SYMBOL(lib_dispatch);
+EXPORT_SYMBOL(PtlMEAttach);
+EXPORT_SYMBOL(PtlMEInsert);
+EXPORT_SYMBOL(PtlMEUnlink);
+EXPORT_SYMBOL(PtlEQAlloc);
+EXPORT_SYMBOL(PtlMDAttach);
+EXPORT_SYMBOL(PtlMDUnlink);
+EXPORT_SYMBOL(PtlNIInit);
+EXPORT_SYMBOL(PtlNIFini);
+EXPORT_SYMBOL(PtlNIDebug);
+EXPORT_SYMBOL(PtlInit);
+EXPORT_SYMBOL(PtlFini);
+EXPORT_SYMBOL(PtlPut);
+EXPORT_SYMBOL(PtlGet);
+EXPORT_SYMBOL(ptl_err_str);
+EXPORT_SYMBOL(portal_subsystem_debug);
+EXPORT_SYMBOL(portal_debug);
+EXPORT_SYMBOL(portal_stack);
+EXPORT_SYMBOL(portal_printk);
+EXPORT_SYMBOL(PtlEQWait);
+EXPORT_SYMBOL(PtlEQFree);
+EXPORT_SYMBOL(PtlEQGet);
+EXPORT_SYMBOL(PtlGetId);
+EXPORT_SYMBOL(PtlMDBind);
+EXPORT_SYMBOL(lib_iov_nob);
+EXPORT_SYMBOL(lib_copy_iov2buf);
+EXPORT_SYMBOL(lib_copy_buf2iov);
+EXPORT_SYMBOL(lib_kiov_nob);
+EXPORT_SYMBOL(lib_copy_kiov2buf);
+EXPORT_SYMBOL(lib_copy_buf2kiov);
+EXPORT_SYMBOL(lib_finalize);
+EXPORT_SYMBOL(lib_parse);
+EXPORT_SYMBOL(lib_init);
+EXPORT_SYMBOL(lib_fini);
+EXPORT_SYMBOL(portal_kmemory);
+EXPORT_SYMBOL(kportal_daemonize);
+EXPORT_SYMBOL(kportal_blockallsigs);
+EXPORT_SYMBOL(kportal_nal_register);
+EXPORT_SYMBOL(kportal_nal_unregister);
+EXPORT_SYMBOL(kportal_assertion_failed);
+EXPORT_SYMBOL(dispatch_name);
+EXPORT_SYMBOL(kportal_get_ni);
+EXPORT_SYMBOL(kportal_put_ni);
+
+module_init(init_kportals_module);
+module_exit (exit_kportals_module);
diff --git a/lustre/portals/libcfs/proc.c b/lustre/portals/libcfs/proc.c
new file mode 100644 (file)
index 0000000..2fa739a
--- /dev/null
@@ -0,0 +1,290 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Zach Brown <zab@zabbo.net>
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#include <linux/proc_fs.h>
+#include <linux/sysctl.h>
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+
+#include <linux/kp30.h>
+#include <asm/div64.h>
+
+static struct ctl_table_header *portals_table_header = NULL;
+extern char debug_file_path[1024];
+extern char debug_daemon_file_path[1024];
+extern char portals_upcall[1024];
+
+#define PSDEV_PORTALS  (0x100)
+#define PSDEV_DEBUG           1   /* control debugging */
+#define PSDEV_SUBSYSTEM_DEBUG 2   /* control debugging */
+#define PSDEV_PRINTK          3   /* force all errors to console */
+#define PSDEV_DEBUG_PATH      4   /* crashdump log location */
+#define PSDEV_DEBUG_DUMP_PATH 5   /* crashdump tracelog location */
+#define PSDEV_PORTALS_UPCALL  6   /* User mode upcall script  */
+
+#define PORTALS_PRIMARY_CTLCNT 6
+static struct ctl_table portals_table[PORTALS_PRIMARY_CTLCNT + 1] = {
+        {PSDEV_DEBUG, "debug", &portal_debug, sizeof(int), 0644, NULL,
+         &proc_dointvec},
+        {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &portal_subsystem_debug,
+         sizeof(int), 0644, NULL, &proc_dointvec},
+        {PSDEV_PRINTK, "printk", &portal_printk, sizeof(int), 0644, NULL,
+         &proc_dointvec},
+        {PSDEV_DEBUG_PATH, "debug_path", debug_file_path,
+         sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string},
+        {PSDEV_DEBUG_DUMP_PATH, "debug_daemon_path", debug_daemon_file_path,
+         sizeof(debug_daemon_file_path), 0644, NULL, &proc_dostring,
+         &sysctl_string},
+        {PSDEV_PORTALS_UPCALL, "upcall", portals_upcall,
+         sizeof(portals_upcall), 0644, NULL, &proc_dostring,
+         &sysctl_string},
+        {0}
+};
+
+static struct ctl_table top_table[2] = {
+        {PSDEV_PORTALS, "portals", NULL, 0, 0555, portals_table},
+        {0}
+};
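
Once top_table is registered, the entries above appear under /proc/sys/portals/ (debug, subsystem_debug, printk, debug_path, debug_daemon_path, upcall). A small userspace sketch of reading the debug mask; the path follows from the ctl_table names above, and proc_dointvec presents the value as a signed decimal integer:

#include <stdio.h>

int main(void)
{
        int mask = 0;
        FILE *f = fopen("/proc/sys/portals/debug", "r");

        if (f == NULL) {
                perror("/proc/sys/portals/debug");
                return 1;
        }
        if (fscanf(f, "%d", &mask) == 1)
                printf("portal_debug = 0x%08x\n", (unsigned int)mask);
        fclose(f);
        return 0;
}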
+
+
+#ifdef PORTALS_PROFILING
+/*
+ * profiling stuff.  we do this statically for now 'cause its simple,
+ * but we could do some tricks with elf sections to have this array
+ * automatically built.
+ */
+#define def_prof(FOO) [PROF__##FOO] = {#FOO, 0, }
+
+struct prof_ent prof_ents[] = {
+        def_prof(our_recvmsg),
+        def_prof(our_sendmsg),
+        def_prof(socknal_recv),
+        def_prof(lib_parse),
+        def_prof(conn_list_walk),
+        def_prof(memcpy),
+        def_prof(lib_finalize),
+        def_prof(pingcli_time),
+        def_prof(gmnal_send),
+        def_prof(gmnal_recv),
+};
+
+EXPORT_SYMBOL(prof_ents);
+
+/*
+ * this function is as crazy as the proc filling api
+ * requires.
+ *
+ * buffer: page allocated for us to scribble in.  the
+ *  data returned to the user will be taken from here.
+ * *start: address of the pointer that will tell the 
+ *  caller where in buffer the data the user wants is.
+ * ppos: offset in the entire /proc file that the user
+ *  currently wants.
+ * wanted: the amount of data the user wants.
+ *
+ * while going, 'curpos' is the offset in the entire
+ * file where we currently are.  We only actually
+ * start filling buffer when we get to a place in
+ * the file that the user cares about.
+ *
+ * we take care to only sprintf when the user cares, because
+ * we're holding a lock while we do this.
+ *
+ * we know we generate fixed-size lines, so we can tell where
+ * each entry lands in the file.  even so, the output is
+ * unpredictable because we don't snapshot the list between
+ * the calls that fill in a file from it: the list could
+ * change mid-read and the output will look very weird
+ * indeed.  oh well.
+ */
+
+static int prof_read_proc(char *buffer, char **start, off_t ppos, int wanted,
+                          int *eof, void *data)
+{
+        int len = 0, i;
+        int curpos;
+        char *header = "Interval        Cycles_per (Starts Finishes Total)\n";
+        int header_len = strlen(header);
+        char *format = "%-15s %.12Ld (%.12d %.12d %.12Ld)";
+        int line_len = (15 + 1 + 12 + 2 + 12 + 1 + 12 + 1 + 12 + 1);
+
+        *start = buffer;
+
+        if (ppos < header_len) {
+                int diff = MIN(header_len, wanted);
+                memcpy(buffer, header + ppos, diff);
+                len += diff;
+                ppos += diff;
+        }
+
+        if (len >= wanted)
+                goto out;
+
+        curpos = header_len;
+
+        for ( i = 0; i < MAX_PROFS ; i++) {
+                int copied;
+                struct prof_ent *pe = &prof_ents[i];
+                long long cycles_per;
+                /*
+                 * find the part of the array that the buffer wants
+                 */
+                if (ppos >= (curpos + line_len))  {
+                        curpos += line_len;
+                        continue;
+                }
+                /* the clever caller split a line */
+                if (ppos > curpos) {
+                        *start = buffer + (ppos - curpos);
+                }
+
+                if (pe->finishes == 0)
+                        cycles_per = 0;
+                else
+                {
+                        cycles_per = pe->total_cycles;
+                        do_div (cycles_per, pe->finishes);
+                }
+
+                copied = sprintf(buffer + len, format, pe->str, cycles_per,
+                                 pe->starts, pe->finishes, pe->total_cycles);
+
+                len += copied;
+
+                /* pad to line len, -1 for \n */
+                if ((copied < line_len-1)) {
+                        int diff = (line_len-1) - copied;
+                        memset(buffer + len, ' ', diff);
+                        len += diff;
+                        copied += diff;
+                }
+
+                buffer[len++]= '\n';
+
+                /* bail if we have enough */
+                if (((buffer + len) - *start) >= wanted)
+                        break;
+
+                curpos += line_len;
+        }
+
+        /* lameness */
+        if (i == MAX_PROFS)
+                *eof = 1;
+ out:
+
+        return MIN(((buffer + len) - *start), wanted);
+}
+
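
The comment above describes the 2.4-era read_proc contract that prof_read_proc() implements: the handler fills the page at 'buffer', points *start at the first byte the caller asked for, and returns how many valid bytes follow *start. A simplified standalone sketch of that contract; demo_read_proc and its fixed string are hypothetical, and it ignores the *eof convention:

#include <stdio.h>
#include <string.h>

static int demo_read_proc(char *buffer, char **start, long ppos, int wanted)
{
        static const char content[] = "line one\nline two\nline three\n";
        int total = (int)strlen(content);
        int avail;

        if (ppos >= total)
                return 0;                  /* EOF */
        avail = total - (int)ppos;
        if (avail > wanted)
                avail = wanted;
        memcpy(buffer, content + ppos, avail);
        *start = buffer;                   /* valid data begins at buffer[0] */
        return avail;
}

int main(void)
{
        char page[16];
        char *start;
        long pos = 0;
        int n;

        while ((n = demo_read_proc(page, &start, pos, sizeof(page))) > 0) {
                fwrite(start, 1, (size_t)n, stdout);
                pos += n;
        }
        return 0;
}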
+/*
+ * all kids love /proc :/
+ */
+static unsigned char basedir[]="net/portals";
+#endif /* PORTALS_PROFILING */
+
+int insert_proc(void)
+{
+#if PORTALS_PROFILING
+        unsigned char dir[128];
+        struct proc_dir_entry *ent;
+
+        if (ARRAY_SIZE(prof_ents) != MAX_PROFS) {
+                CERROR("profiling enum and array are out of sync.\n");
+                return -1;
+        }
+
+        /*
+         * This is pretty lame: we assume that a failure just
+         * means the entries already existed.
+         */
+        dir[0] = '\0';
+        strcat(dir, basedir);
+        create_proc_entry(dir, S_IFDIR, 0);
+
+        strcat(dir, "/cycles");
+        ent = create_proc_entry(dir, 0, 0);
+        if (!ent) {
+                CERROR("couldn't register %s?\n", dir);
+                return -1;
+        }
+
+        ent->data = NULL;
+        ent->read_proc = prof_read_proc;
+#endif /* PORTALS_PROFILING */
+
+#ifdef CONFIG_SYSCTL
+        if (!portals_table_header)
+                portals_table_header = register_sysctl_table(top_table, 0);
+#endif
+
+        return 0;
+}
+
+void remove_proc(void)
+{
+#if PORTALS_PROFILING
+        unsigned char dir[128];
+        int end;
+
+        dir[0]='\0';
+        strcat(dir, basedir);
+
+        end = strlen(dir);
+
+        strcat(dir, "/cycles");
+        remove_proc_entry(dir,0);
+
+        dir[end] = '\0';
+        remove_proc_entry(dir,0);
+#endif /* PORTALS_PROFILING */
+
+#ifdef CONFIG_SYSCTL
+        if (portals_table_header)
+                unregister_sysctl_table(portals_table_header);
+        portals_table_header = NULL;
+#endif
+}
diff --git a/lustre/portals/packaging/.cvsignore b/lustre/portals/packaging/.cvsignore
new file mode 100644 (file)
index 0000000..fd1d56a
--- /dev/null
@@ -0,0 +1,8 @@
+Makefile
+Makefile.in
+aclocal.m4
+config.log
+config.status
+config.cache
+configure
+portals.spec
diff --git a/lustre/portals/packaging/Makefile.am b/lustre/portals/packaging/Makefile.am
new file mode 100644 (file)
index 0000000..126bc69
--- /dev/null
@@ -0,0 +1,6 @@
+# Copyright (C) 2002  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+EXTRA_DIST = portals.spec
\ No newline at end of file
diff --git a/lustre/portals/packaging/portals.spec.in b/lustre/portals/packaging/portals.spec.in
new file mode 100644 (file)
index 0000000..e196b3f
--- /dev/null
@@ -0,0 +1,116 @@
+%define kversion @RELEASE@
+%define linuxdir @LINUX@
+%define version HEAD
+
+Summary: Sandia Portals Message Passing - utilities 
+Name: portals
+Version: %{version}
+Release: 0210101748uml
+Copyright: LGPL
+Group: Utilities/System
+BuildRoot: /var/tmp/portals-%{version}-root
+Source: http://sandiaportals.org/portals-%{version}.tar.gz
+
+%description
+Sandia Portals message passing package.  Contains kernel modules, libraries and utilities. 
+
+%package -n portals-modules
+Summary: Kernel modules and NALs for Portals
+Group: Development/Kernel
+
+%description -n portals-modules
+Portals kernel modules and NALs (network abstraction layers) for Linux %{kversion}.
+
+%package -n portals-source
+Summary: Portals kernel source for rebuilding with other kernels
+Group: Development/Kernel
+
+%description -n portals-source
+Portals kernel source for rebuilding with other kernels
+
+%prep
+%setup -n portals-%{version}
+
+%build
+rm -rf $RPM_BUILD_ROOT
+
+# Create the pristine source directory.
+srcdir=$RPM_BUILD_ROOT/usr/src/portals-%{version}
+mkdir -p $srcdir
+find . -name CVS -prune -o -print | cpio -ap $srcdir
+
+# Set an explicit path to our Linux tree, if we can.
+conf_flag=
+linuxdir=%{linuxdir}
+test -d $linuxdir && conf_flag=--with-linux=$linuxdir
+./configure $conf_flag
+make 
+
+%install
+make install prefix=$RPM_BUILD_ROOT
+
+%ifarch alpha
+# this hurts me
+  conf_flag=
+  linuxdir=%{linuxdir}
+  test -d $linuxdir && conf_flag=--with-linux=$linuxdir
+  make clean
+  ./configure --enable-rtscts-myrinet $conf_flag
+  make
+  cp linux/rtscts/rtscts.o $RPM_BUILD_ROOT/lib/modules/%{kversion}/kernel/net/portals/rtscts_myrinet.o
+  cp user/myrinet_utils/mcpload $RPM_BUILD_ROOT/usr/sbin/mcpload
+%endif
+
+
+%files
+%attr(-, root, root) %doc COPYING
+%attr(-, root, root) /usr/sbin/acceptor
+%attr(-, root, root) /usr/sbin/ptlctl
+%attr(-, root, root) /usr/sbin/debugctl
+%ifarch alpha
+%attr(-, root, root) /usr/sbin/mcpload
+%endif
+%attr(-, root, root) /lib/libmyrnal.a
+%attr(-, root, root) /lib/libptlapi.a
+%attr(-, root, root) /lib/libptlctl.a
+%attr(-, root, root) /lib/libprocbridge.a
+%attr(-, root, root) /lib/libptllib.a
+%attr(-, root, root) /lib/libtcpnal.a 
+%attr(-, root, root) /lib/libtcpnalutil.a
+%attr(-, root, root) /usr/include/portals/*.h
+%attr(-, root, root) /usr/include/portals/base/*.h
+%attr(-, root, root) /usr/include/linux/*.h
+
+%files -n portals-modules
+%attr(-, root, root) %doc COPYING
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/portals.o
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptlrouter.o
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptrxtx.o
+%ifarch alpha
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/p3mod.o
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/rtscts.o
+%endif
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/*nal.o
+
+%files -n portals-source
+%attr(-, root, root) /usr/src/portals-%{version}
+
+%post
+if [ ! -e /dev/portals ]; then
+   mknod /dev/portals c 10 240
+fi
+depmod -ae || exit 0
+
+grep -q portals /etc/modules.conf || \
+       echo 'alias char-major-10-240 portals' >> /etc/modules.conf
+
+grep -q '/dev/portals' /etc/modules.conf || \
+       echo 'alias /dev/portals portals' >> /etc/modules.conf
+
+%postun
+depmod -ae || exit 0
+
+%clean
+#rm -rf $RPM_BUILD_ROOT
+
+# end of file
diff --git a/lustre/portals/portals/.cvsignore b/lustre/portals/portals/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lustre/portals/portals/Makefile.am b/lustre/portals/portals/Makefile.am
new file mode 100644 (file)
index 0000000..8c03749
--- /dev/null
@@ -0,0 +1,10 @@
+# Copyright (C) 2002  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+
+CPPFLAGS=
+INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include
+lib_LIBRARIES= libportals.a
+libportals_a_SOURCES= api-eq.c api-init.c api-me.c api-errno.c api-ni.c api-wrap.c lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-eq.c lib-md.c lib-move.c lib-ni.c lib-pid.c
diff --git a/lustre/portals/portals/Makefile.mk b/lustre/portals/portals/Makefile.mk
new file mode 100644 (file)
index 0000000..5627ef7
--- /dev/null
@@ -0,0 +1,9 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Kernelenv
+
+obj-y += portals.o
+portals-objs    := lib-dispatch.o lib-eq.o lib-init.o lib-md.o lib-me.o lib-move.o lib-msg.o lib-ni.o lib-not-impl.o lib-pid.o api-eq.o api-errno.o api-init.o api-md.o api-me.o api-ni.o api-wrap.o
diff --git a/lustre/portals/portals/api-eq.c b/lustre/portals/portals/api-eq.c
new file mode 100644 (file)
index 0000000..e066619
--- /dev/null
@@ -0,0 +1,158 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-eq.c
+ * User-level event queue management routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+int ptl_eq_init(void)
+{
+        /* Nothing to do anymore... */
+        return PTL_OK;
+}
+
+void ptl_eq_fini(void)
+{
+        /* Nothing to do anymore... */
+}
+
+int ptl_eq_ni_init(nal_t * nal)
+{
+        /* Nothing to do anymore... */
+        return PTL_OK;
+}
+
+void ptl_eq_ni_fini(nal_t * nal)
+{
+        /* Nothing to do anymore... */
+}
+
+int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t * ev)
+{
+        ptl_eq_t *eq;
+        int rc, new_index;
+        unsigned long flags;
+        ptl_event_t *new_event;
+        nal_t *nal;
+        ENTRY;
+
+        if (!ptl_init)
+                RETURN(PTL_NOINIT);
+
+        nal = ptl_hndl2nal(&eventq);
+        if (!nal)
+                RETURN(PTL_INV_EQ);
+
+        eq = ptl_handle2usereq(&eventq);
+        nal->lock(nal, &flags);
+
+        /* size must be a power of 2 to handle a wrapped sequence # */
+        LASSERT (eq->size != 0 &&
+                 eq->size == LOWEST_BIT_SET (eq->size));
+
+        new_index = eq->sequence & (eq->size - 1);
+        new_event = &eq->base[new_index];
+        CDEBUG(D_INFO, "new_event: %p, sequence: %lu, eq->size: %u\n",
+               new_event, eq->sequence, eq->size);
+        if (PTL_SEQ_GT (eq->sequence, new_event->sequence)) {
+                nal->unlock(nal, &flags);
+                RETURN(PTL_EQ_EMPTY);
+        }
+
+        *ev = *new_event;
+
+        /* Set the unlinked_me interface number if there is one to pass
+         * back, since the NAL hasn't a clue what it is and therefore can't
+         * set it. */
+        if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE))
+                ev->unlinked_me.nal_idx = eventq.nal_idx;
+        
+        /* ensure event is delivered correctly despite possible 
+           races with lib_finalize */
+        if (eq->sequence != new_event->sequence) {
+                CERROR("DROPPING EVENT: eq seq %lu ev seq %lu\n",
+                       eq->sequence, new_event->sequence);
+                rc = PTL_EQ_DROPPED;
+        } else {
+                rc = PTL_OK;
+        }
+
+        eq->sequence = new_event->sequence + 1;
+        nal->unlock(nal, &flags);
+        RETURN(rc);
+}
+
+
+int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out)
+{
+        int rc;
+        
+        /* PtlEQGet does the handle checking */
+        while ((rc = PtlEQGet(eventq_in, event_out)) == PTL_EQ_EMPTY) {
+                nal_t *nal = ptl_hndl2nal(&eventq_in);
+                
+                if (nal->yield)
+                        nal->yield(nal);
+        }
+
+        return rc;
+}
+
+#ifndef __KERNEL__
+static jmp_buf eq_jumpbuf;
+
+static void eq_timeout(int signal)
+{
+        longjmp(eq_jumpbuf, -1);
+}
+
+int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
+                      int timeout)
+{
+        static void (*prev) (int);
+        static int left_over;
+        time_t time_at_start;
+        int rc;
+
+        if (setjmp(eq_jumpbuf)) {
+                signal(SIGALRM, prev);
+                alarm(left_over - timeout);
+                return PTL_EQ_EMPTY;
+        }
+
+        left_over = alarm(timeout);
+        prev = signal(SIGALRM, eq_timeout);
+        time_at_start = time(NULL);
+        if (left_over < timeout)
+                alarm(left_over);
+
+        rc = PtlEQWait(eventq_in, event_out);
+
+        signal(SIGALRM, prev);
+        alarm(left_over);       /* Should compute how long we waited */
+
+        return rc;
+}
+
+#endif
+
diff --git a/lustre/portals/portals/api-errno.c b/lustre/portals/portals/api-errno.c
new file mode 100644 (file)
index 0000000..026c93b
--- /dev/null
@@ -0,0 +1,55 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-errno.c
+ * Instantiate the string table of errors
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ */
+
+/* If you change these, you must update the number table in portals/errno.h */
+const char *ptl_err_str[] = {
+        "PTL_OK",
+        "PTL_SEGV",
+
+        "PTL_NOSPACE",
+        "PTL_INUSE",
+        "PTL_VAL_FAILED",
+
+        "PTL_NAL_FAILED",
+        "PTL_NOINIT",
+        "PTL_INIT_DUP",
+        "PTL_INIT_INV",
+        "PTL_AC_INV_INDEX",
+
+        "PTL_INV_ASIZE",
+        "PTL_INV_HANDLE",
+        "PTL_INV_MD",
+        "PTL_INV_ME",
+        "PTL_INV_NI",
+/* If you change these, you must update the number table in portals/errno.h */
+        "PTL_ILL_MD",
+        "PTL_INV_PROC",
+        "PTL_INV_PSIZE",
+        "PTL_INV_PTINDEX",
+        "PTL_INV_REG",
+
+        "PTL_INV_SR_INDX",
+        "PTL_ML_TOOLONG",
+        "PTL_ADDR_UNKNOWN",
+        "PTL_INV_EQ",
+        "PTL_EQ_DROPPED",
+
+        "PTL_EQ_EMPTY",
+        "PTL_NOUPDATE",
+        "PTL_FAIL",
+        "PTL_NOT_IMPLEMENTED",
+        "PTL_NO_ACK",
+
+        "PTL_IOV_TOO_MANY",
+        "PTL_IOV_TOO_SMALL",
+
+        "PTL_EQ_INUSE",
+        "PTL_MD_INUSE"
+};
+/* If you change these, you must update the number table in portals/errno.h */
diff --git a/lustre/portals/portals/api-init.c b/lustre/portals/portals/api-init.c
new file mode 100644 (file)
index 0000000..e59c922
--- /dev/null
@@ -0,0 +1,71 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-init.c
+ * Initialization and global data for the p30 user side library
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+int ptl_init;
+unsigned int portal_subsystem_debug = 0xfff7e3ff;
+unsigned int portal_debug = ~0;
+unsigned int portal_printk;
+unsigned int portal_stack;
+
+#ifdef __KERNEL__
+atomic_t portal_kmemory = ATOMIC_INIT(0);
+#endif
+
+int __p30_initialized;
+int __p30_myr_initialized;
+int __p30_ip_initialized;
+ptl_handle_ni_t __myr_ni_handle;
+ptl_handle_ni_t __ip_ni_handle;
+
+int __p30_myr_timeout = 10;
+int __p30_ip_timeout;
+
+int PtlInit(void)
+{
+
+        if (ptl_init)
+                return PTL_OK;
+
+        ptl_ni_init();
+        ptl_me_init();
+        ptl_eq_init();
+        ptl_init = 1;
+        __p30_initialized = 1;
+
+        return PTL_OK;
+}
+
+
+void PtlFini(void)
+{
+
+        /* Reverse order of initialization */
+        ptl_eq_fini();
+        ptl_me_fini();
+        ptl_ni_fini();
+        ptl_init = 0;
+}
diff --git a/lustre/portals/portals/api-me.c b/lustre/portals/portals/api-me.c
new file mode 100644 (file)
index 0000000..e724e58
--- /dev/null
@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-me.c
+ * Match Entry local operations.
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+int ptl_me_init(void)
+{
+        return PTL_OK;
+}
+void ptl_me_fini(void)
+{                                /* Nothing to do */
+}
+int ptl_me_ni_init(nal_t * nal)
+{
+        return PTL_OK;
+}
+
+void ptl_me_ni_fini(nal_t * nal)
+{                                /* Nothing to do... */
+}
diff --git a/lustre/portals/portals/api-ni.c b/lustre/portals/portals/api-ni.c
new file mode 100644 (file)
index 0000000..b2e069e
--- /dev/null
@@ -0,0 +1,197 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-ni.c
+ * Network Interface code
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+/* Put some magic in the NI handle so uninitialised/zeroed handles are easy
+ * to spot */
+#define NI_HANDLE_MAGIC  0xebc0de00
+#define NI_HANDLE_MASK   0x000000ff
+#define MAX_NIS          8         
+static nal_t *ptl_interfaces[MAX_NIS];
+int ptl_num_interfaces = 0;
+
+nal_t *ptl_hndl2nal(ptl_handle_any_t *handle)
+{
+        unsigned int idx = handle->nal_idx;
+
+        /* XXX we really rely on the caller NOT racing with interface
+         * setup/teardown.  That ensures her NI handle can't get
+         * invalidated out from under her (or worse, swapped for a
+         * completely different interface!) */
+
+        if (((idx ^ NI_HANDLE_MAGIC) & ~NI_HANDLE_MASK) != 0)
+                return NULL;
+
+        idx &= NI_HANDLE_MASK;
+        if (idx < MAX_NIS)
+                return ptl_interfaces[idx];
+
+        return NULL;
+}
+
+int ptl_ni_init(void)
+{
+        int i;
+
+        LASSERT (MAX_NIS <= (NI_HANDLE_MASK + 1));
+        
+        for (i = 0; i < MAX_NIS; i++)
+                ptl_interfaces[i] = NULL;
+
+        return PTL_OK;
+}
+
+void ptl_ni_fini(void)
+{
+        int i;
+
+        for (i = 0; i < MAX_NIS; i++) {
+                nal_t *nal = ptl_interfaces[i];
+                if (!nal)
+                        continue;
+
+                if (nal->shutdown)
+                        nal->shutdown(nal, i);
+        }
+}
+
+#ifdef __KERNEL__
+DECLARE_MUTEX(ptl_ni_init_mutex);
+
+static void ptl_ni_init_mutex_enter (void) 
+{
+        down (&ptl_ni_init_mutex);
+}
+
+static void ptl_ni_init_mutex_exit (void)
+{
+        up (&ptl_ni_init_mutex);
+}
+
+#else
+static void ptl_ni_init_mutex_enter (void)
+{
+}
+
+static void ptl_ni_init_mutex_exit (void) 
+{
+}
+
+#endif
+
+int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size,
+              ptl_ac_index_t acl_size, ptl_pid_t requested_pid,
+              ptl_handle_ni_t * handle)
+{
+        nal_t *nal;
+        int i;
+
+        if (!ptl_init)
+                return PTL_NOINIT;
+
+        ptl_ni_init_mutex_enter ();
+
+        nal = interface(ptl_num_interfaces, ptl_size, acl_size, requested_pid);
+
+        if (!nal) {
+                ptl_ni_init_mutex_exit ();
+                return PTL_NAL_FAILED;
+        }
+
+        for (i = 0; i < ptl_num_interfaces; i++) {
+                if (ptl_interfaces[i] == nal) {
+                        nal->refct++;
+                        handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | i;
+                        fprintf(stderr, "Returning existing NAL (%d)\n", i);
+                        ptl_ni_init_mutex_exit ();
+                        return PTL_OK;
+                }
+        }
+        nal->refct = 1;
+
+        if (ptl_num_interfaces >= MAX_NIS) {
+                if (nal->shutdown)
+                        nal->shutdown (nal, ptl_num_interfaces);
+                ptl_ni_init_mutex_exit ();
+                return PTL_NOSPACE;
+        }
+
+        handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | ptl_num_interfaces;
+        ptl_interfaces[ptl_num_interfaces++] = nal;
+
+        ptl_eq_ni_init(nal);
+        ptl_me_ni_init(nal);
+
+        ptl_ni_init_mutex_exit ();
+        return PTL_OK;
+}
+
+
+int PtlNIFini(ptl_handle_ni_t ni)
+{
+        nal_t *nal;
+        int idx;
+        int rc;
+
+        if (!ptl_init)
+                return PTL_NOINIT;
+
+        ptl_ni_init_mutex_enter ();
+
+        nal = ptl_hndl2nal (&ni);
+        if (nal == NULL) {
+                ptl_ni_init_mutex_exit ();
+                return PTL_INV_HANDLE;
+        }
+
+        idx = ni.nal_idx & NI_HANDLE_MASK;
+
+        nal->refct--;
+        if (nal->refct > 0) {
+                ptl_ni_init_mutex_exit ();
+                return PTL_OK;
+        }
+
+        ptl_me_ni_fini(nal);
+        ptl_eq_ni_fini(nal);
+
+        rc = PTL_OK;
+        if (nal->shutdown)
+                rc = nal->shutdown(nal, idx);
+
+        ptl_interfaces[idx] = NULL;
+        ptl_num_interfaces--;
+
+        ptl_ni_init_mutex_exit ();
+        return rc;
+}
+
+int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * ni_out)
+{
+        *ni_out = handle_in;
+
+        return PTL_OK;
+}
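To make the handle encoding concrete: PtlNIInit() above stores NI_HANDLE_MAGIC OR'd with the interface slot in nal_idx, and ptl_hndl2nal() rejects any handle whose upper bits do not carry the magic, so zeroed or garbage handles fail cleanly. A self-contained sketch of the same arithmetic (constants copied from this file; the handle values are made up):

#include <assert.h>

#define NI_HANDLE_MAGIC  0xebc0de00     /* as defined in api-ni.c above */
#define NI_HANDLE_MASK   0x000000ff

static void example_handle_decode(void)
{
        unsigned int idx = NI_HANDLE_MAGIC | 2;         /* interface slot 2 */

        /* same test as ptl_hndl2nal(): upper bits must equal the magic */
        assert(((idx ^ NI_HANDLE_MAGIC) & ~NI_HANDLE_MASK) == 0);
        assert((idx & NI_HANDLE_MASK) == 2);

        idx = 0;                                        /* zeroed handle */
        assert(((idx ^ NI_HANDLE_MAGIC) & ~NI_HANDLE_MASK) != 0);  /* rejected */
}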
diff --git a/lustre/portals/portals/api-wrap.c b/lustre/portals/portals/api-wrap.c
new file mode 100644 (file)
index 0000000..e54707f
--- /dev/null
@@ -0,0 +1,599 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-wrap.c
+ * User-level wrappers that dispatch across the protection boundaries
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/api-support.h>
+
+static int do_forward(ptl_handle_any_t any_h, int cmd, void *argbuf,
+                      int argsize, void *retbuf, int retsize)
+{
+        nal_t *nal;
+
+        if (!ptl_init) {
+                fprintf(stderr, "do_forward: Portals not initialized\n");
+                return PTL_NOINIT;
+        }
+
+        nal = ptl_hndl2nal(&any_h);
+        if (!nal)
+                return PTL_INV_HANDLE;
+
+        nal->forward(nal, cmd, argbuf, argsize, retbuf, retsize);
+
+        return PTL_OK;
+}
+
+int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id)
+{
+        PtlGetId_in args;
+        PtlGetId_out ret;
+        int rc;
+
+        args.handle_in = ni_handle;
+
+        rc = do_forward(ni_handle, PTL_GETID, &args, sizeof(args), &ret,
+                        sizeof(ret));
+        if (rc != PTL_OK)
+                return rc;
+        
+        if (id)
+                *id = ret.id_out;
+
+        return ret.rc;
+}
+
+int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold) 
+{
+        PtlFailNid_in  args;
+        PtlFailNid_out ret;
+        int            rc;
+        
+        args.interface = interface;
+        args.nid       = nid;
+        args.threshold = threshold;
+        
+        rc = do_forward (interface, PTL_FAILNID, 
+                         &args, sizeof(args), &ret, sizeof (ret));
+
+        return ((rc != PTL_OK) ? rc : ret.rc);
+}
+
+int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in,
+                ptl_sr_value_t * status_out)
+{
+        PtlNIStatus_in args;
+        PtlNIStatus_out ret;
+        int rc;
+
+        args.interface_in = interface_in;
+        args.register_in = register_in;
+
+        rc = do_forward(interface_in, PTL_NISTATUS, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        if (status_out)
+                *status_out = ret.status_out;
+
+        return ret.rc;
+}
+
+int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in,
+              unsigned long *distance_out)
+{
+        PtlNIDist_in args;
+        PtlNIDist_out ret;
+        int rc;
+
+        args.interface_in = interface_in;
+        args.process_in = process_in;
+
+        rc = do_forward(interface_in, PTL_NIDIST, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        if (distance_out)
+                *distance_out = ret.distance_out;
+
+        return ret.rc;
+}
+
+
+
+unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in)
+{
+        PtlNIDebug_in args;
+        PtlNIDebug_out ret;
+        int rc;
+
+        args.mask_in = mask_in;
+
+        rc = do_forward(ni, PTL_NIDEBUG, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        return ret.rc;
+}
+
+int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in,
+                ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in,
+                ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in,
+                ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out)
+{
+        PtlMEAttach_in args;
+        PtlMEAttach_out ret;
+        int rc;
+
+        args.interface_in = interface_in;
+        args.index_in = index_in;
+        args.match_id_in = match_id_in;
+        args.match_bits_in = match_bits_in;
+        args.ignore_bits_in = ignore_bits_in;
+        args.unlink_in = unlink_in;
+        args.position_in = pos_in;
+
+        rc = do_forward(interface_in, PTL_MEATTACH, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        if (handle_out) {
+                handle_out->nal_idx = interface_in.nal_idx;
+                handle_out->cookie = ret.handle_out.cookie;
+        }
+
+        return ret.rc;
+}
+
+int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in,
+                ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in,
+                ptl_unlink_t unlink_in, ptl_ins_pos_t position_in,
+                ptl_handle_me_t * handle_out)
+{
+        PtlMEInsert_in args;
+        PtlMEInsert_out ret;
+        int rc;
+
+        args.current_in = current_in;
+        args.match_id_in = match_id_in;
+        args.match_bits_in = match_bits_in;
+        args.ignore_bits_in = ignore_bits_in;
+        args.unlink_in = unlink_in;
+        args.position_in = position_in;
+
+        rc = do_forward(current_in, PTL_MEINSERT, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc;
+
+        if (handle_out) {
+                handle_out->nal_idx = current_in.nal_idx;
+                handle_out->cookie = ret.handle_out.cookie;
+        }
+        return ret.rc;
+}
+
+int PtlMEUnlink(ptl_handle_me_t current_in)
+{
+        PtlMEUnlink_in args;
+        PtlMEUnlink_out ret;
+        int rc;
+
+        args.current_in = current_in;
+        args.unlink_in = PTL_RETAIN;
+
+        rc = do_forward(current_in, PTL_MEUNLINK, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc;
+
+        return ret.rc;
+}
+
+int PtlTblDump(ptl_handle_ni_t ni, int index_in)
+{
+        PtlTblDump_in args;
+        PtlTblDump_out ret;
+        int rc;
+
+        args.index_in = index_in;
+
+        rc = do_forward(ni, PTL_TBLDUMP, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        return ret.rc;
+}
+
+int PtlMEDump(ptl_handle_me_t current_in)
+{
+        PtlMEDump_in args;
+        PtlMEDump_out ret;
+        int rc;
+
+        args.current_in = current_in;
+
+        rc = do_forward(current_in, PTL_MEDUMP, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc;
+
+        return ret.rc;
+}
+
+static int validate_md(ptl_handle_any_t current_in, ptl_md_t md_in)
+{
+        nal_t *nal;
+        int rc;
+        int i;
+
+        if (!ptl_init) {
+                fprintf(stderr, "PtlMDAttach/Bind/Update: Not initialized\n");
+                return PTL_NOINIT;
+        }
+
+        nal = ptl_hndl2nal(&current_in);
+        if (!nal)
+                return PTL_INV_HANDLE;
+
+        if (nal->validate != NULL)                /* nal->validate not a NOOP */
+        {
+                if ((md_in.options & PTL_MD_IOV) == 0)        /* contiguous */
+                {
+                        rc = nal->validate (nal, md_in.start, md_in.length);
+                        if (rc)
+                                return (PTL_SEGV);
+                }
+                else
+                {
+                        struct iovec *iov = (struct iovec *)md_in.start;
+
+                        for (i = 0; i < md_in.niov; i++, iov++)
+                        {
+                                rc = nal->validate (nal, iov->iov_base, iov->iov_len);
+                                if (rc)
+                                        return (PTL_SEGV);
+                        }
+                }
+        }
+
+        return 0;
+}
+
+static ptl_handle_eq_t md2eq (ptl_md_t *md)
+{
+        if (PtlHandleEqual (md->eventq, PTL_EQ_NONE))
+                return (PTL_EQ_NONE);
+        
+        return (ptl_handle2usereq (&md->eventq)->cb_eq_handle);
+}
+
+
+int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in,
+                ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out)
+{
+        PtlMDAttach_in args;
+        PtlMDAttach_out ret;
+        int rc;
+
+        rc = validate_md(me_in, md_in);
+        if (rc == PTL_OK) {
+                args.eq_in = md2eq(&md_in);
+                args.me_in = me_in;
+                args.md_in = md_in;
+                args.unlink_in = unlink_in;
+                
+                rc = do_forward(me_in, PTL_MDATTACH, 
+                                &args, sizeof(args), &ret, sizeof(ret));
+        }
+
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc;
+
+        if (handle_out) {
+                handle_out->nal_idx = me_in.nal_idx;
+                handle_out->cookie = ret.handle_out.cookie;
+        }
+        return ret.rc;
+}
+
+
+
+int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in,
+                       ptl_handle_md_t * handle_out)
+{
+        PtlMDBind_in args;
+        PtlMDBind_out ret;
+        int rc;
+
+        rc = validate_md(ni_in, md_in);
+        if (rc != PTL_OK)
+                return rc;
+
+        args.eq_in = md2eq(&md_in);
+        args.ni_in = ni_in;
+        args.md_in = md_in;
+
+        rc = do_forward(ni_in, PTL_MDBIND, 
+                        &args, sizeof(args), &ret, sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        if (handle_out) {
+                handle_out->nal_idx = ni_in.nal_idx;
+                handle_out->cookie = ret.handle_out.cookie;
+        }
+        return ret.rc;
+}
+
+int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout,
+                ptl_md_t *new_inout, ptl_handle_eq_t testq_in)
+{
+        PtlMDUpdate_internal_in args;
+        PtlMDUpdate_internal_out ret;
+        int rc;
+
+        args.md_in = md_in;
+
+        if (old_inout) {
+                args.old_inout = *old_inout;
+                args.old_inout_valid = 1;
+        } else
+                args.old_inout_valid = 0;
+
+        if (new_inout) {
+                rc = validate_md (md_in, *new_inout);
+                if (rc != PTL_OK)
+                        return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc;
+                args.new_inout = *new_inout;
+                args.new_inout_valid = 1;
+        } else
+                args.new_inout_valid = 0;
+
+        if (PtlHandleEqual (testq_in, PTL_EQ_NONE)) {
+                args.testq_in = PTL_EQ_NONE;
+                args.sequence_in = -1;
+        } else {
+                ptl_eq_t *eq = ptl_handle2usereq (&testq_in);
+                
+                args.testq_in = eq->cb_eq_handle;
+                args.sequence_in = eq->sequence;
+        }
+
+        rc = do_forward(md_in, PTL_MDUPDATE, &args, sizeof(args), &ret,
+                        sizeof(ret));
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc;
+
+        if (old_inout)
+                *old_inout = ret.old_inout;
+
+        return ret.rc;
+}
+
+int PtlMDUnlink(ptl_handle_md_t md_in)
+{
+        PtlMDUnlink_in args;
+        PtlMDUnlink_out ret;
+        int rc;
+
+        args.md_in = md_in;
+        rc = do_forward(md_in, PTL_MDUNLINK, &args, sizeof(args), &ret,
+                        sizeof(ret));
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc;
+
+        return ret.rc;
+}
+
+int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count,
+               int (*callback) (ptl_event_t * event),
+               ptl_handle_eq_t * handle_out)
+{
+        ptl_eq_t *eq = NULL;
+        ptl_event_t *ev = NULL;
+        PtlEQAlloc_in args;
+        PtlEQAlloc_out ret;
+        int rc, i;
+        nal_t *nal;
+
+        if (!ptl_init)
+                return PTL_NOINIT;
+        
+        nal = ptl_hndl2nal (&interface);
+        if (nal == NULL)
+                return PTL_INV_HANDLE;
+
+        if (count != LOWEST_BIT_SET(count)) {   /* not a power of 2 already */
+                do {                    /* knock off all but the top bit... */
+                        count &= ~LOWEST_BIT_SET (count);
+                } while (count != LOWEST_BIT_SET(count));
+
+                count <<= 1;                             /* ...and round up */
+        }
+
+        if (count == 0)        /* catch bad parameter / overflow on roundup */
+                return (PTL_VAL_FAILED);
+
+        PORTAL_ALLOC(ev, count * sizeof(ptl_event_t));
+        if (!ev)
+                return PTL_NOSPACE;
+
+        for (i = 0; i < count; i++)
+                ev[i].sequence = 0;
+
+        if (nal->validate != NULL) {
+                rc = nal->validate(nal, ev, count * sizeof(ptl_event_t));
+                if (rc != PTL_OK)
+                        goto fail;
+        }
+
+        args.ni_in = interface;
+        args.count_in = count;
+        args.base_in = ev;
+        args.len_in = count * sizeof(*ev);
+        args.callback_in = callback;
+
+        rc = do_forward(interface, PTL_EQALLOC, &args, sizeof(args), &ret,
+                        sizeof(ret));
+        if (rc != PTL_OK)
+                goto fail;
+        if (ret.rc)
+                GOTO(fail, rc = ret.rc);
+
+        PORTAL_ALLOC(eq, sizeof(*eq));
+        if (!eq) {
+                rc = PTL_NOSPACE;
+                goto fail;
+        }
+
+        eq->sequence = 1;
+        eq->size = count;
+        eq->base = ev;
+
+        /* EQ handles are a little weird.  PtlEQGet() just looks at the
+         * queued events in shared memory.  It doesn't want to do_forward()
+         * at all, so the cookie in the EQ handle we pass out of here is
+         * simply a pointer to the event queue we just set up.  We stash
+         * the handle returned by do_forward(), so we can pass it back via
+         * do_forward() when we need to. */
+
+        eq->cb_eq_handle.nal_idx = interface.nal_idx;
+        eq->cb_eq_handle.cookie = ret.handle_out.cookie;
+
+        handle_out->nal_idx = interface.nal_idx;
+        handle_out->cookie = (__u64)((unsigned long)eq);
+        return PTL_OK;
+
+fail:
+        PORTAL_FREE(ev, count * sizeof(ptl_event_t));
+        return rc;
+}
+
+int PtlEQFree(ptl_handle_eq_t eventq)
+{
+        PtlEQFree_in args;
+        PtlEQFree_out ret;
+        ptl_eq_t *eq;
+        int rc;
+
+        eq = ptl_handle2usereq (&eventq);
+        args.eventq_in = eq->cb_eq_handle;
+
+        rc = do_forward(eq->cb_eq_handle, PTL_EQFREE, &args,
+                        sizeof(args), &ret, sizeof(ret));
+
+        /* XXX we're betting rc == PTL_OK here */
+        PORTAL_FREE(eq->base, eq->size * sizeof(ptl_event_t));
+        PORTAL_FREE(eq, sizeof(*eq));
+
+        return rc;
+}
+
+int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in,
+               ptl_process_id_t match_id_in, ptl_pt_index_t portal_in)
+{
+        PtlACEntry_in args;
+        PtlACEntry_out ret;
+        int rc;
+
+        /*
+         * Copy arguments into the argument block to
+         * hand to the forwarding object
+         */
+        args.ni_in = ni_in;
+        args.index_in = index_in;
+        args.match_id_in = match_id_in;
+        args.portal_in = portal_in;
+
+        rc = do_forward(ni_in, PTL_ACENTRY, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        return (rc != PTL_OK) ? rc : ret.rc;
+}
+
+int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in,
+           ptl_process_id_t target_in, ptl_pt_index_t portal_in,
+           ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in,
+           ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in)
+{
+        PtlPut_in args;
+        PtlPut_out ret;
+        int rc;
+
+        /*
+         * Copy arguments into the argument block to
+         * hand to the forwarding object
+         */
+        args.md_in = md_in;
+        args.ack_req_in = ack_req_in;
+        args.target_in = target_in;
+        args.portal_in = portal_in;
+        args.cookie_in = cookie_in;
+        args.match_bits_in = match_bits_in;
+        args.offset_in = offset_in;
+        args.hdr_data_in = hdr_data_in;
+
+        rc = do_forward(md_in, PTL_PUT, &args, sizeof(args), &ret, sizeof(ret));
+
+        return (rc != PTL_OK) ? rc : ret.rc;
+}
+
+int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in,
+           ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in,
+           ptl_match_bits_t match_bits_in, ptl_size_t offset_in)
+{
+        PtlGet_in args;
+        PtlGet_out ret;
+        int rc;
+
+        /*
+         * Copy arguments into the argument block to
+         * hand to the forwarding object
+         */
+        args.md_in = md_in;
+        args.target_in = target_in;
+        args.portal_in = portal_in;
+        args.cookie_in = cookie_in;
+        args.match_bits_in = match_bits_in;
+        args.offset_in = offset_in;
+
+        rc = do_forward(md_in, PTL_GET, &args, sizeof(args), &ret, sizeof(ret));
+
+        return (rc != PTL_OK) ? rc : ret.rc;
+}
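A usage note on PtlEQAlloc() above: the requested count is rounded up to a power of two before the event ring is allocated in user memory (all but the top set bit are cleared, then the result is doubled, so a request for 5 slots yields a ring of 8). A minimal caller sketch, assuming `ni` was obtained from PtlNIInit():

#include <portals/p30.h>                /* assumed public header for this API */

static int example_eq_usage(ptl_handle_ni_t ni)
{
        ptl_handle_eq_t eq;
        int rc;

        /* 5 is not a power of two, so the ring is sized to 8 events */
        rc = PtlEQAlloc(ni, 5, NULL /* no event callback */, &eq);
        if (rc != PTL_OK)
                return rc;

        /* ... bind MDs whose eventq is eq, then poll/consume events ... */

        return PtlEQFree(eq);   /* frees both the shadow eq and the ring */
}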
diff --git a/lustre/portals/portals/lib-dispatch.c b/lustre/portals/portals/lib-dispatch.c
new file mode 100644 (file)
index 0000000..13036c7
--- /dev/null
@@ -0,0 +1,80 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-dispatch.c
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/lib-p30.h>
+#include <portals/lib-dispatch.h>
+
+typedef struct {
+        int (*fun) (nal_cb_t * nal, void *private, void *in, void *out);
+        char *name;
+} dispatch_table_t;
+
+static dispatch_table_t dispatch_table[] = {
+        [PTL_GETID] {do_PtlGetId, "PtlGetId"},
+        [PTL_NISTATUS] {do_PtlNIStatus, "PtlNIStatus"},
+        [PTL_NIDIST] {do_PtlNIDist, "PtlNIDist"},
+        [PTL_NIDEBUG] {do_PtlNIDebug, "PtlNIDebug"},
+        [PTL_MEATTACH] {do_PtlMEAttach, "PtlMEAttach"},
+        [PTL_MEINSERT] {do_PtlMEInsert, "PtlMEInsert"},
+        [PTL_MEUNLINK] {do_PtlMEUnlink, "PtlMEUnlink"},
+        [PTL_TBLDUMP] {do_PtlTblDump, "PtlTblDump"},
+        [PTL_MEDUMP] {do_PtlMEDump, "PtlMEDump"},
+        [PTL_MDATTACH] {do_PtlMDAttach, "PtlMDAttach"},
+        [PTL_MDBIND] {do_PtlMDBind, "PtlMDBind"},
+        [PTL_MDUPDATE] {do_PtlMDUpdate_internal, "PtlMDUpdate_internal"},
+        [PTL_MDUNLINK] {do_PtlMDUnlink, "PtlMDUnlink"},
+        [PTL_EQALLOC] {do_PtlEQAlloc_internal, "PtlEQAlloc_internal"},
+        [PTL_EQFREE] {do_PtlEQFree_internal, "PtlEQFree_internal"},
+        [PTL_PUT] {do_PtlPut, "PtlPut"},
+        [PTL_GET] {do_PtlGet, "PtlGet"},
+        [PTL_FAILNID] {do_PtlFailNid, "PtlFailNid"},
+        /*    */ {0, ""}
+};
+
+/*
+ * This really should be elsewhere, but lib-p30/dispatch.c is
+ * an automatically generated file.
+ */
+void lib_dispatch(nal_cb_t * nal, void *private, int index, void *arg_block,
+                  void *ret_block)
+{
+        lib_ni_t *ni = &nal->ni;
+
+        if (index < 0 || index > LIB_MAX_DISPATCH ||
+            !dispatch_table[index].fun) {
+                CDEBUG(D_NET, LPU64": Invalid API call %d\n", ni->nid, index);
+                return;
+        }
+
+        CDEBUG(D_NET, LPU64": API call %s (%d)\n", ni->nid,
+               dispatch_table[index].name, index);
+
+        dispatch_table[index].fun(nal, private, arg_block, ret_block);
+}
+
+char *dispatch_name(int index)
+{
+        return dispatch_table[index].name;
+}
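The table above is the library-side half of the forwarding scheme in api-wrap.c: each wrapper packs an *_in block, calls nal->forward() with the matching PTL_* index, and the NAL delivers both blocks to lib_dispatch(). For illustration only, a hypothetical forward callback for a NAL whose API and library sides share an address space could relay the blocks directly; the exact prototype of the forward method and the nal_data field are assumptions here, since the nal_t definition is not part of this hunk.

/* Illustrative only: relay the argument/return blocks straight into
 * lib_dispatch().  A NAL that crosses a protection boundary would copy
 * the blocks to the library side first. */
static void my_nal_forward(nal_t *nal, int cmd, void *args, int argsize,
                           void *ret, int retsize)
{
        nal_cb_t *lib_nal = (nal_cb_t *)nal->nal_data;  /* assumed layout */

        (void)argsize;          /* sizes only matter when copying across */
        (void)retsize;
        lib_dispatch(lib_nal, NULL, cmd, args, ret);
}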
diff --git a/lustre/portals/portals/lib-eq.c b/lustre/portals/portals/lib-eq.c
new file mode 100644 (file)
index 0000000..ce343c1
--- /dev/null
@@ -0,0 +1,128 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-eq.c
+ * Library level Event queue management routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *v_args,
+                           void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_ni_t ni_in
+         *      ptl_size_t count_in
+         *      void                    * base_in
+         *
+         * Outgoing:
+         *      ptl_handle_eq_t         * handle_out
+         */
+
+        PtlEQAlloc_in *args = v_args;
+        PtlEQAlloc_out *ret = v_ret;
+
+        lib_eq_t *eq;
+        unsigned long flags;
+
+        /* api should have rounded up */
+        if (args->count_in != LOWEST_BIT_SET (args->count_in))
+                return ret->rc = PTL_VAL_FAILED;
+
+        eq = lib_eq_alloc (nal);
+        if (eq == NULL)
+                return (ret->rc = PTL_NOSPACE);
+
+        state_lock(nal, &flags);
+
+        if (nal->cb_map != NULL) {
+                struct iovec iov = {
+                        .iov_base = args->base_in,
+                        .iov_len = args->count_in * sizeof (ptl_event_t) };
+
+                ret->rc = nal->cb_map (nal, 1, &iov, &eq->eq_addrkey);
+                if (ret->rc != PTL_OK) {
+                        lib_eq_free (nal, eq);
+                        
+                        state_unlock (nal, &flags);
+                        return (ret->rc);
+                }
+        }
+
+        eq->sequence = 1;
+        eq->base = args->base_in;
+        eq->size = args->count_in;
+        eq->eq_refcount = 0;
+        eq->event_callback = args->callback_in;
+
+        lib_initialise_handle (nal, &eq->eq_lh, PTL_COOKIE_TYPE_EQ);
+        list_add (&eq->eq_list, &nal->ni.ni_active_eqs);
+
+        state_unlock(nal, &flags);
+
+        ptl_eq2handle(&ret->handle_out, eq);
+        return (ret->rc = PTL_OK);
+}
+
+int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *v_args,
+                          void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_eq_t eventq_in
+         *
+         * Outgoing:
+         */
+
+        PtlEQFree_in *args = v_args;
+        PtlEQFree_out *ret = v_ret;
+        lib_eq_t *eq;
+        unsigned long flags;
+
+        state_lock (nal, &flags);
+
+        eq = ptl_handle2eq(&args->eventq_in, nal);
+        if (eq == NULL) {
+                ret->rc = PTL_INV_EQ;
+        } else if (eq->eq_refcount != 0) {
+                ret->rc = PTL_EQ_INUSE;
+        } else {
+                if (nal->cb_unmap != NULL) {
+                        struct iovec iov = {
+                                .iov_base = eq->base,
+                                .iov_len = eq->size * sizeof (ptl_event_t) };
+                        
+                        nal->cb_unmap(nal, 1, &iov, &eq->eq_addrkey);
+                }
+
+                lib_invalidate_handle (nal, &eq->eq_lh);
+                list_del (&eq->eq_list);
+                lib_eq_free (nal, eq);
+                ret->rc = PTL_OK;
+        }
+
+        state_unlock (nal, &flags);
+
+        return (ret->rc);
+}
diff --git a/lustre/portals/portals/lib-init.c b/lustre/portals/portals/lib-init.c
new file mode 100644 (file)
index 0000000..99c4d32
--- /dev/null
@@ -0,0 +1,474 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-init.c
+ * Start up the internal library and clear all structures
+ * Called by the NAL when it initializes.  Safe to call multiple times.
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/lib-p30.h>
+
+#ifdef __KERNEL__
+# include <linux/string.h>      /* for memset() */
+# include <linux/kp30.h>
+# ifdef KERNEL_ADDR_CACHE
+#  include <compute/OS/addrCache/cache.h>
+# endif
+#else
+# include <string.h>
+# include <sys/time.h>
+#endif
+
+#ifdef PTL_USE_SLAB_CACHE
+static int ptl_slab_users;
+
+kmem_cache_t *ptl_md_slab;
+kmem_cache_t *ptl_msg_slab;
+kmem_cache_t *ptl_me_slab;
+kmem_cache_t *ptl_eq_slab;
+
+atomic_t md_in_use_count;
+atomic_t msg_in_use_count;
+atomic_t me_in_use_count;
+atomic_t eq_in_use_count;
+
+/* NB zeroing in the ctor and on freeing ensures that items which
+ * kmem_cache_validate() passes as OK, but which haven't been initialised
+ * as an MD/ME/EQ, can't have valid handles.
+ */
+static void
+ptl_md_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags)
+{
+        memset (obj, 0, sizeof (lib_md_t));
+}
+
+static void
+ptl_me_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags)
+{
+        memset (obj, 0, sizeof (lib_me_t));
+}
+
+static void
+ptl_eq_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags)
+{
+        memset (obj, 0, sizeof (lib_eq_t));
+}
+
+int
+kportal_descriptor_setup (nal_cb_t *nal)
+{
+        /* NB on failure caller must still call kportal_descriptor_cleanup */
+        /*               ******                                            */
+
+        /* We'll have 1 set of slabs for ALL the nals :) */
+
+        if (ptl_slab_users++)
+                return 0;
+
+        ptl_md_slab = kmem_cache_create("portals_MD",
+                                        sizeof(lib_md_t), 0,
+                                        SLAB_HWCACHE_ALIGN,
+                                        ptl_md_slab_ctor, NULL);
+        if (!ptl_md_slab) {
+                CERROR("couldn't allocate ptl_md_t slab");
+                RETURN (PTL_NOSPACE);
+        }
+
+        /* NB no ctor for msgs; they don't need handle verification */
+        ptl_msg_slab = kmem_cache_create("portals_MSG",
+                                         sizeof(lib_msg_t), 0,
+                                         SLAB_HWCACHE_ALIGN,
+                                         NULL, NULL);
+        if (!ptl_msg_slab) {
+                CERROR("couldn't allocate ptl_msg_t slab");
+                RETURN (PTL_NOSPACE);
+        }
+
+        ptl_me_slab = kmem_cache_create("portals_ME",
+                                        sizeof(lib_me_t), 0,
+                                        SLAB_HWCACHE_ALIGN,
+                                        ptl_me_slab_ctor, NULL);
+        if (!ptl_me_slab) {
+                CERROR("couldn't allocate ptl_me_t slab");
+                RETURN (PTL_NOSPACE);
+        }
+
+        ptl_eq_slab = kmem_cache_create("portals_EQ",
+                                        sizeof(lib_eq_t), 0,
+                                        SLAB_HWCACHE_ALIGN,
+                                        ptl_eq_slab_ctor, NULL);
+        if (!ptl_eq_slab) {
+                CERROR("couldn't allocate ptl_eq_t slab");
+                RETURN (PTL_NOSPACE);
+        }
+
+        RETURN(PTL_OK);
+}
+
+void
+kportal_descriptor_cleanup (nal_cb_t *nal)
+{
+        if (--ptl_slab_users != 0)
+                return;
+
+        LASSERT (atomic_read (&md_in_use_count) == 0);
+        LASSERT (atomic_read (&me_in_use_count) == 0);
+        LASSERT (atomic_read (&eq_in_use_count) == 0);
+        LASSERT (atomic_read (&msg_in_use_count) == 0);
+
+        if (ptl_md_slab != NULL)
+                kmem_cache_destroy(ptl_md_slab);
+        if (ptl_msg_slab != NULL)
+                kmem_cache_destroy(ptl_msg_slab);
+        if (ptl_me_slab != NULL)
+                kmem_cache_destroy(ptl_me_slab);
+        if (ptl_eq_slab != NULL)
+                kmem_cache_destroy(ptl_eq_slab);
+}
+#else
+
+int
+lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size)
+{
+        char *space;
+
+        LASSERT (n > 0);
+
+        size += offsetof (lib_freeobj_t, fo_contents);
+
+        space = nal->cb_malloc (nal, n * size);
+        if (space == NULL)
+                return (PTL_NOSPACE);
+
+        INIT_LIST_HEAD (&fl->fl_list);
+        fl->fl_objs = space;
+        fl->fl_nobjs = n;
+        fl->fl_objsize = size;
+
+        do
+        {
+                memset (space, 0, size);
+                list_add ((struct list_head *)space, &fl->fl_list);
+                space += size;
+        } while (--n != 0);
+
+        return (PTL_OK);
+}
+
+void
+lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl)
+{
+        struct list_head *el;
+        int               count;
+
+        if (fl->fl_nobjs == 0)
+                return;
+
+        count = 0;
+        for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next)
+                count++;
+
+        LASSERT (count == fl->fl_nobjs);
+
+        nal->cb_free (nal, fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
+        memset (fl, 0, sizeof (*fl));
+}
+
+int
+kportal_descriptor_setup (nal_cb_t *nal)
+{
+        /* NB on failure caller must still call kportal_descriptor_cleanup */
+        /*               ******                                            */
+        int rc;
+
+        memset (&nal->ni.ni_free_mes,  0, sizeof (nal->ni.ni_free_mes));
+        memset (&nal->ni.ni_free_msgs, 0, sizeof (nal->ni.ni_free_msgs));
+        memset (&nal->ni.ni_free_mds,  0, sizeof (nal->ni.ni_free_mds));
+        memset (&nal->ni.ni_free_eqs,  0, sizeof (nal->ni.ni_free_eqs));
+
+        rc = lib_freelist_init (nal, &nal->ni.ni_free_mes,
+                                MAX_MES, sizeof (lib_me_t));
+        if (rc != PTL_OK)
+                return (rc);
+
+        rc = lib_freelist_init (nal, &nal->ni.ni_free_msgs,
+                                MAX_MSGS, sizeof (lib_msg_t));
+        if (rc != PTL_OK)
+                return (rc);
+
+        rc = lib_freelist_init (nal, &nal->ni.ni_free_mds,
+                                MAX_MDS, sizeof (lib_md_t));
+        if (rc != PTL_OK)
+                return (rc);
+
+        rc = lib_freelist_init (nal, &nal->ni.ni_free_eqs,
+                                MAX_EQS, sizeof (lib_eq_t));
+        return (rc);
+}
+
+void
+kportal_descriptor_cleanup (nal_cb_t *nal)
+{
+        lib_freelist_fini (nal, &nal->ni.ni_free_mes);
+        lib_freelist_fini (nal, &nal->ni.ni_free_msgs);
+        lib_freelist_fini (nal, &nal->ni.ni_free_mds);
+        lib_freelist_fini (nal, &nal->ni.ni_free_eqs);
+}
+
+#endif
+
+__u64
+lib_create_interface_cookie (nal_cb_t *nal)
+{
+        /* NB the interface cookie in wire handles guards against delayed
+         * replies and ACKs appearing valid in a new instance of the same
+         * interface.  Initialisation time, even if it's only implemented
+         * to millisecond resolution, is probably easily good enough. */
+        struct timeval tv;
+        __u64          cookie;
+#ifndef __KERNEL__
+        int            rc = gettimeofday (&tv, NULL);
+        LASSERT (rc == 0);
+#else
+       do_gettimeofday(&tv);
+#endif
+        cookie = tv.tv_sec;
+        cookie *= 1000000;
+        cookie += tv.tv_usec;
+        return (cookie);
+}
+
+int
+lib_setup_handle_hash (nal_cb_t *nal) 
+{
+        lib_ni_t *ni = &nal->ni;
+        int       i;
+        
+        /* Arbitrary choice of hash table size */
+#ifdef __KERNEL__
+        ni->ni_lh_hash_size = PAGE_SIZE / sizeof (struct list_head);
+#else
+        ni->ni_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4;
+#endif
+        ni->ni_lh_hash_table = 
+                (struct list_head *)nal->cb_malloc (nal, ni->ni_lh_hash_size
+                                                    * sizeof (struct list_head));
+        if (ni->ni_lh_hash_table == NULL)
+                return (PTL_NOSPACE);
+        
+        for (i = 0; i < ni->ni_lh_hash_size; i++)
+                INIT_LIST_HEAD (&ni->ni_lh_hash_table[i]);
+
+        ni->ni_next_object_cookie = PTL_COOKIE_TYPES;
+        
+        return (PTL_OK);
+}
+
+void
+lib_cleanup_handle_hash (nal_cb_t *nal)
+{
+        lib_ni_t *ni = &nal->ni;
+
+        if (ni->ni_lh_hash_table == NULL)
+                return;
+        
+        nal->cb_free (nal, ni->ni_lh_hash_table,
+                      ni->ni_lh_hash_size * sizeof (struct list_head));
+}
+
+lib_handle_t *
+lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type) 
+{
+        /* ALWAYS called with statelock held */
+        lib_ni_t            *ni = &nal->ni;
+        struct list_head    *list;
+        struct list_head    *el;
+        unsigned int         hash;
+
+        if ((cookie & (PTL_COOKIE_TYPES - 1)) != type)
+                return (NULL);
+        
+        hash = ((unsigned int)cookie) % ni->ni_lh_hash_size;
+        list = &ni->ni_lh_hash_table[hash];
+        
+        list_for_each (el, list) {
+                lib_handle_t *lh = list_entry (el, lib_handle_t, lh_hash_chain);
+                
+                if (lh->lh_cookie == cookie)
+                        return (lh);
+        }
+        
+        return (NULL);
+}
+
+void
+lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type) 
+{
+        /* ALWAYS called with statelock held */
+        lib_ni_t       *ni = &nal->ni;
+        unsigned int    hash;
+
+        LASSERT (type >= 0 && type < PTL_COOKIE_TYPES);
+        lh->lh_cookie = ni->ni_next_object_cookie | type;
+        ni->ni_next_object_cookie += PTL_COOKIE_TYPES;
+        
+        hash = ((unsigned int)lh->lh_cookie) % ni->ni_lh_hash_size;
+        list_add (&lh->lh_hash_chain, &ni->ni_lh_hash_table[hash]);
+}
+
+void
+lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh)
+{
+        list_del (&lh->lh_hash_chain);
+}
+
+int
+lib_init(nal_cb_t * nal, ptl_nid_t nid, ptl_pid_t pid, int gsize,
+         ptl_pt_index_t ptl_size, ptl_ac_index_t acl_size)
+{
+        int       rc = PTL_OK;
+        lib_ni_t *ni = &nal->ni;
+        int i;
+        ENTRY;
+
+        /* NB serialised in PtlNIInit() */
+
+        if (ni->refcnt != 0) {                       /* already initialised */
+                ni->refcnt++;
+                goto out;
+        }
+
+        lib_assert_wire_constants ();
+        
+        /*
+         * Allocate the portal table for this interface
+         * and all per-interface objects.
+         */
+        memset(&ni->counters, 0, sizeof(lib_counters_t));
+
+        rc = kportal_descriptor_setup (nal);
+        if (rc != PTL_OK)
+                goto out;
+
+        INIT_LIST_HEAD (&ni->ni_active_msgs);
+        INIT_LIST_HEAD (&ni->ni_active_mds);
+        INIT_LIST_HEAD (&ni->ni_active_eqs);
+
+        INIT_LIST_HEAD (&ni->ni_test_peers);
+
+        ni->ni_interface_cookie = lib_create_interface_cookie (nal);
+        ni->ni_next_object_cookie = 0;
+        rc = lib_setup_handle_hash (nal);
+        if (rc != PTL_OK)
+                goto out;
+        
+        ni->nid = nid;
+        ni->pid = pid;
+
+        ni->num_nodes = gsize;
+        ni->tbl.size = ptl_size;
+
+        ni->tbl.tbl = nal->cb_malloc(nal, sizeof(struct list_head) * ptl_size);
+        if (ni->tbl.tbl == NULL) {
+                rc = PTL_NOSPACE;
+                goto out;
+        }
+
+        for (i = 0; i < ptl_size; i++)
+                INIT_LIST_HEAD(&(ni->tbl.tbl[i]));
+
+        ni->debug = PTL_DEBUG_NONE;
+        ni->up = 1;
+        ni->refcnt++;
+
+ out:
+        if (rc != PTL_OK) {
+                lib_cleanup_handle_hash (nal);
+                kportal_descriptor_cleanup (nal);
+        }
+
+        RETURN (rc);
+}
+
+int
+lib_fini(nal_cb_t * nal)
+{
+        lib_ni_t *ni = &nal->ni;
+        int       idx;
+
+        ni->refcnt--;
+
+        if (ni->refcnt != 0)
+                goto out;
+
+        /* NB no stat_lock() since this is the last reference.  The NAL
+         * should have shut down already, so it should be safe to unlink
+         * and free all descriptors, even those that appear committed to a
+         * network op (eg MD with non-zero pending count)
+         */
+
+        for (idx = 0; idx < ni->tbl.size; idx++)
+                while (!list_empty (&ni->tbl.tbl[idx])) {
+                        lib_me_t *me = list_entry (ni->tbl.tbl[idx].next,
+                                                   lib_me_t, me_list);
+
+                        CERROR ("Active me %p on exit\n", me);
+                        list_del (&me->me_list);
+                        lib_me_free (nal, me);
+                }
+
+        while (!list_empty (&ni->ni_active_mds)) {
+                lib_md_t *md = list_entry (ni->ni_active_mds.next,
+                                           lib_md_t, md_list);
+
+                CERROR ("Active md %p on exit\n", md);
+                list_del (&md->md_list);
+                lib_md_free (nal, md);
+        }
+
+        while (!list_empty (&ni->ni_active_eqs)) {
+                lib_eq_t *eq = list_entry (ni->ni_active_eqs.next,
+                                           lib_eq_t, eq_list);
+
+                CERROR ("Active eq %p on exit\n", eq);
+                list_del (&eq->eq_list);
+                lib_eq_free (nal, eq);
+        }
+
+        while (!list_empty (&ni->ni_active_msgs)) {
+                lib_msg_t *msg = list_entry (ni->ni_active_msgs.next,
+                                             lib_msg_t, msg_list);
+
+                CERROR ("Active msg %p on exit\n", msg);
+                list_del (&msg->msg_list);
+                lib_msg_free (nal, msg);
+        }
+
+        nal->cb_free(nal, ni->tbl.tbl, sizeof(struct list_head) * ni->tbl.size);
+        ni->up = 0;
+
+        lib_cleanup_handle_hash (nal);
+        kportal_descriptor_cleanup (nal);
+
+ out:
+        return (PTL_OK);
+}
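One detail of lib_initialise_handle() above that is easy to miss: because ni_next_object_cookie starts at PTL_COOKIE_TYPES and advances in steps of PTL_COOKIE_TYPES, the type bits OR'd into the low end never collide with the counter, so lib_lookup_cookie() can recover the type with a simple mask. A worked illustration; the literal values of the cookie-type constants are assumptions chosen only to show the arithmetic.

#include <assert.h>

/* Assumed for illustration: two low bits of type, so the counter steps by 4 */
#define EX_COOKIE_TYPES         4
#define EX_COOKIE_TYPE_MD       1
#define EX_COOKIE_TYPE_EQ       2

static void example_cookie_scheme(void)
{
        unsigned long long next = EX_COOKIE_TYPES;  /* as in lib_setup_handle_hash() */
        unsigned long long md_cookie;
        unsigned long long eq_cookie;

        md_cookie = next | EX_COOKIE_TYPE_MD;       /* 4 | 1 == 5  */
        next += EX_COOKIE_TYPES;
        eq_cookie = next | EX_COOKIE_TYPE_EQ;       /* 8 | 2 == 10 */

        /* lib_lookup_cookie() masks the low bits to check the type */
        assert((md_cookie & (EX_COOKIE_TYPES - 1)) == EX_COOKIE_TYPE_MD);
        assert((eq_cookie & (EX_COOKIE_TYPES - 1)) == EX_COOKIE_TYPE_EQ);
}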
diff --git a/lustre/portals/portals/lib-md.c b/lustre/portals/portals/lib-md.c
new file mode 100644 (file)
index 0000000..a79e2be
--- /dev/null
@@ -0,0 +1,412 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-md.c
+ * Memory Descriptor management routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __KERNEL__
+# include <stdio.h>
+#else
+# define DEBUG_SUBSYSTEM S_PORTALS
+# include <linux/kp30.h>
+#endif
+
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+/*
+ * must be called with state lock held
+ */
+void lib_md_unlink(nal_cb_t * nal, lib_md_t * md)
+{
+        lib_me_t *me = md->me;
+
+        if (md->pending != 0) {
+                CDEBUG(D_NET, "Queueing unlink of md %p\n", md);
+                md->md_flags |= PTL_MD_FLAG_UNLINK;
+                return;
+        }
+
+        CDEBUG(D_NET, "Unlinking md %p\n", md);
+
+        if ((md->options & PTL_MD_KIOV) != 0) {
+                if (nal->cb_unmap_pages != NULL)
+                        nal->cb_unmap_pages (nal, md->md_niov, md->md_iov.kiov, 
+                                             &md->md_addrkey);
+        } else if (nal->cb_unmap != NULL)
+                nal->cb_unmap (nal, md->md_niov, md->md_iov.iov, 
+                               &md->md_addrkey);
+
+        if (me) {
+                me->md = NULL;
+                if (me->unlink == PTL_UNLINK)
+                        lib_me_unlink(nal, me);
+        }
+
+        if (md->eq != NULL)
+        {
+                md->eq->eq_refcount--;
+                LASSERT (md->eq->eq_refcount >= 0);
+        }
+
+        lib_invalidate_handle (nal, &md->md_lh);
+        list_del (&md->md_list);
+        lib_md_free(nal, md);
+}
+
+/* must be called with state lock held */
+static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private,
+                        ptl_md_t *md, ptl_handle_eq_t *eqh, int unlink)
+{
+        const int     max_size_opts = PTL_MD_AUTO_UNLINK |
+                                      PTL_MD_MAX_SIZE;
+        lib_eq_t     *eq = NULL;
+        int           rc;
+        int           i;
+
+        /* NB we are passed an allocated, but uninitialised/inactive md.
+         * If we return success, the caller may lib_md_unlink() it;
+         * otherwise the caller may only lib_md_free() it.
+         */
+
+        if (!PtlHandleEqual (*eqh, PTL_EQ_NONE)) {
+                eq = ptl_handle2eq(eqh, nal);
+                if (eq == NULL)
+                        return PTL_INV_EQ;
+        }
+
+        if ((md->options & PTL_MD_IOV) != 0 &&  /* discontiguous MD */
+            md->niov > PTL_MD_MAX_IOV)          /* too many fragments */
+                return PTL_IOV_TOO_MANY;
+
+        if ((md->options & max_size_opts) != 0 && /* max size used */
+            (md->max_size < 0 || md->max_size > md->length)) // illegal max_size
+                return PTL_INV_MD;
+
+        new->me = NULL;
+        new->start = md->start;
+        new->length = md->length;
+        new->offset = 0;
+        new->max_size = md->max_size;
+        new->unlink = unlink;
+        new->options = md->options;
+        new->user_ptr = md->user_ptr;
+        new->eq = eq;
+        new->threshold = md->threshold;
+        new->pending = 0;
+        new->md_flags = 0;
+
+        if ((md->options & PTL_MD_IOV) != 0) {
+                int total_length = 0;
+
+                if ((md->options & PTL_MD_KIOV) != 0) /* Can't specify both */
+                        return PTL_INV_MD; 
+
+                new->md_niov = md->niov;
+                
+                if (nal->cb_read (nal, private, new->md_iov.iov, md->start,
+                                  md->niov * sizeof (new->md_iov.iov[0])))
+                        return PTL_SEGV;
+
+                for (i = 0; i < new->md_niov; i++) {
+                        /* We take the base address on trust */
+                        if (new->md_iov.iov[i].iov_len <= 0) /* invalid length */
+                                return PTL_VAL_FAILED;
+
+                        total_length += new->md_iov.iov[i].iov_len;
+                }
+
+                if (md->length > total_length)
+                        return PTL_IOV_TOO_SMALL;
+                
+                if (nal->cb_map != NULL) {
+                        rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, 
+                                          &new->md_addrkey);
+                        if (rc != PTL_OK)
+                                return (rc);
+                }
+        } else if ((md->options & PTL_MD_KIOV) != 0) {
+#ifndef __KERNEL__
+                return PTL_INV_MD;
+#else
+                int total_length = 0;
+                
+                /* Trap attempts to use paged I/O early if the NAL doesn't support it. */
+                if (nal->cb_send_pages == NULL ||
+                    nal->cb_recv_pages == NULL)
+                        return PTL_INV_MD;
+
+                new->md_niov = md->niov;
+
+                if (nal->cb_read (nal, private, new->md_iov.kiov, md->start,
+                                  md->niov * sizeof (new->md_iov.kiov[0])))
+                        return PTL_SEGV;
+                
+                for (i = 0; i < new->md_niov; i++) {
+                        /* We take the page pointer on trust */
+                        if (new->md_iov.kiov[i].kiov_offset + 
+                            new->md_iov.kiov[i].kiov_len > PAGE_SIZE )
+                                return PTL_VAL_FAILED; /* invalid length */
+
+                        total_length += new->md_iov.kiov[i].kiov_len;
+                }
+
+                if (md->length > total_length)
+                        return PTL_IOV_TOO_SMALL;
+
+                if (nal->cb_map_pages != NULL) {
+                        rc = nal->cb_map_pages (nal, new->md_niov, new->md_iov.kiov, 
+                                                &new->md_addrkey);
+                        if (rc != PTL_OK)
+                                return (rc);
+                }
+#endif
+        } else {   /* contiguous */
+                new->md_niov = 1;
+                new->md_iov.iov[0].iov_base = md->start;
+                new->md_iov.iov[0].iov_len = md->length;
+
+                if (nal->cb_map != NULL) {
+                        rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, 
+                                          &new->md_addrkey);
+                        if (rc != PTL_OK)
+                                return (rc);
+                }
+        } 
+
+        if (eq != NULL)
+                eq->eq_refcount++;
+
+        /* It's good; let handle2md succeed and add to active mds */
+        lib_initialise_handle (nal, &new->md_lh, PTL_COOKIE_TYPE_MD);
+        list_add (&new->md_list, &nal->ni.ni_active_mds);
+
+        return PTL_OK;
+}
+
+/* must be called with state lock held */
+void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md, ptl_md_t * new)
+{
+        /* NB this doesn't copy out all the iov entries, so when a
+         * discontiguous MD is copied out, the target only gets to know
+         * the original iov pointer (in start) and the number of entries
+         * it had.
+         */
+        new->start = md->start;
+        new->length = md->length;
+        new->threshold = md->threshold;
+        new->max_size = md->max_size;
+        new->options = md->options;
+        new->user_ptr = md->user_ptr;
+        ptl_eq2handle(&new->eventq, md->eq);
+        new->niov = ((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0) ? 0 : md->md_niov;
+}
+
+int do_PtlMDAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_me_t current_in
+         *      ptl_md_t md_in
+         *      ptl_unlink_t unlink_in
+         *
+         * Outgoing:
+         *      ptl_handle_md_t         * handle_out
+         */
+
+        PtlMDAttach_in *args = v_args;
+        PtlMDAttach_out *ret = v_ret;
+        lib_me_t *me;
+        lib_md_t *md;
+        unsigned long flags;
+
+        md = lib_md_alloc (nal);
+        if (md == NULL)
+                return (ret->rc = PTL_NOSPACE);
+
+        state_lock(nal, &flags);
+
+        me = ptl_handle2me(&args->me_in, nal);
+        if (me == NULL) {
+                ret->rc = PTL_INV_ME;
+        } else if (me->md != NULL) {
+                ret->rc = PTL_INUSE;
+        } else {
+                ret->rc = lib_md_build(nal, md, private, &args->md_in,
+                                       &args->eq_in, args->unlink_in);
+
+                if (ret->rc == PTL_OK) {
+                        me->md = md;
+                        md->me = me;
+
+                        ptl_md2handle(&ret->handle_out, md);
+
+                        state_unlock (nal, &flags);
+                        return (PTL_OK);
+                }
+        }
+
+        lib_md_free (nal, md);
+
+        state_unlock (nal, &flags);
+        return (ret->rc);
+}
+
+int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_ni_t ni_in
+         *      ptl_md_t md_in
+         *
+         * Outgoing:
+         *      ptl_handle_md_t         * handle_out
+         */
+
+        PtlMDBind_in *args = v_args;
+        PtlMDBind_out *ret = v_ret;
+        lib_md_t *md;
+        unsigned long flags;
+
+        md = lib_md_alloc (nal);
+        if (md == NULL)
+                return (ret->rc = PTL_NOSPACE);
+
+        state_lock(nal, &flags);
+
+        ret->rc = lib_md_build(nal, md, private,
+                               &args->md_in, &args->eq_in, PTL_UNLINK);
+
+        if (ret->rc == PTL_OK) {
+                ptl_md2handle(&ret->handle_out, md);
+
+                state_unlock(nal, &flags);
+                return (PTL_OK);
+        }
+
+        lib_md_free (nal, md);
+
+        state_unlock(nal, &flags);
+        return (ret->rc);
+}
+
+int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMDUnlink_in *args = v_args;
+        PtlMDUnlink_out *ret = v_ret;
+
+        lib_md_t *md;
+        unsigned long flags;
+
+        state_lock(nal, &flags);
+
+        md = ptl_handle2md(&args->md_in, nal);
+        if (md == NULL) {
+                ret->rc = PTL_INV_MD;
+        } else if (md->pending != 0) {           /* being filled/spilled */
+                ret->rc = PTL_MD_INUSE;
+        } else {
+                /* Callers attempting to unlink a busy MD, which will get
+                 * unlinked once the net op completes, should see INUSE
+                 * before completion and INV_MD thereafter.  LASSERT we've
+                 * got that right... */
+                LASSERT ((md->md_flags & PTL_MD_FLAG_UNLINK) == 0);
+
+                lib_md_deconstruct(nal, md, &ret->status_out);
+                lib_md_unlink(nal, md);
+                ret->rc = PTL_OK;
+        }
+
+        state_unlock(nal, &flags);
+
+        return (ret->rc);
+}
+
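+/* Atomically inspect and/or replace an MD: the old state is copied out on
+ * request, and the new descriptor is installed only if the MD is idle and,
+ * when a test EQ is supplied, only if that EQ's sequence number still
+ * matches sequence_in */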
+int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args,
+                            void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_md_t md_in
+         *      ptl_md_t                * old_inout
+         *      ptl_md_t                * new_inout
+         *      ptl_handle_eq_t testq_in
+         *      ptl_seq_t               sequence_in
+         *
+         * Outgoing:
+         *      ptl_md_t                * old_inout
+         *      ptl_md_t                * new_inout
+         */
+        PtlMDUpdate_internal_in *args = v_args;
+        PtlMDUpdate_internal_out *ret = v_ret;
+        lib_md_t *md;
+        lib_eq_t *test_eq = NULL;
+        ptl_md_t *new = &args->new_inout;
+        unsigned long flags;
+
+        state_lock(nal, &flags);
+
+        md = ptl_handle2md(&args->md_in, nal);
+        if (md == NULL) {
+                 ret->rc = PTL_INV_MD;
+                 goto out;
+        }
+
+        if (args->old_inout_valid)
+                lib_md_deconstruct(nal, md, &ret->old_inout);
+
+        if (!args->new_inout_valid) {
+                ret->rc = PTL_OK;
+                goto out;
+        }
+
+        if (!PtlHandleEqual (args->testq_in, PTL_EQ_NONE)) {
+                test_eq = ptl_handle2eq(&args->testq_in, nal);
+                if (test_eq == NULL) {
+                        ret->rc = PTL_INV_EQ;
+                        goto out;
+                }
+        }
+
+        if (md->pending != 0) {
+                        ret->rc = PTL_NOUPDATE;
+                        goto out;
+        }
+
+        if (test_eq == NULL ||
+            test_eq->sequence == args->sequence_in) {
+                lib_me_t *me = md->me;
+
+#warning this does not track eq refcounts properly
+
+                ret->rc = lib_md_build(nal, md, private,
+                                       new, &new->eventq, md->unlink);
+
+                md->me = me;
+        } else {
+                ret->rc = PTL_NOUPDATE;
+        }
+
+ out:
+        state_unlock(nal, &flags);
+        return (ret->rc);
+}
diff --git a/lustre/portals/portals/lib-me.c b/lustre/portals/portals/lib-me.c
new file mode 100644 (file)
index 0000000..bd1af5b
--- /dev/null
@@ -0,0 +1,227 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-me.c
+ * Match Entry management routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __KERNEL__
+# include <stdio.h>
+#else
+# define DEBUG_SUBSYSTEM S_PORTALS
+# include <linux/kp30.h>
+#endif
+
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+static void lib_me_dump(nal_cb_t * nal, lib_me_t * me);
+
+int do_PtlMEAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMEAttach_in *args = v_args;
+        PtlMEAttach_out *ret = v_ret;
+        lib_ni_t *ni = &nal->ni;
+        lib_ptl_t *tbl = &ni->tbl;
+        unsigned long flags;
+        lib_me_t *me;
+
+        if (args->index_in < 0 || args->index_in >= tbl->size)
+                return ret->rc = PTL_INV_PTINDEX;
+
+        /* Should check for valid matchid, but not yet */
+        if (0)
+                return ret->rc = PTL_INV_PROC;
+
+        me = lib_me_alloc (nal);
+        if (me == NULL)
+                return (ret->rc = PTL_NOSPACE);
+
+        state_lock(nal, &flags);
+
+        me->match_id = args->match_id_in;
+        me->match_bits = args->match_bits_in;
+        me->ignore_bits = args->ignore_bits_in;
+        me->unlink = args->unlink_in;
+        me->md = NULL;
+
+        lib_initialise_handle (nal, &me->me_lh, PTL_COOKIE_TYPE_ME);
+
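+        /* PTL_INS_AFTER appends to the portal's match list; anything else
+         * (i.e. PTL_INS_BEFORE) prepends */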
+        if (args->position_in == PTL_INS_AFTER)
+                list_add_tail(&me->me_list, &(tbl->tbl[args->index_in]));
+        else
+                list_add(&me->me_list, &(tbl->tbl[args->index_in]));
+
+        ptl_me2handle(&ret->handle_out, me);
+
+        state_unlock(nal, &flags);
+
+        return ret->rc = PTL_OK;
+}
+
+int do_PtlMEInsert(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMEInsert_in *args = v_args;
+        PtlMEInsert_out *ret = v_ret;
+        unsigned long flags;
+        lib_me_t *me;
+        lib_me_t *new;
+
+        new = lib_me_alloc (nal);
+        if (new == NULL)
+                return (ret->rc = PTL_NOSPACE);
+
+        /* Should check for valid matchid, but not yet */
+
+        state_lock(nal, &flags);
+
+        me = ptl_handle2me(&args->current_in, nal);
+        if (me == NULL) {
+                lib_me_free (nal, new);
+
+                state_unlock (nal, &flags);
+                return (ret->rc = PTL_INV_ME);
+        }
+
+        new->match_id = args->match_id_in;
+        new->match_bits = args->match_bits_in;
+        new->ignore_bits = args->ignore_bits_in;
+        new->unlink = args->unlink_in;
+        new->md = NULL;
+
+        lib_initialise_handle (nal, &new->me_lh, PTL_COOKIE_TYPE_ME);
+
+        if (args->position_in == PTL_INS_AFTER)
+                list_add_tail(&new->me_list, &me->me_list);
+        else
+                list_add(&new->me_list, &me->me_list);
+
+        ptl_me2handle(&ret->handle_out, new);
+
+        state_unlock(nal, &flags);
+
+        return ret->rc = PTL_OK;
+}
+
+int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMEUnlink_in *args = v_args;
+        PtlMEUnlink_out *ret = v_ret;
+        unsigned long flags;
+        lib_me_t *me;
+
+        state_lock(nal, &flags);
+
+        me = ptl_handle2me(&args->current_in, nal);
+        if (me == NULL) {
+                ret->rc = PTL_INV_ME;
+        } else {
+                lib_me_unlink(nal, me);
+                ret->rc = PTL_OK;
+        }
+
+        state_unlock(nal, &flags);
+
+        return (ret->rc);
+}
+
+/* call with state_lock please */
+void lib_me_unlink(nal_cb_t *nal, lib_me_t *me)
+{
+        lib_ni_t *ni = &nal->ni;
+
+        if (ni->debug & PTL_DEBUG_UNLINK) {
+                ptl_handle_any_t handle;
+                ptl_me2handle(&handle, me);
+        }
+
+        list_del (&me->me_list);
+
+        if (me->md) {
+                me->md->me = NULL;
+                lib_md_unlink(nal, me->md);
+        }
+
+        lib_invalidate_handle (nal, &me->me_lh);
+        lib_me_free(nal, me);
+}
+
+int do_PtlTblDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlTblDump_in *args = v_args;
+        PtlTblDump_out *ret = v_ret;
+        lib_ptl_t *tbl = &nal->ni.tbl;
+        ptl_handle_any_t handle;
+        struct list_head *tmp;
+        unsigned long flags;
+
+        if (args->index_in < 0 || args->index_in >= tbl->size)
+                return ret->rc = PTL_INV_PTINDEX;
+
+        nal->cb_printf(nal, "Portal table index %d\n", args->index_in);
+
+        state_lock(nal, &flags);
+        list_for_each(tmp, &(tbl->tbl[args->index_in])) {
+                lib_me_t *me = list_entry(tmp, lib_me_t, me_list);
+                ptl_me2handle(&handle, me);
+                lib_me_dump(nal, me);
+        }
+        state_unlock(nal, &flags);
+
+        return ret->rc = PTL_OK;
+}
+
+int do_PtlMEDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMEDump_in *args = v_args;
+        PtlMEDump_out *ret = v_ret;
+        lib_me_t *me;
+        unsigned long flags;
+
+        state_lock(nal, &flags);
+
+        me = ptl_handle2me(&args->current_in, nal);
+        if (me == NULL) {
+                ret->rc = PTL_INV_ME;
+        } else {
+                lib_me_dump(nal, me);
+                ret->rc = PTL_OK;
+        }
+
+        state_unlock(nal, &flags);
+
+        return ret->rc;
+}
+
+static void lib_me_dump(nal_cb_t * nal, lib_me_t * me)
+{
+        nal->cb_printf(nal, "Match Entry %p ("LPX64")\n", me, 
+                       me->me_lh.lh_cookie);
+
+        nal->cb_printf(nal, "\tMatch/Ignore\t= %016lx / %016lx\n",
+                       me->match_bits, me->ignore_bits);
+
+        nal->cb_printf(nal, "\tMD\t= %p\n", me->md);
+        nal->cb_printf(nal, "\tprev\t= %p\n",
+                       list_entry(me->me_list.prev, lib_me_t, me_list));
+        nal->cb_printf(nal, "\tnext\t= %p\n",
+                       list_entry(me->me_list.next, lib_me_t, me_list));
+}
diff --git a/lustre/portals/portals/lib-move.c b/lustre/portals/portals/lib-move.c
new file mode 100644 (file)
index 0000000..fde4f16
--- /dev/null
@@ -0,0 +1,1379 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-move.c
+ * Data movement routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __KERNEL__
+# include <stdio.h>
+#else
+# define DEBUG_SUBSYSTEM S_PORTALS
+# include <linux/kp30.h>
+#endif
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+/*
+ * Right now it does not check access control lists.
+ *
+ * We only support one MD per ME, which is how the Portals 3.1 spec is written.
+ * All previous complication is removed.
+ */
+
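+/* Walk the match list for portal 'index' looking for an ME whose nid/pid
+ * and match bits accept this operation and whose attached MD is still
+ * active; on success, compute the local offset and matched length and
+ * report whether the MD should now be auto-unlinked */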
+static lib_me_t *
+lib_find_me(nal_cb_t *nal, int index, int op_mask, ptl_nid_t src_nid,
+            ptl_pid_t src_pid, ptl_size_t rlength, ptl_size_t roffset,
+            ptl_match_bits_t match_bits, ptl_size_t *mlength_out,
+            ptl_size_t *offset_out, int *unlink_out)
+{
+        lib_ni_t         *ni = &nal->ni;
+        struct list_head *match_list = &ni->tbl.tbl[index];
+        struct list_head *tmp;
+        lib_me_t         *me;
+        lib_md_t         *md;
+        ptl_size_t        mlength;
+        ptl_size_t        offset;
+
+        ENTRY;
+
+        CDEBUG (D_NET, "Request from "LPU64".%d of length %d into portal %d "
+                "MB="LPX64"\n", src_nid, src_pid, rlength, index, match_bits);
+
+        if (index < 0 || index >= ni->tbl.size) {
+                CERROR("Invalid portal %d not in [0-%d]\n",
+                       index, ni->tbl.size);
+                goto failed;
+        }
+
+        list_for_each (tmp, match_list) {
+                me = list_entry(tmp, lib_me_t, me_list);
+                md = me->md;
+
+                 /* ME attached but MD not attached yet */
+                if (md == NULL)
+                        continue;
+
+                LASSERT (me == md->me);
+
+                /* MD deactivated */
+                if (md->threshold == 0)
+                        continue;
+
+                /* mismatched MD op */
+                if ((md->options & op_mask) == 0)
+                        continue;
+
+                /* mismatched ME nid/pid? */
+                if (me->match_id.nid != PTL_NID_ANY &&
+                    me->match_id.nid != src_nid)
+                        continue;
+
+                if (me->match_id.pid != PTL_PID_ANY &&
+                    me->match_id.pid != src_pid)
+                        continue;
+
+                /* mismatched ME matchbits? */
+                if (((me->match_bits ^ match_bits) & ~me->ignore_bits) != 0)
+                        continue;
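+                /* e.g. match_bits 0x1234 with ignore_bits 0x00ff accepts
+                 * incoming bits 0x12ab, since ((0x1234 ^ 0x12ab) & ~0x00ff)
+                 * == 0 -- only ignored bits differ */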
+
+                /* Hurrah! This _is_ a match; check it out... */
+
+                if ((md->options & PTL_MD_MANAGE_REMOTE) == 0)
+                        offset = md->offset;
+                else
+                        offset = roffset;
+
+                mlength = md->length - offset;
+                if ((md->options & PTL_MD_MAX_SIZE) != 0 &&
+                    mlength > md->max_size)
+                        mlength = md->max_size;
+
+                if (rlength <= mlength) {        /* fits in allowed space */
+                        mlength = rlength;
+                } else if ((md->options & PTL_MD_TRUNCATE) == 0) {
+                        /* this packet _really_ is too big */
+                        CERROR("Matching packet %d too big: %d left, "
+                               "%d allowed\n", rlength, md->length - offset,
+                               mlength);
+                        goto failed;
+                }
+
+                md->offset = offset + mlength;
+
+                *offset_out = offset;
+                *mlength_out = mlength;
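+                /* request auto-unlink once at most max_size bytes of space
+                 * remain in the MD */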
+                *unlink_out = ((md->options & PTL_MD_AUTO_UNLINK) != 0 &&
+                               md->offset >= (md->length - md->max_size));
+                RETURN (me);
+        }
+
+ failed:
+        CERROR (LPU64": Dropping %s from "LPU64".%d portal %d match "LPX64
+                " offset %d length %d: no match\n",
+                ni->nid, (op_mask == PTL_MD_OP_GET) ? "GET" : "PUT",
+                src_nid, src_pid, index, match_bits, roffset, rlength);
+        RETURN(NULL);
+}
+
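+/* Configure simulated failures for a peer: a non-zero threshold adds an
+ * entry that fails the next 'threshold' messages to/from 'nid';
+ * threshold == 0 removes matching entries (PTL_NID_ANY removes them all) */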
+int do_PtlFailNid (nal_cb_t *nal, void *private, void *v_args, void *v_ret)
+{
+        PtlFailNid_in     *args = v_args;
+        PtlFailNid_out    *ret  = v_ret;
+        lib_test_peer_t   *tp;
+        unsigned long      flags;
+        struct list_head  *el;
+        struct list_head  *next;
+        struct list_head   cull;
+        
+        if (args->threshold != 0) {
+                /* Adding a new entry */
+                tp = (lib_test_peer_t *)nal->cb_malloc (nal, sizeof (*tp));
+                if (tp == NULL)
+                        return (ret->rc = PTL_FAIL);
+                
+                tp->tp_nid = args->nid;
+                tp->tp_threshold = args->threshold;
+                
+                state_lock (nal, &flags);
+                list_add (&tp->tp_list, &nal->ni.ni_test_peers);
+                state_unlock (nal, &flags);
+                return (ret->rc = PTL_OK);
+        }
+        
+        /* removing entries */
+        INIT_LIST_HEAD (&cull);
+        
+        state_lock (nal, &flags);
+
+        list_for_each_safe (el, next, &nal->ni.ni_test_peers) {
+                tp = list_entry (el, lib_test_peer_t, tp_list);
+                
+                if (tp->tp_threshold == 0 ||    /* needs culling anyway */
+                    args->nid == PTL_NID_ANY || /* removing all entries */
+                    tp->tp_nid == args->nid)    /* matched this one */
+                {
+                        list_del (&tp->tp_list);
+                        list_add (&tp->tp_list, &cull);
+                }
+        }
+        
+        state_unlock (nal, &flags);
+                
+        while (!list_empty (&cull)) {
+                tp = list_entry (cull.next, lib_test_peer_t, tp_list);
+
+                list_del (&tp->tp_list);
+                nal->cb_free (nal, tp, sizeof (*tp));
+        }
+        return (ret->rc = PTL_OK);
+}
+
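+/* Return nonzero if traffic with 'nid' should be dropped according to the
+ * PtlFailNid table, consuming one unit of the matching entry's threshold;
+ * exhausted entries are only culled on the outgoing path (we may be in
+ * interrupt context on receive) */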
+static int
+fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing) 
+{
+        lib_test_peer_t  *tp;
+        struct list_head *el;
+        struct list_head *next;
+        unsigned long     flags;
+        struct list_head  cull;
+        int               fail = 0;
+
+        INIT_LIST_HEAD (&cull);
+        
+        state_lock (nal, &flags);
+
+        list_for_each_safe (el, next, &nal->ni.ni_test_peers) {
+                tp = list_entry (el, lib_test_peer_t, tp_list);
+
+                if (tp->tp_threshold == 0) {
+                        /* zombie entry */
+                        if (outgoing) {
+                                /* only cull zombies on outgoing tests,
+                                 * since we may be at interrupt priority on
+                                 * incoming messages. */
+                                list_del (&tp->tp_list);
+                                list_add (&tp->tp_list, &cull);
+                        }
+                        continue;
+                }
+                        
+                if (tp->tp_nid == PTL_NID_ANY || /* fail every peer */
+                    nid == tp->tp_nid) {        /* fail this peer */
+                        fail = 1;
+                        
+                        if (tp->tp_threshold != PTL_MD_THRESH_INF) {
+                                tp->tp_threshold--;
+                                if (outgoing &&
+                                    tp->tp_threshold == 0) {
+                                        /* see above */
+                                        list_del (&tp->tp_list);
+                                        list_add (&tp->tp_list, &cull);
+                                }
+                        }
+                        break;
+                }
+        }
+        
+        state_unlock (nal, &flags);
+
+        while (!list_empty (&cull)) {
+                tp = list_entry (cull.next, lib_test_peer_t, tp_list);
+                list_del (&tp->tp_list);
+                
+                nal->cb_free (nal, tp, sizeof (*tp));
+        }
+
+        return (fail);
+}
+
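+/* Total number of bytes ('nob') covered by an iovec array */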
+ptl_size_t
+lib_iov_nob (int niov, struct iovec *iov)
+{
+        ptl_size_t nob = 0;
+        
+        while (niov-- > 0)
+                nob += (iov++)->iov_len;
+        
+        return (nob);
+}
+
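+/* Gather/scatter helpers: copy 'len' bytes between a flat buffer and an
+ * iovec; the iovec must describe at least 'len' bytes */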
+void
+lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len)
+{
+        ptl_size_t nob;
+
+        while (len > 0)
+        {
+                LASSERT (niov > 0);
+                nob = MIN (iov->iov_len, len);
+                memcpy (dest, iov->iov_base, nob);
+
+                len -= nob;
+                dest += nob;
+                niov--;
+                iov++;
+        }
+}
+
+void
+lib_copy_buf2iov (int niov, struct iovec *iov, char *src, ptl_size_t len)
+{
+        ptl_size_t nob;
+
+        while (len > 0)
+        {
+                LASSERT (niov > 0);
+                nob = MIN (iov->iov_len, len);
+                memcpy (iov->iov_base, src, nob);
+                
+                len -= nob;
+                src += nob;
+                niov--;
+                iov++;
+        }
+}
+
+static int
+lib_extract_iov (struct iovec *dst, lib_md_t *md,
+                 ptl_size_t offset, ptl_size_t len)
+{
+        /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+         * for exactly 'len' bytes, and return the number of entries.
+         * NB not destructive to 'src' */
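+        /* e.g. with source fragments of 4096 + 4096 bytes, offset 6000 and
+         * len 1000: the first fragment is skipped and a single dst entry of
+         * 1000 bytes, starting 1904 bytes into the second fragment, is
+         * returned */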
+        int             src_niov = md->md_niov;  
+        struct iovec   *src = md->md_iov.iov;
+        ptl_size_t      frag_len;
+        int             dst_niov;
+
+        LASSERT (len >= 0);
+        LASSERT (offset >= 0);
+        LASSERT (offset + len <= md->length);
+        
+        if (len == 0)                           /* no data => */
+                return (0);                     /* no frags */
+
+        LASSERT (src_niov > 0);
+        while (offset >= src->iov_len) {      /* skip initial frags */
+                offset -= src->iov_len;
+                src_niov--;
+                src++;
+                LASSERT (src_niov > 0);
+        }
+
+        dst_niov = 1;
+        for (;;) {
+                LASSERT (src_niov > 0);
+                LASSERT (dst_niov <= PTL_MD_MAX_IOV);
+                
+                frag_len = src->iov_len - offset;
+                dst->iov_base = ((char *)src->iov_base) + offset;
+
+                if (len <= frag_len) {
+                        dst->iov_len = len;
+                        return (dst_niov);
+                }
+                
+                dst->iov_len = frag_len;
+
+                len -= frag_len;
+                dst++;
+                src++;
+                dst_niov++;
+                src_niov--;
+                offset = 0;
+        }
+}
+
+#ifndef __KERNEL__
+ptl_size_t
+lib_kiov_nob (int niov, ptl_kiov_t *kiov) 
+{
+        LASSERT (0);
+        return (0);
+}
+
+void
+lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
+{
+        LASSERT (0);
+}
+
+void
+lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *dest, ptl_size_t len)
+{
+        LASSERT (0);
+}
+
+static int
+lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
+                  ptl_size_t offset, ptl_size_t len)
+{
+        LASSERT (0);
+        return (0);             /* not reached; quiets the compiler */
+}
+
+#else
+
+ptl_size_t
+lib_kiov_nob (int niov, ptl_kiov_t *kiov) 
+{
+        ptl_size_t  nob = 0;
+
+        while (niov-- > 0)
+                nob += (kiov++)->kiov_len;
+
+        return (nob);
+}
+
+void
+lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len)
+{
+        ptl_size_t  nob;
+        char       *addr;
+        
+        LASSERT (!in_interrupt ());
+        while (len > 0)
+        {
+                LASSERT (niov > 0);
+                nob = MIN (kiov->kiov_len, len);
+                
+                addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                memcpy (dest, addr, nob);
+                kunmap (kiov->kiov_page);
+                
+                len -= nob;
+                dest += nob;
+                niov--;
+                kiov++;
+        }
+}
+
+void
+lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len)
+{
+        ptl_size_t  nob;
+        char       *addr;
+
+        LASSERT (!in_interrupt ());
+        while (len > 0)
+        {
+                LASSERT (niov > 0);
+                nob = MIN (kiov->kiov_len, len);
+                
+                addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+                memcpy (addr, src, nob);
+                kunmap (kiov->kiov_page);
+                
+                len -= nob;
+                src += nob;
+                niov--;
+                kiov++;
+        }
+}
+
+static int
+lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md,
+                  ptl_size_t offset, ptl_size_t len)
+{
+        /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+         * for exactly 'len' bytes, and return the number of entries.
+         * NB not destructive to 'src' */
+        int             src_niov = md->md_niov;  
+        ptl_kiov_t     *src = md->md_iov.kiov;
+        ptl_size_t      frag_len;
+        int             dst_niov;
+
+        LASSERT (len >= 0);
+        LASSERT (offset >= 0);
+        LASSERT (offset + len <= md->length);
+        
+        if (len == 0)                           /* no data => */
+                return (0);                     /* no frags */
+
+        LASSERT (src_niov > 0);
+        while (offset >= src->kiov_len) {      /* skip initial frags */
+                offset -= src->kiov_len;
+                src_niov--;
+                src++;
+                LASSERT (src_niov > 0);
+        }
+
+        dst_niov = 1;
+        for (;;) {
+                LASSERT (src_niov > 0);
+                LASSERT (dst_niov <= PTL_MD_MAX_IOV);
+                
+                frag_len = src->kiov_len - offset;
+                dst->kiov_page = src->kiov_page;
+                dst->kiov_offset = src->kiov_offset + offset;
+
+                if (len <= frag_len) {
+                        dst->kiov_len = len;
+                        LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE);
+                        return (dst_niov);
+                }
+
+                dst->kiov_len = frag_len;
+                LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE);
+
+                len -= frag_len;
+                dst++;
+                src++;
+                dst_niov++;
+                src_niov--;
+                offset = 0;
+        }
+}
+#endif
+
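+/* Deliver the payload of an incoming message into 'md' via the NAL: either
+ * discard it (mlen == 0), receive into the MD's iovec, or receive into its
+ * kiov page fragments */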
+void
+lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
+          ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen)
+{
+        int   niov;
+
+        if (mlen == 0)
+                nal->cb_recv (nal, private, msg, 0, NULL, 0, rlen);
+        else if ((md->options & PTL_MD_KIOV) == 0) {
+                niov = lib_extract_iov (msg->msg_iov.iov, md, offset, mlen);
+                nal->cb_recv (nal, private, msg,
+                              niov, msg->msg_iov.iov, mlen, rlen);
+        } else {
+                niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, mlen);
+                nal->cb_recv_pages (nal, private, msg, 
+                                    niov, msg->msg_iov.kiov, mlen, rlen);
+        }
+}
+
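+/* Transmit 'len' bytes of 'md' starting at 'offset' via the NAL, choosing
+ * the iovec or kiov (page) send path to match the MD's options */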
+int
+lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg,
+          ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+          lib_md_t *md, ptl_size_t offset, ptl_size_t len) 
+{
+        int   niov;
+
+        if (len == 0)
+                return (nal->cb_send (nal, private, msg, 
+                                      hdr, type, nid, pid,
+                                      0, NULL, 0));
+        
+        if ((md->options & PTL_MD_KIOV) == 0) {
+                niov = lib_extract_iov (msg->msg_iov.iov, md, offset, len);
+                return (nal->cb_send (nal, private, msg, 
+                                      hdr, type, nid, pid,
+                                      niov, msg->msg_iov.iov, len));
+        }
+
+        niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, len);
+        return (nal->cb_send_pages (nal, private, msg, 
+                                    hdr, type, nid, pid,
+                                    niov, msg->msg_iov.kiov, len));
+}
+
+static lib_msg_t *
+get_new_msg (nal_cb_t *nal, lib_md_t *md)
+{
+        /* ALWAYS called holding the state_lock */
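+        /* Marks the MD busy: bumps md->pending and consumes one unit of its
+         * threshold (unless infinite), so an exhausted MD stops matching */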
+        lib_counters_t *counters = &nal->ni.counters;
+        lib_msg_t      *msg      = lib_msg_alloc (nal);
+
+        if (msg == NULL)
+                return (NULL);
+
+        memset (msg, 0, sizeof (*msg));
+
+        msg->send_ack = 0;
+
+        msg->md = md;
+        msg->ev.arrival_time = get_cycles();
+        md->pending++;
+        if (md->threshold != PTL_MD_THRESH_INF) {
+                LASSERT (md->threshold > 0);
+                md->threshold--;
+        }
+
+        counters->msgs_alloc++;
+        if (counters->msgs_alloc > counters->msgs_max)
+                counters->msgs_max = counters->msgs_alloc;
+
+        list_add (&msg->msg_list, &nal->ni.ni_active_msgs);
+
+        return (msg);
+}
+
+
+/*
+ * Incoming messages have a ptl_msg_t object associated with them
+ * by the library.  This object encapsulates the state of the
+ * message and allows the NAL to do non-blocking receives or sends
+ * of long messages.
+ *
+ */
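+
+/* Handle an incoming PUT: match it against a posted ME/MD, queue the
+ * optional ACK and event, then receive the payload into the MD */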
+static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        lib_ni_t        *ni = &nal->ni;
+        ptl_size_t       mlength = 0;
+        ptl_size_t       offset = 0;
+        int              unlink = 0;
+        lib_me_t        *me;
+        lib_md_t        *md;
+        lib_msg_t       *msg;
+        unsigned long    flags;
+
+        /* Convert put fields to host byte order */
+        hdr->msg.put.match_bits = NTOH__u64 (hdr->msg.put.match_bits);
+        hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index);
+        hdr->msg.put.offset = NTOH__u32 (hdr->msg.put.offset);
+
+        state_lock(nal, &flags);
+
+        me = lib_find_me(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT,
+                         hdr->src_nid, hdr->src_pid,
+                         PTL_HDR_LENGTH (hdr), hdr->msg.put.offset,
+                         hdr->msg.put.match_bits,
+                         &mlength, &offset, &unlink);
+        if (me == NULL)
+                goto drop;
+
+        md = me->md;
+        CDEBUG(D_NET, "Incoming put index %x from "LPU64"/%u of length %d/%d "
+               "into md "LPX64" [%d] + %d\n", hdr->msg.put.ptl_index,
+               hdr->src_nid, hdr->src_pid, mlength, PTL_HDR_LENGTH(hdr), 
+               md->md_lh.lh_cookie, md->md_niov, offset);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping PUT from "LPU64": can't allocate msg\n",
+                       ni->nid, hdr->src_nid);
+                goto drop;
+        }
+
+        if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
+            !(md->options & PTL_MD_ACK_DISABLE)) {
+                msg->send_ack = 1;
+                msg->ack_wmd = hdr->msg.put.ack_wmd;
+                msg->nid = hdr->src_nid;
+                msg->pid = hdr->src_pid;
+                msg->ev.match_bits = hdr->msg.put.match_bits;
+        }
+
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_PUT;
+                msg->ev.initiator.nid = hdr->src_nid;
+                msg->ev.initiator.pid = hdr->src_pid;
+                msg->ev.portal = hdr->msg.put.ptl_index;
+                msg->ev.match_bits = hdr->msg.put.match_bits;
+                msg->ev.rlength = PTL_HDR_LENGTH(hdr);
+                msg->ev.mlength = mlength;
+                msg->ev.offset = offset;
+                msg->ev.hdr_data = hdr->msg.put.hdr_data;
+
+                /* NB if this match has exhausted the MD, we can't be sure
+                 * that this event will be the last one associated with
+                 * this MD in the event queue (another message already
+                 * matching this ME/MD could end up being last).  So we
+                 * remember the ME handle anyway and check again when we're
+                 * allocating our slot in the event queue.
+                 */
+                ptl_me2handle (&msg->ev.unlinked_me, me);
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        ni->counters.recv_count++;
+        ni->counters.recv_length += mlength;
+
+        /* only unlink after MD's pending count has been bumped
+         * in get_new_msg() otherwise lib_me_unlink() will nuke it */
+        if (unlink) {
+                md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
+                lib_me_unlink (nal, me);
+        }
+
+        state_unlock(nal, &flags);
+
+        lib_recv (nal, private, msg, md, offset, mlength, PTL_HDR_LENGTH (hdr));
+        return 0;
+
+ drop:
+        nal->ni.counters.drop_count++;
+        nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr);
+        state_unlock (nal, &flags);
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return -1;
+}
+
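+/* Handle an incoming GET: match it against a posted ME/MD, record the
+ * event, and send the requested bytes back in a REPLY */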
+static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        lib_ni_t        *ni = &nal->ni;
+        ptl_size_t       mlength = 0;
+        ptl_size_t       offset = 0;
+        int              unlink = 0;
+        lib_me_t        *me;
+        lib_md_t        *md;
+        lib_msg_t       *msg;
+        ptl_hdr_t        reply;
+        unsigned long    flags;
+        int              rc;
+
+        /* Convert get fields to host byte order */
+        hdr->msg.get.match_bits = NTOH__u64 (hdr->msg.get.match_bits);
+        hdr->msg.get.ptl_index = NTOH__u32 (hdr->msg.get.ptl_index);
+        hdr->msg.get.sink_length = NTOH__u32 (hdr->msg.get.sink_length);
+        hdr->msg.get.src_offset = NTOH__u32 (hdr->msg.get.src_offset);
+
+        /* compatibility check until field is deleted */
+        if (hdr->msg.get.return_offset != 0)
+                CERROR("Unexpected non-zero get.return_offset %x from "
+                       LPU64"\n", hdr->msg.get.return_offset, hdr->src_nid);
+
+        state_lock(nal, &flags);
+
+        me = lib_find_me(nal, hdr->msg.get.ptl_index, PTL_MD_OP_GET,
+                         hdr->src_nid, hdr->src_pid,
+                         hdr->msg.get.sink_length, hdr->msg.get.src_offset,
+                         hdr->msg.get.match_bits,
+                         &mlength, &offset, &unlink);
+        if (me == NULL)
+                goto drop;
+
+        md = me->md;
+        CDEBUG(D_NET, "Incoming get index %d from "LPU64".%u of length %d/%d "
+               "from md "LPX64" [%d] + %d\n", hdr->msg.get.ptl_index,
+               hdr->src_nid, hdr->src_pid, mlength, PTL_HDR_LENGTH(hdr), 
+               md->md_lh.lh_cookie, md->md_niov, offset);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping GET from "LPU64": can't allocate msg\n",
+                       ni->nid, hdr->src_nid);
+                goto drop;
+        }
+
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_GET;
+                msg->ev.initiator.nid = hdr->src_nid;
+                msg->ev.initiator.pid = hdr->src_pid;
+                msg->ev.portal = hdr->msg.get.ptl_index;
+                msg->ev.match_bits = hdr->msg.get.match_bits;
+                msg->ev.rlength = PTL_HDR_LENGTH(hdr);
+                msg->ev.mlength = mlength;
+                msg->ev.offset = offset;
+                msg->ev.hdr_data = 0;
+
+                /* NB if this match has exhausted the MD, we can't be sure
+                 * that this event will be the last one associated with
+                 * this MD in the event queue (another message already
+                 * matching this ME/MD could end up being last).  So we
+                 * remember the ME handle anyway and check again when we're
+                 * allocating our slot in the event queue.
+                 */
+                ptl_me2handle (&msg->ev.unlinked_me, me);
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        ni->counters.send_count++;
+        ni->counters.send_length += mlength;
+
+        /* only unlink after MD's pending count has been bumped
+         * in get_new_msg() otherwise lib_me_unlink() will nuke it */
+        if (unlink) {
+                md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
+                lib_me_unlink (nal, me);
+        }
+
+        state_unlock(nal, &flags);
+
+        memset (&reply, 0, sizeof (reply));
+        reply.type     = HTON__u32 (PTL_MSG_REPLY);
+        reply.dest_nid = HTON__u64 (hdr->src_nid);
+        reply.src_nid  = HTON__u64 (ni->nid);
+        reply.dest_pid = HTON__u32 (hdr->src_pid);
+        reply.src_pid  = HTON__u32 (ni->pid);
+        PTL_HDR_LENGTH(&reply) = HTON__u32 (mlength);
+
+        reply.msg.reply.dst_wmd = hdr->msg.get.return_wmd;
+
+        rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY, 
+                       hdr->src_nid, hdr->src_pid, md, offset, mlength);
+        if (rc != 0) {
+                CERROR(LPU64": Dropping GET from "LPU64": send REPLY failed\n",
+                       ni->nid, hdr->src_nid);
+                state_lock (nal, &flags);
+                goto drop;
+        }
+
+        /* Complete the incoming message */
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return (rc);
+ drop:
+        ni->counters.drop_count++;
+        ni->counters.drop_length += hdr->msg.get.sink_length;
+        state_unlock(nal, &flags);
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return -1;
+}
+
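+/* Handle a REPLY to one of our GETs: locate the sink MD by its wire handle
+ * and receive the returned data into it */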
+static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        lib_ni_t        *ni = &nal->ni;
+        lib_md_t        *md;
+        int              rlength;
+        int              length;
+        lib_msg_t       *msg;
+        unsigned long    flags;
+
+        /* compatibility check until field is deleted */
+        if (hdr->msg.reply.dst_offset != 0)
+                CERROR("Unexpected non-zero reply.dst_offset %x from "LPU64"\n",
+                       hdr->msg.reply.dst_offset, hdr->src_nid);
+
+        state_lock(nal, &flags);
+
+        /* NB handles only looked up by creator (no flips) */
+        md = ptl_wire_handle2md(&hdr->msg.reply.dst_wmd, nal);
+        if (md == NULL || md->threshold == 0) {
+                CERROR (LPU64": Dropping REPLY from "LPU64" for %s MD "LPX64"."LPX64"\n",
+                        ni->nid, hdr->src_nid,
+                        md == NULL ? "invalid" : "inactive",
+                        hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                        hdr->msg.reply.dst_wmd.wh_object_cookie);
+                goto drop;
+        }
+
+        LASSERT (md->offset == 0);
+
+        length = rlength = PTL_HDR_LENGTH(hdr);
+
+        if (length > md->length) {
+                if ((md->options & PTL_MD_TRUNCATE) == 0) {
+                        CERROR (LPU64": Dropping REPLY from "LPU64
+                                " length %d for MD "LPX64" would overflow (%d)\n",
+                                ni->nid, hdr->src_nid, length,
+                                hdr->msg.reply.dst_wmd.wh_object_cookie,
+                                md->length);
+                        goto drop;
+                }
+                length = md->length;
+        }
+
+        CDEBUG(D_NET, "Reply from "LPU64" of length %d/%d into md "LPX64"\n",
+               hdr->src_nid, length, rlength, 
+               hdr->msg.reply.dst_wmd.wh_object_cookie);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping REPLY from "LPU64": can't "
+                       "allocate msg\n", ni->nid, hdr->src_nid);
+                goto drop;
+        }
+
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_REPLY;
+                msg->ev.initiator.nid = hdr->src_nid;
+                msg->ev.initiator.pid = hdr->src_pid;
+                msg->ev.rlength = rlength;
+                msg->ev.mlength = length;
+                msg->ev.offset = 0;
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        ni->counters.recv_count++;
+        ni->counters.recv_length += length;
+
+        state_unlock(nal, &flags);
+
+        lib_recv (nal, private, msg, md, 0, length, rlength);
+        return 0;
+
+ drop:
+        nal->ni.counters.drop_count++;
+        nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr);
+        state_unlock (nal, &flags);
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return -1;
+}
+
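+/* Handle an ACK to one of our PUTs: locate the originating MD by its wire
+ * handle and post the ACK event */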
+static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        lib_ni_t *ni = &nal->ni;
+        lib_md_t *md;
+        lib_msg_t *msg = NULL;
+        unsigned long flags;
+
+        /* Convert ack fields to host byte order */
+        hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits);
+        hdr->msg.ack.mlength = NTOH__u32 (hdr->msg.ack.mlength);
+
+        state_lock(nal, &flags);
+
+        /* NB handles only looked up by creator (no flips) */
+        md = ptl_wire_handle2md(&hdr->msg.ack.dst_wmd, nal);
+        if (md == NULL || md->threshold == 0) {
+                CDEBUG(D_INFO, LPU64": Dropping ACK from "LPU64" to %s MD "
+                       LPX64"."LPX64"\n", ni->nid, hdr->src_nid, 
+                       (md == NULL) ? "invalid" : "inactive",
+                       hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                       hdr->msg.ack.dst_wmd.wh_object_cookie);
+                goto drop;
+        }
+
+        CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n",
+               ni->nid, hdr->src_nid, 
+               hdr->msg.ack.dst_wmd.wh_object_cookie);
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR(LPU64": Dropping ACK from "LPU64": can't allocate msg\n",
+                       ni->nid, hdr->src_nid);
+                goto drop;
+        }
+
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_ACK;
+                msg->ev.initiator.nid = hdr->src_nid;
+                msg->ev.initiator.pid = hdr->src_pid;
+                msg->ev.mlength = hdr->msg.ack.mlength;
+                msg->ev.match_bits = hdr->msg.ack.match_bits;
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        ni->counters.recv_count++;
+        state_unlock(nal, &flags);
+        lib_recv (nal, private, msg, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return 0;
+
+ drop:
+        nal->ni.counters.drop_count++;
+        state_unlock (nal, &flags);
+        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+        return -1;
+}
+
+static char *
+hdr_type_string (ptl_hdr_t *hdr)
+{
+        switch (hdr->type) {
+        case PTL_MSG_ACK:
+                return ("ACK");
+        case PTL_MSG_PUT:
+                return ("PUT");
+        case PTL_MSG_GET:
+                return ("GET");
+        case PTL_MSG_REPLY:
+                return ("REPLY");
+        case PTL_MSG_HELLO:
+                return ("HELLO");
+        default:
+                return ("<UNKNOWN>");
+        }
+}
+
+void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr)
+{
+        char *type_str = hdr_type_string (hdr);
+
+        nal->cb_printf(nal, "P3 Header at %p of type %s\n", hdr, type_str);
+        nal->cb_printf(nal, "    From nid/pid %Lu/%Lu", hdr->src_nid,
+                       hdr->src_pid);
+        nal->cb_printf(nal, "    To nid/pid %Lu/%Lu\n", hdr->dest_nid,
+                       hdr->dest_pid);
+
+        switch (hdr->type) {
+        default:
+                break;
+
+        case PTL_MSG_PUT:
+                nal->cb_printf(nal,
+                               "    Ptl index %d, ack md "LPX64"."LPX64", "
+                               "match bits "LPX64"\n",
+                               hdr->msg.put.ptl_index,
+                               hdr->msg.put.ack_wmd.wh_interface_cookie,
+                               hdr->msg.put.ack_wmd.wh_object_cookie,
+                               hdr->msg.put.match_bits);
+                nal->cb_printf(nal,
+                               "    Length %d, offset %d, hdr data "LPX64"\n",
+                               PTL_HDR_LENGTH(hdr), hdr->msg.put.offset,
+                               hdr->msg.put.hdr_data);
+                break;
+
+        case PTL_MSG_GET:
+                nal->cb_printf(nal,
+                               "    Ptl index %d, return md "LPX64"."LPX64", "
+                               "match bits "LPX64"\n", hdr->msg.get.ptl_index,
+                               hdr->msg.get.return_wmd.wh_interface_cookie,
+                               hdr->msg.get.return_wmd.wh_object_cookie,
+                               hdr->msg.get.match_bits);
+                nal->cb_printf(nal,
+                               "    Length %d, src offset %d\n",
+                               hdr->msg.get.sink_length,
+                               hdr->msg.get.src_offset);
+                break;
+
+        case PTL_MSG_ACK:
+                nal->cb_printf(nal, "    dst md "LPX64"."LPX64", "
+                               "manipulated length %d\n",
+                               hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                               hdr->msg.ack.dst_wmd.wh_object_cookie,
+                               hdr->msg.ack.mlength);
+                break;
+
+        case PTL_MSG_REPLY:
+                nal->cb_printf(nal, "    dst md "LPX64"."LPX64", "
+                               "length %d\n",
+                               hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                               hdr->msg.reply.dst_wmd.wh_object_cookie,
+                               PTL_HDR_LENGTH(hdr));
+        }
+
+}                               /* end of print_hdr() */
+
+
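+/* Entry point for every incoming message: convert the common header fields
+ * to host byte order, drop anything not addressed to this NID (or caught by
+ * fail_peer()), then dispatch on the message type */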
+int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
+{
+        unsigned long  flags;
+
+        /* NB static check; optimizer will elide this if it's right */
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.length) ==
+                 offsetof (ptl_hdr_t, msg.put.length));
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.length) ==
+                 offsetof (ptl_hdr_t, msg.get.length));
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.length) ==
+                 offsetof (ptl_hdr_t, msg.reply.length));
+
+        /* convert common fields to host byte order */
+        hdr->dest_nid = NTOH__u64 (hdr->dest_nid);
+        hdr->src_nid = NTOH__u64 (hdr->src_nid);
+        hdr->dest_pid = NTOH__u32 (hdr->dest_pid);
+        hdr->src_pid = NTOH__u32 (hdr->src_pid);
+        hdr->type = NTOH__u32 (hdr->type);
+        PTL_HDR_LENGTH(hdr) = NTOH__u32 (PTL_HDR_LENGTH(hdr));
+#if 0
+        nal->cb_printf(nal, "%d: lib_parse: nal=%p hdr=%p type=%d\n",
+                       nal->ni.nid, nal, hdr, hdr->type);
+        print_hdr(nal, hdr);
+#endif
+        if (hdr->type == PTL_MSG_HELLO) {
+                /* dest_nid is really ptl_magicversion_t */
+                ptl_magicversion_t *mv = (ptl_magicversion_t *)&hdr->dest_nid;
+
+                CERROR (LPU64": Dropping unexpected HELLO message: "
+                        "magic %d, version %d.%d from "LPD64"\n",
+                        nal->ni.nid, mv->magic, 
+                        mv->version_major, mv->version_minor,
+                        hdr->src_nid);
+                lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+                return (-1);
+        }
+        
+        if (hdr->dest_nid != nal->ni.nid) {
+                CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64
+                       " (not me)\n", nal->ni.nid, hdr_type_string (hdr),
+                       hdr->src_nid, hdr->dest_nid);
+
+                state_lock (nal, &flags);
+                nal->ni.counters.drop_count++;
+                nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr);
+                state_unlock (nal, &flags);
+
+                lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+                return (-1);
+        }
+
+        if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
+            fail_peer (nal, hdr->src_nid, 0))      /* shall we now? */
+        {
+                CERROR(LPU64": Dropping incoming %s from "LPU64
+                       ": simulated failure\n",
+                       nal->ni.nid, hdr_type_string (hdr), 
+                       hdr->src_nid);
+                return (-1);
+        }
+        
+        switch (hdr->type) {
+        case PTL_MSG_ACK:
+                return (parse_ack(nal, hdr, private));
+        case PTL_MSG_PUT:
+                return (parse_put(nal, hdr, private));
+        case PTL_MSG_GET:
+                return (parse_get(nal, hdr, private));
+        case PTL_MSG_REPLY:
+                return (parse_reply(nal, hdr, private));
+        default:
+                CERROR(LPU64": Dropping <unknown> message from "LPU64
+                       ": Bad type=0x%x\n",  nal->ni.nid, hdr->src_nid,
+                       hdr->type);
+
+                lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
+                return (-1);
+        }
+}
+
+
+int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_md_t md_in
+         *      ptl_ack_req_t ack_req_in
+         *      ptl_process_id_t target_in
+         *      ptl_pt_index_t portal_in
+         *      ptl_ac_index_t cookie_in
+         *      ptl_match_bits_t match_bits_in
+         *      ptl_size_t offset_in
+         *
+         * Outgoing:
+         */
+
+        PtlPut_in *args = v_args;
+        PtlPut_out *ret = v_ret;
+        ptl_hdr_t hdr;
+
+        lib_ni_t *ni = &nal->ni;
+        lib_md_t *md;
+        lib_msg_t *msg = NULL;
+        ptl_process_id_t *id = &args->target_in;
+        unsigned long flags;
+
+        if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
+            fail_peer (nal, id->nid, 1))           /* shall we now? */
+        {
+                CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n",
+                       nal->ni.nid, id->nid);
+                return (ret->rc = PTL_INV_PROC);
+        }
+        
+        ret->rc = PTL_OK;
+        state_lock(nal, &flags);
+        md = ptl_handle2md(&args->md_in, nal);
+        if (md == NULL || !md->threshold) {
+                state_unlock(nal, &flags);
+                return ret->rc = PTL_INV_MD;
+        }
+
+        CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid,
+               (unsigned long)id->pid);
+
+        memset (&hdr, 0, sizeof (hdr));
+        hdr.type     = HTON__u32 (PTL_MSG_PUT);
+        hdr.dest_nid = HTON__u64 (id->nid);
+        hdr.src_nid  = HTON__u64 (ni->nid);
+        hdr.dest_pid = HTON__u32 (id->pid);
+        hdr.src_pid  = HTON__u32 (ni->pid);
+        PTL_HDR_LENGTH(&hdr) = HTON__u32 (md->length);
+
+        /* NB handles only looked up by creator (no flips) */
+        if (args->ack_req_in == PTL_ACK_REQ) {
+                hdr.msg.put.ack_wmd.wh_interface_cookie = ni->ni_interface_cookie;
+                hdr.msg.put.ack_wmd.wh_object_cookie = md->md_lh.lh_cookie;
+        } else {
+                hdr.msg.put.ack_wmd = PTL_WIRE_HANDLE_NONE;
+        }
+
+        hdr.msg.put.match_bits = HTON__u64 (args->match_bits_in);
+        hdr.msg.put.ptl_index = HTON__u32 (args->portal_in);
+        hdr.msg.put.offset = HTON__u32 (args->offset_in);
+        hdr.msg.put.hdr_data = args->hdr_data_in;
+
+        ni->counters.send_count++;
+        ni->counters.send_length += md->length;
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR("BAD: could not allocate msg!\n");
+                state_unlock(nal, &flags);
+                return ret->rc = PTL_NOSPACE;
+        }
+
+        /*
+         * If this memory descriptor has an event queue associated with
+         * it, we need to allocate a message state object to record the
+         * information about this operation that will be written into the
+         * event queue once the message has completed.
+         *
+         * NB. We're now committed to the PUT, since we just marked the MD
+         * busy.  Callers who observe this (by getting PTL_MD_INUSE from
+         * PtlMDUnlink()) expect a completion event to tell them when the
+         * MD becomes idle.
+         */
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_SENT;
+                msg->ev.initiator.nid = ni->nid;
+                msg->ev.initiator.pid = ni->pid;
+                msg->ev.portal = args->portal_in;
+                msg->ev.match_bits = args->match_bits_in;
+                msg->ev.rlength = md->length;
+                msg->ev.mlength = md->length;
+                msg->ev.offset = args->offset_in;
+                msg->ev.hdr_data = args->hdr_data_in;
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        state_unlock(nal, &flags);
+        
+        lib_send (nal, private, msg, &hdr, PTL_MSG_PUT,
+                  id->nid, id->pid, md, 0, md->length);
+
+        return ret->rc = PTL_OK;
+}
+
+
+int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_md_t md_in
+         *      ptl_process_id_t target_in
+         *      ptl_pt_index_t portal_in
+         *      ptl_ac_index_t cookie_in
+         *      ptl_match_bits_t match_bits_in
+         *      ptl_size_t offset_in
+         *
+         * Outgoing:
+         */
+
+        PtlGet_in *args = v_args;
+        PtlGet_out *ret = v_ret;
+        ptl_hdr_t hdr;
+        lib_msg_t *msg = NULL;
+        lib_ni_t *ni = &nal->ni;
+        ptl_process_id_t *id = &args->target_in;
+        lib_md_t *md;
+        unsigned long flags;
+
+        if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */
+            fail_peer (nal, id->nid, 1))           /* shall we now? */
+        {
+                CERROR(LPU64": Dropping GET to "LPU64": simulated failure\n",
+                       nal->ni.nid, id->nid);
+                return (ret->rc = PTL_INV_PROC);
+        }
+        
+        state_lock(nal, &flags);
+        md = ptl_handle2md(&args->md_in, nal);
+        if (md == NULL || !md->threshold) {
+                state_unlock(nal, &flags);
+                return ret->rc = PTL_INV_MD;
+        }
+
+        LASSERT (md->offset == 0);
+
+        CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid,
+               (unsigned long)id->pid);
+
+        memset (&hdr, 0, sizeof (hdr));
+        hdr.type     = HTON__u32 (PTL_MSG_GET);
+        hdr.dest_nid = HTON__u64 (id->nid);
+        hdr.src_nid  = HTON__u64 (ni->nid);
+        hdr.dest_pid = HTON__u32 (id->pid);
+        hdr.src_pid  = HTON__u32 (ni->pid);
+        PTL_HDR_LENGTH(&hdr) = 0;
+
+        /* NB handles only looked up by creator (no flips) */
+        hdr.msg.get.return_wmd.wh_interface_cookie = ni->ni_interface_cookie;
+        hdr.msg.get.return_wmd.wh_object_cookie = md->md_lh.lh_cookie;
+
+        hdr.msg.get.match_bits = HTON__u64 (args->match_bits_in);
+        hdr.msg.get.ptl_index = HTON__u32 (args->portal_in);
+        hdr.msg.get.src_offset = HTON__u32 (args->offset_in);
+        hdr.msg.get.sink_length = HTON__u32 (md->length);
+
+        ni->counters.send_count++;
+
+        msg = get_new_msg (nal, md);
+        if (msg == NULL) {
+                CERROR("do_PtlGet: BAD - could not allocate msg!\n");
+                state_unlock(nal, &flags);
+                return ret->rc = PTL_NOSPACE;
+        }
+
+        /*
+         * If this memory descriptor has an event queue associated with
+         * it we must allocate a message state object that will record
+         * the information to be filled in once the message has been
+         * completed.  More information is in the do_PtlPut() comments.
+         *
+         * NB. We're now committed to the GET, since we just marked the MD
+         * busy.  Callers who observe this (by getting PTL_MD_INUSE from
+         * PtlMDUnlink()) expect a completion event to tell them when the
+         * MD becomes idle. 
+         */
+        if (md->eq) {
+                msg->ev.type = PTL_EVENT_SENT;
+                msg->ev.initiator.nid = ni->nid;
+                msg->ev.initiator.pid = ni->pid;
+                msg->ev.portal = args->portal_in;
+                msg->ev.match_bits = args->match_bits_in;
+                msg->ev.rlength = md->length;
+                msg->ev.mlength = md->length;
+                msg->ev.offset = args->offset_in;
+                msg->ev.hdr_data = 0;
+
+                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
+        }
+
+        state_unlock(nal, &flags);
+
+        lib_send (nal, private, msg, &hdr, PTL_MSG_GET,
+                  id->nid, id->pid, NULL, 0, 0);
+
+        return ret->rc = PTL_OK;
+}
+
+void lib_assert_wire_constants (void)
+{
+        /* Wire protocol assertions generated by 'wirecheck' */
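+        /* NB these are meant to trip if compiler padding or type sizes ever
+         * drift from the agreed on-the-wire layout */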
+
+        /* Constants... */
+        LASSERT (PORTALS_PROTO_MAGIC == 0xeebc0ded);
+        LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0);
+        LASSERT (PORTALS_PROTO_VERSION_MINOR == 1);
+        LASSERT (PTL_MSG_ACK == 0);
+        LASSERT (PTL_MSG_PUT == 1);
+        LASSERT (PTL_MSG_GET == 2);
+        LASSERT (PTL_MSG_REPLY == 3);
+        LASSERT (PTL_MSG_HELLO == 4);
+
+        /* Checks for struct ptl_handle_wire_t */
+        LASSERT (sizeof (ptl_handle_wire_t) == 16);
+        LASSERT (offsetof (ptl_handle_wire_t, wh_interface_cookie) == 0);
+        LASSERT (sizeof (((ptl_handle_wire_t *)0)->wh_interface_cookie) == 8);
+        LASSERT (offsetof (ptl_handle_wire_t, wh_object_cookie) == 8);
+        LASSERT (sizeof (((ptl_handle_wire_t *)0)->wh_object_cookie) == 8);
+
+        /* Checks for struct ptl_magicversion_t */
+        LASSERT (sizeof (ptl_magicversion_t) == 8);
+        LASSERT (offsetof (ptl_magicversion_t, magic) == 0);
+        LASSERT (sizeof (((ptl_magicversion_t *)0)->magic) == 4);
+        LASSERT (offsetof (ptl_magicversion_t, version_major) == 4);
+        LASSERT (sizeof (((ptl_magicversion_t *)0)->version_major) == 2);
+        LASSERT (offsetof (ptl_magicversion_t, version_minor) == 6);
+        LASSERT (sizeof (((ptl_magicversion_t *)0)->version_minor) == 2);
+
+        /* Checks for struct ptl_hdr_t */
+        LASSERT (sizeof (ptl_hdr_t) == 72);
+        LASSERT (offsetof (ptl_hdr_t, dest_nid) == 0);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->dest_nid) == 8);
+        LASSERT (offsetof (ptl_hdr_t, src_nid) == 8);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->src_nid) == 8);
+        LASSERT (offsetof (ptl_hdr_t, dest_pid) == 16);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->dest_pid) == 4);
+        LASSERT (offsetof (ptl_hdr_t, src_pid) == 20);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->src_pid) == 4);
+        LASSERT (offsetof (ptl_hdr_t, type) == 24);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->type) == 4);
+
+        /* Ack */
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.mlength) == 28);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.ack.mlength) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.dst_wmd) == 32);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.ack.dst_wmd) == 16);
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.match_bits) == 48);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.ack.match_bits) == 8);
+        LASSERT (offsetof (ptl_hdr_t, msg.ack.length) == 56);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.ack.length) == 4);
+
+        /* Put */
+        LASSERT (offsetof (ptl_hdr_t, msg.put.ptl_index) == 28);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.ptl_index) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.put.ack_wmd) == 32);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.ack_wmd) == 16);
+        LASSERT (offsetof (ptl_hdr_t, msg.put.match_bits) == 48);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.match_bits) == 8);
+        LASSERT (offsetof (ptl_hdr_t, msg.put.length) == 56);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.length) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.put.offset) == 60);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.offset) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.put.hdr_data) == 64);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.hdr_data) == 8);
+
+        /* Get */
+        LASSERT (offsetof (ptl_hdr_t, msg.get.ptl_index) == 28);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.ptl_index) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.get.return_wmd) == 32);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.return_wmd) == 16);
+        LASSERT (offsetof (ptl_hdr_t, msg.get.match_bits) == 48);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.match_bits) == 8);
+        LASSERT (offsetof (ptl_hdr_t, msg.get.length) == 56);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.length) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.get.src_offset) == 60);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.src_offset) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.get.return_offset) == 64);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.return_offset) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.get.sink_length) == 68);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.sink_length) == 4);
+
+        /* Reply */
+        LASSERT (offsetof (ptl_hdr_t, msg.reply.dst_wmd) == 32);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.reply.dst_wmd) == 16);
+        LASSERT (offsetof (ptl_hdr_t, msg.reply.dst_offset) == 48);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.reply.dst_offset) == 4);
+        LASSERT (offsetof (ptl_hdr_t, msg.reply.length) == 56);
+        LASSERT (sizeof (((ptl_hdr_t *)0)->msg.reply.length) == 4);
+}
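
lib_assert_wire_constants() pins the byte-for-byte layout of every wire structure, so a node whose compiler laid ptl_hdr_t out differently fails loudly rather than silently corrupting the protocol. A minimal standalone sketch of the same technique, for a hypothetical two-field header (demo_hdr_t and demo_assert_wire_constants are illustrative names, not part of this patch):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical wire header, used only to illustrate the technique. */
typedef struct {
        uint64_t dest_nid;      /* the wire format expects this at offset 0 */
        uint64_t src_nid;       /* ... and this at offset 8 */
} demo_hdr_t;

/* Abort at startup if the compiler inserted padding or chose different
 * member sizes than the wire protocol assumes. */
static void demo_assert_wire_constants(void)
{
        assert(sizeof(demo_hdr_t) == 16);
        assert(offsetof(demo_hdr_t, dest_nid) == 0);
        assert(sizeof(((demo_hdr_t *)0)->dest_nid) == 8);
        assert(offsetof(demo_hdr_t, src_nid) == 8);
        assert(sizeof(((demo_hdr_t *)0)->src_nid) == 8);
}

int main(void)
{
        demo_assert_wire_constants();
        return 0;
}

The generated LASSERTs above apply the same checks to every member of ptl_hdr_t and its per-message-type unions.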
diff --git a/lustre/portals/portals/lib-msg.c b/lustre/portals/portals/lib-msg.c
new file mode 100644 (file)
index 0000000..f10892c
--- /dev/null
@@ -0,0 +1,163 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-msg.c
+ * Message decoding, parsing and finalizing routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __KERNEL__
+# include <stdio.h>
+#else
+# define DEBUG_SUBSYSTEM S_PORTALS
+# include <linux/kp30.h>
+#endif
+
+#include <portals/lib-p30.h>
+
+int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg)
+{
+        lib_md_t     *md;
+        lib_eq_t     *eq;
+        int           rc;
+        unsigned long flags;
+
+        /* ni went down while processing this message */
+        if (nal->ni.up == 0) {
+                return -1;
+        }
+
+        if (msg == NULL)
+                return 0;
+
+        rc = 0;
+        if (msg->send_ack) {
+                ptl_hdr_t ack;
+
+                LASSERT (!ptl_is_wire_handle_none (&msg->ack_wmd));
+
+                memset (&ack, 0, sizeof (ack));
+                ack.type     = HTON__u32 (PTL_MSG_ACK);
+                ack.dest_nid = HTON__u64 (msg->nid);
+                ack.src_nid  = HTON__u64 (nal->ni.nid);
+                ack.dest_pid = HTON__u32 (msg->pid);
+                ack.src_pid  = HTON__u32 (nal->ni.pid);
+                PTL_HDR_LENGTH(&ack) = 0;
+
+                ack.msg.ack.dst_wmd = msg->ack_wmd;
+                ack.msg.ack.match_bits = msg->ev.match_bits;
+                ack.msg.ack.mlength = HTON__u32 (msg->ev.mlength);
+
+                rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK,
+                               msg->nid, msg->pid, NULL, 0, 0);
+        }
+
+        md = msg->md;
+        LASSERT (md->pending > 0);  /* I've not dropped my ref yet */
+        eq = md->eq;
+
+        state_lock(nal, &flags);
+
+        if (eq != NULL) {
+                ptl_event_t  *ev = &msg->ev;
+                ptl_event_t  *eq_slot;
+
+                /* I have to hold the lock while I bump the sequence number
+                 * and copy the event into the queue.  If not, and I was
+                 * interrupted after bumping the sequence number, other
+                 * events could fill the queue, including the slot I just
+                 * allocated to this event.  On resuming, I would overwrite
+                 * a more 'recent' event with old event state, and
+                 * processes taking events off the queue would not detect
+                 * overflow correctly.
+                 */
+
+                ev->sequence = eq->sequence++;/* Allocate the next queue slot */
+
+                /* size must be a power of 2 to handle a wrapped sequence # */
+                LASSERT (eq->size != 0 &&
+                         eq->size == LOWEST_BIT_SET (eq->size));
+                eq_slot = eq->base + (ev->sequence & (eq->size - 1));
+
+                /* Invalidate unlinked_me unless this is the last
+                 * event for an auto-unlinked MD.  Note that if md was
+                 * auto-unlinked, md->pending can only decrease
+                 */
+                if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || /* not auto-unlinked */
+                    md->pending != 1)                       /* not last ref */
+                        ev->unlinked_me = PTL_HANDLE_NONE;
+
+                /* Copy the event into the allocated slot, ensuring all the
+                 * rest of the event's contents have been copied _before_
+                 * the sequence number gets updated.  A process 'getting'
+                 * an event waits on the next queue slot's sequence to be
+                 * 'new'.  When it is, _all_ other event fields had better
+                 * be consistent.  I assert 'sequence' is the last member,
+                 * so I only need a 2 stage copy.
+                 */
+                LASSERT(sizeof (ptl_event_t) ==
+                        offsetof(ptl_event_t, sequence) + sizeof(ev->sequence));
+
+                rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev,
+                                    offsetof (ptl_event_t, sequence));
+                LASSERT (rc == 0);
+
+#ifdef __KERNEL__
+                barrier();
+#endif
+                /* Updating the sequence number is what makes the event 'new' */
+
+                /* cb_write is not necessarily atomic, so this could
+                   cause a race with PtlEQGet */
+                rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence,
+                                   (void *)&ev->sequence,sizeof (ev->sequence));
+                LASSERT (rc == 0);
+
+#ifdef __KERNEL__
+                barrier();
+#endif
+
+                /* I must also ensure that (a) callbacks are made in the
+                 * same order as the events land in the queue, and (b) the
+                 * callback occurs before the event can be removed from the
+                 * queue, so I can't drop the lock during the callback. */
+                if (nal->cb_callback != NULL)
+                        nal->cb_callback(nal, private, eq, ev);
+                else  if (eq->event_callback != NULL)
+                        (void)((eq->event_callback) (ev));
+        }
+
+        LASSERT ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 ||
+                 (md->md_flags & PTL_MD_FLAG_UNLINK) != 0);
+
+        md->pending--;
+        if (md->pending == 0 && /* no more outstanding operations on this md */
+            (md->threshold == 0 ||              /* done its business */
+             (md->md_flags & PTL_MD_FLAG_UNLINK) != 0)) /* marked for death */
+                lib_md_unlink(nal, md);
+
+        list_del (&msg->msg_list);
+        nal->ni.counters.msgs_alloc--;
+        lib_msg_free(nal, msg);
+
+        state_unlock(nal, &flags);
+
+        return rc;
+}
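
lib_finalize() publishes each event in two stages: it copies everything except the sequence number into the ring slot, issues a barrier, and only then writes the sequence number, which is what makes the slot 'new' to readers. A consumer-side sketch built on those assumptions (power-of-two ring, sequence stored last); demo_event_t and demo_eq_get are illustrative names, not part of this patch:

#include <stdint.h>
#include <string.h>

/* Illustrative event: 'sequence' must be the last member, exactly as
 * lib_finalize() asserts for ptl_event_t. */
typedef struct {
        uint64_t payload;       /* stands in for the rest of the event */
        uint64_t sequence;      /* written last by the producer */
} demo_event_t;

/* Poll the slot that should hold sequence number 'next_seq' in a
 * power-of-two ring.  Returns 1 and copies the event out once the
 * producer has published it. */
static int demo_eq_get(const demo_event_t *ring, unsigned int size,
                       uint64_t next_seq, demo_event_t *ev_out)
{
        const demo_event_t *slot = &ring[next_seq & (size - 1)];

        if (slot->sequence != next_seq)
                return 0;               /* not published yet (or overflowed) */

        /* The other fields are safe to read because the producer wrote
         * them before updating 'sequence' (with a barrier in between). */
        memcpy(ev_out, slot, sizeof(*ev_out));
        return 1;
}

A real consumer also re-checks the sequence after copying, so that a producer lapping the ring (queue overflow) is detected rather than returning a half-overwritten event.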
diff --git a/lustre/portals/portals/lib-ni.c b/lustre/portals/portals/lib-ni.c
new file mode 100644 (file)
index 0000000..aa30329
--- /dev/null
@@ -0,0 +1,128 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-ni.c
+ * Network status registers and distance functions.
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *  Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+#define MAX_DIST 18446744073709551615UL
+
+int do_PtlNIDebug(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlNIDebug_in *args = v_args;
+        PtlNIDebug_out *ret = v_ret;
+        lib_ni_t *ni = &nal->ni;
+
+        ret->rc = ni->debug;
+        ni->debug = args->mask_in;
+
+        return 0;
+}
+
+int do_PtlNIStatus(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_ni_t interface_in
+         *      ptl_sr_index_t register_in
+         *
+         * Outgoing:
+         *      ptl_sr_value_t          * status_out
+         */
+
+        PtlNIStatus_in *args = v_args;
+        PtlNIStatus_out *ret = v_ret;
+        lib_ni_t *ni = &nal->ni;
+        lib_counters_t *count = &ni->counters;
+
+        if (!args)
+                return ret->rc = PTL_SEGV;
+
+        ret->rc = PTL_OK;
+        ret->status_out = 0;
+
+        /*
+         * I hate this sort of code....  Hash tables, offset lists?
+         * Treat the counters as an array of ints?
+         */
+        if (args->register_in == PTL_SR_DROP_COUNT)
+                ret->status_out = count->drop_count;
+
+        else if (args->register_in == PTL_SR_DROP_LENGTH)
+                ret->status_out = count->drop_length;
+
+        else if (args->register_in == PTL_SR_RECV_COUNT)
+                ret->status_out = count->recv_count;
+
+        else if (args->register_in == PTL_SR_RECV_LENGTH)
+                ret->status_out = count->recv_length;
+
+        else if (args->register_in == PTL_SR_SEND_COUNT)
+                ret->status_out = count->send_count;
+
+        else if (args->register_in == PTL_SR_SEND_LENGTH)
+                ret->status_out = count->send_length;
+
+        else if (args->register_in == PTL_SR_MSGS_MAX)
+                ret->status_out = count->msgs_max;
+        else
+                ret->rc = PTL_INV_SR_INDX;
+
+        return ret->rc;
+}
+
+
+int do_PtlNIDist(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_ni_t interface_in
+         *      ptl_process_id_t process_in
+
+         *
+         * Outgoing:
+         *      unsigned long   * distance_out
+
+         */
+
+        PtlNIDist_in *args = v_args;
+        PtlNIDist_out *ret = v_ret;
+
+        unsigned long dist;
+        ptl_process_id_t id_in = args->process_in;
+        ptl_nid_t nid;
+        int rc;
+
+        nid = id_in.nid;
+
+        if ((rc = nal->cb_dist(nal, nid, &dist)) != 0) {
+                ret->distance_out = (unsigned long) MAX_DIST;
+                return (ret->rc = PTL_INV_PROC);
+        }
+
+        ret->distance_out = dist;
+
+        return ret->rc = PTL_OK;
+}
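
The comment in do_PtlNIStatus() muses about replacing the if/else ladder with an offset list. A sketch of that alternative; demo_counters, demo_sr_offset and demo_ni_status are illustrative, since the real register indices and lib_counters_t fields are defined in the portals headers:

#include <stddef.h>

struct demo_counters {
        unsigned int drop_count;
        unsigned int recv_count;
        unsigned int send_count;
};

/* One table entry per status register, holding the offset of the
 * corresponding counter field. */
static const size_t demo_sr_offset[] = {
        offsetof(struct demo_counters, drop_count),
        offsetof(struct demo_counters, recv_count),
        offsetof(struct demo_counters, send_count),
};

/* Return the selected counter, or -1 for an out-of-range register. */
static long demo_ni_status(const struct demo_counters *c, unsigned int reg)
{
        if (reg >= sizeof(demo_sr_offset) / sizeof(demo_sr_offset[0]))
                return -1;

        return *(const unsigned int *)((const char *)c + demo_sr_offset[reg]);
}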
diff --git a/lustre/portals/portals/lib-pid.c b/lustre/portals/portals/lib-pid.c
new file mode 100644 (file)
index 0000000..12eebb5
--- /dev/null
@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-pid.c
+ *
+ * Process identification routines
+ * Copyright (C) 2001-2003 Cluster File Systems, Inc.
+ *
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* This should be removed.  The NAL should have the PID information */
+#define DEBUG_SUBSYSTEM S_PORTALS
+
+#if defined (__KERNEL__)
+#       include <linux/kernel.h>
+extern int getpid(void);
+#else
+#       include <stdio.h>
+#       include <unistd.h>
+#endif
+#include <portals/lib-p30.h>
+#include <portals/arg-blocks.h>
+
+int do_PtlGetId(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        /*
+         * Incoming:
+         *      ptl_handle_ni_t handle_in
+         *
+         * Outgoing:
+         *      ptl_process_id_t        * id_out
+         *      ptl_id_t                * gsize_out
+         */
+
+        PtlGetId_out *ret = v_ret;
+        lib_ni_t *ni = &nal->ni;
+
+        ret->id_out.nid = ni->nid;
+        ret->id_out.pid = ni->pid;
+
+        return ret->rc = PTL_OK;
+}
diff --git a/lustre/portals/router/.cvsignore b/lustre/portals/router/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lustre/portals/router/Makefile.am b/lustre/portals/router/Makefile.am
new file mode 100644 (file)
index 0000000..1c8087b
--- /dev/null
@@ -0,0 +1,16 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Rules.linux
+
+MODULE = kptlrouter
+modulenet_DATA = kptlrouter.o
+EXTRA_PROGRAMS = kptlrouter
+
+
+#CFLAGS:= @KCFLAGS@ 
+#CPPFLAGS:=@KCPPFLAGS@
+DEFS =
+kptlrouter_SOURCES = router.c proc.c router.h
diff --git a/lustre/portals/router/Makefile.mk b/lustre/portals/router/Makefile.mk
new file mode 100644 (file)
index 0000000..64bd09b
--- /dev/null
@@ -0,0 +1,9 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Kernelenv
+
+obj-y += kptlrouter.o
+kptlrouter-objs    := router.o proc.o
diff --git a/lustre/portals/router/proc.c b/lustre/portals/router/proc.c
new file mode 100644 (file)
index 0000000..dd65b34
--- /dev/null
@@ -0,0 +1,78 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "router.h"
+
+#define KPR_PROC_ROUTER "sys/portals/router"
+
+int
+kpr_proc_read (char *page, char **start, off_t off, int count, int *eof, void *data)
+{
+       unsigned long long bytes = kpr_fwd_bytes;
+       unsigned long      packets = kpr_fwd_packets;
+       unsigned long      errors = kpr_fwd_errors;
+        unsigned int       qdepth = atomic_read (&kpr_queue_depth);
+       int                len;
+       
+       *eof = 1;
+       if (off != 0)
+               return (0);
+       
+       len = sprintf (page, "%Ld %ld %ld %d\n", bytes, packets, errors, qdepth);
+       
+       *start = page;
+       return (len);
+}
+
+int
+kpr_proc_write (struct file *file, const char *ubuffer, unsigned long count, void *data)
+{
+       /* Ignore what we've been asked to write, and just zero the stats counters */
+       kpr_fwd_bytes = 0;
+       kpr_fwd_packets = 0;
+       kpr_fwd_errors = 0;
+
+       return (count);
+}
+
+void
+kpr_proc_init(void)
+{
+        struct proc_dir_entry *entry = create_proc_entry (KPR_PROC_ROUTER, S_IFREG | S_IRUGO | S_IWUSR, NULL);
+
+        if (entry == NULL) 
+       {
+                CERROR("couldn't create proc entry %s\n", KPR_PROC_ROUTER);
+                return;
+        }
+
+        entry->data = NULL;
+        entry->read_proc = kpr_proc_read;
+       entry->write_proc = kpr_proc_write;
+}
+
+void 
+kpr_proc_fini(void)
+{
+        remove_proc_entry(KPR_PROC_ROUTER, 0);
+}
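
kpr_proc_read() reports the forwarding statistics as a single "bytes packets errors qdepth" line, and any write to the file simply zeroes the counters. A userspace sketch that parses that line, assuming the entry appears at /proc/sys/portals/router as created by kpr_proc_init():

#include <stdio.h>

int main(void)
{
        unsigned long long bytes;
        unsigned long packets, errors;
        unsigned int qdepth;
        FILE *f = fopen("/proc/sys/portals/router", "r");

        if (f == NULL) {
                perror("fopen");
                return 1;
        }
        if (fscanf(f, "%llu %lu %lu %u",
                   &bytes, &packets, &errors, &qdepth) != 4) {
                fprintf(stderr, "unexpected /proc format\n");
                fclose(f);
                return 1;
        }
        fclose(f);

        printf("forwarded %llu bytes in %lu packets (%lu errors, %u queued)\n",
               bytes, packets, errors, qdepth);
        return 0;
}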
diff --git a/lustre/portals/router/router.c b/lustre/portals/router/router.c
new file mode 100644 (file)
index 0000000..6074c3c
--- /dev/null
@@ -0,0 +1,449 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "router.h"
+
+struct list_head kpr_routes;
+struct list_head kpr_nals;
+
+unsigned long long kpr_fwd_bytes;
+unsigned long      kpr_fwd_packets;
+unsigned long      kpr_fwd_errors;
+atomic_t           kpr_queue_depth;
+
+/* Mostly the tables are read-only (thread and interrupt context)
+ *
+ * Once in a blue moon we register/deregister NALs and add/remove routing
+ * entries (thread context only)... */
+rwlock_t         kpr_rwlock;
+
+kpr_router_interface_t kpr_router_interface = {
+       kprri_register:         kpr_register_nal,
+       kprri_lookup:           kpr_lookup_target,
+       kprri_fwd_start:        kpr_forward_packet,
+       kprri_fwd_done:         kpr_complete_packet,
+       kprri_shutdown:         kpr_shutdown_nal,
+       kprri_deregister:       kpr_deregister_nal,
+};
+
+kpr_control_interface_t kpr_control_interface = {
+       kprci_add_route:        kpr_add_route,
+       kprci_del_route:        kpr_del_route,
+       kprci_get_route:        kpr_get_route,
+};
+
+int
+kpr_register_nal (kpr_nal_interface_t *nalif, void **argp)
+{
+       long               flags;
+       struct list_head  *e;
+       kpr_nal_entry_t   *ne;
+
+        CDEBUG (D_OTHER, "Registering NAL %d\n", nalif->kprni_nalid);
+
+       PORTAL_ALLOC (ne, sizeof (*ne));
+       if (ne == NULL)
+               return (-ENOMEM);
+
+       memset (ne, 0, sizeof (*ne));
+        memcpy ((void *)&ne->kpne_interface, (void *)nalif, sizeof (*nalif));
+
+       LASSERT (!in_interrupt());
+       write_lock_irqsave (&kpr_rwlock, flags);
+
+       for (e = kpr_nals.next; e != &kpr_nals; e = e->next)
+       {
+               kpr_nal_entry_t *ne2 = list_entry (e, kpr_nal_entry_t, kpne_list);
+
+               if (ne2->kpne_interface.kprni_nalid == ne->kpne_interface.kprni_nalid)
+               {
+                       write_unlock_irqrestore (&kpr_rwlock, flags);
+
+                       CERROR ("Attempt to register same NAL %d twice\n", ne->kpne_interface.kprni_nalid);
+
+                       PORTAL_FREE (ne, sizeof (*ne));
+                       return (-EEXIST);
+               }
+       }
+
+        list_add (&ne->kpne_list, &kpr_nals);
+
+       write_unlock_irqrestore (&kpr_rwlock, flags);
+
+       *argp = ne;
+       PORTAL_MODULE_USE;
+        return (0);
+}
+
+void
+kpr_shutdown_nal (void *arg)
+{
+       long             flags;
+       kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
+
+        CDEBUG (D_OTHER, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid);
+
+       LASSERT (!ne->kpne_shutdown);
+       LASSERT (!in_interrupt());
+
+       write_lock_irqsave (&kpr_rwlock, flags); /* locking a bit spurious... */
+       ne->kpne_shutdown = 1;
+       write_unlock_irqrestore (&kpr_rwlock, flags); /* except it's a memory barrier */
+
+       while (atomic_read (&ne->kpne_refcount) != 0)
+       {
+               CDEBUG (D_NET, "Waiting for refcount on NAL %d to reach zero (%d)\n",
+                       ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount));
+
+               set_current_state (TASK_UNINTERRUPTIBLE);
+               schedule_timeout (HZ);
+       }
+}
+
+void
+kpr_deregister_nal (void *arg)
+{
+       long              flags;
+       kpr_nal_entry_t  *ne = (kpr_nal_entry_t *)arg;
+
+        CDEBUG (D_OTHER, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid);
+
+       LASSERT (ne->kpne_shutdown);            /* caller must have issued shutdown already */
+       LASSERT (atomic_read (&ne->kpne_refcount) == 0); /* can't be busy */
+       LASSERT (!in_interrupt());
+
+       write_lock_irqsave (&kpr_rwlock, flags);
+
+       list_del (&ne->kpne_list);
+
+       write_unlock_irqrestore (&kpr_rwlock, flags);
+
+       PORTAL_FREE (ne, sizeof (*ne));
+        PORTAL_MODULE_UNUSE;
+}
+
+
+int
+kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp)
+{
+       kpr_nal_entry_t  *ne = (kpr_nal_entry_t *)arg;
+       struct list_head *e;
+       int               rc = -ENOENT;
+
+        CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d\n", target_nid, ne->kpne_interface.kprni_nalid);
+
+       if (ne->kpne_shutdown)          /* caller is shutting down */
+               return (-ENOENT);
+
+       read_lock (&kpr_rwlock);
+
+       /* Search routes for one that has a gateway to target_nid on the callers network */
+
+       for (e = kpr_routes.next; e != &kpr_routes; e = e->next)
+       {
+               kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list);
+
+               if (re->kpre_lo_nid > target_nid ||
+                    re->kpre_hi_nid < target_nid)
+                       continue;
+
+               /* found table entry */
+
+               if (re->kpre_gateway_nalid != ne->kpne_interface.kprni_nalid) /* different NAL */
+                       rc = -EHOSTUNREACH;
+               else
+               {
+                       rc = 0;
+                       *gateway_nidp = re->kpre_gateway_nid;
+               }
+               break;
+       }
+
+       read_unlock (&kpr_rwlock);
+
+        CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d: %d ("LPX64")\n",
+                target_nid, ne->kpne_interface.kprni_nalid, rc,
+                (rc == 0) ? *gateway_nidp : (ptl_nid_t)0);
+       return (rc);
+}
+
+void
+kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd)
+{
+       kpr_nal_entry_t  *src_ne = (kpr_nal_entry_t *)arg;
+       ptl_nid_t         target_nid = fwd->kprfd_target_nid;
+        int               nob = fwd->kprfd_nob;
+       struct list_head *e;
+
+        CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d\n", fwd,
+                target_nid, src_ne->kpne_interface.kprni_nalid);
+
+        LASSERT (nob >= sizeof (ptl_hdr_t)); /* at least got a packet header */
+        LASSERT (nob == lib_iov_nob (fwd->kprfd_niov, fwd->kprfd_iov));
+        
+        atomic_inc (&kpr_queue_depth);
+       atomic_inc (&src_ne->kpne_refcount); /* source nal is busy until fwd completes */
+
+        kpr_fwd_packets++;                   /* (loose) stats accounting */
+        kpr_fwd_bytes += nob;
+
+       if (src_ne->kpne_shutdown)           /* caller is shutting down */
+               goto out;
+
+       fwd->kprfd_router_arg = src_ne;      /* stash caller's nal entry */
+
+       read_lock (&kpr_rwlock);
+
+       /* Search routes for one that has a gateway to target_nid NOT on the caller's network */
+
+       for (e = kpr_routes.next; e != &kpr_routes; e = e->next)
+       {
+               kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list);
+
+               if (re->kpre_lo_nid > target_nid || /* no match */
+                    re->kpre_hi_nid < target_nid)
+                       continue;
+
+                CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: match "LPX64" on NAL %d\n", fwd,
+                        target_nid, src_ne->kpne_interface.kprni_nalid,
+                        re->kpre_gateway_nid, re->kpre_gateway_nalid);
+
+               if (re->kpre_gateway_nalid == src_ne->kpne_interface.kprni_nalid)
+                       break;                  /* don't route to same NAL */
+
+               /* Search for gateway's NAL's entry */
+
+               for (e = kpr_nals.next; e != &kpr_nals; e = e->next)
+               {
+                       kpr_nal_entry_t *dst_ne = list_entry (e, kpr_nal_entry_t, kpne_list);
+
+                       if (re->kpre_gateway_nalid != dst_ne->kpne_interface.kprni_nalid) /* no match */
+                               continue;
+
+                       if (dst_ne->kpne_shutdown) /* don't route if NAL is shutting down */
+                               break;
+
+                       fwd->kprfd_gateway_nid = re->kpre_gateway_nid;
+                       atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */
+
+                       read_unlock (&kpr_rwlock);
+
+                        CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: "LPX64" on NAL %d\n", fwd,
+                                target_nid, src_ne->kpne_interface.kprni_nalid,
+                                fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid);
+
+                       dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd);
+                       return;
+               }
+               break;
+       }
+
+       read_unlock (&kpr_rwlock);
+ out:
+        kpr_fwd_errors++;
+
+        CDEBUG (D_OTHER, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd,
+                target_nid, src_ne->kpne_interface.kprni_nalid);
+
+       /* Can't find anywhere to forward to */
+       (fwd->kprfd_callback)(fwd->kprfd_callback_arg, -EHOSTUNREACH);
+
+        atomic_dec (&kpr_queue_depth);
+       atomic_dec (&src_ne->kpne_refcount);
+}
+
+void
+kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error)
+{
+       kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg;
+       kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg;
+
+        CDEBUG (D_OTHER, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd,
+                src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error);
+
+       atomic_dec (&dst_ne->kpne_refcount);    /* CAVEAT EMPTOR dst_ne can disappear now!!! */
+
+       (fwd->kprfd_callback)(fwd->kprfd_callback_arg, error);
+
+        CDEBUG (D_OTHER, "complete(2) [%p] from NAL %d: %d\n", fwd,
+                src_ne->kpne_interface.kprni_nalid, error);
+
+        atomic_dec (&kpr_queue_depth);
+       atomic_dec (&src_ne->kpne_refcount);    /* CAVEAT EMPTOR src_ne can disappear now!!! */
+}
+
+int
+kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid,
+               ptl_nid_t hi_nid)
+{
+       long               flags;
+       struct list_head  *e;
+       kpr_route_entry_t *re;
+
+        CDEBUG(D_OTHER, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n",
+               gateway_nalid, gateway_nid, lo_nid, hi_nid);
+
+        LASSERT(lo_nid <= hi_nid);
+
+        PORTAL_ALLOC (re, sizeof (*re));
+        if (re == NULL)
+                return (-ENOMEM);
+
+        re->kpre_gateway_nalid = gateway_nalid;
+        re->kpre_gateway_nid = gateway_nid;
+        re->kpre_lo_nid = lo_nid;
+        re->kpre_hi_nid = hi_nid;
+
+        LASSERT(!in_interrupt());
+       write_lock_irqsave (&kpr_rwlock, flags);
+
+        for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
+                kpr_route_entry_t *re2 = list_entry(e, kpr_route_entry_t,
+                                                    kpre_list);
+
+                if (re->kpre_lo_nid > re2->kpre_hi_nid ||
+                    re->kpre_hi_nid < re2->kpre_lo_nid)
+                        continue;
+
+                CERROR ("Attempt to add duplicate routes ["LPX64" - "LPX64"]"
+                        "to ["LPX64" - "LPX64"]\n",
+                        re->kpre_lo_nid, re->kpre_hi_nid,
+                        re2->kpre_lo_nid, re2->kpre_hi_nid);
+
+                write_unlock_irqrestore (&kpr_rwlock, flags);
+
+                PORTAL_FREE (re, sizeof (*re));
+                return (-EINVAL);
+        }
+
+        list_add (&re->kpre_list, &kpr_routes);
+
+        write_unlock_irqrestore (&kpr_rwlock, flags);
+        return (0);
+}
+
+int
+kpr_del_route (ptl_nid_t nid)
+{
+       long               flags;
+       struct list_head  *e;
+
+        CDEBUG(D_OTHER, "Del route "LPX64"\n", nid);
+
+        LASSERT(!in_interrupt());
+       write_lock_irqsave(&kpr_rwlock, flags);
+
+        for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
+                kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t,
+                                                   kpre_list);
+
+                if (re->kpre_lo_nid > nid || re->kpre_hi_nid < nid)
+                        continue;
+
+                list_del (&re->kpre_list);
+                write_unlock_irqrestore(&kpr_rwlock, flags);
+
+                PORTAL_FREE(re, sizeof (*re));
+                return (0);
+        }
+
+        write_unlock_irqrestore(&kpr_rwlock, flags);
+        return (-ENOENT);
+}
+
+int
+kpr_get_route(int idx, int *gateway_nalid, ptl_nid_t *gateway_nid,
+              ptl_nid_t *lo_nid, ptl_nid_t *hi_nid)
+{
+       struct list_head  *e;
+
+       read_lock(&kpr_rwlock);
+
+        for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
+                kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t,
+                                                   kpre_list);
+
+                if (idx-- == 0) {
+                        *gateway_nalid = re->kpre_gateway_nalid;
+                        *gateway_nid = re->kpre_gateway_nid;
+                        *lo_nid = re->kpre_lo_nid;
+                        *hi_nid = re->kpre_hi_nid;
+
+                        read_unlock(&kpr_rwlock);
+                        return (0);
+                }
+        }
+
+        read_unlock (&kpr_rwlock);
+        return (-ENOENT);
+}
+
+static void __exit
+kpr_finalise (void)
+{
+        LASSERT (list_empty (&kpr_nals));
+
+        while (!list_empty (&kpr_routes)) {
+                kpr_route_entry_t *re = list_entry(kpr_routes.next,
+                                                   kpr_route_entry_t,
+                                                   kpre_list);
+
+                list_del(&re->kpre_list);
+                PORTAL_FREE(re, sizeof (*re));
+        }
+
+        kpr_proc_fini();
+
+        PORTAL_SYMBOL_UNREGISTER(kpr_router_interface);
+        PORTAL_SYMBOL_UNREGISTER(kpr_control_interface);
+
+        CDEBUG(D_MALLOC, "kpr_finalise: kmem back to %d\n",
+               atomic_read(&portal_kmemory));
+}
+
+static int __init
+kpr_initialise (void)
+{
+        CDEBUG(D_MALLOC, "kpr_initialise: kmem %d\n",
+               atomic_read(&portal_kmemory));
+
+       rwlock_init(&kpr_rwlock);
+       INIT_LIST_HEAD(&kpr_routes);
+       INIT_LIST_HEAD(&kpr_nals);
+
+        kpr_proc_init();
+
+        PORTAL_SYMBOL_REGISTER(kpr_router_interface);
+        PORTAL_SYMBOL_REGISTER(kpr_control_interface);
+        return (0);
+}
+
+MODULE_AUTHOR("Eric Barton");
+MODULE_DESCRIPTION("Kernel Portals Router v0.01");
+MODULE_LICENSE("GPL");
+
+module_init (kpr_initialise);
+module_exit (kpr_finalise);
+
+EXPORT_SYMBOL (kpr_control_interface);
+EXPORT_SYMBOL (kpr_router_interface);
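
kpr_lookup_target() and kpr_forward_packet() both walk the route list for the first entry whose inclusive [kpre_lo_nid, kpre_hi_nid] range covers the target NID, and only use it when the gateway sits on an acceptable NAL. A standalone sketch of that matching rule; demo_route, demo_lookup and the sample table are illustrative, not taken from this patch:

#include <stdint.h>
#include <stdio.h>

struct demo_route {
        int      gateway_nalid;
        uint64_t gateway_nid;
        uint64_t lo_nid;
        uint64_t hi_nid;
};

/* First route whose [lo, hi] range covers 'target' wins; it is only
 * usable if its gateway sits on the caller's NAL. */
static int demo_lookup(const struct demo_route *tab, int n,
                       int my_nalid, uint64_t target, uint64_t *gateway)
{
        int i;

        for (i = 0; i < n; i++) {
                if (tab[i].lo_nid > target || tab[i].hi_nid < target)
                        continue;               /* range misses the target */
                if (tab[i].gateway_nalid != my_nalid)
                        return -1;              /* gateway on another NAL */
                *gateway = tab[i].gateway_nid;
                return 0;
        }
        return -1;                              /* no route at all */
}

int main(void)
{
        const struct demo_route tab[] = {
                { 1, 0x10, 0x100, 0x1ff },      /* NIDs 0x100-0x1ff via 0x10 */
                { 2, 0x20, 0x200, 0x2ff },      /* NIDs 0x200-0x2ff via 0x20 */
        };
        uint64_t gw;

        if (demo_lookup(tab, 2, 1, 0x123, &gw) == 0)
                printf("0x123 routes via gateway 0x%llx\n",
                       (unsigned long long)gw);
        return 0;
}

kpr_add_route() rejects any new entry whose range overlaps an existing one, so at most one entry can ever match a given NID.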
diff --git a/lustre/portals/router/router.h b/lustre/portals/router/router.h
new file mode 100644 (file)
index 0000000..b8c3bec
--- /dev/null
@@ -0,0 +1,81 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef _KPTLROUTER_H
+#define _KPTLROUTER_H
+#define EXPORT_SYMTAB
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+
+#define DEBUG_SUBSYSTEM S_PTLROUTER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+typedef struct
+{
+       struct list_head        kpne_list;
+       kpr_nal_interface_t     kpne_interface;
+       atomic_t                kpne_refcount;
+       int                     kpne_shutdown;
+} kpr_nal_entry_t;
+
+typedef struct
+{
+       struct list_head        kpre_list;
+       int                     kpre_gateway_nalid;
+       ptl_nid_t               kpre_gateway_nid;
+       ptl_nid_t               kpre_lo_nid;
+        ptl_nid_t               kpre_hi_nid;
+} kpr_route_entry_t;
+
+extern int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp);
+extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp);
+extern void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd);
+extern void kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error);
+extern void kpr_shutdown_nal (void *arg);
+extern void kpr_deregister_nal (void *arg);
+
+extern void kpr_proc_init (void);
+extern void kpr_proc_fini (void);
+
+extern int kpr_add_route (int gateway_nal, ptl_nid_t gateway_nid, 
+                          ptl_nid_t lo_nid, ptl_nid_t hi_nid);
+extern int kpr_del_route (ptl_nid_t nid);
+extern int kpr_get_route (int idx, int *gateway_nal, ptl_nid_t *gateway_nid, 
+                          ptl_nid_t *lo_nid, ptl_nid_t *hi_nid);
+
+extern unsigned long long kpr_fwd_bytes;
+extern unsigned long      kpr_fwd_packets;
+extern unsigned long      kpr_fwd_errors;
+extern atomic_t           kpr_queue_depth;
+
+#endif /* _KPTLROUTER_H */
diff --git a/lustre/portals/tests/.cvsignore b/lustre/portals/tests/.cvsignore
new file mode 100644 (file)
index 0000000..051d1bd
--- /dev/null
@@ -0,0 +1,3 @@
+Makefile
+Makefile.in
+.deps
diff --git a/lustre/portals/tests/Makefile.am b/lustre/portals/tests/Makefile.am
new file mode 100644 (file)
index 0000000..7b47ae0
--- /dev/null
@@ -0,0 +1,23 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../Rules.linux
+
+LDFLAGS = -m "`$(LD) --help | awk '/supported emulations/ {print $$4}'`" -r
+LINK = $(LD) $(LDFLAGS) -o $@
+DEFS =
+LIBS =
+MODULE = $(basename)
+EXTRA_DIST = startserver.sh startclient.sh stopserver.sh stopclient.sh
+
+noinst_PROGRAMS = pingsrv.o pingcli.o spingsrv.o spingcli.o 
+
+pingsrv_o_SOURCES = ping_srv.c ping.h
+
+pingcli_o_SOURCES = ping_cli.c ping.h
+
+spingsrv_o_SOURCES = sping_srv.c ping.h
+
+spingcli_o_SOURCES = sping_cli.c ping.h
diff --git a/lustre/portals/tests/ping.h b/lustre/portals/tests/ping.h
new file mode 100644 (file)
index 0000000..f07444b
--- /dev/null
@@ -0,0 +1,80 @@
+#ifndef _KPING_INCLUDED
+#define _KPING_INCLUDED
+
+#include <portals/p30.h>
+
+
+#define PTL_PING_IN_SIZE               256     // n packets per buffer
+#define PTL_PING_IN_BUFFERS            2       // n fallback buffers
+
+#define PTL_PING_CLIENT                        4
+#define PTL_PING_SERVER                        5
+
+#define PING_HEADER_MAGIC              0xDEADBEEF
+#define PING_BULK_MAGIC                        0xCAFEBABE
+
+#define PING_HEAD_BITS                 0x00000001
+#define PING_BULK_BITS                 0x00000002
+#define PING_IGNORE_BITS               0xFFFFFFFC
+
+#define PTL_PING_ACK                   0x01
+#define PTL_PING_VERBOSE               0x02
+#define PTL_PING_VERIFY                        0x04
+#define PTL_PING_PREALLOC              0x08
+
+
+#define NEXT_PRIMARY_BUFFER(index)             \
+       (((index + 1) >= PTL_PING_IN_BUFFERS) ? 0 : (index + 1))
+
+#define PDEBUG(str, err)                       \
+       CERROR ("%s: error=%s (%d)\n", str, ptl_err_str[err], err)
+
+
+/* Ping data to be passed via the ioctl to kernel space */
+
+#if __KERNEL__
+
+
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
+#include <linux/workqueue.h>
+#else
+#include <linux/tqueue.h>
+#endif
+struct pingsrv_data {
+        
+        ptl_handle_ni_t         ni;
+        ptl_handle_me_t         me;
+        ptl_handle_eq_t         eq;
+        void                   *in_buf;
+        ptl_process_id_t        my_id;
+        ptl_process_id_t        id_local;
+        ptl_md_t                mdin;
+        ptl_md_t                mdout;
+        ptl_handle_md_t         mdin_h;
+        ptl_handle_md_t         mdout_h;
+        ptl_event_t             evnt;
+        struct task_struct     *tsk;
+}; /* struct pingsrv_data */
+struct pingcli_data {
+        
+        struct portal_ioctl_data *args;
+        ptl_handle_me_t        me;
+        ptl_handle_eq_t                eq;
+        char                          *inbuf;    
+        char                   *outbuf;   
+        ptl_process_id_t       myid; 
+        ptl_process_id_t       id_local; 
+        ptl_process_id_t       id_remote;
+        ptl_md_t               md_in_head;
+        ptl_md_t               md_out_head;
+        ptl_handle_md_t        md_in_head_h;
+        ptl_handle_md_t        md_out_head_h;
+        ptl_event_t            ev;
+        struct task_struct     *tsk;
+}; /* struct pingcli_data */
+
+
+#endif /* __KERNEL__ */
+
+#endif /* _KPING_INCLUDED */
diff --git a/lustre/portals/tests/ping_cli.c b/lustre/portals/tests/ping_cli.c
new file mode 100644 (file)
index 0000000..389ffbb
--- /dev/null
@@ -0,0 +1,300 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf <behlendorf1@llnl.gov>
+ *         Kedar Sovani (kedar@calsoftinc.com)
+ *         Amey Inamdar (amey@calsoftinc.com)
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include "ping.h"
+/* int portal_debug = D_PING_CLI;  */
+
+
+#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval))
+
+#define MAX_TIME 100000
+
+/* This should be enclosed in a structure */
+
+static struct pingcli_data *client = NULL;
+
+static int count = 0;
+
+static void
+pingcli_shutdown(int err)
+{
+        int rc;
+
+        /* Yes, we are intentionally allowing us to fall through each
+         * case into the next.  This allows us to pass an error
+         * code to just clean up the right stuff.
+         */
+        switch (err) {
+                case 1:
+                        /* Unlink any memory descriptors we may have used */
+                        if ((rc = PtlMDUnlink (client->md_out_head_h)))
+                                PDEBUG ("PtlMDUnlink", rc);
+                case 2:
+                        if ((rc = PtlMDUnlink (client->md_in_head_h)))
+                                PDEBUG ("PtlMDUnlink", rc);
+
+                        /* Free the event queue */
+                        if ((rc = PtlEQFree (client->eq)))
+                                PDEBUG ("PtlEQFree", rc);
+
+                        if ((rc = PtlMEUnlink (client->me)))
+                                PDEBUG ("PtlMEUnlink", rc);
+                case 3:
+                        kportal_put_ni (client->args->ioc_nal);
+
+                case 4:
+                        /* Free our buffers */
+
+                        if (client != NULL)
+                                PORTAL_FREE (client,
+                                                sizeof(struct pingcli_data));
+        }
+
+
+        CDEBUG (D_OTHER, "ping client released resources\n");
+} /* pingcli_shutdown() */
+
+static int pingcli_callback(ptl_event_t *ev)
+{
+        int i, magic;
+        i = *(int *)(ev->mem_desc.start + ev->offset + sizeof(unsigned));
+        magic = *(int *)(ev->mem_desc.start + ev->offset);
+
+        if(magic != 0xcafebabe) {
+                printk ("Unexpected response \n");
+                return 1;
+        }
+
+        if((i == count) || !count)
+                wake_up_process (client->tsk);
+        else
+                printk ("Received response after timeout for %d\n",i);
+        return 1;
+}
+
+
+static struct pingcli_data *
+pingcli_start(struct portal_ioctl_data *args)
+{
+        ptl_handle_ni_t *nip;
+        unsigned ping_head_magic = PING_HEADER_MAGIC;
+        unsigned ping_bulk_magic = PING_BULK_MAGIC;
+        int rc;
+        struct timeval tv1, tv2;
+        client->tsk = current;
+        client->args = args;
+        CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64",  \
+                        nal %d, size %u, count: %u, timeout: %u\n",
+                        args->ioc_nid, args->ioc_nal, args->ioc_size,
+                        args->ioc_count, args->ioc_timeout);
+
+
+        PORTAL_ALLOC (client->outbuf, STDSIZE + args->ioc_size) ;
+        if (client->outbuf == NULL)
+        {
+                CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        PORTAL_ALLOC (client->inbuf,
+                        (args->ioc_size + STDSIZE) * args->ioc_count);
+        if (client->inbuf == NULL)
+        {
+                CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        /* Acquire and initialize the proper nal for portals. */
+        if ((nip = kportal_get_ni (args->ioc_nal)) == NULL)
+        {
+                CERROR ("NAL %d not loaded\n", args->ioc_nal);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        /* Based on the initialization acquire our unique portal ID. */
+        if ((rc = PtlGetId (*nip, &client->myid)))
+        {
+                CERROR ("PtlGetId error %d\n", rc);
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+
+        /* Setup the local match entries */
+        client->id_local.nid = PTL_NID_ANY;
+        client->id_local.pid = PTL_PID_ANY;
+
+        /* Setup the remote match entries */
+        client->id_remote.nid = args->ioc_nid;
+        client->id_remote.pid = 0;
+
+        if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT,
+                   client->id_local, 0, ~0, PTL_RETAIN,
+                   PTL_INS_AFTER, &client->me)))
+        {
+                CERROR ("PtlMEAttach error %d\n", rc);
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+
+        /* Allocate the event queue for this network interface */
+        if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq)))
+        {
+                CERROR ("PtlEQAlloc error %d\n", rc);
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+
+        count = args->ioc_count;
+
+        client->md_in_head.start     = client->inbuf;
+        client->md_in_head.length    = (args->ioc_size + STDSIZE)
+                                                * count;
+        client->md_in_head.threshold = PTL_MD_THRESH_INF;
+        client->md_in_head.options   = PTL_MD_OP_PUT;
+        client->md_in_head.user_ptr  = NULL;
+        client->md_in_head.eventq    = client->eq;
+        memset (client->inbuf, 0, (args->ioc_size + STDSIZE) * count);
+
+        /* Attach the incoming buffer */
+        if ((rc = PtlMDAttach (client->me, client->md_in_head,
+                              PTL_UNLINK, &client->md_in_head_h))) {
+                CERROR ("PtlMDAttach error %d\n", rc);
+                pingcli_shutdown (1);
+                return (NULL);
+        }
+        /* Setup the outgoing ping header */
+        client->md_out_head.start     = client->outbuf;
+        client->md_out_head.length    = STDSIZE + args->ioc_size;
+        client->md_out_head.threshold = args->ioc_count;
+        client->md_out_head.options   = PTL_MD_OP_PUT;
+        client->md_out_head.user_ptr  = NULL;
+        client->md_out_head.eventq    = PTL_EQ_NONE;
+
+        memcpy (client->outbuf, &ping_head_magic, sizeof(ping_bulk_magic));
+
+        count = 0;
+
+        /* Bind the outgoing ping header */
+        if ((rc=PtlMDBind (*nip, client->md_out_head,
+                                        &client->md_out_head_h))) {
+                CERROR ("PtlMDBind error %d\n", rc);
+                pingcli_shutdown (1);
+                return NULL;
+        }
+        while ((args->ioc_count - count)) {
+                memcpy (client->outbuf + sizeof(unsigned),
+                       &(count), sizeof(unsigned));
+                 /* Put the ping packet */
+                do_gettimeofday (&tv1);
+
+                memcpy(client->outbuf+sizeof(unsigned)+sizeof(unsigned),&tv1,
+                       sizeof(struct timeval));
+
+                if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ,
+                          client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) {
+                         PDEBUG ("PtlPut (header)", rc);
+                         pingcli_shutdown (1);
+                         return NULL;
+                }
+                printk ("sent msg no %d", count);
+
+                set_current_state (TASK_INTERRUPTIBLE);
+                rc = schedule_timeout (20 * args->ioc_timeout);
+                if (rc == 0) {
+                        printk ("   ::  timeout .....\n");
+                } else {
+                        do_gettimeofday (&tv2);
+                        printk("   ::  Reply in %u usec\n",
+                                (unsigned)((tv2.tv_sec - tv1.tv_sec)
+                                 * 1000000 +  (tv2.tv_usec - tv1.tv_usec)));
+                }
+                count++;
+        }
+
+        if (client->outbuf != NULL)
+                PORTAL_FREE (client->outbuf, STDSIZE + args->ioc_size);
+
+        if (client->inbuf != NULL)
+                PORTAL_FREE (client->inbuf,
+                               (args->ioc_size + STDSIZE) * args->ioc_count);
+
+        pingcli_shutdown (2);
+
+        /* Success! */
+        return NULL;
+} /* pingcli_start() */
+
+
+
+/* called by the portals_ioctl for ping requests */
+static int kping_client(struct portal_ioctl_data *args)
+{
+        PORTAL_ALLOC (client, sizeof(struct pingcli_data));
+        if (client == NULL)
+        {
+                CERROR ("Unable to allocate client structure\n");
+                return (0);
+        }
+        memset (client, 0, sizeof(struct pingcli_data));
+        pingcli_start (args);
+
+        return 0;
+} /* kping_client() */
+
+
+static int __init pingcli_init(void)
+{
+        PORTAL_SYMBOL_REGISTER(kping_client);
+        return 0;
+} /* pingcli_init() */
+
+
+static void __exit pingcli_cleanup(void)
+{
+        PORTAL_SYMBOL_UNREGISTER (kping_client);
+} /* pingcli_cleanup() */
+
+
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A simple kernel space ping client for portals testing");
+MODULE_LICENSE("GPL");
+
+module_init(pingcli_init);
+module_exit(pingcli_cleanup);
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+EXPORT_SYMBOL (kping_client);
+#endif
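
pingcli_start() assembles each outgoing message with raw memcpy offsets: a magic word, then the packet counter, then the send timestamp, followed by ioc_size bytes of payload; STDSIZE is the size of that fixed header. An illustrative reconstruction of the layout (demo_ping_hdr is not a type defined in this patch):

#include <sys/time.h>

/* Fixed header at the front of every ping message; matches
 * STDSIZE = sizeof(int) + sizeof(int) + sizeof(struct timeval)
 * on common ABIs, where this struct carries no padding. */
struct demo_ping_hdr {
        unsigned int   magic;   /* PING_HEADER_MAGIC from the client,
                                 * PING_BULK_MAGIC in the server's reply */
        unsigned int   seq;     /* packet counter, echoed back unchanged */
        struct timeval sent;    /* timestamp taken just before PtlPut() */
};
/* args->ioc_size bytes of payload follow this header. */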
diff --git a/lustre/portals/tests/ping_srv.c b/lustre/portals/tests/ping_srv.c
new file mode 100644 (file)
index 0000000..1037d09
--- /dev/null
@@ -0,0 +1,308 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf <behlendorf1@llnl.gov>
+ *        Amey Inamdar     <amey@calsoftinc.com>
+ *        Kedar Sovani     <kedar@calsoftinc.com>
+ *
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include "ping.h"
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <linux/workqueue.h>
+#else
+#include <linux/tqueue.h>
+#endif
+#include <linux/wait.h>
+#include <linux/smp_lock.h>
+
+#include <asm/unistd.h>
+#include <asm/semaphore.h>
+
+#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval))
+#define MAXSIZE (16*1024*1024)
+
+static unsigned ping_head_magic;
+static unsigned ping_bulk_magic;
+static int nal  = 0;                            // Your NAL,
+static unsigned long packets_valid = 0;         // Valid packets 
+static int running = 1;
+atomic_t pkt;
+       
+static struct pingsrv_data *server=NULL;             // Our ping server
+
+static void *pingsrv_shutdown(int err)
+{
+        int rc;
+
+        /* Yes, we are intentionally allowing us to fall through each
+         * case into the next.  This allows us to pass an error
+         * code to just clean up the right stuff.
+         */
+        switch (err) {
+                case 1:
+                        /* Unlink any memory descriptors we may have used */
+                        if ((rc = PtlMDUnlink (server->mdin_h)))
+                                PDEBUG ("PtlMDUnlink (out head buffer)", rc);
+                case 2:
+                        /* Free the event queue */
+                        if ((rc = PtlEQFree (server->eq)))
+                                PDEBUG ("PtlEQFree", rc);
+
+                        /* Unlink the client portal from the ME list */
+                        if ((rc = PtlMEUnlink (server->me)))
+                                        PDEBUG ("PtlMEUnlink", rc);
+
+                case 3:
+                        kportal_put_ni (nal);
+
+                case 4:
+                        
+                case 5:
+                        if (server->in_buf != NULL)
+                                PORTAL_FREE (server->in_buf, MAXSIZE);
+                        
+                        if (server != NULL)
+                                PORTAL_FREE (server, 
+                                             sizeof (struct pingsrv_data));
+                        
+        }
+
+        CDEBUG (D_OTHER, "ping server resources released\n");
+        return NULL;
+} /* pingsrv_shutdown() */
+
+
+int pingsrv_thread(void *arg)
+{
+        int rc;
+        unsigned long magic;
+        unsigned long ping_bulk_magic = 0xcafebabe;
+        
+        kportal_daemonize ("pingsrv");
+        server->tsk =  current;
+        
+        while (running) {
+                set_current_state (TASK_INTERRUPTIBLE);
+                if (atomic_read (&pkt) == 0) {
+                        schedule_timeout (MAX_SCHEDULE_TIMEOUT);
+                        continue;
+                }
+               
+                magic =  *((int *)(server->evnt.mem_desc.start 
+                                        + server->evnt.offset));
+                
+                
+                if (magic != 0xdeadbeef) {
+                        printk ("Unexpected packet received by the server\n");
+                }
+
+                memcpy (server->in_buf, &ping_bulk_magic, sizeof(ping_bulk_magic));
+                                
+                server->mdout.length    = server->evnt.rlength;
+                server->mdout.start     = server->in_buf;
+                server->mdout.threshold = 1; 
+                server->mdout.options   = PTL_MD_OP_PUT;
+                server->mdout.user_ptr  = NULL;
+                server->mdout.eventq    = PTL_EQ_NONE;
+       
+                /* Bind the outgoing buffer */
+                if ((rc = PtlMDBind (server->ni, server->mdout, 
+                                                &server->mdout_h))) {
+                         PDEBUG ("PtlMDBind", rc);
+                         pingsrv_shutdown (1);
+                         return 1;
+               }
+         
+                
+                server->mdin.start     = server->in_buf;
+                server->mdin.length    = MAXSIZE;
+                server->mdin.threshold = 1; 
+                server->mdin.options   = PTL_MD_OP_PUT;
+                server->mdin.user_ptr  = NULL;
+                server->mdin.eventq    = server->eq;
+        
+                /* Re-attach the incoming buffer for the next ping */
+                if ((rc = PtlMDAttach (server->me, server->mdin,
+                        PTL_UNLINK, &server->mdin_h)))
+                        PDEBUG ("PtlMDAttach (bulk)", rc);
+                
+                if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ,
+                         server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0)))
+                         PDEBUG ("PtlPut", rc);
+                
+                atomic_dec (&pkt);
+                
+        }
+        pingsrv_shutdown (1);
+        running = 1;
+        return 0;    
+}
+
+static int pingsrv_packet(ptl_event_t *ev)
+{
+        atomic_inc (&pkt);
+        wake_up_process (server->tsk);
+        return 1;
+} /* pingsrv_head() */
+
+static int pingsrv_callback(ptl_event_t *ev)
+{
+        
+        if (ev == NULL) {
+                CERROR ("null in callback, ev=%p\n", ev);
+                return 0;
+        }
+        server->evnt = *ev;
+        
+        printk ("received ping from nid "LPX64" "
+               "(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n",
+               ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
+               *((int *)(ev->mem_desc.start + ev->offset)),
+               *((int *)(ev->mem_desc.start + ev->offset + sizeof(unsigned))),
+               *((int *)(ev->mem_desc.start + ev->offset + 2 * 
+                               sizeof(unsigned))));
+        
+        packets_valid++;
+
+        return pingsrv_packet(ev);
+        
+} /* pingsrv_callback() */
+
+
+static struct pingsrv_data *pingsrv_setup(void)
+{
+        ptl_handle_ni_t *nip;
+        int rc;
+
+       /* Acquire and initialize the proper nal for portals. */
+        if ((nip = kportal_get_ni (nal)) == NULL) {
+                CDEBUG (D_OTHER, "NAL %d not loaded\n", nal);
+                return pingsrv_shutdown (4);
+        }
+
+        server->ni= *nip;
+
+        /* Based on the initialization acquire our unique portal ID. */
+        if ((rc = PtlGetId (server->ni, &server->my_id))) {
+                PDEBUG ("PtlGetId", rc);
+                return pingsrv_shutdown (2);
+        }
+
+        server->id_local.nid = PTL_NID_ANY;
+        server->id_local.pid = PTL_PID_ANY;
+
+        /* Attach a match entry for header packets */
+        if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER,
+            server->id_local,0, ~0,
+            PTL_RETAIN, PTL_INS_AFTER, &server->me))) {
+                PDEBUG ("PtlMEAttach", rc);
+                return pingsrv_shutdown (2);
+        }
+
+
+        if ((rc = PtlEQAlloc (server->ni, 1024, pingsrv_callback,
+                                        &server->eq))) {
+                PDEBUG ("PtlEQAlloc (callback)", rc);
+                return pingsrv_shutdown (2);
+        }
+        
+        PORTAL_ALLOC (server->in_buf, MAXSIZE);
+        if(!server->in_buf){
+                CDEBUG (D_OTHER,"Allocation error\n");
+                return pingsrv_shutdown(2);
+        }
+        
+        /* Setup the incoming buffer */
+        server->mdin.start     = server->in_buf;
+        server->mdin.length    = MAXSIZE;
+        server->mdin.threshold = 1; 
+        server->mdin.options   = PTL_MD_OP_PUT;
+        server->mdin.user_ptr  = NULL;
+        server->mdin.eventq    = server->eq;
+        memset (server->in_buf, 0, STDSIZE);
+        
+        if ((rc = PtlMDAttach (server->me, server->mdin,
+                PTL_UNLINK, &server->mdin_h))) {
+                PDEBUG ("PtlMDAttach (bulk)", rc);
+                return pingsrv_shutdown (2);
+        }
+
+        CDEBUG (D_OTHER, "ping server resources allocated\n");
+
+        /* Success! */
+        return server;
+} /* pingsrv_setup() */
+
+static int pingsrv_start(void)
+{
+        /* Setup our server */
+        if (!pingsrv_setup()) {
+                CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n");
+                return -ENOMEM;
+        }
+        kernel_thread (pingsrv_thread,NULL,0);
+        return 0;
+} /* pingsrv_start() */
+
+
+
+static int __init pingsrv_init(void)
+{
+        ping_head_magic = PING_HEADER_MAGIC;
+        ping_bulk_magic = PING_BULK_MAGIC;
+        PORTAL_ALLOC (server, sizeof(struct pingsrv_data));
+        if (server == NULL) {
+                CERROR ("Unable to allocate server structure\n");
+                return -ENOMEM;
+        }
+        return pingsrv_start ();
+} /* pingsrv_init() */
+
+
+static void __exit pingsrv_cleanup(void)
+{
+        remove_proc_entry ("net/pingsrv", NULL);
+        
+        running = 0;
+        wake_up_process (server->tsk);
+        while (running != 1) {
+                set_current_state (TASK_UNINTERRUPTIBLE);
+                schedule_timeout (HZ);
+        }
+        
+} /* pingsrv_cleanup() */
+
+
+MODULE_PARM(nal, "i");
+MODULE_PARM_DESC(nal, "Use the specified NAL "
+                "(6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal)");
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A kernel space ping server for portals testing");
+MODULE_LICENSE("GPL");
+
+module_init(pingsrv_init);
+module_exit(pingsrv_cleanup);
diff --git a/lustre/portals/tests/sping_cli.c b/lustre/portals/tests/sping_cli.c
new file mode 100644 (file)
index 0000000..4cef08b
--- /dev/null
@@ -0,0 +1,276 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf <behlendorf1@llnl.gov>
+ *         Kedar Sovani (kedar@calsoftinc.com)
+ *         Amey Inamdar (amey@calsoftinc.com)
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+/* This is a stripped down version of the pinger. It follows a single
+ * request-response protocol: it does not do bulk data pinging, nor does
+ * it send multiple packets in a single ioctl.
+ */
+
+
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include "ping.h"
+/* int portal_debug = D_PING_CLI;  */
+
+
+#define STDSIZE (sizeof(int) + sizeof(int) + 4) /* the payload is assumed
+                                                   to be 4 bytes */
+
+/* This should be enclosed in a structure */
+
+static struct pingcli_data *client = NULL;
+
+static int count = 0;
+
+static void
+pingcli_shutdown(int err)
+{
+        int rc;
+
+        /* Yes, we are intentionally falling through from each case
+         * into the next.  This lets the error code select exactly
+         * which resources need to be cleaned up.
+         */
+        switch (err) {
+                case 1:
+                        /* Unlink any memory descriptors we may have used */
+                        if ((rc = PtlMDUnlink (client->md_out_head_h)))
+                                PDEBUG ("PtlMDUnlink", rc);
+                case 2:
+                        /* Free the event queue */
+                        if ((rc = PtlEQFree (client->eq)))
+                                PDEBUG ("PtlEQFree", rc);
+
+                        if ((rc = PtlMEUnlink (client->me)))
+                                PDEBUG ("PtlMEUnlink", rc);
+                case 3:
+                        kportal_put_ni (client->args->ioc_nal);
+
+                case 4:
+                        /* Free our buffers */
+                        if (client->outbuf != NULL)
+                                PORTAL_FREE (client->outbuf, STDSIZE);
+
+                        if (client->inbuf != NULL)
+                                PORTAL_FREE (client->inbuf, STDSIZE);
+
+
+                        if (client != NULL)
+                                PORTAL_FREE (client,
+                                                sizeof(struct pingcli_data));
+        }
+
+
+        CDEBUG (D_OTHER, "ping client released resources\n");
+} /* pingcli_shutdown() */
+
+static int pingcli_callback(ptl_event_t *ev)
+{
+        wake_up_process (client->tsk);
+        return 1;
+}
+
+
+static struct pingcli_data *
+pingcli_start(struct portal_ioctl_data *args)
+{
+        const ptl_handle_ni_t *nip;
+        unsigned ping_head_magic = PING_HEADER_MAGIC;
+        int rc;
+
+        client->tsk = current;
+        client->args = args;
+
+        CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64", "
+                "nal %d, size %u, count: %u, timeout: %u\n",
+                args->ioc_nid, args->ioc_nal, args->ioc_size,
+                args->ioc_count, args->ioc_timeout);
+
+
+        PORTAL_ALLOC (client->outbuf, STDSIZE) ;
+        if (client->outbuf == NULL)
+        {
+                CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        PORTAL_ALLOC (client->inbuf,  STDSIZE);
+
+        if (client->inbuf == NULL)
+        {
+                CERROR ("Unable to allocate in_buf ("LPSZ" bytes)\n", STDSIZE);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        /* Acquire and initialize the proper nal for portals. */
+        if ((nip = kportal_get_ni (args->ioc_nal)) == NULL)
+        {
+                CERROR ("NAL %d not loaded.\n", args->ioc_nal);
+                pingcli_shutdown (4);
+                return (NULL);
+        }
+
+        /* Based on the initialization acquire our unique portal ID. */
+        if ((rc = PtlGetId (*nip, &client->myid)))
+        {
+                CERROR ("PtlGetId error %d\n", rc);
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+
+        /* Setup the local match entries */
+        client->id_local.nid = PTL_NID_ANY;
+        client->id_local.pid = PTL_PID_ANY;
+
+        /* Setup the remote match entries */
+        client->id_remote.nid = args->ioc_nid;
+        client->id_remote.pid = 0;
+
+        if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT,
+                   client->id_local, 0, ~0, PTL_RETAIN,
+                   PTL_INS_AFTER, &client->me)))
+        {
+                CERROR ("PtlMEAttach error %d\n", rc);
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+
+        /* Allocate the event queue for this network interface */
+        if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq)))
+        {
+                CERROR ("PtlEQAlloc error %d\n", rc);
+                pingcli_shutdown (2);
+                return (NULL);
+        }
+
+
+        client->md_in_head.start     = client->inbuf;
+        client->md_in_head.length    = STDSIZE;
+        client->md_in_head.threshold = 1;
+        client->md_in_head.options   = PTL_MD_OP_PUT;
+        client->md_in_head.user_ptr  = NULL;
+        client->md_in_head.eventq    = client->eq;
+        memset (client->inbuf, 0, STDSIZE);
+
+        /* Attach the incoming buffer */
+        if ((rc = PtlMDAttach (client->me, client->md_in_head,
+                              PTL_UNLINK, &client->md_in_head_h))) {
+                CERROR ("PtlMDAttach error %d\n", rc);
+                pingcli_shutdown (1);
+                return (NULL);
+        }
+
+        /* Setup the outgoing ping header */
+        client->md_out_head.start     = client->outbuf;
+        client->md_out_head.length    = STDSIZE;
+        client->md_out_head.threshold = 1;
+        client->md_out_head.options   = PTL_MD_OP_PUT;
+        client->md_out_head.user_ptr  = NULL;
+        client->md_out_head.eventq    = PTL_EQ_NONE;
+
+        memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic));
+
+        /* Bind the outgoing ping header */
+        if ((rc=PtlMDBind (*nip, client->md_out_head,
+                                        &client->md_out_head_h))) {
+                CERROR ("PtlMDBind error %d\n", rc);
+                pingcli_shutdown (1);
+                return (NULL);
+        }
+        /* Put the ping packet */
+        if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ,
+                         client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) {
+                PDEBUG ("PtlPut (header)", rc);
+                pingcli_shutdown (1);
+                return NULL;
+        }
+
+        count = 0;
+        set_current_state (TASK_INTERRUPTIBLE);
+        rc = schedule_timeout (20 * args->ioc_timeout);
+        if (rc == 0) {
+                printk ("Timed out waiting for the server\n");
+                pingcli_shutdown (2);
+                return NULL;
+        } else
+                printk ("Received response from the server\n");
+
+
+        pingcli_shutdown (2);
+
+        /* Success! */
+        return NULL;
+} /* pingcli_start() */
+
+
+
+/* called by the portals_ioctl for ping requests */
+static int kping_client(struct portal_ioctl_data *args)
+{
+
+        PORTAL_ALLOC (client, sizeof(struct pingcli_data));
+        if (client == NULL)
+        {
+                CERROR ("Unable to allocate client structure\n");
+                return (0);
+        }
+        memset (client, 0, sizeof(struct pingcli_data));
+        pingcli_start (args);
+
+        return 0;
+} /* kping_client() */
+
+
+static int __init pingcli_init(void)
+{
+        PORTAL_SYMBOL_REGISTER(kping_client);
+        return 0;
+} /* pingcli_init() */
+
+
+static void __exit pingcli_cleanup(void)
+{
+        PORTAL_SYMBOL_UNREGISTER (kping_client);
+} /* pingcli_cleanup() */
+
+
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A simple kernel space ping client for portals testing");
+MODULE_LICENSE("GPL");
+
+module_init(pingcli_init);
+module_exit(pingcli_cleanup);
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+EXPORT_SYMBOL (kping_client);
+#endif
diff --git a/lustre/portals/tests/sping_srv.c b/lustre/portals/tests/sping_srv.c
new file mode 100644 (file)
index 0000000..a18ea35
--- /dev/null
@@ -0,0 +1,295 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * Author: Brian Behlendorf <behlendorf1@llnl.gov>
+ *        Amey Inamdar     <amey@calsoftinc.com>
+ *        Kedar Sovani     <kedar@calsoftinc.com>
+ *
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* This is a stripped down version of the pinger. It follows a single
+ * request-response protocol: it does not do bulk data pinging, nor does
+ * it send multiple packets in a single ioctl.
+ */
+
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include "ping.h"
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/version.h>
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#include <linux/workqueue.h>
+#else
+#include <linux/tqueue.h>
+#endif
+#include <linux/wait.h>
+#include <linux/smp_lock.h>
+
+#include <asm/unistd.h>
+#include <asm/semaphore.h>
+
+#define STDSIZE (sizeof(int) + sizeof(int) + 4)
+
+static int nal  = 0;                            // The NAL to use
+static unsigned long packets_valid = 0;         // Valid packets received
+static int running = 1;
+atomic_t pkt;
+       
+static struct pingsrv_data *server=NULL;             // Our ping server
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
+#endif
+
+static void *pingsrv_shutdown(int err)
+{
+        int rc;
+
+        /* Yes, we are intentionally falling through from each case
+         * into the next.  This lets the error code select exactly
+         * which resources need to be cleaned up.
+         */
+        switch (err) {
+                case 1:
+                        /* Unlink any memory descriptors we may have used */
+                        if ((rc = PtlMDUnlink (server->mdin_h)))
+                                PDEBUG ("PtlMDUnlink (out head buffer)", rc);
+                case 2:
+                        /* Free the event queue */
+                        if ((rc = PtlEQFree (server->eq)))
+                                PDEBUG ("PtlEQFree", rc);
+
+                        /* Unlink the client portal from the ME list */
+                        if ((rc = PtlMEUnlink (server->me)))
+                                        PDEBUG ("PtlMEUnlink", rc);
+
+                case 3:
+                        kportal_put_ni (nal);
+
+                case 4:
+                        
+                        if (server->in_buf != NULL)
+                                PORTAL_FREE (server->in_buf, STDSIZE);
+                        
+                        if (server != NULL)
+                                PORTAL_FREE (server, 
+                                             sizeof (struct pingsrv_data));
+                        
+        }
+
+        CDEBUG (D_OTHER, "ping server resources released\n");
+        return NULL;
+} /* pingsrv_shutdown() */
+
+
+int pingsrv_thread(void *arg)
+{
+        int rc;
+        
+        kportal_daemonize ("pingsrv");
+        server->tsk = current;
+        
+        while (running) {
+                set_current_state (TASK_INTERRUPTIBLE);
+                if (atomic_read (&pkt) == 0) {
+                        schedule_timeout (MAX_SCHEDULE_TIMEOUT);
+                        continue;
+                }
+                               
+                server->mdout.start     = server->in_buf;
+                server->mdout.length    = STDSIZE;
+                server->mdout.threshold = 1; 
+                server->mdout.options   = PTL_MD_OP_PUT;
+                server->mdout.user_ptr  = NULL;
+                server->mdout.eventq    = PTL_EQ_NONE;
+       
+                /* Bind the outgoing buffer */
+                if ((rc = PtlMDBind (server->ni, server->mdout, 
+                                                &server->mdout_h))) {
+                         PDEBUG ("PtlMDBind", rc);
+                         pingsrv_shutdown (1);
+                         return 1;
+               }
+         
+                
+                server->mdin.start     = server->in_buf;
+                server->mdin.length    = STDSIZE;
+                server->mdin.threshold = 1; 
+                server->mdin.options   = PTL_MD_OP_PUT;
+                server->mdin.user_ptr  = NULL;
+                server->mdin.eventq    = server->eq;
+        
+                /* Re-attach the incoming buffer for the next ping */
+                if ((rc = PtlMDAttach (server->me, server->mdin,
+                        PTL_UNLINK, &server->mdin_h)))
+                        PDEBUG ("PtlMDAttach (bulk)", rc);
+                
+                if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ,
+                         server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0)))
+                         PDEBUG ("PtlPut", rc);
+                
+                atomic_dec (&pkt);
+                
+        }
+        pingsrv_shutdown (1);
+        running = 1;
+        return 0;    
+}
+
+static int pingsrv_packet(ptl_event_t *ev)
+{
+        atomic_inc (&pkt);
+        wake_up_process (server->tsk);
+        return 1;
+} /* pingsrv_head() */
+
+static int pingsrv_callback(ptl_event_t *ev)
+{
+        
+        if (ev == NULL) {
+                CERROR ("null in callback, ev=%p\n", ev);
+                return 0;
+        }
+        server->evnt = *ev;
+        
+        printk ("received ping from nid "LPX64" "
+               "(off=%u rlen=%u mlen=%u head=%x)\n",
+               ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
+               *((int *)(ev->mem_desc.start + ev->offset)));
+        
+        packets_valid++;
+
+        return pingsrv_packet(ev);
+        
+} /* pingsrv_callback() */
+
+
+static struct pingsrv_data *pingsrv_setup(void)
+{
+        ptl_handle_ni_t *nip;
+        int rc;
+
+       /* Acquire and initialize the proper nal for portals. */
+        if ((nip = kportal_get_ni (nal)) == NULL) {
+                CDEBUG (D_OTHER, "NAL %d not loaded\n", nal);
+                return pingsrv_shutdown (4);
+        }
+
+        server->ni= *nip;
+
+        /* Based on the initialization acquire our unique portal ID. */
+        if ((rc = PtlGetId (server->ni, &server->my_id))) {
+                PDEBUG ("PtlGetId", rc);
+                return pingsrv_shutdown (2);
+        }
+
+        server->id_local.nid = PTL_NID_ANY;
+        server->id_local.pid = PTL_PID_ANY;
+
+        /* Attach a match entry for header packets */
+        if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER,
+            server->id_local,0, ~0,
+            PTL_RETAIN, PTL_INS_AFTER, &server->me))) {
+                PDEBUG ("PtlMEAttach", rc);
+                return pingsrv_shutdown (2);
+        }
+
+
+        if ((rc = PtlEQAlloc (server->ni, 64, pingsrv_callback,
+                                        &server->eq))) {
+                PDEBUG ("PtlEQAlloc (callback)", rc);
+                return pingsrv_shutdown (2);
+        }
+        
+        PORTAL_ALLOC (server->in_buf, STDSIZE);
+        if(!server->in_buf){
+                CDEBUG (D_OTHER,"Allocation error\n");
+                return pingsrv_shutdown(2);
+        }
+        
+        /* Setup the incoming buffer */
+        server->mdin.start     = server->in_buf;
+        server->mdin.length    = STDSIZE;
+        server->mdin.threshold = 1; 
+        server->mdin.options   = PTL_MD_OP_PUT;
+        server->mdin.user_ptr  = NULL;
+        server->mdin.eventq    = server->eq;
+        memset (server->in_buf, 0, STDSIZE);
+        
+        if ((rc = PtlMDAttach (server->me, server->mdin,
+                PTL_UNLINK, &server->mdin_h))) {
+                PDEBUG ("PtlMDAttach (bulk)", rc);
+                return pingsrv_shutdown (2);
+        }
+
+        CDEBUG (D_OTHER, "ping server resources allocated\n");
+
+        /* Success! */
+        return server;
+} /* pingsrv_setup() */
+
+static int pingsrv_start(void)
+{
+        /* Setup our server */
+        if (!pingsrv_setup()) {
+                CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n");
+                return -ENOMEM;
+        }
+        kernel_thread (pingsrv_thread,NULL,0);
+        return 0;
+} /* pingsrv_start() */
+
+
+
+static int __init pingsrv_init(void)
+{
+        PORTAL_ALLOC (server, sizeof(struct pingsrv_data));
+        if (server == NULL) {
+                CERROR ("Unable to allocate server structure\n");
+                return -ENOMEM;
+        }
+        return pingsrv_start ();
+} /* pingsrv_init() */
+
+
+static void __exit pingsrv_cleanup(void)
+{
+        remove_proc_entry ("net/pingsrv", NULL);
+        
+        running = 0;
+        wake_up_process (server->tsk);
+        while (running != 1) {
+                set_current_state (TASK_UNINTERRUPTIBLE);
+                schedule_timeout (HZ);
+        }
+        
+} /* pingsrv_cleanup() */
+
+
+MODULE_PARM(nal, "i");
+MODULE_PARM_DESC(nal, "Use the specified NAL "
+                "(6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal)");
+MODULE_AUTHOR("Brian Behlendorf (LLNL)");
+MODULE_DESCRIPTION("A kernel space ping server for portals testing");
+MODULE_LICENSE("GPL");
+
+module_init(pingsrv_init);
+module_exit(pingsrv_cleanup);
diff --git a/lustre/portals/tests/startclient.sh b/lustre/portals/tests/startclient.sh
new file mode 100755 (executable)
index 0000000..c9b7c16
--- /dev/null
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-0}
+
+if [ $SIMPLE -eq 0 ]; then
+       PING=pingcli.o
+else
+       PING=spingcli.o
+fi
+
+case "$1" in
+       toe)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../toenal/ktoenal.o
+               /sbin/insmod ./$PING
+               echo ktoenal > /tmp/nal
+       ;;
+       
+       tcp)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../socknal/ksocknal.o
+               /sbin/insmod ./$PING 
+               echo ksocknal > /tmp/nal
+       ;;
+       
+       elan)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../qswnal/kqswnal.o
+               /sbin/insmod ./$PING
+               echo kqswnal > /tmp/nal
+       ;;
+       
+       *)
+               echo "Usage : ${0} < tcp | toe | elan >"
+               exit 1;
+esac
+exit 0;
diff --git a/lustre/portals/tests/startserver.sh b/lustre/portals/tests/startserver.sh
new file mode 100755 (executable)
index 0000000..942300e
--- /dev/null
@@ -0,0 +1,38 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-0}
+
+if [ $SIMPLE -eq 0 ]; then
+       PING=pingsrv.o
+else
+       PING=spingsrv.o
+fi
+
+case "$1" in
+       toe)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../toenal/ktoenal.o
+               /sbin/insmod ./$PING nal=4
+               echo ktoenal > /tmp/nal
+       ;;
+       
+       tcp)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../socknal/ksocknal.o
+               /sbin/insmod ./$PING nal=2
+               echo ksocknal > /tmp/nal
+       ;;
+       
+       elan)
+               /sbin/insmod  ../oslib/portals.o
+               /sbin/insmod ../qswnal/kqswnal.o
+               /sbin/insmod ./$PING nal=1
+               echo kqswnal > /tmp/nal
+       ;;
+       
+       *)
+               echo "Usage : ${0} < tcp | toe | elan >"
+               exit 1;
+esac
+../utils/acceptor 9999&
+exit 0;
diff --git a/lustre/portals/tests/stopclient.sh b/lustre/portals/tests/stopclient.sh
new file mode 100755 (executable)
index 0000000..f7e3aa1
--- /dev/null
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-0}
+
+if [ $SIMPLE -eq 0 ]; then
+       PING=pingcli
+else
+       PING=spingcli
+fi
+
+rmmod $PING
+NAL=`cat /tmp/nal`;
+rmmod $NAL
+rmmod portals
diff --git a/lustre/portals/tests/stopserver.sh b/lustre/portals/tests/stopserver.sh
new file mode 100644 (file)
index 0000000..3e81831
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-0}
+
+if [ $SIMPLE -eq 0 ]; then
+       PING=pingsrv
+else
+       PING=spingsrv
+fi
+
+rmmod $PING
+NAL=`cat /tmp/nal`;
+rmmod $NAL
+killall -9 acceptor
+rm -f /var/run/acceptor-9999.pid
+rmmod portals
diff --git a/lustre/portals/unals/.cvsignore b/lustre/portals/unals/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lustre/portals/unals/Makefile.am b/lustre/portals/unals/Makefile.am
new file mode 100644 (file)
index 0000000..dc427b0
--- /dev/null
@@ -0,0 +1,5 @@
+CPPFLAGS=
+INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir)
+lib_LIBRARIES = libtcpnal.a
+pkginclude_HEADERS =  pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
+libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
diff --git a/lustre/portals/unals/README b/lustre/portals/unals/README
new file mode 100644 (file)
index 0000000..6cb93d9
--- /dev/null
@@ -0,0 +1,53 @@
+This library implements two NAL interfaces, both running over IP.
+The first, tcpnal, creates TCP connections between participating
+processes in order to transport the portals requests. The second,
+ernal, provides a simple transport protocol which runs over
+UDP datagrams.
+
+The interface functions return both of these values in host order for
+convenience and readability. However, this means that addresses
+exchanged in messages between hosts of different byte orderings will
+not function properly.
+
+Both NALs use the same support functions in order to schedule events
+and communicate with the generic portals implementation.
+
+            -------------------------
+            |         api           |
+            |_______________________|
+            |         lib           |
+            |_______________________|
+            | ernal  |   |tcpnal    |
+            |--------|   |----------|
+            | udpsock|   |connection|
+            |-----------------------|
+            |     timer/select      |
+            -------------------------
+
+
+  These NALs use the framework from fdnal of a pipe between the api
+and library sides. This is wrapped up in the select on the library
+side, and blocks on the api side. Performance could be significantly
+improved by collapsing this artificial barrier, by using shared
+memory queues, or by wiring the api layer directly to the library.
+
+
+nid is defined as the low order 24-bits of the IP address of the
+physical node left shifted by 8 plus a virtual node number of 0
+through 255 (really only 239).  The virtual node number of a tcpnal
+application should be specified using the environment variable
+PTL_VIRTNODE.  pid is now a completely arbitrary number in the
+range of 0 to 255.  The IP interface used can be overridden by
+specifying the appropriate hostid by setting the PTL_HOSTID
+environment variable.  The value can be either dotted decimal
+(n.n.n.n) or hex starting with "0x".
+TCPNAL:
+  As the NAL needs to try to send to a particular nid/pid pair, it
+  will open up connections on demand. Because the port associated with
+  the connecting socket is different from the bound port, two
+  connections will normally be established between a pair of peers, with
+  data flowing from the anonymous connect (active) port to the advertised
+  or well-known bound (passive) port of each peer.
+
+  Should the connection fail to open, an error is reported to the
+  library component, which causes the api request to fail.
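A minimal standalone sketch of the nid/port arithmetic described above may make
the layout easier to follow.  It assumes the non-DIRECT_IP_MODE constants from
ipmap.h in this patch (24-bit host id, 8-bit virtual node field, base port
4096); the sample address, virtual node and pid are made up:

    /* nid_map_sketch.c -- illustration of the tcpnal nid/port arithmetic */
    #include <stdio.h>

    #define BASE_PORT    4096        /* PNAL_BASE_PORT   */
    #define VNODE_SHIFT  8           /* PNAL_VNODE_SHIFT */
    #define HOSTID_MASK  0x00ffffff  /* PNAL_HOSTID_MASK */

    int main(void)
    {
            /* hypothetical host 192.168.1.5, virtual node 3, pid 7 */
            unsigned int   ip       = (192u << 24) | (168u << 16) | (1u << 8) | 5u;
            unsigned int   virtnode = 3;   /* would come from PTL_VIRTNODE */
            unsigned short pid      = 7;

            /* nid: low 24 bits of the IP address shifted left by 8,
             * plus the virtual node number */
            unsigned long long nid =
                    ((unsigned long long)(ip & HOSTID_MASK) << VNODE_SHIFT)
                    + virtnode;

            /* TCP port a peer would use to reach this (nid, pid) pair */
            unsigned short port =
                    (unsigned short)(((virtnode << VNODE_SHIFT) + pid) + BASE_PORT);

            printf("nid=0x%llx port=%u\n", nid, (unsigned)port);
            return 0;
    }

For the sample values this prints nid=0xa8010503 port=4871; set_address() in
address.c and PNAL_PORT() in ipmap.h perform the same computation.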
diff --git a/lustre/portals/unals/address.c b/lustre/portals/unals/address.c
new file mode 100644 (file)
index 0000000..b422c3f
--- /dev/null
@@ -0,0 +1,146 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* address.c:
+ * this file provides functions to acquire the IP address of the node
+ * and translate it into a NID/PID pair which supports a static
+ * mapping of virtual nodes into the port range of an IP socket.
+*/
+
+#include <stdlib.h>
+#include <netdb.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <portals/p30.h>
+#include <bridge.h>
+#include <ipmap.h>
+
+
+/* Function:  get_node_id
+ * Returns: a 32 bit id for this node, actually a big-endian IP address
+ *
+ * get_node_id() determines the host name and uses the resolver to
+ *  find out its ip address. This is fairly fragile and inflexible, but
+ *  explicitly asking about interfaces and their addresses is very
+ *  complicated and nonportable.
+ */
+static unsigned int get_node_id(void)
+{
+    char buffer[255];
+    unsigned int x;
+    struct hostent *he;
+    char * host_envp;
+
+    if (!(host_envp = getenv("PTL_HOSTID")))
+        {
+            gethostname(buffer,sizeof(buffer));
+            he=gethostbyname(buffer);
+            if (he)
+                    x=*(unsigned int *)he->h_addr_list[0];
+            else
+                    x = 0;
+            return(ntohl(x));
+        }
+    else 
+        {
+            if (host_envp[1] != 'x')
+                {
+                    int a, b, c, d;
+                    sscanf(host_envp, "%d.%d.%d.%d", &a, &b, &c, &d);
+                    return ((a<<24) | (b<<16) | (c<<8) | d);
+                }
+            else
+                {
+                    long long hostid = strtoll(host_envp, 0, 0);
+                    return((unsigned int) hostid);
+                }
+        }
+}
+
+
+/* Function:  set_address
+ * Arguments: t: a bridge structure to populate with the request
+ *
+ * set_address performs the bit manipulations to set the nid, pid, and
+ *    iptop8 fields of the bridge structure.
+ *
+ * TODO: fix pidrequest to try to do dynamic binding if PTL_ID_ANY
+ */
+
+#ifdef DIRECT_IP_MODE
+void set_address(bridge t,ptl_pid_t pidrequest)
+{
+    int port;
+    if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0;
+    else port=pidrequest;
+    t->nal_cb->ni.nid=get_node_id();
+    t->nal_cb->ni.pid=port;
+}
+#else
+
+void set_address(bridge t,ptl_pid_t pidrequest)
+{
+    int virtnode, in_addr, port; 
+    ptl_pid_t pid;
+
+    /* get and remember my node id*/
+    if (!getenv("PTL_VIRTNODE"))
+        virtnode = 0;
+    else 
+        {
+            int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT 
+                                              >> PNAL_VNODE_SHIFT);
+            virtnode = atoi(getenv("PTL_VIRTNODE"));
+            if (virtnode > maxvnode)
+                {
+                    fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n",
+                            virtnode, maxvnode);
+                    return;
+                }
+        }
+    
+    in_addr = get_node_id();
+
+    t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */
+    t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK) 
+                            << PNAL_VNODE_SHIFT)
+        + virtnode;
+
+    pid=pidrequest;
+    /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */
+#ifdef notyet
+    if (pid==(unsigned short)PTL_PID_ANY) port = 0;
+#endif
+    if (pid==(unsigned short)PTL_PID_ANY) 
+        {
+            fprintf(stderr, "portal pid PTL_ID_ANY is not currently supported\n");
+            return;
+        }
+    else if (pid > PNAL_PID_MASK)
+        {
+            fprintf(stderr, "portal pid of %d is too large - max %d\n",
+                    pid, PNAL_PID_MASK);
+            return;
+        }
+    else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT;
+    t->nal_cb->ni.pid=pid;
+}
+#endif
diff --git a/lustre/portals/unals/bridge.h b/lustre/portals/unals/bridge.h
new file mode 100644 (file)
index 0000000..0b4940f
--- /dev/null
@@ -0,0 +1,29 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#include <portals/lib-p30.h>
+
+typedef struct bridge {
+    int alive;
+    nal_cb_t *nal_cb;
+    void *lower;
+    void *local;
+    void (*shutdown)(struct bridge *);
+    /* this doesn't really belong here */
+    unsigned char iptop8;
+} *bridge;
+
+
+nal_t *bridge_init(ptl_interface_t nal,
+                   ptl_pid_t pid_request,
+                   ptl_ni_limits_t *desired,
+                   ptl_ni_limits_t *actual,
+                   int *rc);
+
+typedef int (*nal_initialize)(bridge);
+extern nal_initialize nal_table[PTL_IFACE_MAX];
diff --git a/lustre/portals/unals/connection.c b/lustre/portals/unals/connection.c
new file mode 100644 (file)
index 0000000..310e899
--- /dev/null
@@ -0,0 +1,294 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* connection.c:
+   This file provides a simple stateful connection manager which
+   builds tcp connections on demand and leaves them open for
+   future use. It also provides the machinery to allow peers
+   to connect to it
+*/
+
+#include <stdlib.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <table.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <connection.h>
+#include <errno.h>
+
+
+/* global variable: acceptor port */
+unsigned short tcpnal_acceptor_port = 988;
+
+
+/* Function:  compare_connection
+ * Arguments: connection c:      a connection in the hash table
+ *            ptl_process_id_t:  an id to verify against
+ * Returns: 1 if the connection is the one requested, 0 otherwise
+ *
+ *    compare_connection() tests for collisions in the hash table
+ */
+static int compare_connection(void *arg1, void *arg2)
+{
+        connection c = arg1;
+        unsigned int * id = arg2;
+        return((c->ip==id[0]) && (c->port==id[1]));
+}
+
+
+/* Function:  connection_key
+ * Arguments: ptl_process_id_t id:  an id to hash
+ * Returns: a not-particularly-well-distributed hash
+ *          of the id
+ */
+static unsigned int connection_key(unsigned int *id)
+{
+    return(id[0]^id[1]);
+}
+
+
+/* Function:  remove_connection
+ * Arguments: c: the connection to remove
+ */
+void remove_connection(void *arg)
+{
+        connection c = arg;
+        unsigned int id[2];
+        
+        id[0]=c->ip;
+        id[1]=c->port;
+        hash_table_remove(c->m->connections,id);
+        close(c->fd);
+        free(c);
+}
+
+
+/* Function:  read_connection: 
+ * Arguments: c:    the connection to read from 
+ *            dest: the buffer to read into
+ *            len:  the number of bytes to read   
+ * Returns: success as 1, or failure as 0
+ *
+ *   read_connection() reads data from the connection, continuing
+ *   to read partial results until the request is satisfied or
+ *   it errors. TODO: this read should be covered by signal protection.
+ */
+int read_connection(connection c,
+                    unsigned char *dest,
+                    int len)
+{
+    int offset=0,rc;
+
+    if (len){
+        do {
+            if((rc=syscall(SYS_read, c->fd, dest+offset, len-offset))<=0){
+                if (errno==EINTR) {
+                    rc=0;
+                } else {
+                    remove_connection(c);
+                    return(0);
+                }
+            }
+            offset+=rc;
+        } while (offset<len);
+    }
+    return(1);
+}
+
+static int connection_input(void *d)
+{
+        connection c = d;
+        return((*c->m->handler)(c->m->handler_arg,c));
+}
+
+
+/* Function:  allocate_connection
+ * Arguments: m:    the connection manager the allocation is occurring in
+ *            ip, port: the peer endpoint address for this connection
+ *            fd:   open file descriptor for the socket
+ * Returns: an allocated connection structure
+ *
+ * just encompasses the action common to active and passive
+ *  connections of allocation and placement in the global table
+ */
+static connection allocate_connection(manager m,
+                               unsigned int ip,
+                               unsigned short port,
+                               int fd)
+{
+    connection c=malloc(sizeof(struct connection));
+    unsigned int id[2];
+    c->m=m;
+    c->fd=fd;
+    c->ip=ip;
+    c->port=port;
+    id[0]=ip;
+    id[1]=port;
+    register_io_handler(fd,READ_HANDLER,connection_input,c);
+    hash_table_insert(m->connections,c,id);
+    return(c);
+}
+
+
+/* Function:  new_connection
+ * Arguments: z: opaque argument holding the connection manager
+ * Returns: 1 in order to reregister for new connection requests
+ *
+ *  called when the bound service socket receives
+ *     a new connection request, it always accepts and
+ *     installs a new connection
+ */
+static int new_connection(void *z)
+{
+    manager m=z;
+    struct sockaddr_in s;
+    int len=sizeof(struct sockaddr_in);
+    int fd=accept(m->bound,(struct sockaddr *)&s,&len);
+    unsigned int nid=*((unsigned int *)&s.sin_addr);
+    /* cfs specific hack */
+    //unsigned short pid=s.sin_port;
+    allocate_connection(m,htonl(nid),0/*pid*/,fd);
+    return(1);
+}
+
+
+/* Function:  force_tcp_connection
+ * Arguments: m: the connection manager
+ *            ip, port: the peer endpoint for the connection
+ * Returns: an allocated connection structure, either
+ *          a pre-existing one, or a new connection
+ */
+connection force_tcp_connection(manager m,
+                                unsigned int ip,
+                                unsigned short port)
+{
+    connection c;
+    struct sockaddr_in addr;
+    unsigned int id[2];
+
+    port = tcpnal_acceptor_port;
+
+    id[0]=ip;
+    id[1]=port;
+
+    if (!(c=hash_table_find(m->connections,id))){
+        int fd;
+
+        bzero((char *) &addr, sizeof(addr));
+        addr.sin_family      = AF_INET;
+        addr.sin_addr.s_addr = htonl(ip);
+        addr.sin_port        = htons(port);
+
+        if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { 
+            perror("tcpnal socket failed");
+            exit(-1);
+        }
+        if (connect(fd,
+                    (struct sockaddr *)&addr,
+                    sizeof(struct sockaddr_in)))
+            {
+                perror("tcpnal connect");
+                return(0);
+            }
+        return(allocate_connection(m,ip,port,fd));
+    }
+    return(c);
+}
+
+
+/* Function:  bind_socket
+ * Arguments: m: the connection manager for this interface
+ *            port: the port to attempt to bind to
+ * Returns: 1 on success, or 0 on error
+ *
+ * bind_socket() attempts to allocate and bind a socket to the requested
+ *  port, or dynamically assign one from the kernel should the port be
+ *  zero. Sets the bound and bound_handler elements of m.
+ *
+ *  TODO: The port should be an explicitly sized type.
+ */
+static int bind_socket(manager m,unsigned short port)
+{
+    struct sockaddr_in addr;
+    int alen=sizeof(struct sockaddr_in);
+    
+    if ((m->bound = socket(AF_INET, SOCK_STREAM, 0)) < 0)  
+        return(0);
+    
+    bzero((char *) &addr, sizeof(addr));
+    addr.sin_family      = AF_INET;
+    addr.sin_addr.s_addr = 0;
+    addr.sin_port        = port; 
+    
+    if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){
+        perror ("tcpnal bind"); 
+        return(0);
+    }
+    
+    getsockname(m->bound,(struct sockaddr *)&addr, &alen);
+
+    m->bound_handler=register_io_handler(m->bound,READ_HANDLER,
+                                         new_connection,m);
+    listen(m->bound,5); 
+    m->port=addr.sin_port;
+    return(1);
+}
+
+
+/* Function:  shutdown_connections
+ * Arguments: m: the manager structure
+ *
+ * close all connections and reclaim resources
+ */
+void shutdown_connections(manager m)
+{
+    close(m->bound);
+    remove_io_handler(m->bound_handler);
+    hash_destroy_table(m->connections,remove_connection);
+    free(m);
+}
+
+
+/* Function:  init_connections
+ * Arguments: pid: the local port to attempt to bind to
+ *            input: handler called when data arrives on a connection
+ *            a: opaque argument passed to the handler
+ * Returns: a newly allocated manager structure, or
+ *          zero if the fixed port could not be bound
+ */
+manager init_connections(unsigned short pid,
+                         int (*input)(void *, void *),
+                         void *a)
+{
+    manager m=(manager)malloc(sizeof(struct manager));
+    m->connections=hash_create_table(compare_connection,connection_key);
+    m->handler=input;
+    m->handler_arg=a;
+    if (bind_socket(m,pid)) return(m);
+    free(m);
+    return(0);
+}
diff --git a/lustre/portals/unals/connection.h b/lustre/portals/unals/connection.h
new file mode 100644 (file)
index 0000000..6f57287
--- /dev/null
@@ -0,0 +1,32 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#include <table.h>
+
+typedef struct manager {
+    table connections;
+    int bound;
+    io_handler bound_handler;
+    int (*handler)(void *, void *);
+    void *handler_arg;
+    unsigned short port;
+} *manager;
+
+
+typedef struct connection {
+    unsigned int ip;
+    unsigned short port;
+    int fd;
+    manager m;
+} *connection;
+
+connection force_tcp_connection(manager m, unsigned int ip, unsigned short port);
+manager init_connections(unsigned short, int (*f)(void *, void *), void *);
+void remove_connection(void *arg);
+void shutdown_connections(manager m);
+int read_connection(connection c, unsigned char *dest, int len);
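Since tcpnal.c, the real consumer of this interface, is not shown here, a
hypothetical sketch of how the manager might be driven can help; the handler
name, peer address and wrapper function below are illustrative only, and the
include order is copied from connection.c:

    #include <pqtimer.h>
    #include <dispatch.h>
    #include <table.h>
    #include <connection.h>

    /* invoked through the manager's handler whenever a connection is readable */
    static int example_input(void *arg, void *conn)
    {
            unsigned char byte;

            /* read_connection() returns 0 on failure, after removing the
             * connection from the manager's hash table */
            return read_connection((connection)conn, &byte, 1);
    }

    static void example(unsigned int peer_ip, unsigned short peer_port)
    {
            /* bind locally (0 lets the kernel pick a port) and register
             * example_input as the per-connection input handler */
            manager m = init_connections(0, example_input, NULL);
            connection c;

            if (m == NULL)
                    return;

            /* opens a TCP connection to the peer on demand,
             * or reuses a cached one */
            c = force_tcp_connection(m, peer_ip, peer_port);
            if (c != NULL) {
                    /* the NAL would now write to c->fd and wait for
                     * example_input() to fire on the reply */
            }
            shutdown_connections(m);
    }

The per-connection read handler is wired up indirectly: allocate_connection()
registers connection_input() with the select loop via register_io_handler(),
and connection_input() then calls the manager's handler with the connection as
its second argument.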
diff --git a/lustre/portals/unals/debug.c b/lustre/portals/unals/debug.c
new file mode 100644 (file)
index 0000000..529bb2d
--- /dev/null
@@ -0,0 +1,119 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <sys/time.h>
+
+int smp_processor_id = 1;
+char debug_file_path[1024] = "/tmp/lustre-log";
+char debug_file_name[1024];
+FILE *debug_file_fd;
+
+int portals_do_debug_dumplog(void *arg)
+{
+        printf("Look in %s\n", debug_file_name);
+        return 0;
+}
+
+
+void portals_debug_print(void)
+{
+        return;
+}
+
+
+void portals_debug_dumplog(void)
+{
+        printf("Look in %s\n", debug_file_name);
+        return;
+}
+
+
+int portals_debug_init(unsigned long bufsize)
+{ 
+        debug_file_fd = stdout;
+        return 0;
+}
+
+int portals_debug_cleanup(void)
+{
+        return 0; //close(portals_debug_fd);
+}
+
+int portals_debug_clear_buffer(void)
+{
+        return 0;
+}
+
+int portals_debug_mark_buffer(char *text)
+{
+
+        fprintf(debug_file_fd, "*******************************************************************************\n");
+        fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text);
+        fprintf(debug_file_fd, "*******************************************************************************\n");
+
+        return 0;
+}
+
+int portals_debug_copy_to_user(char *buf, unsigned long len)
+{
+        return 0;
+}
+
+/* FIXME: I'm not very smart; someone smarter should make this better. */
+void
+portals_debug_msg (int subsys, int mask, char *file, char *fn, int line,
+                   const char *format, ...)
+{
+        va_list       ap;
+        unsigned long flags;
+        struct timeval tv;
+        int nob = 0;
+
+
+        /* NB since we pass a non-zero sized buffer (at least) on the first
+         * print, we can be assured that by the end of all the snprinting,
+         * we _do_ have a terminated buffer, even if our message got truncated.
+         */
+
+        gettimeofday(&tv, NULL);
+
+        nob += fprintf(debug_file_fd,
+                              "%02x:%06x:%d:%lu.%06lu ",
+                              subsys >> 24, mask, smp_processor_id,
+                              tv.tv_sec, tv.tv_usec);
+
+        nob += fprintf(debug_file_fd,
+                            "(%s:%d:%s() %d+%ld): ",
+                            file, line, fn, 0,
+                            8192 - ((unsigned long)&flags & 8191UL));
+
+        va_start (ap, format);
+        nob += vfprintf(debug_file_fd, format, ap);
+        va_end (ap);
+
+
+}
+
diff --git a/lustre/portals/unals/dispatch.h b/lustre/portals/unals/dispatch.h
new file mode 100644 (file)
index 0000000..34dd070
--- /dev/null
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* this file is only called dispatch.h to prevent it
+   from colliding with /usr/include/sys/select.h */
+
+typedef struct io_handler *io_handler;
+
+struct io_handler{
+  io_handler *last;
+  io_handler next;
+  int fd;
+  int type;
+  int (*function)(void *);
+  void *argument;
+  int disabled;
+};
+
+
+#define READ_HANDLER 1
+#define WRITE_HANDLER 2
+#define EXCEPTION_HANDLER 4
+#define ALL_HANDLER (READ_HANDLER | WRITE_HANDLER | EXCEPTION_HANDLER)
+
+io_handler register_io_handler(int fd,
+                               int type,
+                               int (*function)(void *),
+                               void *arg);
+
+void remove_io_handler (io_handler i);
+void init_unix_timer(void);
+void select_timer_block(when until);
+when now(void);
diff --git a/lustre/portals/unals/ipmap.h b/lustre/portals/unals/ipmap.h
new file mode 100644 (file)
index 0000000..85b1e18
--- /dev/null
@@ -0,0 +1,38 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#define DIRECT_IP_MODE
+#ifdef DIRECT_IP_MODE
+#define PNAL_NID(in_addr, port) (in_addr)
+#define PNAL_PID(pid) (pid)
+#define PNAL_IP(in_addr, port) (in_addr)
+#define PNAL_PORT(nid, pid) (pid)
+#else
+
+#define PNAL_BASE_PORT 4096
+#define PNAL_HOSTID_SHIFT 24
+#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1)
+#define PNAL_VNODE_SHIFT 8
+#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1)
+#define PNAL_PID_SHIFT 8
+#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1)
+
+#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \
+                                    << PNAL_VNODE_SHIFT) \
+                                   | (((ntohs(port)-PNAL_BASE_PORT) >>\
+                                       PNAL_PID_SHIFT)))
+#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT)  & PNAL_PID_MASK)
+
+#define PNAL_IP(nid,t)  (htonl((((unsigned)(nid))\
+                                >> PNAL_VNODE_SHIFT)\
+                               | (t->iptop8 << PNAL_HOSTID_SHIFT)))
+#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \
+                                 << PNAL_VNODE_SHIFT) \
+                                | ((pid) & PNAL_PID_MASK)) \
+                               + PNAL_BASE_PORT))
+#endif
diff --git a/lustre/portals/unals/pqtimer.c b/lustre/portals/unals/pqtimer.c
new file mode 100644 (file)
index 0000000..fa2fb4f
--- /dev/null
@@ -0,0 +1,226 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* pqtimer.c:
+ *   this file implements a simple priority-queue based timer system. when
+ * combined with a file which implements now() and block(), it can
+ * be used to provide coarse-grained time-based callbacks.
+ */
+
+#include <pqtimer.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct timer {
+  void (*function)(void *);
+  void *arg;
+  when w;
+  int interval;
+  int disable;
+};
+
+typedef struct thunk *thunk;
+struct thunk {
+    void (*f)(void *);
+    void *a;
+    thunk next;
+};
+
+extern when now(void);
+
+static thunk thunks;
+static int internal;
+static void (*block_function)(when);
+static int number_of_timers;
+static int size_of_pqueue;
+static timer *timers;
+
+
+static void heal(int where)
+{
+    int left=(where<<1);
+    int right=(where<<1)+1;
+    int min=where;
+    timer temp;
+  
+    if (left <= number_of_timers)
+       if (timers[left]->w < timers[min]->w) min=left;
+    if (right <= number_of_timers)
+       if (timers[right]->w < timers[min]->w) min=right;
+    if (min != where){
+       temp=timers[where];
+       timers[where]=timers[min];
+       timers[min]=temp;
+       heal(min);
+    }
+}
+
+static void add_pqueue(int i)
+{
+    timer temp;
+    int parent=(i>>1);
+    if ((i>1) && (timers[i]->w< timers[parent]->w)){
+       temp=timers[i];
+       timers[i]=timers[parent];
+       timers[parent]=temp;
+       add_pqueue(parent);
+    }
+}
+
+static void add_timer(timer t)
+{
+    if (size_of_pqueue<(number_of_timers+2)){
+       int oldsize=size_of_pqueue;
+       timer *new=(void *)malloc(sizeof(struct timer)*(size_of_pqueue+=10));
+       memcpy(new,timers,sizeof(timer)*oldsize);
+       timers=new;
+    }
+    timers[++number_of_timers]=t;
+    add_pqueue(number_of_timers);
+}
+
+/* Function: register_timer
+ * Arguments: interval: the time interval from the current time when
+ *                      the timer function should be called
+ *            function: the function to call when the time has expired
+ *            argument: the argument to call it with.
+ * Returns: a pointer to a timer structure
+ */
+timer register_timer(when interval,
+                    void (*function)(void *),
+                    void *argument)
+{
+    timer t=(timer)malloc(sizeof(struct timer));
+
+    t->arg=argument;
+    t->function=function;
+    t->interval=interval;
+    t->disable=0;
+    t->w=now()+interval;
+    add_timer(t);
+    if (!internal && (number_of_timers==1))
+        block_function(t->w);
+    return(t);
+}
+
+/* Function: remove_timer
+ * Arguments: t: the timer to remove
+ * Returns: nothing
+ *
+ * remove_timer removes a timer from the system, ensuring
+ * that it will never be called. It does not actually
+ * free the timer due to reentrancy issues.
+ */
+
+void remove_timer(timer t)
+{
+    t->disable=1;
+}
+
+
+
+void timer_fire()
+{
+    timer current;
+
+    current=timers[1];
+    timers[1]=timers[number_of_timers--];
+    heal(1);
+    if (!current->disable) {
+        (*current->function)(current->arg);
+    }
+    free(current);
+}
+
+when next_timer(void)
+{
+    when here=now();
+
+    while (number_of_timers && (timers[1]->w <= here)) timer_fire();
+    if (number_of_timers) return(timers[1]->w);
+    return(0);
+}
+
+/* Function: timer_loop
+ * Arguments: none
+ * Returns: never
+ * 
+ * timer_loop() is the blocking dispatch function for the timer.
+ * It calls the block() function registered with initialize_timer(),
+ * and dispatches the handlers associated with timers that have expired.
+ */
+void timer_loop()
+{
+    when here;
+
+    while (1){
+       thunk z;
+       here=now();
+
+       for (z=thunks;z;z=z->next) (*z->f)(z->a);
+
+       if (number_of_timers){
+           if (timers[1]->w > here){
+               (*block_function)(timers[1]->w);
+           } else {
+                timer_fire();
+           }
+       } else {
+           thunk z;
+           for (z=thunks;z;z=z->next) (*z->f)(z->a);
+           (*block_function)(0);
+       }
+    }
+}
+
+
+/* Function: register_thunk
+ * Arguments: f: the function to call
+ *            a: the single argument to call it with
+ *
+ * Thunk functions get called at irregular intervals; they
+ * should not assume when they run, or take a particularly long
+ * amount of time. Thunks are for background cleanup tasks.
+ */
+void register_thunk(void (*f)(void *),void *a)
+{
+    thunk t=(void *)malloc(sizeof(struct thunk));
+    t->f=f;
+    t->a=a;
+    t->next=thunks;
+    thunks=t;
+}
+
+/* Function: initialize_timer
+ * Arguments: block: the function to call to block for the specified interval 
+ *
+ * initialize_timer() must be called before any other timer function,
+ * including timer_loop.
+ */
+void initialize_timer(void (*block)(when))
+{
+    block_function=block;
+    number_of_timers=0;
+    size_of_pqueue=10;
+    timers=(timer *)malloc(sizeof(timer)*size_of_pqueue);
+    thunks=0;
+}
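
A minimal usage sketch of the timer facility above (not part of the commit). It assumes the select()-based blocker from select.c later in this change, installed through init_unix_timer():

#include <stdio.h>
#include <pqtimer.h>

extern void init_unix_timer(void);        /* provided by select.c */

static void tick(void *arg)
{
        printf("fired: %s\n", (char *)arg);
}

int main(void)
{
        init_unix_timer();                              /* registers select_timer_block */
        register_timer(HZ, tick, (void *)"one second"); /* HZ == 1s in 32.32 fixed point */
        timer_loop();                                   /* dispatches timers, never returns */
        return 0;
}
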
diff --git a/lustre/portals/unals/pqtimer.h b/lustre/portals/unals/pqtimer.h
new file mode 100644 (file)
index 0000000..11efb0e
--- /dev/null
@@ -0,0 +1,25 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+typedef unsigned long long when;
+when now(void);
+typedef struct timer *timer;
+timer register_timer(when interval,
+                    void (*function)(void *),
+                    void *argument);
+timer register_timer_wait(void);
+void remove_timer(timer);
+void timer_loop(void);
+void initialize_timer(void (*block)(when));
+void timer_fire(void);
+
+
+#define HZ 0x100000000ull
+
+
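
For orientation (not part of the commit): 'when' is a 32.32 fixed-point timestamp with seconds in the upper word, which is why HZ above is 2^32. A couple of hypothetical conversion helpers as a sketch:

/* hypothetical helpers, assuming the 32.32 fixed-point 'when' above */
static inline when seconds_to_when(unsigned int s)
{
        return ((when)s) << 32;                 /* s * HZ */
}

static inline unsigned int when_frac_to_usec(when w)
{
        return (unsigned int)(((w & 0xffffffffull) * 1000000) >> 32);
}
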
diff --git a/lustre/portals/unals/procapi.c b/lustre/portals/unals/procapi.c
new file mode 100644 (file)
index 0000000..6da3210
--- /dev/null
@@ -0,0 +1,283 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* api.c:
+ *  This file provides the 'api' side for the process-based nals.
+ *  it is responsible for creating the 'library' side thread,
+ *  and passing wrapped portals transactions to it.
+ *
+ *  Along with initialization, shutdown, and transport to the library
+ *  side, this file contains some stubs to satisfy the nal definition.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syscall.h>
+#include <procbridge.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <errno.h>
+
+
+/* Function: forward
+ * Arguments: nal_t *nal: pointer to my top-side nal structure
+ *            id: the command to pass to the lower layer
+ *            args, args_len:pointer to and length of the request
+ *            ret, ret_len:  pointer to and size of the result
+ * Returns: a portals status code
+ *
+ * forwards a packaged api call from the 'api' side to the 'library'
+ *   side, and collects the result
+ */
+#define forward_failure(operand,fd,buffer,length)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          lib_fini(b->nal_cb);\
+          return(PTL_SEGV);\
+       }
+static int procbridge_forward(nal_t *n, int id, void *args, ptl_size_t args_len,
+                             void *ret, ptl_size_t ret_len)
+{
+    bridge b=(bridge)n->nal_data;
+    procbridge p=(procbridge)b->local;
+    int lib=p->to_lib[1];
+    int k;
+
+    forward_failure(write,lib, &id, sizeof(id));
+    forward_failure(write,lib,&args_len, sizeof(args_len));
+    forward_failure(write,lib,&ret_len, sizeof(ret_len));
+    forward_failure(write,lib,args, args_len);
+
+    do {
+        k=syscall(SYS_read, p->from_lib[0], ret, ret_len);
+    } while ((k!=ret_len) && (errno == EINTR));
+
+    if(k!=ret_len){
+        perror("nal: read return block");
+        return PTL_SEGV;
+    }
+    return (PTL_OK);
+}
+#undef forward_failure
+
+
+/* Function: shutdown
+ * Arguments: nal: a pointer to my top side nal structure
+ *            ni: my network interface index
+ *
+ * clean up nal state, reclaim the lower side thread and
+ *   its state using the PTL_FINI codepoint
+ */
+static int procbridge_shutdown(nal_t *n, int ni)
+{
+    bridge b=(bridge)n->nal_data;
+    procbridge p=(procbridge)b->local;
+    int code=PTL_FINI;
+
+    syscall(SYS_write, p->to_lib[1],&code,sizeof(code));
+    syscall(SYS_read, p->from_lib[0],&code,sizeof(code));
+
+    syscall(SYS_close, p->to_lib[0]);
+    syscall(SYS_close, p->to_lib[1]);
+    syscall(SYS_close, p->from_lib[0]);
+    syscall(SYS_close, p->from_lib[1]);
+
+    free(p);
+    return(0);
+}
+
+
+/* Function: validate
+ *    useless stub
+ */
+static int procbridge_validate(nal_t *nal, void *base, ptl_size_t extent)
+{
+    return(0);
+}
+
+
+/* Function: yield
+ * Arguments:  n: pointer to my top-side nal structure
+ *
+ *  this function was originally intended to let the
+ *   lower half thread be scheduled so it could make progress. we
+ *   overload it to explicitly block until signalled by the
+ *   lower half.
+ */
+static void procbridge_yield(nal_t *n)
+{
+    bridge b=(bridge)n->nal_data;
+    procbridge p=(procbridge)b->local;
+
+    pthread_mutex_lock(&p->mutex);
+    pthread_cond_wait(&p->cond,&p->mutex);
+    pthread_mutex_unlock(&p->mutex);
+}
+
+
+static void procbridge_lock(nal_t * nal, unsigned long *flags){}
+static void procbridge_unlock(nal_t * nal, unsigned long *flags){}
+/* api_nal
+ *  the interface vector to allow the generic code to access
+ *  this nal. this is separate from the library side nal_cb.
+ *  TODO: should be dynamically allocated
+ */
+static nal_t api_nal = {
+    ni:       {0},
+    nal_data: NULL,
+    forward:  procbridge_forward,
+    shutdown: procbridge_shutdown,
+    validate: procbridge_validate,
+    yield:    procbridge_yield,
+    lock:     procbridge_lock,
+    unlock:   procbridge_unlock
+};
+
+/* Function: bridge_init
+ *
+ * Arguments:  pid: requested process id (port offset)
+ *                  PTL_ID_ANY not supported.
+ *             desired: limits passed from the application
+ *                      and effectively ignored
+ *             actual:  limits actually allocated and returned
+ *
+ * Returns: a pointer to my statically allocated top side NAL
+ *          structure
+ *
+ * initializes the tcp nal. we define unix_failure as an
+ * error wrapper to cut down clutter.
+ */
+#define unix_failure(operand,fd,buffer,length,text)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          perror(text);\
+          return(NULL);\
+       }
+#if 0
+static nal_t *bridge_init(ptl_interface_t nal,
+                          ptl_pid_t pid_request,
+                          ptl_ni_limits_t *desired,
+                          ptl_ni_limits_t *actual,
+                          int *rc)
+{
+    procbridge p;
+    bridge b;
+    static int initialized=0;
+    ptl_ni_limits_t limits = {-1,-1,-1,-1,-1};
+
+    if(initialized) return (&api_nal);
+
+    init_unix_timer();
+
+    b=(bridge)malloc(sizeof(struct bridge));
+    p=(procbridge)malloc(sizeof(struct procbridge));
+    api_nal.nal_data=b;
+    b->local=p;
+
+    if(pipe(p->to_lib) || pipe(p->from_lib)) {
+        perror("nal_init: pipe");
+        return(NULL);
+    }
+
+    if (desired) limits = *desired;
+    unix_failure(write,p->to_lib[1], &pid_request, sizeof(pid_request),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &nal, sizeof(ptl_interface_t),
+                       "nal_init: write");
+
+    if(pthread_create(&p->t, NULL, nal_thread, b)) {
+        perror("nal_init: pthread_create");
+        return(NULL);
+    }
+
+    unix_failure(read,p->from_lib[0], actual, sizeof(ptl_ni_limits_t),
+                 "tcp_init: read");
+    unix_failure(read,p->from_lib[0], rc, sizeof(rc),
+                 "nal_init: read");
+
+    if(*rc) return(NULL);
+
+    initialized = 1;
+    pthread_mutex_init(&p->mutex,0);
+    pthread_cond_init(&p->cond, 0);
+
+    return (&api_nal);
+}
+#endif
+
+ptl_nid_t tcpnal_mynid;
+
+nal_t *procbridge_interface(int num_interface,
+                            ptl_pt_index_t ptl_size,
+                            ptl_ac_index_t acl_size,
+                            ptl_pid_t requested_pid)
+{
+    procbridge p;
+    bridge b;
+    static int initialized=0;
+    ptl_ni_limits_t limits = {-1,-1,-1,-1,-1};
+    int rc, nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */
+
+    if(initialized) return (&api_nal);
+
+    init_unix_timer();
+
+    b=(bridge)malloc(sizeof(struct bridge));
+    p=(procbridge)malloc(sizeof(struct procbridge));
+    api_nal.nal_data=b;
+    b->local=p;
+
+    if(pipe(p->to_lib) || pipe(p->from_lib)) {
+        perror("nal_init: pipe");
+        return(NULL);
+    }
+
+    if (ptl_size)
+           limits.max_ptable_index = ptl_size;
+    if (acl_size)
+           limits.max_atable_index = acl_size;
+
+    unix_failure(write,p->to_lib[1], &requested_pid, sizeof(requested_pid),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t),
+                       "nal_init: write");
+    unix_failure(write,p->to_lib[1], &nal_type, sizeof(nal_type),
+                       "nal_init: write");
+
+    if(pthread_create(&p->t, NULL, nal_thread, b)) {
+        perror("nal_init: pthread_create");
+        return(NULL);
+    }
+
+    unix_failure(read,p->from_lib[0], &rc, sizeof(rc),
+                 "nal_init: read");
+
+    if(rc) return(NULL);
+
+    b->nal_cb->ni.nid = tcpnal_mynid;
+    initialized = 1;
+    pthread_mutex_init(&p->mutex,0);
+    pthread_cond_init(&p->cond, 0);
+
+    return (&api_nal);
+}
+#undef unix_failure
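
A hedged bring-up sketch (not part of the commit): the application sets tcpnal_mynid and asks procbridge_interface() for the user-space tcp nal; under DIRECT_IP_MODE the requested pid doubles as the tcp port. The header path and the literal values here are assumptions.

#include <stdio.h>
#include <portals/p30.h>        /* assumed location of the portals types */
#include <procbridge.h>

extern ptl_nid_t tcpnal_mynid;

int main(void)
{
        nal_t *nal;

        tcpnal_mynid = 0x0a010203;             /* this node's IPv4 address, 10.1.2.3 */
        nal = procbridge_interface(0,          /* interface number                   */
                                   MAX_PTLS,   /* portal table size                  */
                                   MAX_ACLS,   /* access control table size          */
                                   9000);      /* requested pid, i.e. the tcp port   */
        if (nal == NULL) {
                fprintf(stderr, "tcp nal initialization failed\n");
                return 1;
        }
        /* the nal_t would normally be handed on to the portals api from here */
        return 0;
}
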
diff --git a/lustre/portals/unals/procbridge.h b/lustre/portals/unals/procbridge.h
new file mode 100644 (file)
index 0000000..060ae7b
--- /dev/null
@@ -0,0 +1,40 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#ifndef _PROCBRIDGE_H_
+#define _PROCBRIDGE_H_
+
+#include <pthread.h>
+#include <bridge.h>
+#include <ipmap.h>
+
+
+typedef struct procbridge {
+    pthread_t t;
+    pthread_cond_t cond;
+    pthread_mutex_t mutex;
+    int to_lib[2];
+    int from_lib[2];
+} *procbridge;
+
+extern void *nal_thread(void *);
+
+
+#define PTL_INIT        (LIB_MAX_DISPATCH+1)
+#define PTL_FINI        (LIB_MAX_DISPATCH+2)
+
+#define MAX_ACLS        1
+#define MAX_PTLS        128
+
+extern void set_address(bridge t,ptl_pid_t pidrequest);
+extern nal_t *procbridge_interface(int num_interface,
+                            ptl_pt_index_t ptl_size,
+                            ptl_ac_index_t acl_size,
+                            ptl_pid_t requested_pid);
+
+#endif
diff --git a/lustre/portals/unals/proclib.c b/lustre/portals/unals/proclib.c
new file mode 100644 (file)
index 0000000..c3ee103
--- /dev/null
@@ -0,0 +1,270 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* lib.c:
+ *  This file provides the 'library' side for the process-based nals.
+ *  it is responsible for communication with the 'api' side and
+ *  providing service to the generic portals 'library'
+ *  implementation. 'library' might be better termed 'communication'
+ *  or 'kernel'.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <procbridge.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <errno.h>
+#include <timer.h>
+//#include <util/pqtimer.h>
+#include <dispatch.h>
+
+/* the following functions are stubs to satisfy the nal definition
+   without doing anything particularly useful */
+
+static int nal_write(nal_cb_t *nal,
+                     void *private,
+                     user_ptr dst_addr,
+                     void *src_addr,
+                     ptl_size_t len)
+{
+    memcpy(dst_addr, src_addr, len);
+    return 0;
+}
+
+static int nal_read(nal_cb_t * nal,
+                    void *private,
+                   void *dst_addr,
+                   user_ptr src_addr,
+                   size_t len)
+{
+       memcpy(dst_addr, src_addr, len);
+       return 0;
+}
+
+static void *nal_malloc(nal_cb_t *nal,
+                        ptl_size_t len)
+{
+    void *buf =  malloc(len);
+    return buf;
+}
+
+static void nal_free(nal_cb_t *nal,
+                     void *buf,
+                     ptl_size_t len)
+{
+    free(buf);
+}
+
+static void nal_printf(nal_cb_t *nal,
+                       const char *fmt,
+                       ...)
+{
+    va_list        ap;
+
+    va_start(ap, fmt);
+    vprintf(fmt, ap);
+    va_end(ap);
+}
+
+
+static void nal_cli(nal_cb_t *nal,
+                    unsigned long *flags)
+{
+}
+
+
+static void nal_sti(nal_cb_t *nal,
+                    unsigned long *flags)
+{
+}
+
+
+static int nal_dist(nal_cb_t *nal,
+                    ptl_nid_t nid,
+                    unsigned long *dist)
+{
+    return 0;
+}
+    
+
+
+/* Function:  data_from_api
+ * Arguments: t: the nal state for this interface
+ * Returns: whether to continue reading from the pipe
+ *
+ *   data_from_api() reads data from the api side in response
+ *   to a select.
+ *
+ *   We define data_failure() for syntactic convenience
+ *   of unix error reporting.
+ */
+
+#define data_failure(operand,fd,buffer,length)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          lib_fini(b->nal_cb);\
+          return(0);\
+       }
+static int data_from_api(void *arg)
+{
+    bridge b = arg;
+    procbridge p=(procbridge)b->local;
+    /* where are these two sizes derived from ??*/
+    char arg_block[ 256 ];
+    char ret_block[ 128 ];
+    ptl_size_t arg_len,ret_len;
+    int fd=p->to_lib[0];
+    int index;
+
+    data_failure(read,fd, &index, sizeof(index));
+
+    if (index==PTL_FINI) {
+        lib_fini(b->nal_cb);
+        if (b->shutdown) (*b->shutdown)(b);
+        syscall(SYS_write, p->from_lib[1],&b->alive,sizeof(b->alive));
+
+        /* a heavy-handed but convenient way of shutting down
+           the lower side thread */
+        pthread_exit(0);
+    }
+
+    data_failure(read,fd, &arg_len, sizeof(arg_len));
+    data_failure(read,fd, &ret_len, sizeof(ret_len));
+    data_failure(read,fd, arg_block, arg_len);
+
+    lib_dispatch(b->nal_cb, NULL, index, arg_block, ret_block);
+
+    data_failure(write,p->from_lib[1],ret_block, ret_len);
+    return(1);
+}
+#undef data_failure
+
+
+
+static void wakeup_topside(void *z)
+{
+    bridge b=z;
+    procbridge p=b->local;
+
+    pthread_mutex_lock(&p->mutex);
+    pthread_cond_broadcast(&p->cond);
+    pthread_mutex_unlock(&p->mutex);
+}
+
+
+/* Function:  nal_thread
+ * Arguments: z: an opaque reference to a nal control structure
+ *               allocated and partially populated by the api level code
+ * Returns: nothing; it only returns at all on error or explicit shutdown
+ *
+ *  This function is the entry point of the pthread initiated on 
+ *  the api side of the interface. This thread is used to handle
+ *  asynchronous delivery to the application.
+ * 
+ *  We define a limit macro to place a ceiling on limits
+ *   for syntactic convenience
+ */
+#define LIMIT(x,y,max)\
+     if ((unsigned int)x > max) y = max;
+
+extern int tcpnal_init(bridge);
+
+nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0};
+
+void *nal_thread(void *z)
+{
+    bridge b=z;
+    procbridge p=b->local;
+    int rc;
+    ptl_pid_t pid_request;
+    int nal_type;
+    ptl_ni_limits_t desired;
+    ptl_ni_limits_t actual;
+    
+    b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t));
+    b->nal_cb->nal_data=b;
+    b->nal_cb->cb_read=nal_read;
+    b->nal_cb->cb_write=nal_write;
+    b->nal_cb->cb_malloc=nal_malloc;
+    b->nal_cb->cb_free=nal_free;
+    b->nal_cb->cb_map=NULL;
+    b->nal_cb->cb_unmap=NULL;
+    b->nal_cb->cb_printf=nal_printf;
+    b->nal_cb->cb_cli=nal_cli;
+    b->nal_cb->cb_sti=nal_sti;
+    b->nal_cb->cb_dist=nal_dist;
+
+
+    register_io_handler(p->to_lib[0],READ_HANDLER,data_from_api,(void *)b);
+
+    if(!(rc = syscall(SYS_read, p->to_lib[0], &pid_request, sizeof(pid_request))))
+        perror("procbridge read from api");
+    if(!(rc = syscall(SYS_read, p->to_lib[0], &desired, sizeof(ptl_ni_limits_t))))
+        perror("procbridge read from api");
+    if(!(rc = syscall(SYS_read, p->to_lib[0], &nal_type, sizeof(nal_type))))
+        perror("procbridge read from api");
+
+    actual = desired;
+    LIMIT(desired.max_match_entries,actual.max_match_entries,MAX_MES);
+    LIMIT(desired.max_mem_descriptors,actual.max_mem_descriptors,MAX_MDS);
+    LIMIT(desired.max_event_queues,actual.max_event_queues,MAX_EQS);
+    LIMIT(desired.max_atable_index,actual.max_atable_index,MAX_ACLS);
+    LIMIT(desired.max_ptable_index,actual.max_ptable_index,MAX_PTLS);
+
+    set_address(b,pid_request);
+
+    if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b);
+    /* initialize the generic 'library' level code */
+
+    rc = lib_init(b->nal_cb, 
+                  b->nal_cb->ni.nid,
+                  b->nal_cb->ni.pid,
+                 10,
+                 actual.max_ptable_index,
+                 actual.max_atable_index);
+
+    /*
+     * Whatever the initialization returned is passed back to the
+     * user level code for further interpretation.  We just exit if
+     * it is non-zero since something went wrong.
+     */
+    /* this should perform error checking */
+#if 0
+    write(p->from_lib[1], &actual, sizeof(ptl_ni_limits_t));
+#endif
+    syscall(SYS_write, p->from_lib[1], &rc, sizeof(rc));
+    
+    if(!rc) {
+        /* the thunk function is called each time the timer loop
+           performs an operation and returns to blocking mode. we
+           overload this function to inform the api side that
+           it may be interested in looking at the event queue */
+        register_thunk(wakeup_topside,b);
+        timer_loop();
+    }
+    return(0);
+}
+#undef LIMIT
+
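
For reference (not part of the commit), the framing that procbridge_forward() in procapi.c writes and data_from_api() above reads on the to_lib pipe is:

/* per api call, on the to_lib pipe (both ends live in the same process,
 * so no byte swapping is involved):
 *
 *     int        id;          a library dispatch index, or PTL_FINI to shut down
 *     ptl_size_t args_len;    length of the marshalled argument block
 *     ptl_size_t ret_len;     length of the result block the caller expects
 *     char       args[args_len];
 *
 * the library thread replies with exactly ret_len bytes on from_lib;
 * PTL_FINI instead gets a single ack before the thread exits. */
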
diff --git a/lustre/portals/unals/select.c b/lustre/portals/unals/select.c
new file mode 100644 (file)
index 0000000..c4f84f4
--- /dev/null
@@ -0,0 +1,165 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* select.c:
+ *  Provides a general mechanism for registering and dispatching
+ *  io events through the select system call.
+ */
+
+#ifdef sun
+#include <sys/filio.h>
+#else
+#include <sys/ioctl.h>
+#endif
+
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdlib.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+
+
+static struct timeval beginning_of_epoch;
+static io_handler io_handlers;
+
+/* Function: now
+ *
+ * Return: the current time in canonical units: a 64 bit number
+ *   where the most significant 32 bits contains the number
+ *   of seconds, and the least significant a count of (1/(2^32))ths
+ *   of a second.
+ */
+when now()
+{
+    struct timeval result;
+  
+    gettimeofday(&result,0);
+    return((((unsigned long long)result.tv_sec)<<32)|
+           (((unsigned long long)result.tv_usec)<<32)/1000000);
+}
+
+
+/* Function: register_io_handler
+ * Arguments: fd: the file descriptor of interest
+ *            type: a mask of READ_HANDLER, WRITE_HANDLER, EXCEPTION_HANDLER
+ *            function: a function to call when io is available on fd
+ *            arg: an opaque correlator to return to the handler
+ * Returns: a pointer to the io_handler structure
+ */
+io_handler register_io_handler(int fd,
+                               int type,
+                               int (*function)(void *),
+                               void *arg)
+{
+    io_handler i=(io_handler)malloc(sizeof(struct io_handler));
+    if ((i->fd=fd)>=0){
+        i->type=type;
+        i->function=function;
+        i->argument=arg;
+        i->disabled=0;
+        i->last=&io_handlers;
+        if ((i->next=io_handlers)) i->next->last=&i->next;
+        io_handlers=i;
+    }
+    return(i);
+}
+
+/* Function: remove_io_handler
+ * Arguments: i: a pointer to the handler to stop servicing
+ *
+ * remove_io_handler() doesn't actually free the handler, due
+ * to reentrancy problems. it just marks the handler for 
+ * later cleanup by the blocking function.
+ */
+void remove_io_handler (io_handler i)
+{
+    i->disabled=1;
+}
+
+static void set_flag(io_handler n,fd_set *fds)
+{
+    if (n->type & READ_HANDLER) FD_SET(n->fd,fds);
+    if (n->type & WRITE_HANDLER) FD_SET(n->fd,fds+1);
+    if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd,fds+2);
+}
+
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return
+ * 
+ *   This function dispatches the various file descriptors' handler
+ *   functions, if the kernel indicates there is io available.
+ */
+void select_timer_block(when until)
+{
+    fd_set fds[3];
+    struct timeval timeout;
+    struct timeval *timeout_pointer;
+    int result;
+    io_handler j;
+    io_handler *k;
+
+    /* TODO: loop until the entire interval is expired*/
+    if (until){
+       when interval=until-now();
+        timeout.tv_sec=(interval>>32);
+        timeout.tv_usec=((interval & 0xffffffffull)*1000000)>>32; /* fractional part -> usec */
+        timeout_pointer=&timeout;
+    } else timeout_pointer=0;
+
+    FD_ZERO(fds);
+    FD_ZERO(fds+1);
+    FD_ZERO(fds+2);
+    for (k=&io_handlers;*k;){
+        if ((*k)->disabled){
+            j=*k;
+            *k=(*k)->next;
+            free(j);
+        }
+        if (*k) {
+           set_flag(*k,fds);
+           k=&(*k)->next;
+       }
+    }
+    result=select(FD_SETSIZE,fds,fds+1,fds+2,timeout_pointer);
+
+    if (result > 0)
+        for (j=io_handlers;j;j=j->next){
+            if (!(j->disabled) && 
+                ((FD_ISSET(j->fd,fds) && (j->type & READ_HANDLER)) ||
+                 (FD_ISSET(j->fd,fds+1) && (j->type & WRITE_HANDLER)) ||
+                 (FD_ISSET(j->fd,fds+2) && (j->type & EXCEPTION_HANDLER)))){
+                if (!(*j->function)(j->argument))
+                    j->disabled=1;
+            }
+        }
+}
+
+/* Function: init_unix_timer()
+ *   is called to initialize the library 
+ */
+void init_unix_timer()
+{
+    io_handlers=0;
+    gettimeofday(&beginning_of_epoch, 0);
+    initialize_timer(select_timer_block);
+}
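
A small sketch (not part of the commit) of using the dispatcher directly: watch stdin with a read handler and let timer_loop() drive select_timer_block():

#include <unistd.h>
#include <pqtimer.h>
#include <dispatch.h>

static int echo_stdin(void *arg)
{
        char buf[128];
        int  n = read(STDIN_FILENO, buf, sizeof(buf));

        if (n <= 0)
                return 0;                /* returning 0 disables this handler */
        (void)write(STDOUT_FILENO, buf, n);
        return 1;                        /* keep watching the descriptor */
}

int main(void)
{
        init_unix_timer();
        register_io_handler(STDIN_FILENO, READ_HANDLER, echo_stdin, 0);
        timer_loop();                    /* never returns */
        return 0;
}
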
diff --git a/lustre/portals/unals/table.c b/lustre/portals/unals/table.c
new file mode 100644 (file)
index 0000000..bef13c5
--- /dev/null
@@ -0,0 +1,264 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <table.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+/* table.c:
+ * a very simple hash table implementation with parameterizable
+ * comparison and key generation functions. it does resize
+ * in order to accommodate more entries, but never collapses
+ * the table
+ */
+
+static table_entry *table_lookup (table t,void *comparator,
+                                  unsigned int k,
+                                  int (*compare_function)(void *, void *),
+                                  int *success)
+{
+    unsigned int key=k%t->size;
+    table_entry *i;
+
+    for (i=&(t->entries[key]);*i;i=&((*i)->next)){
+        if (compare_function && ((*i)->key==k))
+            if ((*t->compare_function)((*i)->value,comparator)){
+                *success=1;
+                return(i);
+            }
+    }
+    *success=0;
+    return(&(t->entries[key]));
+}
+
+
+static void resize_table(table t, int size)
+{
+    int old_size=t->size;
+    table_entry *old_entries=t->entries;
+    int i; 
+    table_entry j,n;
+    table_entry *position;
+    int success;
+  
+    t->size=size;
+    t->entries=(table_entry *)malloc(sizeof(table_entry)*t->size);
+    memset(t->entries,0,sizeof(table_entry)*t->size);
+
+    for (i=0;i<old_size;i++)
+        for (j=old_entries[i];j;j=n){
+            n=j->next;
+            position=table_lookup(t,0,j->key,0,&success);
+            j->next= *position;
+            *position=j;
+        }
+    free(old_entries);
+}
+
+
+/* Function: key_from_int
+ * Arguments: int i: value to compute the key of
+ * Returns: the key 
+ */
+unsigned int key_from_int(int i)
+{
+    return(i);
+}
+
+
+/* Function: key_from_string
+ * Arguments: char *s: the null terminated string
+ *                     to compute the key of
+ * Returns: the key 
+ */
+unsigned int key_from_string(char *s)
+{
+    unsigned int result=0;
+    unsigned char *n;
+    int i;
+    if (!s) return(1);
+    for (n=s,i=0;*n;n++,i++) result^=(*n*57)^*n*i;
+    return(result);
+}
+
+
+/* Function: hash_create_table
+ * Arguments: compare_function: a function to compare
+ *                              a table instance with a correlator
+ *            key_function: a function to generate a 32 bit 
+ *                          hash key from a correlator
+ * Returns: a pointer to the new table
+ */
+table hash_create_table (int (*compare_function)(void *, void *),
+                    unsigned int (*key_function)(unsigned int *))
+{
+    table new=(table)malloc(sizeof(struct table));
+    memset(new, 0, sizeof(struct table));
+
+    new->compare_function=compare_function;
+    new->key_function=key_function;
+    new->number_of_entries=0;
+    new->size=4;
+    new->entries=(table_entry *)malloc(sizeof(table_entry)*new->size);
+    memset(new->entries,0,sizeof(table_entry)*new->size);
+    return(new);
+}
+
+
+/* Function: hash_table_find
+ * Arguments: t: a table to look in
+ *            comparator: a value to access the table entry
+ * Returns: the element referred to by comparator, or null
+ */
+void *hash_table_find (table t, void *comparator)
+{
+    int success;
+    table_entry* entry=table_lookup(t,comparator,
+                                    (*t->key_function)(comparator),
+                                    t->compare_function,
+                                    &success);
+    if (success)  return((*entry)->value);
+    return(0);
+}
+
+
+/* Function: hash_table_insert
+ * Arguments: t: a table to insert the object
+ *            value: the object to put in the table
+ *            comparator: the value by which the object 
+ *                        will be addressed
+ * Returns: nothing
+ */
+void hash_table_insert (table t, void *value, void *comparator)
+{
+    int success;
+    unsigned int k=(*t->key_function)(comparator);
+    table_entry *position=table_lookup(t,comparator,k,
+                                       t->compare_function,&success);
+    table_entry entry;
+
+    if (success) {
+        entry = *position;
+    } else {
+        entry = (table_entry)malloc(sizeof(struct table_entry));
+        memset(entry, 0, sizeof(struct table_entry));
+        entry->next= *position;
+        *position=entry;
+        t->number_of_entries++;
+    }
+    entry->value=value;
+    entry->key=k;
+    if (t->number_of_entries > t->size) resize_table(t,t->size*2);
+}
+
+/* Function: hash_table_remove
+ * Arguments: t: the table to remove the object from
+ *            comparator: the index value of the object to remove
+ * Returns: nothing
+ */
+void hash_table_remove (table t, void *comparator)
+{
+    int success;
+    table_entry temp;
+    table_entry *position=table_lookup(t,comparator,
+                                       (*t->key_function)(comparator),
+                                       t->compare_function,&success);
+    if(success) {
+        temp=*position;
+        *position=(*position)->next;
+        free(temp); /* the value? */
+        t->number_of_entries--;
+    }
+}
+
+/* Function: hash_iterate_table_entries
+ * Arguments: t: the table to iterate over
+ *            handler: a function to call with each element
+ *                     of the table, along with arg
+ *            arg: the opaque object to pass to handler
+ * Returns: nothing
+ */
+void hash_iterate_table_entries(table t,
+                           void (*handler)(void *,void *), 
+                           void *arg)
+{
+    int i;
+    table_entry *j,*next;
+  
+    for (i=0;i<t->size;i++)
+        for (j=t->entries+i;*j;j=next){
+            next=&((*j)->next);
+            (*handler)(arg,(*j)->value);
+        }
+}
+
+/* Function: hash_filter_table_entries
+ * Arguments: t: the table to iterate over
+ *            handler: a function to call with each element
+ *                     of the table, along with arg
+ *            arg: the opaque object to pass to handler
+ * Returns: nothing
+ * Notes: operations on the table inside handler are not safe
+ *
+ * hash_filter_table_entries() calls the handler function for each
+ *   item in the table, passing it and arg. The handler function
+ *   returns 1 if the item is to be retained in the table, and 0
+ *   if it is to be removed.
+ */
+void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg)
+{
+    int i;
+    table_entry *j,*next,v;
+  
+    for (i=0;i<t->size;i++)
+        for (j=t->entries+i;*j;j=next){
+            next=&((*j)->next);
+            if (!(*handler)(arg,(*j)->value)){
+                next=j;
+                v=*j;
+                *j=(*j)->next;
+                free(v);
+                t->number_of_entries--;
+            }
+        }
+}
+
+/* Function: hash_destroy_table
+ * Arguments: t: the table to free
+ *            thunk: a function to call with each element,
+ *                   most likely free()
+ * Returns: nothing
+ */
+void hash_destroy_table(table t,void (*thunk)(void *))
+{
+    table_entry j,next;
+    int i;
+    for (i=0;i<t->size;i++)
+        for (j=t->entries[i];j;j=next){
+            next=j->next;
+            if (thunk) (*thunk)(j->value);
+            free(j);
+        }
+    free(t->entries);
+    free(t);
+}
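
A brief usage sketch of the hash table (not part of the commit). Values must carry whatever the compare function needs, because lookups compare the stored value against the caller's comparator:

#include <stdio.h>
#include <string.h>
#include <table.h>

struct example_entry {
        char *name;
        int   value;
};

static int compare_by_name(void *value, void *comparator)
{
        return strcmp(((struct example_entry *)value)->name,
                      (char *)comparator) == 0;
}

int main(void)
{
        struct example_entry e = { "answer", 42 };
        struct example_entry *found;
        table t = hash_create_table(compare_by_name,
                                    (unsigned int (*)(unsigned int *))key_from_string);

        hash_table_insert(t, &e, e.name);
        found = hash_table_find(t, "answer");
        printf("answer = %d\n", found ? found->value : -1);
        hash_destroy_table(t, 0);        /* entries live on the stack, nothing to free */
        return 0;
}
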
diff --git a/lustre/portals/unals/table.h b/lustre/portals/unals/table.h
new file mode 100644 (file)
index 0000000..7fab586
--- /dev/null
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+#ifndef E_TABLE
+#define E_TABLE
+
+typedef struct table_entry {
+  unsigned int key;
+  void *value;
+  struct table_entry *next;
+} *table_entry;
+
+
+typedef struct table {
+  unsigned int size;
+  int number_of_entries;
+  table_entry *entries;
+  int (*compare_function)(void *, void *);
+  unsigned int (*key_function)(unsigned int *);
+} *table;
+
+/* table.c */
+unsigned int key_from_int(int i);
+unsigned int key_from_string(char *s);
+table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *));
+void *hash_table_find(table t, void *comparator);
+void hash_table_insert(table t, void *value, void *comparator);
+void hash_table_remove(table t, void *comparator);
+void hash_iterate_table_entries(table t, void (*handler)(void *, void *), void *arg);
+void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg);
+void hash_destroy_table(table t, void (*thunk)(void *));
+
+#endif
diff --git a/lustre/portals/unals/tcpnal.c b/lustre/portals/unals/tcpnal.c
new file mode 100644 (file)
index 0000000..534fc17
--- /dev/null
@@ -0,0 +1,198 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2.1 of the GNU Lesser General
+ *   Public License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public
+ *   License along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* tcpnal.c:
+   This file implements the TCP-based nal by providing glue
+   between the connection service and the generic NAL implementation */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <bridge.h>
+#include <ipmap.h>
+#include <connection.h>
+
+/* Function:  tcpnal_send
+ * Arguments: nal:     pointer to my nal control block
+ *            private: unused
+ *            cookie:  passed back to the portals library
+ *            hdr:     pointer to the portals header
+ *            nid:     destination node
+ *            pid:     destination process
+ *            data:    body of the message
+ *            len:     length of the body
+ * Returns: zero on success
+ *
+ * sends a packet to the peer, after ensuring that a connection exists
+ */
+#warning FIXME: "param 'type' is newly added, make use of it!!"
+int tcpnal_send(nal_cb_t *n,
+               void *private,
+               lib_msg_t *cookie,
+               ptl_hdr_t *hdr,
+               int type,
+               ptl_nid_t nid,
+               ptl_pid_t pid,
+                unsigned int niov,
+                struct iovec *iov,
+               size_t len)
+{
+    connection c;
+    bridge b=(bridge)n->nal_data;
+    struct iovec tiov[2];
+    int count = 1;
+
+    if (!(c=force_tcp_connection((manager)b->lower,
+                                 PNAL_IP(nid,b),
+                                 PNAL_PORT(nid,pid)))) 
+        return(1);
+
+#if 0
+    /* TODO: these results should be checked. furthermore, provision
+       must be made for the SIGPIPE which is delivered when
+       writing on a tcp socket which has closed underneath
+       the application. there is a linux flag in the sendmsg
+       call which turns off the signalling behaviour, but it's
+       nonstandard */
+    syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t));
+    LASSERT (niov <= 1);
+    if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len);
+#else
+    LASSERT (niov <= 1);
+
+    tiov[0].iov_base = hdr;
+    tiov[0].iov_len = sizeof(ptl_hdr_t);
+
+    if (len) {
+            tiov[1].iov_base = iov[0].iov_base;
+            tiov[1].iov_len = len;
+            count++;
+    }
+
+    syscall(SYS_writev, c->fd, tiov, count);
+#endif
+    lib_finalize(n, private, cookie);
+        
+    return(0);
+}
+
+
+/* Function:  tcpnal_recv
+ * Arguments: nal_cb_t *nal:     pointer to my nal control block
+ *            void *private:     connection pointer passed through
+ *                               lib_parse()
+ *            lib_msg_t *cookie: passed back to portals library
+ *            user_ptr data:     pointer to the destination buffer
+ *            size_t mlen:       length of the body
+ *            size_t rlen:       length of data in the network
+ * Returns: zero on success
+ *
+ * blocking read of the requested data. must drain out the
+ * difference of manipulated and requested lengths from the network
+ */
+int tcpnal_recv(nal_cb_t *n,
+               void *private,
+               lib_msg_t *cookie,
+                unsigned int niov,
+                struct iovec *iov,
+               ptl_size_t mlen,
+               ptl_size_t rlen)
+
+{
+    if (mlen) {
+        LASSERT (niov <= 1);
+        read_connection(private,iov[0].iov_base,mlen);
+        lib_finalize(n, private, cookie);
+    }
+
+    if (mlen!=rlen){
+        char *trash=malloc(rlen-mlen);
+        
+        /*TODO: check error status*/
+        read_connection(private,trash,rlen-mlen);
+        free(trash);
+    }
+
+    return(rlen);
+}
+
+
+/* Function:  from_connection: 
+ * Arguments: c: the connection to read from 
+ * Returns: whether or not to continue reading from this connection,
+ *          expressed as a 1 to continue, and a 0 to not
+ *
+ *  from_connection() is called from the select loop when i/o is 
+ *  available. It attempts to read the portals header and 
+ *  pass it to the generic library for processing.
+ */
+static int from_connection(void *a, void *d)
+{
+        connection c = d;
+        bridge b=a;
+        ptl_hdr_t hdr;
+
+        if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){
+                lib_parse(b->nal_cb, &hdr, c);
+                return(1);
+        }
+        return(0);
+}
+
+
+static void tcpnal_shutdown(bridge b)
+{
+    shutdown_connections(b->lower);
+}
+
+/* Function:  tcpnal_init
+ * Arguments: b: the bridge to attach the tcp nal to; its nal_cb
+ *               send/recv callbacks and shutdown hook are filled in here
+ * Returns: PTL_OK on success, or PTL_NAL_FAILED if the connection
+ *          manager could not be started
+ */
+int tcpnal_init(bridge b)
+{
+    manager m;
+        
+    b->nal_cb->cb_send=tcpnal_send;
+    b->nal_cb->cb_recv=tcpnal_recv;
+    b->shutdown=tcpnal_shutdown;
+    
+    if (!(m=init_connections(PNAL_PORT(b->nal_cb->ni.nid,
+                                       b->nal_cb->ni.pid),
+                             from_connection,b))){
+        /* TODO: this needs to shut down the
+           newly created junk */
+        return(PTL_NAL_FAILED);
+    }
+    /* XXX cfs hack */
+    b->nal_cb->ni.pid=0;
+    b->lower=m;
+    return(PTL_OK);
+}
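
To make the data path concrete (not part of the commit), the wire format tcpnal_send() produces and the receive path consumes is just a portals header followed by the payload:

/*     +------------------+---------------------------+
 *     |  ptl_hdr_t hdr   |  payload  (len bytes)      |
 *     +------------------+---------------------------+
 *
 * tcpnal_send() pushes both pieces with a single writev() so they leave
 * the socket back to back; from_connection() reads the fixed-size header,
 * hands it to lib_parse(), and the library calls back into tcpnal_recv()
 * to pull (or discard) the body from the same connection. */
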
diff --git a/lustre/portals/unals/timer.h b/lustre/portals/unals/timer.h
new file mode 100644 (file)
index 0000000..aaf39d2
--- /dev/null
@@ -0,0 +1,30 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *  Copyright (c) 2002 Eric Hoffman
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* TODO: make this an explicit type when they become available */
+typedef unsigned long long when;
+
+typedef struct timer {
+  void (*function)(void *);
+  void *arg;
+  when w;
+  int interval;
+  int disable;
+} *timer;
+
+timer register_timer(when, void (*f)(void *), void *a);
+void remove_timer(timer t);
+void timer_loop(void);
+void initialize_timer(void (*block)(when));
+void register_thunk(void (*f)(void *),void *a);
+
+
+#define HZ 0x100000000ull
+
+
diff --git a/lustre/portals/unals/utypes.h b/lustre/portals/unals/utypes.h
new file mode 100644 (file)
index 0000000..7eca959
--- /dev/null
@@ -0,0 +1,12 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002 Cray Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+typedef unsigned short uint16;
+typedef unsigned long uint32;
+typedef unsigned long long uint64;
+typedef unsigned char uint8;
diff --git a/lustre/portals/utils/.cvsignore b/lustre/portals/utils/.cvsignore
new file mode 100644 (file)
index 0000000..148310a
--- /dev/null
@@ -0,0 +1,8 @@
+Makefile
+Makefile.in
+acceptor
+debugctl
+ptlctl
+.deps
+routerstat
+wirecheck
\ No newline at end of file
diff --git a/lustre/portals/utils/Makefile.am b/lustre/portals/utils/Makefile.am
new file mode 100644 (file)
index 0000000..05af598
--- /dev/null
@@ -0,0 +1,27 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+
+COMPILE = gcc -Wall -g -I$(srcdir)/../include 
+LINK = gcc -o $@
+
+sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck
+lib_LIBRARIES = libptlctl.a
+
+acceptor_SOURCES = acceptor.c # -lefence
+
+wirecheck_SOURCES = wirecheck.c
+
+libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h
+
+ptlctl_SOURCES = ptlctl.c
+ptlctl_LDADD =  -L. -lptlctl -lncurses # -lefence
+ptlctl_DEPENDENCIES = libptlctl.a
+
+debugctl_SOURCES = debugctl.c
+debugctl_LDADD = -L. -lptlctl -lncurses # -lefence
+debugctl_DEPENDENCIES = libptlctl.a
+
+routerstat_SOURCES = routerstat.c
diff --git a/lustre/portals/utils/acceptor.c b/lustre/portals/utils/acceptor.c
new file mode 100644 (file)
index 0000000..c6590db
--- /dev/null
@@ -0,0 +1,466 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <asm/byteorder.h>
+#include <syslog.h>
+
+#include <errno.h>
+
+#include <portals/api-support.h>
+#include <portals/list.h>
+#include <portals/lib-types.h>
+
+/* should get this from autoconf somehow */
+#ifndef PIDFILE_DIR
+#define PIDFILE_DIR "/var/run"
+#endif 
+
+#define PROGNAME "acceptor"
+
+void create_pidfile(char *name, int port)
+{
+        char pidfile[1024];
+        FILE *fp;
+
+        snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", 
+                 PIDFILE_DIR, name, port);
+        
+        if ((fp = fopen(pidfile, "w"))) {
+                fprintf(fp, "%d\n", getpid());
+                fclose(fp);
+        } else {
+                syslog(LOG_ERR, "%s: %s\n", pidfile, 
+                       strerror(errno));
+        }
+}
+
+int pidfile_exists(char *name, int port)
+{
+        char pidfile[1024];
+
+        snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", 
+                 PIDFILE_DIR, name, port);
+        
+        if (!access(pidfile, F_OK)) {
+                fprintf(stderr, "%s: exists, acceptor already running.\n", 
+                        pidfile);
+                return (1);
+        } 
+        return (0);
+}
+
+int
+parse_size (int *sizep, char *str)
+{
+        int             size;
+        char            mod[32];
+
+        switch (sscanf (str, "%d%1[gGmMkK]", &size, mod))
+        {
+        default:
+                return (-1);
+
+        case 1:
+                *sizep = size;
+                return (0);
+
+        case 2:
+                switch (*mod)
+                {
+                case 'g':
+                case 'G':
+                        *sizep = size << 30;
+                        return (0);
+
+                case 'm':
+                case 'M':
+                        *sizep = size << 20;
+                        return (0);
+
+                case 'k':
+                case 'K':
+                        *sizep = size << 10;
+                        return (0);
+
+                default:
+                        *sizep = size;
+                        return (0);
+                }
+        }
+}
+
+void
+show_connection (int fd, __u32 net_ip, ptl_nid_t nid)
+{
+        struct hostent *h = gethostbyaddr ((char *)&net_ip, sizeof net_ip, AF_INET);
+        __u32 host_ip = ntohl (net_ip);
+        int  rxmem = 0;
+        int  txmem = 0;
+        int  nonagle = 0;
+        int  len;
+        char host[1024];
+        
+        len = sizeof (txmem);
+        if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &len) != 0)
+                perror ("Cannot get write buffer size");
+        
+        len = sizeof (rxmem);
+        if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &len) != 0)
+                perror ("Cannot get read buffer size");
+        
+        len = sizeof (nonagle);
+        if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &len) != 0)
+                perror ("Cannot get nagle");
+
+        if (h == NULL)
+                snprintf (host, sizeof(host), "%d.%d.%d.%d", (host_ip >> 24) & 0xff,
+                                    (host_ip >> 16) & 0xff, (host_ip >> 8) & 0xff, host_ip & 0xff);
+        else
+                snprintf (host, sizeof(host), "%s", h->h_name);
+                
+        syslog (LOG_INFO, "Accepted host: %s NID: "LPX64" snd: %d rcv %d nagle: %s\n", 
+                 host, nid, txmem, rxmem, nonagle ? "disabled" : "enabled");
+}
+
+int
+sock_write (int cfd, void *buffer, int nob)
+{
+        while (nob > 0)
+        {
+                int rc = write (cfd, buffer, nob);
+
+                if (rc < 0)
+                {
+                        if (errno == EINTR)
+                                continue;
+                        
+                        return (rc);
+                }
+
+                if (rc == 0)
+                {
+                        fprintf (stderr, "Unexpected zero sock_write\n");
+                        abort();
+                }
+
+                nob -= rc;
+                buffer = (char *)buffer + rc;
+        }
+        
+        return (0);
+}
+
+int
+sock_read (int cfd, void *buffer, int nob)
+{
+        while (nob > 0)
+        {
+                int rc = read (cfd, buffer, nob);
+                
+                if (rc < 0)
+                {
+                        if (errno == EINTR)
+                                continue;
+                        
+                        return (rc);
+                }
+                
+                if (rc == 0)                    /* EOF */
+                {
+                        errno = ECONNABORTED;
+                        return (-1);
+                }
+                
+                nob -= rc;
+                buffer = (char *)buffer + rc;
+        }
+        
+        return (0);
+}
+
+int
+exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid)
+{
+        int                      rc;
+        ptl_hdr_t                hdr;
+        ptl_magicversion_t      *hmv = (ptl_magicversion_t *)&hdr.dest_nid;
+
+        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
+
+        memset (&hdr, 0, sizeof (hdr));
+        
+        hmv->magic          = __cpu_to_le32 (PORTALS_PROTO_MAGIC);
+        hmv->version_major  = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR);
+        hmv->version_minor  = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR);
+
+        hdr.src_nid = __cpu_to_le64 (my_nid);
+        hdr.type = __cpu_to_le32 (PTL_MSG_HELLO);
+        
+        /* Assume there's sufficient socket buffering for a portals HELLO header */
+        rc = sock_write (cfd, &hdr, sizeof (hdr));
+        if (rc != 0) {
+                perror ("Can't send initial HELLO");
+                return (-1);
+        }
+
+        /* First few bytes down the wire are the portals protocol magic and
+         * version, no matter what protocol version we're running. */
+
+        rc = sock_read (cfd, hmv, sizeof (*hmv));
+        if (rc != 0) {
+                perror ("Can't read from peer");
+                return (-1);
+        }
+
+        if (__cpu_to_le32 (hmv->magic) != PORTALS_PROTO_MAGIC) {
+                fprintf (stderr, "Bad magic %#08x (%#08x expected)\n", 
+                         __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC);
+                return (-1);
+        }
+
+        if (__cpu_to_le16 (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR ||
+            __cpu_to_le16 (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) {
+                fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n",
+                         __cpu_to_le16 (hmv->version_major),
+                         __cpu_to_le16 (hmv->version_minor),
+                         PORTALS_PROTO_VERSION_MAJOR,
+                         PORTALS_PROTO_VERSION_MINOR);
+        }
+
+        /* version 0 sends magic/version as the dest_nid of a 'hello' header,
+         * so read the rest of it in now... */
+        LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0);
+        rc = sock_read (cfd, hmv + 1, sizeof (hdr) - sizeof (*hmv));
+        if (rc != 0) {
+                perror ("Can't read rest of HELLO hdr");
+                return (-1);
+        }
+
+        /* ...and check we got what we expected */
+        if (__cpu_to_le32 (hdr.type) != PTL_MSG_HELLO ||
+            __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)) != 0) {
+                fprintf (stderr, "Expecting a HELLO hdr with 0 payload,"
+                         " but got type %d with %d payload\n",
+                         __cpu_to_le32 (hdr.type),
+                         __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)));
+                return (-1);
+        }
+        
+        *peer_nid = __le64_to_cpu (hdr.src_nid);
+        return (0);
+}
+
+void
+usage (char *myname)
+{
+        fprintf (stderr, "Usage: %s [-r recv_mem] [-s send_mem] [-n] [-N nal_id] port\n", myname);
+        exit (1);
+}
+
+int main(int argc, char **argv)
+{
+        int o, fd, rc, port, pfd;
+        struct sockaddr_in srvaddr;
+        int c;
+        int rxmem = 0;
+        int txmem = 0;
+        int noclose = 0;
+        int nonagle = 1;
+        int nal = SOCKNAL;
+        int xchg_nids = 0;
+        int bind_irq = 0;
+        
+        while ((c = getopt (argc, argv, "N:r:s:nlxi")) != -1)
+                switch (c)
+                {
+                case 'r':
+                        if (parse_size (&rxmem, optarg) != 0 || rxmem < 0)
+                                usage (argv[0]);
+                        break;
+                        
+                case 's':
+                        if (parse_size (&txmem, optarg) != 0 || txmem < 0)
+                                usage (argv[0]);
+                        break;
+
+                case 'n':
+                        nonagle = 0;
+                        break;
+
+                case 'l':
+                        noclose = 1;
+                        break;
+
+                case 'x':
+                        xchg_nids = 1;
+                        break;
+
+                case 'i':
+                        bind_irq = 1;
+                        break;
+                        
+                case 'N':
+                        if (parse_size(&nal, optarg) != 0 || 
+                            nal < 0 || nal > NAL_MAX_NR)
+                                usage(argv[0]);
+                        break;
+                        
+                default:
+                        usage (argv[0]);
+                        break;
+                }
+
+        if (optind >= argc)
+                usage (argv[0]);
+
+        port = atol(argv[optind++]);
+
+        if (pidfile_exists(PROGNAME, port))
+                exit(1);
+
+        memset(&srvaddr, 0, sizeof(srvaddr));
+        srvaddr.sin_family = AF_INET;
+        srvaddr.sin_port = htons(port);
+        srvaddr.sin_addr.s_addr = INADDR_ANY;
+
+        fd = socket(PF_INET, SOCK_STREAM, 0);
+        if (fd < 0) {
+                perror("opening socket");
+                exit(1);
+        }
+
+        o = 1;
+        if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &o, sizeof(o))) {
+                perror("Cannot set REUSEADDR socket opt");
+                exit(1);
+        }
+
+        if (nonagle)
+        {
+                o = 1;
+                rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o));
+                if (rc != 0) 
+                { 
+                        perror ("Cannot disable nagle");
+                        exit (1);
+                }
+        }
+
+        if (txmem != 0)
+        {
+                rc = setsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, sizeof (txmem));
+                if (rc != 0)
+                {
+                        perror ("Cannot set write buffer size");
+                        exit (1);
+                }
+        }
+        
+        if (rxmem != 0)
+        {
+                rc = setsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, sizeof (rxmem));
+                if (rc != 0)
+                {
+                        perror ("Cannot set read buffer size");
+                        exit (1);
+               }
+        }
+                
+        rc = bind(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
+        if ( rc == -1 ) {
+                perror("bind: ");
+                exit(1);
+        }
+
+        if (listen(fd, 127)) {
+                perror("listen: ");
+                exit(1);
+        }
+        fprintf(stderr, "listening on port %d\n", port);
+
+        pfd = open("/dev/portals", O_RDWR);
+        if ( pfd < 0 ) {
+                perror("opening portals device");
+                exit(1);
+        }
+
+        rc = daemon(1, noclose);
+        if (rc < 0) {
+                perror("daemon(): ");
+                exit(1);
+        }
+
+        openlog(PROGNAME, LOG_PID, LOG_DAEMON);
+        syslog(LOG_INFO, "started, listening on port %d\n", port);
+        create_pidfile(PROGNAME, port);
+
+        while (1) {
+                struct sockaddr_in clntaddr;
+                int len = sizeof(clntaddr);
+                int cfd;
+                struct portal_ioctl_data data;
+                ptl_nid_t peer_nid;
+                
+                cfd = accept(fd, (struct sockaddr *)&clntaddr, &len);
+                if ( cfd < 0 ) {
+                        perror("accept");
+                        exit(1);
+                }
+
+                if (!xchg_nids)
+                        peer_nid = ntohl (clntaddr.sin_addr.s_addr); /* HOST byte order */
+                else
+                {
+                        PORTAL_IOC_INIT (data);
+                        data.ioc_nal = nal;
+                        rc = ioctl (pfd, IOC_PORTAL_GET_NID, &data);
+                        if (rc < 0)
+                        {
+                                perror ("Can't get my NID");
+                                close (cfd);
+                                continue;
+                        }
+                        
+                        rc = exchange_nids (cfd, data.ioc_nid, &peer_nid);
+                        if (rc != 0)
+                        {
+                                close (cfd);
+                                continue;
+                        }
+                }
+
+                show_connection (cfd, clntaddr.sin_addr.s_addr, peer_nid);
+                
+                PORTAL_IOC_INIT(data);
+                data.ioc_fd = cfd;
+                data.ioc_nal = nal;
+                data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD;
+                data.ioc_nid = peer_nid;
+                data.ioc_flags = bind_irq;
+                
+                if (ioctl(pfd, IOC_PORTAL_NAL_CMD, &data) < 0) {
+                        perror("ioctl failed");
+
+                } else {
+                        printf("client registered\n");
+                }
+                rc = close(cfd);
+                if (rc)
+                        perror ("close failed");
+        }
+
+        closelog();
+        exit(0);
+
+}
diff --git a/lustre/portals/utils/debug.c b/lustre/portals/utils/debug.c
new file mode 100644 (file)
index 0000000..9ab1c73
--- /dev/null
@@ -0,0 +1,618 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Some day I'll split all of this functionality into a cfs_debug module
+ * of its own.  That day is not today.
+ *
+ */
+
+#include <stdio.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <time.h>
+#include <syscall.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#define BUG()                            /* workaround for module.h includes */
+#include <linux/version.h>
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#include <linux/module.h>
+#endif
+
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+#include "parser.h"
+
+static char rawbuf[8192];
+static char *buf = rawbuf;
+static int max = 8192;
+//static int g_pfd = -1;
+static int subsystem_array[1 << 8];
+static int debug_mask = ~0;
+
+static const char *portal_debug_subsystems[] =
+        {"undefined", "mdc", "mds", "osc", "ost", "class", "obdfs", "llite",
+         "rpc", "ext2obd", "portals", "socknal", "qswnal", "pinger", "filter",
+         "obdtrace", "echo", "ldlm", "lov", "gmnal", "router", "ptldb", NULL};
+static const char *portal_debug_masks[] =
+        {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl",
+         "blocks", "net", "warning", "buffs", "other", "dentry", "portals",
+         "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace", NULL};
+
+struct debug_daemon_cmd {
+        char *cmd;
+        unsigned int cmdv;
+};
+
+static const struct debug_daemon_cmd portal_debug_daemon_cmd[] = {
+        {"start", DEBUG_DAEMON_START},
+        {"stop", DEBUG_DAEMON_STOP},
+        {"pause", DEBUG_DAEMON_PAUSE},
+        {"continue", DEBUG_DAEMON_CONTINUE},
+        {0, 0}
+};
+
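+/* Enable or disable a named subsystem and/or debug type.  The name is
+ * matched case-insensitively against the two tables above; "all_subs" and
+ * "all_types" select every entry.  Returns nonzero if anything matched. */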
+static int do_debug_mask(char *name, int enable)
+{
+        int found = 0, i;
+
+        for (i = 0; portal_debug_subsystems[i] != NULL; i++) {
+                if (strcasecmp(name, portal_debug_subsystems[i]) == 0 ||
+                    strcasecmp(name, "all_subs") == 0) {
+                        printf("%s output from subsystem \"%s\"\n",
+                                enable ? "Enabling" : "Disabling",
+                                portal_debug_subsystems[i]);
+                        subsystem_array[i] = enable;
+                        found = 1;
+                }
+        }
+        for (i = 0; portal_debug_masks[i] != NULL; i++) {
+                if (strcasecmp(name, portal_debug_masks[i]) == 0 ||
+                    strcasecmp(name, "all_types") == 0) {
+                        printf("%s output of type \"%s\"\n",
+                                enable ? "Enabling" : "Disabling",
+                                portal_debug_masks[i]);
+                        if (enable)
+                                debug_mask |= (1 << i);
+                        else
+                                debug_mask &= ~(1 << i);
+                        found = 1;
+                }
+        }
+
+        return found;
+}
+
+int dbg_initialize(int argc, char **argv)
+{
+        memset(subsystem_array, 1, sizeof(subsystem_array));
+        return 0;
+}
+
+int jt_dbg_filter(int argc, char **argv)
+{
+        int   i;
+
+        if (argc < 2) {
+                fprintf(stderr, "usage: %s <subsystem ID or debug mask>\n",
+                        argv[0]);
+                return 0;
+        }
+
+        for (i = 1; i < argc; i++)
+                if (!do_debug_mask(argv[i], 0))
+                        fprintf(stderr, "Unknown subsystem or debug type: %s\n",
+                                argv[i]);
+        return 0;
+}
+
+int jt_dbg_show(int argc, char **argv)
+{
+        int    i;
+
+        if (argc < 2) {
+                fprintf(stderr, "usage: %s <subsystem ID or debug mask>\n",
+                        argv[0]);
+                return 0;
+        }
+
+        for (i = 1; i < argc; i++)
+                if (!do_debug_mask(argv[i], 1))
+                        fprintf(stderr, "Unknown subsystem or debug type: %s\n",
+                                argv[i]);
+
+        return 0;
+}
+
+static int applymask(char* procpath, int value)
+{
+        int rc;
+        char buf[64];
+        int len = snprintf(buf, 64, "%d", value);
+
+        int fd = open(procpath, O_WRONLY);
+        if (fd == -1) {
+                fprintf(stderr, "Unable to open %s: %s\n",
+                        procpath, strerror(errno));
+                return fd;
+        }
+        rc = write(fd, buf, len+1);
+        if (rc < 0) {
+                fprintf(stderr, "Write to %s failed: %s\n",
+                        procpath, strerror(errno));
+                close(fd);
+                return rc;
+        }
+        close(fd);
+        return 0;
+}
+
+extern char *dump_filename;
+extern int dump(int dev_id, int opc, void *buf);
+
+static void applymask_all(unsigned int subs_mask, unsigned int debug_mask)
+{
+        if (!dump_filename) {
+                applymask("/proc/sys/portals/subsystem_debug", subs_mask);
+                applymask("/proc/sys/portals/debug", debug_mask);
+        } else {
+                struct portals_debug_ioctl_data data;
+
+                data.hdr.ioc_len = sizeof(data);
+                data.hdr.ioc_version = 0;
+                data.subs = subs_mask;
+                data.debug = debug_mask;
+
+                dump(OBD_DEV_ID, PTL_IOC_DEBUG_MASK, &data);
+        }
+        printf("Applied subsystem_debug=%d, debug=%d to /proc/sys/portals\n",
+               subs_mask, debug_mask);
+}
+
+int jt_dbg_list(int argc, char **argv)
+{
+        int i;
+
+        if (argc != 2) {
+                fprintf(stderr, "usage: %s <subs || types>\n", argv[0]);
+                return 0;
+        }
+
+        if (strcasecmp(argv[1], "subs") == 0) {
+                printf("Subsystems: all_subs");
+                for (i = 0; portal_debug_subsystems[i] != NULL; i++)
+                        printf(", %s", portal_debug_subsystems[i]);
+                printf("\n");
+        } else if (strcasecmp(argv[1], "types") == 0) {
+                printf("Types: all_types");
+                for (i = 0; portal_debug_masks[i] != NULL; i++)
+                        printf(", %s", portal_debug_masks[i]);
+                printf("\n");
+        }
+        else if (strcasecmp(argv[1], "applymasks") == 0) {
+                unsigned int subsystem_mask = 0;
+                for (i = 0; portal_debug_subsystems[i] != NULL; i++) {
+                        if (subsystem_array[i]) subsystem_mask |= (1 << i);
+                }
+                applymask_all(subsystem_mask, debug_mask);
+        }
+        return 0;
+}
+
+/* if 'raw' is true, don't strip the debug information from the front of the
+ * lines */
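+/* Each line is expected to start with the subsystem number and the debug
+ * type mask as hex fields, each followed by a single separator character,
+ * with the message text after that (see the strtoul() calls below). */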
+static void dump_buffer(FILE *fd, char *buf, int size, int raw)
+{
+        char *p, *z;
+        unsigned long subsystem, debug, dropped = 0, kept = 0;
+        int max_sub, max_type;
+
+        for (max_sub = 0; portal_debug_subsystems[max_sub] != NULL; max_sub++)
+                ;
+        for (max_type = 0; portal_debug_masks[max_type] != NULL; max_type++)
+                ;
+
+        while (size) {
+                p = memchr(buf, '\n', size);
+                if (!p)
+                        break;
+                subsystem = strtoul(buf, &z, 16);
+                debug = strtoul(z + 1, &z, 16);
+
+                z++;
+                /* for some reason %*s isn't working. */
+                *p = '\0';
+                if (subsystem < max_sub &&
+                    subsystem_array[subsystem] &&
+                    (!debug || (debug_mask & debug))) {
+                        if (raw)
+                                fprintf(fd, "%s\n", buf);
+                        else
+                                fprintf(fd, "%s\n", z);
+                        //printf("%s\n", buf);
+                        kept++;
+                } else {
+                        //fprintf(stderr, "dropping line (%lx:%lx): %s\n", subsystem, debug, buf);
+                        dropped++;
+                }
+                *p = '\n';
+                p++;
+                size -= (p - buf);
+                buf = p;
+        }
+
+        printf("Debug log: %lu lines, %lu kept, %lu dropped.\n",
+                dropped + kept, kept, dropped);
+}
+
+int jt_dbg_debug_kernel(int argc, char **argv)
+{
+        int rc, raw = 1;
+        FILE *fd = stdout;
+        const int databuf_size = (6 << 20);
+        struct portal_ioctl_data data, *newdata;
+        char *databuf = NULL;
+
+        if (argc > 3) {
+                fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]);
+                return 0;
+        }
+
+        if (argc > 1) {
+                fd = fopen(argv[1], "w");
+                if (fd == NULL) {
+                        fprintf(stderr, "fopen(%s) failed: %s\n", argv[1],
+                                strerror(errno));
+                        return -1;
+                }
+        }
+        if (argc > 2)
+                raw = atoi(argv[2]);
+
+        databuf = malloc(databuf_size);
+        if (!databuf) {
+                fprintf(stderr, "No memory for buffer.\n");
+                goto out;
+        }
+
+        memset(&data, 0, sizeof(data));
+        data.ioc_plen1 = databuf_size;
+        data.ioc_pbuf1 = databuf;
+
+        if (portal_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                goto out;
+        }
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_DEBUG, buf);
+        if (rc) {
+                fprintf(stderr, "IOC_PORTAL_GET_DEBUG failed: %s\n",
+                        strerror(errno));
+                goto out;
+        }
+
+        newdata = (struct portal_ioctl_data *)buf;
+        if (newdata->ioc_size > 0)
+                dump_buffer(fd, databuf, newdata->ioc_size, raw);
+
+ out:
+        if (databuf)
+                free(databuf);
+        if (fd != stdout)
+                fclose(fd);
+        return 0;
+}
+
+int jt_dbg_debug_daemon(int argc, char **argv)
+{
+        int i, rc;
+        unsigned int cmd = 0;
+        FILE *fd = stdout;
+        struct portal_ioctl_data data;
+
+        if (argc <= 1) {
+                fprintf(stderr, "usage: %s [start file <#MB>|stop|pause|"
+                        "continue]\n", argv[0]);
+                return 0;
+        }
+        for (i = 0; portal_debug_daemon_cmd[i].cmd != NULL; i++) {
+                if (strcasecmp(argv[1], portal_debug_daemon_cmd[i].cmd) == 0) {
+                        cmd = portal_debug_daemon_cmd[i].cmdv;
+                        break;
+                }
+        }
+        if (portal_debug_daemon_cmd[i].cmd == NULL) {
+                fprintf(stderr, "usage: %s [start file <#MB>|stop|pause|"
+                        "continue]\n", argv[0]);
+                return 0;
+        }
+        memset(&data, 0, sizeof(data));
+        if (cmd == DEBUG_DAEMON_START) {
+                if (argc < 3) {
+                        fprintf(stderr, "usage: %s [start file <#MB>|stop|"
+                                "pause|continue]\n", argv[0]);
+                        return 0;
+                }
+                if (access(argv[2], F_OK) != 0) {
+                        fd = fopen(argv[2], "w");
+                        if (fd != NULL) {
+                                fclose(fd);
+                                remove(argv[2]);
+                                goto ok;
+                        }
+                }
+                if (access(argv[2], W_OK) == 0)
+                        goto ok;
+                fprintf(stderr, "fopen(%s) failed: %s\n", argv[2],
+                        strerror(errno));
+                return -1;
+ok:
+                data.ioc_inllen1 = strlen(argv[2]) + 1;
+                data.ioc_inlbuf1 = argv[2];
+                data.ioc_misc = 0;
+                if (argc == 4) {
+                        unsigned long size;
+                        errno = 0;
+                        size = strtoul(argv[3], NULL, 0);
+                        if (errno) {
+                                fprintf(stderr, "file size(%s): error %s\n",
+                                        argv[3], strerror(errno));
+                                return -1;
+                        }
+                        data.ioc_misc = size;
+                }
+        }
+        data.ioc_count = cmd;
+        if (portal_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                return -1;
+        }
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_SET_DAEMON, buf);
+        if (rc < 0) {
+                fprintf(stderr, "IOC_PORTAL_SET_DEMON failed: %s\n",
+                                strerror(errno));
+                return rc;
+        }
+        return 0;
+}
+
+int jt_dbg_debug_file(int argc, char **argv)
+{
+        int rc, fd = -1, raw = 1;
+        FILE *output = stdout;
+        char *databuf = NULL;
+        struct stat statbuf;
+
+        if (argc > 4 || argc < 2) {
+                fprintf(stderr, "usage: %s <input> [output] [raw]\n", argv[0]);
+                return 0;
+        }
+
+        fd = open(argv[1], O_RDONLY);
+        if (fd < 0) {
+                fprintf(stderr, "fopen(%s) failed: %s\n", argv[1],
+                        strerror(errno));
+                return -1;
+        }
+#warning FIXME: cleanup fstat issue here
+#ifndef SYS_fstat64
+#define __SYS_fstat__ SYS_fstat
+#else
+#define __SYS_fstat__ SYS_fstat64
+#endif
+        rc = syscall(__SYS_fstat__, fd, &statbuf);
+        if (rc < 0) {
+                fprintf(stderr, "fstat failed: %s\n", strerror(errno));
+                goto out;
+        }
+
+        if (argc >= 3) {
+                output = fopen(argv[2], "w");
+                if (output == NULL) {
+                        fprintf(stderr, "fopen(%s) failed: %s\n", argv[2],
+                                strerror(errno));
+                        goto out;
+                }
+        }
+
+        if (argc == 4)
+                raw = atoi(argv[3]);
+
+        databuf = mmap(NULL, statbuf.st_size, PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE, fd, 0);
+        if (databuf == MAP_FAILED) {
+                fprintf(stderr, "mmap failed: %s\n", strerror(errno));
+                databuf = NULL;
+                goto out;
+        }
+
+        dump_buffer(output, databuf, statbuf.st_size, raw);
+
+ out:
+        if (databuf)
+                munmap(databuf, statbuf.st_size);
+        if (output != stdout)
+                fclose(output);
+        if (fd > 0)
+                close(fd);
+        return 0;
+}
+
+int jt_dbg_clear_debug_buf(int argc, char **argv)
+{
+        int rc;
+        struct portal_ioctl_data data;
+
+        if (argc != 1) {
+                fprintf(stderr, "usage: %s\n", argv[0]);
+                return 0;
+        }
+
+        memset(&data, 0, sizeof(data));
+        if (portal_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                return -1;
+        }
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_CLEAR_DEBUG, buf);
+        if (rc) {
+                fprintf(stderr, "IOC_PORTAL_CLEAR_DEBUG failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+        return 0;
+}
+
+int jt_dbg_mark_debug_buf(int argc, char **argv)
+{
+        int rc;
+        struct portal_ioctl_data data;
+        char *text;
+        time_t now = time(NULL);
+
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [marker text]\n", argv[0]);
+                return 0;
+        }
+
+        if (argc == 2) {
+                text = argv[1];
+        } else {
+                text = ctime(&now);
+                text[strlen(text) - 1] = '\0'; /* stupid \n */
+        }
+
+        memset(&data, 0, sizeof(data));
+        data.ioc_inllen1 = strlen(text) + 1;
+        data.ioc_inlbuf1 = text;
+        if (portal_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                return -1;
+        }
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_MARK_DEBUG, buf);
+        if (rc) {
+                fprintf(stderr, "IOC_PORTAL_MARK_DEBUG failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+        return 0;
+}
+
+
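+/* Print gdb "add-symbol-file" commands giving the load address of each
+ * known module (2.4 kernels only, via query_module); the output is meant
+ * to be pasted into a gdb session. */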
+int jt_dbg_modules(int argc, char **argv)
+{
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        struct mod_paths {
+                char *name, *path;
+        } *mp, mod_paths[] = {
+                {"portals", "lustre/portals/libcfs"},
+                {"ksocknal", "lustre/portals/knals/socknal"},
+                {"obdclass", "lustre/obdclass"},
+                {"ptlrpc", "lustre/ptlrpc"},
+                {"obdext2", "lustre/obdext2"},
+                {"ost", "lustre/ost"},
+                {"osc", "lustre/osc"},
+                {"mds", "lustre/mds"},
+                {"mdc", "lustre/mdc"},
+                {"llite", "lustre/llite"},
+                {"obdecho", "lustre/obdecho"},
+                {"ldlm", "lustre/ldlm"},
+                {"obdfilter", "lustre/obdfilter"},
+                {"extN", "lustre/extN"},
+                {"lov", "lustre/lov"},
+                {"fsfilt_ext3", "lustre/obdclass"},
+                {"fsfilt_extN", "lustre/obdclass"},
+                {"mds_ext2", "lustre/mds"},
+                {"mds_ext3", "lustre/mds"},
+                {"mds_extN", "lustre/mds"},
+                {"ptlbd", "lustre/ptlbd"},
+                {NULL, NULL}
+        };
+        char *path = "..";
+        char *kernel = "linux";
+
+        if (argc >= 2)
+                path = argv[1];
+        if (argc == 3)
+                kernel = argv[2];
+        if (argc > 3) {
+                printf("%s [path] [kernel]\n", argv[0]);
+                return 0;
+        }
+
+        for (mp = mod_paths; mp->name != NULL; mp++) {
+                struct module_info info;
+                int rc;
+                size_t crap;
+                int query_module(const char *name, int which, void *buf,
+                                 size_t bufsize, size_t *ret);
+
+                rc = query_module(mp->name, QM_INFO, &info, sizeof(info),
+                                  &crap);
+                if (rc < 0) {
+                        if (errno != ENOENT)
+                                printf("query_module(%s) failed: %s\n",
+                                       mp->name, strerror(errno));
+                } else {
+                        printf("add-symbol-file %s/%s/%s.o 0x%0lx\n", path,
+                               mp->path, mp->name,
+                               info.addr + sizeof(struct module));
+                }
+        }
+
+        return 0;
+#else
+        printf("jt_dbg_module is not yet implemented for Linux 2.5\n");
+        return 0;
+#endif /* linux 2.5 */
+}
+
+int jt_dbg_panic(int argc, char **argv)
+{
+        int rc;
+        struct portal_ioctl_data data;
+
+        if (argc != 1) {
+                fprintf(stderr, "usage: %s\n", argv[0]);
+                return 0;
+        }
+
+        memset(&data, 0, sizeof(data));
+        if (portal_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "portal_ioctl_pack failed.\n");
+                return -1;
+        }
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PANIC, buf);
+        if (rc) {
+                fprintf(stderr, "IOC_PORTAL_PANIC failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+        return 0;
+}
diff --git a/lustre/portals/utils/debugctl.c b/lustre/portals/utils/debugctl.c
new file mode 100644 (file)
index 0000000..02cb9b4
--- /dev/null
@@ -0,0 +1,66 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Some day I'll split all of this functionality into a cfs_debug module
+ * of its own.  That day is not today.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+#include "parser.h"
+
+
+command_t list[] = {
+        {"debug_kernel", jt_dbg_debug_kernel, 0, "usage: debug_kernel [file] [raw], get debug buffer and print it [to a file]"},
+        {"debug_daemon", jt_dbg_debug_daemon, 0, "usage: debug_daemon [start file [#MB]|stop|pause|continue], control debug daemon to dump debug buffer to a file"}, 
+        {"debug_file", jt_dbg_debug_file, 0, "usage: debug_file <input> [output] [raw], read debug buffer from input and print it [to output]"},
+        {"clear", jt_dbg_clear_debug_buf, 0, "clear kernel debug buffer"},
+        {"mark", jt_dbg_mark_debug_buf, 0, "insert a marker into the kernel debug buffer (args: [marker text])"},
+        {"filter", jt_dbg_filter, 0, "filter certain messages (args: subsystem/debug ID)\n"},
+        {"show", jt_dbg_show, 0, "enable certain messages (args: subsystem/debug ID)\n"},
+        {"list", jt_dbg_list, 0, "list subsystem and debug types (args: subs or types)\n"},
+        {"modules", jt_dbg_modules, 0, "provide gdb-friendly module info (arg: <path>)"},
+        {"panic", jt_dbg_panic, 0, "cause the kernel to panic"},
+        {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"},
+        {"help", Parser_help, 0, "help"},
+        {"exit", Parser_quit, 0, "quit"},
+        {"quit", Parser_quit, 0, "quit"},
+        { 0, 0, 0, NULL }
+};
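+/* debugctl executes a single command given on its command line, e.g.
+ * "debugctl debug_kernel /tmp/debug 1" (the file name here is only an
+ * example); with no arguments it drops into the interactive prompt. */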
+
+int main(int argc, char **argv)
+{
+        if (dbg_initialize(argc, argv) < 0)
+                exit(2);
+
+        register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH);
+
+        Parser_init("debugctl > ", list);
+        if (argc > 1)
+                return Parser_execarg(argc - 1, &argv[1], list);
+
+        Parser_commands();
+
+        unregister_ioc_dev(PORTALS_DEV_ID);
+        return 0;
+}
diff --git a/lustre/portals/utils/l_ioctl.c b/lustre/portals/utils/l_ioctl.c
new file mode 100644 (file)
index 0000000..722bb57
--- /dev/null
@@ -0,0 +1,281 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+
+struct ioc_dev {
+       const char * dev_name;
+       int dev_fd;
+};
+
+static struct ioc_dev ioc_dev_list[10];
+
+struct dump_hdr {
+       int magic;
+       int dev_id;
+       int opc;
+};
+
+char * dump_filename;
+
+static int
+open_ioc_dev(int dev_id) 
+{
+       const char * dev_name;
+
+       if (dev_id < 0 ||
+           dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0]))
+               return -EINVAL;
+
+       dev_name = ioc_dev_list[dev_id].dev_name;
+       if (dev_name == NULL) {
+                fprintf(stderr, "unknown device id: %d\n", dev_id);
+               return -EINVAL;
+       }
+
+       if (ioc_dev_list[dev_id].dev_fd < 0) {
+               int fd = open(dev_name, O_RDWR);
+               
+               if (fd < 0) {
+                       fprintf(stderr, "opening %s failed: %s\n"
+                               "hint: the kernel modules may not be loaded\n",
+                               dev_name, strerror(errno));
+                       return fd;
+               }
+               ioc_dev_list[dev_id].dev_fd = fd;
+       }
+
+       return ioc_dev_list[dev_id].dev_fd;
+}
+
+
+static int 
+do_ioctl(int dev_id, int opc, void *buf)
+{
+       int fd, rc;
+       
+       fd = open_ioc_dev(dev_id);
+       if (fd < 0) 
+               return fd;
+
+       rc = ioctl(fd, opc, buf);
+       return rc;
+       
+}
+
+static FILE *
+get_dump_file() 
+{
+       FILE *fp = NULL;
+       
+       if (!dump_filename) {
+               fprintf(stderr, "no dump filename\n");
+       } else 
+               fp = fopen(dump_filename, "a");
+       return fp;
+}
+
+/*
+ * The dump file should start with a description of which devices are
+ * used, but for now it is assumed that whatever app reads the file
+ * will know what to do. */
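+/* Each record written by dump() below is a struct dump_hdr immediately
+ * followed by the raw ioctl buffer; the buffer length is taken from the
+ * portal_ioctl_hdr embedded at its start (see parse_dump() further down). */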
+int 
+dump(int dev_id, int opc, void *buf)
+{
+       FILE *fp;
+       struct dump_hdr dump_hdr;
+       struct portal_ioctl_hdr * ioc_hdr = (struct  portal_ioctl_hdr *) buf;
+       int rc;
+       
+       printf("dumping opc %x to %s\n", opc, dump_filename);
+       
+
+       dump_hdr.magic = 0xdeadbeef;
+       dump_hdr.dev_id = dev_id;
+       dump_hdr.opc = opc;
+
+       fp = get_dump_file();
+       if (fp == NULL) {
+               fprintf(stderr, "%s: %s\n", dump_filename, 
+                       strerror(errno));
+               return -EINVAL;
+       }
+       
+       rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp);
+       if (rc == 1)
+               rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp);
+       fclose(fp);
+       if (rc != 1) {
+               fprintf(stderr, "%s: %s\n", dump_filename, 
+                       strerror(errno));
+               return -EINVAL;
+       }
+       
+       return 0;
+}
+
+/* register a device to send ioctls to.  */
+int 
+register_ioc_dev(int dev_id, const char * dev_name) 
+{
+
+       if (dev_id < 0 ||
+           dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0]))
+               return -EINVAL;
+
+       unregister_ioc_dev(dev_id);
+
+       ioc_dev_list[dev_id].dev_name = dev_name;
+       ioc_dev_list[dev_id].dev_fd = -1;
+
+       return dev_id;
+}
+
+void
+unregister_ioc_dev(int dev_id) 
+{
+
+       if (dev_id < 0 ||
+           dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0]))
+               return;
+       if (ioc_dev_list[dev_id].dev_name != NULL &&
+           ioc_dev_list[dev_id].dev_fd >= 0) 
+               close(ioc_dev_list[dev_id].dev_fd);
+
+       ioc_dev_list[dev_id].dev_name = NULL;
+       ioc_dev_list[dev_id].dev_fd = -1;
+}
+
+/* If this file is set, then all ioctl buffers will be 
+   appended to the file. */
+int
+set_ioctl_dump(char * file)
+{
+       if (dump_filename)
+               free(dump_filename);
+       
+       dump_filename = strdup(file);
+       return 0;
+}
+
+int
+l_ioctl(int dev_id, int opc, void *buf)
+{
+       if (dump_filename) 
+               return dump(dev_id, opc, buf);
+       else 
+               return do_ioctl(dev_id, opc, buf);
+}
+
+/* Read an ioctl dump file, and call the ioc_func for each ioctl buffer
+ * in the file.  For example:
+ *
+ * parse_dump("lctl.dump", l_ioctl);
+ *
+ * Note: if using l_ioctl, then you also need to register_ioc_dev() for 
+ * each device used in the dump.
+ */
+int 
+parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *))
+{
+       int fd, line =0;
+       struct stat st;
+       char *buf, *end;
+       
+       fd = syscall(SYS_open, dump_file, O_RDONLY);
+
+#warning FIXME: cleanup fstat issue here
+#ifndef SYS_fstat64
+#define __SYS_fstat__ SYS_fstat
+#else
+#define __SYS_fstat__ SYS_fstat64
+#endif
+       if (syscall(__SYS_fstat__, fd, &st)) { 
+               perror("stat fails");
+               exit(1);
+       }
+
+       if (st.st_size < 1) {
+               fprintf(stderr, "KML is empty\n");
+               exit(1);
+       }
+
+       buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+       if (buf == MAP_FAILED) {
+               perror("mmap fails");
+               exit(1);
+       }
+       end = buf + st.st_size;
+       close(fd);
+       while (buf < end) {
+               struct dump_hdr *dump_hdr = (struct dump_hdr *) buf;
+               struct portal_ioctl_hdr * data;
+               char tmp[8096];
+               int rc;
+               
+               line++;
+
+               data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr));
+               if (buf + sizeof(*dump_hdr) + data->ioc_len > end) {
+                       fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf,
+                               data->ioc_len, end);
+                       return -1;
+               }
+#if 0
+               printf ("dump_hdr: %lx data: %lx\n",
+                       (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf);
+               
+               printf("%d: opcode %x len: %d  ver: %x ", line, dump_hdr->opc,
+                      data->ioc_len, data->ioc_version);
+#endif
+
+               memcpy(tmp, data, data->ioc_len);
+
+               rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp);
+               if (rc) {
+                       printf("failed: %d\n", rc);
+                       exit(1);
+               }
+
+               buf += data->ioc_len + sizeof(*dump_hdr);
+       }
+       return 0;
+}
+
+int 
+jt_ioc_dump(int argc, char **argv)
+{
+        if (argc != 2) {
+                fprintf(stderr, "usage: %s <filename>\n", argv[0]);
+                return 0;
+        }
+       printf("setting dumpfile to: %s\n", argv[1]);
+       
+       set_ioctl_dump(argv[1]);
+       return 0;
+}
diff --git a/lustre/portals/utils/parser.c b/lustre/portals/utils/parser.c
new file mode 100644 (file)
index 0000000..4d93645
--- /dev/null
@@ -0,0 +1,703 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <sys/param.h>
+#include <assert.h>
+
+#include <config.h>
+#ifdef HAVE_LIBREADLINE
+#define        READLINE_LIBRARY
+#include <readline/readline.h>
+#endif
+//extern char **completion_matches __P((char *, rl_compentry_func_t *));
+extern void using_history(void);
+extern void stifle_history(int);
+extern void add_history(char *);
+
+#include "parser.h"
+
+static command_t * top_level;      /* Top level of commands, initialized by
+                                    * InitParser                            */
+static char * parser_prompt = NULL;/* Parser prompt, set by InitParser      */
+static int done;                  /* Set to 1 if user types exit or quit   */
+
+
+/* static functions */
+static char *skipwhitespace(char *s);
+static char *skiptowhitespace(char *s);
+static command_t *find_cmd(char *name, command_t cmds[], char **next);
+static int process(char *s, char **next, command_t *lookup, command_t **result,
+                   char **prev);
+static void print_commands(char *str, command_t *table);
+
+static char * skipwhitespace(char * s)
+{
+    char * t;
+    int    len;
+
+    len = (int)strlen(s);
+    for (t = s; t <= s + len && isspace(*t); t++);
+    return(t);
+}
+
+
+static char * skiptowhitespace(char * s)
+{
+    char * t;
+
+    for (t = s; *t && !isspace(*t); t++);
+    return(t);
+}
+
+static int line2args(char *line, char **argv, int maxargs)
+{
+    char *arg;
+    int i = 0;
+
+    arg = strtok(line, " \t");
+    if ( arg ) {
+       argv[i] = arg;
+       i++;
+    } else
+       return 0;
+
+    while( (arg = strtok(NULL, " \t")) && (i <= maxargs)) {
+       argv[i] = arg;
+       i++;
+    }
+    return i;
+}
+
+/* find a command -- return it if unique otherwise print alternatives */
+static command_t *Parser_findargcmd(char *name, command_t cmds[])
+{
+       command_t *cmd;
+
+       for (cmd = cmds; cmd->pc_name; cmd++) {
+               if (strcmp(name, cmd->pc_name) == 0)
+                       return cmd;
+       }
+       return NULL;
+}
+
+int Parser_execarg(int argc, char **argv, command_t cmds[])
+{
+       command_t *cmd;
+
+        cmd = Parser_findargcmd(argv[0], cmds);
+       if ( cmd ) {
+               return (cmd->pc_func)(argc, argv);
+       } else {
+               printf("Try interactive use without arguments or use one of:\n");
+               for (cmd = cmds; cmd->pc_name; cmd++)
+                       printf("\"%s\" ", cmd->pc_name);
+               printf("\nas argument.\n");
+       }
+       return -1;
+}
+
+/* returns the command_t * (NULL if not found) corresponding to a
+   _partial_ match with the first token in name.  It sets *next to
+   point to the following token. Does not modify *name. */
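+/* For example (illustrative names only): if a table held both "start" and
+   "stat", the prefix "sta" would be ambiguous, as detected in process()
+   below, while "star" would match "start" uniquely. */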
+static command_t * find_cmd(char * name, command_t cmds[], char ** next)
+{
+        int    i, len;
+    
+        if (!cmds || !name ) 
+                return NULL;
+    
+        /* This sets name to point to the first non-white space character,
+           and next to the first whitespace after name, len to the length: do
+           this with strtok*/
+        name = skipwhitespace(name);
+        *next = skiptowhitespace(name);
+        len = *next - name;
+        if (len == 0) 
+                return NULL;
+
+        for (i = 0; cmds[i].pc_name; i++) {
+                if (strncasecmp(name, cmds[i].pc_name, len) == 0) {
+                        *next = skipwhitespace(*next);
+                        return(&cmds[i]);
+                }
+        }
+        return NULL;
+}
+
+/* Recursively process a command line string s and find the command
+   corresponding to it. This can be ambiguous, full, incomplete,
+   non-existent. */
+static int process(char *s, char ** next, command_t *lookup,
+                  command_t **result, char **prev)
+{
+    *result = find_cmd(s, lookup, next);
+    *prev = s;
+
+        /* non existent */
+        if ( ! *result ) 
+                return CMD_NONE;
+
+        /* found entry: is it ambiguous, i.e. not exact command name and
+           more than one command in the list matches.  Note that find_cmd
+           points to the first ambiguous entry */
+        if ( strncasecmp(s, (*result)->pc_name, strlen((*result)->pc_name)) &&
+             find_cmd(s, (*result) + 1, next)) 
+                return CMD_AMBIG;
+
+        /* found a unique command: component or full? */
+        if ( (*result)->pc_func ) {
+                return CMD_COMPLETE;
+        } else {
+                if ( **next == '\0' ) {
+                        return CMD_INCOMPLETE;
+                } else {
+                        return process(*next, next, (*result)->pc_sub_cmd, result, prev);
+                }
+        }
+}
+
+#ifdef HAVE_LIBREADLINE
+static command_t * match_tbl;   /* Command completion against this table */
+static char * command_generator(const char * text, int state)
+{
+        static int index,
+                len;
+        char       *name;
+
+        /* Do we have a match table? */
+        if (!match_tbl)
+                return NULL;
+
+        /* If this is the first time called on this word, state is 0 */
+        if (!state) {
+                index = 0;
+                len = (int)strlen(text);
+        }
+
+        /* Return next name in the command list that partially matches text */
+        while ( (name = (match_tbl + index)->pc_name) ) {
+                index++;
+
+                if (strncasecmp(name, text, len) == 0) {
+                        return(strdup(name));
+                }
+        }
+
+    /* No more matches */
+    return NULL;
+}
+
+/* probably called by readline */
+static char **command_completion(char * text, int start, int end)
+{
+    command_t  * table;
+    char       * pos;
+
+    match_tbl = top_level;
+    for (table = find_cmd(rl_line_buffer, match_tbl, &pos);
+        table;
+        table = find_cmd(pos, match_tbl, &pos)) {
+
+       if (*(pos - 1) == ' ') match_tbl = table->pc_sub_cmd;
+    }
+
+    return(completion_matches(text, command_generator));
+}
+#endif
+
+/* take a string and execute the function or print help */
+int execute_line(char * line)
+{
+        command_t         *cmd, *ambig;
+        char *prev;
+        char *next, *tmp;
+        char *argv[MAXARGS];
+        int         i;
+        int rc = 0;
+
+        switch( process(line, &next, top_level, &cmd, &prev) ) {
+        case CMD_AMBIG:
+                fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line);
+                while( (ambig = find_cmd(prev, cmd, &tmp)) ) {
+                        fprintf(stderr, "%s ", ambig->pc_name);
+                        cmd = ambig + 1;
+                }
+                fprintf(stderr, "\n");
+                break;
+        case CMD_NONE:
+                fprintf(stderr, "No such command, type help\n");
+                break;
+        case CMD_INCOMPLETE:
+                fprintf(stderr,
+                        "'%s' incomplete command.  Use '%s x' where x is one of:\n",
+                        line, line);
+                fprintf(stderr, "\t");
+                for (i = 0; cmd->pc_sub_cmd[i].pc_name; i++) {
+                        fprintf(stderr, "%s ", cmd->pc_sub_cmd[i].pc_name);
+                }
+                fprintf(stderr, "\n");
+                break;
+        case CMD_COMPLETE:
+                i = line2args(line, argv, MAXARGS);
+                rc = (cmd->pc_func)(i, argv);
+
+                if (rc == CMD_HELP)
+                        fprintf(stderr, "%s\n", cmd->pc_help);
+
+                break;
+        }
+
+        return rc;
+}
+
+int
+noop_fn ()
+{
+        return (0);
+}
+
+/* just in case you're ever in an airplane and discover you 
+   forgot to install readline-dev. :) */
+int init_input() 
+{
+        int   interactive = isatty (fileno (stdin));
+
+#ifdef HAVE_LIBREADLINE
+        using_history();
+        stifle_history(HISTORY);
+
+        if (!interactive)
+        {
+                rl_prep_term_function = (rl_vintfunc_t *)noop_fn;
+                rl_deprep_term_function = (rl_voidfunc_t *)noop_fn;
+        }
+
+        rl_attempted_completion_function = (CPPFunction *)command_completion;
+        rl_completion_entry_function = (void *)command_generator;
+#endif 
+        return interactive;
+}
+
+#ifndef HAVE_LIBREADLINE
+#define add_history(s)
+char * readline(char * prompt) 
+{
+        char line[2048];
+        int n = 0;
+        if (prompt)
+                printf ("%s", prompt);
+        if (fgets(line, sizeof(line), stdin) == NULL)
+                return (NULL);
+        n = strlen(line);
+        if (n && line[n-1] == '\n')
+                line[n-1] = '\0';
+        return strdup(line);
+}
+#endif
+
+/* this is the command execution machine */
+int Parser_commands(void)
+{
+        char *line, *s;
+        int rc = 0;
+        int interactive;
+        
+        interactive = init_input();
+
+        while(!done) {
+                line = readline(interactive ? parser_prompt : NULL);
+
+                if (!line) break;
+
+                s = skipwhitespace(line);
+
+                if (*s) {
+                        add_history(s);
+                        rc = execute_line(s);
+                }
+                
+                free(line);
+        }
+        return rc;
+}
+
+
+/* sets the parser prompt */
+void Parser_init(char * prompt, command_t * cmds)
+{
+    done = 0;
+    top_level = cmds;
+    if (parser_prompt) free(parser_prompt);
+    parser_prompt = strdup(prompt);
+}
+
+/* frees the parser prompt */
+void Parser_exit(int argc, char *argv[])
+{
+    done = 1;
+    free(parser_prompt);
+    parser_prompt = NULL;
+}
+
+/* convert a string to an integer */
+int Parser_int(char *s, int *val)
+{
+    int ret;
+
+    if (*s != '0')
+       ret = sscanf(s, "%d", val);
+    else if (*(s+1) != 'x')
+       ret = sscanf(s, "%o", val);
+    else {
+       s++;
+       ret = sscanf(++s, "%x", val);
+    }
+
+    return(ret);
+}
+
+
+void Parser_qhelp(int argc, char *argv[]) {
+
+    printf("Available commands are:\n");
+
+    print_commands(NULL, top_level);
+    printf("For more help type: help command-name\n");
+}
+
+int Parser_help(int argc, char **argv) 
+{
+        char line[1024];
+        char *next, *prev, *tmp;
+        command_t *result, *ambig;
+        int i;
+
+        if ( argc == 1 ) {
+                Parser_qhelp(argc, argv);
+                return 0;
+        }
+
+        line[0]='\0';
+        for ( i = 1 ;  i < argc ; i++ ) {
+                strcat(line, argv[i]);
+        }
+
+        switch ( process(line, &next, top_level, &result, &prev) ) {
+        case CMD_COMPLETE:
+                fprintf(stderr, "%s: %s\n",line, result->pc_help);
+                break;
+        case CMD_NONE:
+                fprintf(stderr, "%s: Unknown command.\n", line);
+                break;
+        case CMD_INCOMPLETE:
+                fprintf(stderr,
+                        "'%s' incomplete command.  Use '%s x' where x is one of:\n",
+                        line, line);
+                fprintf(stderr, "\t");
+                for (i = 0; result->pc_sub_cmd[i].pc_name; i++) {
+                        fprintf(stderr, "%s ", result->pc_sub_cmd[i].pc_name);
+                }
+                fprintf(stderr, "\n");
+                break;
+        case CMD_AMBIG:
+                fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line);
+                while( (ambig = find_cmd(prev, result, &tmp)) ) {
+                        fprintf(stderr, "%s ", ambig->pc_name);
+                        result = ambig + 1;
+                }
+                fprintf(stderr, "\n");
+                break;
+        }
+        return 0;
+}  
+
+
+void Parser_printhelp(char *cmd)
+{
+        char *argv[] = { "help", cmd }; 
+        Parser_help(2, argv);
+}
+
+/*************************************************************************
+ * COMMANDS                                                             *
+ *************************************************************************/
+
+
+static void print_commands(char * str, command_t * table) {
+    command_t * cmds;
+    char       buf[80];
+
+    for (cmds = table; cmds->pc_name; cmds++) {
+       if (cmds->pc_func) {
+           if (str) printf("\t%s %s\n", str, cmds->pc_name);
+           else printf("\t%s\n", cmds->pc_name);
+       }
+       if (cmds->pc_sub_cmd) {
+           if (str) {
+               sprintf(buf, "%s %s", str, cmds->pc_name);
+               print_commands(buf, cmds->pc_sub_cmd);
+           } else {
+               print_commands(cmds->pc_name, cmds->pc_sub_cmd);
+           }
+       }
+    }
+}
+
+char *Parser_getstr(const char *prompt, const char *deft, char *res,
+                   size_t len)
+{
+    char *line = NULL;
+    int size = strlen(prompt) + strlen(deft) + 8;
+    char *theprompt;
+    theprompt = malloc(size);
+    assert(theprompt);
+
+    sprintf(theprompt, "%s [%s]: ", prompt, deft);
+
+    line  = readline(theprompt);
+    free(theprompt);
+
+    if ( line == NULL || *line == '\0' ) {
+       strncpy(res, deft, len);
+    } else {
+       strncpy(res, line, len);
+    }
+
+    if ( line ) {
+       free(line);
+       return res;
+    } else {
+       return NULL;
+    }
+}
+
+/* get integer from prompt, loop forever to get it */
+int Parser_getint(const char *prompt, long min, long max, long deft, int base)
+{
+    int rc;
+    long result;
+    char *line;
+    int size = strlen(prompt) + 40;
+    char *theprompt = malloc(size);
+    assert(theprompt);
+    sprintf(theprompt,"%s [%ld, (0x%lx)]: ", prompt, deft, deft);
+
+    fflush(stdout);
+
+    do {
+       line = NULL;
+       line = readline(theprompt);
+       if ( !line ) {
+           fprintf(stdout, "Please enter an integer.\n");
+           fflush(stdout);
+           continue;
+       }
+       if ( *line == '\0' ) {
+           free(line);
+           result =  deft;
+           break;
+       }
+       rc = Parser_arg2int(line, &result, base);
+       free(line);
+       if ( rc != 0 ) {
+           fprintf(stdout, "Invalid string.\n");
+           fflush(stdout);
+       } else if ( result > max || result < min ) {
+           fprintf(stdout, "Error: response must lie between %ld and %ld.\n",
+                   min, max);
+           fflush(stdout);
+       } else {
+           break;
+       }
+    } while ( 1 ) ;
+
+    if (theprompt)
+       free(theprompt);
+    return result;
+
+}
+
+/* get boolean (answers starting with YyNn); loop forever */
+int Parser_getbool(const char *prompt, int deft)
+{
+    int result = 0;
+    char *line;
+    int size = strlen(prompt) + 8;
+    char *theprompt = malloc(size);
+    assert(theprompt);
+
+    fflush(stdout);
+
+    if ( deft != 0 && deft != 1 ) {
+       fprintf(stderr, "Error: Parser_getbool given bad default (%d).\n",
+               deft);
+       assert ( 0 );
+    }
+    sprintf(theprompt, "%s [%s]: ", prompt, (deft==0)? "N" : "Y");
+
+    do {
+       line = NULL;
+       line = readline(theprompt);
+       if ( line == NULL ) {
+           result = deft;
+           break;
+       }
+       if ( *line == '\0' ) {
+           result = deft;
+           break;
+       }
+       if ( *line == 'y' || *line == 'Y' ) {
+           result = 1;
+           break;
+       }
+       if ( *line == 'n' || *line == 'N' ) {
+           result = 0;
+           break;
+       }
+       if ( line )
+           free(line);
+       fprintf(stdout, "Invalid string. Must start with yY or nN\n");
+       fflush(stdout);
+    } while ( 1 );
+
+    if ( line )
+       free(line);
+    if ( theprompt )
+       free(theprompt);
+    return result;
+}
+
+/* parse int out of a string or prompt for it */
+long Parser_intarg(const char *inp, const char *prompt, int deft,
+                 int min, int max, int base)
+{
+    long result;
+    int rc;
+
+    rc = Parser_arg2int(inp, &result, base);
+
+    if ( rc == 0 ) {
+       return result;
+    } else {
+       return Parser_getint(prompt, min, max, deft, base);
+    }
+}
+
+/* parse a string out of the input or prompt for it */
+char *Parser_strarg(char *inp, const char *prompt, const char *deft,
+                   char *answer, int len)
+{
+    if ( inp == NULL || *inp == '\0' ) {
+       return Parser_getstr(prompt, deft, answer, len);
+    } else
+       return inp;
+}
+
+/* change a string into a number: return 0 on success. No invalid characters
+   allowed. The processing of base and validity follows strtol(3)*/
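+/* e.g. Parser_arg2int("0x1f", &val, 0) sets val to 31 and returns 0, while
+   Parser_arg2int("12abc", &val, 10) returns 1 (trailing characters). */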
+int Parser_arg2int(const char *inp, long *result, int base)
+{
+    char *endptr;
+
+    if ( (base !=0) && (base < 2 || base > 36) )
+       return 1;
+
+    *result = strtol(inp, &endptr, base);
+
+        if ( *inp != '\0' && *endptr == '\0' )
+                return 0;
+        else 
+                return 1;
+}
+
+/* Convert a human readable size string to an int; "1k" -> 1024 */
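+/* e.g. Parser_size(&n, "64") sets n = 64; Parser_size(&n, "64k") sets
+   n = 64 << 10 = 65536; Parser_size(&n, "2M") sets n = 2 << 20. */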
+int Parser_size (int *sizep, char *str) {
+        int size;
+        char mod[32];
+
+        switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) {
+        default:
+                return (-1);
+
+        case 1:
+                *sizep = size;
+                return (0);
+
+        case 2:
+                switch (*mod) {
+                case 'g':
+                case 'G':
+                        *sizep = size << 30;
+                        return (0);
+
+                case 'm':
+                case 'M':
+                        *sizep = size << 20;
+                        return (0);
+
+                case 'k':
+                case 'K':
+                        *sizep = size << 10;
+                        return (0);
+
+                default:
+                        *sizep = size;
+                        return (0);
+                }
+        }
+}
+
+/* Convert a string boolean to an int; "enable" -> 1 */
+int Parser_bool (int *b, char *str) {
+        if (!strcasecmp (str, "no") ||
+            !strcasecmp (str, "n") ||
+            !strcasecmp (str, "off") ||
+            !strcasecmp (str, "disable"))
+        {
+                *b = 0;
+                return (0);
+        }
+        
+        if (!strcasecmp (str, "yes") ||
+            !strcasecmp (str, "y") ||
+            !strcasecmp (str, "on") ||
+            !strcasecmp (str, "enable"))
+        {
+                *b = 1;
+                return (0);
+        }
+        
+        return (-1);
+}
+
+int Parser_quit(int argc, char **argv)
+{
+        argc = argc;
+        argv = argv;
+        done = 1;
+        return 0;
+}
diff --git a/lustre/portals/utils/parser.h b/lustre/portals/utils/parser.h
new file mode 100644 (file)
index 0000000..dead9f5
--- /dev/null
@@ -0,0 +1,73 @@
+#ifndef _PARSER_H_
+#define _PARSER_H_
+
+#define HISTORY        100             /* Don't let history grow unbounded    */
+#define MAXARGS 100
+
+#define CMD_COMPLETE   0
+#define CMD_INCOMPLETE 1
+#define CMD_NONE       2
+#define CMD_AMBIG      3
+#define CMD_HELP       4
+
+typedef struct parser_cmd {
+       char    *pc_name;
+       int     (* pc_func)(int, char **);
+       struct parser_cmd * pc_sub_cmd;
+       char *pc_help;
+} command_t;
+
+typedef struct argcmd {
+       char    *ac_name;
+       int      (*ac_func)(int, char **);
+       char     *ac_help;
+} argcmd_t;
+
+typedef struct network {
+       char    *type;
+       char    *server;
+       int     port;
+} network_t;
+
+int  Parser_quit(int argc, char **argv);
+void Parser_init(char *, command_t *); /* Set prompt and load command list */
+int Parser_commands(void);                     /* Start the command parser */
+void Parser_qhelp(int, char **);       /* Quick help routine */
+int Parser_help(int, char **);         /* Detailed help routine */
+void Parser_printhelp(char *);         /* Detailed help routine */
+void Parser_exit(int, char **);                /* Shuts down command parser */
+int Parser_execarg(int argc, char **argv, command_t cmds[]);
+int execute_line(char * line);
+
+/* Converts a string to an integer */
+int Parser_int(char *, int *);
+
+/* Prompts for a string, with default values and a maximum length */
+char *Parser_getstr(const char *prompt, const char *deft, char *res, 
+                   size_t len);
+
+/* Prompts for an integer, with minimum, maximum and default values and base */
+int Parser_getint(const char *prompt, long min, long max, long deft,
+                 int base);
+
+/* Prompts for a yes/no, with default */
+int Parser_getbool(const char *prompt, int deft);
+
+/* Extracts an integer from a string, or prompts if it cannot get one */
+long Parser_intarg(const char *inp, const char *prompt, int deft,
+                  int min, int max, int base);
+
+/* Extracts a word from the input, or prompts if it cannot get one */
+char *Parser_strarg(char *inp, const char *prompt, const char *deft,
+                   char *answer, int len);
+
+/* Extracts an integer from a string  with a base */
+int Parser_arg2int(const char *inp, long *result, int base);
+
+/* Convert a human-readable size string to an int; "1k" -> 1024 */
+int Parser_size(int *sizep, char *str);
+
+/* Convert a string boolean to an int; "enable" -> 1 */
+int Parser_bool(int *b, char *str);
+
+#endif
diff --git a/lustre/portals/utils/portals.c b/lustre/portals/utils/portals.c
new file mode 100644 (file)
index 0000000..90d66f5
--- /dev/null
@@ -0,0 +1,985 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <time.h>
+#include <asm/byteorder.h>
+
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+#include <portals/list.h>
+#include <portals/lib-types.h>
+#include "parser.h"
+
+unsigned int portal_debug;
+unsigned int portal_printk;
+unsigned int portal_stack;
+
+
+static ptl_nid_t g_nid = 0;
+static unsigned int g_nal = 0;
+static unsigned short g_port = 0;
+
+static int g_socket_txmem = 0;
+static int g_socket_rxmem = 0;
+static int g_socket_nonagle = 1;
+
+typedef struct
+{
+        char *name;
+        int   num;
+} name2num_t;
+
+static name2num_t nalnames[] = {
+        {"tcp",                SOCKNAL},
+        {"toe",                TOENAL},
+        {"elan",       QSWNAL},
+        {"gm",         GMNAL},
+        {"scimac",      SCIMACNAL},
+        {NULL,         -1}
+};
+
+static name2num_t *
+name2num_lookup_name (name2num_t *table, char *str)
+{
+        while (table->name != NULL)
+                if (!strcmp (str, table->name))
+                        return (table);
+                else
+                        table++;
+        return (NULL);
+}
+
+static name2num_t *
+name2num_lookup_num (name2num_t *table, int num)
+{
+        while (table->name != NULL)
+                if (num == table->num)
+                        return (table);
+                else
+                        table++;
+        return (NULL);
+}
+
+int
+ptl_name2nal (char *str)
+{
+        name2num_t *e = name2num_lookup_name (nalnames, str);
+
+        return ((e == NULL) ? 0 : e->num);
+}
+
+static char *
+nal2name (int nal)
+{
+        name2num_t *e = name2num_lookup_num (nalnames, nal);
+
+        return ((e == NULL) ? "???" : e->name);
+}
+
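+/* Accepts a NID given as a dotted-quad IP address, a resolvable hostname,
+ * or a bare number (decimal or hex).  The dotted-quad and hostname forms
+ * yield the 32-bit IPv4 address in HOST byte order. */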
+int
+ptl_parse_nid (ptl_nid_t *nidp, char *str)
+{
+        struct hostent *he;
+        int             a;
+        int             b;
+        int             c;
+        int             d;
+        
+        if (sscanf (str, "%d.%d.%d.%d", &a, &b, &c, &d) == 4 &&
+            (a & ~0xff) == 0 && (b & ~0xff) == 0 &&
+            (c & ~0xff) == 0 && (d & ~0xff) == 0)
+        {
+                __u32 addr = (a<<24)|(b<<16)|(c<<8)|d;
+
+                *nidp = (ptl_nid_t)addr;
+                return (0);
+        }
+        
+        if ((('a' <= str[0] && str[0] <= 'z') ||
+             ('A' <= str[0] && str[0] <= 'Z')) &&
+             (he = gethostbyname (str)) != NULL)
+        {
+                __u32 addr = *(__u32 *)he->h_addr;
+
+                *nidp = (ptl_nid_t)ntohl(addr);  /* HOST byte order */
+                return (0);
+        }
+
+        if (sscanf (str, "%i", &a) == 1)
+        {
+                *nidp = (ptl_nid_t)a;
+                return (0);
+        }
+
+        if (sscanf (str, "%x", &a) == 1)
+        {
+                *nidp = (ptl_nid_t) a;
+                return (0);
+        }
+
+        return (-1);
+}
+
+char *
+ptl_nid2str (char *buffer, ptl_nid_t nid)
+{
+        __u32           addr = htonl((__u32)nid); /* back to NETWORK byte order */
+        struct hostent *he = gethostbyaddr ((const char *)&addr, sizeof (addr), AF_INET);
+
+        if (he != NULL)
+                strcpy (buffer, he->h_name);
+        else
+                sprintf (buffer, "0x"LPX64, nid);
+        
+        return (buffer);
+}
+
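+/* sock_write()/sock_read() loop until exactly 'nob' bytes have been
+ * transferred, retrying on EINTR.  sock_read() maps a premature EOF to
+ * ECONNABORTED; sock_write() treats a zero-byte write as fatal. */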
+int
+sock_write (int cfd, void *buffer, int nob)
+{
+        while (nob > 0)
+        {
+                int rc = write (cfd, buffer, nob);
+
+                if (rc < 0)
+                {
+                        if (errno == EINTR)
+                                continue;
+                        
+                        return (rc);
+                }
+
+                if (rc == 0)
+                {
+                        fprintf (stderr, "Unexpected zero sock_write\n");
+                        abort();
+                }
+
+                nob -= rc;
+                buffer = (char *)buffer + rc;
+        }
+        
+        return (0);
+}
+
+int
+sock_read (int cfd, void *buffer, int nob)
+{
+        while (nob > 0)
+        {
+                int rc = read (cfd, buffer, nob);
+                
+                if (rc < 0)
+                {
+                        if (errno == EINTR)
+                                continue;
+                        
+                        return (rc);
+                }
+                
+                if (rc == 0)                    /* EOF */
+                {
+                        errno = ECONNABORTED;
+                        return (-1);
+                }
+                
+                nob -= rc;
+                buffer = (char *)buffer + rc;
+        }
+        
+        return (0);
+}
+
+int ptl_initialize(int argc, char **argv) 
+{
+        register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH);
+        return 0;
+}
+
+
+int jt_ptl_network(int argc, char **argv)
+{
+        int  nal;
+        
+        if (argc != 2 ||
+            (nal = ptl_name2nal (argv[1])) == 0)
+        {
+                name2num_t *entry;
+                
+                fprintf(stderr, "usage: %s ", argv[0]);
+                for (entry = nalnames; entry->name != NULL; entry++)
+                        fprintf (stderr, "%s%s", entry == nalnames ? "<" : "|", entry->name);
+                fprintf(stderr, ">\n");
+        }
+        else
+                g_nal = nal;
+
+        return (0);
+}
+
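+/* Minimal HELLO handshake over an already-connected socket: send a HELLO
+ * header carrying our NID (with the protocol magic/version packed into the
+ * dest_nid field, as protocol version 0 does), read back and validate the
+ * peer's magic/version, then read the rest of the peer's header and return
+ * its source NID.  All wire fields are little-endian. */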
+int
+exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid)
+{
+        int                      rc;
+        ptl_hdr_t                hdr;
+        ptl_magicversion_t      *hmv = (ptl_magicversion_t *)&hdr.dest_nid;
+
+        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
+
+        memset (&hdr, 0, sizeof (hdr));
+        
+        hmv->magic          = __cpu_to_le32 (PORTALS_PROTO_MAGIC);
+        hmv->version_major  = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR);
+        hmv->version_minor  = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR);
+
+        hdr.src_nid = __cpu_to_le64 (my_nid);
+        hdr.type = __cpu_to_le32 (PTL_MSG_HELLO);
+        
+        /* Assume there's sufficient socket buffering for a portals HELLO header */
+        rc = sock_write (cfd, &hdr, sizeof (hdr));
+        if (rc != 0) {
+                perror ("Can't send initial HELLO");
+                return (-1);
+        }
+
+        /* First few bytes down the wire are the portals protocol magic and
+         * version, no matter what protocol version we're running. */
+
+        rc = sock_read (cfd, hmv, sizeof (*hmv));
+        if (rc != 0) {
+                perror ("Can't read from peer");
+                return (-1);
+        }
+
+        if (__cpu_to_le32 (hmv->magic) != PORTALS_PROTO_MAGIC) {
+                fprintf (stderr, "Bad magic %#08x (%#08x expected)\n", 
+                         __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC);
+                return (-1);
+        }
+
+        if (__cpu_to_le16 (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR ||
+            __cpu_to_le16 (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) {
+                fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n",
+                         __cpu_to_le16 (hmv->version_major),
+                         __cpu_to_le16 (hmv->version_minor),
+                         PORTALS_PROTO_VERSION_MAJOR,
+                         PORTALS_PROTO_VERSION_MINOR);
+                return (-1);
+        }
+
+        /* version 0 sends magic/version as the dest_nid of a 'hello' header,
+         * so read the rest of it in now... */
+        LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0);
+        rc = sock_read (cfd, hmv + 1, sizeof (hdr) - sizeof (*hmv));
+        if (rc != 0) {
+                perror ("Can't read rest of HELLO hdr");
+                return (-1);
+        }
+
+        /* ...and check we got what we expected */
+        if (__cpu_to_le32 (hdr.type) != PTL_MSG_HELLO ||
+            __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)) != 0) {
+                fprintf (stderr, "Expecting a HELLO hdr with 0 payload,"
+                         " but got type %d with %d payload\n",
+                         __cpu_to_le32 (hdr.type),
+                         __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)));
+                return (-1);
+        }
+        
+        *peer_nid = __le64_to_cpu (hdr.src_nid);
+        return (0);
+}
+
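+/* 'connect' for the socknal/toenal case: resolve the host, open and tune a
+ * TCP socket, optionally exchange NIDs (the 'x' flag), then hand the
+ * connected fd to the kernel NAL via NAL_CMD_REGISTER_PEER_FD.  The 'i'
+ * flag sets bind_irq, which is passed to the kernel in ioc_flags. */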
+int jt_ptl_connect(int argc, char **argv)
+{
+        if (argc < 2) {
+        usage:
+                fprintf(stderr, "usage: %s <hostname port [xi]> or <elan ID>\n",
+                        argv[0]);
+                return 0;
+        }
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+        if (g_nal == SOCKNAL || g_nal == TOENAL) {
+                ptl_nid_t peer_nid;
+                struct hostent *he;
+                struct portal_ioctl_data data;
+                struct sockaddr_in srvaddr;
+                char *flag;
+                int fd, rc;
+                int nonagle = 0;
+                int rxmem = 0;
+                int txmem = 0;
+                int bind_irq = 0;
+                int xchange_nids = 0;
+                int o;
+                int olen;
+                
+                if (argc < 3) {
+                        goto usage;
+                }
+
+                he = gethostbyname(argv[1]);
+                if (!he) {
+                        fprintf(stderr, "gethostbyname error: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+
+                g_port = atol(argv[2]);
+
+                if (argc > 3)
+                        for (flag = argv[3]; *flag != 0; flag++)
+                                switch (*flag)
+                                {
+                                case 'i':
+                                        bind_irq = 1;
+                                        break;
+                                        
+                                case 'x':
+                                        xchange_nids = 1;
+                                        break;
+
+                                default:
+                                        fprintf (stderr, "unrecognised flag '%c'\n",
+                                                 *flag);
+                                        return (-1);
+                                }
+                
+                memset(&srvaddr, 0, sizeof(srvaddr));
+                srvaddr.sin_family = AF_INET;
+                srvaddr.sin_port = htons(g_port);
+                srvaddr.sin_addr.s_addr = *(__u32 *)he->h_addr;
+        
+                fd = socket(PF_INET, SOCK_STREAM, 0);
+                if ( fd < 0 ) {
+                        fprintf(stderr, "socket() failed: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+
+                if (g_socket_nonagle)
+                {
+                        o = 1;
+                        if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)) != 0)
+                        { 
+                                fprintf(stderr, "cannot disable nagle: %s\n", strerror(errno));
+                                return (-1);
+                        }
+                }
+
+                if (g_socket_rxmem != 0)
+                {
+                        o = g_socket_rxmem;
+                        if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &o, sizeof (o)) != 0)
+                        { 
+                                fprintf(stderr, "cannot set receive buffer size: %s\n", strerror(errno));
+                                return (-1);
+                        }
+                }
+
+                if (g_socket_txmem != 0)
+                {
+                        o = g_socket_txmem;
+                        if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &o, sizeof (o)) != 0)
+                        { 
+                                fprintf(stderr, "cannot set send buffer size: %s\n", strerror(errno));
+                                return (-1);
+                        }
+                }
+
+                rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
+                if ( rc == -1 ) { 
+                        fprintf(stderr, "connect() failed: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+
+                olen = sizeof (txmem);
+                if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &olen) != 0)
+                        fprintf (stderr, "Can't get send buffer size: %s\n", strerror (errno));
+                olen = sizeof (rxmem);
+                if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &olen) != 0)
+                        fprintf (stderr, "Can't get receive buffer size: %s\n", strerror (errno));
+                olen = sizeof (nonagle);
+                if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &olen) != 0)
+                        fprintf (stderr, "Can't get nagle: %s\n", strerror (errno));
+
+                if (xchange_nids) {
+                        
+                        PORTAL_IOC_INIT (data);
+                        data.ioc_nal = g_nal;
+                        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data);
+                        if (rc != 0)
+                        {
+                                fprintf (stderr, "failed to get my nid: %s\n",
+                                         strerror (errno));
+                                close (fd);
+                                return (-1);
+                        }
+                        
+                        rc = exchange_nids (fd, data.ioc_nid, &peer_nid);
+                        if (rc != 0)
+                        {
+                                close (fd);
+                                return (-1);
+                        }
+                }
+                else
+                        peer_nid = ntohl (srvaddr.sin_addr.s_addr); /* HOST byte order */
+
+                printf("Connected host: %s NID "LPX64" snd: %d rcv: %d nagle: %s\n", argv[1],
+                       peer_nid, txmem, rxmem, nonagle ? "Disabled" : "Enabled");
+
+                PORTAL_IOC_INIT(data);
+                data.ioc_fd = fd;
+                data.ioc_nal = g_nal;
+                data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD;
+                data.ioc_nid = peer_nid;
+                data.ioc_flags = bind_irq;
+                
+                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+                if (rc) {
+                        fprintf(stderr, "failed to register fd with portals: "
+                                "%s\n", strerror(errno));
+                        close (fd);
+                        return -1;
+                }
+
+                g_nid = peer_nid;
+                printf("Connection to "LPX64" registered with socknal\n", g_nid);
+
+                rc = close(fd);
+                if (rc) {
+                        fprintf(stderr, "close failed: %d\n", rc);
+                }
+        } else if (g_nal == QSWNAL) {
+                g_nid = atoi(argv[1]);
+        } else if (g_nal == GMNAL) {
+                g_nid = atoi(argv[1]);
+        } else if (g_nal == SCIMACNAL) {
+                unsigned int    tmpnid;
+                if(sscanf(argv[1], "%x", &tmpnid) == 1) {
+                        g_nid=tmpnid;
+                }
+                else {
+                        fprintf(stderr, "nid %s invalid for SCI nal\n", argv[1]);
+                }
+
+
+        } else {
+                fprintf(stderr, "This should never happen.  Also it is very "
+                        "bad.\n");
+        }
+
+        return 0;
+}
+
+int jt_ptl_disconnect(int argc, char **argv)
+{
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [hostname]\n", argv[0]);
+                return 0;
+        }
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+        if (g_nal == SOCKNAL || g_nal == TOENAL) {
+                struct hostent *he;
+                struct portal_ioctl_data data;
+                int rc;
+
+                PORTAL_IOC_INIT(data);
+                if (argc == 2) {
+                        he = gethostbyname(argv[1]);
+                        if (!he) {
+                                fprintf(stderr, "gethostbyname error: %s\n",
+                                        strerror(errno));
+                                return -1;
+                        }
+                        
+                        data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */
+
+                } else {
+                        printf("Disconnecting ALL connections.\n");
+                        /* leave ioc_nid zeroed == disconnect all */
+                }
+                data.ioc_nal = g_nal;
+                data.ioc_nal_cmd = NAL_CMD_CLOSE_CONNECTION;
+                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+                if (rc) {
+                        fprintf(stderr, "failed to remove connection: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+        } else if (g_nal == QSWNAL) {
+                printf("'disconnect' doesn't make any sense for "
+                        "elan.\n");
+        } else if (g_nal == GMNAL) {
+                printf("'disconnect' doesn't make any sense for "
+                        "GM.\n");
+        } else if (g_nal == SCIMACNAL) {
+                printf("'disconnect' doesn't make any sense for "
+                        "SCI.\n");
+        } else {
+                fprintf(stderr, "This should never happen.  Also it is very "
+                        "bad.\n");
+                return -1;
+        }
+
+        return 0;
+}
+
+int jt_ptl_push_connection (int argc, char **argv)
+{
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [hostname]\n", argv[0]);
+                return 0;
+        }
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+        if (g_nal == SOCKNAL || g_nal == TOENAL) {
+                struct hostent *he;
+                struct portal_ioctl_data data;
+                int rc;
+
+                PORTAL_IOC_INIT(data);
+                if (argc == 2) {
+                        he = gethostbyname(argv[1]);
+                        if (!he) {
+                                fprintf(stderr, "gethostbyname error: %s\n",
+                                        strerror(errno));
+                                return -1;
+                        }
+                        
+                        data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */
+
+                } else {
+                        printf("Pushing ALL connections.\n");
+                        /* leave ioc_nid zeroed == disconnect all */
+                }
+                data.ioc_nal = g_nal;
+                data.ioc_nal_cmd = NAL_CMD_PUSH_CONNECTION;
+                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+                if (rc) {
+                        fprintf(stderr, "failed to push connection: %s\n",
+                                strerror(errno));
+                        return -1;
+                }
+        } else if (g_nal == QSWNAL) {
+                printf("'push' doesn't make any sense for elan.\n");
+        } else if (g_nal == GMNAL) {
+                printf("'push' doesn't make any sense for GM.\n");
+        } else if (g_nal == SCIMACNAL) {
+                printf("'push' doesn't make any sense for SCI.\n");
+        } else {
+                fprintf(stderr, "This should never happen.  Also it is very "
+                        "bad.\n");
+                return -1;
+        }
+
+        return 0;
+}
+
+int jt_ptl_ping(int argc, char **argv)
+{
+        int       rc;
+        ptl_nid_t nid;
+        long      count   = 1;
+        long      size    = 4;
+        long      timeout = 1;
+        struct portal_ioctl_data data;
+
+        if (argc < 2) {
+                fprintf(stderr, "usage: %s nid [count] [size] [timeout (secs)]\n", argv[0]);
+                return 0;
+        }
+
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+
+        if (ptl_parse_nid (&nid, argv[1]) != 0)
+        {
+                fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]);
+                return (-1);
+        }
+        
+        if (argc > 2)
+        {
+                count = atol(argv[2]);
+
+                if (count < 0 || count > 20000) 
+                {
+                        fprintf(stderr, "are you insane?  %ld is a crazy count.\n", count);
+                        return -1;
+                }
+        }
+        
+        if (argc > 3)
+                size= atol(argv[3]);
+
+        if (argc > 4)
+                timeout = atol (argv[4]);
+        
+        PORTAL_IOC_INIT (data);
+        data.ioc_count   = count;
+        data.ioc_size    = size;
+        data.ioc_nid     = nid;
+        data.ioc_nal     = g_nal;
+        data.ioc_timeout = timeout;
+        
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PING, &data);
+        if (rc) {
+                fprintf(stderr, "failed to start pinger: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+        return 0;
+}
+
+int jt_ptl_shownid(int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        int                      rc;
+        
+        if (argc > 1) {
+                fprintf(stderr, "usage: %s\n", argv[0]);
+                return 0;
+        }
+        
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command first\n");
+                return -1;
+        }
+        
+        PORTAL_IOC_INIT (data);
+        data.ioc_nal = g_nal;
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data);
+        if (rc < 0)
+                fprintf(stderr, "getting my NID failed: %s\n",
+                        strerror (errno));
+        else
+                printf(LPX64"\n", data.ioc_nid);
+        return 0;
+}
+
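+/* 'mynid' registers the local NID with the kernel NAL via
+ * NAL_CMD_REGISTER_MYNID; with no argument the NID is derived from
+ * gethostname(). */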
+int jt_ptl_mynid(int argc, char **argv)
+{
+        int rc;
+        char hostname[1024];
+        char *nidstr;
+        struct portal_ioctl_data data;
+        ptl_nid_t mynid;
+        
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [NID]\n", argv[0]);
+                fprintf(stderr, "NID defaults to the primary IP address of the machine.\n");
+                return 0;
+        }
+
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return -1;
+        }
+
+        if (argc >= 2)
+                nidstr = argv[1];
+        else if (gethostname(hostname, sizeof(hostname)) != 0) {
+                fprintf(stderr, "gethostname failed: %s\n",
+                        strerror(errno));
+                return -1;
+        }
+        else
+                nidstr = hostname;
+
+        rc = ptl_parse_nid (&mynid, nidstr);
+        if (rc != 0) {
+                fprintf (stderr, "Can't convert '%s' into a NID\n", nidstr);
+                return -1;
+        }
+        
+        PORTAL_IOC_INIT(data);
+        data.ioc_nid = mynid;
+        data.ioc_nal = g_nal;
+        data.ioc_nal_cmd = NAL_CMD_REGISTER_MYNID;
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
+        if (rc < 0)
+                fprintf(stderr, "setting my NID failed: %s\n",
+                       strerror(errno));
+        else
+                printf("registered my nid "LPX64" (%s)\n", mynid, nidstr);
+        return 0;
+}
+
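+/* 'fail' injects failures for traffic to the given NID (or every NID for
+ * "_all_"): presumably the next 'count' messages are dropped, a count of 0
+ * mends the NID again, and with no count the failure persists
+ * (PTL_MD_THRESH_INF). */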
+int
+jt_ptl_fail_nid (int argc, char **argv)
+{
+        int                      rc;
+        ptl_nid_t                nid;
+        unsigned int             threshold;
+        struct portal_ioctl_data data;
+
+        if (argc < 2 || argc > 3)
+        {
+                fprintf (stderr, "usage: %s nid|\"_all_\" [count (0 == mend)]\n", argv[0]);
+                return (0);
+        }
+        
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return (-1);
+        }
+
+        if (!strcmp (argv[1], "_all_"))
+                nid = PTL_NID_ANY;
+        else if (ptl_parse_nid (&nid, argv[1]) != 0)
+        {
+                fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]);
+                return (-1);
+        }
+
+        if (argc < 3)
+                threshold = PTL_MD_THRESH_INF;
+        else if (sscanf (argv[2], "%i", &threshold) != 1) {
+                fprintf (stderr, "Can't parse count \"%s\"\n", argv[2]);
+                return (-1);
+        }
+        
+        PORTAL_IOC_INIT (data);
+        data.ioc_nal = g_nal;
+        data.ioc_nid = nid;
+        data.ioc_count = threshold;
+        
+        rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_FAIL_NID, &data);
+        if (rc < 0)
+                fprintf (stderr, "IOC_PORTAL_FAIL_NID failed: %s\n",
+                         strerror (errno));
+        else
+                printf ("%s %s\n", threshold == 0 ? "Unfailing" : "Failing", argv[1]);
+        
+        return (0);
+}
+
+int
+jt_ptl_rxmem (int argc, char **argv)
+{
+        int   size;
+        
+        if (argc > 1)
+        {
+                if (Parser_size (&size, argv[1]) != 0 || size < 0)
+                {
+                        fprintf (stderr, "Can't parse size %s\n", argv[1]);
+                        return (0);
+                }
+
+                g_socket_rxmem = size;
+        }
+        printf ("Socket rmem = %d\n", g_socket_rxmem);        
+        return (0);
+}
+
+int
+jt_ptl_txmem (int argc, char **argv)
+{
+        int   size;
+        
+        if (argc > 1)
+        {
+                if (Parser_size (&size, argv[1]) != 0 || size < 0)
+                {
+                        fprintf (stderr, "Can't parse size %s\n", argv[1]);
+                        return (0);
+                }
+                g_socket_txmem = size;
+        }
+        printf ("Socket txmem = %d\n", g_socket_txmem);
+        return (0);
+}
+
+int
+jt_ptl_nagle (int argc, char **argv)
+{
+        int enable;
+
+        if (argc > 1)
+        {
+                if (Parser_bool (&enable, argv[1]) != 0)
+                {
+                        fprintf (stderr, "Can't parse boolean %s\n", argv[1]);
+                        return (0);
+                }
+                g_socket_nonagle = !enable;
+        }
+        printf ("Nagle %s\n", g_socket_nonagle ? "disabled" : "enabled");
+        return (0);
+}
+
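+/* add_route installs a routing entry on the current NAL: traffic for any
+ * NID in the inclusive range [target, target2] (a single NID if only one
+ * target is given) is routed via the gateway NID. */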
+int
+jt_ptl_add_route (int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        ptl_nid_t                nid1;
+        ptl_nid_t                nid2;
+        ptl_nid_t                gateway_nid;
+        int                      rc;
+        
+        if (argc < 3)
+        {
+                fprintf (stderr, "usage: %s gateway target [target]\n", argv[0]);
+                return (0);
+        }
+
+        if (g_nal == 0) {
+                fprintf(stderr, "Error: you must run the 'network' command "
+                        "first.\n");
+                return (-1);
+        }
+
+        if (ptl_parse_nid (&gateway_nid, argv[1]) != 0)
+        {
+                fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]);
+                return (-1);
+        }
+
+        if (ptl_parse_nid (&nid1, argv[2]) != 0)
+        {
+                fprintf (stderr, "Can't parse first target NID \"%s\"\n", argv[2]);
+                return (-1);
+        }
+
+        if (argc < 4)
+                nid2 = nid1;
+        else if (ptl_parse_nid (&nid2, argv[3]) != 0)
+        {
+                fprintf (stderr, "Can't parse second target NID \"%s\"\n", argv[3]);
+                return (-1);
+        }
+
+        PORTAL_IOC_INIT(data);
+        data.ioc_nid = gateway_nid;
+        data.ioc_nal = g_nal;
+        data.ioc_nid2 = MIN (nid1, nid2);
+        data.ioc_nid3 = MAX (nid1, nid2);
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_ADD_ROUTE, &data);
+        if (rc != 0) 
+        {
+                fprintf (stderr, "IOC_PORTAL_ADD_ROUTE failed: %s\n", strerror (errno));
+                return (-1);
+        }
+        
+        return (0);
+}
+
+int
+jt_ptl_del_route (int argc, char **argv)
+{
+        struct portal_ioctl_data data;
+        ptl_nid_t                nid;
+        int                      rc;
+        
+        if (argc < 2)
+        {
+                fprintf (stderr, "usage: %s targetNID\n", argv[0]);
+                return (0);
+        }
+
+        if (ptl_parse_nid (&nid, argv[1]) != 0)
+        {
+                fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]);
+                return (-1);
+        }
+
+        PORTAL_IOC_INIT(data);
+        data.ioc_nid = nid;
+
+        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_DEL_ROUTE, &data);
+        if (rc != 0) 
+        {
+                fprintf (stderr, "IOC_PORTAL_DEL_ROUTE ("LPX64") failed: %s\n", nid, strerror (errno));
+                return (-1);
+        }
+        
+        return (0);
+}
+
+int
+jt_ptl_print_routes (int argc, char **argv)
+{
+        char                      buffer[3][128];
+        struct portal_ioctl_data  data;
+        int                       rc;
+        int                       index;
+        int                      gateway_nal;
+        ptl_nid_t                gateway_nid;
+        ptl_nid_t                nid1;
+        ptl_nid_t                nid2;
+        
+        
+        for (index = 0;;index++)
+        {
+                PORTAL_IOC_INIT(data);
+                data.ioc_count = index;
+                
+                rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_ROUTE, &data);
+                if (rc != 0)
+                        break;
+
+                gateway_nal = data.ioc_nal;
+                gateway_nid = data.ioc_nid;
+                nid1 = data.ioc_nid2;
+                nid2 = data.ioc_nid3;
+                
+                printf ("%8s %18s : %s - %s\n", 
+                        nal2name (gateway_nal), 
+                        ptl_nid2str (buffer[0], gateway_nid),
+                        ptl_nid2str (buffer[1], nid1),
+                        ptl_nid2str (buffer[2], nid2));
+        }
+        return (0);
+}
+
diff --git a/lustre/portals/utils/ptlctl.c b/lustre/portals/utils/ptlctl.c
new file mode 100644 (file)
index 0000000..8c56d93
--- /dev/null
@@ -0,0 +1,65 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <portals/api-support.h>
+#include <portals/ptlctl.h>
+
+#include "parser.h"
+
+
+command_t list[] = {
+        {"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"},
+        {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: <hostname port> | <id> for tcp/elan respectively)"},
+        {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [hostname])"},
+        {"push", jt_ptl_push_connection, 0, "flush connection to a remote nid (args: [hostname])"},
+        {"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"},
+        {"shownid", jt_ptl_shownid, 0, "print the local NID"},
+        {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"},
+        {"add_route", jt_ptl_add_route, 0, "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"},
+        {"del_route", jt_ptl_del_route, 0, "delete an entry from the routing table (args: targetNID)"},
+        {"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"},
+        {"recv_mem", jt_ptl_rxmem, 0, "Set socket receive buffer size (args: [size])"},
+        {"send_mem", jt_ptl_txmem, 0, "Set socket send buffer size (args: [size])"},
+        {"nagle", jt_ptl_nagle, 0, "Enable/Disable Nagle (args: [on/off])"},
+        {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"},
+        {"fail", jt_ptl_fail_nid, 0, "usage: fail nid|_all_ [count]"},
+        {"help", Parser_help, 0, "help"},
+        {"exit", Parser_quit, 0, "quit"},
+        {"quit", Parser_quit, 0, "quit"},
+        { 0, 0, 0, NULL }
+};
+
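+/* With arguments, execute that single command via Parser_execarg() and
+ * exit; otherwise drop into the interactive "ptlctl > " shell. */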
+int main(int argc, char **argv)
+{
+        if (ptl_initialize(argc, argv) < 0)
+                exit(1);
+
+        Parser_init("ptlctl > ", list);
+        if (argc > 1)
+                return Parser_execarg(argc - 1, &argv[1], list);
+
+        Parser_commands();
+
+        return 0;
+}
diff --git a/lustre/portals/utils/routerstat.c b/lustre/portals/utils/routerstat.c
new file mode 100644 (file)
index 0000000..37da12c
--- /dev/null
@@ -0,0 +1,99 @@
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
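+/* routerstat samples /proc/sys/portals/router.  Each sample is a line of
+ * "bytes packets errors [depth]"; with an interval argument the tool loops,
+ * printing per-interval rates, and writes a newline back to the proc file
+ * after each sample, apparently to mark the start of a new interval. */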
+double
+timenow ()
+{
+   struct timeval tv;
+   
+   gettimeofday (&tv, NULL);
+   return (tv.tv_sec + tv.tv_usec / 1000000.0);
+}
+
+void
+do_stat (int fd)
+{
+   static char  buffer[1024];
+   static double last = 0.0;
+   double now;
+   double t;
+   long long bytes;
+   long      packets;
+   long      errors;
+   long      depth;
+   int    n;
+   
+   lseek (fd, 0, SEEK_SET);
+   now = timenow();
+   n = read (fd, buffer, sizeof (buffer));
+   if (n < 0)
+   {
+      fprintf (stderr, "Can't read statfile\n");
+      exit (1);
+   }    
+   buffer[n] = 0;
+   
+   n = sscanf (buffer, "%Ld %ld %ld %ld", &bytes, &packets, &errors, &depth);
+   
+   if (n < 3)
+   {
+      fprintf (stderr, "Can't parse statfile\n");
+      exit (1);
+   }
+   
+   if (last == 0.0)
+      printf ("%Ld bytes, %ld packets (sz %Ld) %ld errors", 
+             bytes, packets, (long long)((packets == 0) ? 0LL : bytes/packets), errors);
+   else
+   {
+      t = now - last;
+
+      printf ("%9Ld (%7.2fMb/s), %7ld packets (sz %5Ld, %5ld/s) %ld errors (%ld/s)", 
+             bytes, ((double)bytes)/((1<<20) * t),
+             packets, (long long)((packets == 0) ? 0LL : bytes/packets), (long)(packets/t),
+             errors, (long)(errors/t));
+   }
+
+   if (n == 4)
+      printf (" (%ld)\n", depth);
+   else
+      printf ("\n");
+
+   fflush (stdout);
+   
+   lseek (fd, 0, SEEK_SET);
+   write (fd, "\n", 1);
+   last = timenow();
+}
+
+int main (int argc, char **argv)
+{
+   int  interval = 0;
+   int  fd;
+   
+   if (argc > 1)
+      interval = atoi (argv[1]);
+
+   fd = open ("/proc/sys/portals/router", O_RDWR);
+   if (fd < 0)
+   {
+      fprintf (stderr, "Can't open stat: %s\n", strerror (errno));
+      return (1);
+   }
+   
+   do_stat (fd);
+   if (interval == 0)
+      return (0);
+   
+   for (;;)
+   {
+      sleep (interval);
+      do_stat (fd);
+   }
+}
diff --git a/lustre/portals/utils/wirecheck.c b/lustre/portals/utils/wirecheck.c
new file mode 100644 (file)
index 0000000..6a4377b
--- /dev/null
@@ -0,0 +1,141 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#include <stdio.h>
+#include <portals/api-support.h>
+#include <portals/list.h>
+#include <portals/lib-types.h>
+
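+/* wirecheck is a build helper: it prints the C source of a function
+ * lib_assert_wire_constants() full of LASSERT()s pinning down the sizes and
+ * offsets of the wire structures as laid out on the machine that ran it.
+ * CHECK_MEMBER(ptl_hdr_t, type), for instance, emits two lines of the form
+ *
+ *        LASSERT (offsetof(ptl_hdr_t, type) == <value on this machine>);
+ *        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->type) == <value>);
+ */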
+#define BLANK_LINE()                           \
+do {                                           \
+       printf ("\n");                          \
+} while (0)
+
+#define COMMENT(c)                             \
+do {                                           \
+       printf ("        /* "c" */\n");         \
+} while (0)
+
+#define STRINGIFY(a) #a
+
+#define CHECK_DEFINE(a)                                                \
+do {                                                           \
+       printf ("        LASSERT ("#a" == "STRINGIFY(a)");\n"); \
+} while (0)
+
+#define CHECK_VALUE(a)                                 \
+do {                                                   \
+       printf ("        LASSERT ("#a" == %d);\n", a);  \
+} while (0)
+
+#define CHECK_MEMBER_OFFSET(s,m)               \
+do {                                           \
+       CHECK_VALUE(offsetof(s, m));            \
+} while (0)
+
+#define CHECK_MEMBER_SIZEOF(s,m)               \
+do {                                           \
+       CHECK_VALUE((int)sizeof(((s *)0)->m));  \
+} while (0)
+
+#define CHECK_MEMBER(s,m)                      \
+do {                                           \
+       CHECK_MEMBER_OFFSET(s, m);              \
+       CHECK_MEMBER_SIZEOF(s, m);              \
+} while (0)
+
+#define CHECK_STRUCT(s)                         \
+do {                                            \
+        BLANK_LINE ();                          \
+        COMMENT ("Checks for struct "#s);       \
+       CHECK_VALUE((int)sizeof(s));            \
+} while (0)
+
+void
+check_ptl_handle_wire (void)
+{
+       CHECK_STRUCT (ptl_handle_wire_t);
+       CHECK_MEMBER (ptl_handle_wire_t, wh_interface_cookie);
+       CHECK_MEMBER (ptl_handle_wire_t, wh_object_cookie);
+}
+
+void
+check_ptl_magicversion (void)
+{
+       CHECK_STRUCT (ptl_magicversion_t);
+       CHECK_MEMBER (ptl_magicversion_t, magic);
+       CHECK_MEMBER (ptl_magicversion_t, version_major);
+       CHECK_MEMBER (ptl_magicversion_t, version_minor);
+}
+
+void
+check_ptl_hdr (void)
+{
+       CHECK_STRUCT (ptl_hdr_t);
+       CHECK_MEMBER (ptl_hdr_t, dest_nid);
+       CHECK_MEMBER (ptl_hdr_t, src_nid);
+       CHECK_MEMBER (ptl_hdr_t, dest_pid);
+       CHECK_MEMBER (ptl_hdr_t, src_pid);
+       CHECK_MEMBER (ptl_hdr_t, type);
+
+        BLANK_LINE ();
+        COMMENT ("Ack");
+        CHECK_MEMBER (ptl_hdr_t, msg.ack.mlength);
+        CHECK_MEMBER (ptl_hdr_t, msg.ack.dst_wmd);
+        CHECK_MEMBER (ptl_hdr_t, msg.ack.match_bits);
+        CHECK_MEMBER (ptl_hdr_t, msg.ack.length);
+
+        BLANK_LINE ();
+        COMMENT ("Put");
+       CHECK_MEMBER (ptl_hdr_t, msg.put.ptl_index);
+       CHECK_MEMBER (ptl_hdr_t, msg.put.ack_wmd);
+       CHECK_MEMBER (ptl_hdr_t, msg.put.match_bits);
+       CHECK_MEMBER (ptl_hdr_t, msg.put.length);
+       CHECK_MEMBER (ptl_hdr_t, msg.put.offset);
+       CHECK_MEMBER (ptl_hdr_t, msg.put.hdr_data);
+
+        BLANK_LINE ();
+        COMMENT ("Get");
+       CHECK_MEMBER (ptl_hdr_t, msg.get.ptl_index);
+       CHECK_MEMBER (ptl_hdr_t, msg.get.return_wmd);
+       CHECK_MEMBER (ptl_hdr_t, msg.get.match_bits);
+       CHECK_MEMBER (ptl_hdr_t, msg.get.length);
+       CHECK_MEMBER (ptl_hdr_t, msg.get.src_offset);
+       CHECK_MEMBER (ptl_hdr_t, msg.get.return_offset);
+       CHECK_MEMBER (ptl_hdr_t, msg.get.sink_length);
+
+        BLANK_LINE ();
+        COMMENT ("Reply");
+       CHECK_MEMBER (ptl_hdr_t, msg.reply.dst_wmd);
+       CHECK_MEMBER (ptl_hdr_t, msg.reply.dst_offset);
+       CHECK_MEMBER (ptl_hdr_t, msg.reply.length);
+}
+
+int
+main (int argc, char **argv)
+{
+       printf ("void lib_assert_wire_constants (void)\n"
+               "{\n");
+
+       COMMENT ("Wire protocol assertions generated by 'wirecheck'");
+       BLANK_LINE ();
+       
+       COMMENT ("Constants...");
+       CHECK_DEFINE (PORTALS_PROTO_MAGIC);
+       CHECK_DEFINE (PORTALS_PROTO_VERSION_MAJOR);
+       CHECK_DEFINE (PORTALS_PROTO_VERSION_MINOR);
+
+       CHECK_VALUE (PTL_MSG_ACK);
+       CHECK_VALUE (PTL_MSG_PUT);
+       CHECK_VALUE (PTL_MSG_GET);
+       CHECK_VALUE (PTL_MSG_REPLY);
+       CHECK_VALUE (PTL_MSG_HELLO);
+
+       check_ptl_handle_wire ();
+       check_ptl_magicversion ();
+       check_ptl_hdr ();
+       
+       printf ("}\n\n");
+       
+       return (0);
+}
index 28ca368..a367903 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/major.h>
 #include <linux/smp.h>
+#include <linux/hdreg.h>
 
 #define DEBUG_SUBSYSTEM S_PTLBD
 
@@ -95,20 +96,26 @@ static int ptlbd_open(struct inode *inode, struct file  *file)
         struct ptlbd_obd *ptlbd = ptlbd_get_inode(inode);
         ENTRY;
 
+
         if ( IS_ERR(ptlbd) )
                 RETURN(PTR_ERR(ptlbd));
-        if ( ptlbd->bd_import.imp_connection == NULL )
-                RETURN(-ENODEV);
+
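+        /* connect lazily on first open: no remote handle cookie yet means
+         * this device has not connected to the server */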
+        if (! ptlbd->bd_import->imp_remote_handle.cookie)
+               if (ptlbd_do_connect(ptlbd))
+                       RETURN(-ENOTCONN);
 
         ptlbd->refcount++;
         RETURN(0);
 }
 
+
 static int ptlbd_ioctl(struct inode *inode, struct file *file,
                 unsigned int cmd, unsigned long arg)
 {
         struct ptlbd_obd *ptlbd;
         int ret;
+        __u16   major, minor, dev;
+        struct hd_geometry geo;
 
         if ( ! capable(CAP_SYS_ADMIN) )
                 RETURN(-EPERM);
@@ -117,11 +124,50 @@ static int ptlbd_ioctl(struct inode *inode, struct file *file,
         if ( IS_ERR(ptlbd) )
                 RETURN( PTR_ERR(ptlbd) );
 
+        major = MAJOR(inode->i_rdev);
+        minor = MINOR(inode->i_rdev);
+        dev = inode->i_rdev;
+
         switch(cmd) {
+                case HDIO_GETGEO:
+                        geo.heads = 64;
+                        geo.sectors = 32;
+                        geo.start = 4;
+                        geo.cylinders = blk_size[major][minor]/
+                                        (geo.heads * geo.sectors);
+                        if (copy_to_user((void *) arg, &geo, sizeof(geo)))
+                                ret = -EFAULT;
+                        else  
+                                ret = 0;
+                        break;
+
+                case BLKSECTGET:
+                        ret = copy_to_user((void *) arg, 
+                                & max_sectors[major][minor], sizeof(arg));
+                        break;
+
                 case BLKFLSBUF:
-                        ret = blk_ioctl(inode->i_rdev, cmd, arg);
+                        ret = blk_ioctl(dev, cmd, arg);
+                        ptlbd_send_flush_req(ptlbd, PTLBD_FLUSH);
                         break;
+
+                case BLKGETSIZE:
+                case BLKGETSIZE64:
+                case BLKROSET:
+                case BLKROGET:
+                case BLKRASET:
+                case BLKRAGET:
+                case BLKSSZGET:
+                case BLKELVGET:
+                case BLKELVSET:
                 default:
+                        ret = blk_ioctl(dev, cmd, arg);
+                        break;
+
+                case BLKSECTSET:       /* don't allow setting of max_sectors */
+
+                case BLKRRPART:        /* not a partitionable device */
+                case BLKPG:            /* "" */
                         ret = -EINVAL;
                         break;
         }
@@ -137,7 +183,9 @@ static int ptlbd_release(struct inode *inode, struct file *file)
         if ( IS_ERR(ptlbd) ) 
                 RETURN( PTR_ERR(ptlbd) );
 
-        ptlbd->refcount--;
+        if (--ptlbd->refcount == 0)
+                ptlbd_do_disconnect(ptlbd);
+
         RETURN(0);
 }
 
@@ -174,6 +222,7 @@ static void ptlbd_request(request_queue_t *q)
         struct ptlbd_obd *ptlbd;
         struct request *req;
         ptlbd_cmd_t cmd;
+        int     errors = 0;
         ENTRY;
 
         while ( !QUEUE_EMPTY ) {
@@ -190,19 +239,18 @@ static void ptlbd_request(request_queue_t *q)
 
                 spin_unlock_irq(&io_request_lock);
 
-                /* XXX dunno if we're supposed to get this or not.. */
-                /* __make_request() changes READA to READ - Kris */
-                LASSERT(req->cmd != READA);
-
                 if ( req->cmd == READ )
                         cmd = PTLBD_READ;
                 else 
                         cmd = PTLBD_WRITE;
 
-                ptlbd_send_req(ptlbd, cmd, req);
+                errors = ptlbd_send_rw_req(ptlbd, cmd, req->bh);
 
                 spin_lock_irq(&io_request_lock);
 
+                if (errors)
+                        req->errors += errors;
+
                 ptlbd_end_request_havelock(req);
         }
 }
@@ -228,7 +276,6 @@ int ptlbd_blk_init(void)
         blksize_size[PTLBD_MAJOR] = ptlbd_size_size;
         hardsect_size[PTLBD_MAJOR] = ptlbd_hardsect_size;
         max_sectors[PTLBD_MAJOR] = ptlbd_max_sectors;
-        //RHism blkdev_varyio[PTLBD_MAJOR] = ptlbd_dev_varyio;
 
         blk_init_queue(BLK_DEFAULT_QUEUE(PTLBD_MAJOR), ptlbd_request);
         blk_queue_headactive(BLK_DEFAULT_QUEUE(MAJOR_NR), 0);
@@ -238,9 +285,7 @@ int ptlbd_blk_init(void)
                 /* avoid integer overflow */
                 ptlbd_size[i] = (16*1024*((1024*1024) >> BLOCK_SIZE_BITS));
                 ptlbd_hardsect_size[i] = 4096;
-                ptlbd_max_sectors[i] = 2;
-                //RHism ptlbd_dev_varyio[i] = 0;
-                /* XXX register_disk? */
+                ptlbd_max_sectors[i] = PTL_MD_MAX_IOV * (4096/512);
         }
 
         return 0;
index 8d957db..f36a3c7 100644 (file)
 #include <linux/lprocfs_status.h>
 #include <linux/obd_ptlbd.h>
 
-static int ptlbd_cl_setup(struct obd_device *obddev, obd_count len, void *buf)
+static int ptlbd_cl_setup(struct obd_device *obd, obd_count len, void *buf)
 {
-        struct ptlbd_obd *ptlbd = &obddev->u.ptlbd;
-        struct obd_import *imp = &ptlbd->bd_import;
+        struct ptlbd_obd *ptlbd = &obd->u.ptlbd;
+        struct obd_import *imp;
         struct obd_ioctl_data* data = buf;
-        struct obd_uuid server_uuid;
         ENTRY;
 
-        if ( ptlbd->bd_import.imp_connection != NULL )
+        if (ptlbd->bd_import != NULL)
                 RETURN(-EALREADY);
 
         if (data->ioc_inllen1 < 1) {
@@ -53,82 +52,144 @@ static int ptlbd_cl_setup(struct obd_device *obddev, obd_count len, void *buf)
                 RETURN(-EINVAL);
         }
 
-        obd_str2uuid(&server_uuid, data->ioc_inlbuf1);
-
-        imp->imp_connection = ptlrpc_uuid_to_connection(&server_uuid);
-        if (!imp->imp_connection)
-                RETURN(-ENOENT);
+        obd_str2uuid(&ptlbd->bd_server_uuid, data->ioc_inlbuf1);
 
-        INIT_LIST_HEAD(&imp->imp_replay_list);
-        INIT_LIST_HEAD(&imp->imp_sending_list);
-        INIT_LIST_HEAD(&imp->imp_delayed_list);
-        spin_lock_init(&imp->imp_lock);
         /*
          * from client_obd_connect.. *shrug*
          */
-        INIT_LIST_HEAD(&imp->imp_chain);
-        imp->imp_max_transno = 0;
-        imp->imp_peer_committed_transno = 0;
+        imp = ptlbd->bd_import = class_new_import();
+        imp->imp_connection = ptlrpc_uuid_to_connection(&ptlbd->bd_server_uuid);
+        if (!imp->imp_connection) {
+                class_destroy_import(imp);
+                class_import_put(imp);
+                RETURN(-ENOENT);
+        }
         imp->imp_level = LUSTRE_CONN_FULL;
 
         ptlrpc_init_client(PTLBD_REQUEST_PORTAL, PTLBD_REPLY_PORTAL, 
                         "ptlbd", &ptlbd->bd_client);
         imp->imp_client = &ptlbd->bd_client;
-        imp->imp_obd = obddev;
-
+        imp->imp_obd = obd;
+        memcpy(imp->imp_target_uuid.uuid, data->ioc_inlbuf1, data->ioc_inllen1);
         ptlbd_blk_register(ptlbd);
 
         RETURN(0);
 }
 
-static int ptlbd_cl_cleanup(struct obd_device *obddev)
+static int ptlbd_cl_cleanup(struct obd_device *obd, int force, int failover)
 {
-        struct ptlbd_obd *ptlbd = &obddev->u.ptlbd;
+        struct ptlbd_obd *ptlbd = &obd->u.ptlbd;
+        struct obd_import *imp;
         ENTRY;
 
-        if (!ptlbd)
+        if ((!ptlbd) || (!(imp = ptlbd->bd_import)))
                 RETURN(-ENOENT);
 
-        if (!ptlbd->bd_import.imp_connection)
+        if (!imp->imp_connection)
                 RETURN(-ENOENT);
 
-        ptlrpc_cleanup_client(&ptlbd->bd_import);
-        ptlrpc_put_connection(ptlbd->bd_import.imp_connection);
+        ptlrpc_cleanup_client(imp);
+        ptlrpc_put_connection(imp->imp_connection);
+
+        class_destroy_import(imp);
+        class_import_put(imp);
 
         RETURN(0);
 }
 
-#if 0
-static int ptlbd_cl_connect(struct lustre_handle *conn, struct obd_device *obd,
-                        struct obd_uuid cluuid, struct recovd_obd *recovd,
-                        ptlrpc_recovery_cb_t recover)
+
+/* modelled after ptlrpc_import_connect() */
+int ptlbd_cl_connect(struct lustre_handle *conn,
+                      struct obd_device *obd, 
+                      struct obd_uuid *target_uuid)
 {
         struct ptlbd_obd *ptlbd = &obd->u.ptlbd;
-        struct obd_import *imp = &ptlbd->bd_import;
-        int rc;
+        struct obd_import *imp = ptlbd->bd_import;
+        struct obd_export *exp;
+        struct ptlrpc_request *request;
+        int     rc, size[] = {sizeof(imp->imp_target_uuid),
+                              sizeof(obd->obd_uuid),
+                              sizeof(*conn)};
+        char *tmp[] = {imp->imp_target_uuid.uuid, 
+                       obd->obd_uuid.uuid,
+                       (char*)conn};
         ENTRY;
 
-        rc = class_connect(conn, obd, cluuid);
+        if (!conn || !obd || !target_uuid)
+                RETURN(-EINVAL);
+
+        rc = class_connect(conn, obd, target_uuid);
         if (rc)
                 RETURN(rc);
 
-        INIT_LIST_HEAD(&imp->imp_chain);
-        imp->imp_max_transno = 0;
-        imp->imp_peer_committed_transno = 0;
+        request = ptlrpc_prep_req(imp, PTLBD_CONNECT, 3, size, tmp);
+        if (!request)
+                GOTO(out_disco, rc = -ENOMEM);
+        request->rq_level = LUSTRE_CONN_NEW;
+        request->rq_replen = lustre_msg_size(0, NULL);
+
+        imp->imp_dlm_handle = *conn;
+
+        imp->imp_level = LUSTRE_CONN_CON;
+        rc = ptlrpc_queue_wait(request);
+        if (rc)
+                GOTO(out_req, rc);
+
+        exp = class_conn2export(conn);
+        exp->exp_connection = ptlrpc_connection_addref(request->rq_connection);
+        class_export_put(exp);
+
         imp->imp_level = LUSTRE_CONN_FULL;
+        imp->imp_remote_handle = request->rq_repmsg->handle;
+        
+out_req:
+        ptlrpc_req_finished(request);
+out_disco:
+        if (rc)
+                class_disconnect(conn, 0);
+        RETURN(rc);
+}
 
-        RETURN(0);
+
+/* modelled after ptlrpc_import_disconnect() */
+int ptlbd_cl_disconnect(struct lustre_handle *conn, int failover)
+{
+        struct obd_device *obd = class_conn2obd(conn);
+        struct ptlbd_obd *ptlbd;
+        struct obd_import *imp;
+        struct ptlrpc_request *request;
+        int     rc, err;
+        ENTRY;
+
+        if (!obd)
+                RETURN(-EINVAL);
+
+        ptlbd = &obd->u.ptlbd;
+        imp = ptlbd->bd_import;
+
+        request = ptlrpc_prep_req(imp, PTLBD_DISCONNECT, 0, NULL, NULL);
+        if (!request)
+                GOTO(out_req, rc = -ENOMEM);
+
+        request->rq_replen = lustre_msg_size(0, NULL);
+        request->rq_level = LUSTRE_CONN_RECOVD;
+
+        rc = ptlrpc_queue_wait(request);
+
+out_req:
+        if (request)
+                ptlrpc_req_finished(request);
+        err = class_disconnect(conn, 0);
+        memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle));
+        if (!rc && err)
+                rc = err;
+        RETURN(rc);
 }
-#endif
+
 
 static struct obd_ops ptlbd_cl_obd_ops = {
         o_owner:        THIS_MODULE,
         o_setup:        ptlbd_cl_setup,
         o_cleanup:      ptlbd_cl_cleanup,
-#if 0
         o_connect:      ptlbd_cl_connect,
-        o_disconnect:   class_disconnect
-#endif
+        o_disconnect:   ptlbd_cl_disconnect,
 };
 
 int ptlbd_cl_init(void)
@@ -144,3 +205,28 @@ void ptlbd_cl_exit(void)
 {
         class_unregister_type(OBD_PTLBD_CL_DEVICENAME);
 }
+
+
+
+int ptlbd_do_connect(struct ptlbd_obd *ptlbd)
+{
+        int     rc;
+        struct obd_device       *obd = ptlbd->bd_import->imp_obd;
+        ENTRY;
+
+        memset(&ptlbd->bd_connect_handle, 0, sizeof(ptlbd->bd_connect_handle));
+        rc = obd_connect(&ptlbd->bd_connect_handle, obd, 
+                         &ptlbd->bd_server_uuid);
+        RETURN(rc);
+}
+
+
+int ptlbd_do_disconnect(struct ptlbd_obd *ptlbd)
+{
+        int     rc;
+        ENTRY;
+
+        rc = obd_disconnect(&ptlbd->bd_connect_handle, 0);
+        RETURN(rc);
+}
+
index d3e5083..9829900 100644 (file)
 #include <linux/lprocfs_status.h>
 #include <linux/obd_ptlbd.h>
 
-#define RSP_OK       0
-#define RSP_NOTOK   -1
-#define RQ_OK        0
-
-int ptlbd_send_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, 
-                struct request *blkreq)
+int ptlbd_send_rw_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, 
+                   struct buffer_head *first_bh)
 {
-        struct buffer_head *first_bh = blkreq->bh;
-        struct obd_import *imp = &ptlbd->bd_import;
+        struct obd_import *imp = ptlbd->bd_import;
         struct ptlbd_op *op;
         struct ptlbd_niob *niob, *niobs;
         struct ptlbd_rsp *rsp;
@@ -49,12 +44,11 @@ int ptlbd_send_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd,
         struct buffer_head *bh;
         unsigned int page_count;
         int rc, rep_size, size[2];
-        __u32 xid;
         ENTRY;
 
         LASSERT(cmd == PTLBD_READ || cmd == PTLBD_WRITE);
 
-        for ( page_count = 0, bh = first_bh ; bh ; bh = bh->b_next )
+        for ( page_count = 0, bh = first_bh ; bh ; bh = bh->b_reqnext )
                 page_count++;
 
         size[0] = sizeof(struct ptlbd_op);
@@ -62,10 +56,10 @@ int ptlbd_send_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd,
 
         req = ptlrpc_prep_req(imp, cmd, 2, size, NULL);
         if (!req)
-                RETURN(-ENOMEM);
+                RETURN(rc = 1);                  /* need to return error cnt */
 
-        op = lustre_msg_buf(req->rq_reqmsg, 0);
-        niobs = lustre_msg_buf(req->rq_reqmsg, 1);
+        op = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*op));
+        niobs = lustre_msg_buf(req->rq_reqmsg, 1, size[1]);
 
         /* XXX pack */
         op->op_cmd = cmd;
@@ -74,38 +68,26 @@ int ptlbd_send_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd,
         op->op__padding = 0;
         op->op_block_cnt = page_count;
 
-        desc = ptlrpc_prep_bulk(imp->imp_connection);
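+        /* Bulk direction, seen from this client: a READ is a bulk PUT from
+         * the server into our pages (we are the sink); a WRITE is a bulk GET
+         * by the server from our pages (we are the source). */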
+        if (cmd == PTLBD_READ) 
+                desc = ptlrpc_prep_bulk_imp (req, BULK_PUT_SINK, PTLBD_BULK_PORTAL);
+        else
+                desc = ptlrpc_prep_bulk_imp (req, BULK_GET_SOURCE, PTLBD_BULK_PORTAL);
         if ( desc == NULL )
-                GOTO(out_req, rc = -ENOMEM);
-        desc->bd_portal = PTLBD_BULK_PORTAL;
-        desc->bd_ptl_ev_hdlr = NULL;
-
-        xid = ptlrpc_next_xid();
-
-        for ( niob = niobs, bh = first_bh ; bh ; bh = bh->b_next, niob++ ) {
-                struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc);
-                if (bulk == NULL)
-                        GOTO(out_req, rc = -ENOMEM);
+                GOTO(out, rc = 1);              /* need to return error cnt */
+        /* NB req now owns desc, and frees it when she frees herself */
+        
+        for ( niob = niobs, bh = first_bh ; bh ; bh = bh->b_reqnext, niob++ ) {
+                rc = ptlrpc_prep_bulk_page(desc, bh->b_page,
+                                           bh_offset (bh) & (PAGE_SIZE - 1),
+                                           bh->b_size);
+                if (rc != 0)
+                        GOTO(out, rc = 1);      /* need to return error cnt */
 
-                niob->n_xid = xid;
                 niob->n_block_nr = bh->b_blocknr;
                 niob->n_offset = bh_offset(bh);
                 niob->n_length = bh->b_size;
-
-                bulk->bp_xid = xid;
-                bulk->bp_buf = bh->b_data;
-                bulk->bp_page = bh->b_page;
-                bulk->bp_buflen = bh->b_size;
         }
 
-        if ( cmd == PTLBD_READ )
-                rc = ptlrpc_register_bulk_put(desc);
-        else
-                rc = ptlrpc_register_bulk_get(desc);
-
-        if (rc)
-                GOTO(out_desc, rc);
-
         rep_size = sizeof(struct ptlbd_rsp);
         req->rq_replen = lustre_msg_size(1, &rep_size);
 
@@ -113,38 +95,77 @@ int ptlbd_send_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd,
         req->rq_level = imp->imp_level;
         rc = ptlrpc_queue_wait(req);
 
-        if ( rc != 0 ) {
-                blkreq->errors++;
-                GOTO(out_desc, rc);
+        if ( rc != 0 )
+                GOTO(out, rc = 1);              /* need to return error count */
+
+        rsp = lustre_swab_repbuf(req, 0, sizeof (*rsp),
+                                 lustre_swab_ptlbd_rsp);
+        if (rsp == NULL) {
+                CERROR ("can't unpack response\n");
+                GOTO (out, rc = 1);             /* need to return error count */
         }
-        rsp = lustre_msg_buf(req->rq_repmsg, 0);
-        if (rsp->r_status != RSP_OK) {
-                blkreq->errors += rsp->r_error_cnt;
+        else if (rsp->r_status != 0) {
+                rc = rsp->r_error_cnt;
         }
 
-out_desc:
-        ptlrpc_bulk_decref(desc);
-out_req:
+out:
         ptlrpc_req_finished(req);
         RETURN(rc);
 }
 
-static int ptlbd_bulk_timeout(void *data)
+
+int ptlbd_send_flush_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd)
 {
-/*        struct ptlrpc_bulk_desc *desc = data;*/
+        struct obd_import *imp = ptlbd->bd_import;
+        struct ptlbd_op *op;
+        struct ptlbd_rsp *rsp;
+        struct ptlrpc_request *req;
+        int rc, rep_size, size[1];
         ENTRY;
 
-        CERROR("ugh, timed out\n");
+        LASSERT(cmd == PTLBD_FLUSH);
+
+        size[0] = sizeof(struct ptlbd_op);
+
+        req = ptlrpc_prep_req(imp, cmd, 1, size, NULL);
+        if (!req)
+                RETURN(-ENOMEM); 
+
+        op = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*op));
+
+        /* XXX pack */
+        op->op_cmd = cmd;
+        op->op_lun = 0;
+        op->op_niob_cnt = 0;
+        op->op__padding = 0;
+        op->op_block_cnt = 0;
+
+        rep_size = sizeof(struct ptlbd_rsp);
+        req->rq_replen = lustre_msg_size(1, &rep_size);
+
+        /* XXX find out how we're really supposed to manage levels */
+        req->rq_level = imp->imp_level;
 
-        RETURN(1);
+        rc = ptlrpc_queue_wait(req);
+        if ( rc != 0 )
+                GOTO(out_req, rc = 1);
+        rsp = lustre_swab_repbuf(req, 0, sizeof (*rsp),
+                                 lustre_swab_ptlbd_rsp);
+        if (rsp == NULL) {
+                CERROR ("can't unpack response\n");
+                GOTO(out_req, rc = 1);
+        }
+        if (rsp->r_status != 0)
+                rc = rsp->r_status;
+
+out_req:
+        ptlrpc_req_finished(req);
+        RETURN(rc);
 }
 
+
 int ptlbd_do_filp(struct file *filp, int op, struct ptlbd_niob *niobs, 
                 int page_count, struct list_head *page_list)
 {
         mm_segment_t old_fs;
         struct list_head *pos;
-        int status = RSP_OK;
+        int status = 0;
         ENTRY;
 
         old_fs = get_fs();
@@ -155,118 +176,210 @@ int ptlbd_do_filp(struct file *filp, int op, struct ptlbd_niob *niobs,
                 struct page *page = list_entry(pos, struct page, list);
                 loff_t offset = (niobs->n_block_nr << PAGE_SHIFT) + 
                         niobs->n_offset;
-                if ( op == PTLBD_READ ) {
-                        if ((ret = filp->f_op->read(filp, page_address(page), 
-                             niobs->n_length, &offset)) != niobs->n_length)
-                                status = ret;
-                                goto out;             
-                } else {
-                        if ((ret = filp->f_op->write(filp, page_address(page), 
-                             niobs->n_length, &offset)) != niobs->n_length)
-                                status = ret;
-                                goto out;             
-                }               
-
+                if ( op == PTLBD_READ )
+                        ret = filp->f_op->read(filp, page_address(page), 
+                             niobs->n_length, &offset);
+                else 
+                        ret = filp->f_op->write(filp, page_address(page), 
+                             niobs->n_length, &offset);
+                if (ret != niobs->n_length) {
+                        status = ret;
+                        break;
+                }
                 niobs++;
         }
-out:
         set_fs(old_fs);
         RETURN(status);
 }
 
-int ptlbd_parse_req(struct ptlrpc_request *req)
+
+int ptlbd_srv_rw_req(ptlbd_cmd_t cmd, __u16 index, 
+                     struct ptlrpc_request *req, int swab)
 {
-        struct ptlbd_op *op;
         struct ptlbd_niob *niob, *niobs;
         struct ptlbd_rsp *rsp;
-        struct ptlrpc_bulk_desc *desc;
+        struct ptlrpc_bulk_desc *desc = NULL;
         struct file *filp = req->rq_obd->u.ptlbd.filp;
         struct l_wait_info lwi;
-        int size[1], wait_flag, i, page_count, rc, error_cnt = 0, 
-            status = RSP_OK;
+        int size[1], i, page_count, rc = 0, error_cnt = 0;
         struct list_head *pos, *n;
+        struct page *page;
         LIST_HEAD(tmp_pages);
         ENTRY;
 
-        rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen);
+        niobs = lustre_swab_reqbuf (req, 1, sizeof (*niobs),
+                                    lustre_swab_ptlbd_niob);
+        if (niobs == NULL)
+                GOTO (out, rc = -EFAULT);
+
+        size[0] = sizeof(struct ptlbd_rsp);
+        rc = lustre_pack_msg(1, size, NULL, &req->rq_replen, &req->rq_repmsg);
         if ( rc )
-                RETURN(rc);
+                GOTO(out, rc);
 
-        op = lustre_msg_buf(req->rq_reqmsg, 0);
-        LASSERT(op->op_cmd == PTLBD_READ || op->op_cmd == PTLBD_WRITE);
+        rsp = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*rsp));
+        if ( rsp == NULL )
+                GOTO (out, rc = -EFAULT);
 
-        niobs = lustre_msg_buf(req->rq_reqmsg, 1);
         page_count = req->rq_reqmsg->buflens[1] / sizeof(struct ptlbd_niob);
-
-        desc = ptlrpc_prep_bulk(req->rq_connection);
-        if (desc == NULL)
-                GOTO(out, rc = -ENOMEM);
-        desc->bd_ptl_ev_hdlr = NULL;
+        if (swab) {                             /* swab remaining niobs */
+                for (i = 1; i < page_count; i++)
+                        lustre_swab_ptlbd_niob(&niobs[i]);
+        }
+        if (req->rq_export == NULL) {
+                error_cnt++;
+                GOTO(out_reply, rc = -EFAULT);
+        }
+        
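+        /* Mirror of the client side: for a READ the server is the source of
+         * the bulk PUT; for a WRITE it is the sink of the bulk GET. */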
+        if (cmd == PTLBD_READ)
+                desc = ptlrpc_prep_bulk_exp (req, BULK_PUT_SOURCE, PTLBD_BULK_PORTAL);
+        else
+                desc = ptlrpc_prep_bulk_exp (req, BULK_GET_SINK, PTLBD_BULK_PORTAL);
+        if (desc == NULL) {
+                error_cnt++;
+                GOTO(out_reply, rc = -ENOMEM);
+        }
         desc->bd_portal = PTLBD_BULK_PORTAL;
+        LASSERT (page_count > 0);
 
         for ( i = 0, niob = niobs ; i < page_count; niob++, i++) {
-                struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc);
-                if (bulk == NULL)
-                        GOTO(out_bulk, rc = -ENOMEM);
-
-                bulk->bp_page = alloc_page(GFP_KERNEL);
-                if (bulk->bp_page == NULL)
-                        GOTO(out_bulk, rc = -ENOMEM);
-                list_add(&bulk->bp_page->list, &tmp_pages);
-
-                bulk->bp_xid = niob->n_xid;
-                bulk->bp_buf = page_address(bulk->bp_page);
-                bulk->bp_buflen = niob->n_length;
+                page = alloc_page(GFP_KERNEL);
+                if (page == NULL) {
+                        error_cnt++;
+                        GOTO(out_reply, rc = -ENOMEM);
+                }
+                list_add_tail(&page->list, &tmp_pages);
+
+                rc = ptlrpc_prep_bulk_page(desc, page,
+                                           niob->n_offset & (PAGE_SIZE - 1),
+                                           niob->n_length);
+                if (rc != 0) {
+                        error_cnt++;
+                        GOTO(out_reply, rc);
+                }
         }
 
-        if ( op->op_cmd == PTLBD_READ ) {
-                if ((status = ptlbd_do_filp(filp, PTLBD_READ, niobs, 
-                                          page_count, &tmp_pages)) < 0) {
+        if ( cmd == PTLBD_READ ) {
+                if ((rc = ptlbd_do_filp(filp, PTLBD_READ, niobs, 
+                                        page_count, &tmp_pages)) < 0) {
                         error_cnt++;
+                        GOTO(out_reply, rc);
                 }
                 rc = ptlrpc_bulk_put(desc);
-                wait_flag = PTL_BULK_FL_SENT;
         } else {
                 rc = ptlrpc_bulk_get(desc);
-                wait_flag = PTL_BULK_FL_RCVD;
         }
 
-        if ( rc )
-                GOTO(out_bulk, rc);
-
-        /* this synchronization probably isn't good enough */
-        lwi = LWI_TIMEOUT(obd_timeout * HZ, ptlbd_bulk_timeout, desc);
-        rc = l_wait_event(desc->bd_waitq, desc->bd_flags & wait_flag, &lwi);
-
-        size[0] = sizeof(struct ptlbd_rsp);
-        rc = lustre_pack_msg(1, size, NULL, &req->rq_replen, &req->rq_repmsg);
-        if ( rc )
-                GOTO(out, rc);
+        if ( rc ) {
+                error_cnt++;
+                GOTO(out_reply, rc);
+        }
 
-        rsp = lustre_msg_buf(req->rq_repmsg, 0);
-        if ( rsp == NULL )
-                GOTO(out, rc = -EINVAL);
+        lwi = LWI_TIMEOUT(obd_timeout * HZ, NULL, desc);
+        rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete(desc), &lwi);
+        if (rc != 0) {
+                LASSERT(rc == -ETIMEDOUT);
+                ptlrpc_abort_bulk(desc);
+                error_cnt++;
+                GOTO(out_reply, rc);
+        }
         
-        if ( op->op_cmd == PTLBD_WRITE ) {
-                if ((status = ptlbd_do_filp(filp, PTLBD_WRITE, niobs, 
+        if ( cmd == PTLBD_WRITE ) {
+                if ((rc = ptlbd_do_filp(filp, PTLBD_WRITE, niobs, 
                                            page_count, &tmp_pages)) < 0) {
                         error_cnt++;
                 }
         }
 
+out_reply:
         rsp->r_error_cnt = error_cnt;
-        rsp->r_status = status;                         /* I/O status */
-        req->rq_status = RQ_OK ; /* XXX */              /* ptlbd req status */
+        rsp->r_status = rc;  
+        req->rq_status = rc; 
 
-        ptlrpc_reply(req->rq_svc, req);
+        ptlrpc_reply(req);
 
-out_bulk:
         list_for_each_safe(pos, n, &tmp_pages) {
                 struct page *page = list_entry(pos, struct page, list);
                 list_del(&page->list);
                 __free_page(page);
         }
-        ptlrpc_bulk_decref(desc);
+        if (desc)
+                ptlrpc_free_bulk(desc);
 out:
         RETURN(rc);
 }
+
+
+int ptlbd_srv_flush_req(ptlbd_cmd_t cmd, __u16 index, 
+                        struct ptlrpc_request *req)
+{
+        struct ptlbd_rsp *rsp;
+        struct file *filp = req->rq_obd->u.ptlbd.filp;
+        int size[1], rc, status;
+        ENTRY;
+
+        size[0] = sizeof(struct ptlbd_rsp);
+        rc = lustre_pack_msg(1, size, NULL, &req->rq_replen, &req->rq_repmsg);
+        if ( rc )
+                RETURN(rc);
+
+        rsp = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*rsp));
+        if ( rsp == NULL )
+                RETURN(-EINVAL);
+
+        /* need an open file with an fsync method to flush */
+        if (!(filp && filp->f_op && filp->f_op->fsync && filp->f_dentry))
+                GOTO(out_reply, status = -EINVAL);
+
+        status = filp->f_op->fsync(filp, filp->f_dentry, 1);
+
+out_reply:
+        rsp->r_error_cnt = 0;
+        rsp->r_status = status;
+        req->rq_status = 0;
+
+        ptlrpc_reply(req);
+        RETURN(0);
+}
+
+
+int ptlbd_handle(struct ptlrpc_request *req)
+{
+        struct ptlbd_op *op;
+        int swab;
+        int rc;
+        ENTRY;
+
+        swab = lustre_msg_swabbed (req->rq_reqmsg);
+
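+        /* CONNECT and DISCONNECT carry no ptlbd_op; hand them to the generic
+         * target handlers and reply directly.  All other opcodes carry a
+         * ptlbd_op in buffer 0, swabbed below if needed. */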
+        if (req->rq_reqmsg->opc == PTLBD_CONNECT) {
+                rc = target_handle_connect(req, ptlbd_handle);
+                target_send_reply(req, rc, OBD_FAIL_PTLRPC);
+                RETURN(0);
+        }
+        if (req->rq_reqmsg->opc == PTLBD_DISCONNECT) {
+                rc = target_handle_disconnect(req);
+                target_send_reply(req, rc, OBD_FAIL_PTLRPC);
+                RETURN(0);
+        }
+        op = lustre_swab_reqbuf (req, 0, sizeof (*op),
+                                 lustre_swab_ptlbd_op);
+        if (op == NULL)
+                RETURN(-EFAULT);
+
+        switch (op->op_cmd) {
+                case PTLBD_READ:
+                case PTLBD_WRITE:
+                        rc = ptlbd_srv_rw_req(op->op_cmd, op->op_lun, req, 
+                                              swab);
+                        break;
+
+                case PTLBD_FLUSH:
+                        rc = ptlbd_srv_flush_req(op->op_cmd, op->op_lun, req);
+                        break;
+                default:
+                        rc = -EINVAL;
+        }
+
+        RETURN(rc);
+}
index e4a7046..34ec737 100644 (file)
@@ -52,7 +52,7 @@ static int ptlbd_sv_setup(struct obd_device *obddev, obd_count len, void *buf)
                 ptlrpc_init_svc(PTLBD_NEVENTS, PTLBD_NBUFS, PTLBD_BUFSIZE,
                                 PTLBD_MAXREQSIZE, PTLBD_REQUEST_PORTAL,
                                 PTLBD_REPLY_PORTAL,
-                                ptlbd_parse_req, "ptlbd_sv");
+                                ptlbd_handle, "ptlbd_sv", obddev);
 
         if (ptlbd->ptlbd_service == NULL) 
                 GOTO(out_filp, rc = -ENOMEM);
@@ -74,7 +74,7 @@ out_filp:
         RETURN(rc);
 }
 
-static int ptlbd_sv_cleanup(struct obd_device *obddev)
+static int ptlbd_sv_cleanup(struct obd_device *obddev, int force, int failover)
 {
         struct ptlbd_obd *ptlbd = &obddev->u.ptlbd;
         ENTRY;
@@ -94,6 +94,8 @@ static struct obd_ops ptlbd_sv_obd_ops = {
         o_owner:        THIS_MODULE,
         o_setup:        ptlbd_sv_setup,
         o_cleanup:      ptlbd_sv_cleanup,
+        o_connect:      class_connect,
+        o_disconnect:   class_disconnect,
 };
 
 int ptlbd_sv_init(void)
index 446f110..eb44329 100644 (file)
@@ -7,13 +7,16 @@ DEFS=
 
 if LIBLUSTRE
 lib_LIBRARIES = libptlrpc.a
-libptlrpc_a_SOURCES = client.c niobuf.c pack_generic.c recovd.c recover.c connection.c rpc.c events.c  # lproc_ptlrpc.c service.c
+libptlrpc_a_SOURCES = client.c niobuf.c pack_generic.c recover.c connection.c \
+ptlrpc_module.c events.c ptlrpc_lib.c
 else
 MODULE = ptlrpc
 modulefs_DATA = ptlrpc.o
 EXTRA_PROGRAMS = ptlrpc
 
-ptlrpc_SOURCES = recovd.c recover.c connection.c rpc.c events.c service.c client.c niobuf.c pack_generic.c lproc_ptlrpc.c
+ptlrpc_SOURCES = recover.c connection.c ptlrpc_module.c events.c service.c \
+client.c niobuf.c pack_generic.c lproc_ptlrpc.c pinger.c ptlrpc_lib.c \
+ptlrpc_internal.h
 endif
 
 include $(top_srcdir)/Rules
index 998c462..94a068d 100644 (file)
@@ -33,6 +33,8 @@
 #include <linux/lustre_ha.h>
 #include <linux/lustre_import.h>
 
+#include "ptlrpc_internal.h"
+
 void ptlrpc_init_client(int req_portal, int rep_portal, char *name,
                         struct ptlrpc_client *cl)
 {
@@ -70,7 +72,8 @@ struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid)
         return c;
 }
 
-void ptlrpc_readdress_connection(struct ptlrpc_connection *conn,struct obd_uuid *uuid)
+void ptlrpc_readdress_connection(struct ptlrpc_connection *conn,
+                                 struct obd_uuid *uuid)
 {
         struct ptlrpc_peer peer;
         int err;
@@ -85,69 +88,123 @@ void ptlrpc_readdress_connection(struct ptlrpc_connection *conn,struct obd_uuid
         return;
 }
 
-struct ptlrpc_bulk_desc *ptlrpc_prep_bulk(struct ptlrpc_connection *conn)
+static inline struct ptlrpc_bulk_desc *new_bulk(void)
 {
         struct ptlrpc_bulk_desc *desc;
 
         OBD_ALLOC(desc, sizeof(*desc));
-        if (desc != NULL) {
-                desc->bd_connection = ptlrpc_connection_addref(conn);
-                atomic_set(&desc->bd_refcount, 1);
-                init_waitqueue_head(&desc->bd_waitq);
-                INIT_LIST_HEAD(&desc->bd_page_list);
-                INIT_LIST_HEAD(&desc->bd_set_chain);
-                ptl_set_inv_handle(&desc->bd_md_h);
-                ptl_set_inv_handle(&desc->bd_me_h);
-        }
+        if (!desc)
+                return NULL;
+
+        spin_lock_init (&desc->bd_lock);
+        init_waitqueue_head(&desc->bd_waitq);
+        INIT_LIST_HEAD(&desc->bd_page_list);
+        desc->bd_md_h = PTL_HANDLE_NONE;
+        desc->bd_me_h = PTL_HANDLE_NONE;
 
         return desc;
 }
 
-int ptlrpc_bulk_error(struct ptlrpc_bulk_desc *desc)
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req,
+                                               int type, int portal)
 {
-        int rc = 0;
-        if (desc->bd_flags & PTL_RPC_FL_TIMEOUT) {
-                rc = (desc->bd_flags & PTL_RPC_FL_INTR ? -ERESTARTSYS :
-                      -ETIMEDOUT);
-        }
-        return rc;
+        struct obd_import       *imp = req->rq_import;
+        unsigned long            flags;
+        struct ptlrpc_bulk_desc *desc;
+
+        LASSERT (type == BULK_PUT_SINK || type == BULK_GET_SOURCE);
+        
+        desc = new_bulk();
+        if (desc == NULL)
+                RETURN(NULL);
+        
+        /* Is this sampled at the right place?  Do we want to get the import
+         * generation just before we send?  Should it match the generation of
+         * the request? */
+        spin_lock_irqsave(&imp->imp_lock, flags);
+        desc->bd_import_generation = imp->imp_generation;
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+        desc->bd_import = class_import_get(imp);
+        desc->bd_req = req;
+        desc->bd_type = type;
+        desc->bd_portal = portal;
+
+        /* This makes req own desc, and free it when she frees herself */
+        req->rq_bulk = desc;
+
+        return desc;
 }
 
-struct ptlrpc_bulk_page *ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc)
+struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp (struct ptlrpc_request *req,
+                                               int type, int portal)
+{
+        struct obd_export       *exp = req->rq_export;
+        struct ptlrpc_bulk_desc *desc;
+
+        LASSERT (type == BULK_PUT_SOURCE || type == BULK_GET_SINK);
+        
+        desc = new_bulk();
+        if (desc == NULL)
+                RETURN(NULL);
+
+        desc->bd_export = class_export_get(exp);
+        desc->bd_req = req;
+        desc->bd_type = type;
+        desc->bd_portal = portal;
+
+        /* NB we don't assign rq_bulk here; server-side requests are
+         * re-used, and the handler frees the bulk desc explicitly. */
+
+        return desc;
+}
+
+int ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
+                          struct page *page, int pageoffset, int len)
 {
         struct ptlrpc_bulk_page *bulk;
 
         OBD_ALLOC(bulk, sizeof(*bulk));
-        if (bulk != NULL) {
-                bulk->bp_desc = desc;
-                list_add_tail(&bulk->bp_link, &desc->bd_page_list);
-                desc->bd_page_count++;
-        }
-        return bulk;
+        if (bulk == NULL)
+                return (-ENOMEM);
+
+        LASSERT (page != NULL);
+        LASSERT (pageoffset >= 0);
+        LASSERT (len > 0);
+        LASSERT (pageoffset + len <= PAGE_SIZE);
+
+        bulk->bp_page = page;
+        bulk->bp_pageoffset = pageoffset;
+        bulk->bp_buflen = len;
+
+        bulk->bp_desc = desc;
+        list_add_tail(&bulk->bp_link, &desc->bd_page_list);
+        desc->bd_page_count++;
+        return 0;
 }
 
 void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
 {
         struct list_head *tmp, *next;
         ENTRY;
-        if (desc == NULL) {
-                EXIT;
-                return;
-        }
-
-        LASSERT(list_empty(&desc->bd_set_chain));
-
-        if (atomic_read(&desc->bd_refcount) != 0)
-                CERROR("freeing desc %p with refcount %d!\n", desc,
-                       atomic_read(&desc->bd_refcount));
 
+        LASSERT (desc != NULL);
+        LASSERT (desc->bd_page_count != 0x5a5a5a5a); /* not freed already */
+        LASSERT (!desc->bd_network_rw);         /* network hands off or */
+        
         list_for_each_safe(tmp, next, &desc->bd_page_list) {
                 struct ptlrpc_bulk_page *bulk;
                 bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
                 ptlrpc_free_bulk_page(bulk);
         }
 
-        ptlrpc_put_connection(desc->bd_connection);
+        LASSERT (desc->bd_page_count == 0);
+        LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL));
+
+        if (desc->bd_export)
+                class_export_put(desc->bd_export);
+        else
+                class_import_put(desc->bd_import);
 
         OBD_FREE(desc, sizeof(*desc));
         EXIT;
@@ -155,168 +212,666 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
 
 void ptlrpc_free_bulk_page(struct ptlrpc_bulk_page *bulk)
 {
-        ENTRY;
-        if (bulk == NULL) {
-                EXIT;
-                return;
-        }
-
+        LASSERT (bulk != NULL);
+        
         list_del(&bulk->bp_link);
         bulk->bp_desc->bd_page_count--;
         OBD_FREE(bulk, sizeof(*bulk));
-        EXIT;
 }
 
-static int ll_sync_brw_timeout(void *data)
+struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode,
+                                       int count, int *lengths, char **bufs)
 {
-        struct obd_brw_set *set = data;
-        struct list_head *tmp;
-        int failed = 0;
+        struct ptlrpc_request *request;
+        int rc;
         ENTRY;
 
-        LASSERT(set);
+        LASSERT((unsigned long)imp > 0x1000);
 
-        set->brw_flags |= PTL_RPC_FL_TIMEOUT;
+        OBD_ALLOC(request, sizeof(*request));
+        if (!request) {
+                CERROR("request allocation out of memory\n");
+                RETURN(NULL);
+        }
 
-        list_for_each(tmp, &set->brw_desc_head) {
-                struct ptlrpc_bulk_desc *desc =
-                        list_entry(tmp, struct ptlrpc_bulk_desc, bd_set_chain);
+        rc = lustre_pack_msg(count, lengths, bufs,
+                             &request->rq_reqlen, &request->rq_reqmsg);
+        if (rc) {
+                CERROR("cannot pack request %d\n", rc);
+                OBD_FREE(request, sizeof(*request));
+                RETURN(NULL);
+        }
 
-                /* Skip descriptors that were completed successfully. */
-                if (desc->bd_flags & (PTL_BULK_FL_RCVD | PTL_BULK_FL_SENT))
-                        continue;
+        request->rq_timeout = obd_timeout;
+        request->rq_level = LUSTRE_CONN_FULL;
+        request->rq_type = PTL_RPC_MSG_REQUEST;
+        request->rq_import = class_import_get(imp);
+        request->rq_phase = RQ_PHASE_NEW;
+        
+        /* XXX FIXME bug 249 */
+        request->rq_request_portal = imp->imp_client->cli_request_portal;
+        request->rq_reply_portal = imp->imp_client->cli_reply_portal;
 
-                LASSERT(desc->bd_connection);
-
-                /* If PtlMDUnlink succeeds, then bulk I/O on the MD hasn't
-                 * even started yet.  XXX where do we kunmup the thing?
-                 *
-                 * If it fail with PTL_MD_BUSY, then the network is still
-                 * reading/writing the buffers and we must wait for it to
-                 * complete (which it will within finite time, most
-                 * probably with failure; we really need portals error
-                 * events to detect that).
-                 *
-                 * Otherwise (PTL_INV_MD) it completed after the bd_flags
-                 * test above!
-                 */
-                if (PtlMDUnlink(desc->bd_md_h) != PTL_OK) {
-                        CERROR("Near-miss on OST %s -- need to adjust "
-                               "obd_timeout?\n",
-                               desc->bd_connection->c_remote_uuid.uuid);
-                        continue;
-                }
+        request->rq_connection = ptlrpc_connection_addref(imp->imp_connection);
 
-                CERROR("IO of %d pages to/from %s:%d (conn %p) timed out\n",
-                       desc->bd_page_count,
-                       desc->bd_connection->c_remote_uuid.uuid,
-                       desc->bd_portal, desc->bd_connection);
+        spin_lock_init (&request->rq_lock);
+        INIT_LIST_HEAD(&request->rq_list);
+        init_waitqueue_head(&request->rq_wait_for_rep);
+        request->rq_xid = ptlrpc_next_xid();
+        atomic_set(&request->rq_refcount, 1);
 
-                /* This one will "never" arrive, don't wait for it. */
-                if (atomic_dec_and_test(&set->brw_refcount))
-                        wake_up(&set->brw_waitq);
+        request->rq_reqmsg->opc = opcode;
+        request->rq_reqmsg->flags = 0;
 
-                if (class_signal_connection_failure)
-                        class_signal_connection_failure(desc->bd_connection);
-                else
-                        failed = 1;
+        RETURN(request);
+}
+
+struct ptlrpc_request_set *ptlrpc_prep_set(void)
+{
+        struct ptlrpc_request_set *set;
+
+        OBD_ALLOC(set, sizeof *set);
+        if (!set)
+                RETURN(NULL);
+        INIT_LIST_HEAD(&set->set_requests);
+        init_waitqueue_head(&set->set_waitq);
+        set->set_remaining = 0;
+
+        RETURN(set);
+}
+
+/* Finish with this set; opposite of prep_set. */
+void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
+{
+        struct list_head *tmp;
+        struct list_head *next;
+        int               expected_phase;
+        int               n = 0;
+        ENTRY;
+
+        /* Requests on the set should either all be completed, or all be new */
+        expected_phase = (set->set_remaining == 0) ? 
+                         RQ_PHASE_COMPLETE : RQ_PHASE_NEW;
+        list_for_each (tmp, &set->set_requests) {
+                struct ptlrpc_request *req =
+                        list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+
+                LASSERT (req->rq_phase == expected_phase);
+                n++;
+        }
+        
+        LASSERT (set->set_remaining == 0 || set->set_remaining == n);
+        
+        list_for_each_safe(tmp, next, &set->set_requests) {
+                struct ptlrpc_request *req =
+                        list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+                list_del_init(&req->rq_set_chain);
+
+                LASSERT (req->rq_phase == expected_phase);
+
+                if (req->rq_phase == RQ_PHASE_NEW) {
+                        
+                        if (req->rq_interpret_reply != NULL) {
+                                int (*interpreter)(struct ptlrpc_request *, void *, int) =
+                                        req->rq_interpret_reply;
+                                
+                                /* higher level (i.e. LOV) failed; 
+                                 * let the sub reqs clean up */
+                                req->rq_status = -EBADR;
+                                interpreter(req, &req->rq_async_args, req->rq_status);
+                        }
+                        set->set_remaining--;
+                }
+
+                req->rq_set = NULL;
+                ptlrpc_req_finished (req);
         }
 
-        /* 0 = We go back to sleep, until we're resumed or interrupted */
-        /* 1 = We can't be recovered, just abort the syscall with -ETIMEDOUT */
-        RETURN(failed);
+        LASSERT(set->set_remaining == 0);
+
+        OBD_FREE(set, sizeof(*set));
+        EXIT;
 }
 
-static int ll_sync_brw_intr(void *data)
+void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
+                        struct ptlrpc_request *req)
 {
-        struct obd_brw_set *set = data;
+        /* The set takes over the caller's request reference */
+        list_add_tail(&req->rq_set_chain, &set->set_requests);
+        req->rq_set = set;
+        set->set_remaining++;
+}
 
+static int ptlrpc_check_reply(struct ptlrpc_request *req)
+{
+        unsigned long flags;
+        int rc = 0;
         ENTRY;
-        set->brw_flags |= PTL_RPC_FL_INTR;
-        RETURN(1); /* ignored, as of this writing */
+
+        /* serialise with network callback */
+        spin_lock_irqsave (&req->rq_lock, flags);
+
+        if (req->rq_replied) {
+                DEBUG_REQ(D_NET, req, "REPLIED:");
+                GOTO(out, rc = 1);
+        }
+
+        if (req->rq_err) {
+                DEBUG_REQ(D_ERROR, req, "ABORTED:");
+                GOTO(out, rc = 1);
+        }
+
+        if (req->rq_resend) {
+                DEBUG_REQ(D_ERROR, req, "RESEND:");
+                GOTO(out, rc = 1);
+        }
+
+        if (req->rq_restart) {
+                DEBUG_REQ(D_ERROR, req, "RESTART:");
+                GOTO(out, rc = 1);
+        }
+        EXIT;
+ out:
+        spin_unlock_irqrestore (&req->rq_lock, flags);
+        DEBUG_REQ(D_NET, req, "rc = %d for", rc);
+        return rc;
 }
 
-int ll_brw_sync_wait(struct obd_brw_set *set, int phase)
+static int ptlrpc_check_status(struct ptlrpc_request *req)
 {
-        struct l_wait_info lwi;
-        struct list_head *tmp, *next;
-        int rc = 0;
+        int err;
+        ENTRY;
+
+        err = req->rq_repmsg->status;
+        if (req->rq_repmsg->type == PTL_RPC_MSG_ERR) {
+                DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR (%d)", err);
+                if (err >= 0)
+                        CERROR("Error Reply has >= zero status\n");
+                RETURN(err < 0 ? err : -EINVAL);
+        }
+
+        if (err < 0) {
+                DEBUG_REQ(D_INFO, req, "status is %d", err);
+        } else if (err > 0) {
+                /* XXX: translate this error from net to host */
+                DEBUG_REQ(D_INFO, req, "status is %d", err);
+        }
+
+        RETURN(err);
+}
+
+#warning this needs to change after robert fixes eviction handling
+static int 
+after_reply(struct ptlrpc_request *req, int *restartp)
+{
+        unsigned long flags;
+        struct obd_import *imp = req->rq_import;
+        int rc;
         ENTRY;
 
-        obd_brw_set_addref(set);
-        switch(phase) {
-        case CB_PHASE_START:
-                lwi = LWI_TIMEOUT_INTR(obd_timeout * HZ, ll_sync_brw_timeout,
-                                       ll_sync_brw_intr, set);
-                rc = l_wait_event(set->brw_waitq,
-                                  atomic_read(&set->brw_desc_count) == 0, &lwi);
-
-                list_for_each_safe(tmp, next, &set->brw_desc_head) {
-                        struct ptlrpc_bulk_desc *desc =
-                                list_entry(tmp, struct ptlrpc_bulk_desc,
-                                           bd_set_chain);
-                        list_del_init(&desc->bd_set_chain);
-                        ptlrpc_bulk_decref(desc);
+        LASSERT (!req->rq_receiving_reply);
+        LASSERT (req->rq_replied);
+
+        if (restartp != NULL)
+                *restartp = 0;
+        
+        /* NB Until this point, the whole of the incoming message,
+         * including buflens, status etc is in the sender's byte order. */
+
+#if SWAB_PARANOIA
+        /* Clear reply swab mask; this is a new reply in sender's byte order */
+        req->rq_rep_swab_mask = 0;
+#endif
+        rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen);
+        if (rc) {
+                CERROR("unpack_rep failed: %d\n", rc);
+                RETURN (-EPROTO);
+        }
+
+        if (req->rq_repmsg->type != PTL_RPC_MSG_REPLY &&
+            req->rq_repmsg->type != PTL_RPC_MSG_ERR) {
+                CERROR("invalid packet type received (type=%u)\n",
+                       req->rq_repmsg->type);
+                RETURN (-EPROTO);
+        }
+
+        /* Store transno in reqmsg for replay. */
+        req->rq_reqmsg->transno = req->rq_transno = req->rq_repmsg->transno;
+
+        rc = ptlrpc_check_status(req);
+
+        /* Either we've been evicted, or the server has failed for
+         * some reason. Try to reconnect, and if that fails, punt to
+         * upcall */
+        if (rc == -ENOTCONN) {
+                if (req->rq_level < LUSTRE_CONN_FULL || req->rq_no_recov ||
+                    imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) {
+                        RETURN(-ENOTCONN);
                 }
-                break;
-        case CB_PHASE_FINISH:
-                if (atomic_dec_and_test(&set->brw_desc_count))
-                        wake_up(&set->brw_waitq);
-                break;
-        default:
+
+                rc = ptlrpc_request_handle_eviction(req);
+                if (rc)
+                        CERROR("can't reconnect to %s@%s: %d\n", 
+                               imp->imp_target_uuid.uuid,
+                               imp->imp_connection->c_remote_uuid.uuid, rc);
+                else
+                        ptlrpc_wake_delayed(imp);
+
+                if (req->rq_err)
+                        RETURN(-EIO);
+
+                if (req->rq_resend) {
+                        if (restartp == NULL)
+                                LBUG(); /* async resend not supported yet */
+                        spin_lock_irqsave (&req->rq_lock, flags);
+                        req->rq_resend = 0;
+                        spin_unlock_irqrestore (&req->rq_lock, flags);
+                        *restartp = 1;
+                        lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
+                        DEBUG_REQ(D_HA, req, "resending: ");
+                        RETURN (0);
+                }
+
+                CERROR("request should be err or resend: %p\n", req);
                 LBUG();
         }
-        obd_brw_set_decref(set);
 
+        if (req->rq_import->imp_replayable) {
+                spin_lock_irqsave(&imp->imp_lock, flags);
+                if ((req->rq_replay || req->rq_transno != 0) && rc >= 0)
+                        ptlrpc_retain_replayable_request(req, imp);
+
+                if (req->rq_transno > imp->imp_max_transno)
+                        imp->imp_max_transno = req->rq_transno;
+
+                /* Replay-enabled imports return commit-status information. */
+                if (req->rq_repmsg->last_committed) {
+                        if (req->rq_repmsg->last_committed < 
+                            imp->imp_peer_committed_transno) {
+                                CERROR("%s went back in time (transno "LPD64
+                                       " was committed, server claims "LPD64
+                                       ")! is shared storage not coherent?\n",
+                                       imp->imp_target_uuid.uuid,
+                                       imp->imp_peer_committed_transno,
+                                       req->rq_repmsg->last_committed);
+                        }
+                        imp->imp_peer_committed_transno =
+                                req->rq_repmsg->last_committed;
+                }
+                ptlrpc_free_committed(imp);
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
+        }
+        
         RETURN(rc);
 }
 
-struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode,
-                                       int count, int *lengths, char **bufs)
+static int check_set(struct ptlrpc_request_set *set)
 {
-        struct ptlrpc_connection *conn;
-        struct ptlrpc_request *request;
-        int rc;
+        unsigned long flags;
+        struct list_head *tmp;
+        ENTRY;
+
+        if (set->set_remaining == 0)
+                RETURN(1);
+
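+        /* Walk every request in the set and advance its state machine:
+         * RPC (send/resend, wait for reply) -> BULK (wait for bulk I/O) ->
+         * INTERPRET (run the completion callback) -> COMPLETE. */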
+        list_for_each(tmp, &set->set_requests) {
+                struct ptlrpc_request *req =
+                        list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+                struct obd_import *imp = req->rq_import;
+                int rc = 0;
+
+                LASSERT (req->rq_phase == RQ_PHASE_RPC ||
+                         req->rq_phase == RQ_PHASE_BULK ||
+                         req->rq_phase == RQ_PHASE_COMPLETE);
+
+                if (req->rq_phase == RQ_PHASE_COMPLETE)
+                        continue;
+
+                if (req->rq_err) {
+                        ptlrpc_unregister_reply(req);
+                        if (req->rq_status == 0)
+                                req->rq_status = -EIO;
+                        req->rq_phase = RQ_PHASE_INTERPRET;
+                
+                        spin_lock_irqsave(&imp->imp_lock, flags);
+                        list_del_init(&req->rq_list);
+                        spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+                        GOTO (interpret, req->rq_status);
+                } 
+                
+                if (req->rq_intr) {
+                        /* NB could be on delayed list */
+                        ptlrpc_unregister_reply(req);
+                        req->rq_status = -EINTR;
+                        req->rq_phase = RQ_PHASE_INTERPRET;
+                
+                        spin_lock_irqsave(&imp->imp_lock, flags);
+                        list_del_init(&req->rq_list);
+                        spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+                        GOTO (interpret, req->rq_status);
+                }
+                
+                if (req->rq_phase == RQ_PHASE_RPC) {
+                        int do_restart = 0;
+                        if (req->rq_waiting || req->rq_resend) {
+                                spin_lock_irqsave(&imp->imp_lock, flags);
+                                
+                                if (req->rq_level > imp->imp_level) {
+                                        spin_unlock_irqrestore(&imp->imp_lock,
+                                                               flags);
+                                        continue;
+                                }
+                                
+                                list_del(&req->rq_list);
+                                list_add_tail(&req->rq_list,
+                                              &imp->imp_sending_list);
+                                spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+                                req->rq_waiting = 0;
+                                if (req->rq_resend) {
+                                        lustre_msg_add_flags(req->rq_reqmsg,
+                                                             MSG_RESENT);
+                                        spin_lock_irqsave(&req->rq_lock, flags);
+                                        req->rq_resend = 0;
+                                        spin_unlock_irqrestore(&req->rq_lock,
+                                                               flags);
+                                        ptlrpc_unregister_reply(req);
+                                        if (req->rq_bulk) 
+                                                ptlrpc_unregister_bulk(req);
+                               }
+                                
+                                rc = ptl_send_rpc(req);
+                                if (rc) {
+                                        req->rq_status = rc;
+                                        req->rq_phase = RQ_PHASE_INTERPRET;
+                                        GOTO (interpret, req->rq_status);
+                                }
+                                
+                        }
+                
+                        /* Ensure the network callback returned */
+                        spin_lock_irqsave (&req->rq_lock, flags);
+                        if (!req->rq_replied) {
+                                spin_unlock_irqrestore (&req->rq_lock, flags);
+                                continue;
+                        }
+                        spin_unlock_irqrestore (&req->rq_lock, flags);
+                
+                        spin_lock_irqsave(&imp->imp_lock, flags);
+                        list_del_init(&req->rq_list);
+                        spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+                        req->rq_status = after_reply(req, &do_restart);
+                        if (do_restart) {
+                                req->rq_resend = 1; /* ugh */
+                                continue;
+                        }
+                        
+                        if (req->rq_bulk == NULL) {
+                                req->rq_phase = RQ_PHASE_INTERPRET;
+                                GOTO (interpret, req->rq_status);
+                        }
+
+                        req->rq_phase = RQ_PHASE_BULK;
+                }
+
+                LASSERT (req->rq_phase == RQ_PHASE_BULK);
+                if (!ptlrpc_bulk_complete (req->rq_bulk))
+                        continue;
+                
+                req->rq_phase = RQ_PHASE_INTERPRET;
+                
+        interpret:
+                LASSERT (req->rq_phase == RQ_PHASE_INTERPRET);
+                LASSERT (!req->rq_receiving_reply);
+
+                if (req->rq_bulk != NULL)
+                        ptlrpc_unregister_bulk (req);
+                
+                if (req->rq_interpret_reply != NULL) {
+                        int (*interpreter)(struct ptlrpc_request *, void *, int) =
+                                req->rq_interpret_reply;
+                        req->rq_status = interpreter(req, &req->rq_async_args, 
+                                                     req->rq_status);
+                }
+
+                CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:ni:nid:"
+                       "opc %s:%s:%d:"LPU64":%s:"LPX64":%d\n", current->comm,
+                       imp->imp_obd->obd_uuid.uuid, req->rq_reqmsg->status,
+                       req->rq_xid,
+                       imp->imp_connection->c_peer.peer_ni->pni_name,
+                       imp->imp_connection->c_peer.peer_nid,
+                       req->rq_reqmsg->opc);
+
+                req->rq_phase = RQ_PHASE_COMPLETE;
+                set->set_remaining--;
+        }
+
+        RETURN (set->set_remaining == 0);
+}
+
+static int expire_one_request(struct ptlrpc_request *req)
+{
+        unsigned long      flags;
+        struct obd_import *imp = req->rq_import;
+        ENTRY;
+
+        DEBUG_REQ(D_ERROR, req, "timeout");
+
+        spin_lock_irqsave (&req->rq_lock, flags);
+        req->rq_timedout = 1;
+        spin_unlock_irqrestore (&req->rq_lock, flags);
+
+        ptlrpc_unregister_reply (req);
+
+        if (imp == NULL) {
+                DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?");
+                RETURN(1);
+        }
+
+        /* The DLM server doesn't want recovery run on its imports. */
+        if (imp->imp_dlm_fake)
+                RETURN(1);
+
+        /* If this request is for recovery or other primordial tasks,
+         * don't go back to sleep, and don't start recovery again. */
+        if (req->rq_level < LUSTRE_CONN_FULL || req->rq_no_recov ||
+            imp->imp_obd->obd_no_recov)
+                RETURN(1);
+
+        ptlrpc_fail_import(imp, req->rq_import_generation);
+
+        RETURN(0);
+}
+
+static int expired_set(void *data)
+{
+        struct ptlrpc_request_set *set = data;
+        struct list_head          *tmp;
+        time_t                     now = LTIME_S (CURRENT_TIME);
         ENTRY;
 
-        LASSERT((unsigned long)imp > 0x1000);
-        conn = imp->imp_connection;
+        LASSERT (set != NULL);
+        CERROR("EXPIRED SET %p\n", set);
+
+        /* A timeout expired; see which reqs it applies to... */
+        list_for_each (tmp, &set->set_requests) {
+                struct ptlrpc_request *req =
+                        list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+
+                /* request in-flight? */
+                if (!((req->rq_phase == RQ_PHASE_RPC && !req->rq_waiting) ||
+                      (req->rq_phase == RQ_PHASE_BULK)))
+                        continue;
+                
+                if (req->rq_timedout ||           /* already dealt with */
+                    req->rq_sent + req->rq_timeout > now) /* not expired */
+                        continue;
+
+                /* deal with this guy */
+                expire_one_request (req);
+        }
+
+        /* When waiting for a whole set, we always break out of the
+         * sleep so we can recalculate the timeout, or enable interrupts
+         * iff everyone's timed out.
+         */
+        RETURN(1);
+}
+
+static void interrupted_set(void *data)
+{
+        struct ptlrpc_request_set *set = data;
+        struct list_head *tmp;
+        unsigned long flags;
+
+        LASSERT (set != NULL);
+        CERROR("INTERRUPTED SET %p\n", set);
+
+        list_for_each(tmp, &set->set_requests) {
+                struct ptlrpc_request *req =
+                        list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+
+                if (req->rq_phase != RQ_PHASE_RPC)
+                        continue;
+                
+                spin_lock_irqsave (&req->rq_lock, flags);
+                req->rq_intr = 1;
+                spin_unlock_irqrestore (&req->rq_lock, flags);
+        }
+}
+
+int ptlrpc_set_wait(struct ptlrpc_request_set *set)
+{
+        struct list_head      *tmp;
+        struct obd_import     *imp;
+        struct ptlrpc_request *req;
+        struct l_wait_info     lwi;
+        unsigned long          flags;
+        int                    rc;
+        time_t                 now;
+        time_t                 deadline;
+        int                    timeout;
+        ENTRY;
+
+        list_for_each(tmp, &set->set_requests) {
+                req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+
+                LASSERT (req->rq_level == LUSTRE_CONN_FULL);
+                LASSERT (req->rq_phase == RQ_PHASE_NEW);
+                req->rq_phase = RQ_PHASE_RPC;
+                
+                imp = req->rq_import;
+                spin_lock_irqsave(&imp->imp_lock, flags);
+
+                if (imp->imp_invalid) {
+                        spin_unlock_irqrestore(&imp->imp_lock, flags);
+                        req->rq_status = -EIO;
+                        req->rq_phase = RQ_PHASE_INTERPRET;
+                        continue;
+                }
 
-        OBD_ALLOC(request, sizeof(*request));
-        if (!request) {
-                CERROR("request allocation out of memory\n");
-                RETURN(NULL);
-        }
+                if (req->rq_level > imp->imp_level) {
+                        if (req->rq_no_recov || imp->imp_obd->obd_no_recov ||
+                            imp->imp_dlm_fake) {
+                                spin_unlock_irqrestore(&imp->imp_lock, flags);
+                                req->rq_status = -EWOULDBLOCK;
+                                req->rq_phase = RQ_PHASE_INTERPRET;
+                                continue;
+                        }
+
+                        spin_lock (&req->rq_lock);
+                        req->rq_waiting = 1;
+                        spin_unlock (&req->rq_lock);
+                        LASSERT (list_empty (&req->rq_list));
+                        // list_del(&req->rq_list);
+                        list_add_tail(&req->rq_list, &imp->imp_delayed_list);
+                        spin_unlock_irqrestore(&imp->imp_lock, flags);
+                        continue;
+                }
 
-        rc = lustre_pack_msg(count, lengths, bufs,
-                             &request->rq_reqlen, &request->rq_reqmsg);
-        if (rc) {
-                CERROR("cannot pack request %d\n", rc);
-                OBD_FREE(request, sizeof(*request));
-                RETURN(NULL);
-        }
+                /* XXX this is the same as ptlrpc_queue_wait */
+                LASSERT(list_empty(&req->rq_list));
+                list_add_tail(&req->rq_list, &imp->imp_sending_list);
+                req->rq_import_generation = imp->imp_generation;
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
 
-        request->rq_timeout = obd_timeout;
-        request->rq_level = LUSTRE_CONN_FULL;
-        request->rq_type = PTL_RPC_MSG_REQUEST;
-        request->rq_import = imp;
+                CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:ni:nid:opc"
+                       " %s:%s:%d:"LPU64":%s:"LPX64":%d\n", current->comm,
+                       imp->imp_obd->obd_uuid.uuid, req->rq_reqmsg->status,
+                       req->rq_xid,
+                       imp->imp_connection->c_peer.peer_ni->pni_name,
+                       imp->imp_connection->c_peer.peer_nid,
+                       req->rq_reqmsg->opc);
 
-        /* XXX FIXME bug 625069, now 249 */
-        request->rq_request_portal = imp->imp_client->cli_request_portal;
-        request->rq_reply_portal = imp->imp_client->cli_reply_portal;
+                rc = ptl_send_rpc(req);
+                if (rc) {
+                        req->rq_status = rc;
+                        req->rq_phase = RQ_PHASE_INTERPRET;
+                }
+        }
 
-        request->rq_connection = ptlrpc_connection_addref(conn);
+        do {
+                now = LTIME_S (CURRENT_TIME);
+                timeout = 0;
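+                /* find the nearest deadline among in-flight requests, so the
+                 * sleep below wakes in time for expired_set() to handle it */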
+                list_for_each (tmp, &set->set_requests) {
+                        req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
 
-        INIT_LIST_HEAD(&request->rq_list);
-        atomic_set(&request->rq_refcount, 1);
+                        /* request in-flight? */
+                        if (!((req->rq_phase == RQ_PHASE_RPC &&
+                               !req->rq_waiting) ||
+                              (req->rq_phase == RQ_PHASE_BULK)))
+                                continue;
 
-        request->rq_reqmsg->magic = PTLRPC_MSG_MAGIC;
-        request->rq_reqmsg->version = PTLRPC_MSG_VERSION;
-        request->rq_reqmsg->opc = HTON__u32(opcode);
-        request->rq_reqmsg->flags = 0;
+                        if (req->rq_timedout)   /* already timed out */
+                                continue;
+                        
+                        deadline = req->rq_sent + req->rq_timeout;
+                        if (deadline <= now)    /* actually expired already */
+                                timeout = 1;    /* ASAP */
+                        else if (timeout == 0 || timeout > deadline - now)
+                                timeout = deadline - now;
+                }
 
-        ptlrpc_hdl2req(request, &imp->imp_handle);
-        RETURN(request);
+                /* wait until all complete, interrupted, or an in-flight
+                 * req times out */
+                CDEBUG(D_HA, "set %p going to sleep for %d seconds\n",
+                       set, timeout);
+                lwi = LWI_TIMEOUT_INTR(timeout * HZ, 
+                                       expired_set, interrupted_set, set);
+                rc = l_wait_event(set->set_waitq, check_set(set), &lwi);
+                
+                LASSERT (rc == 0 || rc == -EINTR || rc == -ETIMEDOUT);
+
+                /* -EINTR => all requests have been flagged rq_intr so next
+                 * check completes.
+                 * -ETIMEDOUT => someone timed out.  When all reqs have
+                 * timed out, signals are enabled allowing completion with
+                 * EINTR.
+                 * I don't really care if we go once more round the loop in
+                 * the error cases -eeb. */
+        } while (rc != 0);
+
+        LASSERT (set->set_remaining == 0);
+
+        rc = 0;
+        list_for_each(tmp, &set->set_requests) {
+                req = list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+
+                LASSERT (req->rq_phase == RQ_PHASE_COMPLETE);
+                if (req->rq_status != 0)
+                        rc = req->rq_status;
+        }
+        
+        if (set->set_interpret != NULL) {
+                int (*interpreter)(struct ptlrpc_request_set *set, void *, int) =
+                        set->set_interpret;
+                rc = interpreter (set, &set->set_args, rc);
+        }
+        
+        RETURN(rc);
 }
 
 static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
@@ -327,9 +882,11 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
                 return;
         }
 
+        LASSERT (!request->rq_receiving_reply);
+        
         /* We must take it off the imp_replay_list first.  Otherwise, we'll set
          * request->rq_reqmsg to NULL while osc_close is dereferencing it. */
-        if (request->rq_import) {
+        if (request->rq_import != NULL) {
                 unsigned long flags = 0;
                 if (!locked)
                         spin_lock_irqsave(&request->rq_import->imp_lock, flags);
@@ -340,23 +897,29 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked)
         }
 
         if (atomic_read(&request->rq_refcount) != 0) {
-                CERROR("freeing request %p (%d->%s:%d) with refcount %d\n",
-                       request, request->rq_reqmsg->opc,
-                       request->rq_connection->c_remote_uuid.uuid,
-                       request->rq_import->imp_client->cli_request_portal,
-                       atomic_read (&request->rq_refcount));
+                DEBUG_REQ(D_ERROR, request,
+                          "freeing request with nonzero refcount");
                 LBUG();
         }
 
         if (request->rq_repmsg != NULL) {
                 OBD_FREE(request->rq_repmsg, request->rq_replen);
                 request->rq_repmsg = NULL;
-                request->rq_reply_md.start = NULL;
         }
         if (request->rq_reqmsg != NULL) {
                 OBD_FREE(request->rq_reqmsg, request->rq_reqlen);
                 request->rq_reqmsg = NULL;
         }
+        if (request->rq_export != NULL) {
+                class_export_put(request->rq_export);
+                request->rq_export = NULL;
+        }
+        if (request->rq_import != NULL) {
+                class_import_put(request->rq_import);
+                request->rq_import = NULL;
+        }
+        if (request->rq_bulk != NULL)
+                ptlrpc_free_bulk(request->rq_bulk);
 
         ptlrpc_put_connection(request->rq_connection);
         OBD_FREE(request, sizeof(*request));
@@ -396,81 +959,81 @@ void ptlrpc_req_finished(struct ptlrpc_request *request)
         __ptlrpc_req_finished(request, 0);
 }
 
-static int ptlrpc_check_reply(struct ptlrpc_request *req)
+static void ptlrpc_cleanup_request_buf(struct ptlrpc_request *request)
 {
-        int rc = 0;
-
-        ENTRY;
-        if (req->rq_repmsg != NULL) {
-                req->rq_transno = NTOH__u64(req->rq_repmsg->transno);
-                /* Store transno in reqmsg for replay. */
-                req->rq_reqmsg->transno = req->rq_repmsg->transno;
-                req->rq_flags |= PTL_RPC_FL_REPLIED;
-                GOTO(out, rc = 1);
-        }
-
-        if (req->rq_flags & PTL_RPC_FL_RESEND) {
-                DEBUG_REQ(D_ERROR, req, "RESEND:");
-                GOTO(out, rc = 1);
-        }
-
-        if (req->rq_flags & PTL_RPC_FL_ERR) {
-                ENTRY;
-                DEBUG_REQ(D_ERROR, req, "ABORTED:");
-                GOTO(out, rc = 1);
-        }
-
-        if (req->rq_flags & PTL_RPC_FL_RESTART) {
-                DEBUG_REQ(D_ERROR, req, "RESTART:");
-                GOTO(out, rc = 1);
-        }
-        EXIT;
- out:
-        DEBUG_REQ(D_NET, req, "rc = %d for", rc);
-        return rc;
+        OBD_FREE(request->rq_reqmsg, request->rq_reqlen);
+        request->rq_reqmsg = NULL;
+        request->rq_reqlen = 0;
 }
 
-static int ptlrpc_check_status(struct ptlrpc_request *req)
+/* Disengage the client's reply buffer from the network
+ * NB does _NOT_ unregister any client-side bulk. 
+ * IDEMPOTENT, but _not_ safe against concurrent callers.
+ * The request owner (i.e. the thread doing the I/O) must call...
+ */
+void ptlrpc_unregister_reply (struct ptlrpc_request *request)
 {
-        int err;
+        unsigned long flags;
+        int           rc;
         ENTRY;
 
-        err = req->rq_repmsg->status;
-        if (req->rq_repmsg->type == NTOH__u32(PTL_RPC_MSG_ERR)) {
-                DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR (%d)", err);
-                RETURN(err ? err : -EINVAL);
-        }
+        LASSERT (!in_interrupt ());             /* might sleep */
 
-        if (err < 0) {
-                DEBUG_REQ(D_INFO, req, "status is %d", err);
-        } else if (err > 0) {
-                /* XXX: translate this error from net to host */
-                DEBUG_REQ(D_INFO, req, "status is %d", err);
+        spin_lock_irqsave (&request->rq_lock, flags);
+        if (!request->rq_receiving_reply) {     /* not waiting for a reply */
+                spin_unlock_irqrestore (&request->rq_lock, flags);
+                EXIT;
+                /* NB reply buffer not freed here */
+                return;
         }
 
-        RETURN(err);
-}
-
-static void ptlrpc_cleanup_request_buf(struct ptlrpc_request *request)
-{
-        OBD_FREE(request->rq_reqmsg, request->rq_reqlen);
-        request->rq_reqmsg = NULL;
-        request->rq_reqlen = 0;
-}
-
-/* Abort this request and cleanup any resources associated with it. */
-int ptlrpc_abort(struct ptlrpc_request *request)
-{
-        /* First remove the ME for the reply; in theory, this means
-         * that we can tear down the buffer safely. */
-        if (PtlMEUnlink(request->rq_reply_me_h) != PTL_OK)
-                RETURN(0);
-        OBD_FREE(request->rq_reply_md.start, request->rq_replen);
+        LASSERT (!request->rq_replied);         /* callback hasn't completed */
+        spin_unlock_irqrestore (&request->rq_lock, flags);
+        
+        rc = PtlMDUnlink (request->rq_reply_md_h);
+        switch (rc) {
+        default:
+                LBUG ();
+
+        case PTL_OK:                            /* unlinked before completion */
+                LASSERT (request->rq_receiving_reply);
+                LASSERT (!request->rq_replied);
+                spin_lock_irqsave (&request->rq_lock, flags);
+                request->rq_receiving_reply = 0;
+                spin_unlock_irqrestore (&request->rq_lock, flags);
+                OBD_FREE(request->rq_repmsg, request->rq_replen);
+                request->rq_repmsg = NULL;
+                EXIT;
+                return;
+                
+        case PTL_MD_INUSE:                      /* callback in progress */
+                for (;;) {
+                        /* Network access will complete in finite time but
+                         * the timeout lets us CERROR for visibility */
+                        struct l_wait_info lwi = LWI_TIMEOUT (10 * HZ, NULL, NULL);
+                        
+                        rc = l_wait_event (request->rq_wait_for_rep,
+                                           request->rq_replied, &lwi);
+                        LASSERT (rc == 0 || rc == -ETIMEDOUT);
+                        if (rc == 0) {
+                                spin_lock_irqsave (&request->rq_lock, flags);
+                                /* Ensure the callback has completed scheduling me 
+                                 * and taken its hands off the request */
+                                spin_unlock_irqrestore (&request->rq_lock, flags);
+                                break;
+                        }
+                        
+                        CERROR ("Unexpectedly long timeout: req %p\n", request);
+                }
+                /* fall through */
 
-        memset(&request->rq_reply_me_h, 0, sizeof(request->rq_reply_me_h));
-        request->rq_reply_md.start = NULL;
-        request->rq_repmsg = NULL;
-        return 0;
+        case PTL_INV_MD:                        /* callback completed */
+                LASSERT (!request->rq_receiving_reply);
+                LASSERT (request->rq_replied);
+                EXIT;
+                return;
+        }
+        /* Not Reached */
 }
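A usage sketch (not from this change) of the idempotence promised in the header comment: a waiter that timed out can call this unconditionally, since it tears down the reply MD only if no reply arrived, and otherwise just takes rq_lock and returns.

        rc = l_wait_event(req->rq_wait_for_rep, ptlrpc_check_reply(req), &lwi);
        if (rc == -ETIMEDOUT)
                ptlrpc_unregister_reply(req);   /* safe whether or not the
                                                 * reply raced in */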
 
 /* caller must hold imp->imp_lock */
@@ -478,6 +1041,7 @@ void ptlrpc_free_committed(struct obd_import *imp)
 {
         struct list_head *tmp, *saved;
         struct ptlrpc_request *req;
+        struct ptlrpc_request *last_req = NULL; /* temporary fire escape */
         ENTRY;
 
         LASSERT(imp != NULL);
@@ -492,7 +1056,11 @@ void ptlrpc_free_committed(struct obd_import *imp)
         list_for_each_safe(tmp, saved, &imp->imp_replay_list) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
 
-                if (req->rq_flags & PTL_RPC_FL_REPLAY) {
+                /* XXX ok to remove when 1357 resolved - rread 05/29/03  */
+                LASSERT (req != last_req);
+                last_req = req;
+
+                if (req->rq_replay) {
                         DEBUG_REQ(D_HA, req, "keeping (FL_REPLAY)");
                         continue;
                 }
@@ -515,104 +1083,67 @@ void ptlrpc_free_committed(struct obd_import *imp)
 
 void ptlrpc_cleanup_client(struct obd_import *imp)
 {
-        struct list_head *tmp, *saved;
-        struct ptlrpc_request *req;
-        struct ptlrpc_connection *conn = imp->imp_connection;
-        unsigned long flags;
         ENTRY;
-
-        LASSERT(conn);
-
-        spin_lock_irqsave(&imp->imp_lock, flags);
-        list_for_each_safe(tmp, saved, &imp->imp_replay_list) {
-                req = list_entry(tmp, struct ptlrpc_request, rq_list);
-
-                /* XXX we should make sure that nobody's sleeping on these! */
-                DEBUG_REQ(D_HA, req, "cleaning up from sending list");
-                list_del_init(&req->rq_list);
-                req->rq_import = NULL;
-                __ptlrpc_req_finished(req, 0);
-        }
-        spin_unlock_irqrestore(&imp->imp_lock, flags);
-
         EXIT;
         return;
 }
 
-void ptlrpc_continue_req(struct ptlrpc_request *req)
-{
-        DEBUG_REQ(D_HA, req, "continuing delayed request");
-        req->rq_reqmsg->addr = req->rq_import->imp_handle.addr;
-        req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie;
-        wake_up(&req->rq_wait_for_rep);
-}
-
 void ptlrpc_resend_req(struct ptlrpc_request *req)
 {
+        unsigned long flags;
+        
         DEBUG_REQ(D_HA, req, "resending");
-        req->rq_reqmsg->addr = req->rq_import->imp_handle.addr;
-        req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie;
+        req->rq_reqmsg->handle.cookie = 0;
+        ptlrpc_put_connection(req->rq_connection);
+        req->rq_connection =
+                ptlrpc_connection_addref(req->rq_import->imp_connection);
         req->rq_status = -EAGAIN;
-        req->rq_level = LUSTRE_CONN_RECOVD;
-        req->rq_flags |= PTL_RPC_FL_RESEND;
-        req->rq_flags &= ~PTL_RPC_FL_TIMEOUT;
-        wake_up(&req->rq_wait_for_rep);
+
+        spin_lock_irqsave (&req->rq_lock, flags);
+        req->rq_resend = 1;
+        req->rq_timedout = 0;
+        if (req->rq_set != NULL)
+                wake_up (&req->rq_set->set_waitq);
+        else
+                wake_up(&req->rq_wait_for_rep);
+        spin_unlock_irqrestore (&req->rq_lock, flags);
 }
 
+/* XXX: this function and rq_status are currently unused */
 void ptlrpc_restart_req(struct ptlrpc_request *req)
 {
+        unsigned long flags;
+
         DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request");
         req->rq_status = -ERESTARTSYS;
-        req->rq_flags |= PTL_RPC_FL_RESTART;
-        req->rq_flags &= ~PTL_RPC_FL_TIMEOUT;
-        wake_up(&req->rq_wait_for_rep);
+
+        spin_lock_irqsave (&req->rq_lock, flags);
+        req->rq_restart = 1;
+        req->rq_timedout = 0;
+        if (req->rq_set != NULL)
+                wake_up (&req->rq_set->set_waitq);
+        else
+                wake_up(&req->rq_wait_for_rep);
+        spin_unlock_irqrestore (&req->rq_lock, flags);
 }
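ptlrpc_resend_req() and ptlrpc_restart_req() above (and ptlrpc_abort_inflight() further down) repeat the same wake-up rule: a request that belongs to a set is woken through the set's waitq, a lone request through its own. A hypothetical helper, not part of this change, capturing that rule:

        /* Wake whoever is waiting on this request; normally called with
         * rq_lock held, matching the call sites in this file. */
        static void ptlrpc_wake_client_req(struct ptlrpc_request *req)
        {
                if (req->rq_set != NULL)
                        wake_up(&req->rq_set->set_waitq);
                else
                        wake_up(&req->rq_wait_for_rep);
        }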
 
 static int expired_request(void *data)
 {
         struct ptlrpc_request *req = data;
-
         ENTRY;
-        if (!req) {
-                CERROR("NULL req!");
-                LBUG();
-                RETURN(0);
-        }
-
-        DEBUG_REQ(D_ERROR, req, "timeout");
-        ptlrpc_abort(req);
-        req->rq_flags |= PTL_RPC_FL_TIMEOUT;
-
-        if (!req->rq_import) {
-                DEBUG_REQ(D_HA, req, "NULL import; already cleaned up?");
-                RETURN(1);
-        }
-
-        if (!req->rq_import->imp_connection) {
-                DEBUG_REQ(D_ERROR, req, "NULL connection");
-                LBUG();
-                RETURN(0);
-        }
-
-        if (!req->rq_import->imp_connection->c_recovd_data.rd_recovd)
-                RETURN(1);
-
-        recovd_conn_fail(req->rq_import->imp_connection);
 
-        /* If this request is for recovery or other primordial tasks,
-         * don't go back to sleep.
-         */
-        if (req->rq_level < LUSTRE_CONN_FULL)
-                RETURN(1);
-        RETURN(0);
+        RETURN(expire_one_request(req));
 }
 
-static int interrupted_request(void *data)
+static void interrupted_request(void *data)
 {
+        unsigned long flags;
+        
         struct ptlrpc_request *req = data;
-        ENTRY;
-        req->rq_flags |= PTL_RPC_FL_INTR;
-        RETURN(1); /* ignored, as of this writing */
+        DEBUG_REQ(D_HA, req, "request interrupted");
+        spin_lock_irqsave (&req->rq_lock, flags);
+        req->rq_intr = 1;
+        spin_unlock_irqrestore (&req->rq_lock, flags);
 }
 
 struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req)
@@ -631,7 +1162,7 @@ void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
         LASSERT(spin_is_locked(&imp->imp_lock));
 #endif
 
-        LASSERT(imp->imp_flags & IMP_REPLAYABLE);
+        LASSERT(imp->imp_replayable);
         /* Balanced in ptlrpc_free_committed, usually. */
         ptlrpc_request_addref(req);
         list_for_each_prev(tmp, &imp->imp_replay_list) {
@@ -642,6 +1173,7 @@ void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
                  * open a file, or for closes retained if to match creating
                  * opens, so use req->rq_xid as a secondary key.
                  * (See bugs 684, 685, and 428.)
+                 * XXX no longer needed, but all opens need transnos!
                  */
                 if (iter->rq_transno > req->rq_transno)
                         continue;
@@ -662,196 +1194,228 @@ void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
 int ptlrpc_queue_wait(struct ptlrpc_request *req)
 {
         int rc = 0;
+        int brc;
         struct l_wait_info lwi;
         struct obd_import *imp = req->rq_import;
+        struct obd_device *obd = imp->imp_obd;
         struct ptlrpc_connection *conn = imp->imp_connection;
         unsigned int flags;
+        int do_restart = 0;
+        int timeout = 0;
         ENTRY;
 
-        init_waitqueue_head(&req->rq_wait_for_rep);
-
-        req->rq_xid = HTON__u32(ptlrpc_next_xid());
-
+        LASSERT (req->rq_set == NULL);
+        LASSERT (!req->rq_receiving_reply);
+        
         /* for distributed debugging */
-        req->rq_reqmsg->status = HTON__u32(current->pid);
-        CDEBUG(D_RPCTRACE, "Sending RPC pid:xid:nid:opc %d:"LPU64":%s:"LPX64
-               ":%d\n", NTOH__u32(req->rq_reqmsg->status), req->rq_xid,
+        req->rq_reqmsg->status = current->pid;
+        LASSERT(imp->imp_obd != NULL);
+        CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:ni:nid:opc "
+               "%s:%s:%d:"LPU64":%s:"LPX64":%d\n", current->comm,
+               imp->imp_obd->obd_uuid.uuid,
+               req->rq_reqmsg->status, req->rq_xid,
                conn->c_peer.peer_ni->pni_name, conn->c_peer.peer_nid,
-               NTOH__u32(req->rq_reqmsg->opc));
-
-        spin_lock_irqsave(&imp->imp_lock, flags);
+               req->rq_reqmsg->opc);
 
+        /* Mark phase here for a little debug help */
+        req->rq_phase = RQ_PHASE_RPC;
+        
+restart:
         /*
          * If the import has been invalidated (such as by an OST failure), the
-         * request must fail with -EIO.
+         * request must fail with -EIO.  Recovery requests are allowed to go
+         * through, though, so that they have a chance to revalidate the
+         * import.
          */
-        if (req->rq_import->imp_flags & IMP_INVALID) {
+        spin_lock_irqsave(&imp->imp_lock, flags);
+        if (req->rq_import->imp_invalid && req->rq_level == LUSTRE_CONN_FULL) {
                 DEBUG_REQ(D_ERROR, req, "IMP_INVALID:");
                 spin_unlock_irqrestore(&imp->imp_lock, flags);
-                RETURN(-EIO);
+                GOTO (out, rc = -EIO);
         }
 
         if (req->rq_level > imp->imp_level) {
                 list_del(&req->rq_list);
+                if (req->rq_no_recov || obd->obd_no_recov ||
+                    imp->imp_dlm_fake) {
+                        spin_unlock_irqrestore(&imp->imp_lock, flags);
+                        GOTO (out, rc = -EWOULDBLOCK);
+                }
+
                 list_add_tail(&req->rq_list, &imp->imp_delayed_list);
                 spin_unlock_irqrestore(&imp->imp_lock, flags);
 
-                DEBUG_REQ(D_HA, req, "\"%s\" waiting for recovery: (%d < %d)",
+                DEBUG_REQ(D_HA, req, "\"%s\" waiting for recovery: (%d > %d)",
                           current->comm, req->rq_level, imp->imp_level);
                 lwi = LWI_INTR(NULL, NULL);
                 rc = l_wait_event(req->rq_wait_for_rep,
-                                  (req->rq_level <= imp->imp_level) ||
-                                  (req->rq_flags & PTL_RPC_FL_ERR), &lwi);
-
-                if (req->rq_flags & PTL_RPC_FL_ERR)
-                        rc = -EIO;
-
-                if (!req->rq_import)
-                        RETURN(rc);
+                                  (req->rq_level <= imp->imp_level ||
+                                   req->rq_err),
+                                  &lwi);
+                DEBUG_REQ(D_HA, req, "\"%s\" awake: (%d > %d)",
+                          current->comm, req->rq_level, imp->imp_level);
 
                 spin_lock_irqsave(&imp->imp_lock, flags);
                 list_del_init(&req->rq_list);
 
+                if (req->rq_err)
+                        rc = -EIO;
+
                 if (rc) {
                         spin_unlock_irqrestore(&imp->imp_lock, flags);
-                        RETURN(rc);
+                        GOTO (out, rc);
                 }
-
+                
                 CERROR("process %d resumed\n", current->pid);
         }
- resend:
 
+        /* XXX this is the same as ptlrpc_set_wait */
         LASSERT(list_empty(&req->rq_list));
         list_add_tail(&req->rq_list, &imp->imp_sending_list);
+        req->rq_import_generation = imp->imp_generation;
         spin_unlock_irqrestore(&imp->imp_lock, flags);
+
         rc = ptl_send_rpc(req);
         if (rc) {
-                CDEBUG(D_HA, "error %d, opcode %d, need recovery\n", rc,
-                       req->rq_reqmsg->opc);
-                /* sleep for a jiffy, then trigger recovery */
-                lwi = LWI_TIMEOUT_INTR(1, expired_request,
-                                       interrupted_request, req);
+                /* The DLM's fake imports want to avoid all forms of
+                 * recovery. */
+                if (imp->imp_dlm_fake) {
+                        spin_lock_irqsave(&imp->imp_lock, flags);
+                        list_del_init(&req->rq_list);
+                        spin_unlock_irqrestore(&imp->imp_lock, flags);
+                        GOTO(out, rc);
+                }
+
+                DEBUG_REQ(D_ERROR, req, "send failed (%d); recovering", rc);
+                
+                ptlrpc_fail_import(imp, req->rq_import_generation);
+
+                /* If we've been told to not wait, we're done. */
+                if (req->rq_level < LUSTRE_CONN_FULL || req->rq_no_recov ||
+                    obd->obd_no_recov) {
+                        spin_lock_irqsave(&imp->imp_lock, flags);
+                        list_del_init(&req->rq_list);
+                        spin_unlock_irqrestore(&imp->imp_lock, flags);
+                        GOTO(out, rc);
+                }
+
+                /* If we errored, allow the user to interrupt immediately */
+                timeout = 1;
         } else {
+                timeout = req->rq_timeout * HZ;
                 DEBUG_REQ(D_NET, req, "-- sleeping");
-                lwi = LWI_TIMEOUT_INTR(req->rq_timeout * HZ, expired_request,
-                                       interrupted_request, req);
         }
 #ifdef __KERNEL__
+        lwi = LWI_TIMEOUT_INTR(timeout, expired_request, interrupted_request,
+                               req);
         l_wait_event(req->rq_wait_for_rep, ptlrpc_check_reply(req), &lwi);
-#else 
-        { 
+#else
+        {
                 extern int reply_in_callback(ptl_event_t *ev);
                 ptl_event_t reply_ev;
-                PtlEQWait(req->rq_connection->c_peer.peer_ni->pni_reply_in_eq_h, &reply_ev);
-                reply_in_callback(&reply_ev); 
+                PtlEQWait(req->rq_connection->c_peer.peer_ni->pni_reply_in_eq_h,
+                          &reply_ev);
+                reply_in_callback(&reply_ev);
+
+                LASSERT (reply_ev.mem_desc.user_ptr == (void *)req);
+                // ptlrpc_check_reply(req);
+                // not required now; it only tests the reply state
         }
-#endif 
+#endif
 
         DEBUG_REQ(D_NET, req, "-- done sleeping");
 
+        CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:ni:nid:opc "
+               "%s:%s:%d:"LPU64":%s:"LPX64":%d\n", current->comm,
+               imp->imp_obd->obd_uuid.uuid,
+               req->rq_reqmsg->status, req->rq_xid,
+               conn->c_peer.peer_ni->pni_name, conn->c_peer.peer_nid,
+               req->rq_reqmsg->opc);
+
         spin_lock_irqsave(&imp->imp_lock, flags);
         list_del_init(&req->rq_list);
         spin_unlock_irqrestore(&imp->imp_lock, flags);
 
-        if (req->rq_flags & PTL_RPC_FL_ERR) {
-                ptlrpc_abort(req);
+        /* If the reply was received normally, this just grabs the spinlock
+         * (ensuring the reply callback has returned), sees that
+         * req->rq_receiving_reply is clear and returns. */
+        ptlrpc_unregister_reply (req);
+        
+        if (req->rq_err)
                 GOTO(out, rc = -EIO);
-        }
 
-        /* Don't resend if we were interrupted. */
-        if ((req->rq_flags & (PTL_RPC_FL_RESEND | PTL_RPC_FL_INTR)) ==
-            PTL_RPC_FL_RESEND) {
-                if (req->rq_flags & PTL_RPC_FL_NO_RESEND) {
-                        ptlrpc_abort(req); /* clean up reply buffers */
-                        req->rq_flags &= ~PTL_RPC_FL_NO_RESEND;
+        /* Resend if we need to, unless we were interrupted. */
+        if (req->rq_resend && !req->rq_intr) {
+                /* ...unless we were specifically told otherwise. */
+                if (req->rq_no_resend) {
+                        spin_lock_irqsave (&req->rq_lock, flags);
+                        req->rq_no_resend = 0;
+                        spin_unlock_irqrestore (&req->rq_lock, flags);
                         GOTO(out, rc = -ETIMEDOUT);
                 }
-                req->rq_flags &= ~PTL_RPC_FL_RESEND;
+                spin_lock_irqsave (&req->rq_lock, flags);
+                req->rq_resend = 0;
+                spin_unlock_irqrestore (&req->rq_lock, flags);
                 lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT);
+
+                if (req->rq_bulk != NULL)
+                        ptlrpc_unregister_bulk (req);
+        
                 DEBUG_REQ(D_HA, req, "resending: ");
-                spin_lock_irqsave(&imp->imp_lock, flags);
-                goto resend;
+                goto restart;
         }
 
-        if (req->rq_flags & PTL_RPC_FL_INTR) {
-                if (!(req->rq_flags & PTL_RPC_FL_TIMEOUT))
-                        LBUG(); /* should only be interrupted if we timed out */
-                /* Clean up the dangling reply buffers */
-                ptlrpc_abort(req);
+        if (req->rq_intr) {
+                /* Should only be interrupted if we timed out. */
+                if (!req->rq_timedout)
+                        DEBUG_REQ(D_ERROR, req,
+                                  "rq_intr set but rq_timedout not");
                 GOTO(out, rc = -EINTR);
         }
 
-        if (req->rq_flags & PTL_RPC_FL_TIMEOUT)
+        if (req->rq_timedout) {                 /* non-recoverable timeout */
                 GOTO(out, rc = -ETIMEDOUT);
-
-        if (!(req->rq_flags & PTL_RPC_FL_REPLIED))
-                GOTO(out, rc = req->rq_status);
-
-        rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen);
-        if (rc) {
-                CERROR("unpack_rep failed: %d\n", rc);
-                GOTO(out, rc);
         }
-#if 0
-        /* FIXME: Enable when BlueArc makes new release */
-        if (req->rq_repmsg->type != PTL_RPC_MSG_REPLY &&
-            req->rq_repmsg->type != PTL_RPC_MSG_ERR) {
-                CERROR("invalid packet type received (type=%u)\n",
-                       req->rq_repmsg->type);
+        
+        if (!req->rq_replied) {
+                /* How can this be? -eeb */
+                DEBUG_REQ(D_ERROR, req, "!rq_replied: ");
                 LBUG();
-                GOTO(out, rc = -EINVAL);
+                GOTO(out, rc = req->rq_status);
         }
-#endif
-        DEBUG_REQ(D_NET, req, "status %d", req->rq_repmsg->status);
 
-        /* We're a rejected connection, need to invalidate and rebuild. */
-        if (req->rq_repmsg->status == -ENOTCONN) {
-                spin_lock_irqsave(&imp->imp_lock, flags);
-                /* If someone else is reconnecting us (CONN_RECOVD) or has
-                 * already completed it (handle mismatch), then we just need
-                 * to get out.
-                 */
-                if (imp->imp_level == LUSTRE_CONN_RECOVD ||
-                    imp->imp_handle.addr != req->rq_reqmsg->addr ||
-                    imp->imp_handle.cookie != req->rq_reqmsg->cookie) {
-                        spin_unlock_irqrestore(&imp->imp_lock, flags);
-                        GOTO(out, rc = -EIO);
-                }
-                imp->imp_level = LUSTRE_CONN_RECOVD;
-                spin_unlock_irqrestore(&imp->imp_lock, flags);
-                if (imp->imp_recover != NULL) {
-                        rc = imp->imp_recover(imp, PTLRPC_RECOVD_PHASE_NOTCONN);
-                        if (rc)
-                                LBUG();
-                }
-                GOTO(out, rc = -EIO);
+        rc = after_reply (req, &do_restart);
+        /* NB may return +ve success rc */
+        if (do_restart) {
+                if (req->rq_bulk != NULL)
+                        ptlrpc_unregister_bulk (req);
+                DEBUG_REQ(D_HA, req, "resending: ");
+                goto restart;
         }
 
-        rc = ptlrpc_check_status(req);
-
-        if (req->rq_import->imp_flags & IMP_REPLAYABLE) {
-                spin_lock_irqsave(&imp->imp_lock, flags);
-                if ((req->rq_flags & PTL_RPC_FL_REPLAY || req->rq_transno != 0)
-                    && rc >= 0) {
-                        ptlrpc_retain_replayable_request(req, imp);
-                }
-
-                if (req->rq_transno > imp->imp_max_transno) {
-                        imp->imp_max_transno = req->rq_transno;
+ out:
+        if (req->rq_bulk != NULL) {
+                if (rc >= 0) {                  /* success so far */
+                        lwi = LWI_TIMEOUT (timeout, NULL, NULL);
+                        brc = l_wait_event (req->rq_wait_for_rep, 
+                                            ptlrpc_bulk_complete (req->rq_bulk), &lwi);
+                        if (brc != 0) {
+                                LASSERT (brc == -ETIMEDOUT);
+                                CERROR ("Timed out waiting for bulk\n");
+                                rc = brc;
+                        }
                 }
-
-                /* Replay-enabled imports return commit-status information. */
-                if (req->rq_repmsg->last_committed) {
-                        imp->imp_peer_committed_transno =
-                                req->rq_repmsg->last_committed;
+                if (rc < 0) {
+                        /* MDS blocks for put ACKs before replying */
+                        /* OSC sets rq_no_resend for the time being */
+                        LASSERT (req->rq_no_resend);
+                        ptlrpc_unregister_bulk (req);
                 }
-                ptlrpc_free_committed(imp);
-                spin_unlock_irqrestore(&imp->imp_lock, flags);
         }
-
-        EXIT;
- out:
-        return rc;
+        
+        LASSERT (!req->rq_receiving_reply);
+        req->rq_phase = RQ_PHASE_INTERPRET;
+        RETURN (rc);
 }
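For orientation, a sketch of a typical synchronous caller of ptlrpc_queue_wait(); the request is assumed to have been packed and referenced elsewhere, and only names appearing in this hunk are used:

        req->rq_level = LUSTRE_CONN_FULL;       /* normal, non-recovery RPC */
        rc = ptlrpc_queue_wait(req);            /* send, sleep, handle resend */
        if (rc != 0)
                CERROR("request failed: %d\n", rc);
        ptlrpc_req_finished(req);               /* drop the caller's reference */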
 
 int ptlrpc_replay_req(struct ptlrpc_request *req)
@@ -861,15 +1425,22 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)
         struct l_wait_info lwi;
         ENTRY;
 
-        init_waitqueue_head(&req->rq_wait_for_rep);
-        DEBUG_REQ(D_NET, req, "");
+        /* I don't touch rq_phase here, so the debug log can show what
+         * state it was left in */
+        
+        /* Not handling automatic bulk replay yet (or ever?) */
+        LASSERT (req->rq_bulk == NULL);
+        
+        DEBUG_REQ(D_NET, req, "about to replay");
 
-        req->rq_reqmsg->addr = req->rq_import->imp_handle.addr;
-        req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie;
+        /* Update request's state, since we might have a new connection. */
+        ptlrpc_put_connection(req->rq_connection);
+        req->rq_connection =
+                ptlrpc_connection_addref(req->rq_import->imp_connection);
 
         /* temporarily set request to RECOVD level (reset at out:) */
         old_level = req->rq_level;
-        if (req->rq_flags & PTL_RPC_FL_REPLIED)
+        if (req->rq_replied)
                 old_status = req->rq_repmsg->status;
         req->rq_level = LUSTRE_CONN_RECOVD;
         rc = ptl_send_rpc(req);
@@ -887,18 +1458,40 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)
 
         // up(&cli->cli_rpc_sem);
 
-        if (!(req->rq_flags & PTL_RPC_FL_REPLIED)) {
+        /* If the reply was received normally, this just grabs the spinlock
+         * (ensuring the reply callback has returned), sees that
+         * req->rq_receiving_reply is clear and returns. */
+        ptlrpc_unregister_reply (req);
+
+        if (!req->rq_replied) {
                 CERROR("Unknown reason for wakeup\n");
                 /* XXX Phil - I end up here when I kill obdctl */
-                ptlrpc_abort(req);
+                /* ...that's because signals aren't all masked in
+                 * l_wait_event() -eeb */
                 GOTO(out, rc = -EINTR);
         }
 
+#if SWAB_PARANOIA
+        /* Clear reply swab mask; this is a new reply in sender's byte order */
+        req->rq_rep_swab_mask = 0;
+#endif
         rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen);
         if (rc) {
                 CERROR("unpack_rep failed: %d\n", rc);
-                GOTO(out, rc);
+                GOTO(out, rc = -EPROTO);
         }
+#if 0
+        /* FIXME: Enable when BlueArc makes new release */
+        if (req->rq_repmsg->type != PTL_RPC_MSG_REPLY &&
+            req->rq_repmsg->type != PTL_RPC_MSG_ERR) {
+                CERROR("invalid packet type received (type=%u)\n",
+                       req->rq_repmsg->type);
+                GOTO(out, rc = -EPROTO);
+        }
+#endif
+
+        /* The transno had better not change over replay. */
+        LASSERT(req->rq_reqmsg->transno == req->rq_repmsg->transno);
 
         CDEBUG(D_NET, "got rep "LPD64"\n", req->rq_xid);
 
@@ -906,8 +1499,7 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)
         if (req->rq_replay_cb)
                 req->rq_replay_cb(req);
 
-        if ((req->rq_flags & PTL_RPC_FL_REPLIED) &&
-            req->rq_repmsg->status != old_status) {
+        if (req->rq_replied && req->rq_repmsg->status != old_status) {
                 DEBUG_REQ(D_HA, req, "status %d, old was %d",
                           req->rq_repmsg->status, old_status);
         }
@@ -917,32 +1509,42 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)
         RETURN(rc);
 }
 
-/* XXX looks a lot like super.c:invalidate_request_list, don't it? */
-void ptlrpc_abort_inflight(struct obd_import *imp, int dying_import)
+void ptlrpc_abort_inflight(struct obd_import *imp)
 {
         unsigned long flags;
         struct list_head *tmp, *n;
         ENTRY;
 
         /* Make sure that no new requests get processed for this import.
-         * ptlrpc_queue_wait must (and does) hold imp_lock while testing this
-         * flag and then putting requests on sending_list or delayed_list.
+         * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing
+         * this flag and then putting requests on sending_list or delayed_list.
+         */
+        spin_lock_irqsave(&imp->imp_lock, flags);
+        if (!imp->imp_replayable)
+                /* on b_devel, I moved this line to
+                   ptlrpc_set_import_active because I thought it made
+                   more sense there and possibly not all callers of
+                   this function expect this. I'll leave it here until
+                   I can figure out if it's correct or not. - rread 5/12/03  */
+                imp->imp_invalid = 1;
+
+        /* XXX locking?  Maybe we should remove each request with the list
+         * locked?  Also, how do we know if the requests on the list are
+         * being freed at this time?
          */
-        if ((imp->imp_flags & IMP_REPLAYABLE) == 0) {
-                spin_lock_irqsave(&imp->imp_lock, flags);
-                imp->imp_flags |= IMP_INVALID;
-                spin_unlock_irqrestore(&imp->imp_lock, flags);
-        }
-
         list_for_each_safe(tmp, n, &imp->imp_sending_list) {
                 struct ptlrpc_request *req =
                         list_entry(tmp, struct ptlrpc_request, rq_list);
 
                 DEBUG_REQ(D_HA, req, "inflight");
-                req->rq_flags |= PTL_RPC_FL_ERR;
-                if (dying_import)
-                        req->rq_import = NULL;
-                wake_up(&req->rq_wait_for_rep);
+
+                spin_lock (&req->rq_lock);
+                req->rq_err = 1;
+                if (req->rq_set != NULL)
+                        wake_up(&req->rq_set->set_waitq);
+                else
+                        wake_up(&req->rq_wait_for_rep);
+                spin_unlock (&req->rq_lock);
         }
 
         list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
@@ -950,10 +1552,36 @@ void ptlrpc_abort_inflight(struct obd_import *imp, int dying_import)
                         list_entry(tmp, struct ptlrpc_request, rq_list);
 
                 DEBUG_REQ(D_HA, req, "aborting waiting req");
-                req->rq_flags |= PTL_RPC_FL_ERR;
-                if (dying_import)
-                        req->rq_import = NULL;
-                wake_up(&req->rq_wait_for_rep);
+
+                spin_lock (&req->rq_lock);
+                req->rq_err = 1;
+                if (req->rq_set != NULL)
+                        wake_up(&req->rq_set->set_waitq);
+                else
+                        wake_up(&req->rq_wait_for_rep);
+                spin_unlock (&req->rq_lock);
         }
+
+        /* Last chance to free reqs left on the replay list, but we
+         * will still leak reqs that haven't committed.  */
+        if (imp->imp_replayable)
+                ptlrpc_free_committed(imp);
+
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
+
         EXIT;
 }
+
+static __u64 ptlrpc_last_xid = 0;
+static spinlock_t ptlrpc_last_xid_lock = SPIN_LOCK_UNLOCKED;
+
+__u64 ptlrpc_next_xid(void)
+{
+        __u64 tmp;
+        spin_lock(&ptlrpc_last_xid_lock);
+        tmp = ++ptlrpc_last_xid;
+        spin_unlock(&ptlrpc_last_xid_lock);
+        return tmp;
+}
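The spinlock keeps the xid sequence unique and monotonic node-wide, which matters because the xid doubles as the match bits for bulk (see the LASSERTs on rq_xid in the event callbacks below). An illustrative use at request-build time, outside this hunk:

        req->rq_xid = ptlrpc_next_xid();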
+
+
index 8f2cc2d..6b7690b 100644 (file)
 #include <liblustre.h>
 #endif
 
+#include "ptlrpc_internal.h"
+
 static spinlock_t conn_lock;
 static struct list_head conn_list;
 static struct list_head conn_unused_list;
 
-/* If UUID is NULL, c->c_remote_uuid must be all zeroes
- * If UUID is non-NULL, c->c_remote_uuid must match. */
-static int match_connection_uuid(struct ptlrpc_connection *c,
-                                 struct obd_uuid *uuid)
+void ptlrpc_dump_connections(void)
 {
-        struct obd_uuid zero_uuid;
-        memset(&zero_uuid, 0, sizeof(zero_uuid));
-
-        if (uuid)
-                return memcmp(c->c_remote_uuid.uuid, uuid->uuid,
-                              sizeof(uuid->uuid));
+        struct list_head *tmp;
+        struct ptlrpc_connection *c;
+        ENTRY;
 
-        return memcmp(c->c_remote_uuid.uuid, &zero_uuid, sizeof(zero_uuid));
+        list_for_each(tmp, &conn_list) {
+                c = list_entry(tmp, struct ptlrpc_connection, c_link);
+                CERROR("Connection %p/%s has refcount %d (nid="LPX64" on %s)\n",
+                       c, c->c_remote_uuid.uuid, atomic_read(&c->c_refcount),
+                       c->c_peer.peer_nid, c->c_peer.peer_ni->pni_name);
+        }
+        EXIT;
 }
 
 struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer,
@@ -55,15 +57,22 @@ struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer,
         struct ptlrpc_connection *c;
         ENTRY;
 
+
         CDEBUG(D_INFO, "peer is "LPX64" on %s\n",
                peer->peer_nid, peer->peer_ni->pni_name);
 
         spin_lock(&conn_lock);
+        if (list_empty(&conn_list)) {
+                if (!ptlrpc_get_ldlm_hooks()) {
+                        spin_unlock(&conn_lock);
+                        RETURN(NULL);
+                }
+        }
+
         list_for_each(tmp, &conn_list) {
                 c = list_entry(tmp, struct ptlrpc_connection, c_link);
                 if (peer->peer_nid == c->c_peer.peer_nid &&
-                    peer->peer_ni == c->c_peer.peer_ni &&
-                    !match_connection_uuid(c, uuid)) {
+                    peer->peer_ni == c->c_peer.peer_ni) {
                         ptlrpc_connection_addref(c);
                         GOTO(out, c);
                 }
@@ -72,8 +81,7 @@ struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer,
         list_for_each_safe(tmp, pos, &conn_unused_list) {
                 c = list_entry(tmp, struct ptlrpc_connection, c_link);
                 if (peer->peer_nid == c->c_peer.peer_nid &&
-                    peer->peer_ni == c->c_peer.peer_ni &&
-                    !match_connection_uuid(c, uuid)) {
+                    peer->peer_ni == c->c_peer.peer_ni) {
                         ptlrpc_connection_addref(c);
                         list_del(&c->c_link);
                         list_add(&c->c_link, &conn_list);
@@ -91,13 +99,8 @@ struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer,
         c->c_epoch = 1;
         c->c_bootcount = 0;
         c->c_flags = 0;
-        if (uuid->uuid)
+        if (uuid && uuid->uuid)                         /* XXX ???? */
                 obd_str2uuid(&c->c_remote_uuid, uuid->uuid);
-        INIT_LIST_HEAD(&c->c_imports);
-        INIT_LIST_HEAD(&c->c_exports);
-        INIT_LIST_HEAD(&c->c_sb_chain);
-        INIT_LIST_HEAD(&c->c_recovd_data.rd_managed_chain);
-        INIT_LIST_HEAD(&c->c_delayed_head);
         atomic_set(&c->c_refcount, 0);
         memcpy(&c->c_peer, peer, sizeof(c->c_peer));
         spin_lock_init(&c->c_lock);
@@ -123,14 +126,16 @@ int ptlrpc_put_connection(struct ptlrpc_connection *c)
         }
 
         CDEBUG (D_INFO, "connection=%p refcount %d to "LPX64" on %s\n",
-                c, atomic_read(&c->c_refcount), c->c_peer.peer_nid,
+                c, atomic_read(&c->c_refcount) - 1, c->c_peer.peer_nid,
                 c->c_peer.peer_ni->pni_name);
 
         if (atomic_dec_and_test(&c->c_refcount)) {
-                recovd_conn_unmanage(c);
                 spin_lock(&conn_lock);
                 list_del(&c->c_link);
                 list_add(&c->c_link, &conn_unused_list);
+                if (list_empty(&conn_list)) {
+                        ptlrpc_put_ldlm_hooks();
+                }
                 spin_unlock(&conn_lock);
                 rc = 1;
         }
index 4a6eb67..167898a 100644 (file)
@@ -42,7 +42,7 @@ static int request_out_callback(ptl_event_t *ev)
         ENTRY;
 
         /* requests always contiguous */
-        LASSERT((ev->mem_desc.options & PTL_MD_IOV) == 0);
+        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
 
         if (ev->type != PTL_EVENT_SENT) {
                 // XXX make sure we understand all events, including ACK's
@@ -50,33 +50,34 @@ static int request_out_callback(ptl_event_t *ev)
                 LBUG();
         }
 
-        /* this balances the atomic_inc in ptl_send_rpc */
+        /* this balances the atomic_inc in ptl_send_rpc() */
         ptlrpc_req_finished(req);
         RETURN(1);
 }
 
-
 /*
  *  Free the packet when it has gone out
  */
 static int reply_out_callback(ptl_event_t *ev)
 {
+        struct ptlrpc_request *req = ev->mem_desc.user_ptr;
+        unsigned long          flags;
         ENTRY;
 
         /* replies always contiguous */
-        LASSERT((ev->mem_desc.options & PTL_MD_IOV) == 0);
+        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
 
         if (ev->type == PTL_EVENT_SENT) {
+                /* NB don't even know if this is the current reply! In fact
+                 * we can't touch any state in the request, since the
+                 * service handler zeros it on each incoming request. */
                 OBD_FREE(ev->mem_desc.start, ev->mem_desc.length);
         } else if (ev->type == PTL_EVENT_ACK) {
-                struct ptlrpc_request *req = ev->mem_desc.user_ptr;
-                if (req->rq_flags & PTL_RPC_FL_WANT_ACK) {
-                        req->rq_flags &= ~PTL_RPC_FL_WANT_ACK;
-                        wake_up(&req->rq_wait_for_rep);
-                } else {
-                        DEBUG_REQ(D_ERROR, req,
-                                  "ack received for reply, not wanted");
-                }
+                LASSERT(req->rq_want_ack);
+                spin_lock_irqsave(&req->rq_lock, flags);
+                req->rq_want_ack = 0;
+                wake_up(&req->rq_wait_for_rep);
+                spin_unlock_irqrestore(&req->rq_lock, flags);
         } else {
                 // XXX make sure we understand all events
                 CERROR("Unknown event %d\n", ev->type);
@@ -92,10 +93,11 @@ static int reply_out_callback(ptl_event_t *ev)
 int reply_in_callback(ptl_event_t *ev)
 {
         struct ptlrpc_request *req = ev->mem_desc.user_ptr;
+        unsigned long flags;
         ENTRY;
 
         /* replies always contiguous */
-        LASSERT((ev->mem_desc.options & PTL_MD_IOV) == 0);
+        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
 
         if (req->rq_xid == 0x5a5a5a5a5a5a5a5a) {
                 CERROR("Reply received for freed request!  Probably a missing "
@@ -109,11 +111,21 @@ int reply_in_callback(ptl_event_t *ev)
         }
 
         if (ev->type == PTL_EVENT_PUT) {
-                req->rq_repmsg = ev->mem_desc.start + ev->offset;
-                barrier();
-                wake_up(&req->rq_wait_for_rep);
+                /* Bug 1190: should handle non-zero offset as a protocol
+                 * error  */
+                LASSERT (ev->offset == 0);
+
+                spin_lock_irqsave (&req->rq_lock, flags);
+                LASSERT (req->rq_receiving_reply);
+                req->rq_receiving_reply = 0;
+                req->rq_replied = 1;
+                if (req->rq_set != NULL)
+                        wake_up(&req->rq_set->set_waitq);
+                else
+                        wake_up(&req->rq_wait_for_rep);
+                spin_unlock_irqrestore (&req->rq_lock, flags);
         } else {
-                // XXX make sure we understand all events, including ACK's
+                // XXX make sure we understand all events, including ACKs
                 CERROR("Unknown event %d\n", ev->type);
                 LBUG();
         }
@@ -128,7 +140,7 @@ int request_in_callback(ptl_event_t *ev)
         struct ptlrpc_service *service = srv_ni->sni_service;
 
         /* requests always contiguous */
-        LASSERT((ev->mem_desc.options & PTL_MD_IOV) == 0);
+        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
         /* we only enable puts */
         LASSERT(ev->type == PTL_EVENT_PUT);
         LASSERT(atomic_read(&srv_ni->sni_nrqbds_receiving) > 0);
@@ -138,14 +150,14 @@ int request_in_callback(ptl_event_t *ev)
                 CERROR("Warning: Possibly truncated rpc (%d/%d)\n",
                        ev->mlength, ev->rlength);
 
-        if (ptl_is_valid_handle(&ev->unlinked_me)) {
+        if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE)) {
                 /* This is the last request to be received into this
                  * request buffer.  We don't bump the refcount, since the
                  * thread servicing this event is effectively taking over
                  * portals' reference.
                  */
-#warning ev->unlinked_me.nal_idx is not set properly in a callback
-                LASSERT(ev->unlinked_me.handle_idx==rqbd->rqbd_me_h.handle_idx);
+                /* NB ev->unlinked_me.nal_idx is not set properly in a callback */
+                LASSERT(ev->unlinked_me.cookie==rqbd->rqbd_me_h.cookie);
 
                 /* we're off the air */
                 /* we'll probably start dropping packets in portals soon */
@@ -163,10 +175,8 @@ int request_in_callback(ptl_event_t *ev)
 
 static int bulk_put_source_callback(ptl_event_t *ev)
 {
+        unsigned long            flags;
         struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr;
-        struct ptlrpc_bulk_page *bulk;
-        struct list_head        *tmp;
-        struct list_head        *next;
         ENTRY;
 
         CDEBUG(D_NET, "got %s event %d\n",
@@ -175,80 +185,77 @@ static int bulk_put_source_callback(ptl_event_t *ev)
 
         LASSERT(ev->type == PTL_EVENT_SENT || ev->type == PTL_EVENT_ACK);
 
-        LASSERT(atomic_read(&desc->bd_source_callback_count) > 0 &&
-                atomic_read(&desc->bd_source_callback_count) <= 2);
-
         /* 1 fragment for each page always */
         LASSERT(ev->mem_desc.niov == desc->bd_page_count);
 
-        if (atomic_dec_and_test(&desc->bd_source_callback_count)) {
-                void (*event_handler)(struct ptlrpc_bulk_desc *);
-
-                list_for_each_safe(tmp, next, &desc->bd_page_list) {
-                        bulk = list_entry(tmp, struct ptlrpc_bulk_page,
-                                          bp_link);
-
-                        if (bulk->bp_cb != NULL)
-                                bulk->bp_cb(bulk);
-                }
-
-                /* We need to make a note of whether there's an event handler
-                 * before we call wake_up, because if there is no event handler,
-                 * 'desc' might be freed before we're scheduled again. */
-                event_handler = desc->bd_ptl_ev_hdlr;
-
-                desc->bd_flags |= PTL_BULK_FL_SENT;
+        spin_lock_irqsave (&desc->bd_lock, flags);
+        
+        LASSERT(desc->bd_callback_count > 0 &&
+                desc->bd_callback_count <= 2);
+        
+        if (--desc->bd_callback_count == 0) {
+                desc->bd_network_rw = 0;
+                desc->bd_complete = 1;
                 wake_up(&desc->bd_waitq);
-                if (event_handler) {
-                        LASSERT(desc->bd_ptl_ev_hdlr == event_handler);
-                        event_handler(desc);
-                }
         }
 
+        spin_unlock_irqrestore (&desc->bd_lock, flags);
         RETURN(0);
 }
 
+struct ptlrpc_bulk_desc ptlrpc_bad_desc;
+ptl_event_t ptlrpc_bad_event;
+
 static int bulk_put_sink_callback(ptl_event_t *ev)
 {
         struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr;
-        struct ptlrpc_bulk_page *bulk;
-        struct list_head        *tmp;
-        struct list_head        *next;
-        ptl_size_t               total = 0;
-        void                   (*event_handler)(struct ptlrpc_bulk_desc *);
+        unsigned long            flags;
         ENTRY;
 
         LASSERT(ev->type == PTL_EVENT_PUT);
 
-        /* put with zero offset */
-        LASSERT(ev->offset == 0);
         /* used iovs */
-        LASSERT((ev->mem_desc.options & PTL_MD_IOV) != 0);
+        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) ==
+                PTL_MD_KIOV);
+        /* Honestly, it's best to find out early. */
+        if (desc->bd_page_count == 0x5a5a5a5a5a ||
+            desc->bd_page_count != ev->mem_desc.niov ||
+            ev->mem_desc.start != &desc->bd_iov) {
+                /* not guaranteed (don't LASSERT) but good for this bug hunt */
+                ptlrpc_bad_event = *ev;
+                ptlrpc_bad_desc = *desc;
+                CERROR ("XXX ev %p type %d portal %d match "LPX64", seq %ld\n",
+                        ev, ev->type, ev->portal, ev->match_bits, ev->sequence);
+                CERROR ("XXX desc %p, export %p import %p gen %d "
+                        " portal %d\n", 
+                        desc, desc->bd_export,
+                        desc->bd_import, desc->bd_import_generation,
+                        desc->bd_portal);
+                RETURN (0);
+        }
+        
+        LASSERT(desc->bd_page_count != 0x5a5a5a5a);
         /* 1 fragment for each page always */
         LASSERT(ev->mem_desc.niov == desc->bd_page_count);
-
-        list_for_each_safe (tmp, next, &desc->bd_page_list) {
-                bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
-
-                total += bulk->bp_buflen;
-
-                if (bulk->bp_cb != NULL)
-                        bulk->bp_cb(bulk);
+        LASSERT(ev->match_bits == desc->bd_req->rq_xid);
+        
+        /* peer must put with zero offset */
+        if (ev->offset != 0) {
+                /* Bug 1190: handle this as a protocol failure */
+                CERROR ("Bad offset %d\n", ev->offset);
+                LBUG ();
         }
 
-        LASSERT(ev->mem_desc.length == total);
-
-        /* We need to make a note of whether there's an event handler
-         * before we call wake_up, because if there is no event
-         * handler, 'desc' might be freed before we're scheduled again. */
-        event_handler = desc->bd_ptl_ev_hdlr;
+        /* No check for total # bytes; this could be a short read */
 
-        desc->bd_flags |= PTL_BULK_FL_RCVD;
-        wake_up(&desc->bd_waitq);
-        if (event_handler) {
-                LASSERT(desc->bd_ptl_ev_hdlr == event_handler);
-                event_handler(desc);
-        }
+        spin_lock_irqsave (&desc->bd_lock, flags);
+        desc->bd_network_rw = 0;
+        desc->bd_complete = 1;
+        if (desc->bd_req->rq_set != NULL)
+                wake_up (&desc->bd_req->rq_set->set_waitq);
+        else
+                wake_up (&desc->bd_req->rq_wait_for_rep);
+        spin_unlock_irqrestore (&desc->bd_lock, flags);
 
         RETURN(1);
 }
@@ -258,122 +265,108 @@ static int bulk_get_source_callback(ptl_event_t *ev)
         struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr;
         struct ptlrpc_bulk_page *bulk;
         struct list_head        *tmp;
-        struct list_head        *next;
+        unsigned long            flags;
         ptl_size_t               total = 0;
-        void                   (*event_handler)(struct ptlrpc_bulk_desc *);
         ENTRY;
 
         LASSERT(ev->type == PTL_EVENT_GET);
 
-        /* put with zero offset */
-        LASSERT(ev->offset == 0);
         /* used iovs */
-        LASSERT((ev->mem_desc.options & PTL_MD_IOV) != 0);
+        LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) ==
+                PTL_MD_KIOV);
         /* 1 fragment for each page always */
         LASSERT(ev->mem_desc.niov == desc->bd_page_count);
+        LASSERT(ev->match_bits == desc->bd_req->rq_xid);
 
-        list_for_each_safe (tmp, next, &desc->bd_page_list) {
+        /* peer must get with zero offset */
+        if (ev->offset != 0) {
+                /* Bug 1190: handle this as a protocol failure */
+                CERROR ("Bad offset %d\n", ev->offset);
+                LBUG ();
+        }
+        
+        list_for_each (tmp, &desc->bd_page_list) {
                 bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
 
                 total += bulk->bp_buflen;
-
-                if (bulk->bp_cb != NULL)
-                        bulk->bp_cb(bulk);
         }
 
-        LASSERT(ev->mem_desc.length == total);
-
-        /* We need to make a note of whether there's an event handler
-         * before we call wake_up, because if there is no event
-         * handler, 'desc' might be freed before we're scheduled again. */
-        event_handler = desc->bd_ptl_ev_hdlr;
-
-        desc->bd_flags |= PTL_BULK_FL_SENT;
-        wake_up(&desc->bd_waitq);
-        if (event_handler) {
-                LASSERT(desc->bd_ptl_ev_hdlr == event_handler);
-                event_handler(desc);
+        /* peer must get everything */
+        if (ev->mem_desc.length != total) {
+                /* Bug 1190: handle this as a protocol failure */
+                CERROR ("Bad length/total %d/%d\n", ev->mem_desc.length, total);
+                LBUG ();
         }
 
+        spin_lock_irqsave (&desc->bd_lock, flags);
+        desc->bd_network_rw = 0;
+        desc->bd_complete = 1;
+        if (desc->bd_req->rq_set != NULL)
+                wake_up (&desc->bd_req->rq_set->set_waitq);
+        else
+                wake_up (&desc->bd_req->rq_wait_for_rep);
+        spin_unlock_irqrestore (&desc->bd_lock, flags);
+
         RETURN(1);
 }
 
-
 static int bulk_get_sink_callback(ptl_event_t *ev)
 {
         struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr;
-        struct ptlrpc_bulk_page *bulk;
-        struct list_head        *tmp;
-        struct list_head        *next;
+        unsigned long            flags;
         ENTRY;
 
         CDEBUG(D_NET, "got %s event %d\n",
                (ev->type == PTL_EVENT_SENT) ? "SENT" :
-               (ev->type == PTL_EVENT_REPLY)  ? "REPLY"  : "UNEXPECTED", 
+               (ev->type == PTL_EVENT_REPLY)  ? "REPLY"  : "UNEXPECTED",
                ev->type);
 
         LASSERT(ev->type == PTL_EVENT_SENT || ev->type == PTL_EVENT_REPLY);
 
-        LASSERT(atomic_read(&desc->bd_source_callback_count) > 0 &&
-                atomic_read(&desc->bd_source_callback_count) <= 2);
-
         /* 1 fragment for each page always */
         LASSERT(ev->mem_desc.niov == desc->bd_page_count);
 
-        if (atomic_dec_and_test(&desc->bd_source_callback_count)) {
-                void (*event_handler)(struct ptlrpc_bulk_desc *);
-
-                list_for_each_safe(tmp, next, &desc->bd_page_list) {
-                        bulk = list_entry(tmp, struct ptlrpc_bulk_page,
-                                          bp_link);
+        spin_lock_irqsave (&desc->bd_lock, flags);
+        LASSERT(desc->bd_callback_count > 0 &&
+                desc->bd_callback_count <= 2);
 
-                        if (bulk->bp_cb != NULL)
-                                bulk->bp_cb(bulk);
-                }
-
-                /* We need to make a note of whether there's an event handler
-                 * before we call wake_up, because if there is no event handler,
-                 * 'desc' might be freed before we're scheduled again. */
-                event_handler = desc->bd_ptl_ev_hdlr;
-
-                desc->bd_flags |= PTL_BULK_FL_RCVD;
+        if (--desc->bd_callback_count == 0) {
+                desc->bd_network_rw = 0;
+                desc->bd_complete = 1;
                 wake_up(&desc->bd_waitq);
-                if (event_handler) {
-                        LASSERT(desc->bd_ptl_ev_hdlr == event_handler);
-                        event_handler(desc);
-                }
         }
+        spin_unlock_irqrestore (&desc->bd_lock, flags);
 
         RETURN(0);
 }
 
-int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer) 
+int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer)
 {
         struct ptlrpc_ni   *pni;
         struct lustre_peer  lpeer;
         int                 i;
         int                 rc = lustre_uuid_to_peer (uuid->uuid, &lpeer);
-        
+
         if (rc != 0)
                 RETURN (rc);
-        
+
         for (i = 0; i < ptlrpc_ninterfaces; i++) {
                 pni = &ptlrpc_interfaces[i];
 
-                if (!memcmp (&lpeer.peer_ni, &pni->pni_ni_h,
-                             sizeof (lpeer.peer_ni))) {
+                if (!memcmp(&lpeer.peer_ni, &pni->pni_ni_h,
+                            sizeof (lpeer.peer_ni))) {
                         peer->peer_nid = lpeer.peer_nid;
                         peer->peer_ni = pni;
                         return (0);
                 }
         }
-        
-        CERROR ("Can't find ptlrpc interface for "LPX64" ni handle %08lx %08lx\n",
-                lpeer.peer_nid, lpeer.peer_ni.nal_idx, lpeer.peer_ni.handle_idx);
+
+        CERROR("Can't find ptlrpc interface for "LPX64" ni handle %08lx."LPX64"\n",
+               lpeer.peer_nid, lpeer.peer_ni.nal_idx, lpeer.peer_ni.cookie);
         return (-ENOENT);
 }
 
-void ptlrpc_ni_fini (struct ptlrpc_ni *pni) 
+void ptlrpc_ni_fini(struct ptlrpc_ni *pni)
 {
         PtlEQFree(pni->pni_request_out_eq_h);
         PtlEQFree(pni->pni_reply_out_eq_h);
@@ -382,111 +375,116 @@ void ptlrpc_ni_fini (struct ptlrpc_ni *pni)
         PtlEQFree(pni->pni_bulk_put_sink_eq_h);
         PtlEQFree(pni->pni_bulk_get_source_eq_h);
         PtlEQFree(pni->pni_bulk_get_sink_eq_h);
-        
-        inter_module_put(pni->pni_name);
+
+        kportal_put_ni (pni->pni_number);
 }
 
-int ptlrpc_ni_init (char *name, struct ptlrpc_ni *pni) 
+int ptlrpc_ni_init(int number, char *name, struct ptlrpc_ni *pni)
 {
         int              rc;
-        ptl_handle_ni_t *nip;
+        ptl_handle_ni_t *nip = kportal_get_ni (number);
 
-        nip = (ptl_handle_ni_t *)inter_module_get (name);
         if (nip == NULL) {
                 CDEBUG (D_NET, "Network interface %s not loaded\n", name);
                 return (-ENOENT);
         }
-        
-        CDEBUG (D_NET, "init %s: nal_idx %ld\n", name, nip->nal_idx);
-                
+
+        CDEBUG (D_NET, "init %d %s: nal_idx %ld\n", number, name, nip->nal_idx);
+
         pni->pni_name = name;
+        pni->pni_number = number;
         pni->pni_ni_h = *nip;
 
-        ptl_set_inv_handle (&pni->pni_request_out_eq_h);
-        ptl_set_inv_handle (&pni->pni_reply_out_eq_h);
-        ptl_set_inv_handle (&pni->pni_reply_in_eq_h);
-        ptl_set_inv_handle (&pni->pni_bulk_put_source_eq_h);
-        ptl_set_inv_handle (&pni->pni_bulk_put_sink_eq_h);
-        ptl_set_inv_handle (&pni->pni_bulk_get_source_eq_h);
-        ptl_set_inv_handle (&pni->pni_bulk_get_sink_eq_h);
-        
+        pni->pni_request_out_eq_h = PTL_HANDLE_NONE;
+        pni->pni_reply_out_eq_h = PTL_HANDLE_NONE;
+        pni->pni_reply_in_eq_h = PTL_HANDLE_NONE;
+        pni->pni_bulk_put_source_eq_h = PTL_HANDLE_NONE;
+        pni->pni_bulk_put_sink_eq_h = PTL_HANDLE_NONE;
+        pni->pni_bulk_get_source_eq_h = PTL_HANDLE_NONE;
+        pni->pni_bulk_get_sink_eq_h = PTL_HANDLE_NONE;
+
         /* NB We never actually PtlEQGet() out of these events queues since
          * we're only interested in the event callback, so we can just let
          * them wrap.  Their sizes aren't a big deal, apart from providing
          * a little history for debugging... */
-        
-        rc = PtlEQAlloc(pni->pni_ni_h, 1024, request_out_callback, 
+
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, request_out_callback,
                         &pni->pni_request_out_eq_h);
         if (rc != PTL_OK)
                 GOTO (fail, rc = -ENOMEM);
-                
-        rc = PtlEQAlloc(pni->pni_ni_h, 1024, reply_out_callback, 
+
+        rc = PtlEQAlloc(pni->pni_ni_h, 1024, reply_out_callback,
                         &pni->pni_reply_out_eq_h);
         if (rc != PTL_OK)
                 GOTO (fail, rc = -ENOMEM);
-        
+
         rc = PtlEQAlloc(pni->pni_ni_h, 1024, reply_in_callback,
                         &pni->pni_reply_in_eq_h);
         if (rc != PTL_OK)
                 GOTO (fail, rc = -ENOMEM);
-                
+
         rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_put_source_callback,
                         &pni->pni_bulk_put_source_eq_h);
         if (rc != PTL_OK)
                 GOTO (fail, rc = -ENOMEM);
-                
+
         rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_put_sink_callback,
                         &pni->pni_bulk_put_sink_eq_h);
         if (rc != PTL_OK)
                 GOTO (fail, rc = -ENOMEM);
-                
+
         rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_get_source_callback,
                         &pni->pni_bulk_get_source_eq_h);
         if (rc != PTL_OK)
                 GOTO (fail, rc = -ENOMEM);
-                
+
         rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_get_sink_callback,
                         &pni->pni_bulk_get_sink_eq_h);
         if (rc != PTL_OK)
                 GOTO (fail, rc = -ENOMEM);
-        
+
         return (0);
- fail: 
+ fail:
         CERROR ("Failed to initialise network interface %s: %d\n",
                 name, rc);
 
-        /* OK to do complete teardown since we invalidated the handles above... */
+        /* OK to do complete teardown since we invalidated the handles above */
         ptlrpc_ni_fini (pni);
         return (rc);
 }
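
ptlrpc_ni_init() relies on pre-setting every event-queue handle to PTL_HANDLE_NONE so that a single ptlrpc_ni_fini() is safe after any partial failure. A rough userspace sketch of that init/teardown idiom, with plain allocations standing in for PtlEQAlloc() and hypothetical names throughout:

#include <stdio.h>
#include <stdlib.h>

#define NUM_EQS     7
#define HANDLE_NONE NULL                 /* stand-in for PTL_HANDLE_NONE */

struct ni {
        void *eq[NUM_EQS];               /* stand-ins for the pni_*_eq_h handles */
};

/* Safe to call after any partial init: "freeing" an invalid handle is a no-op. */
static void ni_fini(struct ni *ni)
{
        for (int i = 0; i < NUM_EQS; i++)
                free(ni->eq[i]);
}

static int ni_init(struct ni *ni)
{
        /* Invalidate every handle first... */
        for (int i = 0; i < NUM_EQS; i++)
                ni->eq[i] = HANDLE_NONE;

        /* ...so a failure anywhere below can just do a complete teardown. */
        for (int i = 0; i < NUM_EQS; i++) {
                ni->eq[i] = malloc(1024);     /* stands in for PtlEQAlloc() */
                if (ni->eq[i] == NULL) {
                        fprintf(stderr, "failed to allocate EQ %d\n", i);
                        ni_fini(ni);
                        return -1;
                }
        }
        return 0;
}

int main(void)
{
        struct ni ni;

        if (ni_init(&ni) == 0) {
                puts("interface initialised");
                ni_fini(&ni);
        }
        return 0;
}
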
 
 int ptlrpc_init_portals(void)
 {
-        /* Add new portals network interface names here.
+        /* Add new portals network interfaces here.
          * Order is irrelevant! */
-        char *ni_names[] = { "kqswnal_ni",
-                             "kgmnal_ni",
-                             "ksocknal_ni",
-                             "ktoenal_ni",
-                             "tcpnal_ni",
-                             NULL };
+        static struct {
+                int   number;
+                char *name;
+        } ptl_nis[] = {
+                {QSWNAL,  "qswnal"},
+                {SOCKNAL, "socknal"},
+                {GMNAL,   "gmnal"},
+                {TOENAL,  "toenal"},
+                {TCPNAL,  "tcpnal"},
+                {SCIMACNAL, "scimacnal"}};
         int   rc;
         int   i;
-        
-        LASSERT (ptlrpc_ninterfaces == 0);
-
-        for (i = 0; ni_names[i] != NULL; i++) {
-                LASSERT (ptlrpc_ninterfaces < 
-                         sizeof (ptlrpc_interfaces)/sizeof (ptlrpc_interfaces[0]));
-                
-                rc = ptlrpc_ni_init (ni_names[i],
-                                     &ptlrpc_interfaces[ptlrpc_ninterfaces]);
+
+        LASSERT(ptlrpc_ninterfaces == 0);
+
+        for (i = 0; i < sizeof (ptl_nis) / sizeof (ptl_nis[0]); i++) {
+                LASSERT(ptlrpc_ninterfaces < (sizeof(ptlrpc_interfaces) /
+                                              sizeof(ptlrpc_interfaces[0])));
+
+                rc = ptlrpc_ni_init(ptl_nis[i].number, ptl_nis[i].name,
+                                    &ptlrpc_interfaces[ptlrpc_ninterfaces]);
                 if (rc == 0)
                         ptlrpc_ninterfaces++;
         }
-        
+
         if (ptlrpc_ninterfaces == 0) {
-                CERROR("network initialisation failed: is a NAL module loaded?\n");
+                CERROR("network initialisation failed: is a NAL module "
+                       "loaded?\n");
                 return -EIO;
         }
         return 0;
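
ptlrpc_init_portals() now walks a static {number, name} table instead of a NULL-terminated name list, using a sizeof()/sizeof() bound. A small standalone sketch of that table-driven iteration (the NAL numbers and names below are placeholders, not the real constants):

#include <stdio.h>

struct nal_entry {
        int         number;
        const char *name;
};

/* Placeholder entries: the real table pairs QSWNAL, SOCKNAL, ... with
 * their interface names. */
static const struct nal_entry ptl_nis[] = {
        { 1, "qswnal"  },
        { 2, "socknal" },
        { 3, "gmnal"   },
};

int main(void)
{
        int loaded = 0;

        for (unsigned i = 0; i < sizeof(ptl_nis) / sizeof(ptl_nis[0]); i++) {
                /* a real implementation would probe the interface here and
                 * only count the ones that are actually present */
                printf("probing NAL %d (%s)\n", ptl_nis[i].number,
                       ptl_nis[i].name);
                loaded++;
        }

        if (loaded == 0) {
                fprintf(stderr, "no network interface available\n");
                return 1;
        }
        return 0;
}
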
index 1b3532e..cc9982c 100644 (file)
  */
 #define DEBUG_SUBSYSTEM S_CLASS
 
+#include <linux/obd_support.h>
+#include <linux/obd.h>
 #include <linux/lprocfs_status.h>
+#include <linux/lustre_idl.h>
+#include <linux/lustre_net.h>
+#include "ptlrpc_internal.h"
+
+
+struct ll_rpc_opcode {
+        __u32       opcode;
+        const char *opname;
+} ll_rpc_opcode_table[LUSTRE_MAX_OPCODES] = {
+        { OST_REPLY,        "ost_reply" },
+        { OST_GETATTR,      "ost_getattr" },
+        { OST_SETATTR,      "ost_setattr" },
+        { OST_READ,         "ost_read" },
+        { OST_WRITE,        "ost_write" },
+        { OST_CREATE ,      "ost_create" },
+        { OST_DESTROY,      "ost_destroy" },
+        { OST_GET_INFO,     "ost_get_info" },
+        { OST_CONNECT,      "ost_connect" },
+        { OST_DISCONNECT,   "ost_disconnect" },
+        { OST_PUNCH,        "ost_punch" },
+        { OST_OPEN,         "ost_open" },
+        { OST_CLOSE,        "ost_close" },
+        { OST_STATFS,       "ost_statfs" },
+        { OST_SAN_READ,     "ost_san_read" },
+        { OST_SAN_WRITE,    "ost_san_write" },
+        { OST_SYNCFS,       "ost_syncfs" },
+        { MDS_GETATTR,      "mds_getattr" },
+        { MDS_GETATTR_NAME, "mds_getattr_name" },
+        { MDS_CLOSE,        "mds_close" },
+        { MDS_REINT,        "mds_reint" },
+        { MDS_READPAGE,     "mds_readpage" },
+        { MDS_CONNECT,      "mds_connect" },
+        { MDS_DISCONNECT,   "mds_disconnect" },
+        { MDS_GETSTATUS,    "mds_getstatus" },
+        { MDS_STATFS,       "mds_statfs" },
+        { MDS_GETLOVINFO,   "mds_getlovinfo" },
+        { LDLM_ENQUEUE,     "ldlm_enqueue" },
+        { LDLM_CONVERT,     "ldlm_convert" },
+        { LDLM_CANCEL,      "ldlm_cancel" },
+        { LDLM_BL_CALLBACK, "ldlm_bl_callback" },
+        { LDLM_CP_CALLBACK, "ldlm_cp_callback" },
+        { PTLBD_QUERY,      "ptlbd_query" },
+        { PTLBD_READ,       "ptlbd_read" },
+        { PTLBD_WRITE,      "ptlbd_write" },
+        { PTLBD_FLUSH,      "ptlbd_flush" },
+        { OBD_PING,         "obd_ping" }
+};
+
+const char* ll_opcode2str(__u32 opcode)
+{
+        /* When one of the assertions below fails, chances are that:
+         *     1) A new opcode was added in lustre_idl.h, but is
+         *        missing from the table above.
+         * or  2) The opcode space was renumbered or rearranged,
+         *        and the opcode_offset() function in
+         *        ptlrpc_internal.h needs to be modified.
+         */
+        __u32 offset = opcode_offset(opcode);
+        LASSERT(offset < LUSTRE_MAX_OPCODES);
+        LASSERT(ll_rpc_opcode_table[offset].opcode == opcode);
+        return ll_rpc_opcode_table[offset].opname;
+}
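
The opcode table and ll_opcode2str() depend on opcode_offset() mapping every opcode onto a dense index that matches the table order, which is exactly what the two assertions verify. A self-contained sketch of the same idea with a much smaller, made-up opcode space (the numbering below is hypothetical, not Lustre's):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical numbering: two disjoint opcode ranges folded into one table. */
enum { OST_REPLY = 0, OST_GETATTR = 1, MDS_GETATTR = 33, MDS_CLOSE = 34 };
#define OST_LAST_OPC   2                 /* one past the last OST opcode */
#define MDS_FIRST_OPC  33
#define MAX_OPCODES    (OST_LAST_OPC + 2)

static int opcode_offset(uint32_t opc)
{
        if (opc >= MDS_FIRST_OPC)
                return OST_LAST_OPC + (opc - MDS_FIRST_OPC);
        return opc;
}

static const struct {
        uint32_t    opcode;
        const char *opname;
} opcode_table[MAX_OPCODES] = {
        { OST_REPLY,   "ost_reply"   },
        { OST_GETATTR, "ost_getattr" },
        { MDS_GETATTR, "mds_getattr" },
        { MDS_CLOSE,   "mds_close"   },
};

static const char *opcode2str(uint32_t opc)
{
        int offset = opcode_offset(opc);

        assert(offset >= 0 && offset < MAX_OPCODES);
        /* fires when the table and the opcode numbering drift apart */
        assert(opcode_table[offset].opcode == opc);
        return opcode_table[offset].opname;
}

int main(void)
{
        printf("%s\n", opcode2str(MDS_CLOSE));
        return 0;
}
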
 
 #ifndef LPROCFS
-struct lprocfs_vars lprocfs_obd_vars[]  = { {0} };
-struct lprocfs_vars lprocfs_module_vars[] = { {0} };
+void ptlrpc_lprocfs_register_service(struct obd_device *obddev,
+                                     struct ptlrpc_service *svc) { return; }
+void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc) { return; }
 #else
-struct lprocfs_vars lprocfs_obd_vars[] = {
-        { "uuid",     lprocfs_rd_uuid,    0, 0},
-        { 0 }
-};
 
-struct lprocfs_vars lprocfs_module_vars[] = {
-        { "num_refs", lprocfs_rd_numrefs, 0, 0},
-        { 0 }
-};
+void ptlrpc_lprocfs_register_service(struct obd_device *obddev,
+                                     struct ptlrpc_service *svc)
+{
+        struct proc_dir_entry   *svc_procroot;
+        struct lprocfs_counters *svc_cntrs;
+        int i, rc;
+        unsigned int svc_counter_config = LPROCFS_CNTR_EXTERNALLOCK | 
+                LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV;
+
+        LASSERT(svc->svc_procroot == NULL);
+        LASSERT(svc->svc_counters == NULL);
+
+        svc_procroot = lprocfs_register(svc->srv_name, obddev->obd_proc_entry,
+                                        NULL, NULL);
+        if (svc_procroot == NULL) 
+                return;
+        
+        svc_cntrs = 
+                lprocfs_alloc_counters(PTLRPC_LAST_CNTR+LUSTRE_MAX_OPCODES);
+        if (svc_cntrs == NULL) {
+                lprocfs_remove(svc_procroot);
+                return;
+        }
+        LPROCFS_COUNTER_INIT(&svc_cntrs->cntr[PTLRPC_REQWAIT_CNTR], 
+                             svc_counter_config, &svc->srv_lock, 
+                             "req_waittime", "cycles");
+        LPROCFS_COUNTER_INIT(&svc_cntrs->cntr[PTLRPC_SVCEQDEPTH_CNTR], 
+                             svc_counter_config, &svc->srv_lock, 
+                             "svc_eqdepth", "reqs");
+        /* no stddev on idletime */
+        LPROCFS_COUNTER_INIT(&svc_cntrs->cntr[PTLRPC_SVCIDLETIME_CNTR],
+                             (LPROCFS_CNTR_EXTERNALLOCK | LPROCFS_CNTR_AVGMINMAX),
+                             &svc->srv_lock, "svc_idletime", "cycles");
+        for (i=0; i < LUSTRE_MAX_OPCODES; i++) {
+                __u32 opcode = ll_rpc_opcode_table[i].opcode;
+                LPROCFS_COUNTER_INIT(&svc_cntrs->cntr[PTLRPC_LAST_CNTR+i], 
+                                     svc_counter_config, &svc->srv_lock,
+                                     ll_opcode2str(opcode), "cycles");
+        }
+        rc = lprocfs_register_counters(svc_procroot, "service_stats", 
+                                       svc_cntrs);
+        if (rc < 0) {
+                lprocfs_remove(svc_procroot);
+                lprocfs_free_counters(svc_cntrs);
+        } else {
+                svc->svc_procroot = svc_procroot;
+                svc->svc_counters = svc_cntrs;
+        }
+}
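
ptlrpc_lprocfs_register_service() commits svc_procroot and svc_counters only after every step succeeds, and rolls back the earlier steps otherwise. A compact sketch of that register-or-rollback shape, with malloc/free standing in for the lprocfs calls and every name hypothetical:

#include <stdio.h>
#include <stdlib.h>

struct svc_stats {
        void *procroot;
        void *counters;
};

static void *register_dir(void)              { return malloc(1); }
static int   publish(void *dir, void *cntrs) { return (dir && cntrs) ? 0 : -1; }

static void svc_register_stats(struct svc_stats *svc)
{
        void *dir = register_dir();             /* step 1: proc directory */
        if (dir == NULL)
                return;                         /* nothing to undo */

        void *cntrs = calloc(16, sizeof(long)); /* step 2: counter block */
        if (cntrs == NULL) {
                free(dir);                      /* undo step 1 */
                return;
        }

        if (publish(dir, cntrs) < 0) {          /* step 3: expose the counters */
                free(dir);                      /* undo steps 1 and 2 */
                free(cntrs);
        } else {
                svc->procroot = dir;            /* commit only on full success */
                svc->counters = cntrs;
        }
}

int main(void)
{
        struct svc_stats s = { NULL, NULL };

        svc_register_stats(&s);
        printf("registered: %s\n", s.procroot ? "yes" : "no");
        free(s.procroot);
        free(s.counters);
        return 0;
}
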
 
+void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc)
+{
+        if (svc->svc_procroot) {
+                lprocfs_remove(svc->svc_procroot);
+                svc->svc_procroot = NULL;
+        }
+        if (svc->svc_counters) {
+                lprocfs_free_counters(svc->svc_counters);
+                svc->svc_counters = NULL;
+        }
+}
 #endif /* LPROCFS */
-LPROCFS_INIT_VARS(lprocfs_module_vars, lprocfs_obd_vars)
index 3b1d32f..017fb8b 100644 (file)
@@ -34,12 +34,14 @@ static int ptl_send_buf(struct ptlrpc_request *request,
                         struct ptlrpc_connection *conn, int portal)
 {
         int rc;
+        int rc2;
         ptl_process_id_t remote_id;
         ptl_handle_md_t md_h;
         ptl_ack_req_t ack_req;
 
-        LASSERT(conn);
-        CDEBUG (D_INFO, "conn=%p ni %s nid "LPX64" on %s\n", 
+        LASSERT (portal != 0);
+        LASSERT (conn != NULL);
+        CDEBUG (D_INFO, "conn=%p ni %s nid "LPX64" on %s\n",
                 conn, conn->c_peer.peer_ni->pni_name,
                 conn->c_peer.peer_nid, conn->c_peer.peer_ni->pni_name);
 
@@ -47,23 +49,26 @@ static int ptl_send_buf(struct ptlrpc_request *request,
 
         switch (request->rq_type) {
         case PTL_RPC_MSG_REQUEST:
-                request->rq_reqmsg->type = HTON__u32(request->rq_type);
+                request->rq_reqmsg->type = request->rq_type;
                 request->rq_req_md.start = request->rq_reqmsg;
                 request->rq_req_md.length = request->rq_reqlen;
-                request->rq_req_md.eventq = conn->c_peer.peer_ni->pni_request_out_eq_h;
+                request->rq_req_md.eventq =
+                        conn->c_peer.peer_ni->pni_request_out_eq_h;
+                LASSERT (!request->rq_want_ack);
                 break;
         case PTL_RPC_MSG_ERR:
         case PTL_RPC_MSG_REPLY:
-                request->rq_repmsg->type = HTON__u32(request->rq_type);
+                request->rq_repmsg->type = request->rq_type;
                 request->rq_req_md.start = request->rq_repmsg;
                 request->rq_req_md.length = request->rq_replen;
-                request->rq_req_md.eventq = conn->c_peer.peer_ni->pni_reply_out_eq_h;
+                request->rq_req_md.eventq =
+                        conn->c_peer.peer_ni->pni_reply_out_eq_h;
                 break;
         default:
                 LBUG();
                 return -1; /* notreached */
         }
-        if (request->rq_flags & PTL_RPC_FL_WANT_ACK) {
+        if (request->rq_want_ack) {
                 request->rq_req_md.threshold = 2; /* SENT and ACK */
                 ack_req = PTL_ACK_REQ;
         } else {
@@ -78,12 +83,18 @@ static int ptl_send_buf(struct ptlrpc_request *request,
                 obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED;
         }
 
-        rc = PtlMDBind(conn->c_peer.peer_ni->pni_ni_h, request->rq_req_md, &md_h);
-        if (rc != 0) {
+        /* NB if the send fails, we back out of the send and return
+         * failure; it's down to the caller to handle missing callbacks */
+
+        rc = PtlMDBind(conn->c_peer.peer_ni->pni_ni_h, request->rq_req_md,
+                       &md_h);
+        if (rc != PTL_OK) {
                 CERROR("PtlMDBind failed: %d\n", rc);
-                LBUG();
-                return rc;
+                LASSERT (rc == PTL_NOSPACE);
+                RETURN (-ENOMEM);
         }
+        if (request->rq_type != PTL_RPC_MSG_REQUEST)
+                memcpy(&request->rq_reply_md_h, &md_h, sizeof(md_h));
 
         remote_id.nid = conn->c_peer.peer_nid;
         remote_id.pid = 0;
@@ -91,27 +102,27 @@ static int ptl_send_buf(struct ptlrpc_request *request,
         CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64"\n",
                request->rq_req_md.length, portal, request->rq_xid);
 
-        if (!portal)
-                LBUG();
         rc = PtlPut(md_h, ack_req, remote_id, portal, 0, request->rq_xid, 0, 0);
         if (rc != PTL_OK) {
                 CERROR("PtlPut("LPU64", %d, "LPD64") failed: %d\n",
                        remote_id.nid, portal, request->rq_xid, rc);
-                PtlMDUnlink(md_h);
+                rc2 = PtlMDUnlink(md_h);
+                LASSERT (rc2 == PTL_OK);
+                RETURN ((rc == PTL_NOSPACE) ? -ENOMEM : -ECOMM);
         }
 
-        return rc;
+        return 0;
 }
 
-static inline struct iovec *
+static inline ptl_kiov_t *
 ptlrpc_get_bulk_iov (struct ptlrpc_bulk_desc *desc)
 {
-        struct iovec *iov;
+        ptl_kiov_t *iov;
 
-        if (desc->bd_page_count <= sizeof (desc->bd_iov)/sizeof (struct iovec))
+        if (desc->bd_page_count <= sizeof (desc->bd_iov)/sizeof (*iov))
                 return (desc->bd_iov);
 
-        OBD_ALLOC (iov, desc->bd_page_count * sizeof (struct iovec));
+        OBD_ALLOC (iov, desc->bd_page_count * sizeof (*iov));
         if (iov == NULL)
                 LBUG();
 
@@ -119,39 +130,45 @@ ptlrpc_get_bulk_iov (struct ptlrpc_bulk_desc *desc)
 }
 
 static inline void
-ptlrpc_put_bulk_iov (struct ptlrpc_bulk_desc *desc, struct iovec *iov)
+ptlrpc_put_bulk_iov (struct ptlrpc_bulk_desc *desc, ptl_kiov_t *iov)
 {
-        if (desc->bd_page_count <= sizeof (desc->bd_iov)/sizeof (struct iovec))
+        if (desc->bd_page_count <= sizeof (desc->bd_iov)/sizeof (*iov))
                 return;
 
-        OBD_FREE (iov, desc->bd_page_count * sizeof (struct iovec));
+        OBD_FREE (iov, desc->bd_page_count * sizeof (*iov));
 }
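
ptlrpc_get_bulk_iov()/ptlrpc_put_bulk_iov() avoid an allocation when the descriptor's embedded array is large enough. A standalone sketch of that inline-or-allocate idiom (sizes and names are illustrative); the put side here frees by pointer identity rather than re-checking the count, which amounts to the same thing:

#include <stdio.h>
#include <stdlib.h>

#define BD_INLINE_IOV 16                  /* illustrative size */

struct kvec {
        void   *base;
        size_t  len;
};

struct bulk_desc {
        int         page_count;
        struct kvec inline_iov[BD_INLINE_IOV];
};

static struct kvec *get_bulk_iov(struct bulk_desc *d)
{
        if (d->page_count <= BD_INLINE_IOV)
                return d->inline_iov;     /* common case: no allocation */
        return calloc(d->page_count, sizeof(struct kvec));
}

static void put_bulk_iov(struct bulk_desc *d, struct kvec *iov)
{
        if (iov != d->inline_iov)         /* only free what we allocated */
                free(iov);
}

int main(void)
{
        struct bulk_desc d = { .page_count = 64 };
        struct kvec *iov = get_bulk_iov(&d);

        if (iov == NULL)
                return 1;
        printf("using %s array\n", iov == d.inline_iov ? "inline" : "heap");
        put_bulk_iov(&d, iov);
        return 0;
}
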
 
 int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc)
 {
         int rc;
+        int rc2;
         struct ptlrpc_peer *peer;
         struct list_head *tmp, *next;
         ptl_process_id_t remote_id;
-        __u32 xid = 0;
-        struct iovec *iov;
+        ptl_kiov_t *iov;
+        __u64 xid;
         ENTRY;
 
+        /* NB no locking required until desc is on the network */
+        LASSERT (!desc->bd_network_rw);
+        LASSERT (desc->bd_type == BULK_PUT_SOURCE);
+        desc->bd_complete = 0;
+
         iov = ptlrpc_get_bulk_iov (desc);
         if (iov == NULL)
                 RETURN (-ENOMEM);
 
-        peer = &desc->bd_connection->c_peer;
+        peer = &desc->bd_export->exp_connection->c_peer;
 
         desc->bd_md.start = iov;
         desc->bd_md.niov = 0;
         desc->bd_md.length = 0;
         desc->bd_md.eventq = peer->peer_ni->pni_bulk_put_source_eq_h;
         desc->bd_md.threshold = 2; /* SENT and ACK */
-        desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_IOV;
+        desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_KIOV;
         desc->bd_md.user_ptr = desc;
 
-        atomic_set(&desc->bd_source_callback_count, 2);
+        desc->bd_callback_count = 2;
 
         list_for_each_safe(tmp, next, &desc->bd_page_list) {
                 struct ptlrpc_bulk_page *bulk;
@@ -159,26 +176,19 @@ int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc)
 
                 LASSERT(desc->bd_md.niov < desc->bd_page_count);
 
-                if (desc->bd_md.niov == 0)
-                        xid = bulk->bp_xid;
-                LASSERT(xid == bulk->bp_xid);   /* should all be the same */
-
-                iov[desc->bd_md.niov].iov_base = bulk->bp_buf;
-                iov[desc->bd_md.niov].iov_len = bulk->bp_buflen;
-                if (iov[desc->bd_md.niov].iov_len <= 0) {
-                        CERROR("bad bp_buflen[%d] @ %p: %d\n", desc->bd_md.niov,
-                               bulk->bp_buf, bulk->bp_buflen);
-                        CERROR("desc: xid %u, pages %d, ptl %d, ref %d\n",
-                               xid, desc->bd_page_count, desc->bd_portal,
-                               atomic_read(&desc->bd_refcount));
-                        LBUG();
-                }
+                iov[desc->bd_md.niov].kiov_page = bulk->bp_page;
+                iov[desc->bd_md.niov].kiov_offset = bulk->bp_pageoffset;
+                iov[desc->bd_md.niov].kiov_len = bulk->bp_buflen;
+
+                LASSERT (iov[desc->bd_md.niov].kiov_offset +
+                         iov[desc->bd_md.niov].kiov_len <= PAGE_SIZE);
                 desc->bd_md.niov++;
                 desc->bd_md.length += bulk->bp_buflen;
         }
 
+        /* NB total length may be 0 for a read past EOF, so we send a 0
+         * length bulk, since the client expects a bulk event. */
         LASSERT(desc->bd_md.niov == desc->bd_page_count);
-        LASSERT(desc->bd_md.niov != 0);
 
         rc = PtlMDBind(peer->peer_ni->pni_ni_h, desc->bd_md,
                        &desc->bd_md_h);
@@ -187,27 +197,31 @@ int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc)
 
         if (rc != PTL_OK) {
                 CERROR("PtlMDBind failed: %d\n", rc);
-                LBUG();
-                RETURN(rc);
+                LASSERT (rc == PTL_NOSPACE);
+                RETURN(-ENOMEM);
         }
 
+        /* Client's bulk and reply matchbits are the same */
+        xid = desc->bd_req->rq_xid;
         remote_id.nid = peer->peer_nid;
         remote_id.pid = 0;
 
         CDEBUG(D_NET, "Sending %u pages %u bytes to portal %d on %s "
-               "nid "LPX64" pid %d xid %d\n", 
+               "nid "LPX64" pid %d xid "LPX64"\n",
                desc->bd_md.niov, desc->bd_md.length,
                desc->bd_portal, peer->peer_ni->pni_name,
                remote_id.nid, remote_id.pid, xid);
 
+        desc->bd_network_rw = 1;
         rc = PtlPut(desc->bd_md_h, PTL_ACK_REQ, remote_id,
                     desc->bd_portal, 0, xid, 0, 0);
         if (rc != PTL_OK) {
-                CERROR("PtlPut("LPU64", %d, %d) failed: %d\n",
+                desc->bd_network_rw = 0;
+                CERROR("PtlPut("LPU64", %d, "LPX64") failed: %d\n",
                        remote_id.nid, desc->bd_portal, xid, rc);
-                PtlMDUnlink(desc->bd_md_h);
-                LBUG();
-                RETURN(rc);
+                rc2 = PtlMDUnlink(desc->bd_md_h);
+                LASSERT (rc2 == PTL_OK);
+                RETURN((rc == PTL_NOSPACE) ? -ENOMEM : -ECOMM);
         }
 
         RETURN(0);
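
The loop above builds the kiov array one page fragment at a time and checks that each fragment stays inside its page; a zero total length is legal (a read past EOF still produces a bulk event). A standalone sketch of that assembly step, with PAGE_SIZE, bulk_page and kiov as stand-ins for the kernel types:

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096                    /* stand-in for the kernel constant */

struct kiov {                             /* stand-in for ptl_kiov_t */
        void     *page;
        unsigned  offset;
        unsigned  len;
};

struct bulk_page {                        /* stand-in for ptlrpc_bulk_page */
        void     *page;
        unsigned  pageoffset;
        unsigned  buflen;
};

static unsigned build_kiov(struct kiov *iov, const struct bulk_page *pages,
                           int count, unsigned *total_len)
{
        unsigned niov = 0;

        *total_len = 0;
        for (int i = 0; i < count; i++) {
                iov[niov].page   = pages[i].page;
                iov[niov].offset = pages[i].pageoffset;
                iov[niov].len    = pages[i].buflen;

                /* each fragment must stay within its page */
                assert(iov[niov].offset + iov[niov].len <= PAGE_SIZE);

                *total_len += pages[i].buflen;
                niov++;
        }
        /* NB a zero total length is legal, e.g. a read past EOF */
        return niov;
}

int main(void)
{
        static char a[PAGE_SIZE], b[PAGE_SIZE];
        struct bulk_page pages[2] = {
                { a, 0,   PAGE_SIZE },
                { b, 512, 1024      },
        };
        struct kiov iov[2];
        unsigned total, n = build_kiov(iov, pages, 2, &total);

        printf("%u fragments, %u bytes\n", n, total);
        return 0;
}
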
@@ -216,28 +230,34 @@ int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc)
 int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc)
 {
         int rc;
+        int rc2;
         struct ptlrpc_peer *peer;
         struct list_head *tmp, *next;
         ptl_process_id_t remote_id;
-        __u32 xid = 0;
-        struct iovec *iov;
+        ptl_kiov_t *iov;
+        __u64 xid;
         ENTRY;
 
+        /* NB no locking required until desc is on the network */
+        LASSERT (!desc->bd_network_rw);
+        LASSERT (desc->bd_type == BULK_GET_SINK);
+        desc->bd_complete = 0;
+
         iov = ptlrpc_get_bulk_iov (desc);
         if (iov == NULL)
-                RETURN (-ENOMEM);
+                RETURN(-ENOMEM);
 
-        peer = &desc->bd_connection->c_peer;
+        peer = &desc->bd_export->exp_connection->c_peer;
 
         desc->bd_md.start = iov;
         desc->bd_md.niov = 0;
         desc->bd_md.length = 0;
         desc->bd_md.eventq = peer->peer_ni->pni_bulk_get_sink_eq_h;
         desc->bd_md.threshold = 2; /* SENT and REPLY */
-        desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_IOV;
+        desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_KIOV;
         desc->bd_md.user_ptr = desc;
 
-        atomic_set(&desc->bd_source_callback_count, 2);
+        desc->bd_callback_count = 2;
 
         list_for_each_safe(tmp, next, &desc->bd_page_list) {
                 struct ptlrpc_bulk_page *bulk;
@@ -245,20 +265,12 @@ int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc)
 
                 LASSERT(desc->bd_md.niov < desc->bd_page_count);
 
-                if (desc->bd_md.niov == 0)
-                        xid = bulk->bp_xid;
-                LASSERT(xid == bulk->bp_xid);   /* should all be the same */
-
-                iov[desc->bd_md.niov].iov_base = bulk->bp_buf;
-                iov[desc->bd_md.niov].iov_len = bulk->bp_buflen;
-                if (iov[desc->bd_md.niov].iov_len <= 0) {
-                        CERROR("bad bulk %p bp_buflen[%d] @ %p: %d\n", bulk,
-                               desc->bd_md.niov, bulk->bp_buf, bulk->bp_buflen);
-                        CERROR("desc %p: xid %u, pages %d, ptl %d, ref %d\n",
-                               desc, xid, desc->bd_page_count, desc->bd_portal,
-                               atomic_read(&desc->bd_refcount));
-                        LBUG();
-                }
+                iov[desc->bd_md.niov].kiov_page = bulk->bp_page;
+                iov[desc->bd_md.niov].kiov_len = bulk->bp_buflen;
+                iov[desc->bd_md.niov].kiov_offset = bulk->bp_pageoffset;
+
+                LASSERT (iov[desc->bd_md.niov].kiov_offset +
+                         iov[desc->bd_md.niov].kiov_len <= PAGE_SIZE);
                 desc->bd_md.niov++;
                 desc->bd_md.length += bulk->bp_buflen;
         }
@@ -266,78 +278,156 @@ int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc)
         LASSERT(desc->bd_md.niov == desc->bd_page_count);
         LASSERT(desc->bd_md.niov != 0);
 
-        rc = PtlMDBind(peer->peer_ni->pni_ni_h, desc->bd_md,
-                       &desc->bd_md_h);
+        rc = PtlMDBind(peer->peer_ni->pni_ni_h, desc->bd_md, &desc->bd_md_h);
 
-        ptlrpc_put_bulk_iov (desc, iov); /*move down to reduce latency to send*/
+        ptlrpc_put_bulk_iov(desc, iov); /* move down to reduce latency to send */
 
         if (rc != PTL_OK) {
                 CERROR("PtlMDBind failed: %d\n", rc);
-                LBUG();
-                RETURN(rc);
+                LASSERT (rc == PTL_NOSPACE);
+                RETURN(-ENOMEM);
         }
 
-        remote_id.nid = desc->bd_connection->c_peer.peer_nid;
+        /* Client's bulk and reply matchbits are the same */
+        xid = desc->bd_req->rq_xid;
+        remote_id.nid = desc->bd_export->exp_connection->c_peer.peer_nid;
         remote_id.pid = 0;
 
-        CDEBUG(D_NET, "Sending %u pages %u bytes to portal %d on %s "
-               "nid "LPX64" pid %d xid %d\n", 
-               desc->bd_md.niov, desc->bd_md.length,
-               desc->bd_portal, peer->peer_ni->pni_name,
-               remote_id.nid, remote_id.pid, xid);
+        CDEBUG(D_NET, "Fetching %u pages %u bytes from portal %d on %s "
+               "nid "LPX64" pid %d xid "LPX64"\n",
+               desc->bd_md.niov, desc->bd_md.length, desc->bd_portal,
+               peer->peer_ni->pni_name, remote_id.nid, remote_id.pid,
+               xid);
 
-        rc = PtlGet(desc->bd_md_h, remote_id, desc->bd_portal, 0, xid, 0);
+        desc->bd_network_rw = 1;
+        rc = PtlGet(desc->bd_md_h, remote_id, desc->bd_portal, 0,
+                    xid, 0);
         if (rc != PTL_OK) {
-                CERROR("PtlGet("LPU64", %d, %d) failed: %d\n",
+                desc->bd_network_rw = 0;
+                CERROR("PtlGet("LPU64", %d, "LPX64") failed: %d\n",
                        remote_id.nid, desc->bd_portal, xid, rc);
-                PtlMDUnlink(desc->bd_md_h);
-                LBUG();
-                RETURN(rc);
+                rc2 = PtlMDUnlink(desc->bd_md_h);
+                LASSERT (rc2 == PTL_OK);
+                RETURN((rc == PTL_NOSPACE) ? -ENOMEM : -ECOMM);
         }
 
         RETURN(0);
 }
 
-static int ptlrpc_register_bulk_shared(struct ptlrpc_bulk_desc *desc)
+void ptlrpc_abort_bulk (struct ptlrpc_bulk_desc *desc)
+{
+        /* Server side bulk abort. Idempotent. Not thread-safe (i.e. only
+         * serialises with completion callback) */
+        unsigned long      flags;
+        struct l_wait_info lwi;
+        int                callback_count;
+        int                rc;
+
+        LASSERT (!in_interrupt ());             /* might sleep */
+
+        /* NB. server-side bulk gets 2 events, so we have to keep trying to
+         * unlink the MD until all callbacks have happened, or
+         * PtlMDUnlink() returns OK or INVALID */
+ again:
+        spin_lock_irqsave (&desc->bd_lock, flags);
+        if (!desc->bd_network_rw) {
+                /* completed or never even registered. NB holding bd_lock
+                 * guarantees callback has completed if it ran. */
+                spin_unlock_irqrestore (&desc->bd_lock, flags);
+                return;
+        }
+
+        /* sample callback count while we have the lock */
+        callback_count = desc->bd_callback_count;
+        spin_unlock_irqrestore (&desc->bd_lock, flags);
+
+        rc = PtlMDUnlink (desc->bd_md_h);
+        switch (rc) {
+        default:
+                CERROR("PtlMDUnlink returned %d\n", rc);
+                LBUG ();
+        case PTL_OK:                    /* Won the race with the network */
+                LASSERT (!desc->bd_complete); /* Not all callbacks ran */
+                desc->bd_network_rw = 0;
+                return;
+
+        case PTL_MD_INUSE:              /* MD is being accessed right now */
+                for (;;) {
+                        /* Network access will complete in finite time but the
+                         * timeout lets us CERROR for visibility */
+                        lwi = LWI_TIMEOUT (10 * HZ, NULL, NULL);
+                        rc = l_wait_event(desc->bd_waitq,
+                                          desc->bd_callback_count !=
+                                          callback_count, &lwi);
+                        if (rc == -ETIMEDOUT) {
+                                CERROR("Unexpectedly long timeout: desc %p\n",
+                                       desc);
+                                continue;
+                        }
+                        LASSERT (rc == 0);
+                        break;
+                }
+                /* go back and try again... */
+                goto again;
+
+        case PTL_INV_MD:            /* Lost the race with completion */
+                LASSERT (desc->bd_complete);    /* Callbacks all ran */
+                LASSERT (!desc->bd_network_rw);
+                return;
+        }
+}
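
ptlrpc_abort_bulk() has to cope with three outcomes of PtlMDUnlink(): it wins the race (no callbacks ran), it loses (all callbacks ran), or the MD is busy and it waits for the callback count to change before retrying. A deliberately simplified, single-threaded model of that retry loop; the enum values and helpers are invented for the sketch and are not the Portals API:

#include <stdio.h>

enum unlink_rc { UNLINK_OK, UNLINK_INUSE, UNLINK_INVALID };

struct desc {
        int network_rw;                   /* still registered with the "network" */
        int complete;                     /* all callbacks ran */
        int callbacks_pending;
};

/* Pretend unlink: reports busy while callbacks are still outstanding. */
static enum unlink_rc try_unlink(struct desc *d)
{
        if (!d->network_rw)
                return UNLINK_INVALID;    /* lost the race: callbacks finished it */
        if (d->callbacks_pending > 0)
                return UNLINK_INUSE;      /* MD is being accessed right now */
        return UNLINK_OK;                 /* won the race with the network */
}

/* Pretend wait: each call lets one outstanding callback run. */
static void wait_for_callback(struct desc *d)
{
        if (d->callbacks_pending > 0 && --d->callbacks_pending == 0) {
                d->complete = 1;
                d->network_rw = 0;
        }
}

static void abort_bulk(struct desc *d)
{
        for (;;) {
                switch (try_unlink(d)) {
                case UNLINK_OK:           /* nothing ran; detach ourselves */
                        d->network_rw = 0;
                        return;
                case UNLINK_INVALID:      /* completion already cleaned up */
                        return;
                case UNLINK_INUSE:        /* wait for progress, then retry */
                        wait_for_callback(d);
                        break;
                }
        }
}

int main(void)
{
        struct desc d = { 1, 0, 2 };

        abort_bulk(&d);
        printf("complete=%d network_rw=%d\n", d.complete, d.network_rw);
        return 0;
}
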
+
+int ptlrpc_register_bulk (struct ptlrpc_request *req)
 {
+        struct ptlrpc_bulk_desc *desc = req->rq_bulk;
         struct ptlrpc_peer *peer;
         struct list_head *tmp, *next;
         int rc;
-        __u32 xid = 0;
-        struct iovec *iov;
+        int rc2;
+        ptl_kiov_t *iov;
         ptl_process_id_t source_id;
         ENTRY;
 
-        if (desc->bd_page_count > PTL_MD_MAX_IOV) {
-                CERROR("iov longer than %d pages not supported (count=%d)\n",
-                       PTL_MD_MAX_IOV, desc->bd_page_count);
-                RETURN(-EINVAL);
-        }
+        /* NB no locking required until desc is on the network */
+        LASSERT (!desc->bd_network_rw);
+        LASSERT (desc->bd_page_count <= PTL_MD_MAX_IOV);
+        LASSERT (desc->bd_req != NULL);
+        LASSERT (desc->bd_type == BULK_PUT_SINK ||
+                 desc->bd_type == BULK_GET_SOURCE);
+
+        desc->bd_complete = 0;
 
         iov = ptlrpc_get_bulk_iov (desc);
         if (iov == NULL)
                 return (-ENOMEM);
 
-        peer = &desc->bd_connection->c_peer;
-        
+        peer = &desc->bd_import->imp_connection->c_peer;
+
         desc->bd_md.start = iov;
         desc->bd_md.niov = 0;
         desc->bd_md.length = 0;
         desc->bd_md.threshold = 1;
         desc->bd_md.user_ptr = desc;
 
+        if (desc->bd_type == BULK_GET_SOURCE) {
+                desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_KIOV;
+                desc->bd_md.eventq = peer->peer_ni->pni_bulk_get_source_eq_h;
+        } else {
+                desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_KIOV;
+                desc->bd_md.eventq = peer->peer_ni->pni_bulk_put_sink_eq_h;
+        }
+
         list_for_each_safe(tmp, next, &desc->bd_page_list) {
                 struct ptlrpc_bulk_page *bulk;
                 bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
 
                 LASSERT(desc->bd_md.niov < desc->bd_page_count);
 
-                if (desc->bd_md.niov == 0)
-                        xid = bulk->bp_xid;
-                LASSERT(xid == bulk->bp_xid);   /* should all be the same */
+                iov[desc->bd_md.niov].kiov_page = bulk->bp_page;
+                iov[desc->bd_md.niov].kiov_len = bulk->bp_buflen;
+                iov[desc->bd_md.niov].kiov_offset = bulk->bp_pageoffset;
 
-                iov[desc->bd_md.niov].iov_base = bulk->bp_buf;
-                iov[desc->bd_md.niov].iov_len = bulk->bp_buflen;
+                LASSERT (bulk->bp_pageoffset + bulk->bp_buflen <= PAGE_SIZE);
                 desc->bd_md.niov++;
                 desc->bd_md.length += bulk->bp_buflen;
         }
@@ -345,157 +435,145 @@ static int ptlrpc_register_bulk_shared(struct ptlrpc_bulk_desc *desc)
         LASSERT(desc->bd_md.niov == desc->bd_page_count);
         LASSERT(desc->bd_md.niov != 0);
 
-        source_id.nid = desc->bd_connection->c_peer.peer_nid;
+        /* XXX Registering the same xid on retried bulk makes my head
+         * explode trying to understand how the original request's bulk
+         * might interfere with the retried request -eeb */
+        LASSERT (!desc->bd_registered || req->rq_xid != desc->bd_last_xid);
+        desc->bd_registered = 1;
+        desc->bd_last_xid = req->rq_xid;
+
+        source_id.nid = desc->bd_import->imp_connection->c_peer.peer_nid;
         source_id.pid = PTL_PID_ANY;
 
         rc = PtlMEAttach(peer->peer_ni->pni_ni_h,
-                         desc->bd_portal, source_id, xid, 0,
+                         desc->bd_portal, source_id, req->rq_xid, 0,
                          PTL_UNLINK, PTL_INS_AFTER, &desc->bd_me_h);
 
         if (rc != PTL_OK) {
                 CERROR("PtlMEAttach failed: %d\n", rc);
-                LBUG();
-                GOTO(cleanup, rc);
+                LASSERT (rc == PTL_NOSPACE);
+                GOTO(out, rc = -ENOMEM);
         }
 
+        /* About to let the network at it... */
+        desc->bd_network_rw = 1;
         rc = PtlMDAttach(desc->bd_me_h, desc->bd_md, PTL_UNLINK,
                          &desc->bd_md_h);
         if (rc != PTL_OK) {
                 CERROR("PtlMDAttach failed: %d\n", rc);
-                LBUG();
-                GOTO(cleanup, rc);
+                LASSERT (rc == PTL_NOSPACE);
+                desc->bd_network_rw = 0;
+                rc2 = PtlMEUnlink (desc->bd_me_h);
+                LASSERT (rc2 == PTL_OK);
+                GOTO(out, rc = -ENOMEM);
         }
+        rc = 0;
 
-        ptlrpc_put_bulk_iov (desc, iov);
-
-        CDEBUG(D_NET, "Setup bulk sink buffers: %u pages %u bytes, xid %u, "
-               "portal %u on %s\n", desc->bd_md.niov, desc->bd_md.length,
-               xid, desc->bd_portal, peer->peer_ni->pni_name);
-
-        RETURN(0);
+        CDEBUG(D_NET, "Setup bulk %s buffers: %u pages %u bytes, xid "LPX64", "
+               "portal %u on %s\n",
+               desc->bd_type == BULK_GET_SOURCE ? "get-source" : "put-sink",
+               desc->bd_md.niov, desc->bd_md.length,
+               req->rq_xid, desc->bd_portal, peer->peer_ni->pni_name);
 
- cleanup:
+ out:
         ptlrpc_put_bulk_iov (desc, iov);
-        ptlrpc_abort_bulk(desc);
-
-        return rc;
-}
-
-int ptlrpc_register_bulk_get(struct ptlrpc_bulk_desc *desc)
-{
-        desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_IOV;
-        desc->bd_md.eventq = 
-                desc->bd_connection->c_peer.peer_ni->pni_bulk_get_source_eq_h;
-
-        return ptlrpc_register_bulk_shared(desc);
-}
-
-int ptlrpc_register_bulk_put(struct ptlrpc_bulk_desc *desc)
-{
-        desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_IOV;
-        desc->bd_md.eventq = 
-                desc->bd_connection->c_peer.peer_ni->pni_bulk_put_sink_eq_h;
-
-        return ptlrpc_register_bulk_shared(desc);
-}
-
-int ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc)
-{
-        int rc1, rc2;
-        /* This should be safe: these handles are initialized to be
-         * invalid in ptlrpc_prep_bulk() */
-        rc1 = PtlMDUnlink(desc->bd_md_h);
-        if (rc1 != PTL_OK)
-                CERROR("PtlMDUnlink: %d\n", rc1);
-        rc2 = PtlMEUnlink(desc->bd_me_h);
-        if (rc2 != PTL_OK)
-                CERROR("PtlMEUnlink: %d\n", rc2);
-
-        return rc1 ? rc1 : rc2;
-}
-
-void obd_brw_set_addref(struct obd_brw_set *set)
-{
-        atomic_inc(&set->brw_refcount);
-}
-
-void obd_brw_set_add(struct obd_brw_set *set, struct ptlrpc_bulk_desc *desc)
-{
-        LASSERT(list_empty(&desc->bd_set_chain));
-
-        ptlrpc_bulk_addref(desc);
-        atomic_inc(&set->brw_desc_count);
-        desc->bd_brw_set = set;
-        list_add(&desc->bd_set_chain, &set->brw_desc_head);
-}
-
-void obd_brw_set_del(struct ptlrpc_bulk_desc *desc)
-{
-        atomic_dec(&desc->bd_brw_set->brw_desc_count);
-        list_del_init(&desc->bd_set_chain);
-        ptlrpc_bulk_decref(desc);
+        RETURN(rc);
 }
 
-struct obd_brw_set *obd_brw_set_new(void)
+void ptlrpc_unregister_bulk (struct ptlrpc_request *req)
 {
-        struct obd_brw_set *set;
-
-        OBD_ALLOC(set, sizeof(*set));
-
-        if (set != NULL) {
-                init_waitqueue_head(&set->brw_waitq);
-                INIT_LIST_HEAD(&set->brw_desc_head);
-                atomic_set(&set->brw_refcount, 1);
-                atomic_set(&set->brw_desc_count, 0);
+        /* Disconnect a bulk desc from the network. Idempotent. Not
+         * thread-safe (i.e. only interlocks with completion callback). */
+        struct ptlrpc_bulk_desc *desc = req->rq_bulk;
+        wait_queue_head_t       *wq;
+        unsigned long            flags;
+        struct l_wait_info       lwi;
+        int                      rc;
+
+        LASSERT (!in_interrupt ());             /* might sleep */
+
+        spin_lock_irqsave (&desc->bd_lock, flags);
+        if (!desc->bd_network_rw) {     /* completed or never even registered */
+                spin_unlock_irqrestore (&desc->bd_lock, flags);
+                return;
         }
-
-        return set;
-}
-
-static void obd_brw_set_free(struct obd_brw_set *set)
-{
-        struct list_head *tmp, *next;
-        ENTRY;
-
-        list_for_each_safe(tmp, next, &set->brw_desc_head) {
-                struct ptlrpc_bulk_desc *desc =
-                        list_entry(tmp, struct ptlrpc_bulk_desc, bd_set_chain);
-
-                CERROR("Unfinished bulk descriptor: %p\n", desc);
-
-                ptlrpc_abort_bulk(desc);
+        spin_unlock_irqrestore (&desc->bd_lock, flags);
+
+        LASSERT (desc->bd_req == req);     /* NB bd_req NULL until registered */
+
+        /* NB...
+         * 1. If the MD unlink is successful, the ME gets unlinked too.
+         * 2. Client-side bulk only gets a single event and has a
+         * .. threshold of 1, so if the MD was in use at the first unlink
+         * .. attempt, the callback is due any minute, and the MD/ME will
+         * .. unlink themselves.
+         */
+        rc = PtlMDUnlink (desc->bd_md_h);
+        switch (rc) {
+        default:
+                CERROR("PtlMDUnlink returned %d\n", rc);
+                LBUG ();
+        case PTL_OK:                          /* Won the race with completion */
+                LASSERT (!desc->bd_complete);   /* Callback hasn't happened */
+                desc->bd_network_rw = 0;
+                return;
+        case PTL_MD_INUSE:                  /* MD is being accessed right now */
+                for (;;) {
+                        /* Network access will complete in finite time but the
+                         * timeout lets us CERROR for visibility */
+                        if (desc->bd_req->rq_set != NULL)
+                                wq = &req->rq_set->set_waitq;
+                        else
+                                wq = &req->rq_wait_for_rep;
+                        lwi = LWI_TIMEOUT (10 * HZ, NULL, NULL);
+                        rc = l_wait_event(*wq, ptlrpc_bulk_complete(desc), &lwi);
+                        LASSERT (rc == 0 || rc == -ETIMEDOUT);
+                        if (rc == 0)
+                                break;
+                        CERROR ("Unexpectedly long timeout: desc %p\n", desc);
+                        LBUG();
+                }
+                /* Fall through */
+        case PTL_INV_MD:                     /* Lost the race with completion */
+                LASSERT (desc->bd_complete);/* Callback has run to completion */
+                LASSERT (!desc->bd_network_rw);
+                return;
         }
-        OBD_FREE(set, sizeof(*set));
-        EXIT;
-        return;
 }
 
-void obd_brw_set_decref(struct obd_brw_set *set)
+int ptlrpc_reply(struct ptlrpc_request *req)
 {
-        ENTRY;
-        if (atomic_dec_and_test(&set->brw_refcount))
-                obd_brw_set_free(set);
-        EXIT;
-}
+        unsigned long flags;
+        int rc;
 
-int ptlrpc_reply(struct ptlrpc_service *svc, struct ptlrpc_request *req)
-{
-        if (req->rq_repmsg == NULL) {
-                CERROR("bad: someone called ptlrpc_reply when they meant "
-                       "ptlrpc_error\n");
-                return -EINVAL;
-        }
+        /* We must already have a reply buffer (only ptlrpc_error() may be
+         * called without one).  We must also have a request buffer which
+         * is either the actual (swabbed) incoming request, or a saved copy
+         * if this is a req saved in target_queue_final_reply(). */
+        LASSERT (req->rq_repmsg != NULL);
+        LASSERT (req->rq_reqmsg != NULL);
 
         /* FIXME: we need to increment the count of handled events */
         if (req->rq_type != PTL_RPC_MSG_ERR)
                 req->rq_type = PTL_RPC_MSG_REPLY;
-        //req->rq_repmsg->conn = req->rq_connection->c_remote_conn;
-        //req->rq_repmsg->token = req->rq_connection->c_remote_token;
-        req->rq_repmsg->status = HTON__u32(req->rq_status);
-        return ptl_send_buf(req, req->rq_connection, svc->srv_rep_portal);
+
+        req->rq_repmsg->status = req->rq_status;
+        req->rq_repmsg->opc = req->rq_reqmsg->opc;
+
+        init_waitqueue_head(&req->rq_wait_for_rep);
+        rc = ptl_send_buf(req, req->rq_connection, req->rq_svc->srv_rep_portal);
+        if (rc != 0) {
+                /* Do what the callback handler would have done */
+                OBD_FREE (req->rq_repmsg, req->rq_replen);
+
+                spin_lock_irqsave (&req->rq_lock, flags);
+                req->rq_want_ack = 0;
+                spin_unlock_irqrestore (&req->rq_lock, flags);
+        }
+        return rc;
 }
 
-int ptlrpc_error(struct ptlrpc_service *svc, struct ptlrpc_request *req)
+int ptlrpc_error(struct ptlrpc_request *req)
 {
         int rc;
         ENTRY;
@@ -510,94 +588,108 @@ int ptlrpc_error(struct ptlrpc_service *svc, struct ptlrpc_request *req)
 
         req->rq_type = PTL_RPC_MSG_ERR;
 
-        rc = ptlrpc_reply(svc, req);
+        rc = ptlrpc_reply(req);
         RETURN(rc);
 }
 
 int ptl_send_rpc(struct ptlrpc_request *request)
 {
         int rc;
-        char *repbuf;
+        int rc2;
+        unsigned long flags;
         ptl_process_id_t source_id;
-
+        ptl_handle_me_t  reply_me_h;
         ENTRY;
 
-        if (request->rq_type != PTL_RPC_MSG_REQUEST) {
-                CERROR("wrong packet type sent %d\n",
-                       NTOH__u32(request->rq_reqmsg->type));
-                LBUG();
-                RETURN(EINVAL);
+        LASSERT (request->rq_type == PTL_RPC_MSG_REQUEST);
+
+        /* If this is a re-transmit, we're required to have disengaged
+         * cleanly from the previous attempt */
+        LASSERT (!request->rq_receiving_reply);
+
+        if (request->rq_bulk != NULL) {
+                rc = ptlrpc_register_bulk (request);
+                if (rc != 0)
+                        RETURN(rc);
         }
 
+        request->rq_reqmsg->handle = request->rq_import->imp_remote_handle;
+
         source_id.nid = request->rq_connection->c_peer.peer_nid;
         source_id.pid = PTL_PID_ANY;
 
-        /* add a ref, which will be balanced in request_out_callback */
-        ptlrpc_request_addref(request);
-        if (request->rq_replen != 0) {
-                if (request->rq_reply_md.start != NULL) {
-                        rc = PtlMEUnlink(request->rq_reply_me_h);
-                        if (rc != PTL_OK && rc != PTL_INV_ME) {
-                                CERROR("rc %d\n", rc);
-                                LBUG();
-                        }
-                        repbuf = (char *)request->rq_reply_md.start;
-                        request->rq_repmsg = NULL;
-                } else {
-                        OBD_ALLOC(repbuf, request->rq_replen);
-                        if (!repbuf) {
-                                LBUG();
-                                RETURN(ENOMEM);
-                        }
-                }
+        LASSERT (request->rq_replen != 0);
+        OBD_ALLOC(request->rq_repmsg, request->rq_replen);
+        if (request->rq_repmsg == NULL) {
+                LBUG();
+                RETURN(-ENOMEM);
+        }
 
-                rc = PtlMEAttach(request->rq_connection->c_peer.peer_ni->pni_ni_h,
-                             request->rq_reply_portal,/* XXX FIXME bug 625069 */
-                                 source_id, request->rq_xid, 0, PTL_UNLINK,
-                                 PTL_INS_AFTER, &request->rq_reply_me_h);
-                if (rc != PTL_OK) {
-                        CERROR("PtlMEAttach failed: %d\n", rc);
-                        LBUG();
-                        GOTO(cleanup, rc);
-                }
+        rc = PtlMEAttach(request->rq_connection->c_peer.peer_ni->pni_ni_h,
+                         request->rq_reply_portal, /* XXX FIXME bug 249 */
+                         source_id, request->rq_xid, 0, PTL_UNLINK,
+                         PTL_INS_AFTER, &reply_me_h);
+        if (rc != PTL_OK) {
+                CERROR("PtlMEAttach failed: %d\n", rc);
+                LASSERT (rc == PTL_NOSPACE);
+                LBUG();
+                GOTO(cleanup, rc = -ENOMEM);
+        }
 
-                request->rq_reply_md.start = repbuf;
-                request->rq_reply_md.length = request->rq_replen;
-                request->rq_reply_md.threshold = 1;
-                request->rq_reply_md.options = PTL_MD_OP_PUT;
-                request->rq_reply_md.user_ptr = request;
-                request->rq_reply_md.eventq =
-                        request->rq_connection->c_peer.peer_ni->pni_reply_in_eq_h;
-
-                rc = PtlMDAttach(request->rq_reply_me_h, request->rq_reply_md,
-                                 PTL_UNLINK, NULL);
-                if (rc != PTL_OK) {
-                        CERROR("PtlMDAttach failed: %d\n", rc);
-                        LBUG();
-                        GOTO(cleanup2, rc);
-                }
+        request->rq_reply_md.start = request->rq_repmsg;
+        request->rq_reply_md.length = request->rq_replen;
+        request->rq_reply_md.threshold = 1;
+        request->rq_reply_md.options = PTL_MD_OP_PUT;
+        request->rq_reply_md.user_ptr = request;
+        request->rq_reply_md.eventq =
+                request->rq_connection->c_peer.peer_ni->pni_reply_in_eq_h;
 
-                CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64
-                       ", portal %u on %s\n",
-                       request->rq_replen, request->rq_xid,
-                       request->rq_reply_portal,
-                       request->rq_connection->c_peer.peer_ni->pni_name);
+        rc = PtlMDAttach(reply_me_h, request->rq_reply_md,
+                         PTL_UNLINK, &request->rq_reply_md_h);
+        if (rc != PTL_OK) {
+                CERROR("PtlMDAttach failed: %d\n", rc);
+                LASSERT (rc == PTL_NOSPACE);
+                LBUG();
+                GOTO(cleanup2, rc = -ENOMEM);
         }
 
-        /* Clear any flags that may be present from previous sends,
-         * except for REPLAY, NO_RESEND and WANT_ACK. */
-        request->rq_flags &= (PTL_RPC_FL_REPLAY | PTL_RPC_FL_NO_RESEND |
-                              PTL_RPC_FL_WANT_ACK);
+        CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64
+               ", portal %u on %s\n",
+               request->rq_replen, request->rq_xid,
+               request->rq_reply_portal,
+               request->rq_connection->c_peer.peer_ni->pni_name);
+
+        ptlrpc_request_addref(request);        /* 1 ref for the SENT callback */
+
+        spin_lock_irqsave (&request->rq_lock, flags);
+        request->rq_receiving_reply = 1;
+        /* Clear any flags that may be present from previous sends. */
+        request->rq_replied = 0;
+        request->rq_err = 0;
+        request->rq_timedout = 0;
+        request->rq_resend = 0;
+        request->rq_restart = 0;
+        spin_unlock_irqrestore (&request->rq_lock, flags);
+
+        request->rq_sent = LTIME_S(CURRENT_TIME);
         rc = ptl_send_buf(request, request->rq_connection,
                           request->rq_request_portal);
-        RETURN(rc);
+        if (rc == 0)
+                RETURN(rc);
 
+        spin_lock_irqsave (&request->rq_lock, flags);
+        request->rq_receiving_reply = 0;
+        spin_unlock_irqrestore (&request->rq_lock, flags);
+        ptlrpc_req_finished (request);          /* drop callback ref */
  cleanup2:
-        PtlMEUnlink(request->rq_reply_me_h);
+        /* MEUnlink is safe; the PUT didn't even get off the ground, and
+         * nobody apart from the PUT's target has the right nid+XID to
+         * access the reply buffer. */
+        rc2 = PtlMEUnlink(reply_me_h);
+        LASSERT (rc2 == PTL_OK);
  cleanup:
-        OBD_FREE(repbuf, request->rq_replen);
-        // up(&request->rq_client->cli_rpc_sem);
-
+        OBD_FREE(request->rq_repmsg, request->rq_replen);
+        request->rq_repmsg = NULL;
         return rc;
 }
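
ptl_send_rpc() uses the classic labelled-cleanup ladder: each failure point jumps to a label that undoes exactly the steps that already completed, while on success the reply buffer stays posted. A standalone sketch of that control flow, with malloc and trivial stubs in place of the Portals calls (all helper names are hypothetical):

#include <stdio.h>
#include <stdlib.h>

static int  attach_reply_buffer(void *buf) { return buf ? 0 : -1; }
static void detach_reply_buffer(void *buf) { (void)buf; }
static int  send_request(void)             { return 0; }

static int send_rpc(size_t replen, void **repmsg_out)
{
        int rc;
        void *repmsg = malloc(replen);    /* stands in for allocating rq_repmsg */

        if (repmsg == NULL)
                return -1;                /* nothing to unwind yet */

        rc = attach_reply_buffer(repmsg); /* stands in for attaching the reply ME/MD */
        if (rc != 0)
                goto cleanup;             /* only the allocation to undo */

        rc = send_request();              /* stands in for sending the request */
        if (rc != 0)
                goto cleanup2;            /* undo the attach, then the allocation */

        *repmsg_out = repmsg;             /* success: reply buffer stays posted */
        return 0;

 cleanup2:
        detach_reply_buffer(repmsg);
 cleanup:
        free(repmsg);
        return rc;
}

int main(void)
{
        void *reply = NULL;
        int rc = send_rpc(128, &reply);

        printf("rc=%d\n", rc);
        free(reply);
        return 0;
}
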
 
@@ -612,10 +704,10 @@ void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd)
 
         LASSERT(atomic_read(&rqbd->rqbd_refcount) == 0);
 
-        CDEBUG(D_NET, "PtlMEAttach: portal %d on %s h %lx.%lx\n",
+        CDEBUG(D_NET, "PtlMEAttach: portal %d on %s h %lx."LPX64"\n",
                service->srv_req_portal, srv_ni->sni_ni->pni_name,
                srv_ni->sni_ni->pni_ni_h.nal_idx,
-               srv_ni->sni_ni->pni_ni_h.handle_idx);
+               srv_ni->sni_ni->pni_ni_h.cookie);
 
         /* Attach the leading ME on which we build the ring */
         rc = PtlMEAttach(srv_ni->sni_ni->pni_ni_h, service->srv_req_portal,
@@ -623,6 +715,7 @@ void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd)
                          PTL_UNLINK, PTL_INS_AFTER, &rqbd->rqbd_me_h);
         if (rc != PTL_OK) {
                 CERROR("PtlMEAttach failed: %d\n", rc);
+                /* BUG 1191 */
                 LBUG();
         }
 
@@ -640,8 +733,9 @@ void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd)
         rc = PtlMDAttach(rqbd->rqbd_me_h, dummy, PTL_UNLINK, &md_h);
         if (rc != PTL_OK) {
                 CERROR("PtlMDAttach failed: %d\n", rc);
+                LASSERT (rc == PTL_NOSPACE);
                 LBUG();
-#warning proper cleanup required
+                /* BUG 1191 */
                 PtlMEUnlink (rqbd->rqbd_me_h);
                 atomic_set(&rqbd->rqbd_refcount, 0);
                 atomic_dec(&srv_ni->sni_nrqbds_receiving);
index 12be831..3811d2a 100644 (file)
@@ -1,7 +1,10 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *  Copyright (C) 2001-2003 Cluster File Systems, Inc.
+ *   Author: Peter J. Braam <braam@clusterfs.com>
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *   Author: Eric Barton <eeb@clusterfs.com>
  *
  *   This file is part of Lustre, http://www.lustre.org.
  *
 #include <linux/obd_support.h>
 #include <linux/lustre_net.h>
 
+
+#define HDR_SIZE(count) \
+    size_round(offsetof (struct lustre_msg, buflens[(count)]))
+
 int lustre_pack_msg(int count, int *lens, char **bufs, int *len,
                     struct lustre_msg **msg)
 {
@@ -37,26 +44,30 @@ int lustre_pack_msg(int count, int *lens, char **bufs, int *len,
         struct lustre_msg *m;
         int size = 0, i;
 
+        size = HDR_SIZE (count);
         for (i = 0; i < count; i++)
                 size += size_round(lens[i]);
 
-        *len = size_round(sizeof(*m) + count * sizeof(__u32)) + size;
+        *len = size;
 
         OBD_ALLOC(*msg, *len);
         if (!*msg)
                 RETURN(-ENOMEM);
 
         m = *msg;
-        m->bufcount = HTON__u32(count);
+        m->magic = PTLRPC_MSG_MAGIC;
+        m->version = PTLRPC_MSG_VERSION;
+        m->bufcount = count;
         for (i = 0; i < count; i++)
-                m->buflens[i] = HTON__u32(lens[i]);
+                m->buflens[i] = lens[i];
 
-        ptr = (char *)m + size_round(sizeof(*m) + count * sizeof(__u32));
+        ptr = (char *)m + HDR_SIZE(count);
         for (i = 0; i < count; i++) {
                 char *tmp = NULL;
                 if (bufs)
                         tmp = bufs[i];
                 LOGL(tmp, lens[i], ptr);
+
         }
 
         return 0;
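
The new packing code sizes a message as HDR_SIZE(count) plus the 8-byte-rounded sub-buffer lengths, then lays the payload buffers out right after the buflens[] array. A standalone model of that layout, assuming a cut-down header (magic, bufcount, buflens[]) rather than the real struct lustre_msg:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define size_round(x)   (((x) + 7) & ~7UL)
#define HDR_SIZE(count) \
        size_round(offsetof(struct msg, buflens) + (count) * sizeof(uint32_t))

struct msg {
        uint32_t magic;
        uint32_t bufcount;
        uint32_t buflens[];               /* variable-length, like lustre_msg */
};

static int msg_size(int count, const int *lens)
{
        int size = HDR_SIZE(count);

        for (int i = 0; i < count; i++)
                size += size_round(lens[i]);
        return size;
}

static struct msg *pack_msg(int count, const int *lens, char **bufs, int *len)
{
        *len = msg_size(count, lens);
        struct msg *m = calloc(1, *len);
        if (m == NULL)
                return NULL;

        m->magic = 0x0BD00BD0;            /* placeholder magic */
        m->bufcount = count;

        char *ptr = (char *)m + HDR_SIZE(count);
        for (int i = 0; i < count; i++) {
                m->buflens[i] = lens[i];
                if (bufs != NULL && bufs[i] != NULL)
                        memcpy(ptr, bufs[i], lens[i]);
                ptr += size_round(lens[i]); /* keep every sub-buffer 8-byte aligned */
        }
        return m;
}

int main(void)
{
        int   lens[2] = { 5, 13 };
        char *bufs[2] = { "hello", "lustre-like!" };
        int   len;
        struct msg *m = pack_msg(2, lens, bufs, &len);

        if (m == NULL)
                return 1;
        printf("packed %d bytes, header %lu\n", len, (unsigned long)HDR_SIZE(2));
        free(m);
        return 0;
}
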
@@ -66,38 +77,84 @@ int lustre_pack_msg(int count, int *lens, char **bufs, int *len,
  * with the given sub-buffer lengths. */
 int lustre_msg_size(int count, int *lengths)
 {
-        int size = 0, i;
+        int size;
+        int i;
 
+        size = HDR_SIZE (count);
         for (i = 0; i < count; i++)
                 size += size_round(lengths[i]);
 
-        size += size_round(sizeof(struct lustre_msg) + count * sizeof(__u32));
-
         return size;
 }
 
 int lustre_unpack_msg(struct lustre_msg *m, int len)
 {
-        int required_len, i;
+        int   flipped;
+        int   required_len;
+        int   i;
         ENTRY;
 
-        required_len = size_round(sizeof(*m));
-        if (len < required_len)
-                RETURN(-EINVAL);
+        /* We can provide a slightly better error log if we check the
+         * message magic and version first.  In the future, struct
+         * lustre_msg may grow, and we'd like to log a version mismatch
+         * rather than a short message.
+         */
+        required_len = MAX (offsetof (struct lustre_msg, version) +
+                            sizeof (m->version),
+                            offsetof (struct lustre_msg, magic) +
+                            sizeof (m->magic));
+        if (len < required_len) {
+                /* can't even look inside the message */
+                CERROR ("message length %d too small for magic/version check\n",
+                        len);
+                RETURN (-EINVAL);
+        }
+
+        flipped = lustre_msg_swabbed(m);
+        if (flipped)
+                __swab32s (&m->version);
+        else if (m->magic != PTLRPC_MSG_MAGIC) {
+                CERROR("wrong lustre_msg magic %#08x\n", m->magic);
+                RETURN (-EINVAL);
+        }
 
-        m->opc = NTOH__u32(m->opc);
-        m->status = NTOH__u32(m->status);
-        m->type = NTOH__u32(m->type);
-        m->bufcount = NTOH__u32(m->bufcount);
-        m->last_xid = NTOH__u64(m->last_xid);
-        m->last_committed = NTOH__u64(m->last_committed);
+        if (m->version != PTLRPC_MSG_VERSION) {
+                CERROR("wrong lustre_msg version %#08x\n", m->version);
+                RETURN (-EINVAL);
+        }
+
+        /* Now we know the sender speaks my language (but possibly flipped)...*/
+        required_len = HDR_SIZE(0);
+        if (len < required_len) {
+                /* can't even look inside the message */
+                CERROR ("message length %d too small for lustre_msg\n", len);
+                RETURN (-EINVAL);
+        }
+
+        if (flipped) {
+                __swab32s (&m->type);
+                __swab32s (&m->opc);
+                __swab64s (&m->last_xid);
+                __swab64s (&m->last_committed);
+                __swab64s (&m->transno);
+                __swab32s (&m->status);
+                __swab32s (&m->bufcount);
+                __swab32s (&m->flags);
+        }
+
+        required_len = HDR_SIZE(m->bufcount);
 
-        required_len = size_round(sizeof(*m) + m->bufcount * sizeof(__u32));
-        if (len < required_len)
+        if (len < required_len) {
+                /* didn't receive all the buffer lengths */
+                CERROR ("message length %d too small for %d buflens\n",
+                        len, m->bufcount);
                 RETURN(-EINVAL);
+        }
 
         for (i = 0; i < m->bufcount; i++) {
-                m->buflens[i] = NTOH__u32(m->buflens[i]);
+                if (flipped)
+                        __swab32s (&m->buflens[i]);
                 required_len += size_round(m->buflens[i]);
         }
 
@@ -112,33 +169,924 @@ int lustre_unpack_msg(struct lustre_msg *m, int len)
         RETURN(0);
 }
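
lustre_unpack_msg() detects a byte-flipped sender by seeing the magic arrive byte-reversed, then swabs each header field in place. A minimal standalone sketch of that detection-and-swab scheme with a three-field header and an illustrative magic value:

#include <stdint.h>
#include <stdio.h>

#define MSG_MAGIC 0x0BD00BD0u             /* illustrative magic value */

struct msg_hdr {
        uint32_t magic;
        uint32_t version;
        uint32_t opc;
};

static uint32_t swab32(uint32_t v)
{
        return (v >> 24) | ((v >> 8) & 0xff00u) |
               ((v << 8) & 0xff0000u) | (v << 24);
}

/* The sender wrote the magic in host order; if we see it reversed, every
 * header field needs swabbing. */
static int msg_swabbed(const struct msg_hdr *m)
{
        return m->magic == swab32(MSG_MAGIC);
}

static int unpack_hdr(struct msg_hdr *m)
{
        int flipped = msg_swabbed(m);

        if (!flipped && m->magic != MSG_MAGIC)
                return -1;                /* not one of our messages */

        if (flipped) {
                m->magic   = swab32(m->magic);
                m->version = swab32(m->version);
                m->opc     = swab32(m->opc);
        }
        return 0;
}

int main(void)
{
        /* simulate a header produced by an opposite-endian peer */
        struct msg_hdr wire = { swab32(MSG_MAGIC), swab32(1), swab32(42) };

        if (unpack_hdr(&wire) == 0)
                printf("version %u opc %u\n", wire.version, wire.opc);
        return 0;
}
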
 
-void *lustre_msg_buf(struct lustre_msg *m, int n)
+void *lustre_msg_buf(struct lustre_msg *m, int n, int min_size)
 {
-        int i, offset;
+        int i;
+        int offset;
+        int buflen;
+        int bufcount;
+
+        LASSERT (m != NULL);
+        LASSERT (n >= 0);
 
-        if (!m) {
-                CERROR("no message buffer!\n");
-                LBUG();
+        bufcount = m->bufcount;
+        if (n >= bufcount) {
+                CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n",
+                       m, n, bufcount);
                 return NULL;
         }
 
-        if (n < 0 || n >= m->bufcount) {
-                CERROR("referencing bad sub buffer in %p (want %d, count "
-                       "%d)!\n", m, n, m->bufcount);
-                LBUG();
+        buflen = m->buflens[n];
+        if (buflen == 0) {
+                CERROR("msg %p buffer[%d] is zero length\n", m, n);
                 return NULL;
         }
 
-        if (m->buflens[n] == 0) {
-                CERROR("zero-length buffer requested for buffer %d in %p\n",
-                       n, m);
+        if (buflen < min_size) {
+                CERROR("msg %p buffer[%d] size %d too small (required %d)\n",
+                        m, n, buflen, min_size);
                 return NULL;
         }
 
-        offset = size_round(sizeof(*m) + m->bufcount * sizeof(__u32));
-
+        offset = HDR_SIZE(bufcount);
         for (i = 0; i < n; i++)
                 offset += size_round(m->buflens[i]);
 
         return (char *)m + offset;
 }
+
+char *lustre_msg_string (struct lustre_msg *m, int index, int max_len)
+{
+        /* max_len == 0 means the string should fill the buffer */
+        char *str = lustre_msg_buf (m, index, 0);
+        int   slen;
+        int   blen;
+
+        if (str == NULL) {
+                CERROR ("can't unpack string in msg %p buffer[%d]\n", m, index);
+                return (NULL);
+        }
+
+        blen = m->buflens[index];
+        slen = strnlen (str, blen);
+
+        if (slen == blen) {                     /* not NULL terminated */
+                CERROR ("can't unpack non-NULL terminated string in "
+                        "msg %p buffer[%d] len %d\n", m, index, blen);
+                return (NULL);
+        }
+
+        if (max_len == 0) {
+                if (slen != blen - 1) {
+                        CERROR ("can't unpack short string in msg %p "
+                                "buffer[%d] len %d: strlen %d\n",
+                                m, index, blen, slen);
+                        return (NULL);
+                }
+        } else if (slen > max_len) {
+                CERROR ("can't unpack oversized string in msg %p "
+                        "buffer[%d] len %d strlen %d: max %d expected\n",
+                        m, index, blen, slen, max_len);
+                return (NULL);
+        }
+
+        return (str);
+}
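A caller expecting a NUL-terminated name would typically pass the buffer index and a sane upper bound; a minimal sketch (the index, bound and error code here are illustrative, not taken from a real caller):

        char *name = lustre_msg_string(req->rq_reqmsg, 1, NAME_MAX);
        if (name == NULL)          /* absent, unterminated or oversized */
                RETURN(-EFAULT);
        /* passing max_len == 0 instead demands that the string exactly
         * fill its buffer (strlen == buflen - 1) */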
+
+/* Wrap up the normal fixed length case */
+void *lustre_swab_reqbuf (struct ptlrpc_request *req, int index, int min_size,
+                          void *swabber)
+{
+        void *ptr;
+
+        LASSERT_REQSWAB (req, index);
+
+        ptr = lustre_msg_buf(req->rq_reqmsg, index, min_size);
+        if (ptr == NULL)
+                return (NULL);
+
+        if (swabber != NULL &&
+            lustre_msg_swabbed (req->rq_reqmsg))
+                ((void (*)(void *))swabber)(ptr);
+
+        return (ptr);
+}
+
+/* Wrap up the normal fixed length case */
+void *lustre_swab_repbuf (struct ptlrpc_request *req, int index, int min_size,
+                          void *swabber)
+{
+        void *ptr;
+
+        LASSERT_REPSWAB (req, index);
+
+        ptr = lustre_msg_buf (req->rq_repmsg, index, min_size);
+        if (ptr == NULL)
+                return (NULL);
+
+        if (swabber != NULL &&
+            lustre_msg_swabbed (req->rq_repmsg))
+                ((void (*)(void *))swabber)(ptr);
+
+        return (ptr);
+}
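A request handler would normally combine the buffer lookup and the conditional byte-swap in one call, passing the swab routine that matches the buffer's wire type; a minimal sketch (buffer index and error handling are illustrative):

        struct mds_body *body;

        body = lustre_swab_reqbuf(req, 0, sizeof(*body),
                                  lustre_swab_mds_body);
        if (body == NULL)
                RETURN(-EFAULT);
        /* body is now in host byte order regardless of the sender's
         * endianness; the swabber only runs when lustre_msg_swabbed()
         * reports that the magic arrived flipped */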
+
+/* byte flipping routines for all wire types declared in
+ * lustre_idl.h implemented here.
+ */
+
+void lustre_swab_obdo (struct obdo  *o)
+{
+        __swab64s (&o->o_id);
+        __swab64s (&o->o_gr);
+        __swab64s (&o->o_atime);
+        __swab64s (&o->o_mtime);
+        __swab64s (&o->o_ctime);
+        __swab64s (&o->o_size);
+        __swab64s (&o->o_blocks);
+        __swab64s (&o->o_rdev);
+        __swab32s (&o->o_blksize);
+        __swab32s (&o->o_mode);
+        __swab32s (&o->o_uid);
+        __swab32s (&o->o_gid);
+        __swab32s (&o->o_flags);
+        __swab32s (&o->o_nlink);
+        __swab32s (&o->o_generation);
+        __swab32s (&o->o_valid);
+        __swab32s (&o->o_obdflags);
+        __swab32s (&o->o_easize);
+        /* o_inline is opaque */
+}
+
+void lustre_swab_obd_statfs (struct obd_statfs *os)
+{
+        __swab64s (&os->os_type);
+        __swab64s (&os->os_blocks);
+        __swab64s (&os->os_bfree);
+        __swab64s (&os->os_bavail);
+        __swab64s (&os->os_ffree);
+        /* no need to swap os_fsid */
+        __swab32s (&os->os_bsize);
+        __swab32s (&os->os_namelen);
+        /* no need to swap os_spare */
+}
+
+void lustre_swab_obd_ioobj (struct obd_ioobj *ioo)
+{
+        __swab64s (&ioo->ioo_id);
+        __swab64s (&ioo->ioo_gr);
+        __swab32s (&ioo->ioo_type);
+        __swab32s (&ioo->ioo_bufcnt);
+}
+
+void lustre_swab_niobuf_remote (struct niobuf_remote *nbr)
+{
+        __swab64s (&nbr->offset);
+        __swab32s (&nbr->len);
+        __swab32s (&nbr->flags);
+}
+
+void lustre_swab_ost_body (struct ost_body *b)
+{
+        lustre_swab_obdo (&b->oa);
+}
+
+void lustre_swab_ll_fid (struct ll_fid *fid)
+{
+        __swab64s (&fid->id);
+        __swab32s (&fid->generation);
+        __swab32s (&fid->f_type);
+}
+
+void lustre_swab_mds_status_req (struct mds_status_req *r)
+{
+        __swab32s (&r->flags);
+        __swab32s (&r->repbuf);
+}
+
+void lustre_swab_mds_fileh_body (struct mds_fileh_body *f)
+{
+        lustre_swab_ll_fid (&f->f_fid);
+}
+
+void lustre_swab_mds_body (struct mds_body *b)
+{
+        lustre_swab_ll_fid (&b->fid1);
+        lustre_swab_ll_fid (&b->fid2);
+        /* handle is opaque */
+        __swab64s (&b->size);
+        __swab64s (&b->blocks);
+        __swab32s (&b->ino);
+        __swab32s (&b->valid);
+        __swab32s (&b->fsuid);
+        __swab32s (&b->fsgid);
+        __swab32s (&b->capability);
+        __swab32s (&b->mode);
+        __swab32s (&b->uid);
+        __swab32s (&b->gid);
+        __swab32s (&b->mtime);
+        __swab32s (&b->ctime);
+        __swab32s (&b->atime);
+        __swab32s (&b->flags);
+        __swab32s (&b->rdev);
+        __swab32s (&b->nlink);
+        __swab32s (&b->generation);
+        __swab32s (&b->suppgid);
+        __swab32s (&b->eadatasize);
+}
+
+void lustre_swab_mds_rec_setattr (struct mds_rec_setattr *sa)
+{
+        __swab32s (&sa->sa_opcode);
+        __swab32s (&sa->sa_fsuid);
+        __swab32s (&sa->sa_fsgid);
+        __swab32s (&sa->sa_cap);
+        __swab32s (&sa->sa_reserved);
+        __swab32s (&sa->sa_valid);
+        lustre_swab_ll_fid (&sa->sa_fid);
+        __swab32s (&sa->sa_mode);
+        __swab32s (&sa->sa_uid);
+        __swab32s (&sa->sa_gid);
+        __swab32s (&sa->sa_attr_flags);
+        __swab64s (&sa->sa_size);
+        __swab64s (&sa->sa_atime);
+        __swab64s (&sa->sa_mtime);
+        __swab64s (&sa->sa_ctime);
+        __swab32s (&sa->sa_suppgid);
+}
+
+void lustre_swab_mds_rec_create (struct mds_rec_create *cr)
+{
+        __swab32s (&cr->cr_opcode);
+        __swab32s (&cr->cr_fsuid);
+        __swab32s (&cr->cr_fsgid);
+        __swab32s (&cr->cr_cap);
+        __swab32s (&cr->cr_flags); /* for use with open */
+        __swab32s (&cr->cr_mode);
+        lustre_swab_ll_fid (&cr->cr_fid);
+        lustre_swab_ll_fid (&cr->cr_replayfid);
+        __swab32s (&cr->cr_uid);
+        __swab32s (&cr->cr_gid);
+        __swab64s (&cr->cr_time);
+        __swab64s (&cr->cr_rdev);
+        __swab32s (&cr->cr_suppgid);
+}
+
+void lustre_swab_mds_rec_link (struct mds_rec_link *lk)
+{
+        __swab32s (&lk->lk_opcode);
+        __swab32s (&lk->lk_fsuid);
+        __swab32s (&lk->lk_fsgid);
+        __swab32s (&lk->lk_cap);
+        __swab32s (&lk->lk_suppgid1);
+        __swab32s (&lk->lk_suppgid2);
+        lustre_swab_ll_fid (&lk->lk_fid1);
+        lustre_swab_ll_fid (&lk->lk_fid2);
+}
+
+void lustre_swab_mds_rec_unlink (struct mds_rec_unlink *ul)
+{
+        __swab32s (&ul->ul_opcode);
+        __swab32s (&ul->ul_fsuid);
+        __swab32s (&ul->ul_fsgid);
+        __swab32s (&ul->ul_cap);
+        __swab32s (&ul->ul_reserved);
+        __swab32s (&ul->ul_mode);
+        __swab32s (&ul->ul_suppgid);
+        lustre_swab_ll_fid (&ul->ul_fid1);
+        lustre_swab_ll_fid (&ul->ul_fid2);
+}
+
+void lustre_swab_mds_rec_rename (struct mds_rec_rename *rn)
+{
+        __swab32s (&rn->rn_opcode);
+        __swab32s (&rn->rn_fsuid);
+        __swab32s (&rn->rn_fsgid);
+        __swab32s (&rn->rn_cap);
+        __swab32s (&rn->rn_suppgid1);
+        __swab32s (&rn->rn_suppgid2);
+        lustre_swab_ll_fid (&rn->rn_fid1);
+        lustre_swab_ll_fid (&rn->rn_fid2);
+}
+
+void lustre_swab_lov_desc (struct lov_desc *ld)
+{
+        __swab32s (&ld->ld_tgt_count);
+        __swab32s (&ld->ld_active_tgt_count);
+        __swab32s (&ld->ld_default_stripe_count);
+        __swab64s (&ld->ld_default_stripe_size);
+        __swab64s (&ld->ld_default_stripe_offset);
+        __swab32s (&ld->ld_pattern);
+        /* uuid endian insensitive */
+}
+
+void lustre_swab_ldlm_res_id (struct ldlm_res_id *id)
+{
+        int  i;
+
+        for (i = 0; i < RES_NAME_SIZE; i++)
+                __swab64s (&id->name[i]);
+}
+
+void lustre_swab_ldlm_extent (struct ldlm_extent *e)
+{
+        __swab64s (&e->start);
+        __swab64s (&e->end);
+}
+
+void lustre_swab_ldlm_intent (struct ldlm_intent *i)
+{
+        __swab64s (&i->opc);
+}
+
+void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r)
+{
+        int   i;
+
+        __swab32s (&r->lr_type);
+        lustre_swab_ldlm_res_id (&r->lr_name);
+        for (i = 0; i < RES_VERSION_SIZE; i++)
+                __swab32s (&r->lr_version[i]);
+}
+
+void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l)
+{
+        int   i;
+
+        lustre_swab_ldlm_resource_desc (&l->l_resource);
+        __swab32s (&l->l_req_mode);
+        __swab32s (&l->l_granted_mode);
+        lustre_swab_ldlm_extent (&l->l_extent);
+        for (i = 0; i < RES_VERSION_SIZE; i++)
+                __swab32s (&l->l_version[i]);
+}
+
+void lustre_swab_ldlm_request (struct ldlm_request *rq)
+{
+        __swab32s (&rq->lock_flags);
+        lustre_swab_ldlm_lock_desc (&rq->lock_desc);
+        /* lock_handle1 opaque */
+        /* lock_handle2 opaque */
+}
+
+void lustre_swab_ldlm_reply (struct ldlm_reply *r)
+{
+        __swab32s (&r->lock_flags);
+        __swab32s (&r->lock_mode);
+        lustre_swab_ldlm_res_id (&r->lock_resource_name);
+        /* lock_handle opaque */
+        lustre_swab_ldlm_extent (&r->lock_extent);
+        __swab64s (&r->lock_policy_res1);
+        __swab64s (&r->lock_policy_res2);
+}
+
+void lustre_swab_ptlbd_op (struct ptlbd_op *op)
+{
+        __swab16s (&op->op_cmd);
+        __swab16s (&op->op_lun);
+        __swab16s (&op->op_niob_cnt);
+        /* ignore op__padding */
+        __swab32s (&op->op_block_cnt);
+}
+
+void lustre_swab_ptlbd_niob (struct ptlbd_niob *n)
+{
+        __swab64s (&n->n_xid);
+        __swab64s (&n->n_block_nr);
+        __swab32s (&n->n_offset);
+        __swab32s (&n->n_length);
+}
+
+void lustre_swab_ptlbd_rsp (struct ptlbd_rsp *r)
+{
+        __swab16s (&r->r_status);
+        __swab16s (&r->r_error_cnt);
+}
+
+void lustre_assert_wire_constants (void)
+{
+#if BUG_1343
+        /* Wire protocol assertions generated by 'wirecheck' */
+
+        /* Constants... */
+        LASSERT (PTLRPC_MSG_MAGIC == 0x0BD00BD0);
+        LASSERT (PTLRPC_MSG_VERSION == 0x00040002);
+        LASSERT (PTL_RPC_MSG_REQUEST == 4711);
+        LASSERT (PTL_RPC_MSG_ERR == 4712);
+        LASSERT (PTL_RPC_MSG_REPLY == 4713);
+        LASSERT (MSG_LAST_REPLAY == 1);
+        LASSERT (MSG_RESENT == 2);
+        LASSERT (MSG_CONNECT_RECOVERING == 1);
+        LASSERT (MSG_CONNECT_RECONNECT == 2);
+        LASSERT (MSG_CONNECT_REPLAYABLE == 4);
+        LASSERT (OST_REPLY == 0);
+        LASSERT (OST_GETATTR == 1);
+        LASSERT (OST_SETATTR == 2);
+        LASSERT (OST_READ == 3);
+        LASSERT (OST_WRITE == 4);
+        LASSERT (OST_CREATE == 5);
+        LASSERT (OST_DESTROY == 6);
+        LASSERT (OST_GET_INFO == 7);
+        LASSERT (OST_CONNECT == 8);
+        LASSERT (OST_DISCONNECT == 9);
+        LASSERT (OST_PUNCH == 10);
+        LASSERT (OST_OPEN == 11);
+        LASSERT (OST_CLOSE == 12);
+        LASSERT (OST_STATFS == 13);
+        LASSERT (OST_SAN_READ == 14);
+        LASSERT (OST_SAN_WRITE == 15);
+        LASSERT (OST_SYNCFS == 16);
+        LASSERT (OST_LAST_OPC == 17);
+        LASSERT (OST_FIRST_OPC == 0);
+        LASSERT (OBD_FL_INLINEDATA == 1);
+        LASSERT (OBD_FL_OBDMDEXISTS == 2);
+        LASSERT (LOV_MAGIC == 198183888);
+        LASSERT (OBD_MD_FLALL == -1);
+        LASSERT (OBD_MD_FLID == 1);
+        LASSERT (OBD_MD_FLATIME == 2);
+        LASSERT (OBD_MD_FLMTIME == 4);
+        LASSERT (OBD_MD_FLCTIME == 8);
+        LASSERT (OBD_MD_FLSIZE == 16);
+        LASSERT (OBD_MD_FLBLOCKS == 32);
+        LASSERT (OBD_MD_FLBLKSZ == 64);
+        LASSERT (OBD_MD_FLMODE == 128);
+        LASSERT (OBD_MD_FLTYPE == 256);
+        LASSERT (OBD_MD_FLUID == 512);
+        LASSERT (OBD_MD_FLGID == 1024);
+        LASSERT (OBD_MD_FLFLAGS == 2048);
+        LASSERT (OBD_MD_FLOBDFLG == 4096);
+        LASSERT (OBD_MD_FLNLINK == 8192);
+        LASSERT (OBD_MD_FLGENER == 16384);
+        LASSERT (OBD_MD_FLINLINE == 32768);
+        LASSERT (OBD_MD_FLRDEV == 65536);
+        LASSERT (OBD_MD_FLEASIZE == 131072);
+        LASSERT (OBD_MD_LINKNAME == 262144);
+        LASSERT (OBD_MD_FLHANDLE == 524288);
+        LASSERT (OBD_MD_FLCKSUM == 1048576);
+        LASSERT (OBD_BRW_READ == 1);
+        LASSERT (OBD_BRW_WRITE == 2);
+        LASSERT (OBD_BRW_CREATE == 4);
+        LASSERT (OBD_BRW_SYNC == 8);
+        LASSERT (OBD_OBJECT_EOF == 0xffffffffffffffffULL);
+        LASSERT (OST_REQ_HAS_OA1 == 1);
+        LASSERT (MDS_GETATTR == 33);
+        LASSERT (MDS_GETATTR_NAME == 34);
+        LASSERT (MDS_CLOSE == 35);
+        LASSERT (MDS_REINT == 36);
+        LASSERT (MDS_READPAGE == 37);
+        LASSERT (MDS_CONNECT == 38);
+        LASSERT (MDS_DISCONNECT == 39);
+        LASSERT (MDS_GETSTATUS == 40);
+        LASSERT (MDS_STATFS == 41);
+        LASSERT (MDS_GETLOVINFO == 42);
+        LASSERT (MDS_LAST_OPC == 43);
+        LASSERT (MDS_FIRST_OPC == 33);
+        LASSERT (REINT_SETATTR == 1);
+        LASSERT (REINT_CREATE == 2);
+        LASSERT (REINT_LINK == 3);
+        LASSERT (REINT_UNLINK == 4);
+        LASSERT (REINT_RENAME == 5);
+        LASSERT (REINT_OPEN == 6);
+        LASSERT (REINT_MAX == 6);
+        LASSERT (IT_INTENT_EXEC == 1);
+        LASSERT (IT_OPEN_LOOKUP == 2);
+        LASSERT (IT_OPEN_NEG == 4);
+        LASSERT (IT_OPEN_POS == 8);
+        LASSERT (IT_OPEN_CREATE == 16);
+        LASSERT (IT_OPEN_OPEN == 32);
+        LASSERT (MDS_STATUS_CONN == 1);
+        LASSERT (MDS_STATUS_LOV == 2);
+        LASSERT (MDS_OPEN_HAS_EA == 1);
+        LASSERT (LOV_RAID0 == 0);
+        LASSERT (LOV_RAIDRR == 1);
+        LASSERT (LDLM_ENQUEUE == 101);
+        LASSERT (LDLM_CONVERT == 102);
+        LASSERT (LDLM_CANCEL == 103);
+        LASSERT (LDLM_BL_CALLBACK == 104);
+        LASSERT (LDLM_CP_CALLBACK == 105);
+        LASSERT (LDLM_LAST_OPC == 106);
+        LASSERT (LDLM_FIRST_OPC == 101);
+        LASSERT (PTLBD_QUERY == 200);
+        LASSERT (PTLBD_READ == 201);
+        LASSERT (PTLBD_WRITE == 202);
+        LASSERT (PTLBD_FLUSH == 203);
+        LASSERT (PTLBD_CONNECT == 204);
+        LASSERT (PTLBD_DISCONNECT == 205);
+        LASSERT (PTLBD_LAST_OPC == 204);
+        LASSERT (PTLBD_FIRST_OPC == 200);
+        LASSERT (OBD_PING == 400);
+        /* Sizes and Offsets */
+
+
+        /* Checks for struct lustre_handle */
+        LASSERT (sizeof (struct lustre_handle) == 8);
+        LASSERT (offsetof (struct lustre_handle, cookie) == 0);
+        LASSERT (sizeof (((struct lustre_handle *)0)->cookie) == 8);
+
+        /* Checks for struct lustre_msg */
+        LASSERT (sizeof (struct lustre_msg) == 60);
+        LASSERT (offsetof (struct lustre_msg, handle) == 0);
+        LASSERT (sizeof (((struct lustre_msg *)0)->handle) == 8);
+        LASSERT (offsetof (struct lustre_msg, magic) == 8);
+        LASSERT (sizeof (((struct lustre_msg *)0)->magic) == 4);
+        LASSERT (offsetof (struct lustre_msg, type) == 12);
+        LASSERT (sizeof (((struct lustre_msg *)0)->type) == 4);
+        LASSERT (offsetof (struct lustre_msg, version) == 16);
+        LASSERT (sizeof (((struct lustre_msg *)0)->version) == 4);
+        LASSERT (offsetof (struct lustre_msg, opc) == 20);
+        LASSERT (sizeof (((struct lustre_msg *)0)->opc) == 4);
+        LASSERT (offsetof (struct lustre_msg, last_xid) == 24);
+        LASSERT (sizeof (((struct lustre_msg *)0)->last_xid) == 8);
+        LASSERT (offsetof (struct lustre_msg, last_committed) == 32);
+        LASSERT (sizeof (((struct lustre_msg *)0)->last_committed) == 8);
+        LASSERT (offsetof (struct lustre_msg, transno) == 40);
+        LASSERT (sizeof (((struct lustre_msg *)0)->transno) == 8);
+        LASSERT (offsetof (struct lustre_msg, status) == 48);
+        LASSERT (sizeof (((struct lustre_msg *)0)->status) == 4);
+        LASSERT (offsetof (struct lustre_msg, flags) == 52);
+        LASSERT (sizeof (((struct lustre_msg *)0)->flags) == 4);
+        LASSERT (offsetof (struct lustre_msg, bufcount) == 56);
+        LASSERT (sizeof (((struct lustre_msg *)0)->bufcount) == 4);
+        LASSERT (offsetof (struct lustre_msg, buflens[7]) == 88);
+        LASSERT (sizeof (((struct lustre_msg *)0)->buflens[7]) == 4);
+
+        /* Checks for struct obdo */
+        LASSERT (sizeof (struct obdo) == 164);
+        LASSERT (offsetof (struct obdo, o_id) == 0);
+        LASSERT (sizeof (((struct obdo *)0)->o_id) == 8);
+        LASSERT (offsetof (struct obdo, o_gr) == 8);
+        LASSERT (sizeof (((struct obdo *)0)->o_gr) == 8);
+        LASSERT (offsetof (struct obdo, o_atime) == 16);
+        LASSERT (sizeof (((struct obdo *)0)->o_atime) == 8);
+        LASSERT (offsetof (struct obdo, o_mtime) == 24);
+        LASSERT (sizeof (((struct obdo *)0)->o_mtime) == 8);
+        LASSERT (offsetof (struct obdo, o_ctime) == 32);
+        LASSERT (sizeof (((struct obdo *)0)->o_ctime) == 8);
+        LASSERT (offsetof (struct obdo, o_size) == 40);
+        LASSERT (sizeof (((struct obdo *)0)->o_size) == 8);
+        LASSERT (offsetof (struct obdo, o_blocks) == 48);
+        LASSERT (sizeof (((struct obdo *)0)->o_blocks) == 8);
+        LASSERT (offsetof (struct obdo, o_rdev) == 56);
+        LASSERT (sizeof (((struct obdo *)0)->o_rdev) == 8);
+        LASSERT (offsetof (struct obdo, o_blksize) == 64);
+        LASSERT (sizeof (((struct obdo *)0)->o_blksize) == 4);
+        LASSERT (offsetof (struct obdo, o_mode) == 68);
+        LASSERT (sizeof (((struct obdo *)0)->o_mode) == 4);
+        LASSERT (offsetof (struct obdo, o_uid) == 72);
+        LASSERT (sizeof (((struct obdo *)0)->o_uid) == 4);
+        LASSERT (offsetof (struct obdo, o_gid) == 76);
+        LASSERT (sizeof (((struct obdo *)0)->o_gid) == 4);
+        LASSERT (offsetof (struct obdo, o_flags) == 80);
+        LASSERT (sizeof (((struct obdo *)0)->o_flags) == 4);
+        LASSERT (offsetof (struct obdo, o_nlink) == 84);
+        LASSERT (sizeof (((struct obdo *)0)->o_nlink) == 4);
+        LASSERT (offsetof (struct obdo, o_generation) == 88);
+        LASSERT (sizeof (((struct obdo *)0)->o_generation) == 4);
+        LASSERT (offsetof (struct obdo, o_valid) == 92);
+        LASSERT (sizeof (((struct obdo *)0)->o_valid) == 4);
+        LASSERT (offsetof (struct obdo, o_obdflags) == 96);
+        LASSERT (sizeof (((struct obdo *)0)->o_obdflags) == 4);
+        LASSERT (offsetof (struct obdo, o_easize) == 100);
+        LASSERT (sizeof (((struct obdo *)0)->o_easize) == 4);
+        LASSERT (offsetof (struct obdo, o_inline) == 104);
+        LASSERT (sizeof (((struct obdo *)0)->o_inline) == 60);
+
+        /* Checks for struct obd_statfs */
+        LASSERT (sizeof (struct obd_statfs) == 144);
+        LASSERT (offsetof (struct obd_statfs, os_type) == 0);
+        LASSERT (sizeof (((struct obd_statfs *)0)->os_type) == 8);
+        LASSERT (offsetof (struct obd_statfs, os_blocks) == 8);
+        LASSERT (sizeof (((struct obd_statfs *)0)->os_blocks) == 8);
+        LASSERT (offsetof (struct obd_statfs, os_bfree) == 16);
+        LASSERT (sizeof (((struct obd_statfs *)0)->os_bfree) == 8);
+        LASSERT (offsetof (struct obd_statfs, os_bavail) == 24);
+        LASSERT (sizeof (((struct obd_statfs *)0)->os_bavail) == 8);
+        LASSERT (offsetof (struct obd_statfs, os_ffree) == 40);
+        LASSERT (sizeof (((struct obd_statfs *)0)->os_ffree) == 8);
+        LASSERT (offsetof (struct obd_statfs, os_fsid) == 48);
+        LASSERT (sizeof (((struct obd_statfs *)0)->os_fsid) == 40);
+        LASSERT (offsetof (struct obd_statfs, os_bsize) == 88);
+        LASSERT (sizeof (((struct obd_statfs *)0)->os_bsize) == 4);
+        LASSERT (offsetof (struct obd_statfs, os_namelen) == 92);
+        LASSERT (sizeof (((struct obd_statfs *)0)->os_namelen) == 4);
+
+        /* Checks for struct obd_ioobj */
+        LASSERT (sizeof (struct obd_ioobj) == 24);
+        LASSERT (offsetof (struct obd_ioobj, ioo_id) == 0);
+        LASSERT (sizeof (((struct obd_ioobj *)0)->ioo_id) == 8);
+        LASSERT (offsetof (struct obd_ioobj, ioo_gr) == 8);
+        LASSERT (sizeof (((struct obd_ioobj *)0)->ioo_gr) == 8);
+        LASSERT (offsetof (struct obd_ioobj, ioo_type) == 16);
+        LASSERT (sizeof (((struct obd_ioobj *)0)->ioo_type) == 4);
+        LASSERT (offsetof (struct obd_ioobj, ioo_bufcnt) == 20);
+        LASSERT (sizeof (((struct obd_ioobj *)0)->ioo_bufcnt) == 4);
+
+        /* Checks for struct niobuf_remote */
+        LASSERT (sizeof (struct niobuf_remote) == 16);
+        LASSERT (offsetof (struct niobuf_remote, offset) == 0);
+        LASSERT (sizeof (((struct niobuf_remote *)0)->offset) == 8);
+        LASSERT (offsetof (struct niobuf_remote, len) == 8);
+        LASSERT (sizeof (((struct niobuf_remote *)0)->len) == 4);
+        LASSERT (offsetof (struct niobuf_remote, flags) == 12);
+        LASSERT (sizeof (((struct niobuf_remote *)0)->flags) == 4);
+
+        /* Checks for struct ost_body */
+        LASSERT (sizeof (struct ost_body) == 164);
+        LASSERT (offsetof (struct ost_body, oa) == 0);
+        LASSERT (sizeof (((struct ost_body *)0)->oa) == 164);
+
+        /* Checks for struct ll_fid */
+        LASSERT (sizeof (struct ll_fid) == 16);
+        LASSERT (offsetof (struct ll_fid, id) == 0);
+        LASSERT (sizeof (((struct ll_fid *)0)->id) == 8);
+        LASSERT (offsetof (struct ll_fid, generation) == 8);
+        LASSERT (sizeof (((struct ll_fid *)0)->generation) == 4);
+        LASSERT (offsetof (struct ll_fid, f_type) == 12);
+        LASSERT (sizeof (((struct ll_fid *)0)->f_type) == 4);
+
+        /* Checks for struct mds_status_req */
+        LASSERT (sizeof (struct mds_status_req) == 8);
+        LASSERT (offsetof (struct mds_status_req, flags) == 0);
+        LASSERT (sizeof (((struct mds_status_req *)0)->flags) == 4);
+        LASSERT (offsetof (struct mds_status_req, repbuf) == 4);
+        LASSERT (sizeof (((struct mds_status_req *)0)->repbuf) == 4);
+
+        /* Checks for struct mds_fileh_body */
+        LASSERT (sizeof (struct mds_fileh_body) == 24);
+        LASSERT (offsetof (struct mds_fileh_body, f_fid) == 0);
+        LASSERT (sizeof (((struct mds_fileh_body *)0)->f_fid) == 16);
+
+        /* Checks for struct mds_body */
+        LASSERT (sizeof (struct mds_body) == 124);
+        LASSERT (offsetof (struct mds_body, fid1) == 0);
+        LASSERT (sizeof (((struct mds_body *)0)->fid1) == 16);
+        LASSERT (offsetof (struct mds_body, fid2) == 16);
+        LASSERT (sizeof (((struct mds_body *)0)->fid2) == 16);
+        LASSERT (offsetof (struct mds_body, handle) == 32);
+        LASSERT (sizeof (((struct mds_body *)0)->handle) == 8);
+        LASSERT (offsetof (struct mds_body, size) == 40);
+        LASSERT (sizeof (((struct mds_body *)0)->size) == 8);
+        LASSERT (offsetof (struct mds_body, blocks) == 48);
+        LASSERT (sizeof (((struct mds_body *)0)->blocks) == 8);
+        LASSERT (offsetof (struct mds_body, ino) == 56);
+        LASSERT (sizeof (((struct mds_body *)0)->ino) == 4);
+        LASSERT (offsetof (struct mds_body, valid) == 60);
+        LASSERT (sizeof (((struct mds_body *)0)->valid) == 4);
+        LASSERT (offsetof (struct mds_body, fsuid) == 64);
+        LASSERT (sizeof (((struct mds_body *)0)->fsuid) == 4);
+        LASSERT (offsetof (struct mds_body, fsgid) == 68);
+        LASSERT (sizeof (((struct mds_body *)0)->fsgid) == 4);
+        LASSERT (offsetof (struct mds_body, capability) == 72);
+        LASSERT (sizeof (((struct mds_body *)0)->capability) == 4);
+        LASSERT (offsetof (struct mds_body, mode) == 76);
+        LASSERT (sizeof (((struct mds_body *)0)->mode) == 4);
+        LASSERT (offsetof (struct mds_body, uid) == 80);
+        LASSERT (sizeof (((struct mds_body *)0)->uid) == 4);
+        LASSERT (offsetof (struct mds_body, gid) == 84);
+        LASSERT (sizeof (((struct mds_body *)0)->gid) == 4);
+        LASSERT (offsetof (struct mds_body, mtime) == 88);
+        LASSERT (sizeof (((struct mds_body *)0)->mtime) == 4);
+        LASSERT (offsetof (struct mds_body, ctime) == 92);
+        LASSERT (sizeof (((struct mds_body *)0)->ctime) == 4);
+        LASSERT (offsetof (struct mds_body, atime) == 96);
+        LASSERT (sizeof (((struct mds_body *)0)->atime) == 4);
+        LASSERT (offsetof (struct mds_body, flags) == 100);
+        LASSERT (sizeof (((struct mds_body *)0)->flags) == 4);
+        LASSERT (offsetof (struct mds_body, rdev) == 104);
+        LASSERT (sizeof (((struct mds_body *)0)->rdev) == 4);
+        LASSERT (offsetof (struct mds_body, nlink) == 108);
+        LASSERT (sizeof (((struct mds_body *)0)->nlink) == 4);
+        LASSERT (offsetof (struct mds_body, generation) == 112);
+        LASSERT (sizeof (((struct mds_body *)0)->generation) == 4);
+        LASSERT (offsetof (struct mds_body, suppgid) == 116);
+        LASSERT (sizeof (((struct mds_body *)0)->suppgid) == 4);
+
+        /* Checks for struct mds_rec_setattr */
+        LASSERT (sizeof (struct mds_rec_setattr) == 92);
+        LASSERT (offsetof (struct mds_rec_setattr, sa_opcode) == 0);
+        LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_opcode) == 4);
+        LASSERT (offsetof (struct mds_rec_setattr, sa_fsuid) == 4);
+        LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_fsuid) == 4);
+        LASSERT (offsetof (struct mds_rec_setattr, sa_fsgid) == 8);
+        LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_fsgid) == 4);
+        LASSERT (offsetof (struct mds_rec_setattr, sa_cap) == 12);
+        LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_cap) == 4);
+        LASSERT (offsetof (struct mds_rec_setattr, sa_reserved) == 16);
+        LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_reserved) == 4);
+        LASSERT (offsetof (struct mds_rec_setattr, sa_valid) == 20);
+        LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_valid) == 4);
+        LASSERT (offsetof (struct mds_rec_setattr, sa_fid) == 24);
+        LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_fid) == 16);
+        LASSERT (offsetof (struct mds_rec_setattr, sa_mode) == 40);
+        LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_mode) == 4);
+        LASSERT (offsetof (struct mds_rec_setattr, sa_uid) == 44);
+        LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_uid) == 4);
+        LASSERT (offsetof (struct mds_rec_setattr, sa_gid) == 48);
+        LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_gid) == 4);
+        LASSERT (offsetof (struct mds_rec_setattr, sa_attr_flags) == 52);
+        LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_attr_flags) == 4);
+        LASSERT (offsetof (struct mds_rec_setattr, sa_size) == 56);
+        LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_size) == 8);
+        LASSERT (offsetof (struct mds_rec_setattr, sa_atime) == 64);
+        LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_atime) == 8);
+        LASSERT (offsetof (struct mds_rec_setattr, sa_mtime) == 72);
+        LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_mtime) == 8);
+        LASSERT (offsetof (struct mds_rec_setattr, sa_ctime) == 80);
+        LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_ctime) == 8);
+        LASSERT (offsetof (struct mds_rec_setattr, sa_suppgid) == 88);
+        LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_suppgid) == 4);
+
+        /* Checks for struct mds_rec_create */
+        LASSERT (sizeof (struct mds_rec_create) == 84);
+        LASSERT (offsetof (struct mds_rec_create, cr_opcode) == 0);
+        LASSERT (sizeof (((struct mds_rec_create *)0)->cr_opcode) == 4);
+        LASSERT (offsetof (struct mds_rec_create, cr_fsuid) == 4);
+        LASSERT (sizeof (((struct mds_rec_create *)0)->cr_fsuid) == 4);
+        LASSERT (offsetof (struct mds_rec_create, cr_fsgid) == 8);
+        LASSERT (sizeof (((struct mds_rec_create *)0)->cr_fsgid) == 4);
+        LASSERT (offsetof (struct mds_rec_create, cr_cap) == 12);
+        LASSERT (sizeof (((struct mds_rec_create *)0)->cr_cap) == 4);
+        LASSERT (offsetof (struct mds_rec_create, cr_flags) == 16);
+        LASSERT (sizeof (((struct mds_rec_create *)0)->cr_flags) == 4);
+        LASSERT (offsetof (struct mds_rec_create, cr_mode) == 20);
+        LASSERT (sizeof (((struct mds_rec_create *)0)->cr_mode) == 4);
+        LASSERT (offsetof (struct mds_rec_create, cr_fid) == 24);
+        LASSERT (sizeof (((struct mds_rec_create *)0)->cr_fid) == 16);
+        LASSERT (offsetof (struct mds_rec_create, cr_replayfid) == 40);
+        LASSERT (sizeof (((struct mds_rec_create *)0)->cr_replayfid) == 16);
+        LASSERT (offsetof (struct mds_rec_create, cr_uid) == 56);
+        LASSERT (sizeof (((struct mds_rec_create *)0)->cr_uid) == 4);
+        LASSERT (offsetof (struct mds_rec_create, cr_gid) == 60);
+        LASSERT (sizeof (((struct mds_rec_create *)0)->cr_gid) == 4);
+        LASSERT (offsetof (struct mds_rec_create, cr_time) == 64);
+        LASSERT (sizeof (((struct mds_rec_create *)0)->cr_time) == 8);
+        LASSERT (offsetof (struct mds_rec_create, cr_rdev) == 72);
+        LASSERT (sizeof (((struct mds_rec_create *)0)->cr_rdev) == 8);
+        LASSERT (offsetof (struct mds_rec_create, cr_suppgid) == 80);
+        LASSERT (sizeof (((struct mds_rec_create *)0)->cr_suppgid) == 4);
+
+        /* Checks for struct mds_rec_link */
+        LASSERT (sizeof (struct mds_rec_link) == 56);
+        LASSERT (offsetof (struct mds_rec_link, lk_opcode) == 0);
+        LASSERT (sizeof (((struct mds_rec_link *)0)->lk_opcode) == 4);
+        LASSERT (offsetof (struct mds_rec_link, lk_fsuid) == 4);
+        LASSERT (sizeof (((struct mds_rec_link *)0)->lk_fsuid) == 4);
+        LASSERT (offsetof (struct mds_rec_link, lk_fsgid) == 8);
+        LASSERT (sizeof (((struct mds_rec_link *)0)->lk_fsgid) == 4);
+        LASSERT (offsetof (struct mds_rec_link, lk_cap) == 12);
+        LASSERT (sizeof (((struct mds_rec_link *)0)->lk_cap) == 4);
+        LASSERT (offsetof (struct mds_rec_link, lk_suppgid1) == 16);
+        LASSERT (sizeof (((struct mds_rec_link *)0)->lk_suppgid1) == 4);
+        LASSERT (offsetof (struct mds_rec_link, lk_suppgid2) == 20);
+        LASSERT (sizeof (((struct mds_rec_link *)0)->lk_suppgid2) == 4);
+        LASSERT (offsetof (struct mds_rec_link, lk_fid1) == 24);
+        LASSERT (sizeof (((struct mds_rec_link *)0)->lk_fid1) == 16);
+        LASSERT (offsetof (struct mds_rec_link, lk_fid2) == 40);
+        LASSERT (sizeof (((struct mds_rec_link *)0)->lk_fid2) == 16);
+
+        /* Checks for struct mds_rec_unlink */
+        LASSERT (sizeof (struct mds_rec_unlink) == 60);
+        LASSERT (offsetof (struct mds_rec_unlink, ul_opcode) == 0);
+        LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_opcode) == 4);
+        LASSERT (offsetof (struct mds_rec_unlink, ul_fsuid) == 4);
+        LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_fsuid) == 4);
+        LASSERT (offsetof (struct mds_rec_unlink, ul_fsgid) == 8);
+        LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_fsgid) == 4);
+        LASSERT (offsetof (struct mds_rec_unlink, ul_cap) == 12);
+        LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_cap) == 4);
+        LASSERT (offsetof (struct mds_rec_unlink, ul_reserved) == 16);
+        LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_reserved) == 4);
+        LASSERT (offsetof (struct mds_rec_unlink, ul_mode) == 20);
+        LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_mode) == 4);
+        LASSERT (offsetof (struct mds_rec_unlink, ul_suppgid) == 24);
+        LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_suppgid) == 4);
+        LASSERT (offsetof (struct mds_rec_unlink, ul_fid1) == 28);
+        LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_fid1) == 16);
+        LASSERT (offsetof (struct mds_rec_unlink, ul_fid2) == 44);
+        LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_fid2) == 16);
+
+        /* Checks for struct mds_rec_rename */
+        LASSERT (sizeof (struct mds_rec_rename) == 56);
+        LASSERT (offsetof (struct mds_rec_rename, rn_opcode) == 0);
+        LASSERT (sizeof (((struct mds_rec_rename *)0)->rn_opcode) == 4);
+        LASSERT (offsetof (struct mds_rec_rename, rn_fsuid) == 4);
+        LASSERT (sizeof (((struct mds_rec_rename *)0)->rn_fsuid) == 4);
+        LASSERT (offsetof (struct mds_rec_rename, rn_fsgid) == 8);
+        LASSERT (sizeof (((struct mds_rec_rename *)0)->rn_fsgid) == 4);
+        LASSERT (offsetof (struct mds_rec_rename, rn_cap) == 12);
+        LASSERT (sizeof (((struct mds_rec_rename *)0)->rn_cap) == 4);
+        LASSERT (offsetof (struct mds_rec_rename, rn_suppgid1) == 16);
+        LASSERT (sizeof (((struct mds_rec_rename *)0)->rn_suppgid1) == 4);
+        LASSERT (offsetof (struct mds_rec_rename, rn_suppgid2) == 20);
+        LASSERT (sizeof (((struct mds_rec_rename *)0)->rn_suppgid2) == 4);
+        LASSERT (offsetof (struct mds_rec_rename, rn_fid1) == 24);
+        LASSERT (sizeof (((struct mds_rec_rename *)0)->rn_fid1) == 16);
+        LASSERT (offsetof (struct mds_rec_rename, rn_fid2) == 40);
+        LASSERT (sizeof (((struct mds_rec_rename *)0)->rn_fid2) == 16);
+
+        /* Checks for struct lov_desc */
+        LASSERT (sizeof (struct lov_desc) == 72);
+        LASSERT (offsetof (struct lov_desc, ld_tgt_count) == 0);
+        LASSERT (sizeof (((struct lov_desc *)0)->ld_tgt_count) == 4);
+        LASSERT (offsetof (struct lov_desc, ld_active_tgt_count) == 4);
+        LASSERT (sizeof (((struct lov_desc *)0)->ld_active_tgt_count) == 4);
+        LASSERT (offsetof (struct lov_desc, ld_default_stripe_count) == 8);
+        LASSERT (sizeof (((struct lov_desc *)0)->ld_default_stripe_count) == 4);
+        LASSERT (offsetof (struct lov_desc, ld_default_stripe_size) == 12);
+        LASSERT (sizeof (((struct lov_desc *)0)->ld_default_stripe_size) == 8);
+        LASSERT (offsetof (struct lov_desc, ld_default_stripe_offset) == 20);
+        LASSERT (sizeof (((struct lov_desc *)0)->ld_default_stripe_offset) == 8);
+        LASSERT (offsetof (struct lov_desc, ld_pattern) == 28);
+        LASSERT (sizeof (((struct lov_desc *)0)->ld_pattern) == 4);
+        LASSERT (offsetof (struct lov_desc, ld_uuid) == 32);
+        LASSERT (sizeof (((struct lov_desc *)0)->ld_uuid) == 37);
+
+        /* Checks for struct ldlm_res_id */
+        LASSERT (sizeof (struct ldlm_res_id) == 24);
+        LASSERT (offsetof (struct ldlm_res_id, name[3]) == 24);
+        LASSERT (sizeof (((struct ldlm_res_id *)0)->name[3]) == 8);
+
+        /* Checks for struct ldlm_extent */
+        LASSERT (sizeof (struct ldlm_extent) == 16);
+        LASSERT (offsetof (struct ldlm_extent, start) == 0);
+        LASSERT (sizeof (((struct ldlm_extent *)0)->start) == 8);
+        LASSERT (offsetof (struct ldlm_extent, end) == 8);
+        LASSERT (sizeof (((struct ldlm_extent *)0)->end) == 8);
+
+        /* Checks for struct ldlm_intent */
+        LASSERT (sizeof (struct ldlm_intent) == 8);
+        LASSERT (offsetof (struct ldlm_intent, opc) == 0);
+        LASSERT (sizeof (((struct ldlm_intent *)0)->opc) == 8);
+
+        /* Checks for struct ldlm_resource_desc */
+        LASSERT (sizeof (struct ldlm_resource_desc) == 44);
+        LASSERT (offsetof (struct ldlm_resource_desc, lr_type) == 0);
+        LASSERT (sizeof (((struct ldlm_resource_desc *)0)->lr_type) == 4);
+        LASSERT (offsetof (struct ldlm_resource_desc, lr_name) == 4);
+        LASSERT (sizeof (((struct ldlm_resource_desc *)0)->lr_name) == 24);
+        LASSERT (offsetof (struct ldlm_resource_desc, lr_version[4]) == 44);
+        LASSERT (sizeof (((struct ldlm_resource_desc *)0)->lr_version[4]) == 4);
+
+        /* Checks for struct ldlm_lock_desc */
+        LASSERT (sizeof (struct ldlm_lock_desc) == 84);
+        LASSERT (offsetof (struct ldlm_lock_desc, l_resource) == 0);
+        LASSERT (sizeof (((struct ldlm_lock_desc *)0)->l_resource) == 44);
+        LASSERT (offsetof (struct ldlm_lock_desc, l_req_mode) == 44);
+        LASSERT (sizeof (((struct ldlm_lock_desc *)0)->l_req_mode) == 4);
+        LASSERT (offsetof (struct ldlm_lock_desc, l_granted_mode) == 48);
+        LASSERT (sizeof (((struct ldlm_lock_desc *)0)->l_granted_mode) == 4);
+        LASSERT (offsetof (struct ldlm_lock_desc, l_extent) == 52);
+        LASSERT (sizeof (((struct ldlm_lock_desc *)0)->l_extent) == 16);
+        LASSERT (offsetof (struct ldlm_lock_desc, l_version[4]) == 84);
+        LASSERT (sizeof (((struct ldlm_lock_desc *)0)->l_version[4]) == 4);
+
+        /* Checks for struct ldlm_request */
+        LASSERT (sizeof (struct ldlm_request) == 104);
+        LASSERT (offsetof (struct ldlm_request, lock_flags) == 0);
+        LASSERT (sizeof (((struct ldlm_request *)0)->lock_flags) == 4);
+        LASSERT (offsetof (struct ldlm_request, lock_desc) == 4);
+        LASSERT (sizeof (((struct ldlm_request *)0)->lock_desc) == 84);
+        LASSERT (offsetof (struct ldlm_request, lock_handle1) == 88);
+        LASSERT (sizeof (((struct ldlm_request *)0)->lock_handle1) == 8);
+        LASSERT (offsetof (struct ldlm_request, lock_handle2) == 96);
+        LASSERT (sizeof (((struct ldlm_request *)0)->lock_handle2) == 8);
+
+        /* Checks for struct ldlm_reply */
+        LASSERT (sizeof (struct ldlm_reply) == 72);
+        LASSERT (offsetof (struct ldlm_reply, lock_flags) == 0);
+        LASSERT (sizeof (((struct ldlm_reply *)0)->lock_flags) == 4);
+        LASSERT (offsetof (struct ldlm_reply, lock_mode) == 4);
+        LASSERT (sizeof (((struct ldlm_reply *)0)->lock_mode) == 4);
+        LASSERT (offsetof (struct ldlm_reply, lock_resource_name) == 8);
+        LASSERT (sizeof (((struct ldlm_reply *)0)->lock_resource_name) == 24);
+        LASSERT (offsetof (struct ldlm_reply, lock_handle) == 32);
+        LASSERT (sizeof (((struct ldlm_reply *)0)->lock_handle) == 8);
+        LASSERT (offsetof (struct ldlm_reply, lock_extent) == 40);
+        LASSERT (sizeof (((struct ldlm_reply *)0)->lock_extent) == 16);
+        LASSERT (offsetof (struct ldlm_reply, lock_policy_res1) == 56);
+        LASSERT (sizeof (((struct ldlm_reply *)0)->lock_policy_res1) == 8);
+        LASSERT (offsetof (struct ldlm_reply, lock_policy_res2) == 64);
+        LASSERT (sizeof (((struct ldlm_reply *)0)->lock_policy_res2) == 8);
+
+        /* Checks for struct ptlbd_op */
+        LASSERT (sizeof (struct ptlbd_op) == 12);
+        LASSERT (offsetof (struct ptlbd_op, op_cmd) == 0);
+        LASSERT (sizeof (((struct ptlbd_op *)0)->op_cmd) == 2);
+        LASSERT (offsetof (struct ptlbd_op, op_lun) == 2);
+        LASSERT (sizeof (((struct ptlbd_op *)0)->op_lun) == 2);
+        LASSERT (offsetof (struct ptlbd_op, op_niob_cnt) == 4);
+        LASSERT (sizeof (((struct ptlbd_op *)0)->op_niob_cnt) == 2);
+        LASSERT (offsetof (struct ptlbd_op, op__padding) == 6);
+        LASSERT (sizeof (((struct ptlbd_op *)0)->op__padding) == 2);
+        LASSERT (offsetof (struct ptlbd_op, op_block_cnt) == 8);
+        LASSERT (sizeof (((struct ptlbd_op *)0)->op_block_cnt) == 4);
+
+        /* Checks for struct ptlbd_niob */
+        LASSERT (sizeof (struct ptlbd_niob) == 24);
+        LASSERT (offsetof (struct ptlbd_niob, n_xid) == 0);
+        LASSERT (sizeof (((struct ptlbd_niob *)0)->n_xid) == 8);
+        LASSERT (offsetof (struct ptlbd_niob, n_block_nr) == 8);
+        LASSERT (sizeof (((struct ptlbd_niob *)0)->n_block_nr) == 8);
+        LASSERT (offsetof (struct ptlbd_niob, n_offset) == 16);
+        LASSERT (sizeof (((struct ptlbd_niob *)0)->n_offset) == 4);
+        LASSERT (offsetof (struct ptlbd_niob, n_length) == 20);
+        LASSERT (sizeof (((struct ptlbd_niob *)0)->n_length) == 4);
+
+        /* Checks for struct ptlbd_rsp */
+        LASSERT (sizeof (struct ptlbd_rsp) == 4);
+        LASSERT (offsetof (struct ptlbd_rsp, r_status) == 0);
+        LASSERT (sizeof (((struct ptlbd_rsp *)0)->r_status) == 2);
+        LASSERT (offsetof (struct ptlbd_rsp, r_error_cnt) == 2);
+        LASSERT (sizeof (((struct ptlbd_rsp *)0)->r_error_cnt) == 2);
+#endif
+}
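These generated assertions are meant to run once at module load (ptlrpc_init() in ptlrpc_module.c, later in this change, calls lustre_assert_wire_constants()), so a header edit or compiler padding change that silently shifts a wire field fails loudly instead of producing undecodable traffic; for now they are fenced off by the BUG_1343 conditional. Sketched with plain __u64/__u32 fields, a hypothetical change such as

        struct obdo {
                __u64 o_id;
                __u64 o_gr;
                __u32 o_new_flag;      /* hypothetical insertion */
                __u64 o_atime;
                /* ... */
        };

would move o_atime from offset 16 to 24 once alignment padding is added, tripping LASSERT(offsetof(struct obdo, o_atime) == 16) above.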
diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c
new file mode 100644 (file)
index 0000000..51a0cad
--- /dev/null
@@ -0,0 +1,174 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Portal-RPC pinger: a service thread intended to ping registered imports periodically.
+ *
+ *  Copyright (c) 2003 Cluster File Systems, Inc.
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/version.h>
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <linux/obd_support.h>
+#include <linux/obd_class.h>
+#include "ptlrpc_internal.h"
+
+static struct ptlrpc_thread *pinger_thread = NULL;
+static spinlock_t pinger_lock = SPIN_LOCK_UNLOCKED;
+static struct list_head pinger_imports = LIST_HEAD_INIT(pinger_imports);
+
+int ptlrpc_pinger_add_import(struct obd_import *imp)
+{
+        ENTRY;
+        if (!list_empty(&imp->imp_pinger_chain))
+                RETURN(-EALREADY);
+
+        spin_lock(&pinger_lock);
+        list_add(&imp->imp_pinger_chain, &pinger_imports);
+        spin_unlock(&pinger_lock);
+        RETURN(0);
+}
+
+int ptlrpc_pinger_del_import(struct obd_import *imp)
+{
+        ENTRY;
+        if (list_empty(&imp->imp_pinger_chain))
+                RETURN(-EALREADY);
+
+        spin_lock(&pinger_lock);
+        list_del_init(&imp->imp_pinger_chain);
+        spin_unlock(&pinger_lock);
+        RETURN(0);
+}
+
+static void ptlrpc_pinger_do_stuff(void)
+{
+
+
+
+}
+
+static int ptlrpc_pinger_main(void *arg)
+{
+        struct ptlrpc_svc_data *data = (struct ptlrpc_svc_data *)arg;
+        struct ptlrpc_thread *thread = data->thread;
+        unsigned long flags;
+        int rc = 0;
+        ENTRY;
+
+        lock_kernel();
+        ptlrpc_daemonize();
+
+        SIGNAL_MASK_LOCK(current, flags);
+        sigfillset(&current->blocked);
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
+
+#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20))
+        sprintf(current->comm, "%s|%d", data->name,current->thread.extern_pid);
+#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        sprintf(current->comm, "%s|%d", data->name,
+                current->thread.mode.tt.extern_pid);
+#else
+        strcpy(current->comm, data->name);
+#endif
+        unlock_kernel();
+
+        /* Record that the thread is running */
+        thread->t_flags = SVC_RUNNING;
+        wake_up(&thread->t_ctl_waitq);
+
+        /* And now, loop forever on requests */
+        while (1) {
+                struct l_wait_info lwi = LWI_TIMEOUT(5 * HZ, NULL, NULL);
+                l_wait_event(thread->t_ctl_waitq,
+                             thread->t_flags & SVC_STOPPING, &lwi);
+
+                if (thread->t_flags & SVC_STOPPING) {
+                        thread->t_flags &= ~SVC_STOPPING;
+                        EXIT;
+                        break;
+                }
+                ptlrpc_pinger_do_stuff();
+        }
+
+        thread->t_flags = SVC_STOPPED;
+        wake_up(&thread->t_ctl_waitq);
+
+        CDEBUG(D_NET, "pinger thread exiting, process %d: rc = %d\n",
+               current->pid, rc);
+        return rc;
+}
+
+int ptlrpc_pinger_start(void)
+{
+        struct l_wait_info lwi = { 0 };
+        struct ptlrpc_svc_data d;
+        int rc;
+        ENTRY;
+
+        spin_lock(&pinger_lock);
+        if (pinger_thread != NULL)
+                GOTO(out, rc = -EALREADY);
+
+        OBD_ALLOC(pinger_thread, sizeof(*pinger_thread));
+        if (pinger_thread == NULL)
+                GOTO(out, rc = -ENOMEM);
+        init_waitqueue_head(&pinger_thread->t_ctl_waitq);
+
+        d.name = "Lustre pinger";
+        d.thread = pinger_thread;
+
+        /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we
+         * just drop the VM and FILES in ptlrpc_daemonize() right away. */
+        rc = kernel_thread(ptlrpc_pinger_main, &d, CLONE_VM | CLONE_FILES);
+        if (rc < 0) {
+                CERROR("cannot start thread: %d\n", rc);
+                OBD_FREE(pinger_thread, sizeof(*pinger_thread));
+                GOTO(out, rc);
+        }
+        l_wait_event(pinger_thread->t_ctl_waitq,
+                     pinger_thread->t_flags & SVC_RUNNING, &lwi);
+
+ out:
+        spin_unlock(&pinger_lock);
+        RETURN(rc);
+}
+
+int ptlrpc_stop_pinger(void)
+{
+        struct l_wait_info lwi = { 0 };
+        int rc = 0;
+        ENTRY;
+
+        spin_lock(&pinger_lock);
+        if (pinger_thread == NULL)
+                GOTO(out, rc = -EALREADY);
+
+        pinger_thread->t_flags = SVC_STOPPING;
+        wake_up(&pinger_thread->t_ctl_waitq);
+        l_wait_event(pinger_thread->t_ctl_waitq,
+                     (pinger_thread->t_flags & SVC_STOPPED), &lwi);
+
+        OBD_FREE(pinger_thread, sizeof(*pinger_thread));
+
+ out:
+        spin_unlock(&pinger_lock);
+        RETURN(rc);
+}
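As wired up here, the pinger is a single kernel thread that wakes every 5 seconds (LWI_TIMEOUT(5 * HZ)); ptlrpc_pinger_do_stuff() is still an empty stub in this patch, so the wakeups do no work yet. The intended call pattern, hedged because the call sites are not part of this file:

        /* once, when the client stack comes up */
        rc = ptlrpc_pinger_start();

        /* per import, around connect/disconnect */
        rc = ptlrpc_pinger_add_import(imp);
        /* ... */
        rc = ptlrpc_pinger_del_import(imp);

        /* once, at shutdown */
        rc = ptlrpc_stop_pinger();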
diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h
new file mode 100644 (file)
index 0000000..7100707
--- /dev/null
@@ -0,0 +1,93 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+/* Intramodule declarations for ptlrpc. */
+
+#ifndef PTLRPC_INTERNAL_H
+#define PTLRPC_INTERNAL_H
+
+struct ldlm_namespace;
+struct obd_import;
+struct ldlm_res_id;
+
+/* ldlm hooks that we need, managed via inter_module_{get,put} */
+extern int (*ptlrpc_ldlm_namespace_cleanup)(struct ldlm_namespace *, int);
+extern int (*ptlrpc_ldlm_cli_cancel_unused)(struct ldlm_namespace *,
+                                     struct ldlm_res_id *, int);
+extern int (*ptlrpc_ldlm_replay_locks)(struct obd_import *);
+
+int ptlrpc_get_ldlm_hooks(void);
+void ptlrpc_daemonize(void);
+
+int ptlrpc_request_handle_eviction(struct ptlrpc_request *);
+void lustre_assert_wire_constants (void);
+
+void ptlrpc_lprocfs_register_service(struct obd_device *obddev,
+                                     struct ptlrpc_service *svc);
+void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc);
+
+
+static inline int opcode_offset(__u32 opc) {
+        if (opc < OST_LAST_OPC) {
+                 /* OST opcode */
+                return (opc - OST_FIRST_OPC);
+        } else if (opc < MDS_LAST_OPC) {
+                /* MDS opcode */
+                return (opc - MDS_FIRST_OPC +
+                        (OST_LAST_OPC - OST_FIRST_OPC));
+        } else if (opc < LDLM_LAST_OPC) {
+                /* LDLM Opcode */
+                return (opc - LDLM_FIRST_OPC + 
+                        (MDS_LAST_OPC - MDS_FIRST_OPC) + 
+                        (OST_LAST_OPC - OST_FIRST_OPC));
+        } else if (opc < PTLBD_LAST_OPC) {
+                /* Portals Block Device */
+                return (opc - PTLBD_FIRST_OPC + 
+                        (LDLM_LAST_OPC - LDLM_FIRST_OPC) +
+                        (MDS_LAST_OPC - MDS_FIRST_OPC) +
+                        (OST_LAST_OPC - OST_FIRST_OPC));
+        } else if (opc == OBD_PING) {
+                /* OBD Ping */
+                return (opc - OBD_PING + 
+                        (PTLBD_LAST_OPC - PTLBD_FIRST_OPC) +
+                        (LDLM_LAST_OPC - LDLM_FIRST_OPC) +
+                        (MDS_LAST_OPC - MDS_FIRST_OPC) +
+                        (OST_LAST_OPC - OST_FIRST_OPC));
+        } else { 
+                /* Unknown Opcode */
+                return -1;
+        }
+}
+
+#define LUSTRE_MAX_OPCODES (1 + (PTLBD_LAST_OPC - PTLBD_FIRST_OPC) \
+                              + (LDLM_LAST_OPC - LDLM_FIRST_OPC)   \
+                              + (MDS_LAST_OPC - MDS_FIRST_OPC)     \
+                              + (OST_LAST_OPC - OST_FIRST_OPC))
+
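The ranges are packed back-to-back into a single counter index; with the constants asserted in pack_generic.c above (OST range 0..17, MDS range 33..43, LDLM range 101..106, PTLBD range 200..204), a few worked values:

        /* opcode_offset(OST_READ)     ==   3 - 0              ==  3
         * opcode_offset(MDS_GETATTR)  ==  33 - 33 + 17        == 17
         * opcode_offset(LDLM_ENQUEUE) == 101 - 101 + 10 + 17  == 27
         * opcode_offset(9999)         == -1  (unknown opcode)
         * LUSTRE_MAX_OPCODES          == 1 + 4 + 5 + 10 + 17  == 37 */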
+enum {
+        PTLRPC_REQWAIT_CNTR = 0,
+        PTLRPC_SVCEQDEPTH_CNTR = 1,
+        PTLRPC_SVCIDLETIME_CNTR = 2,
+        PTLRPC_LAST_CNTR    = 3
+};
+
+#endif /* PTLRPC_INTERNAL_H */
diff --git a/lustre/ptlrpc/ptlrpc_lib.c b/lustre/ptlrpc/ptlrpc_lib.c
new file mode 100644 (file)
index 0000000..71142fa
--- /dev/null
@@ -0,0 +1,119 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define EXPORT_SYMTAB
+#define DEBUG_SUBSYSTEM S_RPC
+
+#ifdef __KERNEL__
+# include <linux/module.h>
+#else 
+# include <liblustre.h>
+#endif
+#include <linux/obd.h>
+#include <linux/obd_ost.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_dlm.h>
+
+int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
+{
+        struct ptlrpc_connection *conn;
+        struct obd_ioctl_data* data = buf;
+        struct client_obd *cli = &obddev->u.cli;
+        struct obd_import *imp;
+        struct obd_uuid server_uuid;
+        int rq_portal, rp_portal, connect_op;
+        char *name;
+        ENTRY;
+
+        if (obddev->obd_type->typ_ops->o_brw) {
+                rq_portal = OST_REQUEST_PORTAL;
+                rp_portal = OSC_REPLY_PORTAL;
+                name = "osc";
+                connect_op = OST_CONNECT;
+        } else {
+                rq_portal = MDS_REQUEST_PORTAL;
+                rp_portal = MDC_REPLY_PORTAL;
+                name = "mdc";
+                connect_op = MDS_CONNECT;
+        }
+
+        if (data->ioc_inllen1 < 1) {
+                CERROR("requires a TARGET UUID\n");
+                RETURN(-EINVAL);
+        }
+
+        if (data->ioc_inllen1 > 37) {
+                CERROR("target UUID must be less than 38 characters\n");
+                RETURN(-EINVAL);
+        }
+
+        if (data->ioc_inllen2 < 1) {
+                CERROR("setup requires a SERVER UUID\n");
+                RETURN(-EINVAL);
+        }
+
+        if (data->ioc_inllen2 > 37) {
+                CERROR("server UUID must be less than 38 characters\n");
+                RETURN(-EINVAL);
+        }
+
+        sema_init(&cli->cl_sem, 1);
+        cli->cl_conn_count = 0;
+        memcpy(server_uuid.uuid, data->ioc_inlbuf2, MIN(data->ioc_inllen2,
+                                                        sizeof(server_uuid)));
+
+        conn = ptlrpc_uuid_to_connection(&server_uuid);
+        if (conn == NULL)
+                RETURN(-ENOENT);
+
+        ptlrpc_init_client(rq_portal, rp_portal, name,
+                           &obddev->obd_ldlm_client);
+
+        imp = class_new_import();
+        if (imp == NULL) {
+                ptlrpc_put_connection(conn);
+                RETURN(-ENOMEM);
+        }
+        imp->imp_connection = conn;
+        imp->imp_client = &obddev->obd_ldlm_client;
+        imp->imp_obd = obddev;
+        imp->imp_connect_op = connect_op;
+        imp->imp_generation = 0;
+        memcpy(imp->imp_target_uuid.uuid, data->ioc_inlbuf1, data->ioc_inllen1);
+        class_import_put(imp);
+
+        cli->cl_import = imp;
+        cli->cl_max_mds_easize = sizeof(struct lov_mds_md);
+        cli->cl_sandev = to_kdev_t(0);
+
+        RETURN(0);
+}
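The ioctl data is expected to carry two NUL-terminated UUID strings whose lengths (ioc_inllen1/2, terminator included) must not exceed 37 bytes: inlbuf1 names the target (copied into imp_target_uuid) and inlbuf2 names the server whose connection is looked up via ptlrpc_uuid_to_connection(). A hedged sketch of how a caller might fill it (the UUID values are made up):

        struct obd_ioctl_data data;

        memset(&data, 0, sizeof(data));
        data.ioc_inlbuf1 = "ost1_UUID";                  /* target UUID */
        data.ioc_inllen1 = strlen(data.ioc_inlbuf1) + 1;
        data.ioc_inlbuf2 = "ost1_server_UUID";           /* server UUID */
        data.ioc_inllen2 = strlen(data.ioc_inlbuf2) + 1;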
+
+int client_obd_cleanup(struct obd_device *obddev, int force, int failover)
+{
+        struct client_obd *client = &obddev->u.cli;
+
+        if (!client->cl_import)
+                RETURN(-EINVAL);
+        class_destroy_import(client->cl_import);
+        client->cl_import = NULL;
+        RETURN(0);
+}
diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c
new file mode 100644 (file)
index 0000000..01ba349
--- /dev/null
@@ -0,0 +1,237 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define EXPORT_SYMTAB
+#define DEBUG_SUBSYSTEM S_RPC
+
+#ifdef __KERNEL__
+# include <linux/module.h>
+# include <linux/init.h>
+#else
+# include <liblustre.h>
+#endif
+
+#include <linux/obd_support.h>
+#include <linux/obd_class.h>
+#include <linux/lustre_net.h>
+
+#include "ptlrpc_internal.h"
+
+extern int ptlrpc_init_portals(void);
+extern void ptlrpc_exit_portals(void);
+static int ldlm_hooks_referenced = 0;
+
+int (*ptlrpc_ldlm_namespace_cleanup)(struct ldlm_namespace *, int);
+int (*ptlrpc_ldlm_replay_locks)(struct obd_import *);
+
+#define GET_HOOK(name)                                                         \
+if (!ptlrpc_##name) {                                                          \
+        if (!(ptlrpc_##name = inter_module_get(#name))) {                      \
+                CERROR("can't i_m_g(\"" #name "\")\n");                        \
+                return 0;                                                      \
+        }                                                                      \
+}
+
+/* This is called from ptlrpc_get_connection, which runs after all the modules
+ * are loaded, but before anything else interesting happens.
+ */
+int ptlrpc_get_ldlm_hooks(void)
+{
+        if (ldlm_hooks_referenced)
+                return 1;
+
+        GET_HOOK(ldlm_namespace_cleanup);
+        GET_HOOK(ldlm_replay_locks);
+
+        ldlm_hooks_referenced = 1;
+        RETURN(1);
+}
+
+#undef GET_HOOK
+
+#define PUT_HOOK(hook)                                                         \
+if (ptlrpc_##hook) {                                                           \
+        inter_module_put(#hook);                                               \
+        ptlrpc_##hook = NULL;                                                  \
+}
+
+void ptlrpc_put_ldlm_hooks(void)
+{
+        ENTRY;
+        if (!ldlm_hooks_referenced)
+                return;
+
+        PUT_HOOK(ldlm_namespace_cleanup);
+        PUT_HOOK(ldlm_replay_locks);
+        ldlm_hooks_referenced = 0;
+        EXIT;
+}
+
+#undef PUT_HOOK
+
+int ptlrpc_ldlm_hooks_referenced(void)
+{
+        return ldlm_hooks_referenced;
+}
+
+__init int ptlrpc_init(void)
+{
+        int rc;
+        ENTRY;
+
+        lustre_assert_wire_constants ();
+        
+        rc = ptlrpc_init_portals();
+        if (rc)
+                RETURN(rc);
+
+        ptlrpc_init_connection();
+
+        ptlrpc_put_connection_superhack = ptlrpc_put_connection;
+        ptlrpc_abort_inflight_superhack = ptlrpc_abort_inflight;
+        RETURN(0);
+}
+
+static void __exit ptlrpc_exit(void)
+{
+        ptlrpc_exit_portals();
+        ptlrpc_cleanup_connection();
+}
+
+/* connection.c */
+EXPORT_SYMBOL(ptlrpc_dump_connections);
+EXPORT_SYMBOL(ptlrpc_readdress_connection);
+EXPORT_SYMBOL(ptlrpc_get_connection);
+EXPORT_SYMBOL(ptlrpc_put_connection);
+EXPORT_SYMBOL(ptlrpc_connection_addref);
+EXPORT_SYMBOL(ptlrpc_init_connection);
+EXPORT_SYMBOL(ptlrpc_cleanup_connection);
+
+/* niobuf.c */
+EXPORT_SYMBOL(ptlrpc_bulk_put);
+EXPORT_SYMBOL(ptlrpc_bulk_get);
+EXPORT_SYMBOL(ptlrpc_abort_bulk);
+EXPORT_SYMBOL(ptlrpc_register_bulk);
+EXPORT_SYMBOL(ptlrpc_unregister_bulk);
+EXPORT_SYMBOL(ptlrpc_reply);
+EXPORT_SYMBOL(ptlrpc_error);
+EXPORT_SYMBOL(ptlrpc_resend_req);
+EXPORT_SYMBOL(ptl_send_rpc);
+EXPORT_SYMBOL(ptlrpc_link_svc_me);
+
+/* client.c */
+EXPORT_SYMBOL(ptlrpc_init_client);
+EXPORT_SYMBOL(ptlrpc_cleanup_client);
+EXPORT_SYMBOL(ptlrpc_req_to_uuid);
+EXPORT_SYMBOL(ptlrpc_uuid_to_connection);
+EXPORT_SYMBOL(ptlrpc_queue_wait);
+EXPORT_SYMBOL(ptlrpc_replay_req);
+EXPORT_SYMBOL(ptlrpc_restart_req);
+EXPORT_SYMBOL(ptlrpc_prep_req);
+EXPORT_SYMBOL(ptlrpc_free_req);
+EXPORT_SYMBOL(ptlrpc_unregister_reply);
+EXPORT_SYMBOL(ptlrpc_req_finished);
+EXPORT_SYMBOL(ptlrpc_request_addref);
+EXPORT_SYMBOL(ptlrpc_prep_bulk_imp);
+EXPORT_SYMBOL(ptlrpc_prep_bulk_exp);
+EXPORT_SYMBOL(ptlrpc_free_bulk);
+EXPORT_SYMBOL(ptlrpc_prep_bulk_page);
+EXPORT_SYMBOL(ptlrpc_free_bulk_page);
+EXPORT_SYMBOL(ptlrpc_abort_inflight);
+EXPORT_SYMBOL(ptlrpc_retain_replayable_request);
+EXPORT_SYMBOL(ptlrpc_next_xid);
+
+EXPORT_SYMBOL(ptlrpc_prep_set);
+EXPORT_SYMBOL(ptlrpc_set_add_req);
+EXPORT_SYMBOL(ptlrpc_set_destroy);
+EXPORT_SYMBOL(ptlrpc_set_wait);
+
+/* service.c */
+EXPORT_SYMBOL(ptlrpc_init_svc);
+EXPORT_SYMBOL(ptlrpc_stop_all_threads);
+EXPORT_SYMBOL(ptlrpc_start_thread);
+EXPORT_SYMBOL(ptlrpc_unregister_service);
+
+/* pack_generic.c */
+EXPORT_SYMBOL(lustre_pack_msg);
+EXPORT_SYMBOL(lustre_msg_size);
+EXPORT_SYMBOL(lustre_unpack_msg);
+EXPORT_SYMBOL(lustre_msg_buf);
+EXPORT_SYMBOL(lustre_msg_string);
+EXPORT_SYMBOL(lustre_swab_reqbuf);
+EXPORT_SYMBOL(lustre_swab_repbuf);
+EXPORT_SYMBOL(lustre_swab_obdo);
+EXPORT_SYMBOL(lustre_swab_obd_statfs);
+EXPORT_SYMBOL(lustre_swab_obd_ioobj);
+EXPORT_SYMBOL(lustre_swab_niobuf_remote);
+EXPORT_SYMBOL(lustre_swab_ost_body);
+EXPORT_SYMBOL(lustre_swab_ll_fid);
+EXPORT_SYMBOL(lustre_swab_mds_status_req);
+EXPORT_SYMBOL(lustre_swab_mds_fileh_body);
+EXPORT_SYMBOL(lustre_swab_mds_body);
+EXPORT_SYMBOL(lustre_swab_mds_rec_setattr);
+EXPORT_SYMBOL(lustre_swab_mds_rec_create);
+EXPORT_SYMBOL(lustre_swab_mds_rec_link);
+EXPORT_SYMBOL(lustre_swab_mds_rec_unlink);
+EXPORT_SYMBOL(lustre_swab_mds_rec_rename);
+EXPORT_SYMBOL(lustre_swab_lov_desc);
+EXPORT_SYMBOL(lustre_swab_ldlm_res_id);
+EXPORT_SYMBOL(lustre_swab_ldlm_extent);
+EXPORT_SYMBOL(lustre_swab_ldlm_intent);
+EXPORT_SYMBOL(lustre_swab_ldlm_resource_desc);
+EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc);
+EXPORT_SYMBOL(lustre_swab_ldlm_request);
+EXPORT_SYMBOL(lustre_swab_ldlm_reply);
+EXPORT_SYMBOL(lustre_swab_ptlbd_op);
+EXPORT_SYMBOL(lustre_swab_ptlbd_niob);
+EXPORT_SYMBOL(lustre_swab_ptlbd_rsp);
+
+/* ptlrpc_module.c */
+EXPORT_SYMBOL(ptlrpc_put_ldlm_hooks);
+EXPORT_SYMBOL(ptlrpc_ldlm_hooks_referenced);
+
+/* recover.c */
+EXPORT_SYMBOL(ptlrpc_run_recovery_over_upcall);
+EXPORT_SYMBOL(ptlrpc_run_failed_import_upcall);
+EXPORT_SYMBOL(ptlrpc_reconnect_import);
+EXPORT_SYMBOL(ptlrpc_replay);
+EXPORT_SYMBOL(ptlrpc_resend);
+EXPORT_SYMBOL(ptlrpc_wake_delayed);
+EXPORT_SYMBOL(ptlrpc_set_import_active);
+EXPORT_SYMBOL(ptlrpc_fail_import);
+EXPORT_SYMBOL(ptlrpc_fail_export);
+EXPORT_SYMBOL(ptlrpc_recover_import);
+
+/* ptlrpc_lib.c */
+EXPORT_SYMBOL(client_obd_setup);
+EXPORT_SYMBOL(client_obd_cleanup);
+
+#ifdef __KERNEL__
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Lustre Request Processor");
+MODULE_LICENSE("GPL");
+
+module_init(ptlrpc_init);
+module_exit(ptlrpc_exit);
+#endif
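
The GET_HOOK()/PUT_HOOK() macros in ptlrpc_module.c above resolve the two LDLM callbacks at runtime through the Linux 2.4 inter_module_get()/inter_module_put() interface, so ptlrpc avoids a hard symbol dependency on the ldlm module. For the lookup to succeed, the ldlm side has to publish the callbacks under the same string names before ptlrpc_get_ldlm_hooks() runs. The sketch below shows what such a registration could look like; it assumes the 2.4 inter_module_register()/inter_module_unregister() API, and the registration site shown here is illustrative rather than the actual ldlm module code.

/* Illustrative sketch (not part of this change): how the ldlm module could
 * publish the callbacks that ptlrpc_get_ldlm_hooks() looks up by name.
 * Assumes the Linux 2.4 inter_module_* API used by the GET_HOOK() macro;
 * in the real ldlm module this would sit in its existing init/exit paths. */
#include <linux/module.h>

struct ldlm_namespace;
struct obd_import;

/* Real prototypes live in the ldlm headers; repeated here for context only. */
extern int ldlm_namespace_cleanup(struct ldlm_namespace *ns, int flags);
extern int ldlm_replay_locks(struct obd_import *imp);

static int __init ldlm_hooks_publish(void)
{
        /* Register under the exact strings that GET_HOOK() passes to
         * inter_module_get(). */
        inter_module_register("ldlm_namespace_cleanup", THIS_MODULE,
                              ldlm_namespace_cleanup);
        inter_module_register("ldlm_replay_locks", THIS_MODULE,
                              ldlm_replay_locks);
        return 0;
}

static void __exit ldlm_hooks_unpublish(void)
{
        inter_module_unregister("ldlm_namespace_cleanup");
        inter_module_unregister("ldlm_replay_locks");
}

module_init(ldlm_hooks_publish);
module_exit(ldlm_hooks_unpublish);

Because the lookup is by string at connection-setup time rather than by linker symbol at load time, module load order only matters up to the first ptlrpc_get_connection() call, which is exactly what the comment above ptlrpc_get_ldlm_hooks() relies on.
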
diff --git a/lustre/ptlrpc/recovd.c b/lustre/ptlrpc/recovd.c
deleted file mode 100644 (file)
index 21cb3fe..0000000
+++ /dev/null
@@ -1,372 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  obd/rpc/recovd.c
- *
- *  Lustre High Availability Daemon
- *
- *  Copyright (C) 2001, 2002 Cluster File Systems, Inc.
- *
- *  This code is issued under the GNU General Public License.
- *  See the file COPYING in this distribution
- *
- *  by Peter Braam <braam@clusterfs.com>
- *
- */
-
-#define DEBUG_SUBSYSTEM S_RPC
-#ifndef __KERNEL__
-#include <liblustre.h>
-#include <linux/obd.h>
-#include <linux/obd_class.h>
-#else 
-#include <linux/lustre_lite.h>
-#endif
-
-#include <linux/lustre_ha.h>
-#include <linux/obd_support.h>
-
-/* dump_connection_list, but shorter for nicer debugging logs */
-static void d_c_l(struct list_head *head)
-{
-        struct list_head *tmp;
-
-        list_for_each(tmp, head) {
-                struct ptlrpc_connection *conn =
-                        list_entry(tmp, struct ptlrpc_connection,
-                                   c_recovd_data.rd_managed_chain);
-                CDEBUG(D_HA, "   %p = %s (%d/%d)\n", conn, 
-                       conn->c_remote_uuid.uuid,
-                       conn->c_recovd_data.rd_phase,
-                       conn->c_recovd_data.rd_next_phase);
-        }
-}
-
-static void dump_lists(struct recovd_obd *recovd)
-{
-        CDEBUG(D_HA, "managed: \n");
-        d_c_l(&recovd->recovd_managed_items);
-        CDEBUG(D_HA, "troubled: \n");
-        d_c_l(&recovd->recovd_troubled_items);
-}
-
-void recovd_conn_manage(struct ptlrpc_connection *conn,
-                        struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover)
-{
-        struct recovd_data *rd = &conn->c_recovd_data;
-        ENTRY;
-        if (!recovd || !recover) {
-                EXIT;
-                return;
-        }
-
-        if (!list_empty(&rd->rd_managed_chain)) {
-                if (rd->rd_recovd == recovd && rd->rd_recover == recover) {
-                        CDEBUG(D_HA, "conn %p/%s already setup for recovery\n",
-                               conn, conn->c_remote_uuid.uuid);
-                        EXIT;
-                        return;
-                }
-                CDEBUG(D_HA,
-                       "conn %p/%s has recovery items %p/%p, making %p/%p\n",
-                       conn, conn->c_remote_uuid.uuid, rd->rd_recovd, rd->rd_recover,
-                       recovd, recover);
-                spin_lock(&rd->rd_recovd->recovd_lock);
-                list_del_init(&rd->rd_managed_chain);
-                spin_unlock(&rd->rd_recovd->recovd_lock);
-        }
-
-        rd->rd_recovd = recovd;
-        rd->rd_recover = recover;
-        rd->rd_phase = RD_IDLE;
-        rd->rd_next_phase = RD_TROUBLED;
-
-        spin_lock(&recovd->recovd_lock);
-        list_add(&rd->rd_managed_chain, &recovd->recovd_managed_items);
-        dump_lists(recovd);
-        spin_unlock(&recovd->recovd_lock);
-
-        EXIT;
-}
-
-void recovd_conn_unmanage(struct ptlrpc_connection *conn)
-{
-        struct recovd_data *rd = &conn->c_recovd_data;
-        struct recovd_obd *recovd = rd->rd_recovd;
-        ENTRY;
-
-        if (recovd) {
-                spin_lock(&recovd->recovd_lock);
-                list_del_init(&rd->rd_managed_chain);
-                rd->rd_recovd = NULL;
-                spin_unlock(&recovd->recovd_lock);
-        }
-        /* should be safe enough, right? */
-        rd->rd_recover = NULL;
-        rd->rd_next_phase = RD_IDLE;
-        rd->rd_next_phase = RD_TROUBLED;
-}
-
-void recovd_conn_fail(struct ptlrpc_connection *conn)
-{
-        struct recovd_data *rd = &conn->c_recovd_data;
-        struct recovd_obd *recovd = rd->rd_recovd;
-        ENTRY;
-
-        if (!recovd) {
-                CERROR("no recovd for connection %p\n", conn);
-                EXIT;
-                return;
-        }
-
-        spin_lock(&recovd->recovd_lock);
-        if (rd->rd_phase == RD_TROUBLED || rd->rd_phase == RD_PREPARING) {
-                CDEBUG(D_HA, "connection %p to %s already in recovery\n",
-                       conn, conn->c_remote_uuid.uuid);
-                spin_unlock(&recovd->recovd_lock);
-                EXIT;
-                return;
-        }
-
-        CERROR("connection %p to %s nid "LPX64" on %s failed\n", conn,
-               conn->c_remote_uuid.uuid, conn->c_peer.peer_nid,
-               conn->c_peer.peer_ni->pni_name);
-        list_del(&rd->rd_managed_chain);
-        list_add_tail(&rd->rd_managed_chain, &recovd->recovd_troubled_items);
-        if (rd->rd_phase != RD_IDLE) {
-                CDEBUG(D_HA,
-                       "connection %p to %s failed in recovery: restarting\n",
-                       conn, conn->c_remote_uuid.uuid);
-                /* XXX call callback with PHASE_FAILED? */
-                rd->rd_next_phase = RD_TROUBLED;
-        }
-        rd->rd_phase = RD_TROUBLED;
-        dump_lists(recovd);
-        spin_unlock(&recovd->recovd_lock);
-
-        wake_up(&recovd->recovd_waitq);
-
-        EXIT;
-}
-
-void recovd_conn_fixed(struct ptlrpc_connection *conn)
-{
-        struct recovd_data *rd = &conn->c_recovd_data;
-        ENTRY;
-
-        CDEBUG(D_HA, "connection %p (now to %s) fixed\n",
-               conn, conn->c_remote_uuid.uuid);
-        spin_lock(&rd->rd_recovd->recovd_lock);
-        list_del(&rd->rd_managed_chain);
-        rd->rd_phase = RD_IDLE;
-        rd->rd_next_phase = RD_TROUBLED;
-        list_add(&rd->rd_managed_chain, &rd->rd_recovd->recovd_managed_items);
-        dump_lists(rd->rd_recovd);
-        spin_unlock(&rd->rd_recovd->recovd_lock);
-
-        EXIT;
-}
-
-static int recovd_check_event(struct recovd_obd *recovd)
-{
-        int rc = 0;
-        struct list_head *tmp;
-
-        ENTRY;
-
-        spin_lock(&recovd->recovd_lock);
-
-        if (recovd->recovd_state == RECOVD_STOPPING)
-                GOTO(out, rc = 1);
-
-        list_for_each(tmp, &recovd->recovd_troubled_items) {
-
-                struct recovd_data *rd = list_entry(tmp, struct recovd_data,
-                                                    rd_managed_chain);
-
-                if (rd->rd_phase == rd->rd_next_phase ||
-                    rd->rd_phase == RD_FAILED)
-                        GOTO(out, rc = 1);
-        }
-
- out:
-        spin_unlock(&recovd->recovd_lock);
-        RETURN(rc);
-}
-
-static int recovd_handle_event(struct recovd_obd *recovd)
-{
-        struct list_head *tmp, *n;
-        int rc = 0;
-        ENTRY;
-
-        spin_lock(&recovd->recovd_lock);
-
-        dump_lists(recovd);
-
-        /*
-         * We use _safe here because one of the callbacks, expecially
-         * FAILURE or PREPARED, could move list items around.
-         */
-        list_for_each_safe(tmp, n, &recovd->recovd_troubled_items) {
-                struct recovd_data *rd = list_entry(tmp, struct recovd_data,
-                                                    rd_managed_chain);
-
-                if (rd->rd_phase != RD_FAILED &&
-                    rd->rd_phase != rd->rd_next_phase)
-                        continue;
-
-                switch (rd->rd_phase) {
-                    case RD_FAILED:
-                cb_failed: /* must always reach here with recovd_lock held! */
-                        CERROR("recovery FAILED for rd %p (conn %p): %d\n",
-                               rd, class_rd2conn(rd), rc);
-
-                        spin_unlock(&recovd->recovd_lock);
-                        (void)rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_FAILURE);
-                        spin_lock(&recovd->recovd_lock);
-                        break;
-
-                    case RD_TROUBLED:
-                        if (!rd->rd_recover) {
-                                CERROR("no rd_recover for rd %p (conn %p)\n",
-                                       rd, class_rd2conn(rd));
-                                rc = -EINVAL;
-                                break;
-                        }
-                        CERROR("starting recovery for rd %p (conn %p)\n",
-                               rd, class_rd2conn(rd));
-                        rd->rd_phase = RD_PREPARING;
-                        rd->rd_next_phase = RD_PREPARED;
-
-                        spin_unlock(&recovd->recovd_lock);
-                        rc = rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_PREPARE);
-                        spin_lock(&recovd->recovd_lock);
-                        if (rc)
-                                goto cb_failed;
-
-                        break;
-
-                    case RD_PREPARED:
-
-                        CERROR("recovery prepared for rd %p (conn %p)\n",
-                               rd, class_rd2conn(rd));
-                        rd->rd_phase = RD_RECOVERING;
-                        rd->rd_next_phase = RD_RECOVERED;
-
-                        spin_unlock(&recovd->recovd_lock);
-                        rc = rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_RECOVER);
-                        spin_lock(&recovd->recovd_lock);
-                        if (rc)
-                                goto cb_failed;
-
-                        break;
-
-                    case RD_RECOVERED:
-                        rd->rd_phase = RD_IDLE;
-                        rd->rd_next_phase = RD_TROUBLED;
-
-                        CERROR("recovery complete for rd %p (conn %p)\n",
-                               rd, class_rd2conn(rd));
-                        break;
-
-                    default:
-                        break;
-                }
-        }
-        spin_unlock(&recovd->recovd_lock);
-        RETURN(0);
-}
-
-#ifdef __KERNEL__
-static int recovd_main(void *arg)
-{
-        struct recovd_obd *recovd = (struct recovd_obd *)arg;
-        unsigned long flags;
-        ENTRY;
-
-        lock_kernel();
-        daemonize();
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
-        sigfillset(&current->blocked);
-        recalc_sigpending();
-#else
-        spin_lock_irqsave(&current->sigmask_lock, flags);
-        sigfillset(&current->blocked);
-        recalc_sigpending(current);
-        spin_unlock_irqrestore(&current->sigmask_lock, flags);
-#endif
-
-        sprintf(current->comm, "lustre_recovd");
-        unlock_kernel();
-
-        /* Signal that the thread is running. */
-        recovd->recovd_thread = current;
-        recovd->recovd_state = RECOVD_READY;
-        wake_up(&recovd->recovd_ctl_waitq);
-
-        /* And now, loop forever on requests. */
-        while (1) {
-                wait_event(recovd->recovd_waitq, recovd_check_event(recovd));
-                if (recovd->recovd_state == RECOVD_STOPPING)
-                        break;
-                recovd_handle_event(recovd);
-        }
-
-        recovd->recovd_thread = NULL;
-        recovd->recovd_state = RECOVD_STOPPED;
-        wake_up(&recovd->recovd_ctl_waitq);
-        CDEBUG(D_HA, "mgr exiting process %d\n", current->pid);
-        RETURN(0);
-}
-
-int recovd_setup(struct recovd_obd *recovd)
-{
-        int rc = 0; /* initialize for Liblustre */
-
-        ENTRY;
-
-        INIT_LIST_HEAD(&recovd->recovd_managed_items);
-        INIT_LIST_HEAD(&recovd->recovd_troubled_items);
-        spin_lock_init(&recovd->recovd_lock);
-
-        init_waitqueue_head(&recovd->recovd_waitq);
-        init_waitqueue_head(&recovd->recovd_recovery_waitq);
-        init_waitqueue_head(&recovd->recovd_ctl_waitq);
-
-        rc = kernel_thread(recovd_main, (void *)recovd,
-                           CLONE_VM | CLONE_FS | CLONE_FILES);
-        if (rc < 0) {
-                CERROR("cannot start thread\n");
-                RETURN(-EINVAL);
-        }
-        wait_event(recovd->recovd_ctl_waitq,
-                   recovd->recovd_state == RECOVD_READY);
-
-        ptlrpc_recovd = recovd;
-        class_signal_connection_failure = recovd_conn_fail;
-
-        RETURN(0);
-}
-#else 
-int recovd_setup(struct recovd_obd *recovd)
-{
-        return 0;
-}
-#endif
-
-int recovd_cleanup(struct recovd_obd *recovd)
-{
-        ENTRY;
-        spin_lock(&recovd->recovd_lock);
-        recovd->recovd_state = RECOVD_STOPPING;
-        wake_up(&recovd->recovd_waitq);
-        spin_unlock(&recovd->recovd_lock);
-
-        wait_event(recovd->recovd_ctl_waitq,
-                   (recovd->recovd_state == RECOVD_STOPPED));
-        RETURN(0);
-}
-
-struct recovd_obd *ptlrpc_recovd;
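
The deleted recovd.c above implemented connection recovery as a small phase machine on struct recovd_data, driven by recovd_handle_event() and nudged from outside by recovd_conn_fail() and the recovd NEWCONN/FAILCONN ioctls. As a reading aid for the removal, here is a condensed, self-contained restatement of the success-path transitions that code encoded; the RD_* names mirror the constants above, and "struct rd_state" is a stand-in for the two phase fields of struct recovd_data, not the real type.

/* Condensed restatement of the phase machine in the deleted
 * recovd_handle_event().  A non-zero return from any recovery callback took
 * the cb_failed branch above instead, which invoked the PHASE_FAILURE
 * callback. */
enum rd_phase {
        RD_IDLE, RD_TROUBLED, RD_PREPARING, RD_PREPARED,
        RD_RECOVERING, RD_RECOVERED, RD_FAILED
};

struct rd_state {
        enum rd_phase phase;
        enum rd_phase next_phase;
};

static void rd_advance(struct rd_state *rd)
{
        switch (rd->phase) {
        case RD_TROUBLED:       /* recovd_conn_fail() queued the connection */
                rd->phase = RD_PREPARING;
                rd->next_phase = RD_PREPARED;   /* PREPARE callback runs; an
                                                 * external NEWCONN
                                                 * notification marks it
                                                 * prepared */
                break;
        case RD_PREPARED:
                rd->phase = RD_RECOVERING;
                rd->next_phase = RD_RECOVERED;  /* RECOVER callback runs */
                break;
        case RD_RECOVERED:      /* recovery finished, back to steady state */
                rd->phase = RD_IDLE;
                rd->next_phase = RD_TROUBLED;
                break;
        default:
                break;
        }
}
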
diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c
index a1464a3..a90df0e 100644 (file)
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/kmod.h>
-#else 
+#else
 #include <liblustre.h>
 #endif
 
+#include <linux/obd_support.h>
 #include <linux/lustre_ha.h>
 #include <linux/lustre_net.h>
+#include <linux/lustre_import.h>
+#include <linux/lustre_export.h>
 #include <linux/obd.h>
+#include <linux/obd_class.h>
+#include <linux/obd_lov.h> /* for IOC_LOV_SET_OSC_ACTIVE */
 
-int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc,
+#include "ptlrpc_internal.h"
+
+int ptlrpc_reconnect_import(struct obd_import *imp,
                             struct ptlrpc_request **reqptr)
 {
         struct obd_device *obd = imp->imp_obd;
-        struct client_obd *cli = &obd->u.cli;
-        int size[] = { sizeof(cli->cl_target_uuid), sizeof(obd->obd_uuid) };
-        char *tmp[] = {cli->cl_target_uuid.uuid, obd->obd_uuid.uuid};
+        int flags, rc, size[] = {sizeof(imp->imp_target_uuid),
+                                 sizeof(obd->obd_uuid),
+                                 sizeof(imp->imp_dlm_handle)};
+        char *tmp[] = {imp->imp_target_uuid.uuid,
+                       obd->obd_uuid.uuid,
+                       (char *)&imp->imp_dlm_handle};
         struct ptlrpc_connection *conn = imp->imp_connection;
         struct ptlrpc_request *req;
-        struct obd_export *ldlmexp;
         struct lustre_handle old_hdl;
-        int rc;
 
-        req = ptlrpc_prep_req(imp, rq_opc, 2, size, tmp);
+        spin_lock_irqsave(&imp->imp_lock, flags);
+        imp->imp_generation++;
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+        CERROR("reconnect handle "LPX64"\n", 
+               imp->imp_dlm_handle.cookie);
+
+        req = ptlrpc_prep_req(imp, imp->imp_connect_op, 3, size, tmp);
         if (!req)
                 RETURN(-ENOMEM);
         req->rq_level = LUSTRE_CONN_NEW;
         req->rq_replen = lustre_msg_size(0, NULL);
-        /*
-         * This address is the export that represents our client-side LDLM
-         * service (for ASTs).  We should only have one on this list, so we
-         * just grab the first one.
-         *
-         * XXX tear down export, call class_obd_connect?
-         */
-        ldlmexp = list_entry(obd->obd_exports.next, struct obd_export,
-                             exp_obd_chain);
-        req->rq_reqmsg->addr = (__u64)(unsigned long)ldlmexp;
-        req->rq_reqmsg->cookie = ldlmexp->exp_cookie;
         rc = ptlrpc_queue_wait(req);
         if (rc) {
                 CERROR("cannot connect to %s@%s: rc = %d\n",
-                       cli->cl_target_uuid.uuid, conn->c_remote_uuid.uuid, rc);
+                       imp->imp_target_uuid.uuid, conn->c_remote_uuid.uuid, rc);
                 GOTO(out_disc, rc);
         }
+
         if (lustre_msg_get_op_flags(req->rq_repmsg) & MSG_CONNECT_RECONNECT) {
                 memset(&old_hdl, 0, sizeof(old_hdl));
-                if (!memcmp(&old_hdl.addr, &req->rq_repmsg->addr,
-                            sizeof (old_hdl.addr)) &&
-                    !memcmp(&old_hdl.cookie, &req->rq_repmsg->cookie,
-                            sizeof (old_hdl.cookie))) {
-                        CERROR("%s@%s didn't like our handle "LPX64"/"LPX64
-                               ", failed\n", cli->cl_target_uuid.uuid,
+                if (!memcmp(&old_hdl, &req->rq_repmsg->handle,
+                            sizeof (old_hdl))) {
+                        CERROR("%s@%s didn't like our handle "LPX64
+                               ", failed\n", imp->imp_target_uuid.uuid,
                                conn->c_remote_uuid.uuid,
-                               (__u64)(unsigned long)ldlmexp,
-                               ldlmexp->exp_cookie);
+                               imp->imp_dlm_handle.cookie);
                         GOTO(out_disc, rc = -ENOTCONN);
                 }
 
-                old_hdl.addr = req->rq_repmsg->addr;
-                old_hdl.cookie = req->rq_repmsg->cookie;
-                if (memcmp(&imp->imp_handle, &old_hdl, sizeof(old_hdl))) {
-                        CERROR("%s@%s changed handle from "LPX64"/"LPX64
-                               " to "LPX64"/"LPX64"; "
-                               "copying, but this may foreshadow disaster\n",
-                               cli->cl_target_uuid.uuid, 
+                if (memcmp(&imp->imp_remote_handle, &req->rq_repmsg->handle, 
+                           sizeof(imp->imp_remote_handle))) {
+                        CERROR("%s@%s changed handle from "LPX64" to "LPX64
+                               "; copying, but this may foreshadow disaster\n",
+                               imp->imp_target_uuid.uuid,
                                conn->c_remote_uuid.uuid,
-                               old_hdl.addr, old_hdl.cookie,
-                               imp->imp_handle.addr, imp->imp_handle.cookie);
-                        imp->imp_handle.addr = req->rq_repmsg->addr;
-                        imp->imp_handle.cookie = req->rq_repmsg->cookie;
+                               imp->imp_remote_handle.cookie,
+                               req->rq_repmsg->handle.cookie);
+                        imp->imp_remote_handle = req->rq_repmsg->handle;
                         GOTO(out_disc, rc = 0);
                 }
 
                 CERROR("reconnected to %s@%s after partition\n",
-                       cli->cl_target_uuid.uuid, conn->c_remote_uuid.uuid);
+                       imp->imp_target_uuid.uuid, conn->c_remote_uuid.uuid);
                 GOTO(out_disc, rc = 0);
         }
 
-        old_hdl = imp->imp_handle;
-        imp->imp_handle.addr = req->rq_repmsg->addr;
-        imp->imp_handle.cookie = req->rq_repmsg->cookie;
-        CERROR("reconnected to %s@%s ("LPX64"/"LPX64", was "LPX64"/"
-               LPX64")!\n", cli->cl_target_uuid.uuid, conn->c_remote_uuid.uuid,
-               imp->imp_handle.addr, imp->imp_handle.cookie,
-               old_hdl.addr, old_hdl.cookie);
+        old_hdl = imp->imp_remote_handle;
+        imp->imp_remote_handle = req->rq_repmsg->handle;
+        CERROR("reconnected to %s@%s ("LPX64", was "LPX64")!\n",
+               imp->imp_target_uuid.uuid, conn->c_remote_uuid.uuid,
+               imp->imp_remote_handle.cookie, old_hdl.cookie);
         GOTO(out_disc, rc = 0);
 
  out_disc:
@@ -118,37 +115,62 @@ int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc,
         return rc;
 }
 
-int ptlrpc_run_recovery_upcall(struct ptlrpc_connection *conn)
+void ptlrpc_run_recovery_over_upcall(struct obd_device *obd)
 {
-        char *argv[3];
+        char *argv[4];
         char *envp[3];
         int rc;
 
         ENTRY;
-        argv[0] = obd_recovery_upcall;
-        argv[1] = conn->c_remote_uuid.uuid;
-        argv[2] = NULL;
+        argv[0] = obd_lustre_upcall;
+        argv[1] = "RECOVERY_OVER";
+        argv[2] = obd->obd_uuid.uuid;
+        argv[3] = NULL;
 
         envp[0] = "HOME=/";
         envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
         envp[2] = NULL;
 
-        rc = call_usermodehelper(argv[0], argv, envp);
+        rc = USERMODEHELPER(argv[0], argv, envp);
         if (rc < 0) {
-                CERROR("Error invoking recovery upcall %s for %s: %d\n",
-                       argv[0], argv[1], rc);
-                CERROR("Check /proc/sys/lustre/recovery_upcall?\n");
+                CERROR("Error invoking recovery upcall %s %s %s: %d; check "
+                       "/proc/sys/lustre/upcall\n",                
+                       argv[0], argv[1], argv[2], rc);
+                
         } else {
-                CERROR("Invoked upcall %s for connection %s\n",
-                       argv[0], argv[1]);
+                CERROR("Invoked upcall %s %s %s\n",
+                       argv[0], argv[1], argv[2]);
         }
+}
 
-        /*
-         * We don't want to make this a "failed" recovery, because the system
-         * administrator -- or, perhaps, tester -- may well be able to rescue
-         * things by running the correct upcall.
-         */
-        RETURN(0);
+void ptlrpc_run_failed_import_upcall(struct obd_import* imp)
+{
+        char *argv[6];
+        char *envp[3];
+        int rc;
+
+        ENTRY;
+        argv[0] = obd_lustre_upcall;
+        argv[1] = "FAILED_IMPORT";
+        argv[2] = imp->imp_target_uuid.uuid;
+        argv[3] = imp->imp_obd->obd_uuid.uuid;
+        argv[4] = imp->imp_connection->c_remote_uuid.uuid;
+        argv[5] = NULL;
+
+        envp[0] = "HOME=/";
+        envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+        envp[2] = NULL;
+
+        rc = USERMODEHELPER(argv[0], argv, envp);
+        if (rc < 0) {
+                CERROR("Error invoking recovery upcall %s %s %s %s %s: %d; check "
+                       "/proc/sys/lustre/lustre_upcall\n",                
+                       argv[0], argv[1], argv[2], argv[3], argv[4], rc);
+                
+        } else {
+                CERROR("Invoked upcall %s %s %s %s %s\n",
+                       argv[0], argv[1], argv[2], argv[3], argv[4]);
+        }
 }
 
 int ptlrpc_replay(struct obd_import *imp)
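
ptlrpc_run_failed_import_upcall() above delegates recovery policy to user space: the program named by obd_lustre_upcall is executed with the literal argument "FAILED_IMPORT" followed by the target UUID, the local obd UUID, and the failed connection's UUID. To make that argv contract concrete, here is a hypothetical user-space handler; only the argument layout is taken from the kernel code, and the handler's name and its logging-only action are illustrative.

/* Hypothetical user-space handler for the FAILED_IMPORT upcall.  Only the
 * argv layout (argv[1] == "FAILED_IMPORT", then target UUID, obd UUID,
 * connection UUID) comes from the kernel code above; what a real handler
 * does with it is site policy. */
#include <stdio.h>
#include <string.h>
#include <syslog.h>

int main(int argc, char *argv[])
{
        if (argc < 5 || strcmp(argv[1], "FAILED_IMPORT") != 0) {
                fprintf(stderr, "usage: %s FAILED_IMPORT <target_uuid> "
                        "<obd_uuid> <conn_uuid>\n", argv[0]);
                return 1;
        }

        openlog("lustre-upcall", LOG_PID, LOG_DAEMON);
        syslog(LOG_WARNING, "import of %s (obd %s) via %s failed",
               argv[2], argv[3], argv[4]);
        closelog();

        /* A real handler would now decide whether to retry the same server
         * or fail over to another address and push that decision back into
         * the kernel; that step is deliberately omitted here. */
        return 0;
}
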
@@ -164,119 +186,404 @@ int ptlrpc_replay(struct obd_import *imp)
          * get rid of them now.
          */
         spin_lock_irqsave(&imp->imp_lock, flags);
-
         ptlrpc_free_committed(imp);
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
 
         CDEBUG(D_HA, "import %p from %s has committed "LPD64"\n",
-               imp, imp->imp_obd->u.cli.cl_target_uuid.uuid, committed);
+               imp, imp->imp_target_uuid.uuid, committed);
 
         list_for_each(tmp, &imp->imp_replay_list) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
                 DEBUG_REQ(D_HA, req, "RETAINED: ");
         }
 
+        /* Do I need to hold a lock across this iteration?  We shouldn't be
+         * racing with any additions to the list, because we're in recovery
+         * and are therefore not processing additional requests to add.  Calls
+         * to ptlrpc_free_committed might commit requests, but nothing "newer"
+         * than the one we're replaying (it can't be committed until it's
+         * replayed, and we're doing that here).  l_f_e_safe protects against
+         * problems with the current request being committed, in the unlikely
+         * event of that race.  So, in conclusion, I think that it's safe to 
+         * perform this list-walk without the imp_lock held.
+         *
+         * But, the {mdc,osc}_replay_open callbacks both iterate
+         * request lists, and have comments saying they assume the
+         * imp_lock is being held by ptlrpc_replay, but it's not. it's
+         * just a little race...
+         */
         list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
 
                 DEBUG_REQ(D_HA, req, "REPLAY:");
 
-                /* XXX locking WRT failure during replay? */
                 rc = ptlrpc_replay_req(req);
-
+        
                 if (rc) {
                         CERROR("recovery replay error %d for req "LPD64"\n",
                                rc, req->rq_xid);
-                        GOTO(out, rc);
+                        RETURN(rc);
                 }
         }
 
- out:
-        spin_unlock_irqrestore(&imp->imp_lock, flags);
-        return rc;
+        RETURN(0);
 }
 
-#define NO_RESEND     0 /* No action required. */
-#define RESEND        1 /* Resend required. */
-#define RESEND_IGNORE 2 /* Resend, ignore the reply (already saw it). */
-#define RESTART       3 /* Have to restart the call, sorry! */
+int ptlrpc_resend(struct obd_import *imp)
+{
+        struct list_head *tmp, *pos;
+        struct ptlrpc_request *req;
+        unsigned long flags;
 
-static int resend_type(struct ptlrpc_request *req, __u64 committed)
+        ENTRY;
+
+        /* As long as we're in recovery, nothing should be added to the sending
+         * list, so we don't need to hold the lock during this iteration and
+         * resend process.
+         */
+        spin_lock_irqsave(&imp->imp_lock, flags);
+        LASSERT(imp->imp_level < LUSTRE_CONN_FULL);
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+        list_for_each_safe(tmp, pos, &imp->imp_sending_list) {
+                req = list_entry(tmp, struct ptlrpc_request, rq_list);
+                ptlrpc_resend_req(req);
+        }
+
+        RETURN(0);
+}
+
+void ptlrpc_wake_delayed(struct obd_import *imp)
 {
-        if (req->rq_transno && req->rq_transno < committed) {
-                if (req->rq_flags & PTL_RPC_FL_REPLIED) {
-                        /* Saw the reply and it was committed, no biggie. */
-                        DEBUG_REQ(D_HA, req, "NO_RESEND");
-                        return NO_RESEND;
+        unsigned long flags;
+        struct list_head *tmp, *pos;
+        struct ptlrpc_request *req;
+
+        spin_lock_irqsave(&imp->imp_lock, flags);
+        list_for_each_safe(tmp, pos, &imp->imp_delayed_list) {
+                req = list_entry(tmp, struct ptlrpc_request, rq_list);
+
+                ptlrpc_put_connection(req->rq_connection);
+                req->rq_connection =
+                       ptlrpc_connection_addref(req->rq_import->imp_connection);
+
+                if (req->rq_set) {
+                        DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set);
+                        wake_up(&req->rq_set->set_waitq);
+                } else {
+                        DEBUG_REQ(D_HA, req, "waking:");
+                        wake_up(&req->rq_wait_for_rep);
                 }
-                /* Request committed, but no reply: have to restart. */
-                return RESTART;
         }
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
+}
 
-        if (req->rq_flags & PTL_RPC_FL_REPLIED) {
-                /* Saw reply, so resend and ignore new reply. */
-                return RESEND_IGNORE;
-        }
+inline void ptlrpc_invalidate_import_state(struct obd_import *imp)
+{
+        struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
+        if (ptlrpc_ldlm_namespace_cleanup == NULL)
+                CERROR("ptlrpc/ldlm hook is NULL!  Please tell phil\n");
+        else
+                ptlrpc_ldlm_namespace_cleanup(ns, 1 /* no network ops */);
+        ptlrpc_abort_inflight(imp);
+}
+
+int ptlrpc_request_handle_eviction(struct ptlrpc_request *failed_req)
+{
+        int rc = 0, in_recovery = 0;
+        struct obd_import *imp = failed_req->rq_import;
+        unsigned long flags;
+        struct ptlrpc_request *req;
+
+        spin_lock_irqsave(&imp->imp_lock, flags);
+
+        if (imp->imp_level == LUSTRE_CONN_NOTCONN)
+                in_recovery = 1;
+
+        if (failed_req->rq_import_generation == imp->imp_generation)
+                imp->imp_level = LUSTRE_CONN_NOTCONN;
+        else
+                in_recovery = 1;
 
-        /* Didn't see reply either, so resend. */
-        return RESEND;
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+        if (in_recovery) {
+                ptlrpc_resend_req(failed_req);
+                RETURN(rc);
+        }
 
+        CDEBUG(D_HA, "import %s of %s@%s evicted: reconnecting\n",
+               imp->imp_obd->obd_name,
+               imp->imp_target_uuid.uuid,
+               imp->imp_connection->c_remote_uuid.uuid);
+        rc = ptlrpc_reconnect_import(imp, &req);
+        if (rc) {
+                ptlrpc_resend_req(failed_req);
+                ptlrpc_fail_import(imp, imp->imp_generation);
+        } else {
+                spin_lock_irqsave (&failed_req->rq_lock, flags);
+                failed_req->rq_err = 1;
+                spin_unlock_irqrestore (&failed_req->rq_lock, flags);
+                spin_lock_irqsave(&imp->imp_lock, flags);
+                imp->imp_level = LUSTRE_CONN_FULL;
+                imp->imp_invalid = 0;
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
+                ptlrpc_invalidate_import_state(imp/*, req->rq_import_generation*/);
+        }
+        ptlrpc_req_finished(req);
+        RETURN(rc);
 }
 
-int ptlrpc_resend(struct obd_import *imp)
+int ptlrpc_set_import_active(struct obd_import *imp, int active)
 {
-        int rc = 0;
-        struct list_head *tmp, *pos;
-        struct ptlrpc_request *req;
+        struct obd_device *notify_obd;
         unsigned long flags;
-        __u64 committed = imp->imp_peer_committed_transno;
+        int rc;
+
+        LASSERT(imp->imp_obd);
+
+        notify_obd = imp->imp_obd->u.cli.cl_containing_lov;
+
+        /* When deactivating, mark import invalid, and 
+           abort in-flight requests. */
+        if (!active) {
+                spin_lock_irqsave(&imp->imp_lock, flags);
+                imp->imp_invalid = 1;
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+                ptlrpc_abort_inflight(imp);
+        } 
+
+        imp->imp_invalid = !active;
+
+        if (notify_obd == NULL)
+                GOTO(out, rc = 0);
+
+        /* How gross is _this_? */
+        if (!list_empty(&notify_obd->obd_exports)) {
+                struct lustre_handle fakeconn;
+                struct obd_ioctl_data ioc_data = { 0 };
+                struct obd_export *exp =
+                        list_entry(notify_obd->obd_exports.next,
+                                   struct obd_export, exp_obd_chain);
+
+                fakeconn.cookie = exp->exp_handle.h_cookie;
+                ioc_data.ioc_inlbuf1 = (char *)&imp->imp_target_uuid;
+                ioc_data.ioc_offset = active;
+                rc = obd_iocontrol(IOC_LOV_SET_OSC_ACTIVE, &fakeconn,
+                                   sizeof ioc_data, &ioc_data, NULL);
+                if (rc)
+                        CERROR("error %sabling %s on LOV %p/%s: %d\n",
+                               active ? "en" : "dis",
+                               imp->imp_target_uuid.uuid, notify_obd,
+                               notify_obd->obd_uuid.uuid, rc);
+        } else {
+                CDEBUG(D_HA, "No exports for obd %p/%s, can't notify about "
+                       "%p\n", notify_obd, notify_obd->obd_uuid.uuid,
+                       imp->imp_obd->obd_uuid.uuid);
+                rc = -ENOENT;
+        }
 
+out:
+        /* When activating, mark import valid */
+        if (active) {
+                spin_lock_irqsave(&imp->imp_lock, flags);
+                imp->imp_invalid = 0;
+                spin_unlock_irqrestore(&imp->imp_lock, flags);
+        }
+
+        RETURN(rc);
+}
+
+void ptlrpc_fail_import(struct obd_import *imp, int generation)
+{
+        unsigned long flags;
+        int in_recovery = 0;
         ENTRY;
 
+        LASSERT (!imp->imp_dlm_fake);
+        
+        /* If we were already in recovery, or if the import's connection to its
+         * service is newer than the failing operation's original attempt, then
+         * we don't want to recover again. */
         spin_lock_irqsave(&imp->imp_lock, flags);
-        list_for_each_safe(tmp, pos, &imp->imp_sending_list) {
-                req = list_entry(tmp, struct ptlrpc_request, rq_list);
 
-                switch(resend_type(req, committed)) {
-                    case NO_RESEND:
-                        break;
-
-                    case RESTART:
-                        ptlrpc_restart_req(req);
-                        break;
-
-                    case RESEND_IGNORE:
-                        rc = ptlrpc_replay_req(req);
-                        if (rc) {
-                                DEBUG_REQ(D_ERROR, req, "error %d resending:",
-                                          rc);
-                                ptlrpc_restart_req(req); /* might as well */
-                        }
-                        break;
-
-                    case RESEND:
-                        ptlrpc_resend_req(req);
-                        break;
-
-                    default:
-                        LBUG();
-                }
+        if (imp->imp_level == LUSTRE_CONN_RECOVD)
+                in_recovery = 1;
+
+        if (generation == imp->imp_generation) {
+                imp->imp_level = LUSTRE_CONN_RECOVD;
+                imp->imp_generation++;
+        } else {
+                in_recovery = 1;
         }
 
         spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+        if (in_recovery) {
+                EXIT;
+                return;
+        }
+
+        if (!imp->imp_replayable) {
+                CDEBUG(D_HA,
+                       "import %s@%s for %s not replayable, deactivating\n",
+                       imp->imp_target_uuid.uuid,
+                       imp->imp_connection->c_remote_uuid.uuid,
+                       imp->imp_obd->obd_name);
+                ptlrpc_set_import_active(imp, 0);
+        }
+
+        ptlrpc_run_failed_import_upcall(imp);
+        EXIT;
+}
+
+static int signal_completed_replay(struct obd_import *imp)
+{
+        struct ptlrpc_request *req;
+        int rc;
+        ENTRY;
+
+        req = ptlrpc_prep_req(imp, OBD_PING, 0, NULL, NULL);
+        if (!req)
+                RETURN(-ENOMEM);
+
+        req->rq_replen = lustre_msg_size(0, NULL);
+        req->rq_level = LUSTRE_CONN_RECOVD;
+        req->rq_reqmsg->flags |= MSG_LAST_REPLAY;
+
+        rc = ptlrpc_queue_wait(req);
+
+        ptlrpc_req_finished(req);
         RETURN(rc);
 }
 
-void ptlrpc_wake_delayed(struct obd_import *imp)
+int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid)
 {
+        int msg_flags = 0, rc;
         unsigned long flags;
-        struct list_head *tmp, *pos;
         struct ptlrpc_request *req;
+        ENTRY;
 
         spin_lock_irqsave(&imp->imp_lock, flags);
-        list_for_each_safe(tmp, pos, &imp->imp_delayed_list) {
-                req = list_entry(tmp, struct ptlrpc_request, rq_list);
-                DEBUG_REQ(D_HA, req, "waking:");
-                wake_up(&req->rq_wait_for_rep);
+        if (imp->imp_level == LUSTRE_CONN_FULL) {
+                imp->imp_level = LUSTRE_CONN_RECOVD;
+                imp->imp_generation++;
+        }
+        spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+        if (new_uuid) {
+                struct ptlrpc_connection *conn;
+                struct obd_uuid uuid;
+                struct ptlrpc_peer peer;
+                struct obd_export *dlmexp;
+
+                obd_str2uuid(&uuid, new_uuid);
+                if (ptlrpc_uuid_to_peer(&uuid, &peer)) {
+                        CERROR("no connection found for UUID %s\n", new_uuid);
+                        RETURN(-EINVAL);
+                }
+
+                conn = ptlrpc_get_connection(&peer, &uuid);
+                if (!conn)
+                        RETURN(-ENOMEM);
+
+                CDEBUG(D_HA, "switching import %s/%s from %s to %s\n",
+                       imp->imp_target_uuid.uuid, imp->imp_obd->obd_name,
+                       imp->imp_connection->c_remote_uuid.uuid,
+                       conn->c_remote_uuid.uuid);
+
+                /* Switch the import's connection and the DLM export's
+                 * connection (which are almost certainly the same, but we
+                 * keep distinct refs just to make things clearer, I think.) */
+                if (imp->imp_connection)
+                        ptlrpc_put_connection(imp->imp_connection);
+                /* We hand off the ref from ptlrpc_get_connection. */
+                imp->imp_connection = conn;
+
+                dlmexp = class_conn2export(&imp->imp_dlm_handle);
+                if (dlmexp->exp_connection)
+                        ptlrpc_put_connection(dlmexp->exp_connection);
+                dlmexp->exp_connection = ptlrpc_connection_addref(conn);
+                class_export_put(dlmexp);
+
+        }
+
+        rc = ptlrpc_reconnect_import(imp, &req);
+
+        if (rc) {
+                CERROR("failed to reconnect to %s@%s: %d\n",
+                       imp->imp_target_uuid.uuid,
+                       imp->imp_connection->c_remote_uuid.uuid, rc);
+                RETURN(rc);
+        }
+
+        if (req->rq_repmsg)
+                msg_flags = lustre_msg_get_op_flags(req->rq_repmsg);
+
+        if (msg_flags & MSG_CONNECT_RECOVERING) {
+                CDEBUG(D_HA, "replay requested by %s\n",
+                       imp->imp_target_uuid.uuid);
+                rc = ptlrpc_replay(imp);
+                if (rc)
+                        GOTO(out, rc);
+
+                if (ptlrpc_ldlm_replay_locks == NULL)
+                        CERROR("ptlrpc/ldlm hook is NULL!  Please tell phil\n");
+                else
+                        rc = ptlrpc_ldlm_replay_locks(imp);
+                if (rc)
+                        GOTO(out, rc);
+
+                rc = signal_completed_replay(imp);
+                if (rc)
+                        GOTO(out, rc);
+        } else if (msg_flags & MSG_CONNECT_RECONNECT) {
+                CDEBUG(D_HA, "reconnected to %s@%s\n",
+                       imp->imp_target_uuid.uuid,
+                       imp->imp_connection->c_remote_uuid.uuid);
+        } else {
+                CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
+                       imp->imp_target_uuid.uuid,
+                       imp->imp_connection->c_remote_uuid.uuid);
+                ptlrpc_invalidate_import_state(imp);
         }
+
+        rc = ptlrpc_resend(imp);
+
+        spin_lock_irqsave(&imp->imp_lock, flags);
+        imp->imp_level = LUSTRE_CONN_FULL;
+        imp->imp_invalid = 0;
         spin_unlock_irqrestore(&imp->imp_lock, flags);
+
+        ptlrpc_wake_delayed(imp);
+        EXIT;
+ out:
+        ptlrpc_req_finished(req);
+        return rc;
+}
+
+void ptlrpc_fail_export(struct obd_export *exp)
+{
+        int rc, already_failed;
+        struct lustre_handle hdl;
+        unsigned long flags;
+
+        spin_lock_irqsave(&exp->exp_lock, flags);
+        already_failed = exp->exp_failed;
+        exp->exp_failed = 1;
+        spin_unlock_irqrestore(&exp->exp_lock, flags);
+
+        if (already_failed) {
+                CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n",
+                       exp, exp->exp_client_uuid.uuid);
+                return;
+        }
+
+        CDEBUG(D_HA, "disconnecting export %p/%s\n",
+               exp, exp->exp_client_uuid.uuid);
+        hdl.cookie = exp->exp_handle.h_cookie;
+        rc = obd_disconnect(&hdl, 0);
+        if (rc)
+                CERROR("disconnecting export %p failed: %d\n", exp, rc);
 }
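
A recurring pattern in ptlrpc_fail_import() and ptlrpc_request_handle_eviction() above is the generation guard: a failing request carries the imp_generation it was sent under, and recovery is only started when that generation is still current and the import is not already recovering; any later failure from the same generation is simply queued for resend. The sketch below isolates that pattern with stand-in types; it uses a plain pthread mutex in place of imp_lock and is not the kernel code itself.

/* Stand-alone sketch of the generation guard used above.  "struct import"
 * stands in for the few struct obd_import fields involved; initialize the
 * mutex with PTHREAD_MUTEX_INITIALIZER or pthread_mutex_init() before use. */
#include <pthread.h>

struct import {
        pthread_mutex_t lock;
        int             generation;    /* bumped each time recovery starts */
        int             in_recovery;   /* stands in for LUSTRE_CONN_RECOVD */
};

/* Returns 1 if the caller should start recovery for this failure, 0 if a
 * recovery for this (or a newer) generation is already under way and the
 * failed request should just be resent once it completes. */
static int import_should_recover(struct import *imp, int req_generation)
{
        int start = 0;

        pthread_mutex_lock(&imp->lock);
        if (!imp->in_recovery && req_generation == imp->generation) {
                imp->in_recovery = 1;   /* claim recovery for this failure */
                imp->generation++;      /* later requests belong to the new
                                         * generation and will not re-trigger
                                         * recovery */
                start = 1;
        }
        pthread_mutex_unlock(&imp->lock);
        return start;
}
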
diff --git a/lustre/ptlrpc/rpc.c b/lustre/ptlrpc/rpc.c
deleted file mode 100644 (file)
index c0d5ba5..0000000
+++ /dev/null
@@ -1,312 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#define EXPORT_SYMTAB
-#define DEBUG_SUBSYSTEM S_RPC
-
-#ifdef __KERNEL__
-# include <linux/module.h>
-# include <linux/init.h>
-#else
-# include <liblustre.h>
-#endif
-#include <linux/obd.h>
-#include <linux/obd_support.h>
-#include <linux/obd_class.h>
-#include <linux/lustre_lib.h>
-#include <linux/lustre_ha.h>
-#include <linux/lustre_net.h>
-#include <linux/lprocfs_status.h>
-
-extern int ptlrpc_init_portals(void);
-extern void ptlrpc_exit_portals(void);
-
-static __u32 ptlrpc_last_xid = 0;
-static spinlock_t ptlrpc_last_xid_lock = SPIN_LOCK_UNLOCKED;
-
-__u32 ptlrpc_next_xid(void)
-{
-        __u32 tmp;
-        spin_lock(&ptlrpc_last_xid_lock);
-        tmp = ++ptlrpc_last_xid;
-        spin_unlock(&ptlrpc_last_xid_lock);
-        return tmp;
-}
-
-int connmgr_setup(struct obd_device *obddev, obd_count len, void *buf)
-{
-        struct recovd_obd *recovd = &obddev->u.recovd;
-        int err;
-        ENTRY;
-
-        memset(recovd, 0, sizeof(*recovd));
-
-        err = recovd_setup(recovd);
-        RETURN(err);
-}
-
-int connmgr_cleanup(struct obd_device *dev)
-{
-        struct recovd_obd *recovd = &dev->u.recovd;
-        int err;
-
-        err = recovd_cleanup(recovd);
-        RETURN(err);
-}
-
-int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len,
-                      void *karg, void *uarg)
-{
-        struct ptlrpc_connection *conn = NULL;
-        struct obd_device *obd = class_conn2obd(hdl);
-        struct recovd_obd *recovd = &obd->u.recovd;
-        struct obd_ioctl_data *data = karg;
-        struct list_head *tmp;
-        int rc = 0;
-
-        ENTRY;
-
-        if (cmd != OBD_IOC_RECOVD_NEWCONN && cmd != OBD_IOC_RECOVD_FAILCONN)
-                RETURN(-EINVAL); /* XXX ENOSYS? */
-
-        /* Find the connection that's been rebuilt or has failed. */
-        spin_lock(&recovd->recovd_lock);
-        list_for_each(tmp, &recovd->recovd_troubled_items) {
-                conn = list_entry(tmp, struct ptlrpc_connection,
-                                  c_recovd_data.rd_managed_chain);
-
-                LASSERT(conn->c_recovd_data.rd_recovd == recovd); /* sanity */
-#warning check buffer overflow in next line
-                if (!strcmp(conn->c_remote_uuid.uuid, data->ioc_inlbuf1))
-                        break;
-                conn = NULL;
-        }
-
-        if (!conn) {
-                if (cmd == OBD_IOC_RECOVD_NEWCONN)
-                        GOTO(out, rc = -EINVAL);
-                /* XXX macroize/inline and share with loop above */
-                list_for_each(tmp, &recovd->recovd_managed_items) {
-                        conn = list_entry(tmp, struct ptlrpc_connection,
-                                          c_recovd_data.rd_managed_chain);
-
-                        LASSERT(conn->c_recovd_data.rd_recovd == recovd);
-
-#warning check buffer overflow in next line
-                        if (!strcmp(conn->c_remote_uuid.uuid,
-                                    data->ioc_inlbuf1))
-                                break;
-                        conn = NULL;
-                }
-                if (!conn)
-                        GOTO(out, rc = -EINVAL);
-        }
-
-        if (cmd == OBD_IOC_RECOVD_FAILCONN) {
-                spin_unlock(&recovd->recovd_lock);
-                recovd_conn_fail(conn);
-                spin_lock(&recovd->recovd_lock);
-                goto out;
-        }
-
-
-        /* else (NEWCONN) */
-        spin_lock(&conn->c_lock);
-
-        /* whatever happens, reset the INVALID flag */
-        conn->c_flags &= ~CONN_INVALID;
-
-        /* XXX is this a good check?  should we allow readdressing of
-         * XXX conns that aren't in recovery?
-         */
-        if (conn->c_recovd_data.rd_phase != RD_PREPARING) {
-                spin_unlock(&conn->c_lock);
-                GOTO(out, rc = -EALREADY);
-        }
-
-        if (data->ioc_inllen2) {
-                CERROR("conn %p UUID change %s -> %s\n",
-                       conn, conn->c_remote_uuid.uuid, data->ioc_inlbuf2);
-                obd_str2uuid(&conn->c_remote_uuid, data->ioc_inlbuf2);
-        } else {
-                CERROR("conn %p UUID %s reconnected\n", conn,
-                       conn->c_remote_uuid.uuid);
-        }
-        ptlrpc_readdress_connection(conn, &conn->c_remote_uuid);
-        spin_unlock(&conn->c_lock);
-
-        conn->c_recovd_data.rd_phase = RD_PREPARED;
-        wake_up(&recovd->recovd_waitq);
- out:
-        spin_unlock(&recovd->recovd_lock);
-        RETURN(rc);
-}
-
-static int connmgr_connect(struct lustre_handle *conn, struct obd_device *src,
-                           struct obd_uuid *cluuid, struct recovd_obd *recovd,
-                           ptlrpc_recovery_cb_t recover)
-{
-        return class_connect(conn, src, cluuid);
-}
-
-int connmgr_attach(struct obd_device *dev, obd_count len, void *data)
-{
-        struct lprocfs_static_vars lvars;
-        int rc = 0;
-
-        lprocfs_init_vars(&lvars);
-        rc = lprocfs_obd_attach(dev, lvars.obd_vars);
-        return rc;
-}
-
-int conmgr_detach(struct obd_device *dev)
-{
-        return lprocfs_obd_detach(dev);
-}
-
-/* use obd ops to offer management infrastructure */
-static struct obd_ops recovd_obd_ops = {
-        o_owner:        THIS_MODULE,
-        o_attach:       connmgr_attach,
-        o_detach:       conmgr_detach,
-        o_setup:        connmgr_setup,
-        o_cleanup:      connmgr_cleanup,
-        o_iocontrol:    connmgr_iocontrol,
-        o_connect:      connmgr_connect,
-        o_disconnect:   class_disconnect
-};
-
-
-
-__init int ptlrpc_init(void)
-{
-        struct lprocfs_static_vars lvars;
-        int rc;
-        ENTRY;
-
-        rc = ptlrpc_init_portals();
-        if (rc)
-                RETURN(rc);
-        ptlrpc_init_connection();
-
-        lprocfs_init_vars(&lvars);
-        rc = class_register_type(&recovd_obd_ops, lvars.module_vars,
-                                 LUSTRE_HA_NAME);
-        if (rc)
-                RETURN(rc);
-        ptlrpc_put_connection_superhack = ptlrpc_put_connection;
-        ptlrpc_abort_inflight_superhack = ptlrpc_abort_inflight;
-        RETURN(0);
-}
-
-static void __exit ptlrpc_exit(void)
-{
-        class_unregister_type(LUSTRE_HA_NAME);
-        ptlrpc_exit_portals();
-        ptlrpc_cleanup_connection();
-}
-
-/* rpc.c */
-EXPORT_SYMBOL(ptlrpc_next_xid);
-
-/* recovd.c */
-EXPORT_SYMBOL(ptlrpc_recovd);
-EXPORT_SYMBOL(recovd_conn_fail);
-EXPORT_SYMBOL(recovd_conn_manage);
-EXPORT_SYMBOL(recovd_conn_fixed);
-EXPORT_SYMBOL(recovd_setup);
-EXPORT_SYMBOL(recovd_cleanup);
-
-/* connection.c */
-EXPORT_SYMBOL(ptlrpc_readdress_connection);
-EXPORT_SYMBOL(ptlrpc_get_connection);
-EXPORT_SYMBOL(ptlrpc_put_connection);
-EXPORT_SYMBOL(ptlrpc_connection_addref);
-EXPORT_SYMBOL(ptlrpc_init_connection);
-EXPORT_SYMBOL(ptlrpc_cleanup_connection);
-
-/* niobuf.c */
-EXPORT_SYMBOL(ptlrpc_bulk_put);
-EXPORT_SYMBOL(ptlrpc_bulk_get);
-EXPORT_SYMBOL(ptlrpc_register_bulk_put);
-EXPORT_SYMBOL(ptlrpc_register_bulk_get);
-EXPORT_SYMBOL(ptlrpc_abort_bulk);
-EXPORT_SYMBOL(ptlrpc_reply);
-EXPORT_SYMBOL(ptlrpc_error);
-EXPORT_SYMBOL(ptlrpc_resend_req);
-EXPORT_SYMBOL(ptl_send_rpc);
-EXPORT_SYMBOL(ptlrpc_link_svc_me);
-EXPORT_SYMBOL(obd_brw_set_new);
-EXPORT_SYMBOL(obd_brw_set_add);
-EXPORT_SYMBOL(obd_brw_set_del);
-EXPORT_SYMBOL(obd_brw_set_decref);
-EXPORT_SYMBOL(obd_brw_set_addref);
-
-/* client.c */
-EXPORT_SYMBOL(ptlrpc_init_client);
-EXPORT_SYMBOL(ptlrpc_cleanup_client);
-EXPORT_SYMBOL(ptlrpc_req_to_uuid);
-EXPORT_SYMBOL(ptlrpc_uuid_to_connection);
-EXPORT_SYMBOL(ptlrpc_queue_wait);
-EXPORT_SYMBOL(ptlrpc_continue_req);
-EXPORT_SYMBOL(ptlrpc_replay_req);
-EXPORT_SYMBOL(ptlrpc_restart_req);
-EXPORT_SYMBOL(ptlrpc_prep_req);
-EXPORT_SYMBOL(ptlrpc_free_req);
-EXPORT_SYMBOL(ptlrpc_abort);
-EXPORT_SYMBOL(ptlrpc_req_finished);
-EXPORT_SYMBOL(ptlrpc_request_addref);
-EXPORT_SYMBOL(ptlrpc_prep_bulk);
-EXPORT_SYMBOL(ptlrpc_free_bulk);
-EXPORT_SYMBOL(ptlrpc_prep_bulk_page);
-EXPORT_SYMBOL(ptlrpc_free_bulk_page);
-EXPORT_SYMBOL(ll_brw_sync_wait);
-EXPORT_SYMBOL(ptlrpc_abort_inflight);
-EXPORT_SYMBOL(ptlrpc_retain_replayable_request);
-
-/* service.c */
-EXPORT_SYMBOL(ptlrpc_init_svc);
-EXPORT_SYMBOL(ptlrpc_stop_all_threads);
-EXPORT_SYMBOL(ptlrpc_start_thread);
-EXPORT_SYMBOL(ptlrpc_unregister_service);
-
-/* pack_generic.c */
-EXPORT_SYMBOL(lustre_pack_msg);
-EXPORT_SYMBOL(lustre_msg_size);
-EXPORT_SYMBOL(lustre_unpack_msg);
-EXPORT_SYMBOL(lustre_msg_buf);
-
-/* recover.c */
-EXPORT_SYMBOL(ptlrpc_run_recovery_upcall);
-EXPORT_SYMBOL(ptlrpc_reconnect_import);
-EXPORT_SYMBOL(ptlrpc_replay);
-EXPORT_SYMBOL(ptlrpc_resend);
-EXPORT_SYMBOL(ptlrpc_wake_delayed);
-
-#ifdef __KERNEL__
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Lustre Request Processor");
-MODULE_LICENSE("GPL");
-
-module_init(ptlrpc_init);
-module_exit(ptlrpc_exit);
-#endif
diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c
index 3338445..f9475b0 100644 (file)
@@ -28,6 +28,8 @@
 #include <linux/obd_support.h>
 #include <linux/obd_class.h>
 #include <linux/lustre_net.h>
+#include <portals/types.h>
+#include "ptlrpc_internal.h"
 
 extern int request_in_callback(ptl_event_t *ev);
 
@@ -52,11 +54,10 @@ static int ptlrpc_check_event(struct ptlrpc_service *svc,
                 idx = (svc->srv_interface_rover + i) % ptlrpc_ninterfaces;
                 srv_ni = &svc->srv_interfaces[idx];
 
-                LASSERT (ptl_is_valid_handle (&srv_ni->sni_eq_h));
+                LASSERT (!PtlHandleEqual (srv_ni->sni_eq_h, PTL_HANDLE_NONE));
 
                 rc = PtlEQGet(srv_ni->sni_eq_h, event);
-                switch (rc)
-                {
+                switch (rc) {
                 case PTL_OK:
                         /* next time start with the next interface */
                         svc->srv_interface_rover = (idx+1) % ptlrpc_ninterfaces;
@@ -72,6 +73,7 @@ static int ptlrpc_check_event(struct ptlrpc_service *svc,
                 }
         }
         rc = 0;
+        EXIT;
  out:
         spin_unlock(&svc->srv_lock);
         return rc;
@@ -81,12 +83,10 @@ struct ptlrpc_service *
 ptlrpc_init_svc(__u32 nevents, __u32 nbufs,
                 __u32 bufsize, __u32 max_req_size,
                 int req_portal, int rep_portal,
-                svc_handler_t handler, char *name)
+                svc_handler_t handler, char *name,
+                struct obd_device *obddev)
 {
-        int ssize;
-        int rc;
-        int i;
-        int j;
+        int i, j, ssize, rc;
         struct ptlrpc_service *service;
         struct ptlrpc_srv_ni  *srv_ni;
         ENTRY;
@@ -118,7 +118,7 @@ ptlrpc_init_svc(__u32 nevents, __u32 nbufs,
 
                 srv_ni->sni_service = service;
                 srv_ni->sni_ni = &ptlrpc_interfaces[i];
-                ptl_set_inv_handle (&srv_ni->sni_eq_h);
+                srv_ni->sni_eq_h = PTL_HANDLE_NONE;
                 INIT_LIST_HEAD(&srv_ni->sni_rqbds);
                 srv_ni->sni_nrqbds = 0;
                 atomic_set(&srv_ni->sni_nrqbds_receiving, 0);
@@ -152,7 +152,7 @@ ptlrpc_init_svc(__u32 nevents, __u32 nbufs,
                         }
 
                         rqbd->rqbd_srv_ni = srv_ni;
-                        ptl_set_inv_handle(&rqbd->rqbd_me_h);
+                        rqbd->rqbd_me_h = PTL_HANDLE_NONE;
                         atomic_set(&rqbd->rqbd_refcount, 0);
 
                         OBD_ALLOC(rqbd->rqbd_buffer, service->srv_buf_size);
@@ -171,6 +171,8 @@ ptlrpc_init_svc(__u32 nevents, __u32 nbufs,
                 }
         }
 
+        ptlrpc_lprocfs_register_service(obddev, service);
+
         CDEBUG(D_NET, "%s: Started on %d interfaces, listening on portal %d\n",
                service->srv_name, ptlrpc_ninterfaces, service->srv_req_portal);
 
@@ -192,12 +194,13 @@ static int handle_incoming_request(struct obd_device *obddev,
          * on the stack of mds_handle instead. */
 
         LASSERT (atomic_read (&rqbd->rqbd_refcount) > 0);
-        LASSERT ((event->mem_desc.options & PTL_MD_IOV) == 0);
+        LASSERT ((event->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0);
         LASSERT (rqbd->rqbd_srv_ni->sni_service == svc);
         LASSERT (rqbd->rqbd_buffer == event->mem_desc.start);
         LASSERT (event->offset + event->mlength <= svc->srv_buf_size);
 
         memset(request, 0, sizeof(*request));
+        spin_lock_init (&request->rq_lock);
         INIT_LIST_HEAD(&request->rq_list);
         request->rq_svc = svc;
         request->rq_obd = obddev;
@@ -205,55 +208,37 @@ static int handle_incoming_request(struct obd_device *obddev,
         request->rq_reqmsg = event->mem_desc.start + event->offset;
         request->rq_reqlen = event->mlength;
 
-        rc = -EINVAL;
-
-        if (request->rq_reqlen < sizeof(struct lustre_msg)) {
-                CERROR("incomplete request (%d): ptl %d from "LPX64" xid "
-                       LPU64"\n",
-                       request->rq_reqlen, svc->srv_req_portal,
+#if SWAB_PARANOIA
+        /* Clear request swab mask; this is a new request */
+        request->rq_req_swab_mask = 0;
+#endif
+        rc = lustre_unpack_msg (request->rq_reqmsg, request->rq_reqlen);
+        if (rc != 0) {
+                CERROR ("error unpacking request: ptl %d from "LPX64
+                        " xid "LPU64"\n", svc->srv_req_portal,
                        event->initiator.nid, request->rq_xid);
                 goto out;
         }
-
-        CDEBUG(D_RPCTRACE, "Handling RPC ni:pid:xid:nid:opc %d:%d:"LPU64":"
-               LPX64":%d\n", (int)(rqbd->rqbd_srv_ni - svc->srv_interfaces),
-               NTOH__u32(request->rq_reqmsg->status), request->rq_xid,
-               event->initiator.nid, NTOH__u32(request->rq_reqmsg->opc));
-
-        if (NTOH__u32(request->rq_reqmsg->type) != PTL_RPC_MSG_REQUEST) {
+        rc = -EINVAL;
+        if (request->rq_reqmsg->type != PTL_RPC_MSG_REQUEST) {
                 CERROR("wrong packet type received (type=%u)\n",
                        request->rq_reqmsg->type);
                 goto out;
         }
 
-        if (request->rq_reqmsg->magic != PTLRPC_MSG_MAGIC) {
-                CERROR("wrong lustre_msg magic %d: ptl %d from "LPX64" xid "
-                       LPD64"\n",
-                       request->rq_reqmsg->magic, svc->srv_req_portal,
-                       event->initiator.nid, request->rq_xid);
-                goto out;
-        }
-
-        if (request->rq_reqmsg->version != PTLRPC_MSG_VERSION) {
-                CERROR("wrong lustre_msg version %d: ptl %d from "LPX64" xid "
-                       LPD64"\n",
-                       request->rq_reqmsg->version, svc->srv_req_portal,
-                       event->initiator.nid, request->rq_xid);
-                goto out;
-        }
-
         CDEBUG(D_NET, "got req "LPD64" (md: %p + %d)\n", request->rq_xid,
                event->mem_desc.start, event->offset);
 
         request->rq_peer.peer_nid = event->initiator.nid;
         request->rq_peer.peer_ni = rqbd->rqbd_srv_ni->sni_ni;
 
-        request->rq_export = class_conn2export((struct lustre_handle *)
-                                               request->rq_reqmsg);
+        request->rq_export = class_conn2export(&request->rq_reqmsg->handle);
 
         if (request->rq_export) {
                 request->rq_connection = request->rq_export->exp_connection;
                 ptlrpc_connection_addref(request->rq_connection);
+                request->rq_export->exp_last_request_time =
+                        LTIME_S(CURRENT_TIME);
         } else {
                 /* create a (hopefully temporary) connection that will be used
                  * to send the reply if this call doesn't create an export.
@@ -262,8 +247,28 @@ static int handle_incoming_request(struct obd_device *obddev,
                         ptlrpc_get_connection(&request->rq_peer, NULL);
         }
 
+        CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid:pid:xid:ni:nid:opc %s:%s:%d:"
+               LPU64":%s:"LPX64":%d\n",
+               current->comm,
+               (request->rq_export ? 
+                (char *)request->rq_export->exp_client_uuid.uuid : "0"), 
+               request->rq_reqmsg->status, request->rq_xid,
+               rqbd->rqbd_srv_ni->sni_ni->pni_name, event->initiator.nid,
+               request->rq_reqmsg->opc);
+
         rc = svc->srv_handler(request);
+        CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid:pid:xid:ni:nid:opc %s:%s:%d:"
+               LPU64":%s:"LPX64":%d\n",
+               current->comm,
+               (request->rq_export ? 
+                (char *)request->rq_export->exp_client_uuid.uuid : "0"),
+               request->rq_reqmsg->status, request->rq_xid,
+               rqbd->rqbd_srv_ni->sni_ni->pni_name, event->initiator.nid,
+               request->rq_reqmsg->opc);
+
         ptlrpc_put_connection(request->rq_connection);
+        if (request->rq_export != NULL)
+                class_export_put(request->rq_export);
 
  out:
         if (atomic_dec_and_test (&rqbd->rqbd_refcount)) /* last reference? */
@@ -272,8 +277,8 @@ static int handle_incoming_request(struct obd_device *obddev,
         return rc;
 }
 
-/* Don't use daemonize, it removes fs struct from new thread  (bug 418) */
-static void ptlrpc_daemonize(void)
+/* Don't use daemonize, it removes fs struct from new thread (bug 418) */
+void ptlrpc_daemonize(void)
 {
         exit_mm(current);
 
@@ -295,25 +300,23 @@ static int ptlrpc_main(void *arg)
         ptl_event_t *event;
         int rc = 0;
         unsigned long flags;
+        cycles_t workdone_time;
+        cycles_t svc_workcycles;
         ENTRY;
 
         lock_kernel();
         ptlrpc_daemonize();
 
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
+        SIGNAL_MASK_LOCK(current, flags);
         sigfillset(&current->blocked);
-        recalc_sigpending();
-#else
-        spin_lock_irqsave(&current->sigmask_lock, flags);
-        sigfillset(&current->blocked);
-        recalc_sigpending(current);
-        spin_unlock_irqrestore(&current->sigmask_lock, flags);
-#endif
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
 
-#ifdef __arch_um__
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20))
         sprintf(current->comm, "%s|%d", data->name,current->thread.extern_pid);
-#endif
+#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        sprintf(current->comm, "%s|%d", data->name,
+                current->thread.mode.tt.extern_pid);
 #else
         strcpy(current->comm, data->name);
 #endif
@@ -328,6 +331,7 @@ static int ptlrpc_main(void *arg)
 
         /* Record that the thread is running */
         thread->t_flags = SVC_RUNNING;
+        svc_workcycles = workdone_time = 0;
         wake_up(&thread->t_ctl_waitq);
 
         /* XXX maintain a list of all managed devices: insert here */
@@ -348,12 +352,43 @@ static int ptlrpc_main(void *arg)
                 }
 
                 if (thread->t_flags & SVC_EVENT) {
+                        cycles_t  workstart_time;
                         spin_lock(&svc->srv_lock);
                         thread->t_flags &= ~SVC_EVENT;
+                        /* Update Service Statistics */
+                        workstart_time = get_cycles();
+                        if (workdone_time && (svc->svc_counters != NULL)) {
+                                /* Stats for req(n) are updated just before
+                                 * req(n+1) is executed. This avoids need to
+                                 * reacquire svc->srv_lock after
+                                 * call to handling_request().
+                                 */
+                                int opc_offset;
+                                /* req_waittime */
+                                LPROCFS_COUNTER_INCR(&svc->svc_counters->cntr[PTLRPC_REQWAIT_CNTR],
+                                                     (workstart_time -
+                                                      event->arrival_time));
+                                /* svc_eqdepth */
+                                LPROCFS_COUNTER_INCR(&svc->svc_counters->cntr[PTLRPC_SVCEQDEPTH_CNTR],
+                                                     0); /* Wait for b_eq branch */
+                                /* svc_idletime */
+                                LPROCFS_COUNTER_INCR(&svc->svc_counters->cntr[PTLRPC_SVCIDLETIME_CNTR],
+                                                     (workstart_time -
+                                                      workdone_time));
+                                /* previous request */
+                                opc_offset = 
+                                        opcode_offset(request->rq_reqmsg->opc);
+                                if (opc_offset >= 0) {
+                                        LASSERT(opc_offset < LUSTRE_MAX_OPCODES);
+                                        LPROCFS_COUNTER_INCR(&svc->svc_counters->cntr[PTLRPC_LAST_CNTR+opc_offset], svc_workcycles);
+                                }
+                        }
                         spin_unlock(&svc->srv_lock);
 
                         rc = handle_incoming_request(obddev, svc, event,
                                                      request);
+                        workdone_time = get_cycles();
+                        svc_workcycles = workdone_time - workstart_time;
                         continue;
                 }
 
@@ -363,6 +398,10 @@ static int ptlrpc_main(void *arg)
                 break;
         }
 
+        /* NB should wait for all SENT callbacks to complete before exiting
+         * here.  Unfortunately at this time there is no way to track this
+         * state.
+         */
         OBD_FREE(request, sizeof(*request));
 out_event:
         OBD_FREE(event, sizeof(*event));
@@ -415,10 +454,8 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
         ENTRY;
 
         OBD_ALLOC(thread, sizeof(*thread));
-        if (thread == NULL) {
-                LBUG();
+        if (thread == NULL)
                 RETURN(-ENOMEM);
-        }
         init_waitqueue_head(&thread->t_ctl_waitq);
 
         d.dev = dev;
@@ -433,9 +470,9 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
         /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we
          * just drop the VM and FILES in ptlrpc_daemonize() right away.
          */
-        rc = kernel_thread(ptlrpc_main, (void *) &d, CLONE_VM | CLONE_FILES);
+        rc = kernel_thread(ptlrpc_main, &d, CLONE_VM | CLONE_FILES);
         if (rc < 0) {
-                CERROR("cannot start thread\n");
+                CERROR("cannot start thread: %d\n", rc);
                 OBD_FREE(thread, sizeof(*thread));
                 RETURN(rc);
         }
@@ -446,8 +483,7 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc,
 
 int ptlrpc_unregister_service(struct ptlrpc_service *service)
 {
-        int i;
-        int rc;
+        int i, rc;
         struct ptlrpc_srv_ni *srv_ni;
 
         LASSERT (list_empty (&service->srv_threads));
@@ -490,7 +526,7 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service)
 
                 LASSERT (srv_ni->sni_nrqbds == 0);
 
-                if (ptl_is_valid_handle (&srv_ni->sni_eq_h)) {
+                if (!PtlHandleEqual (srv_ni->sni_eq_h, PTL_HANDLE_NONE)) {
                         rc = PtlEQFree(srv_ni->sni_eq_h);
                         if (rc)
                                 CERROR("%s.%d: PtlEQFree failed on %s: %d\n",
@@ -499,6 +535,8 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service)
                 }
         }
 
+        ptlrpc_lprocfs_unregister_service(service);
+
         OBD_FREE(service,
                  offsetof (struct ptlrpc_service,
                            srv_interfaces[ptlrpc_ninterfaces]));
diff --git a/lustre/scripts/llite-group.sh b/lustre/scripts/llite-group.sh
new file mode 100644 (file)
index 0000000..ed914e8
--- /dev/null
@@ -0,0 +1,67 @@
+#!/bin/sh
+#
+# llite-group.sh : Cluster Manager service script for Lustre
+#
+# This must be named llite-<group>.sh, where group is the device 
+# group that is being managed by the cluster manager service.
+#
+
+set -e
+set -vx
+
+[ -f ${LUSTRE_CFG:=/etc/lustre/lustre.cfg} ] && . ${LUSTRE_CFG}
+
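+# LDAPURL, CONFIG, LACTIVE, LCONF, and SERVICE may be supplied by ${LUSTRE_CFG};
+# the assignments below are only fallback defaults.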
+LDAPURL=${LDAPURL:-ldap://localhost}
+CONFIG=${CONFIG:-test23}
+
+LACTIVE=${LACTIVE:-/usr/sbin/lactive}
+LCONF=${LCONF:-/usr/sbin/lconf}
+
+group=`basename $0 .sh| cut -d- -f2`
+confopt="--ldapurl $LDAPURL --config $CONFIG"
+
+[ -z "$group" ] && exit 0
+
+node=`hostname -s`
+
+[ -d ${STATUS_DIR:=/var/lustre} ] || mkdir -p $STATUS_DIR
+
+start() {
+        echo -n "Starting $SERVICE: "
+       python2 $LACTIVE $confopt --group $group --active $node
+        python2 $LCONF -v $confopt
+        RETVAL=$?
+       echo done
+}
+
+stop() {
+        echo -n "Shutting down $SERVICE: "
+        python2 $LCONF -v --cleanup --force --failover $confopt
+        RETVAL=$?
+        echo done
+}
+
+status() {
+        RETVAL=0
+}
+
+restart() {
+        stop
+        start
+}
+
+case "$1" in
+  start)
+       start
+       ;;
+  stop)
+       stop
+       ;;
+  restart)
+       restart
+       ;;
+  status)
+       status $SERVICE
+       ;;
+  *)
+       echo "Usage: $0 {start|stop|restart|status}"
+       exit 1
+esac
+
+exit $RETVAL
index 3657c7a..40e627d 100644 (file)
@@ -1,10 +1,8 @@
 # lustre.spec
-%define version HEAD
+%define version b_devel
 %define kversion @RELEASE@
 %define linuxdir @LINUX@
-%define portalsdir @PORTALS@
-%define portalslibdir @PORTALSLIB@
-Release: 0302240920chaos
+Release: 0305281701chaos
 
 Summary: Lustre Lite File System
 Name: lustre-lite
@@ -21,7 +19,7 @@ servers and utilities.
 
 %package -n lustre-modules
 Summary: Kernel Lustre drivers for Linux %{kversion}
-Requires: portals-modules
+Requires: modutils >= 2.4.10
 Group: Development/Kernel
 
 %description -n lustre-modules
@@ -59,7 +57,6 @@ Group: Development/Kernel
 %description -n liblustre
 Lustre lib binary package.
 
-
 %prep
 %setup -qn lustre-%{version}
 %setup -c -n lustre-%{version}-lib
@@ -69,12 +66,12 @@ rm -rf $RPM_BUILD_ROOT
 
 # Set an explicit path to our Linux tree, if we can.
 cd $RPM_BUILD_DIR/lustre-%{version}
-./configure --with-linux='%{linuxdir}' --with-portals='%{portalsdir}' --with-portalslib='%{portalslibdir}'
+./configure --with-linux='%{linuxdir}' 
 make
 
 %ifarch i386
 cd $RPM_BUILD_DIR/lustre-%{version}-lib/lustre-%{version}
-./configure --with-lib --with-portals='%{portalsdir}' --with-portalslib='%{portalslibdir}'
+./configure --with-lib 
 make
 %endif
 
@@ -87,6 +84,17 @@ cd $RPM_BUILD_DIR/lustre-%{version}-lib/lustre-%{version}
 make install prefix=$RPM_BUILD_ROOT
 %endif
 
+%ifarch alpha
+# this hurts me
+  conf_flag=
+  linuxdir=%{linuxdir}
+  test -d $linuxdir && conf_flag=--with-linux=$linuxdir
+  make clean
+  ./configure --enable-rtscts-myrinet $conf_flag
+  make
+  cp linux/rtscts/rtscts.o $RPM_BUILD_ROOT/lib/modules/%{kversion}/kernel/net/lustre/rtscts_myrinet.o
+  cp user/myrinet_utils/mcpload $RPM_BUILD_ROOT/usr/sbin/mcpload
+%endif
 
 # Create the pristine source directory.
 cd $RPM_BUILD_DIR/lustre-%{version}
@@ -107,6 +115,7 @@ mkdir -p $RPM_BUILD_ROOT/var/lib/ldap/lustre
 %attr(-, root, root) /usr/sbin/lstripe
 %attr(-, root, root) /usr/sbin/mcreate
 %attr(-, root, root) /usr/sbin/mkdirmany
+%attr(-, root, root) /usr/lib/lustre/python/*
 %attr(-, root, root) /usr/lib/lustre/examples/llmount.sh
 %attr(-, root, root) /usr/lib/lustre/examples/llmountcleanup.sh
 %attr(-, root, root) /usr/lib/lustre/examples/llecho.sh
@@ -114,9 +123,19 @@ mkdir -p $RPM_BUILD_ROOT/var/lib/ldap/lustre
 %attr(-, root, root) /usr/lib/lustre/examples/uml.sh
 %attr(-, root, root) /usr/lib/lustre/examples/lov.sh
 %attr(-, root, root) /etc/init.d/lustre
+%attr(-, root, root) /usr/sbin/acceptor
+%attr(-, root, root) /usr/sbin/ptlctl
+%attr(-, root, root) /usr/sbin/debugctl
+%attr(-, root, root) /lib/libportals.a
+%attr(-, root, root) /lib/libptlctl.a
+%attr(-, root, root) /lib/libtcpnal.a
+%attr(-, root, root) /usr/include/lustre/*.h
+%ifarch alpha
+%attr(-, root, root) /usr/sbin/mcpload
+%endif
 
 %files -n lustre-doc
-%attr(-, root, root) %doc COPYING FDL
+#%attr(-, root, root) %doc COPYING FDL
 %attr(-, root, root) %doc doc/lustre.pdf doc/lustre-HOWTO.txt
 %attr(-, root, root) %doc tests/client-echo.cfg tests/client-mount.cfg
 %attr(-, root, root) %doc tests/client-mount2.cfg
@@ -128,12 +147,11 @@ mkdir -p $RPM_BUILD_ROOT/var/lib/ldap/lustre
 
 %files -n lustre-modules
 %attr(-, root, root) %doc COPYING
-%attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/extN.o
 %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/ldlm.o
 %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/llite.o
 %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/mdc.o
 %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/mds.o
-%attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/fsfilt_extN.o
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/fsfilt_ext3.o
 %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/obdclass.o
 %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/obdecho.o
 %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/obdfilter.o
@@ -141,6 +159,14 @@ mkdir -p $RPM_BUILD_ROOT/var/lib/ldap/lustre
 %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/osc.o
 %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/ost.o
 %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/ptlrpc.o
+#portals modules
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/lustre/kptlrouter.o
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/lustre/*nal.o
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/lustre/portals.o
+%ifarch alpha
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/lustre/p3mod.o
+%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/lustre/rtscts.o
+%endif
 
 %files -n lustre-source
 %attr(-, root, root) /usr/src/lustre-%{version}
@@ -179,6 +205,9 @@ mkdir -p $RPM_BUILD_ROOT/var/lib/ldap/lustre
 if [ ! -e /dev/obd ]; then
    mknod /dev/obd c 10 241
 fi
+if [ ! -e /dev/portals ]; then
+   mknod /dev/portals c 10 240
+fi
 depmod -ae || exit 0
 
 grep -q obdclass /etc/modules.conf || \
@@ -190,6 +219,12 @@ grep -q '/dev/obd' /etc/modules.conf || \
 grep -q '/dev/lustre' /etc/modules.conf || \
        echo 'alias /dev/lustre obdclass' >> /etc/modules.conf
 
+grep -q portals /etc/modules.conf || \
+        echo 'alias char-major-10-240 portals' >> /etc/modules.conf
+
+grep -q '/dev/portals' /etc/modules.conf || \
+        echo 'alias /dev/portals portals' >> /etc/modules.conf
+
 %postun
 depmod -ae || exit 0
 
@@ -206,7 +241,6 @@ if grep -q slapd-lustre $slapd; then
    cp $tmp $slapd
    rm $tmp
 fi
-
 %clean
 #rm -rf $RPM_BUILD_ROOT
 
index 3575b87..f33443f 100644 (file)
@@ -59,6 +59,9 @@ sub get_latest_mtime()
             $cur_dir =~ s/\/CVS\/Entries$//;
             my @statbuf = stat("$cur_dir/$file");
             my $mtime = $statbuf[9];
+            if (!defined($mtime)) {
+                next;
+            }
             my $local_date = gmtime($mtime);
             if ($local_date ne $date &&
                 $file ne "lustre.spec.in") {
@@ -100,7 +103,7 @@ sub get_linuxdir()
     }
     while (defined($line = <$config>)) {
         chomp($line);
-        if ($line =~ /LINUX = (.*)/) {
+        if ($line =~ /LINUX :?= (.*)/) {
             $dir = $1;
             last;
         }
index 7a18486..5bb1e26 100644 (file)
@@ -35,3 +35,9 @@ wantedi
 createtest
 open_delay
 statone
+opendevunlink
+opendirunlink
+runas
+openfile
+unlinkmany
+fchdir_test
index 6d23b3d..470c9de 100644 (file)
@@ -1,32 +1,22 @@
 # Lustre test Makefile
 DEFS=
-CPPFLAGS = -I. -I$(PORTALS)/include -I$(top_srcdir)/include -D_LARGEFILE64_SOURCE
+CPPFLAGS = -I. -I$(top_srcdir)/portals/include/ -I$(top_srcdir)/include -D_LARGEFILE64_SOURCE
 CFLAGS := -g -Wall
 # LDADD = -lldap
 # LDADD := -lreadline -ltermcap # -lefence
 EXTRA_DIST = $(pkgexample_SCRIPTS) $(noinst_SCRIPTS) $(noinst_DATA) \
-       common.sh lustre.cfg \
-       client-echo.cfg    elan-server.cfg  net-client.cfg  obdecho.cfg \
-       client-mount.cfg   ldlm.cfg         net-local.cfg   obdfilter.cfg \
-       client-mount2.cfg  lustre.cfg       net-server.cfg  sanity.sh \
-       rundbench          mcreate \
-       elan-client.cfg    mds.cfg      trivial.sh
-pkgexampledir = '${exec_prefix}/usr/lib/$(PACKAGE)/examples'
+       sanity.sh          rundbench    mcreate
 pkgexample_SCRIPTS = llmount.sh llmountcleanup.sh llecho.sh llechocleanup.sh local.sh echo.sh uml.sh lov.sh
-noinst_SCRIPTS = llsetup.sh llrsetup.sh llcleanup.sh
-noinst_DATA = lustre.cfg
-noinst_SCRIPTS += fs.sh intent-test.sh intent-test2.sh leak_finder.pl \
-       lldlm.sh llecho.sh llext3.sh llmodules.sh llmount-client.sh \
-       llmount-server.sh llmount.sh llmountcleanup.sh llrext3.sh \
-       llrmount.sh llsimple.sh  mdcreq.sh mdcreqcleanup.sh \
-       ostreq.sh runfailure-client-mds-recover.sh runfailure-mds \
-       runfailure-net runfailure-ost runiozone runregression-net.sh \
-       runtests runvmstat snaprun.sh tbox.sh  common.sh
+noinst_DATA =
+noinst_SCRIPTS = leak_finder.pl llecho.sh llmount.sh llmountcleanup.sh tbox.sh \
+       llrmount.sh runfailure-mds runvmstat runfailure-net runfailure-ost \
+       runiozone runregression-net.sh runtests sanity.sh rundbench
 noinst_PROGRAMS = openunlink testreq truncate directio openme writeme open_delay
 noinst_PROGRAMS += munlink tchmod toexcl fsx test_brw openclose createdestroy
 noinst_PROGRAMS += stat createmany statmany multifstat createtest mlink
+noinst_PROGRAMS += opendirunlink opendevunlink unlinkmany fchdir_test
 # noinst_PROGRAMS += ldaptest 
-noinst_PROGRAMS += checkstat wantedi statone runas
+noinst_PROGRAMS += checkstat wantedi statone runas openfile 
 sbin_PROGRAMS = mcreate mkdirmany
 
 # ldaptest_SOURCES = ldaptest.c
@@ -48,13 +38,21 @@ createdestroy_SOURCES = createdestroy.c
 stat_SOURCES = stat.c
 createmany_SOURCES = createmany.c
 statmany_SOURCES = statmany.c
+unlinkmany_SOURCES = unlinkmany.c
 statone_SOURCES = statone.c
 mkdirmany_SOURCES = mkdirmany.c
 multifstat_SOURCES = multifstat.c
 checkstat_SOURCES = checkstat.c
 runas_SOURCES = runas.c
+openfile_SOURCES = openfile.c
 wantedi_SOURCES = wantedi.c
 createtest_SOURCES = createtest.c
 open_delay_SOURCES = open_delay.c
+opendirunlink_SOURCES=opendirunlink.c
+opendevunlink_SOURCES=opendevunlink.c
+fchdir_test_SOURCES=fchdir_test.c
+#mkdirdeep_SOURCES= mkdirdeep.c
+#mkdirdeep_LDADD=-L../portals/util -lptlctl
+#mkdirdeep_CPPFLAGS=-I$(top_srcdir)/portals/include
 
 include $(top_srcdir)/Rules
index bee6588..e874f5d 100755 (executable)
@@ -18,11 +18,12 @@ fi
 [ "$TMP" ] || TMP=/tmp
 [ "$COUNT" ] || COUNT=1000
 [ "$DEBUG_OFF" ] || DEBUG_OFF="eval echo 0 > /proc/sys/portals/debug"
+[ "$DEBUG_ON" ] || DEBUG_ON="eval echo -1 > /proc/sys/portals/debug"
 
 for NAME in $CONFIGS; do
        export NAME
        [ -e $NAME.sh ] && sh $NAME.sh
-       [ ! -e $NAME.xml ] && echo "no config '$NAME.xml'" 1>&2 && exit 1
+       [ ! -e $NAME.xml ] && [ -z "$LDAPURL" ] && echo "no config '$NAME.xml'" 1>&2 && exit 1
 
        if [ "$RUNTESTS" != "no" ]; then
                sh runtests
@@ -39,11 +40,13 @@ for NAME in $CONFIGS; do
 
                $DEBUG_OFF
                sh rundbench 1
+               $DEBUG_ON
                sh llmountcleanup.sh
                sh llrmount.sh
                if [ $DB_THREADS -gt 1 ]; then
                        $DEBUG_OFF
                        sh rundbench $DB_THREADS
+                       $DEBUG_ON
                        sh llmountcleanup.sh
                        sh llrmount.sh
                fi
@@ -54,6 +57,7 @@ for NAME in $CONFIGS; do
                mount | grep $MNT || sh llmount.sh
                $DEBUG_OFF
                bonnie++ -s 0 -n 10 -u $UID -d $MNT
+               $DEBUG_ON
                sh llmountcleanup.sh
                sh llrmount.sh
        fi
@@ -63,6 +67,7 @@ for NAME in $CONFIGS; do
                mount | grep $MNT || sh llmount.sh
                $DEBUG_OFF
                iozone $IOZONE_OPTS $IOZONE_FILE
+               $DEBUG_ON
                sh llmountcleanup.sh
                sh llrmount.sh
        fi
@@ -75,6 +80,7 @@ for NAME in $CONFIGS; do
                $DEBUG_OFF
                iozone -I $IOZONE_OPTS $IOZONE_FILE.odir
                IOZVER=`iozone -v | awk '/Revision:/ { print $3 }' | tr -d '.'`
+               $DEBUG_ON
                sh llmountcleanup.sh
                sh llrmount.sh
                if [ "$IOZ_THREADS" -gt 1 -a "$IOZVER" -ge 3145 ]; then
@@ -86,6 +92,7 @@ for NAME in $CONFIGS; do
                                THREAD=`expr $THREAD + 1`
                        done
                        iozone -I $IOZONE_OPTS -t $IOZ_THREADS $IOZONE_FILE
+                       $DEBUG_ON
                        sh llmountcleanup.sh
                        sh llrmount.sh
                elif [ $IOZVER -lt 3145 ]; then
@@ -97,6 +104,7 @@ for NAME in $CONFIGS; do
                mount | grep $MNT || sh llmount.sh
                $DEBUG_OFF
                ./fsx -W -c 50 -p 1000 -P $TMP -l 1024000 -N $(($COUNT * 100)) $MNT/fsxfile
+               $DEBUG_ON
                sh llmountcleanup.sh
                #sh llrmount.sh
        fi      
index c0427fd..b28c5f4 100644 (file)
@@ -7,7 +7,7 @@ LMC="save_cmd"
 
 TCPBUF=1048576
 OST=${OST:-ba-ost-1}
-CLIENT=`hostname`
+CLIENT=${CLIENT:-`hostname`}
 
 UUIDLIST=${UUIDLIST:-/usr/local/admin/ba-ost/UUID.txt}
 
index f09fde9..c98d6aa 100644 (file)
@@ -215,7 +215,8 @@ main (int argc, char **argv)
                        }
                        else
                        {
-                               fprintf (stderr, "Can't parse file type %s\n", type);
+                               fprintf (stderr, "Can't parse file type %s\n",
+                                        type);
                                return (1);
                        }
 
@@ -229,7 +230,8 @@ main (int argc, char **argv)
                        {
                                if (verbose)
                                        printf ("%s has perms 0%o, not 0%o\n",
-                                               fname, (buf.st_mode & ~S_IFMT), perms);
+                                               fname, (buf.st_mode & ~S_IFMT),
+                                               perms);
                                return (1);
                        }
 
@@ -244,7 +246,8 @@ main (int argc, char **argv)
                        {
                                if (verbose)
                                        printf ("%s has size %Ld, not %Ld\n",
-                                               fname, (long long)buf.st_size, size);
+                                               fname, (long long)buf.st_size,
+                                               size);
                                return (1);
                        }
 
index 3f6521a..cb4f94d 100755 (executable)
@@ -12,16 +12,6 @@ MDSSIZE=50000
 OSTDEV=$TMP/ost1
 OSTSIZE=200000
 
-kver=`uname -r | cut -d "." -f 1,2`
-
-case $kver in
-  2.4) FSTYPE="--fstype=extN"  ;;
-  2.5) FSTYPE="--fstype=ext3"  ;;
-  *) echo "Kernel version $kver not supported"
-     exit 1
-     ;;
-esac
-
 rm -f $config
 # create nodes
 ${LMC} --add node --node localhost || exit 10
index 5404f13..6223034 100644 (file)
@@ -94,7 +94,7 @@ int main(int argc, char *argv[])
                                argv[0], name, strerror(errno));
                        exit(11);
                }
-               if ((st.st_mode & S_IFMT) != S_IFREG) {
+               if (!S_ISREG(st.st_mode & S_IFMT)) {
                        fprintf(stderr, "%s: ERROR mode %s: %o != %o",
                                argv[0], name, st.st_mode & S_IFMT, S_IFREG);
                        exit(12);
@@ -124,7 +124,7 @@ int main(int argc, char *argv[])
                                argv[0], name, strerror(errno));
                        exit(11);
                }
-               if ((st.st_mode & S_IFMT) != S_IFDIR) {
+               if (!S_ISDIR(st.st_mode)) {
                        fprintf(stderr, "%s: ERROR mode %s: %o != %o",
                                argv[0], name, st.st_mode & S_IFMT, S_IFDIR);
                        exit(12);
index f529fb0..e660ea4 100644 (file)
@@ -17,24 +17,26 @@ int main(int argc, char **argv)
 {
         int fd;
         char *buf;
-        int blocks;
+        int blocks, seek_blocks;
         long len;
-        struct stat st;
+        off64_t seek;
+        struct stat64 st;
         int rc;
 
-        if (argc != 3) {
-                printf("Usage: %s file nr_blocks\n", argv[0]);
+        if (argc != 4) {
+                printf("Usage: %s file seek nr_blocks\n", argv[0]);
                 return 1;
         }
 
-        blocks = strtoul(argv[2], 0, 0);
-        fd = open(argv[1], O_DIRECT | O_RDWR | O_CREAT, 0644);
+        seek_blocks = strtoul(argv[2], 0, 0);
+        blocks = strtoul(argv[3], 0, 0);
+        fd = open(argv[1], O_LARGEFILE | O_DIRECT | O_RDWR | O_CREAT, 0644);
         if (fd == -1) {
                 printf("Cannot open %s:  %s\n", argv[1], strerror(errno));
                 return 1;
         }
 
-        if (fstat(fd, &st) < 0) {
+        if (fstat64(fd, &st) < 0) {
                 printf("Cannot stat %s:  %s\n", argv[1], strerror(errno));
                 return 1;
         }
@@ -42,6 +44,12 @@ int main(int argc, char **argv)
         printf("directio on %s for %dx%lu blocks \n", argv[1], blocks,
                st.st_blksize);
 
+        seek = (off64_t)seek_blocks * (off64_t)st.st_blksize;
+        if (lseek64(fd, seek, SEEK_SET) < 0) {
+                printf("lseek64 failed: %s\n", strerror(errno));
+                return 1;
+        }
+
         len = blocks * st.st_blksize;
         buf = mmap(0, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, 0, 0);
         if (!buf) {
@@ -56,7 +64,7 @@ int main(int argc, char **argv)
                 return 1;
         }
 
-        if (lseek(fd, 0, SEEK_SET) != 0) {
+        if (lseek64(fd, seek, SEEK_SET) < 0) {
                 printf("Cannot seek %s\n", strerror(errno));
                 return 1;
         }
index 99e026f..335db41 100755 (executable)
@@ -16,6 +16,9 @@ TMP=${TMP:-/tmp}
 SERVER=${SERVER:-localhost}
 CLIENT=${CLIENT:-localhost}
 NET=${NET:-tcp}
+SERVERNID=${SERVERNID:-$SERVER}
+CLIENTNID=${CLIENTNID:-$CLIENT}
+
 
 # FIXME: make LMC not require MDS for obdecho LOV
 MDSDEV=${MDSDEV:-$TMP/mds1}
@@ -27,7 +30,7 @@ STRIPES_PER_OBJ=2     # 0 means stripe over all OSTs
 rm -f $config
 # create nodes
 $LMC --add node --node $SERVER  || exit 1
-$LMC --add net --node $SERVER --nid $SERVER --nettype $NET || exit 2
+$LMC --add net --node $SERVER --nid $SERVERNID --nettype $NET || exit 2
 
 if (($LOV)); then
     $LMC --add mds --node $SERVER --mds mds1 --dev $MDSDEV --size $MDSSIZE || exit 10
@@ -42,7 +45,7 @@ fi
 
 if [ "$SERVER" != "$CLIENT" ]; then
    $LMC --add node --node $CLIENT  || exit 1
-   $LMC --add net --node $CLIENT --nid $CLIENT --nettype $NET || exit 2
+   $LMC --add net --node $CLIENT --nid $CLIENTNID --nettype $NET || exit 2
 fi
 
 $LMC --add echo_client --node $CLIENT --ost ${OBD_NAME} || exit 3
diff --git a/lustre/tests/fchdir_test.c b/lustre/tests/fchdir_test.c
new file mode 100644 (file)
index 0000000..83c096e
--- /dev/null
@@ -0,0 +1,41 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+
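+/*
+ * Hold an open fd on the current directory, chdir into a (hard-coded) Lustre
+ * subdirectory, then fchdir() back through the saved fd and close it.  Each
+ * failure exits with a distinct non-zero status.
+ */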
+int main(int argc, char **argv)
+{
+        int fd;
+        int rc;
+
+        fd = open(".", O_RDONLY);
+        if (fd < 0) {
+                perror("opening '.'");
+                exit(2);
+        }
+
+        rc = chdir("/mnt/lustre/subdir/subdir");
+        if (rc) {
+                perror("cannot chdir subdir");
+                exit(3);
+        }
+
+        rc = fchdir(fd);
+        if (rc) {
+                perror("cannot fchdir back");
+                exit(4);
+        }
+
+        rc = close(fd);
+        if (rc) {
+                perror("cannot close '.'");
+                exit(5);
+        }
+
+        return(0);
+}
index d2497a4..5afade1 100644 (file)
@@ -6,9 +6,13 @@ NAME=${NAME:-echo}
 config=$NAME.xml
 mkconfig=$NAME.sh
 
-sh $mkconfig $config || exit 1
+if [ "$LUSTRE" ]; then
+  lustre_opt="--lustre=$LUSTRE"
+fi
 
-$LCONF --reformat --gdb $OPTS $config || exit 4
+sh -x $mkconfig $config || exit 1
+
+$LCONF $lustre_opt --reformat --gdb $OPTS $config || exit 4
 
 cat <<EOF
 
index c490856..de20003 100755 (executable)
@@ -15,12 +15,20 @@ if [ "$LUSTRE" ]; then
   lustre_opt="--lustre=$LUSTRE"
 fi
 
+if [ "$LDAPURL" ]; then
+    conf_opt="--ldapurl $LDAPURL --config $NAME"
+else
+    sh $mkconfig $config || exit 1
+    conf_opt="$config"
+fi    
+
+[ "$NODE" ] && node_opt="--node $NODE"
+
 if [ "$1" = "-v" ]; then
   verbose="-v"
 fi
 
 [ -x $LCONF ] || chmod a+rx $LCONF
 
-sh $mkconfig $config || exit 1
-
-${LCONF} $portals_opt $lustre_opt --reformat --gdb $verbose $config  || exit 2
+${LCONF} $portals_opt $lustre_opt $node_opt --reformat --gdb \
+    $verbose $conf_opt  || exit 2
index cd28d21..98d0512 100755 (executable)
@@ -15,12 +15,24 @@ if [ "$LUSTRE" ]; then
   lustre_opt="--lustre=$LUSTRE"
 fi
 
-if [ ! -f $config ]; then
-   sh $mkconfig $config || exit 1
+if [ "$1" = "--force" ]; then
+  force="--force"
 fi
 
+if [ "$LDAPURL" ]; then
+    conf_opt="--ldapurl $LDAPURL --config $NAME"
+else
+    if [ ! -f $config -o $mkconfig -nt $config ]; then
+       sh $mkconfig $config || exit 1
+    fi
+    conf_opt="$config"
+fi    
+
+[ "$NODE" ] && node_opt="--node $NODE"
+
 sync; sleep 2; sync
-${LCONF} $portals_opt $lustre_opt --cleanup --dump $TMP/debug $config
+${LCONF} $portals_opt $lustre_opt $node_opt --cleanup $force \
+    --dump $TMP/debug $conf_opt
 rc=$?
 BUSY=`dmesg | grep -i destruct`
 if [ "$BUSY" ]; then
@@ -28,7 +40,7 @@ if [ "$BUSY" ]; then
        mv $TMP/debug $TMP/debug-busy.`date +%s`
        exit 255
 fi
-LEAK_LUSTRE=`dmesg | tail -20 | grep -v "leaked: 0" | grep leaked`
+LEAK_LUSTRE=`dmesg | grep "obd mem.*leaked" | tail -1 | grep -v "leaked: 0"`
 LEAK_PORTALS=`dmesg | tail -20 | grep "Portals memory leaked"`
 if [ "$LEAK_LUSTRE" -o "$LEAK_PORTALS" ]; then
        echo "$LEAK_LUSTRE" 1>&2
index 6531055..b12c1ae 100755 (executable)
@@ -14,8 +14,15 @@ if [ "$LUSTRE" ]; then
   lustre_opt="--lustre=$LUSTRE"
 fi
 
-if [ ! -f $config -o $mkconfig -nt $config ]; then
-   sh $mkconfig $config || exit 1
-fi
+if [ "$LDAPURL" ]; then
+    conf_opt="--ldapurl $LDAPURL --config $NAME"
+else
+    if [ ! -f $config -o $mkconfig -nt $config ]; then
+       sh $mkconfig $config || exit 1
+    fi
+    conf_opt="$config"
+fi    
+
+[ "$NODE" ] && node_opt="--node $NODE"
 
-${LCONF} $portals_opt $lustre_opt --gdb $config || exit 2
+${LCONF} $portals_opt $lustre_opt $node_opt --gdb $conf_opt || exit 2
index 2132801..2bd47ae 100755 (executable)
@@ -11,17 +11,7 @@ MDSSIZE=${MDSSIZE:-50000}
 
 OSTDEV=${OSTDEV:-$TMP/ost1}
 OSTSIZE=${OSTSIZE:-200000}
-
-kver=`uname -r | cut -d "." -f 1,2`
-
-case $kver in
-  2.4) FSTYPE="--fstype=extN"  ;;
-  2.5) FSTYPE="--fstype=ext3"  ;;
-  *) echo "Kernel version $kver not supported"
-     exit 1
-     ;;
-esac
-
+FSTYPE=${FSTYPE:-ext3}
 
 rm -f $config
 
@@ -30,10 +20,10 @@ ${LMC} --add node --node localhost || exit 10
 ${LMC} --add net --node  localhost --nid localhost --nettype tcp || exit 11
 
 # configure mds server
-${LMC} --add mds  --node localhost --mds mds1 $FSTYPE --dev $MDSDEV --size $MDSSIZE || exit 20
+${LMC} --add mds  --node localhost --mds mds1  --fstype $FSTYPE --dev $MDSDEV --size $MDSSIZE || exit 20
 
 # configure ost
-${LMC} --add ost --node localhost --ost obd1 $FSTYPE --dev $OSTDEV --size  $OSTSIZE || exit 30
+${LMC} --add ost --node localhost --ost ost1  --fstype $FSTYPE --dev $OSTDEV --size  $OSTSIZE || exit 30
 
 # create client config
-${LMC} --add mtpt --node localhost --path /mnt/lustre --mds mds1 --ost obd1 || exit 40
+${LMC} --add mtpt --node localhost --path /mnt/lustre --mds mds1 --ost ost1 || exit 40
index 3b1d961..8d8a100 100755 (executable)
@@ -11,7 +11,7 @@ save_cmd() {
 }
 
 LMC="save_cmd"
-LMC_REAL="../../lustre/utils/lmc -m $config"
+LMC_REAL="../utils/lmc -m $config"
 
 # TCP/IP servers
 SERVER_START=0
diff --git a/lustre/tests/mkdirdeep.c b/lustre/tests/mkdirdeep.c
new file mode 100644 (file)
index 0000000..cfd1535
--- /dev/null
@@ -0,0 +1,275 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Compile with:
+ * cc -I../../portals/include -o mkdirdeep mkdirdeep.c 
+ *    -L../../portals/linux/utils -lptlctl 
+ */
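+
+/*
+ * Builds a chain of --depth nested directories (named 1, 2, ...), making the
+ * leaf a mknod()'d node when --mknod is given, and stat()s each component as
+ * it goes.  It then re-walks the tree by full pathname.  Unless --notrace is
+ * given, Portals trace marks are emitted at each step and the trace is
+ * written to the --output file before cleanup.
+ */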
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <linux/limits.h>
+#include <portals/lltrace.h>
+
+static int opt_depth = 1;
+static int opt_mknod = 0; 
+static int opt_verbose = 0;
+static int opt_trace = 1;
+static char* basepathname = 0;
+static char mycwd[PATH_MAX];
+static char* pname = 0;
+static char* outputfilename = 0;
+
+void usage()
+{
+        fprintf(stderr, "Usage: %s --depth <d> --output <outputtracefilename>"
+                " [--mknod] [--verbose] [--notrace] <basepath>\n", pname);
+        exit(1);
+}
+
+int do_mkdir(char* path)
+{
+        int rc = mkdir(path, 0755);
+        if (rc!=0) 
+                fprintf(stderr, "mkdir(%s) failed: %s\n",
+                        path, strerror(errno));
+        if (opt_verbose)
+                printf("mkdir %s\n", path);
+        return rc;
+}
+
+
+int do_mknod(char* path)
+{
+        int rc = mknod(path, S_IFIFO | 0755, 0);
+        if (rc != 0)
+                fprintf(stderr, "mknod(%s) failed: %s\n",
+                        path, strerror(errno));
+        if (opt_verbose)
+                printf("mknod %s\n", path);
+        return rc;
+}
+
+int do_chdir(char* path)
+{
+        int rc = chdir(path);
+        if (rc!=0) 
+                fprintf(stderr, "chdir(%s) failed: %s\n",
+                        path, strerror(errno));
+        if (opt_verbose)
+                printf("chdir %s\n", path);
+
+        return rc;
+}
+
+
+int do_stat(char* path)
+{
+        char mark_buf[PATH_MAX];
+        struct stat mystat;
+        int rc = stat(path, &mystat);
+        if (rc!=0) 
+                fprintf(stderr, "stat(%s) failed: %s\n",
+                        path, strerror(errno));
+        if (opt_verbose)
+                printf("stat %s = inode %lu\n", path, mystat.st_ino);
+
+        if (opt_trace) {
+                snprintf(mark_buf, PATH_MAX, "stat %s = inode %lu", 
+                         path, mystat.st_ino);
+                ltrace_mark(0, mark_buf);
+        }
+
+        return rc;
+}
+
+int main(int argc, char** argv)
+{
+        int c, opt_index, i, mypid;
+
+        static struct option long_options[] = {
+                {"depth", 1, 0, 0 },
+                {"help", 0, 0, 0 },
+                {"mknod", 0, 0, 0 },  
+                {"verbose", 0, 0, 0 },  
+                {"notrace", 0, 0, 0 },  
+                {"output", 1, 0, 0 },  
+                {0,0,0,0}
+        };
+
+        char full_pathname[PATH_MAX];
+        char rel_pathname[PATH_MAX];
+        char mark_buf[PATH_MAX];
+
+        pname = strdup(argv[0]);
+        
+        while (1) {
+                c = getopt_long(argc, argv, "d:mhv", long_options, &opt_index);
+                if (c == -1)
+                        break;
+                if (c==0) {
+                        if (!strcmp(long_options[opt_index].name, "notrace")) {
+                                opt_trace = 0;
+                                continue;
+                        }
+                        c = long_options[opt_index].name[0];
+                }
+                switch (c) {
+                case 'd': 
+                        opt_depth = atoi(optarg);
+                        if ((opt_depth == 0) || (opt_depth > 100))
+                                usage();
+                        break;
+                case 'm':
+                        opt_mknod = 1;
+                        break;
+                case 'v':
+                        opt_verbose = 1;
+                        break;
+                case 'o':
+                        outputfilename = optarg;
+                        break;
+                case 'h':
+                case '?': 
+                case ':': 
+                default:
+                        usage();
+                        break;
+                }
+        }
+                
+        if (optind != (argc-1)) 
+                usage();
+
+        if (outputfilename == NULL)
+                usage();
+
+        basepathname = argv[optind];
+        mypid = getpid();
+        
+        printf("%s(pid=%d) depth=%d mknod=%d, basepathname=%s, "
+               "trace=%d, outputfilename=%s\n",
+               pname, mypid, opt_depth, opt_mknod, basepathname, opt_trace, 
+               outputfilename);
+
+        if (!getcwd(&mycwd[0], sizeof(mycwd))) {
+                fprintf(stderr, "%s: unable to getcwd()\n", pname);
+                exit(1);
+        }
+
+        if (opt_trace) {
+                ltrace_start();
+                ltrace_clear();
+                snprintf(mark_buf, PATH_MAX, 
+                         "Initialize - mkdir %s; chdir %s",
+                         basepathname, basepathname);
+                ltrace_mark(2, mark_buf);
+        }
+
+        if (do_mkdir(basepathname)!=0)
+                exit(1);
+        if (do_chdir(basepathname)!=0)
+                exit(1);
+
+        /* Create directory tree with depth level of subdirectories */
+
+        if (opt_trace) {
+                snprintf(mark_buf, PATH_MAX, 
+                         "Create Directory Tree (depth %d)", opt_depth);
+                ltrace_mark(2, mark_buf);
+        }
+
+        for (i=0; i<opt_depth; i++) {
+                
+                snprintf(rel_pathname, sizeof(rel_pathname),"%d", i+1);
+                
+                if (i == (opt_depth-1)) {
+                        /* Last Iteration */
+
+                        if (opt_trace) {
+                                snprintf(mark_buf, PATH_MAX,
+                                         "Tree Leaf (%d) %s/stat", i,
+                                         (opt_mknod ? "mknod" : "mkdir"));
+                                ltrace_mark(3, mark_buf);
+                        }
+
+                        if (opt_mknod)
+                                do_mknod(rel_pathname);
+                        else
+                                do_mkdir(rel_pathname);
+                        /* Now stat it */
+                        do_stat(rel_pathname);
+                }
+                else {
+                        /* Not Leaf */
+
+                        if (opt_trace) {
+                                snprintf(mark_buf, PATH_MAX, 
+                                         "Tree Level (%d) mkdir/stat/chdir",
+                                         i);
+                                ltrace_mark(3, mark_buf);
+                        }
+                        
+                        do_mkdir(rel_pathname);
+                        do_stat(rel_pathname);
+                        do_chdir(rel_pathname);
+                }
+        }
+        
+        /* Stat through directory tree with fullpaths */
+
+        if (opt_trace) {
+                snprintf(mark_buf, PATH_MAX, "Walk Directory Tree");
+                ltrace_mark(2, mark_buf);
+        }
+
+        do_chdir(basepathname);
+
+        strncpy(full_pathname, basepathname, sizeof(full_pathname));
+
+        for (i=0; i<opt_depth; i++) {
+                snprintf(rel_pathname, sizeof(rel_pathname),"%d", i+1);
+                strcat(full_pathname, "/");
+                strcat(full_pathname, rel_pathname);
+
+                if (opt_trace) {
+                        snprintf(mark_buf, PATH_MAX, "stat %s", 
+                                 full_pathname);
+                        ltrace_mark(2, mark_buf);
+                }
+
+                do_stat(full_pathname);
+        }
+
+        /* Cleanup */
+
+        if (opt_trace) {
+                snprintf(mark_buf, PATH_MAX, "Cleanup");
+                ltrace_mark(2, mark_buf);
+        }
+
+        if (opt_trace) {
+                    ltrace_write_file(outputfilename);
+                    ltrace_add_processnames(outputfilename);
+                    ltrace_stop();
+        }
+
+        do_chdir(mycwd);        /* step back out before removing the tree */
+        
+        snprintf(full_pathname, sizeof(full_pathname), 
+                 "rm -rf %s\n", basepathname);
+        if (opt_verbose) 
+                printf("Cleanup: %s", full_pathname);
+
+        system(full_pathname);
+
+        printf("%s (pid=%d) done.\n", pname, mypid);
+        return 0;
+}
diff --git a/lustre/tests/opendevunlink.c b/lustre/tests/opendevunlink.c
new file mode 100644 (file)
index 0000000..fde7d36
--- /dev/null
@@ -0,0 +1,111 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <string.h>
+
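+/*
+ * Create a FIFO, open it via one or two names, then unlink it: the name(s)
+ * must be gone while the open descriptors stay usable (fchmod/fstat) and
+ * report the same mode.
+ */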
+int main(int argc, char **argv)
+{
+        char *dname1, *dname2;
+        int fddev1, fddev2, rc;
+        //DIR *dp;
+        struct stat st1, st2;
+
+        if (argc < 2 || argc > 3) {
+                fprintf(stderr, "usage: %s filename1 [filename2]\n", argv[0]);
+                exit(1);
+        }
+
+        dname1 = argv[1];
+        if (argc == 3)
+                dname2 = argv[2];
+        else
+                dname2 = argv[1];
+
+        //create the special file (right now only test on pipe)
+        fprintf(stderr, "creating special file %s\n", dname1);
+        rc = mknod(dname1, 0777|S_IFIFO, 0);
+        if (rc == -1) {
+                fprintf(stderr, "creating %s fails: %s\n", 
+                        dname1, strerror(errno));
+                exit(1);
+        }
+
+        // open the special file again
+        fprintf(stderr, "opening file\n");
+        fddev1 = open(dname1, O_RDONLY | O_NONBLOCK);
+        if (fddev1 == -1) {
+                fprintf(stderr, "open %s fails: %s\n",
+                        dname1, strerror(errno));
+                exit(1);
+        }
+        
+        // doesn't matter if the two dirs are the same??
+        fddev2 = open(dname2, O_RDONLY | O_NONBLOCK);
+        if (fddev2 == -1) {
+                fprintf(stderr, "open %s fails: %s\n",
+                        dname2, strerror(errno));
+                exit(1);
+        }
+        
+        // delete the special file
+        fprintf (stderr, "unlinking %s\n", dname1);
+        rc = unlink(dname1);
+        if (rc) {
+                fprintf(stderr, "unlink %s error: %s\n", 
+                        dname1, strerror(errno));
+                exit(1);
+        }
+
+        if (access(dname2, F_OK) == 0){
+                fprintf(stderr, "%s still exists\n", dname2);
+                exit(1);
+        }
+
+        if (access(dname1, F_OK) == 0){
+                fprintf(stderr, "%s still exists\n", dname1);
+                exit(1);
+        }
+
+        // fchmod one special file
+        rc = fchmod (fddev1, 0777);
+        if(rc == -1)
+        {
+                fprintf(stderr, "fchmod unlinked special file %s fails: %s\n", 
+                        dname1, strerror(errno));
+                exit(1);
+        }
+                
+        // fstat two files to check if they are the same
+        rc = fstat(fddev1, &st1);
+        if(rc == -1)
+        {
+                fprintf(stderr, "fstat unlinked special file %s fails: %s\n", 
+                        dname1, strerror(errno));
+                exit(1);
+        }
+
+        rc = fstat(fddev2, &st2);
+        if (rc == -1) {
+                fprintf(stderr, "fstat file %s fails: %s\n",
+                        dname2, strerror(errno));
+                exit(1);
+        }
+
+        if (st1.st_mode != st2.st_mode) {  // can we do this?
+                fprintf(stderr, "fstat returned different modes for %s and %s\n",
+                        dname1, dname2);
+                exit(1);
+        }
+
+        fprintf(stderr, "Ok, everything goes well.\n");
+        return 0;
+}
+
diff --git a/lustre/tests/opendirunlink.c b/lustre/tests/opendirunlink.c
new file mode 100644 (file)
index 0000000..2664618
--- /dev/null
@@ -0,0 +1,122 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <string.h>
+
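+/*
+ * Directory flavour of the open-unlink test: create a directory, open it via
+ * one or two names, rmdir() it, then check that the name(s) are gone while
+ * the open descriptors still answer fchmod/fstat consistently.
+ */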
+int main(int argc, char **argv)
+{
+        char *dname1, *dname2;
+        int fddir1, fddir2, rc;
+        //DIR *dp;
+        struct stat st1, st2;
+
+        if (argc < 2 || argc > 3) {
+                fprintf(stderr, "usage: %s dirname1 [dirname2]\n", argv[0]);
+                exit(1);
+        }
+
+        dname1 = argv[1];
+        if (argc == 3)
+                dname2 = argv[2];
+        else
+                dname2 = argv[1];
+
+        //create the directory
+        fprintf(stderr, "creating directory %s\n", dname1);
+        rc = mkdir(dname1, 0744);
+        if (rc == -1) {
+                fprintf(stderr, "creating %s fails: %s\n", 
+                        dname1, strerror(errno));
+                exit(1);
+        }
+
+        // open the dir again
+        fprintf(stderr, "opening directory\n");
+        fddir1 = open(dname1, O_RDONLY | O_DIRECTORY);
+        if (fddir1 == -1) {
+                fprintf(stderr, "open %s fails: %s\n",
+                        dname1, strerror(errno));
+                exit(1);
+        }
+        
+        // doesn't matter if the two dirs are the same??
+        fddir2 = open(dname2, O_RDONLY | O_DIRECTORY);
+        if (fddir2 == -1) {
+                fprintf(stderr, "open %s fails: %s\n",
+                        dname2, strerror(errno));
+                exit(1);
+        }
+        
+        // another method
+/*        
+        if ( (dp = opendir(dname2)) == NULL) {
+                fprintf(stderr, "opendir() %s\n", strerror(errno));
+                exit(1);
+        }
+        fddir = dirfd(dp);
+*/
+
+        // delete the dir
+        fprintf(stderr, "removing directory %s\n", dname1);
+        rc = rmdir(dname1);
+        if (rc) {
+                fprintf(stderr, "rmdir %s error: %s\n",
+                        dname1, strerror(errno));
+                exit(1);
+        }
+
+        if (access(dname2, F_OK) == 0){
+                fprintf(stderr, "%s still exists\n", dname2);
+                exit(1);
+        }
+
+        if (access(dname1, F_OK) == 0){
+                fprintf(stderr, "%s still exists\n", dname1);
+                exit(1);
+        }
+
+        // fchmod the dir
+        rc = fchmod (fddir1, 0777);
+        if(rc == -1)
+        {
+                fprintf(stderr, "fchmod unlinked dir fails %s\n", 
+                        strerror(errno));
+                exit(1);
+        }
+                
+        // fstat two dirs to check if they are the same
+        rc = fstat(fddir1, &st1);
+        if(rc == -1)
+        {
+                fprintf(stderr, "fstat unlinked dir %s fails %s\n", 
+                        dname1, strerror(errno));
+                exit(1);
+        }
+
+        rc = fstat(fddir2, &st2);
+        if (rc == -1) {
+                fprintf(stderr, "fstat dir %s fails %s\n",
+                        dname2, strerror(errno));
+                exit(1);
+        }
+
+        if (st1.st_mode != st2.st_mode) {  // can we do this?
+                fprintf(stderr, "fstat returned different modes for %s and %s\n",
+                        dname1, dname2);
+                exit(1);
+        }
+
+        fprintf(stderr, "Ok, everything goes well.\n");
+        return 0;
+}
+
diff --git a/lustre/tests/openfile.c b/lustre/tests/openfile.c
new file mode 100644 (file)
index 0000000..ab5cbdb
--- /dev/null
@@ -0,0 +1,162 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+
+#if 0
+#define DEBUG
+#endif
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+typedef struct flag_mapping {
+       char string[20];
+       int  flag;
+} FLAG_MAPPING;
+
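+/* Flag names accepted by -f; the argument is a colon-separated list matched
+ * case-insensitively against this table. */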
+FLAG_MAPPING flag_table[] = {
+       {"O_RDONLY", O_RDONLY},
+       {"O_WRONLY", O_WRONLY},
+       {"O_RDWR", O_RDWR},
+       {"O_CREAT", O_CREAT},
+       {"O_EXCL", O_EXCL},
+       {"O_NOCTTY", O_NOCTTY},
+       {"O_TRUNC", O_TRUNC},
+       {"O_APPEND", O_APPEND},
+       {"O_NONBLOCK", O_NONBLOCK},
+       {"O_NDELAY", O_NDELAY},
+       {"O_SYNC", O_SYNC},
+       {"O_NOFOLLOW", O_NOFOLLOW},
+       {"O_DIRECTORY", O_DIRECTORY},
+       {"O_LARGEFILE", O_LARGEFILE},
+       {"", -1}
+};
+
+void Usage_and_abort(void)
+{
+       fprintf(stderr, "Usage: openfile -f flags [ -m mode ] filename \n");
+       fprintf(stderr, "e.g. openfile -f O_RDWR:O_CREAT -m 0755 /etc/passwd\n");
+       exit(-1);
+}
+
+int main(int argc, char** argv)
+{
+        int i;
+        int    flags=0;
+        mode_t mode=0;
+        char*  fname=NULL;
+        int    mode_set=0;
+        int    flag_set=0;
+        int    file_set=0;
+        int    c;
+        char*  cloned_flags;
+
+        if(argc == 1) {
+                Usage_and_abort();
+        }
+
+        while ((c = getopt (argc, argv, "f:m:")) != -1) {
+                switch (c) {
+                case 'f': {
+                        char *tmp;
+
+                        cloned_flags = (char *)malloc(strlen(optarg) + 1);
+                        if (cloned_flags == NULL) {
+                                fprintf(stderr, "Insufficient memory.\n");
+                                exit(-1);
+                        }
+
+                        strcpy(cloned_flags, optarg);
+                        tmp = strtok(optarg, ":");
+                        while (tmp) {
+                                int i = 0;
+#ifdef DEBUG
+                                printf("flags = %s\n",tmp);
+#endif
+                                flag_set = 1;
+                                while (flag_table[i].flag != -1) {
+                                        int r;
+                                        r = strncasecmp(tmp, (flag_table[i].string),
+                                                        strlen((flag_table[i].string)) );
+
+                                        if (r == 0)
+                                                break;
+                                        i++;
+                                }
+
+                                if (flag_table[i].flag != -1) {
+                                        flags |= flag_table[i].flag;
+                                } else {
+                                        fprintf(stderr, "No such flag: %s\n",
+                                                tmp);
+                                        exit(-1);
+                                }
+
+                                tmp = strtok(NULL, ":");
+
+                        }
+#ifdef DEBUG
+                        printf("flags = %x\n", flags);
+#endif
+                        break;
+                }
+                case 'm':
+#ifdef DEBUG
+                        printf("mode = %s\n", optarg);
+#endif
+                        mode = strtol (optarg, NULL, 8);
+                        mode_set = 1;
+#ifdef DEBUG
+                        printf("mode = %o\n", mode);
+#endif
+                        break;
+                default:
+                        fprintf(stderr, "Bad parameters.\n");
+                        Usage_and_abort();
+                }
+        }
+
+        if (optind == argc) {
+                fprintf(stderr, "Bad parameters.\n");
+                Usage_and_abort();
+        }
+
+        fname = argv[optind];
+        file_set = 1;
+
+        if (!flag_set || !file_set) {
+                fprintf(stderr, "Missing flag or file-name\n");
+                exit(-1);
+        }
+
+
+        if (mode_set)
+                i = open(fname, flags, mode);
+        else
+                i = open(fname, flags);
+
+        if (i != -1) {
+                fprintf(stderr, "Succeeded in opening file \"%s\" (flags=%s",
+                        fname, cloned_flags);
+
+                if (mode_set)
+                        fprintf(stderr, ", mode=%o", mode);
+                fprintf(stderr, ")\n");
+                close (i);
+        } else {
+                fprintf(stderr, "Error in opening file \"%s\" (flags=%s",
+                        fname, cloned_flags);
+                if (mode_set)
+                        fprintf(stderr, ", mode=%o", mode);
+                fprintf(stderr, ") %s\n", strerror(errno));
+        }
+        /* return 0 on success so shell callers can test the exit status */
+        return (i == -1) ? -1 : 0;
+}
index 481ebaa..c8f85ee 100755 (executable)
@@ -3,23 +3,29 @@
 set -ex
 
 LUSTRE=${LUSTRE:-`dirname $0`/..}
+LTESTDIR=${LTESTDIR:-"$LUSTRE/../ltest"}
 PATH=$PATH:$LUSTRE/utils:$LUSTRE/tests
 
-. $LUSTRE/../ltest/functional/llite/common/common.sh
+. $LTESTDIR/functional/llite/common/common.sh
+
+# Allow us to override the setup if we already have a mounted system by
+# setting SETUP=" " and CLEANUP=" "
+SETUP=${SETUP:-"setup"}
+CLEANUP=${CLEANUP:-"cleanup"}
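+# For example (a sketch, against a system that is already mounted):
+#   SETUP=" " CLEANUP=" " sh <this test script>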
 
 PDSH='pdsh -S -w'
 
 # XXX I wish all this stuff was in some default-config.sh somewhere
 MDSNODE=${MDSNODE:-mdev6}
 OSTNODE=${OSTNODE:-mdev7}
-CLIENT=${CLIENTNODE:-mdev8}
+CLIENT=${CLIENT:-mdev8}
 NETWORKTYPE=${NETWORKTYPE:-tcp}
 MOUNTPT=${MOUNTPT:-/mnt/lustre}
-CONFIG=recovery-small.xml
-MDSDEV=/tmp/mds
-OSTDEV=/tmp/ost
-MDSSIZE=100000
-OSTSIZE=100000
+CONFIG=${CONFIG:-recovery-cleanup.xml}
+MDSDEV=${MDSDEV:-/tmp/mds}
+OSTDEV=${OSTDEV:-/tmp/ost}
+MDSSIZE=${MDSSIZE:-100000}
+OSTSIZE=${OSTSIZE:-100000}
 
 do_mds() {
     $PDSH $MDSNODE "PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests; cd $PWD; $@"
@@ -99,7 +105,7 @@ wait_for_timeout() {
 
 try_to_cleanup() {
     kill -INT $!
-    unmount_client --force
+    unmount_client --force --dump /tmp/client-cleanup-`date +%s`.log
     mount_client --timeout=${TIMEOUT:-5} --recovery_upcall=/bin/true
 }
 
@@ -108,7 +114,8 @@ if [ ! -z "$ONLY" ]; then
     exit $?
 fi
 
-setup
+$SETUP
+
 drop_request "mcreate /mnt/lustre/1" & wait_for_timeout
 try_to_cleanup
 
@@ -131,4 +138,4 @@ try_to_cleanup
 drop_request "munlink /mnt/lustre/link1" & wait_for_timeout
 try_to_cleanup
 
-cleanup
+$CLEANUP '--dump /tmp/`hostname`-cleanup.log'
diff --git a/lustre/tests/recovery-small-upcall.sh b/lustre/tests/recovery-small-upcall.sh
new file mode 100755 (executable)
index 0000000..02e9f69
--- /dev/null
@@ -0,0 +1,3 @@
+#!/bin/sh
+LUSTRE=`dirname $0`/..
+$LUSTRE/utils/lctl --device %$3 recover || logger -p kern.info recovery failed: $@
index 7425e57..42a1e18 100755 (executable)
@@ -3,43 +3,56 @@
 set -ex
 
 LUSTRE=${LUSTRE:-`dirname $0`/..}
+LTESTDIR=${LTESTDIR:-$LUSTRE/../ltest}
 PATH=$PATH:$LUSTRE/utils:$LUSTRE/tests
 
-. $LUSTRE/../ltest/functional/llite/common/common.sh
+RLUSTRE=${RLUSTRE:-$LUSTRE}
+RPWD=${RPWD:-$PWD}
 
-PDSH='pdsh -S -w'
+. $LTESTDIR/functional/llite/common/common.sh
+
+# Allow us to override the setup if we already have a mounted system by
+# setting SETUP=" " and CLEANUP=" "
+SETUP=${SETUP:-"setup"}
+CLEANUP=${CLEANUP:-"cleanup"}
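+# For example (a sketch), reusing an existing mount and pointing at a custom upcall:
+#   SETUP=" " CLEANUP=" " UPCALL=/path/to/my-upcall.sh sh <this test script>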
+
+PDSH=${PDSH:-'pdsh -S -w'}
 
 # XXX I wish all this stuff was in some default-config.sh somewhere
 MDSNODE=${MDSNODE:-mdev6}
 OSTNODE=${OSTNODE:-mdev7}
-CLIENT=${CLIENTNODE:-mdev8}
+CLIENT=${CLIENT:-mdev8}
 NETWORKTYPE=${NETWORKTYPE:-tcp}
 MOUNTPT=${MOUNTPT:-/mnt/lustre}
-CONFIG=recovery-small.xml
-MDSDEV=/tmp/mds
-OSTDEV=/tmp/ost
-MDSSIZE=100000
-OSTSIZE=100000
+CONFIG=${CONFIG:-recovery-small.xml}
+MDSDEV=${MDSDEV:-/tmp/mds}
+OSTDEV=${OSTDEV:-/tmp/ost}
+MDSSIZE=${MDSSIZE:-100000}
+OSTSIZE=${OSTSIZE:-100000}
+UPCALL=${UPCALL:-$RPWD/recovery-small-upcall.sh}
+FSTYPE=${FSTYPE:-ext3}
 
 do_mds() {
-    $PDSH $MDSNODE "PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests; cd $PWD; $@"
+    $PDSH $MDSNODE "PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests; cd $RPWD; $@" || exit $?
 }
 
 do_client() {
-    $PDSH $CLIENT "PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests; cd $PWD; $@"
+    $PDSH $CLIENT "PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests; cd $RPWD; $@"  || exit $?
 }
 
 do_ost() {
-    $PDSH $OSTNODE "PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests; cd $PWD; $@"
+    $PDSH $OSTNODE "PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests; cd $RPWD; $@" || exit $?
 }
 
 drop_request() {
+# OBD_FAIL_MDS_ALL_REQUEST_NET
     do_mds "echo 0x121 > /proc/sys/lustre/fail_loc"
     do_client "$1"
     do_mds "echo 0 > /proc/sys/lustre/fail_loc"
 }
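+# drop_request/drop_reply arm Lustre's failure injection: writing the OBD_FAIL_* value
+# named in the comments into /proc/sys/lustre/fail_loc makes the MDS drop the matching
+# traffic, and writing 0 disarms it again.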
 
 drop_reply() {
+# OBD_FAIL_MDS_ALL_REPLY_NET
     do_mds "echo 0x120 > /proc/sys/lustre/fail_loc"
     do_client "$@"
     do_mds "echo 0 > /proc/sys/lustre/fail_loc"
@@ -52,9 +65,9 @@ make_config() {
            --nettype $NETWORKTYPE || exit 4
     done
     lmc -m $CONFIG --add mds --node $MDSNODE --mds mds1 --dev $MDSDEV \
-        --size $MDSSIZE || exit 5
+        --size $MDSSIZE --fstype $FSTYPE || exit 5
     lmc -m $CONFIG --add ost --node $OSTNODE --ost ost1 --dev $OSTDEV \
-        --size $OSTSIZE || exit 6
+        --size $OSTSIZE --fstype $FSTYPE || exit 6
     lmc -m $CONFIG --add mtpt --node $CLIENT --path $MOUNTPT --mds mds1 \
         --ost ost1 || exit 7
 }
@@ -84,12 +97,11 @@ unmount_client() {
 }
 
 setup() {
-    make_config
-    start_mds ${REFORMAT:---reformat}
-    start_ost ${REFORMAT:---reformat}
+    start_mds ${REFORMAT}
+    start_ost ${REFORMAT}
     # XXX we should write our own upcall, when we move this somewhere better.
     mount_client --timeout=${TIMEOUT:-5} \
-        --recovery_upcall=$PWD/../../ltest/functional/llite/09/client-upcall.sh
+        --lustre_upcall=$UPCALL
 }
 
 cleanup() {
@@ -114,7 +126,11 @@ if [ ! -z "$ONLY" ]; then
     exit $?
 fi
 
-setup
+make_config
+
+REFORMAT=--reformat $SETUP
+unset REFORMAT
+
 drop_request "mcreate /mnt/lustre/1"
 drop_reply "mcreate /mnt/lustre/2"
 # replay "mcreate /mnt/lustre/3"
@@ -140,5 +156,4 @@ drop_reply "mlink /mnt/lustre/renamed-again /mnt/lustre/link2"
 drop_request "munlink /mnt/lustre/link1"
 drop_reply "munlink /mnt/lustre/link2"
 
-
-cleanup
+$CLEANUP
index 3d29f1b..8731699 100644 (file)
@@ -19,7 +19,7 @@ Usage_and_abort()
        exit(-1);
 }
 
-// Usage: runas -u user_id [ -g grp_id ] "command_to_be_run"
+// Usage: runas -u user_id [ -g grp_id ] [--] command_to_be_run
 // return: the return value of "command_to_be_run"
 // NOTE: returning -1 might be the return code of this program itself or
 // the "command_to_be_run"
@@ -30,8 +30,7 @@ Usage_and_abort()
 int 
 main(int argc, char**argv)
 {
-        char command[1024];
-        char *cmd_ptr;
+        char **my_argv;
         int status;
         int c,i;
         int gid_is_set = 0;
@@ -44,7 +43,7 @@ main(int argc, char**argv)
         }
 
         // get UID and GID
-        while ((c = getopt (argc, argv, "u:g:h")) != -1) {
+        while ((c = getopt (argc, argv, "+u:g:h")) != -1) {
                 switch (c) {
                 case 'u':
                         user_id = (uid_t)atoi(optarg);
@@ -79,12 +78,18 @@ main(int argc, char**argv)
                 Usage_and_abort();
         }
 
-
         // assemble the command
-        cmd_ptr = command ;
-        for (i = optind; i < argc; i++)
-                 cmd_ptr += sprintf(cmd_ptr,  "%s ", argv[i]);
-
+        my_argv = (char **)malloc(sizeof(char *) * (argc + 1 - optind));
+        if (my_argv == NULL) {
+                fprintf(stderr, "Error allocating memory: %s\n", strerror(errno));
+                exit(-1);
+        }
+
+        for (i = optind; i < argc; i++) {
+                my_argv[i - optind] = argv[i];
+                // printf("%s\n", my_argv[i - optind]);
+        }
+        my_argv[i - optind] = NULL;
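+        /* execvp(3) requires a NULL-terminated argument vector, hence the extra slot. */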
 
 #if DEBUG
   system("whoami");
@@ -94,7 +99,7 @@ main(int argc, char**argv)
         status = setregid(grp_id, grp_id );
         if( status == -1) {
                  fprintf(stderr, "Cannot change grp_ID to %d, errno=%d (%s)\n",
-                  grp_id, errno, strerror(errno) );
+                          grp_id, errno, strerror(errno) );
                  exit(-1);
         }
 
@@ -102,32 +107,24 @@ main(int argc, char**argv)
         status = setreuid(user_id, user_id );
         if(status == -1) {
                   fprintf(stderr,"Cannot change user_ID to %d, errno=%d (%s)\n",
-                   user_id, errno, strerror(errno) );
+                           user_id, errno, strerror(errno) );
                   exit(-1);
         }
 
-#if DEBUG
-  system("whoami");
-#endif
 
-        fprintf(stdout, "running as USER(%d), Grp (%d):  \"%s\" \n", 
-           user_id, grp_id, command );
+        fprintf(stderr, "running as USER(%d), Grp (%d):  ", 
+           user_id, grp_id );
 
-        // run the command
-        status = system(command);
+        for(i=0; i<argc-optind; i++)
+                 fprintf(stderr, " [%s]", my_argv[i]);
 
-        // pass the return code of command_to_be_run out of this wrapper
-        if (status == -1) {
-                 fprintf(stderr, "%s: system() command failed to run\n",
-                           argv[0]);
-        }
-        else{
-                 status = WEXITSTATUS(status);
-                 fprintf(stderr, "[%s #%d] \"%s\" returns %d (%s).\n", argv[0],
-                        user_id, argv[optind], status, strerror(status));
+        fprintf(stderr, "\n");
+        fflush(stderr);
 
-        }
+        // The command to be run
+        execvp(my_argv[0], my_argv);
+        fprintf(stderr, "execvp failed to run %s: %s\n", my_argv[0], strerror(errno));
+        exit(-1);
 
-        return(status);
 }
 
diff --git a/lustre/tests/runobdstat b/lustre/tests/runobdstat
new file mode 100644 (file)
index 0000000..886ce8f
--- /dev/null
@@ -0,0 +1,7 @@
+#!/bin/sh
+PATH=`dirname $0`/../utils:$PATH
+
+obdstat filter 1 | while read LINE; do
+       echo "`date +%s`: $LINE"
+       [ "$1" ] && echo "`date +%s`: $LINE" >> $1
+done
index 6bff5ce..3ce6810 100755 (executable)
@@ -1,2 +1,5 @@
 #!/bin/sh
-vmstat 1 | while read LINE ; do echo "`date +%H:%M:%S`: $LINE" ; done
+vmstat 1 | while read LINE ; do
+       echo "`date +%s`: $LINE"
+       [ "$1" ] && echo "`date +%s`: $LINE" >> $1
+done
diff --git a/lustre/tests/sanity-ldlm.sh b/lustre/tests/sanity-ldlm.sh
new file mode 100644 (file)
index 0000000..e5bd422
--- /dev/null
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+set -e
+
+SRCDIR=`dirname $0`
+PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
+
+MOUNT=${MOUNT:-/mnt/lustre}
+DIR=${DIR:-$MOUNT}
+export NAME=$NAME
+clean() {
+        echo -n "cln.."
+        sh llmountcleanup.sh > /dev/null || exit 20
+}
+CLEAN=${CLEAN:-clean}
+start() {
+        echo -n "mnt.."
+        sh llrmount.sh > /dev/null || exit 10
+        echo "done"
+}
+START=${START:-start}
+
+log() {
+       echo "$*"
+       lctl mark "$*" || /bin/true
+}
+
+pass() {
+    echo PASS
+}
+
+mount | grep $MOUNT || sh llmount.sh
+
+log '== drop ldlm request  ======================== test 1'
+echo 0x302 > /proc/sys/lustre/fail_loc
+echo 3 > /proc/sys/lustre/timeout
+touch $DIR/f &
+sleep 5
+echo 0 > /proc/sys/lustre/fail_loc
+lctl --device 6 recover
+pass
+$CLEAN
+$START
+
+log '== drop ldlm reply (bug 1139) ================ test 2'
+echo 0x213 > /proc/sys/lustre/fail_loc
+echo 3 > /proc/sys/lustre/timeout
+touch $DIR/f
+pass
+$CLEAN
+$START
+
+log '== drop reply after completion (bug 1068) ==== test 3'
+touch $DIR/f
+stat $DIR/f
+echo 0x213 > /proc/sys/lustre/fail_loc
+echo 3 > /proc/sys/lustre/timeout
+echo foo >> $DIR/f
+pass
+$CLEAN
+$START
index fdaf82e..84572bf 100644 (file)
@@ -1,9 +1,8 @@
 #!/bin/bash
-
 set -e
 
 SRCDIR=`dirname $0`
-PATH=$SRCDIR:$SRCDIR/../utils:$PATH
+PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
 
 CHECKSTAT=${CHECKSTAT:-"./checkstat -v"}
 CREATETEST=${CREATETEST:-createtest}
@@ -11,6 +10,7 @@ LFIND=${LFIND:-lfind}
 LSTRIPE=${LSTRIPE:-lstripe}
 MCREATE=${MCREATE:-mcreate}
 TOEXCL=${TOEXCL:-toexcl}
+TRUNCATE=${TRUNCATE:-truncate}
 
 RUNAS_ID=${RUNAS_ID:-500}
 RUNAS=${RUNAS:-"runas -u $RUNAS_ID"}
@@ -32,7 +32,7 @@ START=${START:-start}
 
 log() {
        echo "$*"
-       lctl mark "$*"
+       lctl mark "$*" || /bin/true
 }
 
 error() { 
@@ -46,6 +46,15 @@ pass() {
 
 mount | grep $MOUNT || sh llmount.sh
 
+echo preparing for tests involving mounts
+EXT2_DEV=/tmp/SANITY.LOOP
+dd if=/dev/zero of=$EXT2_DEV bs=1k count=1000
+#losetup /dev/loop0 || losetup /dev/loop0 /tmp/SANITY.LOOP
+#mke2fs -c /dev/loop0 100
+#losetup -d /dev/loop0
+mke2fs -F $EXT2_DEV
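+# The loopback image prepared above is mounted on $DIR/d32/ext2-mountpoint by the
+# test 32 cases below.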
+
+
 log '== touch .../f ; rm .../f ======================== test 0'
 touch $DIR/f
 $CHECKSTAT -t file $DIR/f || error 
@@ -301,10 +310,11 @@ $START
 
 log '== unpack tar archive as non-root user =========== test 22'
 mkdir $DIR/d22
-[ $UID -ne 0 ] && RUNAS=""
 [ $UID -ne 0 ] && RUNAS_ID="$UID"
+[ $UID -ne 0 ] && RUNAS=""
 chown $RUNAS_ID $DIR/d22
-$RUNAS tar cf - /etc/hosts /etc/sysconfig/network | $RUNAS tar xfC - $DIR/d22
+# Tar gets pissy if it can't access $PWD *sigh*
+(cd /tmp ; $RUNAS tar cf - /etc/hosts /etc/sysconfig/network | $RUNAS tar xfC - $DIR/d22)
 ls -lR $DIR/d22/etc
 $CHECKSTAT -t dir $DIR/d22/etc || error
 $CHECKSTAT -u \#$RUNAS_ID $DIR/d22/etc || error
@@ -516,7 +526,10 @@ pass
 $CLEAN
 $START
 
-log "--test 27.8 lfind "
+log "--test 27.8 mcreate file without objects to test lfind"
+$MCREATE $DIR/d27/fnone || error
+
+log "--test 27.9 lfind "
 $LFIND $DIR/d27
 pass
 $CLEAN
@@ -554,8 +567,281 @@ log '== open-unlink file ============================== test31'
 ./openunlink $DIR/f31 $DIR/f31 || error
 pass
 
+
+log '== more mountpoints and symlinks ================= test32'
+
+log '-- test 32-R1: stat d32/ext2-mountpoint/..'
+[ -e $DIR/d32 ] && rm -fr $DIR/d32
+mkdir -p $DIR/d32/ext2-mountpoint 
+mount -t ext2 -o loop $EXT2_DEV $DIR/d32/ext2-mountpoint  
+$CHECKSTAT -t dir $DIR/d32/ext2-mountpoint/.. || error  
+umount $DIR/d32/ext2-mountpoint/
+pass
+$CLEAN
+$START
+
+log '-- test 32-R2: open d32/ext2-mountpoint/..'
+[ -e $DIR/d32 ] && rm -fr $DIR/d32
+mkdir -p $DIR/d32/ext2-mountpoint 
+mount -t ext2 -o loop $EXT2_DEV $DIR/d32/ext2-mountpoint  
+ls -al $DIR/d32/ext2-mountpoint/.. || error
+umount $DIR/d32/ext2-mountpoint/
+pass
+$CLEAN
+$START
+log '-- test 32-R3: stat d32/ext2-mountpoint/../d2/test_dir'
+[ -e $DIR/d32 ] && rm -fr $DIR/d32
+mkdir -p $DIR/d32/ext2-mountpoint 
+mount -t ext2 -o loop $EXT2_DEV $DIR/d32/ext2-mountpoint  
+mkdir -p $DIR/d32/d2/test_dir    
+$CHECKSTAT -t dir $DIR/d32/ext2-mountpoint/../d2/test_dir || error
+umount $DIR/d32/ext2-mountpoint/
+pass
+$CLEAN
+$START
+
+log '-- test 32-R4: open d32/ext2-mountpoint/../d2/test_dir'
+[ -e $DIR/d32 ] && rm -fr $DIR/d32
+mkdir -p $DIR/d32/ext2-mountpoint 
+mount -t ext2 -o loop $EXT2_DEV $DIR/d32/ext2-mountpoint  
+mkdir -p $DIR/d32/d2/test_dir    
+ls -al $DIR/d32/ext2-mountpoint/../d2/test_dir || error
+umount $DIR/d32/ext2-mountpoint/
+pass
+$CLEAN
+$START
+
+log '-- test 32-R5: stat d32/symlink->tmp/symlink->lustre-subdir'
+[ -e $DIR/d32 ] && rm -fr $DIR/d32
+mkdir -p $DIR/d32/tmp    
+TMP_DIR=$DIR/d32/tmp       
+ln -s $DIR/d32 $TMP_DIR/symlink11 
+ln -s $TMP_DIR/symlink11 $TMP_DIR/../symlink01 
+$CHECKSTAT -t link $DIR/d32/tmp/symlink11 || error
+$CHECKSTAT -t link $DIR/d32/symlink01 || error
+pass
+$CLEAN
+$START
+
+log '-- test 32-R6: open d32/symlink->tmp/symlink->lustre-subdir'
+[ -e $DIR/d32 ] && rm -fr $DIR/d32
+mkdir -p $DIR/d32/tmp    
+TMP_DIR=$DIR/d32/tmp       
+ln -s $DIR/d32 $TMP_DIR/symlink11 
+ln -s $TMP_DIR/symlink11 $TMP_DIR/../symlink01 
+ls $DIR/d32/tmp/symlink11  || error
+ls $DIR/d32/symlink01 || error
+pass
+$CLEAN
+$START
+
+log '-- test 32-R7: stat d32/symlink->tmp/symlink->lustre-subdir/test_dir'
+[ -e $DIR/d32 ] && rm -fr $DIR/d32
+[ -e $DIR/test_dir ] && rm -fr $DIR/test_dir
+mkdir -p $DIR/test_dir 
+mkdir -p $DIR/d32/tmp    
+TMP_DIR=$DIR/d32/tmp       
+ln -s $DIR/test_dir $TMP_DIR/symlink12 
+ln -s $TMP_DIR/symlink12 $TMP_DIR/../symlink02 
+$CHECKSTAT -t link $DIR/d32/tmp/symlink12 || error
+$CHECKSTAT -t link $DIR/d32/symlink02 || error
+$CHECKSTAT -t dir -f $DIR/d32/tmp/symlink12 || error
+$CHECKSTAT -t dir -f $DIR/d32/symlink02 || error
+pass
+$CLEAN
+$START
+
+log '-- test 32-R8: open d32/symlink->tmp/symlink->lustre-subdir/test_dir'
+[ -e $DIR/d32 ] && rm -fr $DIR/d32
+[ -e $DIR/test_dir ] && rm -fr $DIR/test_dir
+mkdir -p $DIR/test_dir 
+mkdir -p $DIR/d32/tmp    
+TMP_DIR=$DIR/d32/tmp       
+ln -s $DIR/test_dir $TMP_DIR/symlink12 
+ln -s $TMP_DIR/symlink12 $TMP_DIR/../symlink02 
+ls $DIR/d32/tmp/symlink12 || error
+ls $DIR/d32/symlink02  || error
+pass
+$CLEAN
+$START
+
+log '-- test 32-R9: stat d32/ext2-mountpoint/../test_file'
+[ -e $DIR/d32 ] && rm -fr $DIR/d32
+mkdir -p $DIR/d32/ext2-mountpoint 
+mount -t ext2 -o loop $EXT2_DEV $DIR/d32/ext2-mountpoint  
+touch $DIR/d32/test_file
+$CHECKSTAT -t file $DIR/d32/ext2-mountpoint/../test_file || error  
+umount $DIR/d32/ext2-mountpoint  
+pass
+$CLEAN
+$START
+
+log '-- test 32-R10: open d32/ext2-mountpoint/../test_file'
+[ -e $DIR/d32 ] && rm -fr $DIR/d32
+mkdir -p $DIR/d32/ext2-mountpoint 
+mount -t ext2 -o loop $EXT2_DEV $DIR/d32/ext2-mountpoint  
+touch $DIR/d32/test_file
+cat $DIR/d32/ext2-mountpoint/../test_file || error
+umount $DIR/d32/ext2-mountpoint/
+pass
+$CLEAN
+$START
+
+log '-- test 32-R11: stat d32/ext2-mountpoint/../d2/test_file'
+[ -e $DIR/d32 ] && rm -fr $DIR/d32
+mkdir -p $DIR/d32/ext2-mountpoint 
+mount -t ext2 -o loop $EXT2_DEV $DIR/d32/ext2-mountpoint  
+mkdir -p $DIR/d32/d2
+touch $DIR/d32/d2/test_file
+$CHECKSTAT -t file $DIR/d32/ext2-mountpoint/../d2/test_file || error
+umount $DIR/d32/ext2-mountpoint/
+pass
+$CLEAN
+$START
+
+log '-- test 32-R12: open d32/ext2-mountpoint/../d2/test_file'
+[ -e $DIR/d32 ] && rm -fr $DIR/d32
+mkdir -p $DIR/d32/ext2-mountpoint 
+mount -t ext2 -o loop $EXT2_DEV $DIR/d32/ext2-mountpoint  
+mkdir -p $DIR/d32/d2
+touch $DIR/d32/d2/test_file
+cat  $DIR/d32/ext2-mountpoint/../d2/test_file || error
+umount $DIR/d32/ext2-mountpoint/
+pass
+$CLEAN
+$START
+
+log '-- test 32-R13: stat d32/symlink->tmp/symlink->lustre-root'
+[ -e $DIR/d32 ] && rm -fr $DIR/d32
+mkdir -p $DIR/d32/tmp    
+TMP_DIR=$DIR/d32/tmp       
+ln -s $DIR $TMP_DIR/symlink11 
+ln -s $TMP_DIR/symlink11 $TMP_DIR/../symlink01 
+$CHECKSTAT -t link $DIR/d32/tmp/symlink11 || error
+$CHECKSTAT -t link $DIR/d32/symlink01 || error
+pass
+$CLEAN
+$START
+
+log '-- test 32-R14: open d32/symlink->tmp/symlink->lustre-root'
+[ -e $DIR/d32 ] && rm -fr $DIR/d32
+mkdir -p $DIR/d32/tmp    
+TMP_DIR=$DIR/d32/tmp       
+ln -s $DIR $TMP_DIR/symlink11 
+ln -s $TMP_DIR/symlink11 $TMP_DIR/../symlink01 
+ls -l $DIR/d32/tmp/symlink11  || error
+ls -l $DIR/d32/symlink01 || error
+pass
+$CLEAN
+$START
+
+log '-- test 32-R15: stat d32/symlink->tmp/symlink->lustre-root/test_file'
+[ -e $DIR/d32 ] && rm -fr $DIR/d32
+[ -e $DIR/test_file ] && rm -fr $DIR/test_file
+touch $DIR/test_file 
+mkdir -p $DIR/d32/tmp    
+TMP_DIR=$DIR/d32/tmp       
+ln -s $DIR/test_file $TMP_DIR/symlink12 
+ln -s $TMP_DIR/symlink12 $TMP_DIR/../symlink02 
+$CHECKSTAT -t link $DIR/d32/tmp/symlink12 || error
+$CHECKSTAT -t link $DIR/d32/symlink02 || error
+$CHECKSTAT -t file -f $DIR/d32/tmp/symlink12 || error
+$CHECKSTAT -t file -f $DIR/d32/symlink02 || error
+pass
+$CLEAN
+$START
+
+log '-- test 32-R16: open d32/symlink->tmp/symlink->lustre-root/test_file'
+[ -e $DIR/d32 ] && rm -fr $DIR/d32
+[ -e $DIR/test_file ] && rm -fr $DIR/test_file
+touch $DIR/test_file 
+mkdir -p $DIR/d32/tmp    
+TMP_DIR=$DIR/d32/tmp       
+ln -s $DIR/test_file $TMP_DIR/symlink12 
+ln -s $TMP_DIR/symlink12 $TMP_DIR/../symlink02 
+cat $DIR/d32/tmp/symlink12 || error
+cat $DIR/d32/symlink02  || error
+pass
+$CLEAN
+$START
+
+log '-- test 33: write file with mode 444 (should return error)'
+#   chmod 444 /mnt/lustre/somefile
+#   open(/mnt/lustre/somefile, O_RDWR)
+#   Should return -1
+[ $UID -ne 0 ] && RUNAS_ID="$UID"
+[ $UID -ne 0 ] && RUNAS=""
+[ -e $DIR/test_33_file ] && rm -fr $DIR/test_33_file
+touch $DIR/test_33_file
+chmod 444 $DIR/test_33_file
+chown $RUNAS_ID $DIR/test_33_file
+$RUNAS openfile -f O_RDWR $DIR/test_33_file && error
+pass
+$CLEAN
+$START
+
+if [ -n "$BUG_1360" ]; then
+log '-- test 34: execute a file with mode 444 (should return error)'
+[ $UID -ne 0 ] && RUNAS_ID="$UID"
+[ $UID -ne 0 ] && RUNAS=""
+[ -e $DIR/test_34_file ] && rm -fr $DIR/test_34_file
+cp /bin/sh $DIR/test_34_file
+chmod 444 $DIR/test_34_file
+chown $RUNAS_ID $DIR/test_34_file
+$DIR/test_34_file && error
+pass
+$CLEAN
+$START
+else
+echo "Skipping test for 1360: set \$BUG_1360 to run it (cleanup will likely fail)."
+fi
+
+if [ -n "$BUG_1365" ]; then
+log '-- test 35: truncate file that has not been opened'
+$MCREATE $DIR/f
+$TRUNCATE $DIR/f 100
+rm $DIR/f
+pass
+$CLEAN
+$START
+else
+echo "Skipping test for 1365: set \$BUG_1365 to run it (and crash, likely)."
+fi
+
+log '-- test 36: cvs operations'
+[ $UID -ne 0 ] && RUNAS_ID="$UID"
+[ $UID -ne 0 ] && RUNAS=""
+mkdir -p $DIR/cvsroot
+log '-- test 36-1: cvs init'
+cvs -d $DIR/cvsroot init 
+$CLEAN
+$START
+log '-- test 36-2: cvs import'
+(cd /etc/init.d ; cvs -d $DIR/cvsroot import -m "nomesg"  reposname vtag rtag )
+$CLEAN
+$START
+log '-- test 36-3: cvs checkout'
+(cd $DIR ; cvs -d $DIR/cvsroot co reposname )
+$CLEAN
+$START
+log '-- test 36-4: cvs add'
+(cd $DIR/reposname ; touch foo34 ; cvs add -m 'addmsg' foo34 )
+$CLEAN
+$START
+log '-- test 36-5: cvs update'
+(cd $DIR/reposname ; cvs update )
+$CLEAN
+$START
+log '-- test 36-6: cvs commit'
+#
+# XXX change this: use a non-root user
+(cd $DIR/reposname ; cvs commit -m 'nomsg' foo34 )
+pass
+$CLEAN
+$START
+
 log '== cleanup ============================================='
 rm -r $DIR/[Rdfs][1-9]* $DIR/ls
 
 echo '======================= finished ======================='
-exit
index a4930de..8145e63 100644 (file)
@@ -80,10 +80,28 @@ echo -n "test 6: fstat validation on multiple mount points..."
 ./multifstat $MOUNT1/f6 $MOUNT2/f6
 pass
 
+if [ -n "$BUG_1365" ]; then
+echo -n "test 7: create a file on one mount, truncate it on the other..."
+mcreate $MOUNT1/f1
+truncate $MOUNT2/f1 100
+rm $MOUNT1/f1
+pass
+else
+echo "Skipping test for 1365: set \$BUG_1365 to run it (and crash, likely)."
+fi
+
 echo "test 9: remove of open file on other node..."
 ./openunlink $MOUNT1/f9 $MOUNT2/f9 || error
 pass
 
+echo "test 9b: remove of open directory on other node..."
+./opendirunlink $MOUNT1/dir1 $MOUNT2/dir1 || error
+pass
+
+#echo "test 9c: remove of open special file on other node..."
+#./opendevunlink $MOUNT1/dev1 $MOUNT2/dev1 || error
+#pass
+
 echo -n "test 10: append of file with sub-page size on multiple mounts..."
 MTPT=1
 > $MOUNT2/f10
@@ -106,35 +124,8 @@ for C in a b c d e f g h i j k l; do
 done
 [ "`cat $MOUNT1/f11`" = "abcdefghijkl" ] && pass || error
        
-echo "test 12: file length and contents across mounts"
-dd if=$SHELL of=$MOUNT1/f12 bs=4096 count=1
-$CHECKSTAT -s 4096 $MOUNT1/f12 $MOUNT2/f12 || error
-dd if=$SHELL bs=4096 count=1 |                                 \
-       md5sum - $MOUNT1/f12 $MOUNT2/f12 | (                    \
-               read GOODSUM DASH;                              \
-               while read SUM FILE ; do                        \
-                       [ $SUM == $GOODSUM ] || exit 2;         \
-               done; ) || error
-
-echo "test 13: open(,O_TRUNC,), close() across mounts"
-dd if=$SHELL of=$MOUNT1/f13 bs=4096 count=1
-> $MOUNT1/f13
-$CHECKSTAT -s 0 $MOUNT1/f13 $MOUNT2/f13 || error
-
-echo "test 14: file extension while holding the fd open"
-> $MOUNT1/f14
-# ugh.
-touch $MOUNT1/f14-start
-sh -c "
-  echo -n a;
-  mv $MOUNT1/f14-start $MOUNT1/f14-going;
-  while [ -f $MOUNT1/f14-going ] ; do sleep 1; done;
-    "  >> $MOUNT1/f14 &
-while [ -f $MOUNT1/f14-start ] ; do sleep 1; done;
-$CHECKSTAT -s 1 $MOUNT1/f14 $MOUNT2/f14 || error
-rm $MOUNT1/f14-going
-
 rm -f $MOUNT1/f[0-9]* $MOUNT1/lnk
+
 $CLEAN
 
 exit
index 396f3b0..6cbfcb5 100644 (file)
@@ -17,7 +17,8 @@
 #define CERROR(fmt, arg...) fprintf(stderr, fmt, ## arg)
 #ifndef __u64
 #define __u64 long long
-#define HTON__u64(v) (v)
+#define cpu_to_le64(v) (v)
+#define le64_to_cpu(v) (v)
 #endif
 
 #ifndef LPU64
@@ -31,8 +32,8 @@
 #define LPDS sizeof(__u64)
 int page_debug_setup(void *addr, int len, __u64 off, __u64 id)
 {
-        off = HTON__u64(off);
-        id = HTON__u64(id);
+        off = cpu_to_le64(off);
+        id = cpu_to_le64(id);
         memcpy(addr, (char *)&off, LPDS);
         memcpy(addr + LPDS, (char *)&id, LPDS);
 
@@ -48,8 +49,8 @@ int page_debug_check(char *who, void *addr, int size, __u64 off, __u64 id)
         __u64 ne_off;
         int err = 0;
 
-        ne_off = HTON__u64(off);
-        id = HTON__u64(id);
+        ne_off = le64_to_cpu(off);
+        id = le64_to_cpu(id);
         if (memcmp(addr, (char *)&ne_off, LPDS)) {
                 CERROR("%s: for offset "LPU64" off: "LPX64" != "LPX64"\n",
                        who, off, *(__u64 *)addr, ne_off);
@@ -199,7 +200,7 @@ int main(int argc, char **argv)
                 return 5;
         }
 
-       for (offset = 0; offset < last && cmd && READ; offset += len) {
+       for (offset = 0; offset < last && cmd & READ; offset += len) {
                int i;
 
                rc = read(fd, buf, len);
index 599bd21..2b3adc3 100644 (file)
@@ -1,15 +1,22 @@
 #!/bin/bash
 
-config=${1-uml.xml}
-LMC=${LMC-../utils/lmc}
+export PATH=`dirname $0`/../utils:$PATH
+
+config=${1:-uml.xml}
+LMC=${LMC:-lmc}
 TMP=${TMP:-/tmp}
 
 MDSDEV=${MDSDEV:-$TMP/mds1}
 MDSSIZE=${MDSSIZE:-50000}
 
-OSTDEV1=${OSTDEV1:-$TMP/ost1}
-OSTDEV2=${OSTDEV2:-$TMP/ost2}
+OSTDEVBASE=$TMP/ost
+#OSTDEV1=${OSTDEV1:-${OSTDEVBASE}1}
+#OSTDEV2=${OSTDEV2:-${OSTDEVBASE}2}
+#etc
 OSTSIZE=${OSTSIZE:-100000}
+STRIPECNT=${STRIPECNT:-1}
+
+FSTYPE=${FSTYPE:-ext3}
 
 NETTYPE=${NETTYPE:-tcp}
 
@@ -66,17 +73,17 @@ done
 
 # configure mds server
 echo; echo "adding MDS on: $MDSNODE"
-${LMC} -m $config --add mds --format --node $MDSNODE --mds mds1 --dev $MDSDEV --size $MDSSIZE ||exit 10
+${LMC} -m $config --add mds --format --node $MDSNODE --mds mds1 --fstype $FSTYPE --dev $MDSDEV --size $MDSSIZE ||exit 10
 
 # configure ost
-${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz 65536 --stripe_cnt 1 --stripe_pattern 0 || exit 20
+${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz 65536 --stripe_cnt $STRIPECNT --stripe_pattern 0 || exit 20
 COUNT=1
 echo -n "adding OST on:"
 for NODE in $OSTNODES; do
        eval OSTDEV=\$OSTDEV$COUNT
        echo -n " $NODE"
-       OSTDEV=${OSTDEV:-$OSTDEV1}
-        ${LMC} -m $config --add ost --node $NODE --lov lov1 --dev $OSTDEV --size $OSTSIZE || exit 21
+       OSTDEV=${OSTDEV:-$OSTDEVBASE$COUNT}
+        ${LMC} -m $config --add ost --node $NODE --lov lov1 --fstype $FSTYPE --dev $OSTDEV --size $OSTSIZE || exit 21
        COUNT=`expr $COUNT + 1`
 done
 
diff --git a/lustre/tests/unlinkmany.c b/lustre/tests/unlinkmany.c
new file mode 100644 (file)
index 0000000..ba1bee7
--- /dev/null
@@ -0,0 +1,74 @@
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+void usage(char *prog)
+{
+       printf("usage: %s filenamefmt count\n", prog);
+       printf("       %s filenamefmt start count\n", prog);
+}
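+
+/*
+ * Example (paths purely illustrative): "unlinkmany /mnt/lustre/f%d 1000" unlinks
+ * f0..f999, and "unlinkmany /mnt/lustre/f 100 50" starts at f100 and unlinks 50
+ * files; a "%d" is appended automatically when the format contains no '%'.
+ */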
+
+int main(int argc, char ** argv)
+{
+        int i, rc = 0;
+        char format[4096], *fmt;
+        char filename[4096];
+        long start, last;
+       long begin = 0, count;
+
+        if (argc < 3 || argc > 4) {
+               usage(argv[0]);
+                return 1;
+        }
+
+        if (strlen(argv[1]) > 4080) {
+                printf("name too long\n");
+                return 1;
+        }
+
+        start = last = time(0);
+
+       if (argc == 3) {
+               count = strtol(argv[2], NULL, 0);
+               if (count < 1) {
+                        printf("count must be at least one\n");
+                        return 1;
+                }
+       } else {
+               begin = strtol(argv[2], NULL, 0);
+               count = strtol(argv[3], NULL, 0);
+       }
+
+       if (strchr(argv[1], '%')) {
+               fmt = argv[1];
+        } else {
+               sprintf(format, "%s%%d", argv[1]);
+               fmt = format;
+       }
+        for (i = 0; i < count; i++, begin++) {
+                sprintf(filename, fmt, begin);
+                rc = unlink(filename);
+                if (rc) {
+                        printf("unlink(%s) error: %s\n",
+                               filename, strerror(errno));
+                        rc = errno;
+                        break;
+                }
+                if ((i % 10000) == 0) {
+                        printf(" - unlinked %d (time %ld ; total %ld ; last "
+                               "%ld)\n", i, time(0), time(0) - start,
+                               time(0) - last);
+                        last = time(0);
+                }
+        }
+        printf("total: %d unlinks in %ld seconds: %f unlinks/second\n", i,
+               time(0) - start, ((float)i / (time(0) - start)));
+
+        return rc;
+}
index ab8692f..a376063 100644 (file)
@@ -6,27 +6,27 @@
 
 int main(int argc, char **argv)
 {
-        int fd, rc; 
+        int fd, rc;
         int i = 0;
         char buf[4096];
-        
+
         memset(buf, 0, 4096);
 
-        if (argc != 2) { 
-                printf("Usage openme <filename>\n"); 
+        if (argc != 2) {
+                printf("Usage: %s <filename>\n", argv[0]);
                 exit(1);
         }
 
         fd = open(argv[1], O_RDWR | O_CREAT, 0600);
-        if (fd == -1) { 
+        if (fd == -1) {
                 printf("Error opening %s\n", argv[1]);
                 exit(1);
         }
 
-        while (1) { 
-                sprintf(buf, "write %d\n", i); 
-                rc = write(fd, buf, sizeof(buf)); 
-                sleep(1); 
+        while (1) {
+                sprintf(buf, "write %d\n", i);
+                rc = write(fd, buf, sizeof(buf));
+                sleep(1);
         }
         return 0;
 }
index 4775289..06a1588 100644 (file)
@@ -11,8 +11,8 @@ obdctl
 lctl
 lfind
 lstripe
-lconf
 obdstat
 obdio
 obdbarrier
 lload
+wirecheck
\ No newline at end of file
diff --git a/lustre/utils/Lustre/.cvsignore b/lustre/utils/Lustre/.cvsignore
new file mode 100644 (file)
index 0000000..97e22b9
--- /dev/null
@@ -0,0 +1,4 @@
+Makefile
+Makefile.in
+.deps
+*.pyc
diff --git a/lustre/utils/Lustre/Makefile.am b/lustre/utils/Lustre/Makefile.am
new file mode 100644 (file)
index 0000000..e8e522f
--- /dev/null
@@ -0,0 +1,2 @@
+pymod_SCRIPTS = __init__.py lustredb.py error.py cmdline.py
+EXTRA_DIST = $(pymod_SCRIPTS)
diff --git a/lustre/utils/Lustre/__init__.py b/lustre/utils/Lustre/__init__.py
new file mode 100644 (file)
index 0000000..c1b93e6
--- /dev/null
@@ -0,0 +1,7 @@
+__all__ = ["lustredb"]
+
+from lustredb import LustreDB, LustreDB_XML, LustreDB_LDAP
+from error import LconfError, OptionError
+from cmdline import Options
+
+CONFIG_VERSION="2003060501"
diff --git a/lustre/utils/Lustre/cmdline.py b/lustre/utils/Lustre/cmdline.py
new file mode 100644 (file)
index 0000000..53bb6e8
--- /dev/null
@@ -0,0 +1,178 @@
+#!/usr/bin/env python
+#
+#  Copyright (C) 2002 Cluster File Systems, Inc.
+#   Author: Robert Read <rread@clusterfs.com>
+#   This file is part of Lustre, http://www.lustre.org.
+#
+#   Lustre is free software; you can redistribute it and/or
+#   modify it under the terms of version 2 of the GNU General Public
+#   License as published by the Free Software Foundation.
+#
+#   Lustre is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with Lustre; if not, write to the Free Software
+#   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+# Standardize the command-line handling for all the Python tools.
+
+import sys, getopt, types
+import string
+import error
+
+class Options:
+    FLAG = 1
+    PARAM = 2
+    INTPARAM = 3
+    def __init__(self, cmd, remain_help, options):
+        self.options = options
+        shorts = ""
+        longs = []
+        options.append(('help,h', "Print this help")) 
+        for opt in options:
+            long = self.long(opt)
+            short = self.short(opt)
+            if self.type(opt) in (Options.PARAM, Options.INTPARAM):
+                if short:  short = short + ':'
+                if long: long = long + '='
+            shorts = shorts + short
+            longs.append(long)
+        self.short_opts = shorts
+        self.long_opts = longs
+        self.cmd = cmd
+        self.remain_help = remain_help
+
+    def init_values(self):
+        values = {}
+        for opt in self.options:
+            values[self.key(opt)] = self.default(opt)
+        return values
+
+    def long(self, option):
+        n = string.find(option[0], ',')
+        if n < 0: return option[0]
+        else:     return option[0][0:n]
+
+    def key(self, option):
+        key = self.long(option)
+        return string.replace(key, '-', '_')
+        
+    def short(self, option):
+        n = string.find(option[0], ',')
+        if n < 0: return ''
+        else:     return option[0][n+1:]
+
+    def help(self, option):
+        return option[1]
+    
+    def type(self, option):
+        if len(option) >= 3:
+            return option[2]
+        return Options.FLAG
+    
+    def default(self, option):
+        if len(option) >= 4:
+            return option[3]
+        return None
+
+    def lookup_option(self, key, key_func):
+        for opt in self.options:
+            if key_func(opt) == key:
+                return opt
+
+    def lookup_short(self, key):
+        return self.lookup_option(key, self.short)
+
+    def lookup_long(self, key):
+        return self.lookup_option(key, self.long)
+
+    def handle_opts(self, opts):
+        values = self.init_values()
+        for o, a in opts:
+            if o[0:2] != '--':
+                option = self.lookup_short(o[1:])
+            else:
+                option = self.lookup_long(o[2:])
+            if self.type(option) == Options.PARAM:
+                val = a
+            elif self.type(option) == Options.INTPARAM:
+                try: 
+                    val = int(a)
+                except ValueError, e:
+                    raise error.OptionError("option: '%s' expects integer value, got '%s' "  % (o,a))
+            else:
+                val = 1
+            values[self.key(option)] = val
+        return values
+                
+        
+    class option_wrapper:
+        def __init__(self, values):
+            self.__dict__['values'] = values
+        def __getattr__(self, name):
+            if self.values.has_key(name):
+                return self.values[name]
+            else:
+                raise error.OptionError("bad option name: " + name)
+        def __setattr__(self, name, value):
+            self.values[name] = value
+
+    def parse(self, argv):
+        try:
+            opts, args = getopt.getopt(argv, self.short_opts, self.long_opts)
+            values = self.handle_opts(opts)
+            if values["help"]:
+                self.usage()
+                sys.exit(0)
+            return self.option_wrapper(values), args
+        except getopt.error, e:
+            raise error.OptionError(str(e))
+
+    def usage(self):
+        ret = 'usage: %s [options] %s\n' % (self.cmd, self.remain_help)
+        for opt in self.options:
+            s = self.short(opt)
+            if s: str = "-%s|--%s" % (s,self.long(opt))
+            else: str = "--%s" % (self.long(opt),)
+            if self.type(opt) in (Options.PARAM, Options.INTPARAM):
+                str = "%s <arg>" % (str,)
+            help = self.help(opt)
+            n = string.find(help, '\n')
+            if self.default(opt) != None:
+                if n < 0:
+                    str = "%-15s  %s (default=%s)" %(str, help,
+                                                     self.default(opt))
+                else:
+                    str = "%-15s  %s (default=%s)%s" %(str, help[0:n],
+                                                       self.default(opt),
+                                                       help[n:])
+            else:
+                str = "%-15s  %s" %(str, help)
+            ret = ret + str + "\n"
+        print ret
+
+# Test driver
+if __name__ == "__main__":
+    cl = Options("test", "xml_file", [
+                  ('verbose,v', "verbose ", Options.FLAG, 0),
+                  ('cleanup,d', "shutdown"),
+                  ('gdb',     "Display gdb module file ", Options.FLAG, 0),
+                  ('device', "device path ", Options.PARAM),
+                  ('ldapurl', "LDAP server URL ", Options.PARAM),
+                  ('lustre', "Lustre source dir ", Options.PARAM),
+                  ('portals', "Portals source dir ", Options.PARAM),
+                  ('maxlevel', """Specify the maximum level
+                    Levels are approximately like:
+                            70 - mountpoint, echo_client, osc, mdc, lov""",
+                   Options.INTPARAM, 100),
+
+                  ])
+
+    conf, args = cl.parse(sys.argv[1:])
+
+    for key in conf.values.keys():
+        print "%-10s = %s" % (key, conf.values[key])
diff --git a/lustre/utils/Lustre/error.py b/lustre/utils/Lustre/error.py
new file mode 100644 (file)
index 0000000..6c30416
--- /dev/null
@@ -0,0 +1,10 @@
+import exceptions
+
+class LconfError (exceptions.Exception):
+    def __init__(self, args):
+        self.args = args
+
+class OptionError (exceptions.Exception):
+    def __init__(self, args):
+        self.args = args
+
diff --git a/lustre/utils/Lustre/lustredb.py b/lustre/utils/Lustre/lustredb.py
new file mode 100644 (file)
index 0000000..35bca56
--- /dev/null
@@ -0,0 +1,413 @@
+import sys, types, string, os
+import re, exceptions
+import xml.dom.minidom
+import Lustre
+
+# ============================================================
+# XML processing and query
+
+class LustreDB:
+    def lookup(self, uuid):
+        """ lookup returns a new LustreDB instance"""
+        return self._lookup_by_uuid(uuid)
+
+    def lookup_name(self, name, class_name = ""):
+        """ lookup returns a new LustreDB instance"""
+        return self._lookup_by_name(name, class_name)
+
+    def lookup_class(self, class_name):
+        """ lookup returns a new LustreDB instance"""
+        return self._lookup_by_class(class_name)
+
+    def get_val(self, tag, default=None):
+        v =  self._get_val(tag)
+        if v:
+            return v
+        if default != None:
+            return default
+        return None
+
+    def get_class(self):
+        return self._get_class()
+
+    def get_val_int(self, tag, default=0):
+        str = self._get_val(tag)
+        try:
+            if str:
+                return int(str)
+            return default
+        except ValueError:
+            raise Lustre.LconfError("text value is not an integer: " + str)
+            
+    def get_first_ref(self, tag):
+        """ Get the first uuidref of the type TAG. Only
+        one is expected.  Returns the uuid."""
+        uuids = self._get_refs(tag)
+        if len(uuids) > 0:
+            return  uuids[0]
+        return None
+    
+    def get_refs(self, tag):
+        """ Get all the refs of type TAG.  Returns list of uuids. """
+        uuids = self._get_refs(tag)
+        return uuids
+
+    def get_all_refs(self):
+        """ Get all the refs.  Returns list of uuids. """
+        uuids = self._get_all_refs()
+        return uuids
+
+    def nid2server(self, nid, net_type):
+        netlist = self.lookup_class('network')
+        for net_db in netlist:
+            if net_db.get_val('nid') == nid and net_db.get_val('nettype') == net_type: 
+                return net_db
+        return None
+    
+    # Find the target_device for target on a node
+    # node->profiles->device_refs->target
+    def get_node_tgt_dev(self, node_name, target_uuid):
+        node_db = self.lookup_name(node_name)
+        if not node_db:
+            return None
+        return self.get_tgt_dev(target_uuid)
+
+    # get all network uuids for this node
+    def get_networks(self):
+        ret = []
+        prof_list = self.get_refs('profile')
+        for prof_uuid in prof_list:
+            prof_db = self.lookup(prof_uuid)
+            net_list = prof_db.get_refs('network')
+            for net_uuid in net_list:
+                ret.append(net_uuid)
+        return ret
+
+    def get_active_dev(self, tgtuuid):
+        tgt = self.lookup(tgtuuid)
+        tgt_dev_uuid =tgt.get_first_ref('active')
+        return tgt_dev_uuid
+
+    def get_tgt_dev(self, tgtuuid):
+        prof_list = self.get_refs('profile')
+        for prof_uuid in prof_list:
+            prof_db = self.lookup(prof_uuid)
+            if not prof_db:
+                raise Lustre.LconfError("profile " + prof_uuid + " not found.")
+            for ref_class, ref_uuid in prof_db.get_all_refs(): 
+                if ref_class in ('osd', 'mdsdev'):
+                    devdb = self.lookup(ref_uuid)
+                    uuid = devdb.get_first_ref('target')
+                    if tgtuuid == uuid:
+                        return ref_uuid
+        return None
+
+    def get_group(self, group):
+        ret = []
+        devs = self.lookup_class('mds')
+        for tgt in devs:
+            if tgt.get_val('group', "") == group:
+                ret.append(tgt.getUUID())
+        devs = self.lookup_class('ost')
+        for tgt in devs:
+            if tgt.get_val('group', "") == group:
+                ret.append(tgt.getUUID())
+        return ret
+
+    # Change the current active device for a target
+    def update_active(self, tgtuuid, new_uuid):
+        self._update_active(tgtuuid, new_uuid)
+
+    def get_version(self):
+        return self.get_val('version')
+
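+# A rough usage sketch (file and node names are hypothetical):
+#   dom = xml.dom.minidom.parse("config.xml")
+#   db = LustreDB_XML(dom.documentElement, dom.documentElement)
+#   node_db = db.lookup_name("client-node")
+#   if node_db:
+#       print node_db.getUUID(), node_db.get_networks()
+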
+class LustreDB_XML(LustreDB):
+    def __init__(self, dom, root_node):
+        # init xmlfile
+        self.dom_node = dom
+        self.root_node = root_node
+
+    def xmltext(self, dom_node, tag):
+        list = dom_node.getElementsByTagName(tag)
+        if len(list) > 0:
+            dom_node = list[0]
+            dom_node.normalize()
+            if dom_node.firstChild:
+                txt = string.strip(dom_node.firstChild.data)
+                if txt:
+                    return txt
+
+    def xmlattr(self, dom_node, attr):
+        return dom_node.getAttribute(attr)
+
+    def _get_val(self, tag):
+        """a value could be an attribute of the current node
+        or the text value in a child node"""
+        ret  = self.xmlattr(self.dom_node, tag)
+        if not ret:
+            ret = self.xmltext(self.dom_node, tag)
+        return ret
+
+    def _get_class(self):
+        return self.dom_node.nodeName
+
+    def get_ref_type(self, ref_tag):
+        res = string.split(ref_tag, '_')
+        return res[0]
+
+    #
+    # [(ref_class, ref_uuid),]
+    def _get_all_refs(self):
+        list = []
+        for n in self.dom_node.childNodes: 
+            if n.nodeType == n.ELEMENT_NODE:
+                ref_uuid = self.xml_get_ref(n)
+                ref_class = self.get_ref_type(n.nodeName)
+                list.append((ref_class, ref_uuid))
+                    
+        list.sort()
+        return list
+
+    def _get_refs(self, tag):
+        """ Get all the refs of type TAG.  Returns list of uuids. """
+        uuids = []
+        refname = '%s_ref' % tag
+        reflist = self.dom_node.getElementsByTagName(refname)
+        for r in reflist:
+            uuids.append(self.xml_get_ref(r))
+        return uuids
+
+    def xmllookup_by_uuid(self, dom_node, uuid):
+        for n in dom_node.childNodes:
+            if n.nodeType == n.ELEMENT_NODE:
+                if self.xml_get_uuid(n) == uuid:
+                    return n
+                else:
+                    n = self.xmllookup_by_uuid(n, uuid)
+                    if n: return n
+        return None
+
+    def _lookup_by_uuid(self, uuid):
+        dom = self.xmllookup_by_uuid(self.root_node, uuid)
+        if dom:
+            return LustreDB_XML(dom, self.root_node)
+
+    def xmllookup_by_name(self, dom_node, name):
+        for n in dom_node.childNodes:
+            if n.nodeType == n.ELEMENT_NODE:
+                if self.xml_get_name(n) == name:
+                    return n
+                else:
+                    n = self.xmllookup_by_name(n, name)
+                    if n: return n
+        return None
+
+    def _lookup_by_name(self, name, class_name):
+        dom = self.xmllookup_by_name(self.root_node, name)
+        if dom:
+            return LustreDB_XML(dom, self.root_node)
+
+    def xmllookup_by_class(self, dom_node, class_name):
+        return dom_node.getElementsByTagName(class_name)
+
+    def _lookup_by_class(self, class_name):
+        ret = []
+        domlist = self.xmllookup_by_class(self.root_node, class_name)
+        for node in domlist:
+            ret.append(LustreDB_XML(node, self.root_node))
+        return ret
+
+    def xml_get_name(self, n):
+        return n.getAttribute('name')
+        
+    def getName(self):
+        return self.xml_get_name(self.dom_node)
+
+    def xml_get_ref(self, n):
+        return n.getAttribute('uuidref')
+
+    def xml_get_uuid(self, dom_node):
+        return dom_node.getAttribute('uuid')
+
+    def getUUID(self):
+        return self.xml_get_uuid(self.dom_node)
+
+    # Convert routes from the router to a route that will be used
+    # on the local system.  The network type and gw are changed to the
+    # interface on the router the local system will connect to.
+    def get_local_routes(self, type, gw):
+        """ Return the routes as a list of tuples of the form:
+        [(type, gw, lo, hi),]"""
+        res = []
+        tbl = self.dom_node.getElementsByTagName('routetbl')
+        for t in tbl:
+            routes = t.getElementsByTagName('route')
+            for r in routes:
+                net_type = self.xmlattr(r, 'type')
+                if type != net_type:
+                    lo = self.xmlattr(r, 'lo')
+                    hi = self.xmlattr(r, 'hi')
+                    tgt_cluster_id = self.xmlattr(r, 'tgtclusterid')
+                    res.append((type, gw, tgt_cluster_id, lo, hi))
+        return res
+
+    def get_route_tbl(self):
+        ret = []
+        for r in self.dom_node.getElementsByTagName('route'):
+            net_type = self.xmlattr(r, 'type')
+            gw = self.xmlattr(r, 'gw')
+            gw_cluster_id = self.xmlattr(r, 'gwclusterid')
+            tgt_cluster_id = self.xmlattr(r, 'tgtclusterid')
+            lo = self.xmlattr(r, 'lo')
+            hi = self.xmlattr(r, 'hi')
+            ret.append((net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi))
+        return ret
+
+    def _update_active(self, tgt, new):
+        raise Lustre.LconfError("updates not implemented for XML")
+
+# ================================================================    
+# LDAP Support
+class LustreDB_LDAP(LustreDB):
+    def __init__(self, name, attrs,
+                 base = "fs=lustre",
+                 parent = None,
+                 url  = "ldap://localhost",
+                 user = "cn=Manager, fs=lustre",
+                 pw   = "secret"
+                 ):
+        self._name = name
+        self._attrs = attrs
+        self._base = base
+        self._parent = parent
+        self._url  = url
+        self._user = user
+        self._pw   = pw
+        if parent:
+            self.l = parent.l
+            self._base = parent._base
+        else:
+            self.open()
+
+    def open(self):
+        import ldap
+        try:
+            self.l = ldap.initialize(self._url)
+            # Set LDAP protocol version used
+            self.l.protocol_version=ldap.VERSION3
+            # user and pw only needed if modifying db
+            self.l.bind_s(self._user, self._pw, ldap.AUTH_SIMPLE);
+        except ldap.LDAPError, e:
+            raise Lustre.LconfError('Unable to connect to LDAP server')
+
+        try:
+            self._name, self._attrs = self.l.search_s(self._base,
+                                                      ldap.SCOPE_BASE)[0]
+        except ldap.LDAPError, e:
+            raise Lustre.LconfError("no config found in ldap: %s"
+                                      % (self._base,))
+    def close(self):
+        self.l.unbind_s()
+
+    def ldap_search(self, filter):
+        """Return list of uuids matching the filter."""
+        import ldap
+        dn = self._base
+        ret = []
+        uuids = []
+        try:
+            for name, attrs in self.l.search_s(dn, ldap.SCOPE_ONELEVEL,
+                                        filter, ["uuid"]):
+                for v in attrs['uuid']:
+                    uuids.append(v)
+        except ldap.NO_SUCH_OBJECT, e:
+            pass
+        except ldap.LDAPError, e:
+            print e                     # FIXME: die here?
+        if len(uuids) > 0:
+            for uuid in uuids:
+                ret.append(self._lookup_by_uuid(uuid))
+        return ret
+
+    def _lookup_by_name(self, name, class_name):
+        list =  self.ldap_search("lustreName=%s" %(name))
+        if len(list) == 1:
+            return list[0]
+        return None
+
+    def _lookup_by_class(self, class_name):
+        return self.ldap_search("objectclass=%s" %(string.upper(class_name)))
+
+    def _lookup_by_uuid(self, uuid):
+        import ldap
+        dn = "uuid=%s,%s" % (uuid, self._base)
+        ret = None
+        try:
+            for name, attrs in self.l.search_s(dn, ldap.SCOPE_BASE,
+                                               "objectclass=*"):
+                ret = LustreDB_LDAP(name, attrs,  parent = self)
+                        
+        except ldap.NO_SUCH_OBJECT, e:
+            pass                        # just return empty list
+        except ldap.LDAPError, e:
+            print e                     # FIXME: die here?
+        return ret
+
+
+    def _get_val(self, k):
+        ret = None
+        if self._attrs.has_key(k):
+            v = self._attrs[k]
+            if type(v) == types.ListType:
+                ret = str(v[0])
+            else:
+                ret = str(v)
+        return ret
+
+    def _get_class(self):
+        return string.lower(self._attrs['objectClass'][0])
+
+    def get_ref_type(self, ref_tag):
+        return ref_tag[:-3]
+
+    #
+    # [(ref_class, ref_uuid),]
+    def _get_all_refs(self):
+        list = []
+        for k in self._attrs.keys():
+            if re.search('.*Ref', k):
+                for uuid in self._attrs[k]:
+                    ref_class = self.get_ref_type(k)
+                    list.append((ref_class, uuid))
+        return list
+
+    def _get_refs(self, tag):
+        """ Get all the refs of type TAG.  Returns list of uuids. """
+        uuids = []
+        refname = '%sRef' % tag
+        if self._attrs.has_key(refname):
+            return self._attrs[refname]
+        return []
+
+    def getName(self):
+        return self._get_val('lustreName')
+
+    def getUUID(self):
+        return self._get_val('uuid')
+
+    def get_route_tbl(self):
+        return []
+
+    def _update_active(self, tgtuuid, newuuid):
+        """Return list of uuids matching the filter."""
+        import ldap
+        dn = "uuid=%s,%s" %(tgtuuid, self._base)
+        ret = []
+        uuids = []
+        try:
+            self.l.modify_s(dn, [(ldap.MOD_REPLACE, "activeRef", newuuid)])
+        except ldap.NO_SUCH_OBJECT, e:
+            print e
+        except ldap.LDAPError, e:
+            print e                     # FIXME: die here?
+        return 
index d345b64..e78bb7d 100644 (file)
@@ -1,13 +1,15 @@
 # Administration utilities Makefile
 DEFS=
+SUBDIRS = Lustre
 
-CFLAGS:=-g -O2 -I$(top_srcdir)/utils -I$(PORTALS)/include  -I$(srcdir)/../include -Wall -L$(PORTALSLIB)
+CFLAGS:=-g -O2 -I$(top_srcdir)/utils -I$(top_srcdir)/portals/include  -I$(srcdir)/../include -Wall -L../portals/utils
 KFLAGS:=
 CPPFLAGS = $(HAVE_LIBREADLINE)
 lctl_LDADD := $(LIBREADLINE) -lptlctl
 lload_LDADD := -lptlctl
-sbin_PROGRAMS = lctl lfind lstripe obdio obdbarrier obdstat lload
+sbin_PROGRAMS = lctl lfind lstripe obdio obdbarrier obdstat lload wirecheck
 sbin_SCRIPTS = lconf lmc llanalyze
+wirecheck_SOURCES = wirecheck.c
 lctl_SOURCES = parser.c obd.c lctl.c parser.h obdctl.h
 lload_SOURCES = lload.c 
 obdio_SOURCES = obdio.c obdiolib.c obdiolib.h
diff --git a/lustre/utils/lactive b/lustre/utils/lactive
new file mode 100644 (file)
index 0000000..6d7771d
--- /dev/null
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+#
+#  Copyright (C) 2002 Cluster File Systems, Inc.
+#   Author: Robert Read <rread@clusterfs.com>
+#   This file is part of Lustre, http://www.lustre.org.
+#
+#   Lustre is free software; you can redistribute it and/or
+#   modify it under the terms of version 2 of the GNU General Public
+#   License as published by the Free Software Foundation.
+#
+#   Lustre is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with Lustre; if not, write to the Free Software
+#   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+# For all the OST/MDSs that are primary on the --primary node, set
+# them to be active on --active if that OST is available on --active.
+#
+# Make the active node the active node for all devices it shares with the
+# old. The bulk of this code is for figuring out which devices to
+# change, and what to change them to.
+
+# XXX add error checking
+# XXX make this code less ugly
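+#
+# Example invocation (a sketch; the URL, config, group and node names are placeholders):
+#   lactive --ldapurl ldap://ldap-server --config mycluster \
+#           --group ost-failover --active node-b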
+
+import sys, getopt, types
+import string, os
+import ldap
+import Lustre
+
+lactive_options = [
+    ('ldapurl',"LDAP server URL", Lustre.Options.PARAM,
+     "ldap://localhost"),
+    ('config', "Cluster config name used for LDAP query", Lustre.Options.PARAM),
+    ('group', "The group of devices to update", Lustre.Options.PARAM),
+    ('active', "The active node name", Lustre.Options.PARAM),
+    ]
+
+def fatal(*args):
+    msg = string.join(map(str,args))
+    print "! " + msg
+    sys.exit(1)
+
+
+cl = Lustre.Options("lactive","", lactive_options)
+config, args = cl.parse(sys.argv[1:])
+
+if not (config.group and config.active):
+    fatal("Must specify both group and active node.")
+
+if not config.config:
+    fatal("Missing config")
+    
+base = "config=%s,fs=lustre" % (config.config,)
+db = Lustre.LustreDB_LDAP('', {}, base=base, url = config.ldapurl)
+
+active_node = db.lookup_name(config.active)
+if not active_node:
+    fatal(config.active, "node not found in database.")
+
+devices =  db.get_group(config.group)
+if len(devices) == 0:
+    fatal("no devices found for group", config.group)
+
+# for all devices in group
+  # lookup device in active node
+  # update the active device 
+for tgtuuid in devices:
+    active_uuid = db.get_active_dev(tgtuuid)
+    new_active_uuid = active_node.get_tgt_dev(tgtuuid)
+    if active_uuid != new_active_uuid:
+        print ("%s: changing active %s to %s:%s"
+               % (tgtuuid, active_uuid,
+                  config.active, new_active_uuid))
+        db.update_active(tgtuuid, new_active_uuid)
+
+
+
+
+
similarity index 60%
rename from lustre/utils/lconf.in
rename to lustre/utils/lconf
index cbe05dd..7b31fef 100755 (executable)
@@ -26,7 +26,7 @@
 
 import sys, getopt, types
 import string, os, stat, popen2, socket, time, random, fcntl, select
-import re, exceptions
+import re, exceptions, signal
 import xml.dom.minidom
 
 if sys.version[0] == '1':
@@ -34,6 +34,19 @@ if sys.version[0] == '1':
 else:
     from fcntl import F_GETFL, F_SETFL
 
+PYMOD_DIR = "/usr/lib/lustre/python"
+
+def development_mode():
+    base = os.path.dirname(sys.argv[0])
+    if os.access(base+"/Makefile.am", os.R_OK):
+        return 1
+    return 0
+
+if not development_mode():
+    sys.path.append(PYMOD_DIR)
+
+import Lustre
+
 # Global parameters
 MAXTCPBUF = 1048576
 DEFAULT_TCPBUF = 1048576
@@ -41,7 +54,61 @@ DEFAULT_TCPBUF = 1048576
 # Maximum number of devices to search for.
 # (the /dev/loop* nodes need to be created beforehand)
 MAX_LOOP_DEVICES = 256
-PORTALS_DIR = '@PORTALSLOC@'
+PORTALS_DIR = 'portals'
+
+
+# Please keep these up to date with the values in portals/kp30.h
+ptldebug_names = { 
+    "trace" :     (1 << 0),
+    "inode" :     (1 << 1),
+    "super" :     (1 << 2),
+    "ext2" :      (1 << 3),
+    "malloc" :    (1 << 4),
+    "cache" :     (1 << 5),
+    "info" :      (1 << 6),
+    "ioctl" :     (1 << 7),
+    "blocks" :    (1 << 8),
+    "net" :       (1 << 9),
+    "warning" :   (1 << 10),
+    "buffs" :     (1 << 11),
+    "other" :     (1 << 12),
+    "dentry" :    (1 << 13),
+    "portals" :   (1 << 14),
+    "page" :      (1 << 15),
+    "dlmtrace" :  (1 << 16),
+    "error" :     (1 << 17),
+    "emerg" :     (1 << 18),
+    "ha" :        (1 << 19),
+    "rpctrace" :  (1 << 20),
+    "vfstrace" :  (1 << 21),
+    }
+
+subsystem_names = {
+    "undefined" :    (0 << 24),
+    "mdc" :          (1 << 24),
+    "mds" :          (2 << 24),
+    "osc" :          (3 << 24),
+    "ost" :          (4 << 24),
+    "class" :        (5 << 24),
+    "obdfs" :        (6 << 24),
+    "llite" :        (7 << 24),
+    "rpc" :          (8 << 24),
+    "ext2obd" :      (9 << 24),
+    "portals" :     (10 << 24),
+    "socknal" :     (11 << 24),
+    "qswnal" :      (12 << 24),
+    "pinger" :      (13 << 24),
+    "filter" :      (14 << 24),
+    "trace" :       (15 << 24),
+    "echo" :        (16 << 24),
+    "ldlm" :        (17 << 24),
+    "lov" :         (18 << 24),
+    "gmnal" :       (19 << 24),
+    "ptlrouter" :   (20 << 24),
+    "cobd" :        (21 << 24),
+    "ptlbd" :       (22 << 24),
+    }
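+
+# The values above are bit flags and bit fields; purely for illustration,
+# a combined debug mask could be formed by OR-ing selected flags, e.g.
+#   mask = ptldebug_names["net"] | ptldebug_names["dlmtrace"]   # (1<<9)|(1<<16)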
+
 
 first_cleanup_error = 0
 def cleanup_error(rc):
@@ -49,194 +116,16 @@ def cleanup_error(rc):
     if not first_cleanup_error:
         first_cleanup_error = rc
 
-
-def usage():
-    print """usage: lconf config.xml
-
-config.xml          Lustre configuration in xml format.
---ldapurl           LDAP server URL, eg. ldap://localhost
---config            Cluster config name used for LDAP query
---node <nodename>   Load config for <nodename>
---select service=nodeA,service2=nodeB   U
--d | --cleanup      Cleans up config. (Shutdown)
--f | --force        Forced unmounting and/or obd detach during cleanup
--v | --verbose      Print system commands as they are run
--h | --help         Print this help 
---gdb               Prints message after creating gdb module script
-                    and sleeps for 5 seconds.
--n | --noexec       Prints the commands and steps that will be run for a
-                    config without executing them. This can used to check if a
-                    config file is doing what it should be doing. (Implies -v)
---nomod             Skip load/unload module step.
---nosetup           Skip device setup/cleanup step.
---reformat          Reformat all devices (without question)
---dump <file>       Dump the kernel debug log before portals is unloaded
---minlevel <num>    Specify the minimum level of services to configure/cleanup (default 0)
---maxlevel <num>    Specify the maximum level of services to configure/cleanup (default 100)
-                    Levels are aproximatly like:
-                            10 - network
-                            20 - device, ldlm
-                            30 - osd, mdd
-                            40 - mds, ost
-                            50 - mdc, osc
-                            60 - lov
-                            70 - mountpoint, echo_client
---lustre=src_dir    Base directory of lustre sources. This parameter will cause lconf
-                    to load modules from a source tree.
---portals=src_dir   Portals source directory.  If this is a relative path, then it is
-                    assumed to be relative to lustre. 
-
-"""
-    TODO = """
---ldap server       LDAP server with lustre config database
---makeldiff         Translate xml source to LDIFF 
-This are perhaps not needed:
-"""
-    sys.exit()
-
-# ============================================================
-# Config parameters, encapsulated in a class
-class Config:
-    def __init__(self):
-        # flags
-        self._noexec = 0
-        self._verbose = 0
-        self._reformat = 0
-        self._cleanup = 0
-        self._gdb = 0
-        self._nomod = 0
-        self._nosetup = 0
-        self._force = 0
-        # parameters
-        self._modules = None
-        self._node = None
-        self._url = None
-        self._gdb_script = '/tmp/ogdb'
-        self._debug_path = '/tmp/lustre-log'
-        self._dump_file = None
-        self._lustre_dir = ''
-        self._portals_dir = ''
-       self._minlevel = 0
-       self._maxlevel = 100
-        self._timeout = 0
-        self._recovery_upcall = ''
-        self._ldapurl = ''
-        self._config_name = ''
-        self._select = {}
-        self._lctl_dump = ''
-
-    def verbose(self, flag = None):
-        if flag: self._verbose = flag
-        return self._verbose
-
-    def noexec(self, flag = None):
-        if flag: self._noexec = flag
-        return self._noexec
-
-    def reformat(self, flag = None):
-        if flag: self._reformat = flag
-        return self._reformat
-
-    def cleanup(self, flag = None):
-        if flag: self._cleanup = flag
-        return self._cleanup
-
-    def gdb(self, flag = None):
-        if flag: self._gdb = flag
-        return self._gdb
-
-    def nomod(self, flag = None):
-        if flag: self._nomod = flag
-        return self._nomod
-
-    def nosetup(self, flag = None):
-        if flag: self._nosetup = flag
-        return self._nosetup
-
-    def force(self, flag = None):
-        if flag: self._force = flag
-        return self._force
-
-    def node(self, val = None):
-        if val: self._node = val
-        return self._node
-
-    def gdb_script(self):
-        if os.path.isdir('/r'):
-            return '/r' + self._gdb_script
-        else:
-            return self._gdb_script
-
-    def debug_path(self):
-        if os.path.isdir('/r'):
-            return '/r' + self._debug_path
-        else:
-            return self._debug_path
-
-    def dump_file(self, val = None):
-        if val: self._dump_file = val
-        return self._dump_file
-    def minlevel(self, val = None):
-        if val: self._minlevel = int(val)
-        return self._minlevel
-
-    def maxlevel(self, val = None):
-        if val: self._maxlevel = int(val)
-        return self._maxlevel
-
-    def portals_dir(self, val = None):
-        if val: self._portals_dir = val
-        return self._portals_dir
-
-    def lustre_dir(self, val = None):
-        if val: self._lustre_dir = val
-        return self._lustre_dir
-
-    def timeout(self, val = None):
-        if val: self._timeout = val
-        return self._timeout
-
-    def recovery_upcall(self, val = None):
-        if val: self._recovery_upcall = val
-        return self._recovery_upcall
-
-    def ldapurl(self, val = None):
-        if val: self._ldapurl = val
-        return self._ldapurl
-
-    def config_name(self, val = None):
-        if val: self._config_name = val
-        return self._config_name
-
-    def init_select(self, arg):
-        # arg = "service=nodeA,service2=nodeB"
-        list = string.split(arg, ',')
-        for entry in list:
-            srv, node = string.split(entry, '=')
-            self._select[srv] = node
-        
-    def select(self, srv):
-        if self._select.has_key(srv):
-            return self._select[srv]
-        return None
-
-    def lctl_dump(self, val = None):
-        if val: self._lctl_dump = val
-        return self._lctl_dump
-
-
-config = Config()
-
 # ============================================================ 
 # debugging and error funcs
 
 def fixme(msg = "this feature"):
-    raise LconfError, msg + ' not implmemented yet.'
+    raise Lustre.LconfError, msg + ' not implemented yet.'
 
 def panic(*args):
     msg = string.join(map(str,args))
-    if not config.noexec():
-        raise LconfError(msg)
+    if not config.noexec:
+        raise Lustre.LconfError(msg)
     else:
         print "! " + msg
 
@@ -249,10 +138,24 @@ def logall(msgs):
         print string.strip(s)
 
 def debug(*args):
-    if config.verbose():
+    if config.verbose:
         msg = string.join(map(str,args))
         print msg
 
+
+# ack, python's builtin int() does not accept '0x123' syntax without an
+# explicit base argument.  eval can do it, although what a hack!
+def my_int(s):
+    try:
+        if s[0:2] == '0x':
+            return eval(s, {}, {})
+        else:
+            return int(s)
+    except SyntaxError, e:
+        raise ValueError("not a number")
+    except NameError, e:
+        raise ValueError("not a number")
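+# For example (illustrative): my_int('0x1f') and my_int('31') both return 31,
+# while a non-numeric string such as 'xyz' raises ValueError.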
+
 # ============================================================
 # locally defined exceptions
 class CommandError (exceptions.Exception):
@@ -278,10 +181,6 @@ class CommandError (exceptions.Exception):
         else:
             print self.cmd_err
 
-class LconfError (exceptions.Exception):
-    def __init__(self, args):
-        self.args = args
-
 
 # ============================================================
 # handle daemons, like the acceptor
@@ -374,6 +273,14 @@ def run_acceptors():
         if not daemon.running():
             daemon.start()
 
+def run_one_acceptor(port):
+    if acceptors.has_key(port):
+        daemon = acceptors[port]
+        if not daemon.running():
+            daemon.start()
+    else:
+         panic("run_one_acceptor: No acceptor defined for port:", port)   
+        
 def stop_acceptor(port):
     if acceptors.has_key(port):
         daemon = acceptors[port]
@@ -395,7 +302,7 @@ class LCTLInterface:
         self.lctl = find_prog(cmd)
         self.save_file = ''
         if not self.lctl:
-            if config.noexec():
+            if config.noexec:
                 debug('! lctl not found')
                 self.lctl = 'lctl'
             else:
@@ -422,7 +329,7 @@ class LCTLInterface:
             cmds = '\n  dump ' + self.save_file + cmds
 
         debug("+", cmd_line, cmds)
-        if config.noexec(): return (0, [])
+        if config.noexec: return (0, [])
 
         child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
         child.tochild.write(cmds + "\n")
@@ -474,19 +381,16 @@ class LCTLInterface:
             
     def network(self, net, nid):
         """ initialized network and add "self" """
-        # Idea: "mynid" could be used for all network types to add "self," and then
-        # this special case would be gone and the "self" hack would be hidden.
-        if net  in ('tcp', 'toe'):
-            cmds =  """
+        cmds =  """
   network %s
   mynid %s
   quit """ % (net, nid)
-            self.run(cmds)
+        self.run(cmds)
 
     # create a new connection
     def connect(self, srv):
         cmds =  "\n  add_uuid %s %s %s" % (srv.uuid, srv.nid, srv.net_type)
-        if srv.net_type  in ('tcp', 'toe') and not config.lctl_dump():
+        if srv.net_type  in ('tcp', 'toe') and not config.lctl_dump:
             flags = ''
             if srv.irq_affinity:
                 flags = flags + 'i'
@@ -503,6 +407,14 @@ class LCTLInterface:
 
         cmds = cmds + "\n  quit"
         self.run(cmds)
+
+    # Recover a device
+    def recover(self, dev_uuid, new_conn):
+        cmds = """
+    device %%%s
+    probe
+    recover %s""" %(dev_uuid, new_conn)
+        self.run(cmds)
                 
     # add a route to a range
     def add_route(self, net, gw, lo, hi):
@@ -553,6 +465,13 @@ class LCTLInterface:
   quit""" % (net, nid, servuuid)
         self.run(cmds)
 
+    def del_uuid(self, servuuid):
+        cmds =  """
+  ignore_errors
+  del_uuid %s
+  quit""" % (servuuid,)
+        self.run(cmds)
+
     # disconnect all
     def disconnectAll(self, net):
         cmds =  """
@@ -572,17 +491,20 @@ class LCTLInterface:
         self.run(cmds)
 
     # cleanup a device
-    def cleanup(self, name, uuid):
+    def cleanup(self, name, uuid, force, failover = 0):
+        if failover: force = 1
         cmds = """
   ignore_errors
   device $%s
-  cleanup %s
+  cleanup %s %s
   detach
-  quit""" % (name, ('', 'force')[config.force()])
+  quit""" % (name, ('', 'force')[force],
+             ('', 'failover')[failover])
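+        # For illustration: with force and failover both set, the script sent
+        # to lctl is roughly "ignore_errors / device $<name> / cleanup force
+        # failover / detach / quit" (one command per line).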
         self.run(cmds)
 
     # create an lov
-    def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
+    def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
+                      pattern, devlist):
         cmds = """
   device $%s
   probe
@@ -599,7 +521,13 @@ class LCTLInterface:
 
     # get list of devices
     def device_list(self):
-        rc, out = self.runcmd('device_list')
+        try:
+            rc, out = self.runcmd('device_list')
+        except CommandError, e:
+            if config.cleanup:
+                out = []
+            else:
+                raise e
         return out
 
     # get lustre version
@@ -607,6 +535,12 @@ class LCTLInterface:
         rc, out = self.runcmd('version')
         return out
 
+    # dump mount options
+    def mount_option(self, option):
+        cmds = """
+  mount_option %s
+  quit""" % (option)
+        self.run(cmds)
 # ============================================================
 # Various system-level functions
 # (ideally moved to their own module)
@@ -616,7 +550,7 @@ class LCTLInterface:
 # save it if necessary
 def runcmd(cmd):
     debug ("+", cmd)
-    if config.noexec(): return (0, [])
+    if config.noexec: return (0, [])
     f = os.popen(cmd + ' 2>&1')
     out = f.readlines()
     ret = f.close()
@@ -634,7 +568,7 @@ def run(*args):
 def run_daemon(*args):
     cmd = string.join(map(str,args))
     debug ("+", cmd)
-    if config.noexec(): return 0
+    if config.noexec: return 0
     f = os.popen(cmd + ' 2>&1')
     ret = f.close()
     if ret:
@@ -649,8 +583,8 @@ def find_prog(cmd):
     syspath = string.split(os.environ['PATH'], ':')
     cmdpath = os.path.dirname(sys.argv[0])
     syspath.insert(0, cmdpath);
-    if config.portals_dir():
-        syspath.insert(0, os.path.join(config.portals_dir()+'/linux/utils/'))
+    if config.portals:
+        syspath.insert(0, os.path.join(config.portals, 'utils/'))
     for d in syspath:
         prog = os.path.join(d,cmd)
         if os.access(prog, os.X_OK):
@@ -690,25 +624,32 @@ def is_block(path):
 
 # build fs according to type
 # fixme: dangerous
-def mkfs(dev, devsize, fstype):
+def mkfs(dev, devsize, fstype, jsize):
     block_cnt = ''
+    jopt = ''
     if devsize:
+        if devsize < 8000:
+            panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
+                  (dev, devsize))
         # devsize is in 1k, and fs block count is in 4k
         block_cnt = devsize/4
 
-    if(fstype in ('ext3', 'extN')):
+    if fstype in ('ext3', 'extN'):
+        # ext3 journal size is in megabytes
+        if jsize:  jopt = "-J size=%d" %(jsize,)
         mkfs = 'mkfs.ext2 -j -b 4096 -F '
-    elif (fstype == 'reiserfs'):
+    elif fstype == 'reiserfs':
+        # reiserfs journal size is in blocks
+        if jsize:  jopt = "--journal_size %d" %(jsize,)
         mkfs = 'mkreiserfs -ff'
     else:
         print 'unsupported fs type: ', fstype
 
-    (ret, out) = run (mkfs, dev, block_cnt)
+    (ret, out) = run (mkfs, jopt, dev, block_cnt)
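+    # Illustration (assuming ext3 and jsize=32): the command run above is
+    # roughly "mkfs.ext2 -j -b 4096 -F -J size=32 <dev> <block_cnt>", where
+    # block_cnt is devsize/4 (devsize is in 1k units and the fs uses 4k blocks).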
     if ret:
-        panic("Unable to build fs:", dev)
+        panic("Unable to build fs:", dev, string.join(out))
     # enable hash tree indexing on fsswe
-    # FIXME: this check can probably go away on 2.5
-    if fstype == 'extN':
+    if fstype in ('ext3', 'extN'):
         htree = 'echo "feature FEATURE_C5" | debugfs -w'
         (ret, out) = run (htree, dev)
         if ret:
@@ -731,7 +672,7 @@ def find_loop(file):
         dev = loop + str(n)
         if os.access(dev, os.R_OK):
             (stat, out) = run('losetup', dev)
-            if (out and stat == 0):
+            if out and stat == 0:
                 m = re.search(r'\((.*)\)', out[0])
                 if m and file == m.group(1):
                     return dev
@@ -740,18 +681,19 @@ def find_loop(file):
     return ''
 
 # create file if necessary and assign the first free loop device
-def init_loop(file, size, fstype):
+def init_loop(file, size, fstype, journal_size):
     dev = find_loop(file)
     if dev:
         print 'WARNING file:', file, 'already mapped to', dev
         return dev
-    if config.reformat()  or not os.access(file, os.R_OK | os.W_OK):
+    if config.reformat or not os.access(file, os.R_OK | os.W_OK):
         if size < 8000:
-            panic(file, "size must be larger than 8MB, currently set to:", size)
+            panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
         (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
                                                                          file))
         if ret:
             panic("Unable to create backing store:", file)
+        mkfs(file, size, fstype, journal_size)
 
     loop = loop_base()
     # find next free loop
@@ -759,7 +701,7 @@ def init_loop(file, size, fstype):
         dev = loop + str(n)
         if os.access(dev, os.R_OK):
             (stat, out) = run('losetup', dev)
-            if (stat):
+            if stat:
                 run('losetup', dev, file)
                 return dev
         else:
@@ -783,12 +725,12 @@ def need_format(fstype, dev):
     return 0
 
 # initialize a block device if needed
-def block_dev(dev, size, fstype, format):
-    if config.noexec(): return dev
+def block_dev(dev, size, fstype, format, journal_size):
+    if config.noexec: return dev
     if not is_block(dev):
-        dev = init_loop(dev, size, fstype)
-    if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
-        mkfs(dev, size, fstype)
+        dev = init_loop(dev, size, fstype, journal_size)
+    elif config.reformat or (need_format(fstype, dev) and format == 'yes'):
+        mkfs(dev, size, fstype, journal_size)
 
 #    else:
 #        panic("device:", dev,
@@ -807,8 +749,7 @@ def if2addr(iface):
     return ip
 
 def get_local_nid(net_type, wildcard):
-    """Return the local nid. First look for an elan interface,
-      then use the local address. """
+    """Return the local nid."""
     local = ""
     if os.access('/proc/elan/device0/position', os.R_OK):
         local = get_local_address('elan', '*')
@@ -843,16 +784,28 @@ def get_local_address(net_type, wildcard):
             log(e)
     elif net_type == 'gm':
         fixme("automatic local address for GM")
+    elif net_type == 'scimac':
+        scinode="/opt/scali/sbin/scinode"
+        if os.path.exists(scinode):
+            (rc,local) = run(scinode)
+        else:
+            panic (scinode, " not found on node with scimac networking")
+        if rc:
+            panic (scinode, " failed")
+        local=string.rstrip(local[0])
+
     return local
         
 
 def is_prepared(uuid):
     """Return true if a device exists for the uuid"""
-    # expect this format:
-    # 1 UP ldlm ldlm ldlm_UUID 2
-    if config.lctl_dump():
+    if config.lctl_dump:
         return 0
+    if config.noexec and config.cleanup:
+        return 1
     try:
+        # expect this format:
+        # 1 UP ldlm ldlm ldlm_UUID 2
         out = lctl.device_list()
         for s in out:
             if uuid == string.split(s)[4]:
@@ -861,20 +814,27 @@ def is_prepared(uuid):
         e.dump()
     return 0
 
-def is_network_prepared():
-    """If the  PTLRPC device exists, then assumet that all networking
-       has been configured"""
-    if config.lctl_dump():
+def is_prepared_name(name):
+    """Return true if a device exists for the name"""
+    if config.lctl_dump:
         return 0
+    if config.noexec and config.cleanup:
+        return 1
     try:
+        # expect this format:
+        # 1 UP ldlm ldlm ldlm_UUID 2
         out = lctl.device_list()
         for s in out:
-            if 'RPCDEV_UUID' == string.split(s)[4]:
+            if name == string.split(s)[3]:
                 return 1
     except CommandError, e:
         e.dump()
     return 0
-    
+
+def is_network_prepared():
+    """If the LDLM device exists, then assume that all networking
+       has been configured"""
+    return is_prepared('ldlm_UUID')
     
 def fs_is_mounted(path):
     """Return true if path is a mounted lustre filesystem"""
@@ -915,7 +875,7 @@ class Module:
         """ default cleanup, used for most modules """
         self.info()
         try:
-            lctl.cleanup(self.name, self.uuid)
+            lctl.cleanup(self.name, self.uuid, config.force)
         except CommandError, e:
             log(self.module_name, "cleanup failed: ", self.name)
             e.dump()
@@ -923,11 +883,11 @@ class Module:
             
     def add_portals_module(self, dev_dir, modname):
         """Append a module to list of modules to load."""
-        self.kmodule_list.append((config.portals_dir(), dev_dir, modname))
+        self.kmodule_list.append((config.portals, dev_dir, modname))
 
     def add_lustre_module(self, dev_dir, modname):
         """Append a module to list of modules to load."""
-        self.kmodule_list.append((config.lustre_dir(), dev_dir, modname))
+        self.kmodule_list.append((config.lustre, dev_dir, modname))
 
     def mod_loaded(self, modname):
         """Check if a module is already loaded. Look in /proc/modules for it."""
@@ -943,9 +903,9 @@ class Module:
         """Load all the modules in the list in the order they appear."""
         for src_dir, dev_dir, mod in self.kmodule_list:
             #  (rc, out) = run ('/sbin/lsmod | grep -s', mod)
-            if self.mod_loaded(mod) and not config.noexec():
+            if self.mod_loaded(mod) and not config.noexec:
                 continue
-            log ('loading module:', mod)
+            log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
             if src_dir:
                 module = find_module(src_dir, dev_dir,  mod)
                 if not module:
@@ -960,27 +920,34 @@ class Module:
             
     def cleanup_module(self):
         """Unload the modules in the list in reverse order."""
+        if not self.safe_to_clean():
+            return
         rev = self.kmodule_list
         rev.reverse()
         for src_dir, dev_dir, mod in rev:
-            if not self.mod_loaded(mod):
+            if not self.mod_loaded(mod) and not config.noexec:
                 continue
             # debug hack
-            if mod == 'portals' and config.dump_file():
-                lctl.dump(config.dump_file())
+            if mod == 'portals' and config.dump:
+                lctl.dump(config.dump)
             log('unloading module:', mod)
-            if config.noexec():
-                continue
             (rc, out) = run('/sbin/rmmod', mod)
             if rc:
                 log('! unable to unload module:', mod)
                 logall(out)
+
+    def safe_to_clean(self):
+        return 1
+        
+    def safe_to_clean_modules(self):
+        return self.safe_to_clean()
         
 class Network(Module):
     def __init__(self,db):
         Module.__init__(self, 'NETWORK', db)
         self.net_type = self.db.get_val('nettype')
         self.nid = self.db.get_val('nid', '*')
+        self.cluster_id = self.db.get_val('clusterid', "0")
         self.port = self.db.get_val_int('port', 0)
         self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
         self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
@@ -988,7 +955,10 @@ class Network(Module):
         self.nid_exchange = self.db.get_val_int('nidexchange', 0)
 
         if '*' in self.nid:
-            self.nid = get_local_nid(self.net_type, self.nid)
+            if self.nid_exchange:
+                self.nid = get_local_nid(self.net_type, self.nid)
+            else:
+                self.nid = get_local_address(self.net_type, self.nid)
             if not self.nid:
                 panic("unable to set nid for", self.net_type, self.nid)
             debug("nid:", self.nid)
@@ -999,31 +969,88 @@ class Network(Module):
             if not self.nid:
                 panic("unable to set nid for", self.net_type, self.hostaddr)
             debug("hostaddr:", self.hostaddr)
-        # debug ( "hostaddr ", self.hostaddr, "net_type", self.net_type)
 
-        self.add_portals_module("linux/oslib", 'portals')
+        self.add_portals_module("libcfs", 'portals')
         if node_needs_router():
-            self.add_portals_module("linux/router", 'kptlrouter')
+            self.add_portals_module("router", 'kptlrouter')
         if self.net_type == 'tcp':
-            self.add_portals_module("linux/socknal", 'ksocknal')
+            self.add_portals_module("knals/socknal", 'ksocknal')
         if self.net_type == 'toe':
-            self.add_portals_module("/linux/toenal", 'ktoenal')
+            self.add_portals_module("knals/toenal", 'ktoenal')
         if self.net_type == 'elan':
-            self.add_portals_module("/linux/rqswnal", 'kqswnal')
+            self.add_portals_module("knals/qswnal", 'kqswnal')
         if self.net_type == 'gm':
-            self.add_portals_module("/linux/gmnal", 'kgmnal')
-        self.add_lustre_module('obdclass', 'obdclass')
+            self.add_portals_module("knals/gmnal", 'kgmnal')
+        if self.net_type == 'scimac':
+            self.add_portals_module("knals/scimacnal", 'kscimacnal')
 
     def prepare(self):
         if is_network_prepared():
             return
         self.info(self.net_type, self.nid, self.port)
         lctl.network(self.net_type, self.nid)
+        if self.port and  node_is_router():
+            run_one_acceptor(self.port)
+            self.connect_peer_gateways()
+
+    def connect_peer_gateways(self):
+        for router in self.db.lookup_class('node'):
+            if router.get_val_int('router', 0):
+                # if this is a peer with a nid less than mine,
+                # then connect.
+                for netuuid in router.get_networks():
+                    net = self.db.lookup(netuuid)
+                    gw = Network(net)
+                    if (gw.cluster_id == self.cluster_id and
+                        gw.net_type == self.net_type):
+                        # hack: compare as numbers if possible, this should all
+                        # go away once autoconnect is done.
+                        # This also conveniently prevents us from connecting to ourself.
+                        try:
+                            gw_nid = my_int(gw.nid)
+                            self_nid = my_int(self.nid)
+                        except ValueError, e:
+                            print "Error!", str(e)
+                            gw_nid = gw.nid
+                            self_nid = self.nid
+                        if gw_nid < self_nid:
+                            lctl.connect(gw)
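+        # Illustrative example of the rule above: if this node's nid is 0x12
+        # and a peer gateway's nid is 0x10 (same cluster and net type), this
+        # node issues the connect and the lower-nid peer does not, so the
+        # connection is only set up once.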
+
+    def disconnect_peer_gateways(self):
+        for router in self.db.lookup_class('node'):
+            if router.get_val_int('router', 0):
+                # if this is a peer with a nid less than mine,
+                # then disconnect.
+                for netuuid in router.get_networks():
+                    net = self.db.lookup(netuuid)
+                    gw = Network(net)
+                    if (gw.cluster_id == self.cluster_id and
+                        gw.net_type == self.net_type):
+                        # hack: compare as numbers if possible, this should all
+                        # go away once autoconnect is done.
+                        # This also conveniently prevents us from connecting to ourself.
+                        try:
+                            gw_nid = my_int(gw.nid)
+                            self_nid = my_int(self.nid)
+                        except ValueError, e:
+                            print "Error!", str(e)
+                            gw_nid = gw.nid
+                            self_nid = self.nid
+                        if gw_nid < self_nid:
+                            try:
+                                lctl.disconnect(gw.net_type, gw.nid, gw.port,
+                                                gw.uuid)
+                            except CommandError, e:
+                                print "disconnect failed: ", self.name
+                                e.dump()
+                                cleanup_error(e.rc)
+
+    def safe_to_clean(self):
+        return not is_network_prepared()
 
     def cleanup(self):
         self.info(self.net_type, self.nid, self.port)
-        if self.net_type in ('tcp', 'toe'):
+        if self.port:
             stop_acceptor(self.port)
+        if  node_is_router():
+            self.disconnect_peer_gateways()
         try:
             lctl.disconnectAll(self.net_type)
         except CommandError, e:
@@ -1031,25 +1058,31 @@ class Network(Module):
             e.dump()
             cleanup_error(e.rc)
 
-class Router(Module):
+class RouteTable(Module):
     def __init__(self,db):
-        Module.__init__(self, 'ROUTER', db)
+        Module.__init__(self, 'ROUTES', db)
     def prepare(self):
         if is_network_prepared():
             return
         self.info()
-        for net_type, gw, lo, hi in self.db.get_route_tbl():
+        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
             lctl.add_route(net_type, gw, lo, hi)
-            if net_type in ('tcp', 'toe') and local_net_type(net_type) and hi == '':
+            if net_type in ('tcp', 'toe') and local_net_type(net_type) and lo == hi:
                 srvdb = self.db.nid2server(lo, net_type)
-
                 if not srvdb:
                     panic("no server for nid", lo)
                 else:
                     srv = Network(srvdb)
                     lctl.connect(srv)
+
+    def safe_to_clean(self):
+        return not is_network_prepared()
+
     def cleanup(self):
-        for net_type, gw, lo, hi in self.db.get_route_tbl():
+        if is_network_prepared():
+            # the network is still being used, don't clean it up
+            return
+        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
             if net_type in ('tcp', 'toe') and local_net_type(net_type) and hi == '':
                 srvdb = self.db.nid2server(lo, net_type)
                 if not srvdb:
@@ -1072,25 +1105,20 @@ class Router(Module):
 class LDLM(Module):
     def __init__(self,db):
         Module.__init__(self, 'LDLM', db)
+        self.add_lustre_module('obdclass', 'obdclass')
+        self.add_lustre_module('ptlrpc', 'ptlrpc')
         self.add_lustre_module('ldlm', 'ldlm') 
+
     def prepare(self):
         if is_prepared(self.uuid):
             return
         self.info()
         lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid))
-    def cleanup(self):
-        if is_prepared(self.uuid):
-            Module.cleanup(self)
 
-class PTLRPC(Module):
-    def __init__(self,db):
-        Module.__init__(self, 'PTLRPC', db)
-        self.add_lustre_module('ptlrpc', 'ptlrpc') 
-    def prepare(self):
-        if is_prepared(self.uuid):
-            return
-        self.info()
-        lctl.newdev(attach="ptlrpc %s %s" % (self.name, self.uuid))
+    def safe_to_clean(self):
+        out = lctl.device_list()
+        return len(out) <= 1
+
     def cleanup(self):
         if is_prepared(self.uuid):
             Module.cleanup(self)
@@ -1109,7 +1137,7 @@ class LOV(Module):
         self.devlist = self.db.get_refs('obd')
         self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
         self.osclist = []
-        self.mdc_uudi = ''
+        self.mdc_uuid = ''
         for obd_uuid in self.devlist:
             obd = self.db.lookup(obd_uuid)
             osc = get_osc(obd, self.name)
@@ -1123,11 +1151,12 @@ class LOV(Module):
             return
         for osc in self.osclist:
             try:
-                # Ignore connection failures, because the LOV will DTRT with
-                # an unconnected OSC.
-                osc.prepare(ignore_connect_failure=1)
-            except CommandError:
+                # Only ignore connect failures with --force, which
+                # isn't implemented here yet.
+                osc.prepare(ignore_connect_failure=0)
+            except CommandError, e:
                 print "Error preparing OSC %s (inactive)\n" % osc.uuid
+                raise e
         self.mdc_uuid = prepare_mdc(self.db, self.name, self.mds_uuid)
         self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
                   self.stripe_off, self.pattern, self.devlist, self.mds_name)
@@ -1178,27 +1207,33 @@ class MDSDEV(Module):
         Module.__init__(self, 'MDSDEV', db)
         self.devpath = self.db.get_val('devpath','')
         self.size = self.db.get_val_int('devsize', 0)
+        self.journal_size = self.db.get_val_int('journalsize', 0)
         self.fstype = self.db.get_val('fstype', '')
         # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
         target_uuid = self.db.get_first_ref('target')
         mds = self.db.lookup(target_uuid)
         self.name = mds.getName()
         self.lovconfig_uuids = mds.get_refs('lovconfig')
+        self.filesystem_uuids = mds.get_refs('filesystem')
         # FIXME: if fstype not set, then determine based on kernel version
         self.format = self.db.get_val('autoformat', "no")
-
-        active_uuid = mds.get_active_target()
+        if mds.get_val('failover', 0):
+            self.failover_mds = 'f'
+        else:
+            self.failover_mds = ''
+        active_uuid = get_active_target(mds)
         if not active_uuid:
             panic("No target device found:", target_uuid)
         if active_uuid == self.uuid:
             self.active = 1
         else:
             self.active = 0
+        if self.active and config.group and config.group != mds.get_val('group'):
+            self.active = 0
+
         self.target_dev_uuid = self.uuid
         self.uuid = target_uuid
         # modules
-        if self.fstype == 'extN':
-            self.add_lustre_module('extN', 'extN') 
         self.add_lustre_module('mds', 'mds')
         if self.fstype:
             self.add_lustre_module('obdclass', 'fsfilt_%s' % (self.fstype))
@@ -1215,7 +1250,8 @@ class MDSDEV(Module):
             return
         self.info(self.devpath, self.fstype, self.format)
         run_acceptors()
-        blkdev = block_dev(self.devpath, self.size, self.fstype, self.format)
+        blkdev = block_dev(self.devpath, self.size, self.fstype, self.format,
+                           self.journal_size)
         if not is_prepared('MDT_UUID'):
             lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
                         setup ="")
@@ -1225,17 +1261,57 @@ class MDSDEV(Module):
             db = self.db.lookup(uuid)
             lovconfig = LOVConfig(db)
             lovconfig.prepare()
+        if config.mds_ost_conn:
+            for uuid in self.filesystem_uuids:
+                log("open clients for filesystem:", uuid)
+                fs = self.db.lookup(uuid)
+                obd_uuid = fs.get_first_ref('obd')
+                client = VOSC(self.db.lookup(obd_uuid), self.name)
+                client.prepare()
+                
             
+    def msd_remaining(self):
+        out = lctl.device_list()
+        for s in out:
+            if string.split(s)[2] in ('mds',):
+                return 1
+
+    def safe_to_clean(self):
+        return self.active
+
+    def safe_to_clean_modules(self):
+        return not self.msd_remaining()
+
     def cleanup(self):
-        if is_prepared('MDT_UUID'):
+        if not self.active:
+            debug(self.uuid, "not active")
+            return
+        if is_prepared(self.uuid):
+            self.info()
             try:
-                lctl.cleanup("MDT", "MDT_UUID")
+                lctl.cleanup(self.name, self.uuid, config.force,
+                             config.failover)
+            except CommandError, e:
+                log(self.module_name, "cleanup failed: ", self.name)
+                e.dump()
+                cleanup_error(e.rc)
+                Module.cleanup(self)
+        if config.mds_ost_conn:
+            for uuid in self.filesystem_uuids:
+                log("clean clients for filesystem:", uuid)
+                log("open clients for filesystem:", uuid)
+                fs = self.db.lookup(uuid)
+                obd_uuid = fs.get_first_ref('obd')
+                client = VOSC(self.db.lookup(obd_uuid), self.name)
+                client.cleanup()
+        if not self.msd_remaining() and is_prepared('MDT_UUID'):
+            try:
+                lctl.cleanup("MDT", "MDT_UUID", config.force,
+                             config.failover)
             except CommandError, e:
                 print "cleanup failed: ", self.name
                 e.dump()
                 cleanup_error(e.rc)
-        if is_prepared(self.uuid):
-            Module.cleanup(self)
         clean_loop(self.devpath)
 
 class OSD(Module):
@@ -1244,29 +1320,35 @@ class OSD(Module):
         self.osdtype = self.db.get_val('osdtype')
         self.devpath = self.db.get_val('devpath', '')
         self.size = self.db.get_val_int('devsize', 0)
+        self.journal_size = self.db.get_val_int('journalsize', 0)
         self.fstype = self.db.get_val('fstype', '')
         target_uuid = self.db.get_first_ref('target')
         ost = self.db.lookup(target_uuid)
         self.name = ost.getName()
-        # FIXME: if fstype not set, then determine based on kernel version
         self.format = self.db.get_val('autoformat', 'yes')
-        if self.fstype == 'extN':
-            self.add_lustre_module('extN', 'extN') 
+        if ost.get_val('failover', 0):
+            self.failover_ost = 'f'
+        else:
+            self.failover_ost = ''
 
-        active_uuid = ost.get_active_target()
+        active_uuid = get_active_target(ost)
         if not active_uuid:
             panic("No target device found:", target_uuid)
         if active_uuid == self.uuid:
             self.active = 1
         else:
             self.active = 0
+        if self.active and config.group and config.group != ost.get_val('group'):
+            self.active = 0
+            
         self.target_dev_uuid = self.uuid
         self.uuid = target_uuid
         # modules
         self.add_lustre_module('ost', 'ost')
-        self.add_lustre_module(self.osdtype, self.osdtype)
+        # FIXME: should we default to ext3 here?
         if self.fstype:
             self.add_lustre_module('obdclass' , 'fsfilt_%s' % (self.fstype))
+        self.add_lustre_module(self.osdtype, self.osdtype)
 
     def load_module(self):
         if self.active:
@@ -1281,28 +1363,54 @@ class OSD(Module):
         if not self.active:
             debug(self.uuid, "not active")
             return
-        self.info(self.osdtype, self.devpath, self.size, self.fstype, self.format)
+        self.info(self.osdtype, self.devpath, self.size, self.fstype,
+                  self.format, self.journal_size)
         run_acceptors()
         if self.osdtype == 'obdecho':
             blkdev = ''
         else:
-            blkdev = block_dev(self.devpath, self.size, self.fstype, self.format)
+            blkdev = block_dev(self.devpath, self.size, self.fstype,
+                               self.format, self.journal_size)
         lctl.newdev(attach="%s %s %s" % (self.osdtype, self.name, self.uuid),
-                    setup ="%s %s" %(blkdev, self.fstype))
+                    setup ="%s %s %s" %(blkdev, self.fstype,
+                                        self.failover_ost))
         if not is_prepared('OSS_UUID'):
             lctl.newdev(attach="ost %s %s" % ('OSS', 'OSS_UUID'),
                         setup ="")
 
+    def osd_remaining(self):
+        out = lctl.device_list()
+        for s in out:
+            if string.split(s)[2] in ('obdfilter', 'obdecho'):
+                return 1
+
+    def safe_to_clean(self):
+        return self.active
+
+    def safe_to_clean_modules(self):
+        return not self.osd_remaining()
+
     def cleanup(self):
-        if is_prepared('OSS_UUID'):
+        if not self.active:
+            debug(self.uuid, "not active")
+            return
+        if is_prepared(self.uuid):
+            self.info()
             try:
-                lctl.cleanup("OSS", "OSS_UUID")
+                lctl.cleanup(self.name, self.uuid, config.force,
+                             config.failover)
+            except CommandError, e:
+                log(self.module_name, "cleanup failed: ", self.name)
+                e.dump()
+                cleanup_error(e.rc)
+        if not self.osd_remaining() and is_prepared('OSS_UUID'):
+            try:
+                lctl.cleanup("OSS", "OSS_UUID", config.force,
+                             config.failover)
             except CommandError, e:
                 print "cleanup failed: ", self.name
                 e.dump()
                 cleanup_error(e.rc)
-        if is_prepared(self.uuid):
-            Module.cleanup(self)
         if not self.osdtype == 'obdecho':
             clean_loop(self.devpath)
 
@@ -1313,7 +1421,7 @@ class Client(Module):
         self.target_uuid = tgtdb.getUUID()
         self.db = tgtdb
 
-        self.tgt_dev_uuid = tgtdb.get_active_target()
+        self.tgt_dev_uuid = get_active_target(tgtdb)
         if not self.tgt_dev_uuid:
             panic("No target device found for target:", self.target_name)
             
@@ -1323,9 +1431,10 @@ class Client(Module):
 
         self.module = module
         self.module_name = string.upper(module)
-        self.name = '%s_%s_%s' % (self.module_name, owner, self.target_name)
-        self.uuid = '%05x%05x_%.14s_%05x%05x' % (int(random.random() * 1048576),
-                                              int(random.random() * 1048576),self.name,
+        self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
+                                     self.target_name, owner)
+        self.uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
+                                              self.name,
                                               int(random.random() * 1048576),
                                               int(random.random() * 1048576))
         self.uuid = self.uuid[0:36]
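+        # Note (illustrative): the format above produces at most
+        # 5 + 1 + 19 + 1 + 5 + 5 = 36 characters ('%.19s' truncates the name),
+        # so the slice simply enforces the 36-character limit.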
@@ -1334,7 +1443,7 @@ class Client(Module):
 
     def lookup_server(self, srv_uuid):
         """ Lookup a server's network information """
-        self._server_nets = self.db.get_ost_net(srv_uuid)
+        self._server_nets = get_ost_net(self.db, srv_uuid)
         if len(self._server_nets) == 0:
             panic ("Unable to find a server for:", srv_uuid)
 
@@ -1342,11 +1451,11 @@ class Client(Module):
         return self._server_nets
 
     def prepare(self, ignore_connect_failure = 0):
-        if is_prepared(self.uuid):
-            return
         self.info(self.target_uuid)
+        if is_prepared_name(self.name):
+            self.cleanup()
         try:
-            srv = local_net(self.get_servers())
+            srv = choose_local_server(self.get_servers())
             if srv:
                 lctl.connect(srv)
             else:
@@ -1355,34 +1464,28 @@ class Client(Module):
                     lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
                 else:
                     panic ("no route to",  self.target_uuid)
-        except CommandError:
-            if (ignore_connect_failure == 0):
-                pass
+        except CommandError, e:
+            if not ignore_connect_failure:
+                raise e
         if srv:
             lctl.newdev(attach="%s %s %s" % (self.module, self.name, self.uuid),
                         setup ="%s %s" %(self.target_uuid, srv.uuid))
 
     def cleanup(self):
-        Module.cleanup(self)
-        srv = local_net(self.get_servers())
-        if srv:
+        if is_prepared_name(self.name):
+            Module.cleanup(self)
             try:
-                lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
+                srv = choose_local_server(self.get_servers())
+                if srv:
+                    lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
+                else:
+                    srv, r =  find_route(self.get_servers())
+                    if srv:
+                        lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
             except CommandError, e:
-                log(self.module_name, "disconnect failed: ", self.name)
+                log(self.module_name, "cleanup failed: ", self.name)
                 e.dump()
                 cleanup_error(e.rc)
-        else:
-            self.info(self.target_uuid)
-            srv, r =  find_route(self.get_servers())
-            if srv:
-                try:
-                    lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
-                except CommandError, e:
-                    print "del_route failed: ", self.name
-                    e.dump()
-                    cleanup_error(e.rc)
-
 
 
 class MDC(Client):
@@ -1472,8 +1575,10 @@ class Mountpoint(Module):
     def __init__(self,db):
         Module.__init__(self, 'MTPT', db)
         self.path = self.db.get_val('path')
-        self.mds_uuid = self.db.get_first_ref('mds')
-        self.obd_uuid = self.db.get_first_ref('obd')
+        self.fs_uuid = self.db.get_first_ref('filesystem')
+        fs = self.db.lookup(self.fs_uuid)
+        self.mds_uuid = fs.get_first_ref('mds')
+        self.obd_uuid = fs.get_first_ref('obd')
         obd = self.db.lookup(self.obd_uuid)
         self.vosc = VOSC(obd, self.name)
         if self.vosc.need_mdc():
@@ -1482,25 +1587,36 @@ class Mountpoint(Module):
 
 
     def prepare(self):
+        if fs_is_mounted(self.path):
+            log(self.path, "already mounted.")
+            return
         self.vosc.prepare()
         if self.vosc.need_mdc():
             mdc_uuid = prepare_mdc(self.db, self.name,  self.mds_uuid)
         else:
             mdc_uuid = self.vosc.get_mdc_uuid()
         if not mdc_uuid:
+            self.vosc.cleanup()
             panic("Unable to determine MDC UUID. Probably need to cleanup before re-mounting.")
         self.info(self.path, self.mds_uuid, self.obd_uuid)
+        if config.lctl_dump:
+            cmd = "osc=%s,mdc=%s" % (self.vosc.get_uuid(), mdc_uuid)
+            lctl.mount_option(cmd)
+            return
         cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
               (self.vosc.get_uuid(), mdc_uuid, self.path)
         run("mkdir", self.path)
         ret, val = run(cmd)
         if ret:
-            panic("mount failed:", self.path)
+            self.vosc.cleanup()
+            if self.vosc.need_mdc():
+                cleanup_mdc(self.db, self.name, self.mds_uuid)
+            panic("mount failed:", self.path, ":", string.join(val))
 
     def cleanup(self):
         self.info(self.path, self.mds_uuid,self.obd_uuid)
-        if  fs_is_mounted(self.path):
-            if config.force():
+        if fs_is_mounted(self.path):
+            if config.force:
                 (rc, out) = run("umount", "-f", self.path)
             else:
                 (rc, out) = run("umount", self.path)
@@ -1523,426 +1639,64 @@ class Mountpoint(Module):
 
 
 # ============================================================
-# XML processing and query
-
-class LustreDB:
-    def lookup(self, uuid):
-        """ lookup returns a new LustreDB instance"""
-        return self._lookup_by_uuid(uuid)
-
-    def lookup_name(self, name, class_name = ""):
-        """ lookup returns a new LustreDB instance"""
-        return self._lookup_by_name(name, class_name)
-
-    def lookup_class(self, class_name):
-        """ lookup returns a new LustreDB instance"""
-        return self._lookup_by_class(class_name)
-
-    def get_val(self, tag, default=None):
-        v =  self._get_val(tag)
-        if v:
-            return v
-        if default != None:
-            return default
-        debug("LustreDB", self.getName(), " no value for:", tag)
-        return None
-
-    def get_class(self):
-        return self._get_class()
-
-    def get_val_int(self, tag, default=0):
-        str = self._get_val(tag)
-        try:
-            if str:
-                return int(str)
-            return default
-        except ValueError:
-            panic("text value is not integer:", str)
-            
-    def get_first_ref(self, tag):
-        """ Get the first uuidref of the type TAG. Only
-        one is expected.  Returns the uuid."""
-        uuids = self._get_refs(tag)
-        if len(uuids) > 0:
-            return  uuids[0]
-        return None
-    
-    def get_refs(self, tag):
-        """ Get all the refs of type TAG.  Returns list of uuids. """
-        uuids = self._get_refs(tag)
-        return uuids
-
-    def get_all_refs(self):
-        """ Get all the refs.  Returns list of uuids. """
-        uuids = self._get_all_refs()
-        return uuids
-
-    def get_ost_net(self, osd_uuid):
-        srv_list = []
-        if not osd_uuid:
-            return srv_list
-        osd = self.lookup(osd_uuid)
-        node_uuid = osd.get_first_ref('node')
-        node = self.lookup(node_uuid)
-        if not node:
-            panic("unable to find node for osd_uuid:", osd_uuid,
-                  " node_ref:", node_uuid)
-        for net_uuid in node.get_networks():
-            db = node.lookup(net_uuid)
-            srv_list.append(Network(db))
-        return srv_list
+# misc query functions
 
-    def nid2server(self, nid, net_type):
-        netlist = self.lookup_class('network')
-        for net_db in netlist:
-            if net_db.get_val('nid') == nid and net_db.get_val('nettype') == net_type: 
-                return net_db
-        return None
-    
-    # the tag name is the service type
-    # fixme: this should do some checks to make sure the dom_node is a service
-    #
-    # determine what "level" a particular node is at.
-    
-    # the order of iniitailization is based on level. 
-    def getServiceLevel(self):
-        type = self.get_class()
-        ret=0;
-        if type in ('network',):
-            ret = 5
-        elif type in ('routetbl',):
-            ret = 6
-        elif type in ('ptlrpc',):
-            ret = 7
-        elif type in ('device', 'ldlm'):
-            ret = 20
-        elif type in ('osd', 'mdd', 'cobd'):
-            ret = 30
-        elif type in ('mdsdev','ost'):
-            ret = 40
-        elif type in ('mdc','osc'):
-            ret = 50
-        elif type in ('lov',):
-            ret = 60
-        elif type in ('mountpoint', 'echoclient'):
-            ret = 70
-
-        if ret < config.minlevel() or ret > config.maxlevel():
-            ret = 0 
-        return ret
-    
-    #
-    # return list of services in a profile. list is a list of tuples
-    # [(level, db_object),]
-    def getServices(self):
-        list = []
-        for ref_class, ref_uuid in self.get_all_refs(): 
-                servdb = self.lookup(ref_uuid)
-                if  servdb:
-                    level = servdb.getServiceLevel()
-                    if level > 0:
-                        list.append((level, servdb))
-                else:
-                    panic('service not found: ' + ref_uuid)
-                    
-        list.sort()
-        return list
-
-    # Find the target_device for target on a node
-    # node->profiles->device_refs->target
-    def get_target_device(self, target_uuid, node_name):
-        node_db = self.lookup_name(node_name)
-        if not node_db:
-            return None
-        prof_list = node_db.get_refs('profile')
-        for prof_uuid in prof_list:
-            prof_db = node_db.lookup(prof_uuid)
-            ref_list = prof_db.get_all_refs()
-            for ref in ref_list:
-                dev = self.lookup(ref[1])
-                if dev and dev.get_first_ref('target') == target_uuid:
-                    return ref[1]
-        return None
-
-    def get_active_target(self):
-        target_uuid = self.getUUID()
-        target_name = self.getName()
-        node_name = config.select(target_name)
-        if node_name:
-            tgt_dev_uuid = self.get_target_device(target_uuid, node_name)
-        else:
-            tgt_dev_uuid = self.get_first_ref('active')
-        return tgt_dev_uuid
-        
-
-    # get all network uuids for this node
-    def get_networks(self):
-        ret = []
-        prof_list = self.get_refs('profile')
-        for prof_uuid in prof_list:
-            prof_db = self.lookup(prof_uuid)
-            net_list = prof_db.get_refs('network')
-            #debug("get_networks():", prof_uuid, net_list)
-            for net_uuid in net_list:
-                ret.append(net_uuid)
-        return ret
-
-class LustreDB_XML(LustreDB):
-    def __init__(self, dom, root_node):
-        # init xmlfile
-        self.dom_node = dom
-        self.root_node = root_node
-
-    def xmltext(self, dom_node, tag):
-        list = dom_node.getElementsByTagName(tag)
-        if len(list) > 0:
-            dom_node = list[0]
-            dom_node.normalize()
-            if dom_node.firstChild:
-                txt = string.strip(dom_node.firstChild.data)
-                if txt:
-                    return txt
-
-    def xmlattr(self, dom_node, attr):
-        return dom_node.getAttribute(attr)
-
-    def _get_val(self, tag):
-        """a value could be an attribute of the current node
-        or the text value in a child node"""
-        ret  = self.xmlattr(self.dom_node, tag)
-        if not ret:
-            ret = self.xmltext(self.dom_node, tag)
-        return ret
-
-    def _get_class(self):
-        return self.dom_node.nodeName
-
-    #
-    # [(ref_class, ref_uuid),]
-    def _get_all_refs(self):
-        list = []
-        for n in self.dom_node.childNodes: 
-            if n.nodeType == n.ELEMENT_NODE:
-                ref_uuid = self.xml_get_ref(n)
-                ref_class = n.nodeName
-                list.append((ref_class, ref_uuid))
-                    
-        list.sort()
-        return list
-
-    def _get_refs(self, tag):
-        """ Get all the refs of type TAG.  Returns list of uuids. """
-        uuids = []
-        refname = '%s_ref' % tag
-        reflist = self.dom_node.getElementsByTagName(refname)
-        for r in reflist:
-            uuids.append(self.xml_get_ref(r))
-        return uuids
-
-    def xmllookup_by_uuid(self, dom_node, uuid):
-        for n in dom_node.childNodes:
-            if n.nodeType == n.ELEMENT_NODE:
-                if self.xml_get_uuid(n) == uuid:
-                    return n
-                else:
-                    n = self.xmllookup_by_uuid(n, uuid)
-                    if n: return n
-        return None
-
-    def _lookup_by_uuid(self, uuid):
-        dom = self. xmllookup_by_uuid(self.root_node, uuid)
-        if dom:
-            return LustreDB_XML(dom, self.root_node)
-
-    def xmllookup_by_name(self, dom_node, name):
-        for n in dom_node.childNodes:
-            if n.nodeType == n.ELEMENT_NODE:
-                if self.xml_get_name(n) == name:
-                    return n
-                else:
-                    n = self.xmllookup_by_name(n, name)
-                    if n: return n
-        return None
-
-    def _lookup_by_name(self, name, class_name):
-        dom = self.xmllookup_by_name(self.root_node, name)
-        if dom:
-            return LustreDB_XML(dom, self.root_node)
-
-    def xmllookup_by_class(self, dom_node, class_name):
-        return dom_node.getElementsByTagName(class_name)
-
-    def _lookup_by_class(self, class_name):
-        ret = []
-        domlist = self.xmllookup_by_class(self.root_node, class_name)
-        for node in domlist:
-            ret.append(LustreDB_XML(node, self.root_node))
-        return ret
-
-    def xml_get_name(self, n):
-        return n.getAttribute('name')
-        
-    def getName(self):
-        return self.xml_get_name(self.dom_node)
-
-    def xml_get_ref(self, n):
-        return n.getAttribute('uuidref')
-
-    def xml_get_uuid(self, dom_node):
-        return dom_node.getAttribute('uuid')
-
-    def getUUID(self):
-        return self.xml_get_uuid(self.dom_node)
-
-    def get_routes(self, type, gw):
-        """ Return the routes as a list of tuples of the form:
-        [(type, gw, lo, hi),]"""
-        res = []
-        tbl = self.dom_node.getElementsByTagName('routetbl')
-        for t in tbl:
-            routes = t.getElementsByTagName('route')
-            for r in routes:
-                net_type = self.xmlattr(r, 'type')
-                if type != net_type:
-                    lo = self.xmlattr(r, 'lo')
-                    hi = self.xmlattr(r, 'hi')
-                    res.append((type, gw, lo, hi))
-        return res
-
-    def get_route_tbl(self):
-        ret = []
-        for r in self.dom_node.getElementsByTagName('route'):
-            net_type = self.xmlattr(r, 'type')
-            gw = self.xmlattr(r, 'gw')
-            lo = self.xmlattr(r, 'lo')
-            hi = self.xmlattr(r, 'hi')
-            ret.append((net_type, gw, lo, hi))
-        return ret
-
-
-# ================================================================    
-# LDAP Support
-class LustreDB_LDAP(LustreDB):
-    def __init__(self, name, attrs,
-                 base = "fs=lustre",
-                 parent = None,
-                 url  = "ldap://localhost",
-                 user = "cn=Manager, fs=lustre",
-                 pw   = "secret"
-                 ):
-        self._name = name
-        self._attrs = attrs
-        self._base = base
-        self._parent = parent
-        self._url  = url
-        self._user = user
-        self._pw   = pw
-        if parent:
-            self.l = parent.l
-            self._base = parent._base
-        else:
-            self.open()
-
-    def open(self):
-        import ldap
-        try:
-            self.l = ldap.initialize(self._url)
-            # Set LDAP protocol version used
-            self.l.protocol_version=ldap.VERSION3
-            # user and pw only needed if modifying db
-            self.l.bind_s("", "", ldap.AUTH_SIMPLE);
-        except ldap.LDAPError, e:
-            panic(e)
-            # FIXME, do something useful here
-
-    def close(self):
-        self.l.unbind_s()
-
-    def ldap_search(self, filter):
-        """Return list of uuids matching the filter."""
-        import ldap
-        dn = self._base
-        ret = []
-        uuids = []
-        try:
-            for name, attrs in self.l.search_s(dn, ldap.SCOPE_ONELEVEL,
-                                        filter, ["uuid"]):
-                for v in attrs['uuid']:
-                    uuids.append(v)
-        except ldap.NO_SUCH_OBJECT, e:
-            pass
-        except ldap.LDAPError, e:
-            print e                     # FIXME: die here?
-        if len(uuids) > 0:
-            for uuid in uuids:
-                ret.append(self._lookup_by_uuid(uuid))
-        return ret
-
-    def _lookup_by_name(self, name, class_name):
-        list =  self.ldap_search("lustreName=%s" %(name))
-        if len(list) == 1:
-            return list[0]
-        return []
-
-    def _lookup_by_class(self, class_name):
-        return self.ldap_search("objectclass=%s" %(string.upper(class_name)))
-
-    def _lookup_by_uuid(self, uuid):
-        import ldap
-        dn = "uuid=%s,%s" % (uuid, self._base)
-        ret = None
-        try:
-            for name, attrs in self.l.search_s(dn, ldap.SCOPE_BASE,
-                                               "objectclass=*"):
-                ret = LustreDB_LDAP(name, attrs,  parent = self)
-                        
-        except ldap.NO_SUCH_OBJECT, e:
-            debug("NO_SUCH_OBJECT:", uuid)
-            pass                        # just return empty list
-        except ldap.LDAPError, e:
-            print e                     # FIXME: die here?
-        return ret
+def get_ost_net(self, osd_uuid):
+    srv_list = []
+    if not osd_uuid:
+        return srv_list
+    osd = self.lookup(osd_uuid)
+    node_uuid = osd.get_first_ref('node')
+    node = self.lookup(node_uuid)
+    if not node:
+        panic("unable to find node for osd_uuid:", osd_uuid,
+              " node_ref:", node_uuid)
+    for net_uuid in node.get_networks():
+        db = node.lookup(net_uuid)
+        srv_list.append(Network(db))
+    return srv_list
+
+
+# the order of initialization is based on level.
+def getServiceLevel(self):
+    type = self.get_class()
+    ret = 0
+    if type in ('network',):
+        ret = 5
+    elif type in ('routetbl',):
+        ret = 6
+    elif type in ('ldlm',):
+        ret = 20
+    elif type in ('osd', 'cobd'):
+        ret = 30
+    elif type in ('mdsdev',):
+        ret = 40
+    elif type in ('mountpoint', 'echoclient'):
+        ret = 70
+    else:
+        panic("Unknown type: ", type)
 
+    if ret < config.minlevel or ret > config.maxlevel:
+        ret = 0 
+    return ret
 
-    def _get_val(self, k):
-        ret = None
-        if self._attrs.has_key(k):
-            v = self._attrs[k]
-            if type(v) == types.ListType:
-                ret = str(v[0])
+#
+# return list of services in a profile. list is a list of tuples
+# [(level, db_object),]
+def getServices(self):
+    list = []
+    for ref_class, ref_uuid in self.get_all_refs(): 
+            servdb = self.lookup(ref_uuid)
+            if  servdb:
+                level = getServiceLevel(servdb)
+                if level > 0:
+                    list.append((level, servdb))
             else:
-                ret = str(v)
-        return ret
-
-    def _get_class(self):
-        return string.lower(self._attrs['objectClass'][0])
-
-    #
-    # [(ref_class, ref_uuid),]
-    def _get_all_refs(self):
-        list = []
-        for k in self._attrs.keys():
-            if re.search('.*Ref', k):
-                for uuid in self._attrs[k]:
-                    list.append((k, uuid))
-        return list
+                panic('service not found: ' + ref_uuid)
 
-    def _get_refs(self, tag):
-        """ Get all the refs of type TAG.  Returns list of uuids. """
-        uuids = []
-        refname = '%sRef' % tag
-        if self._attrs.has_key(refname):
-            return self._attrs[refname]
-        return []
+    list.sort()
+    return list
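
For illustration, the ordering these levels impose on setup and cleanup is sketched below; the three database objects and their levels are hypothetical placeholders, not taken from a real config:

    # getServices() returns (level, db) tuples sorted by level, so doSetup()
    # walks them lowest-first and doCleanup() walks the reversed list.
    services = [(5, network_db), (20, ldlm_db), (40, mdsdev_db)]
    for level, db in services:
        newService(db).prepare()        # setup: network, then ldlm, then mdsdev
    services.reverse()
    for level, db in services:
        newService(db).cleanup()        # cleanup: mdsdev first, network last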
 
-    def getName(self):
-        return self._get_val('lustreName')
-
-    def getUUID(self):
-        return self._get_val('uuid')
-
-    def get_route_tbl(self):
-        return []
 
 ############################################################
 # MDC UUID hack - 
@@ -1973,85 +1727,102 @@ def cleanup_mdc(db, owner, mds_uuid):
 
 ############################################################
 # routing ("rooting")
-#
-routes = []
-local_node = []
-router_flag = 0
 
-def add_local_interfaces(node_db):
-    global local_node
+# list of (nettype, cluster_id)
+local_clusters = []
+
+def find_local_clusters(node_db):
+    global local_clusters
     for netuuid in node_db.get_networks():
         net = node_db.lookup(netuuid)
         srv = Network(net)
         debug("add_local", netuuid)
-        local_node.append((srv.net_type, srv.nid))
-        if acceptors.has_key(srv.port):
-            panic("duplicate port:", srv.port)
-        if srv.net_type in ('tcp', 'toe'):
+        local_clusters.append((srv.net_type, srv.cluster_id))
+        if srv.port > 0:
+            if acceptors.has_key(srv.port):
+                panic("duplicate port:", srv.port)
             acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
                                                   srv.send_mem, srv.recv_mem,
                                                   srv.irq_affinity,
                                                   srv.nid_exchange)
 
+# This node is a gateway.
+is_router = 0
+def node_is_router():
+    return is_router
+
+# If there are any routers found in the config, then this will be true
+# and all nodes will load kptlrouter.
+needs_router = 0
 def node_needs_router():
-    return router_flag
+    return needs_router or is_router
+
+# list of (nettype, gw, tgt_cluster_id, lo, hi)
+# Currently, these local routes are only added to kptlrouter route
+# table if they are needed to connect to a specific server.  This
+# should be changed so all available routes are loaded, and the
+# ptlrouter can make all the decisions.
+local_routes = []
 
-def init_route_config(lustre):
-    """ Scan the lustre config looking for routers.  Build list of
+def find_local_routes(lustre):
+    """ Scan the lustre config looking for routers .  Build list of
     routes. """
-    global routes, router_flag
-    routes = []
+    global local_routes, needs_router
+    local_routes = []
     list = lustre.lookup_class('node')
-    for node_db in list:
-        if node_db.get_val_int('router', 0):
-            router_flag = 1
-            #debug("init_route_config: found router", node_db.getName())
-            for (local_type, local_nid) in local_node:
-                #debug("init_route_config:", local_type, local_nid)
+    for router in list:
+        if router.get_val_int('router', 0):
+            needs_router = 1
+            for (local_type, local_cluster_id) in local_clusters:
                 gw = None
-                for netuuid in node_db.get_networks():
-                    db = node_db.lookup(netuuid)
-                    if local_type == db.get_val('nettype'):
+                for netuuid in router.get_networks():
+                    db = router.lookup(netuuid)
+                    if (local_type == db.get_val('nettype') and
+                       local_cluster_id == db.get_val('clusterid')):
                         gw = db.get_val('nid')
                         break
-                #debug("init_route_config: gw is", gw)
-                if not gw:
-                    continue
-                for route in node_db.get_routes(local_type, gw):
-                    routes.append(route)
-    debug("init_route_config routes:", routes)
-
-
-def local_net(srv_list):
-    global local_node
-    for iface in local_node:
-        for srv in srv_list:
-            #debug("local_net a:", srv.net_type, "b:", iface[0])
-            if srv.net_type == iface[0]:
-                return srv
-    return None
+                if gw:
+                    debug("find_local_routes: gw is", gw)
+                    for route in router.get_local_routes(local_type, gw):
+                        local_routes.append(route)
+    debug("find_local_routes:", local_routes)
+
+
+def choose_local_server(srv_list):
+    for srv in srv_list:
+        if local_net_type(srv.net_type):
+            return srv
 
 def local_net_type(net_type):
-    global local_node
-    for iface in local_node:
-        if net_type == iface[0]:
+    for cluster in local_clusters:
+        if net_type == cluster[0]:
             return 1
     return 0
 
 def find_route(srv_list):
-    global local_node, routes
-    frm_type = local_node[0][0]
+    frm_type = local_clusters[0][0]
     for srv in srv_list:
-        #debug("find_route: srv:", srv.hostaddr, "type: ", srv.net_type)
+        debug("find_route: srv:", srv.hostaddr, "type: ", srv.net_type)
         to_type = srv.net_type
-        to = srv.hostaddr
-        #debug ('looking for route to', to_type, to)
-        for r in routes:
-            #debug("find_route: ", r)
-            if  r[2] == to:
+        to = srv.hostaddr  # XXX should this be hostaddr, or nid?
+        cluster_id = srv.cluster_id
+        debug ('looking for route to', to_type, to)
+        for r in local_routes:
+            debug("find_route: ", r)
+            if  (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
                 return srv, r
     return None,None
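
As a worked example of the matching in find_route() above (every address and cluster id here is invented), a local route tuple follows the (nettype, gw, tgt_cluster_id, lo, hi) layout; a server matches when its cluster id equals r[2] and its address sorts between lo and hi, using the same string comparison applied to the config values:

    r = ('tcp', '10.0.0.1', 2, '192.168.0.10', '192.168.0.20')    # hypothetical route
    to, cluster_id = '192.168.0.15', 2                            # hypothetical server
    matched = (r[3] <= to and to <= r[4]) and cluster_id == r[2]  # True: route via gw 10.0.0.1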
            
+def get_active_target(db):
+    target_uuid = db.getUUID()
+    target_name = db.getName()
+    node_name = get_select(target_name)
+    if node_name:
+        tgt_dev_uuid = db.get_target_device(target_uuid, node_name)
+    else:
+        tgt_dev_uuid = db.get_first_ref('active')
+    return tgt_dev_uuid
+
 
 ############################################################
 # lconf level logic
@@ -2062,14 +1833,12 @@ def newService(db):
     n = None
     if type == 'ldlm':
         n = LDLM(db)
-    elif type == 'ptlrpc':
-        n = PTLRPC(db)
     elif type == 'lov':
         n = LOV(db)
     elif type == 'network':
         n = Network(db)
     elif type == 'routetbl':
-        n = Router(db)
+        n = RouteTable(db)
     elif type == 'osd':
         n = OSD(db)
     elif type == 'cobd':
@@ -2097,44 +1866,45 @@ def for_each_profile(db, prof_list, operation):
         prof_db = db.lookup(prof_uuid)
         if not prof_db:
             panic("profile:", profile, "not found.")
-        services = prof_db.getServices()
+        services = getServices(prof_db)
         operation(services)
         
 def doSetup(services):
-    if config.nosetup():
+    if config.nosetup:
         return
     for s in services:
         n = newService(s[1])
         n.prepare()
     
 def doModules(services):
-    if config.nomod():
+    if config.nomod:
         return
     for s in services:
         n = newService(s[1])
         n.load_module()
 
 def doCleanup(services):
-    if config.nosetup():
+    if config.nosetup:
         return
     services.reverse()
     for s in services:
         n = newService(s[1])
-        n.cleanup()
+        if n.safe_to_clean():
+            n.cleanup()
 
 def doUnloadModules(services):
-    if config.nomod():
+    if config.nomod:
         return
     services.reverse()
     for s in services:
         n = newService(s[1])
-        n.cleanup_module()
+        if n.safe_to_clean_modules():
+            n.cleanup_module()
 
 #
 # Load profile for 
 def doHost(lustreDB, hosts):
-    global routes
-    global router_flag 
+    global is_router 
     node_db = None
     for h in hosts:
         node_db = lustreDB.lookup_name(h, 'node')
@@ -2144,188 +1914,168 @@ def doHost(lustreDB, hosts):
         print 'No host entry found.'
         return
 
-    router_flag = node_db.get_val_int('router', 0)
-    recovery_upcall = node_db.get_val('recovery_upcall', '')
+    is_router = node_db.get_val_int('router', 0)
+    lustre_upcall = node_db.get_val('lustreUpcall', '')
+    portals_upcall = node_db.get_val('portalsUpcall', '')
     timeout = node_db.get_val_int('timeout', 0)
 
-    add_local_interfaces(node_db)
-    if not router_flag:
-        init_route_config(lustreDB)
+    find_local_clusters(node_db)
+    if not is_router:
+        find_local_routes(lustreDB)
 
     # Two step process: (1) load modules, (2) setup lustre
     # if not cleaning, load modules first.
     prof_list = node_db.get_refs('profile')
 
-    if config.cleanup():
-        if config.force():
+    if config.recover:
+        if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
+            raise Lustre.LconfError( "--recover requires --tgt_uuid <UUID> " +
+                                     "--client_uuid <UUID> --conn_uuid <UUID>")
+        doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
+                   config.conn_uuid)
+    elif config.cleanup:
+        if config.force:
             # the command line can override this value
             timeout = 5
         # ugly hack, only need to run lctl commands for --dump
-        if config.lctl_dump():
+        if config.lctl_dump:
             for_each_profile(node_db, prof_list, doCleanup)
             return
 
         sys_set_timeout(timeout)
-        sys_set_recovery_upcall(recovery_upcall)
+        sys_set_ptldebug()
+        sys_set_subsystem()
+        sys_set_lustre_upcall(lustre_upcall)
+        sys_set_portals_upcall(portals_upcall)
 
         for_each_profile(node_db, prof_list, doCleanup)
         for_each_profile(node_db, prof_list, doUnloadModules)
 
     else:
         # ugly hack, only need to run lctl commands for --dump
-        if config.lctl_dump():
+        if config.lctl_dump:
             for_each_profile(node_db, prof_list, doSetup)
             return
 
+        sys_make_devices()
+        sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
+        sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
+
         for_each_profile(node_db, prof_list, doModules)
 
         sys_set_debug_path()
-        script = config.gdb_script()
+        sys_set_ptldebug()
+        sys_set_subsystem()
+        script = config.gdb_script
         run(lctl.lctl, ' modules >', script)
-        if config.gdb():
+        if config.gdb:
             log ("The GDB module script is in", script)
             # pause, so user has time to break and
             # load the script
             time.sleep(5)
         sys_set_timeout(timeout)
-        sys_set_recovery_upcall(recovery_upcall)
+        sys_set_lustre_upcall(lustre_upcall)
+        sys_set_portals_upcall(portals_upcall)
 
         for_each_profile(node_db, prof_list, doSetup)
 
-############################################################
-# Command line processing
-#
-def parse_cmdline(argv):
-    short_opts = "hdnvf"
-    long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
-                 "portals=", "makeldiff", "cleanup", "noexec",
-                 "help", "node=", "nomod", "nosetup",
-                 "dump=", "force", "minlevel=", "maxlevel=",
-                 "timeout=", "recovery_upcall=",
-                 "ldapurl=", "config=", "select=", "lctl_dump="]
-    opts = []
-    args = []
+def doRecovery(db, lctl, tgt_uuid, client_uuid, conn_uuid):
+    tgt = db.lookup(tgt_uuid)
+    if not tgt:
+        raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
+    new_uuid = get_active_target(tgt)
+    if not new_uuid:
+        raise Lustre.LconfError("doRecovery: no active target found for: " +
+                                tgt_uuid)
+    net = choose_local_server(get_ost_net(db, new_uuid))
+    if not net:
+        raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
+    # XXX, better to do a full disconnect here
+    log("Reconnecting", tgt_uuid, " to ",  net.uuid);
+    lctl.del_uuid(conn_uuid)
+    lctl.connect(net)
+    lctl.recover(client_uuid, net.uuid)
 
-    try:
-        opts, args = getopt.getopt(argv, short_opts, long_opts)
-    except getopt.error:
-        print "invalid opt"
-        usage()
-    
-    for o, a in opts:
-        if o in ("-h", "--help"):
-            usage()
-        if o in ("-d","--cleanup"):
-            config.cleanup(1)
-        if o in ("-v", "--verbose"):
-            config.verbose(1)
-        if o in ("-n", "--noexec"):
-            config.noexec(1)
-        if o == "--portals":
-            config.portals_dir(a)
-        if o == "--lustre":
-            config.lustre_dir(a)
-        if o == "--reformat":
-            config.reformat(1)
-        if o == "--node":
-            config.node(a)
-        if o == "--gdb":
-            config.gdb(1)
-        if o == "--nomod":
-            config.nomod(1)
-        if o == "--nosetup":
-            config.nosetup(1)
-        if o == "--dump":
-            config.dump_file(a)
-        if o in ("-f", "--force"):
-            config.force(1)
-       if o == "--minlevel":
-               config.minlevel(a)
-        if o == "--maxlevel":
-                config.maxlevel(a)
-        if o == "--timeout":
-                config.timeout(a)
-        if o == "--recovery_upcall":
-                config.recovery_upcall(a)
-        if o == "--ldapurl":
-                config.ldapurl(a)
-        if o == "--config":
-                config.config_name(a)
-        if o == "--select":
-                config.init_select(a)
-        if o == "--lctl_dump":
-            config.lctl_dump(a)
-
-    return args
-
-def fetch(url):
-    import urllib
-    data = ""
-    try:
-        s = urllib.urlopen(url)
-        data = s.read()
-    except:
-        usage()
-    return data
 
 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
     base = os.path.dirname(cmd)
-    if os.access(base+"/Makefile", os.R_OK):
-        if not config.lustre_dir():
-            config.lustre_dir(os.path.join(base, ".."))
+    if development_mode():
+        if not config.lustre:
+            config.lustre = (os.path.join(base, ".."))
         # normalize the portals dir, using command line arg if set
-        if config.portals_dir():
-            portals_dir = config.portals_dir()
-        dir = os.path.join(config.lustre_dir(), portals_dir)
-        config.portals_dir(dir)
-    elif config.lustre_dir() and config.portals_dir():
+        if config.portals:
+            portals_dir = config.portals
+        dir = os.path.join(config.lustre, portals_dir)
+        config.portals = dir
+        debug('config.portals', config.portals)
+    elif config.lustre and config.portals:
         # production mode
         # if --lustre and --portals, normalize portals 
         # can ignore PORTALS_DIR here, since it is probably useless here
-        dir = config.portals_dir()
-        dir = os.path.join(config.lustre_dir(), dir)
-        config.portals_dir(dir)
+        config.portals = os.path.join(config.lustre, config.portals)
+        debug('config.portals B', config.portals)
 
 def sysctl(path, val):
-    if config.noexec():
+    debug("+ sysctl", path, val)
+    if config.noexec:
         return
     try:
         fp = open(os.path.join('/proc/sys', path), 'w')
         fp.write(str(val))
         fp.close()
     except IOError, e:
-        print e
+        panic(str(e))
 
 
 def sys_set_debug_path():
-    debug("debug path: ", config.debug_path())
-    sysctl('portals/debug_path', config.debug_path())
+    sysctl('portals/debug_path', config.debug_path)
 
-def sys_set_recovery_upcall(upcall):
+def sys_set_lustre_upcall(upcall):
     # the command overrides the value in the node config
-    if config.recovery_upcall():
-        upcall = config.recovery_upcall()
+    if config.lustre_upcall:
+        upcall = config.lustre_upcall
+    elif config.upcall:
+        upcall = config.upcall
     if upcall:
-        debug("setting recovery_upcall:", upcall)
-        sysctl('lustre/recovery_upcall', upcall)
+        sysctl('lustre/upcall', upcall)
+
+def sys_set_portals_upcall(upcall):
+    # the command overrides the value in the node config
+    if config.portals_upcall:
+        upcall = config.portals_upcall
+    elif config.upcall:
+        upcall = config.upcall
+    if upcall:
+        sysctl('portals/upcall', upcall)
 
 def sys_set_timeout(timeout):
     # the command overrides the value in the node config
-    if config.timeout() > 0:
-        timeout = config.timeout()
-    if timeout > 0:
-        debug("setting timeout:", timeout)
+    if config.timeout > 0:
+        timeout = config.timeout
+    if timeout != None and timeout > 0:
         sysctl('lustre/timeout', timeout)
 
-def sys_set_ptldebug(ptldebug):
-    # the command overrides the value in the node config
-    if config.ptldebug():
-        ptldebug = config.ptldebug()
-    sysctl('portals/debug', ptldebug)
+def sys_set_ptldebug():
+    if config.ptldebug != None:
+        try:
+            val = eval(config.ptldebug, ptldebug_names)
+            val = "0x%x" % (val,)
+            sysctl('portals/debug', val)
+        except NameError, e:
+            panic(str(e))
+
+def sys_set_subsystem():
+    if config.subsystem != None:
+        try:
+            val = eval(config.subsystem, ptldebug_names)
+            val = "0x%x" % (val,)
+            sysctl('portals/subsystem_debug', val)
+        except NameError, e:
+            panic(str(e))
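
Both masks are evaluated as Python expressions against a table of symbolic names and written to /proc as hex. A minimal sketch, assuming ptldebug_names (defined elsewhere in lconf) maps flag names such as 'trace' and 'inode' to their bit values:

    val = eval("trace | inode", ptldebug_names)   # e.g. 0x1 | 0x2 == 0x3
    sysctl('portals/debug', "0x%x" % (val,))      # lands in /proc/sys/portals/debug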
 
 def sys_set_netmem_max(path, max):
     debug("setting", path, "to at least", max)
-    if config.noexec():
+    if config.noexec:
         return
     fp = open(path)
     str = fp.readline()
@@ -2351,6 +2101,20 @@ def add_to_path(new_dir):
         return
     os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
     
+def default_debug_path():
+    path = '/tmp/lustre-log'
+    if os.path.isdir('/r'):
+        return '/r' + path
+    else:
+        return path
+
+def default_gdb_script():
+    script = '/tmp/ogdb'
+    if os.path.isdir('/r'):
+        return '/r' + script
+    else:
+        return script
+
 
 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
 # ensure basic elements are in the system path
@@ -2358,13 +2122,94 @@ def sanitise_path():
     for dir in DEFAULT_PATH:
         add_to_path(dir)
 
-# Initialize or shutdown lustre according to a configuration file
-#   * prepare the system for lustre
-#   * configure devices with lctl
-# Shutdown does steps in reverse
-#
+# global hack for the --select handling
+tgt_select = {}
+def init_select(arg):
+    # arg = "service=nodeA,service2=nodeB"
+    global tgt_select
+    list = string.split(arg, ',')
+    for entry in list:
+        srv, node = string.split(entry, '=')
+        tgt_select[srv] = node
+
+def get_select(srv):
+    if tgt_select.has_key(srv):
+        return tgt_select[srv]
+    return None
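
A short usage sketch for the --select bookkeeping above (service and node names are made up):

    init_select('mds1=nodeA,ost1=nodeB')   # value as passed to --select
    get_select('mds1')                     # -> 'nodeA'
    get_select('ost2')                     # -> None, no override for this service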
+
+
+PARAM = Lustre.Options.PARAM
+INTPARAM = Lustre.Options.INTPARAM
+lconf_options = [
+    ('verbose,v', "Print system commands as they are run"),
+    ('ldapurl',"LDAP server URL, e.g. ldap://localhost", PARAM),
+    ('config', "Cluster config name used for LDAP query", PARAM),
+    ('select', "service=nodeA,service2=nodeB ", PARAM),
+    ('node',   "Load config for <nodename>", PARAM),
+    ('cleanup,d', "Cleans up config. (Shutdown)"),
+    ('force,f', "Forced unmounting and/or obd detach during cleanup",
+               Lustre.Options.FLAG, 0),
+    ('mds_ost_conn', "Open connections to OSTs on the MDS"),
+    ('failover',"""Used to shut down without saving state.
+                   This will allow this node to "give up" a service to
+                   another node for failover purposes. This will not
+                   be a clean shutdown.""",
+               Lustre.Options.FLAG, 0),
+    ('gdb', """Prints message after creating gdb module script
+                    and sleeps for 5 seconds."""),
+    ('noexec,n', """Prints the commands and steps that will be run for a
+                    config without executing them. This can be used to check if a
+                    config file is doing what it should be doing."""),
+    ('nomod', "Skip load/unload module step."),
+    ('nosetup', "Skip device setup/cleanup step."),
+    ('reformat', "Reformat all devices (without question)"),
+    ('dump',  "Dump the kernel debug log to file before portals is unloaded",
+               PARAM),
+    ('minlevel', "Minimum level of services to configure/cleanup",
+                 INTPARAM, 0),
+    ('maxlevel', """Maximum level of services to configure/cleanup 
+                    Levels are approximately like:
+                            10 - network
+                            20 - device, ldlm
+                            30 - osd, mdd
+                            40 - mds, ost
+                            70 - mountpoint, echo_client, osc, mdc, lov""",
+               INTPARAM, 100),
+    ('lustre', """Base directory of lustre sources. This parameter will
+                  cause lconf to load modules from a source tree.""", PARAM),
+    ('portals', """Portals source directory.  If this is a relative path,
+                   then it is assumed to be relative to lustre. """, PARAM),
+    ('timeout', "Set recovery timeout", PARAM),
+    ('upcall',  "Set both portals and lustre upcall script", PARAM),
+    ('lustre_upcall', "Set lustre upcall script", PARAM),
+    ('portals_upcall', "Set portals upcall script", PARAM),
+    ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
+    ('ptldebug', "Set the portals debug level",  PARAM),
+    ('subsystem', "Set the portals debug subsystem",  PARAM),
+    ('gdb_script', "Full name of gdb debug script", PARAM, default_gdb_script()),
+    ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
+# Client recovery options
+    ('recover', "Recover a device"),
+    ('group', "The group of devices to configure or cleanup", PARAM),
+    ('tgt_uuid', "The failed target (required for recovery)", PARAM),
+    ('client_uuid', "The failed client (required for recovery)", PARAM),
+    ('conn_uuid', "The failed connection (required for recovery)", PARAM),
+    ]      
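
For example (config.xml and the node name are placeholders), a node is typically configured and later shut down with:

    lconf --node nodeA config.xml
    lconf --node nodeA --cleanup --force config.xml

and, matching the check in doHost() above, client recovery is driven with:

    lconf --recover --tgt_uuid <UUID> --client_uuid <UUID> --conn_uuid <UUID> config.xml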
+
 def main():
-    global  lctl, MAXTCPBUF
+    global lctl, config
+
+    # in the upcall this is set to SIG_IGN
+    signal.signal(signal.SIGCHLD, signal.SIG_DFL)
+    
+    cl = Lustre.Options("lconf", "config.xml", lconf_options)
+    try:
+        config, args = cl.parse(sys.argv[1:])
+    except Lustre.OptionError, e:
+        print e
+        sys.exit(1)
+
+    setupModulePath(sys.argv[0])
 
     host = socket.gethostname()
 
@@ -2380,7 +2225,6 @@ def main():
 
     sanitise_path()
 
-    args = parse_cmdline(sys.argv[1:])
     if len(args) > 0:
         if not os.access(args[0], os.R_OK):
             print 'File not found or readable:', args[0]
@@ -2390,44 +2234,48 @@ def main():
         except Exception:
             panic("%s does not appear to be a config file." % (args[0]))
             sys.exit(1) # make sure to die here, even in debug mode.
-        db = LustreDB_XML(dom.documentElement, dom.documentElement)
-    elif config.ldapurl():
-        if not config.config_name():
+        db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
+    elif config.ldapurl:
+        if not config.config:
             panic("--ldapurl requires --config name")
-        dn = "config=%s,fs=lustre" % (config.config_name())
-        db = LustreDB_LDAP('', {}, base=dn, url = config.ldapurl())
+        dn = "config=%s,fs=lustre" % (config.config)
+        db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
     else:
-        usage()
+        cl.usage()
+        sys.exit(1)
+
+    ver = db.get_version()
+    if not ver:
+        panic("No version found in config data, please recreate.")
+    if ver != Lustre.CONFIG_VERSION:
+        panic("Config version", ver, "does not match lconf version",
+              Lustre.CONFIG_VERSION)
 
     node_list = []
-    if config.node():
-        node_list.append(config.node())
+    if config.node:
+        node_list.append(config.node)
     else:
         if len(host) > 0:
             node_list.append(host)
         node_list.append('localhost')
+
     debug("configuring for host: ", node_list)
 
     if len(host) > 0:
-        config._debug_path = config._debug_path + '-' + host
-        config._gdb_script = config._gdb_script + '-' + host
-
-    setupModulePath(sys.argv[0])
+        config.debug_path = config.debug_path + '-' + host
+        config.gdb_script = config.gdb_script + '-' + host
 
     lctl = LCTLInterface('lctl')
-    if config.lctl_dump():
-        lctl.use_save_file(config.lctl_dump())
-    else:
-        sys_make_devices()
-        sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
-        sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
+
+    if config.lctl_dump:
+        lctl.use_save_file(config.lctl_dump)
 
     doHost(db, node_list)
 
 if __name__ == "__main__":
     try:
         main()
-    except LconfError, e:
+    except Lustre.LconfError, e:
         print e
     except CommandError, e:
         e.dump()
@@ -2435,4 +2283,3 @@ if __name__ == "__main__":
 
     if first_cleanup_error:
         sys.exit(first_cleanup_error)
-        
index a143647..382e729 100644 (file)
@@ -64,15 +64,17 @@ command_t cmdlist[] = {
         /* Network configuration commands */
         {"==== network config ====", jt_noop, 0, "network config"},
         {"network", jt_ptl_network, 0, "commands that follow apply to net\n"
-         "usage: network <tcp/elan/myrinet>"},
+         "usage: network <tcp/elan/myrinet/scimac>"},
         {"connect", jt_ptl_connect, 0, "connect to a remote nid\n"
          "usage: connect [[<hostname> <port>] | <elan id>]"},
         {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid\n"
          "usage: disconnect <nid>"},
         {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local nid. "
          "The nid defaults to hostname for tcp networks and is automatically "
-         "setup for elan/myrinet networks.\n"
+         "setup for elan/myrinet/scimac networks.\n"
          "usage: mynid [nid]"},
+        {"shownid", jt_ptl_shownid, 0, "print the local NID\n"
+         "usage: shownid"},
         {"add_uuid", jt_obd_add_uuid, 0, "associate a UUID with a nid\n"
          "usage: add_uuid <uuid> <nid> <net_type>"},
         {"close_uuid", jt_obd_close_uuid, 0, "disconnect a UUID\n"
@@ -93,24 +95,21 @@ command_t cmdlist[] = {
         {"send_mem", jt_ptl_txmem, 0, "set socket send buffer size, "
          "if size is omited the current size is reported.\n"
          "usage: send_mem [size]"},
-        {"nagle", jt_ptl_nagle, 0, "enable/disable nagle, omiting the "
+        {"nagle", jt_ptl_nagle, 0, "enable/disable nagle, omitting the "
          "argument will cause the current nagle setting to be reported.\n"
          "usage: nagle [on/off]"},
-
+        {"fail", jt_ptl_fail_nid, 0, "fail/restore communications.\n"
+         "Omitting the count means indefinitely, 0 means restore, "
+         "otherwise fail 'count' messages.\n"
+         "usage: fail nid|_all_ [count]"},
+                
         /* Device selection commands */
         {"=== device selection ===", jt_noop, 0, "device selection"},
         {"newdev", jt_obd_newdev, 0, "create a new device\n"
          "usage: newdev"},
-#if 0
-        {"uuid2dev", jt_obd_uuid2dev, 0,
-         "find device attached with <uuid> and make it the current device\n"
-         "usage: uuid2dev <uuid>"},
-#endif
-        {"name2dev", jt_obd_name2dev, 0,
-         "find device attached with <name> and make it the current device\n"
-         "usage: name2dev <name>"},
-        {"device", jt_obd_device, 0, "set current device to <devno>\n"
-         "usage: device <devno>"},
+        {"device", jt_obd_device, 0,
+         "set current device to <%uuid|$name|devno>\n"
+         "usage: device <%uuid|$name|devno>"},
         {"device_list", jt_obd_list, 0, "show all devices\n"
          "usage: device_list"},
         {"lustre_build_version", jt_get_version, 0,
@@ -126,7 +125,7 @@ command_t cmdlist[] = {
          "type specific device configuration information\n"
          "usage: setup <args...>"},
         {"cleanup", jt_obd_cleanup, 0, "cleanup previously setup device\n"
-         "usage: cleanup [force]"},
+         "usage: cleanup [force | failover]"},
         {"detach", jt_obd_detach, 0,
          "remove driver (and name and uuid) from current device\n"
          "usage: detach"},
@@ -156,7 +155,7 @@ command_t cmdlist[] = {
          "usage: setattr <objid> <mode>"},
          {"create", jt_obd_create, 0,
          "create <num> OST objects (with <mode>)\n"
-         "usage: create [num [mode [verbose]]]"},
+         "usage: create [num [mode [verbose [lsm data]]]]"},
         {"destroy", jt_obd_destroy, 0,
          "destroy OST object <objid> [num [verbose]]\n"
          "usage: destroy <num> objects, starting at objid <objid>"},
@@ -185,21 +184,24 @@ command_t cmdlist[] = {
          "stop lock manager stress test (no args)\n"},
         {"dump_ldlm", jt_obd_dump_ldlm, 0,
          "dump all lock manager state (no args)"},
-        {"lov_set_osc_active", jt_obd_lov_set_osc_active, 0,
-         "(de)activate an OSC in a LOV\n"
-         "usage: lov_set_osc_active <OSC UUID> <1|0 (active|inactive)>"},
-        {"newconn", jt_obd_newconn, 0, "newconn <olduuid> [newuuid]"},
-        {"failconn", jt_obd_failconn, 0, "failconn <uuid>"},
+        {"activate", jt_obd_activate, 0, "activate an import\n"},
+        {"deactivate", jt_obd_deactivate, 0, "deactivate an import\n"},
+        {"recover", jt_obd_recover, 0, "usage: recover [<connection UUID>]"},
         {"lookup", jt_obd_mdc_lookup, 0, "usage: lookup <directory> <file>"},
         {"notransno", jt_obd_no_transno, 0,
-         "disable sending of committed-transno updates\n"
-         "usage: notransno"},
+         "disable sending of committed-transno updates\n"},
         {"readonly", jt_obd_set_readonly, 0,
-         "disable writes to the underlying device\n"
-         "usage: readonly"},
+         "disable writes to the underlying device\n"},
+        {"abort_recovery", jt_obd_abort_recovery, 0,
+         "abort recovery on MDS device\n"},
+        {"mount_option", jt_obd_mount_option, 0,
+         "dump mount options to file\n"},
 
         /* Debug commands */
         {"======== debug =========", jt_noop, 0, "debug"},
+        {"debug_daemon", jt_dbg_debug_daemon, 0,
+         "debug daemon control and dump to a file"
+         "usage: debug_daemon [start file <#MB>|stop|pause|continue]"},
         {"debug_kernel", jt_dbg_debug_kernel, 0,
          "get debug buffer and dump to a file"
          "usage: debug_kernel [file] [raw]"},
@@ -244,10 +246,11 @@ int main(int argc, char **argv)
         if (dbg_initialize(argc, argv) < 0)
                 exit(3);
 
+        Parser_init("lctl > ", cmdlist);
+
         if (argc > 1) {
                 rc = Parser_execarg(argc - 1, argv + 1, cmdlist);
         } else {
-                Parser_init("lctl > ", cmdlist);
                 rc = Parser_commands();
         }
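
With Parser_init() now run before the argc check, the same command table serves both modes of invocation, for example:

    lctl device_list        # one-shot: run a single command and exit
    lctl                    # interactive: drop into the 'lctl > ' prompt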
 
diff --git a/lustre/utils/llparser.pm b/lustre/utils/llparser.pm
deleted file mode 100644 (file)
index 5cee31f..0000000
+++ /dev/null
@@ -1,399 +0,0 @@
-#!/usr/bin/perl
-# Copyright (C) 2002 Cluster File Systems, Inc.
-# Author: Hariharan Thantry <thantry@users.sourceforge.net>
-
-#   This file is part of Lustre, http://www.lustre.org.
-#
-#   Lustre is free software; you can redistribute it and/or
-#   modify it under the terms of version 2 of the GNU General Public
-#   License as published by the Free Software Foundation.
-#
-#   Lustre is distributed in the hope that it will be useful,
-#   but WITHOUT ANY WARRANTY; without even the implied warranty of
-#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#   GNU General Public License for more details.
-#
-#   You should have received a copy of the GNU General Public License
-#   along with Lustre; if not, write to the Free Software
-#   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-#
-
-
-package llparser;
-require Exporter;
-@ISA = qw(Exporter);
-@EXPORT = qw(parse_file print_rpcrelations parse_foptions %ll_subsystems 
-       %subsysnum %trace_masks $e_subsys $e_mask $e_processor $e_time 
-       $e_file $e_line $e_function $e_pid $e_stack $e_fmtstr $e_backref 
-       $e_treeparent $e_numchildren $e_youngestchild $e_next $e_pidhead 
-       $e_rpcsndrcv $e_rpcpid $e_rpcxid $e_rpcnid $e_rpcopc $e_rpcnext 
-       $e_curlineref $SEND $RCV);
-
-($e_subsys, 
- $e_mask, 
- $e_processor, 
- $e_time, 
- $e_file, 
- $e_line, 
- $e_function, 
- $e_pid, 
- $e_stack, 
- $e_fmtstr, 
- $e_treeparent, 
- $e_numchildren,
- $e_youngestchild, 
- $e_pidhead,
- $e_next, 
- $e_backref) = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-
-($e_rpcpid,
- $e_rpcxid,
- $e_rpcnid,
- $e_rpcopc,
- $e_rpcnext, 
- $e_rpcsndrcv,
- $e_curlineref) = (0, 1, 2, 3, 4, 5, 6); 
-
-$SEND = 0;
-$RCV  = 1;
-
-$REGEX=qr/^\s*(\w+)\s*:\s*(\d+)\s*:\s*(\d+)\s*:\s*(\d+\.(?:\d+))\s*\(\s*([^:]+)\s*:\s*(\d+)\s*:\s*([^()]+)\s*\(\)\s*(?:(?:\d+)\s*\|\s*)?(\d+)\s*\+\s*(\d+)\s*(?:.*)\):(.*)$/;
-
-$RPCREGEX = qr/^\s*(?:Sending|Handling)\s*RPC\s*pid:xid:nid:opc\s*(\d+):(?:0x)?(\w+):(?:0x)?(\w+):(\d+)\s*$/;
-$FILEOPTIONREGEX = qr/(--server)|(-s)/;
-$SENDING = qr/Sending/;
-
-
-# Needs to match definition in portals/include/linux/kp30.h
-%ll_subsystems = ("00" => "UNDEFINED", "01" => "MDC", "02" => "MDS", 
-                 "03" => "OSC",  "04" => "OST",  "05" => "CLASS",
-                 "06" => "OBDFS","07" => "LLITE","08" => "RPC",
-                 "09" => "EXT2OBD","0a" => "PORTALS","0b" => "SOCKNAL",
-                 "0c" => "QSWNAL","0d" => "PINGER","0e" => "FILTER",
-                 "0f" => "TRACE","10" => "ECHO","11" => "LDLM",
-                 "12" => "LOV", "13" => "GMNAL","14" => "PTLROUTER" );
-
-%subsysnum;
-$subsysnum->{UNDEFINED} = 0;
-$subsysnum->{MDC} = 1;
-$subsysnum->{MDS} = 2;
-$subsysnum->{OSC} = 3;
-$subsysnum->{OST} = 4;
-$subsysnum->{CLASS} = 5;
-$subsysnum->{OBDFS} = 6;
-$subsysnum->{LLITE} = 7;
-$subsysnum->{RPC} = 8;
-$subsysnum->{EXT2OBD} = 9;
-$subsysnum->{PORTALS} = 10;
-$subsysnum->{SOCKNAL} = 11;
-$subsysnum->{QSWNAL} = 12;
-$subsysnum->{PINGER} = 13;
-$subsysnum->{FILTER} = 14;
-$subsysnum->{TRACE} = 15; # obdtrace, not to be confused with D_TRACE */
-$subsysnum->{ECHO} = 16;
-$subsysnum->{LDLM} = 17;
-$subsysnum->{LOV} = 18;
-$subsysnum->{GMNAL} = 19;
-$subsysnum->{PTLROUTER} = 20;
-
-%tracemasks;
-$tracemasks->{TRACE} = 1 << 0; # /* ENTRY/EXIT markers */
-$tracemasks->{INODE} = 1 << 1; #
-$tracemasks->{SUPER} = 1 << 2; #
-$tracemasks->{EXT2} = 1 << 3; # /* anything from ext2_debug */
-$tracemasks->{MALLOC} = 1 << 4; # /* print malloc, free information */
-$tracemasks->{CACHE} = 1 << 5; # /* cache-related items */
-$tracemasks->{INFO} = 1 << 6; # /* general information */
-$tracemasks->{IOCTL} = 1 << 7; # /* ioctl related information */
-$tracemasks->{BLOCKS} = 1 << 8; # /* ext2 block allocation */
-$tracemasks->{NET} = 1 << 9; # /* network communications */
-$tracemasks->{WARNING} = 1 << 10; #
-$tracemasks->{BUFFS} = 1 << 11; #
-$tracemasks->{OTHER} = 1 << 12; #
-$tracemasks->{DENTRY} = 1 << 13; #
-$tracemasks->{PORTALS} = 1 << 14; # /* ENTRY/EXIT markers */
-$tracemasks->{PAGE} = 1 << 15; # /* bulk page handling */
-$tracemasks->{DLMTRACE} = 1 << 16; #
-$tracemasks->{ERROR} = 1 << 17; # /* CERROR} = ...) == CDEBUG} = D_ERROR, ...) */
-$tracemasks->{EMERG} = 1 << 18; # /* CEMERG} = ...) == CDEBUG} = D_EMERG, ...) */
-$tracemasks->{HA} = 1 << 19; # /* recovery and failover */
-$tracemasks->{RPCTRACE} = 1 << 19; # /* recovery and failover */
-
-# Contains all the file names, the first filename is the 
-# client. After that are all servers.
-my @filearray = ();
-
-
-# Create backlinks between array entries based on the calling sequence
-# For each new PID encountered, the first entry will be present in the 
-# PID hash.
-
-sub create_links {
-    my $arrayref = shift @_;
-    my $pidhashref = shift @_;
-    my $stitchref = shift @_;
-    my %local_hash;
-    my $hash_lineref;
-    my $tmpfmtref;
-    my $tmpref;
-    my $firstlineaftermarker = 0;
-
-    foreach $lineref (@$arrayref) {
-       next if ($lineref->[$e_time] == 0); # Skip the client marker line
-       my $pidprevious = $pidhashref->{$lineref->[$e_pid]};
-       if ($pidprevious->[$e_next] == 0) {
-           $pidprevious->[$e_next] = $lineref;
-           if (exists $local_hash{$lineref->[$e_pid]} 
-               && $firstlineaftermarker) {
-               $hash_lineref=$local_hash{$lineref->[$e_pid]};
-               $hash_lineref->[$e_next] =$lineref;
-               $firstlineaftermarker = 0;
-           } 
-       } elsif ($local_hash{$lineref->[$e_pid]} == 0) {
-               # True only for the first line, the marker line.
-               $local_hash{$lineref->[$e_pid]}=$lineref;
-               #print "LINE ADDED TO HASH: @$lineref\n";
-               $firstlineaftermarker = 1; 
-       }
-       # Stack grows upward (assumes x86 kernel)
-       if ($lineref->[$e_stack] < $pidprevious->[$e_stack]) {
-           # lineref is not a child of pidprevious, find its parent
-         LINE: while(($lineref->[$e_stack] < $pidprevious->[$e_stack]) &&
-                     ($lineref->[$e_function] == $pidprevious->[$e_function])
-                     ) {
-                         #This second part of the comparision is a HACK  
-                         last LINE if ($pidprevious->[$e_backref] == 0); 
-                         $pidprevious = $pidprevious->[$e_backref];
-         }
-       }
-       if ($lineref->[$e_stack] > $pidprevious->[$e_stack]) {
-           # lineref is child of pidprevious, with the caveat that they must
-            # belong to different functions. This is a HACK 
-           # until CDEBUG is modified
-           while($lineref->[$e_function] eq $pidprevious->[$e_function]) {
-             last if ($pidprevious->[$e_backref] == 0);
-              $pidprevious = $pidprevious->[$e_backref];
-           }   
-
-           $lineref->[$e_backref] = $pidprevious;
-           $pidprevious->[$e_numchildren]++;
-       } else {
-           # lineref is sibling of pidprevious
-           $lineref->[$e_numchildren] = 0;
-           $lineref->[$e_backref] = $pidprevious->[$e_backref];
-           ($lineref->[$e_backref])->[$e_numchildren]++;
-       }
-
-       $pidhashref->{$lineref->[$e_pid]} = $lineref;
-       $lineref->[$e_youngestchild] = $lineref;
-       while ($pidprevious->[$e_backref] != 0) {
-           $pidprevious->[$e_youngestchild] = $lineref;
-           $pidprevious = $pidprevious->[$e_backref];
-       }
-       $pidprevious->[$e_youngestchild] = $lineref;
-       $lineref->[$e_pidhead]=$pidprevious;
-       
-        # Stitch together rpc's
-       if($lineref->[$e_fmtstr] =~ $RPCREGEX) {
-           #print "RPC LINE: @$lineref\n";
-           $tmpfmtref = [$1, $2, $3, $4, 0, 0, 0];
-           if ($lineref->[$e_fmtstr] =~ $SENDING) {
-               $tmpfmtref->[$e_rpcsndrcv] = $SEND;
-           } else { $tmpfmtref->[$e_rpcsndrcv] = $RCV; }
-           $tmpfmtref->[$e_curlineref] = $lineref;
-           $stitchref->{$lineref->[$e_time]} = $tmpfmtref;
-           
-       }
-           
-    }
-match_rpcs($stitchref);
-return $arrayref;      
-}
-
-
-
-
-# Main loop, parses the debug log
-
-sub parse_file {
-    my %hasharray;
-    my $input_files = shift;
-    
-    my $stitch_ref = shift;
-    my $pid = shift;
-    my $rpctrace = shift;
-    my $trace = shift;
-    my $nodlm = shift;
-    my $noclass = shift;
-    my $nonet = shift;
-
-    print "$pid, $rpctrace, $nodlm, $noclass, $nonet\n";
-    $backref = 0;
-    $treeparent = 0;
-    $numchildren = 0;
-    $youngestchild = 0;
-    $next = 0;
-    $pidhead = 0;
-    $iter = 0;
-                       
-    foreach $file (@$input_files) {
-       
-       open(FILEHANDLE, $file) or die "Can't open file: $file\n";
-       while(<FILEHANDLE>) {
-           if (/$REGEX/) {
-               @parsed_line=($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, 
-                             $treeparent, $numchildren, $youngestchild, 
-                             $pidhead, $next, $backref);
-               next if (($parsed_line[$e_pid] != $pid) && 
-                        ($pid) && ($iter == 0));
-               next if (($parsed_line[$e_mask] != $tracemasks->{RPCTRACE}) 
-                        && ($rpctrace));
-               next if ($trace && $parsed_line[$e_mask] != 
-                        $tracemasks->{TRACE});
-               next if ($nodlm && hex($parsed_line[$e_subsys]) == 
-                        $subsysnum->{LDLM});
-               next if ($noclass && hex($parsed_line[$e_subsys]) == 
-                        $subsysnum->{CLASS});
-               next if ($nonet && (hex($parsed_line[$e_subsys]) == 
-                                   $subsysnum->{RPC} ||
-                                   hex($parsed_line[$e_subsys]) == 
-                                   $subsysnum->{NET} ||        
-                                   hex($parsed_line[$e_subsys]) == 
-                                   $subsysnum->{PORTALS} ||
-                                   hex($parsed_line[$e_subsys]) == 
-                                   $subsysnum->{SOCKNAL} ||
-                                   hex($parsed_line[$e_subsys]) == 
-                                   $subsysnum->{QSWNAL} ||
-                                   hex($parsed_line[$e_subsys]) == 
-                                   $subsysnum->{GMNAL}));      
-               
-               
-               if (!exists($hasharray{$parsed_line[$e_pid]})) {
-                   # Push a marker for the beginning of this PID
-                   my @marker_line;
-                   $marker_line[$e_subsys] = 0;
-                   $marker_line[$e_mask] = 0;
-                   $marker_line[$e_processor] = 0;
-                   $marker_line[$e_time] = $parsed_line[$e_time];
-                   $marker_line[$e_file] = 0;
-                   $marker_line[$e_line] = 0;
-                   $marker_line[$e_function] = 0;
-                   $marker_line[$e_pid] = $parsed_line[$e_pid];
-                   # marker lines are everyone's parent, so stack value zero
-                   $marker_line[$e_stack] = 0; 
-                   $marker_line[$e_fmtstr] = "";
-                   $marker_line[$e_treeparent] = 0;
-                   $marker_line[$e_numchildren] = 0;
-                   $marker_line[$e_youngestchild] = 0;
-                   $marker_line[$e_pidhead] = 0;
-                   $marker_line[$e_next]= \@parsed_line;
-                   $marker_line[$e_backref] = 0;
-                   $hasharray{$parsed_line[$e_pid]} = \@marker_line;
-                   push @$array_parsed, [ @marker_line ];
-                   
-               }
-               push @$array_parsed, [ @parsed_line ];
-           }
-           
-       }
-       close(FILEHANDLE);
-       if ($iter == 0) {
-           # Insert end of client line marker, an all zero pattern;
-           @marker_line = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-           push @$array_parsed, [ @marker_line ]; 
-           
-       }
-       $iter ++;
-    }
-    
-    $array_parsed=create_links($array_parsed, \%hasharray, $stitch_ref);
-    #print_array($array_parsed);
-    return $array_parsed;
-}
-
-sub print_array {
-
-    my $arrayref = shift;
-    foreach $lineref(@$arrayref){
-       if ($lineref->[$e_backref]==0){
-               print "MARKER LINE(addr): $lineref contents: [@$lineref]\n";
-       } else {
-
-               print "REGULAR LINE (addr) :$lineref contents:[@$lineref]\n";
-       }
-    }
-    
-}
-
-sub print_rpcrelations {
-
-    my $rpchashref = shift;
-    foreach $rpckeys (sort keys %$rpchashref) {
-       $tmpref = $rpchashref->{$rpckeys};
-       #print "Key: $rpckeys, Contents: @$tmpref\n";
-
-    }
-
-}
-sub match_rpcs {
-    my $rpchashref = shift;
-    foreach $rpckeys (sort keys %$rpchashref) {
-       $tmpref = $rpchashref->{$rpckeys};
-       #print "MATCHING: $@tmpref...\n";
-       foreach $cmpkeys (sort keys %$rpchashref) {
-           next if($cmpkeys == $rpckeys);
-           $cmpref = $rpchashref->{$cmpkeys};
-        #   print "Line compared: @$cmpref\n";
-           next if ($tmpref->[$e_rpcsndrcv] == $cmpref->[$e_rpcsndrcv]);
-           next if ($tmpref->[$e_rpcpid] != $cmpref->[$e_rpcpid]);
-           next if ($tmpref->[$e_rpcxid] != $cmpref->[$e_rpcxid]);
-           if ($tmpref->[$e_rpcsndrcv] == $SEND) {
-               $tmpref->[$e_rpcnext] = $cmpkeys;
-               #print "MACTHED: KEY 1: $rpckeys CONTENTS: @$tmpref", 
-               #"KEY2: $cmpkeys CONTENTS: @$cmpref\n"
-               
-           }
-                   
-       }
-
-    }
-
-}
-
-sub getnextchild {
-    my $rootline = shift;
-    my $lineref = shift;
-    my $tempref = $lineref->[$e_next];
-    if ($tempref == 0)  {
-       return 0;
-    }
-
-    if (($tempref->[$e_stack] > $rootline->[$e_stack]) ||
-       (($tempref->[$e_stack] <= $rootline->[$e_stack]) &&
-        ($tempref->[$e_function] == $rootline->[$e_function])
-        )){
-       # Child
-       return $tempref;
-       
-    }
-       return 0;
-       
-       
-}
-
-
-sub parse_foptions {
-    
-    my $inarg = shift;
-    my $idx = 0;
-    foreach $elem(@$inarg) {
-       next if ($elem =~ /$FILEOPTIONREGEX/);
-       $filearray[$idx] = $elem;
-       $idx++;    
-    }
-    return \@filearray;
-}
-
-1;
-#$array_parsed=parse_file();
-#print_array($array_parsed);
diff --git a/lustre/utils/llstat.pl b/lustre/utils/llstat.pl
new file mode 100755 (executable)
index 0000000..28eb778
--- /dev/null
@@ -0,0 +1,122 @@
+#!/usr/bin/perl
+
+my $pname = $0;
+
+sub usage()
+{
+    print STDERR "Usage: $pname <stats_file> [<interval>]\n";
+    exit 1;
+}
+
+
+my $statspath;
+my $interval = 0;
+
+if (($#ARGV < 0) || ($#ARGV > 1)) {
+    usage();
+} else {
+    $statspath = $ARGV[0];
+    if ($#ARGV == 1) {
+       $interval = $ARGV[1];
+    } 
+}
+
+
+
+my %namehash;
+my $anysum = 0;
+my $anysumsquare = 0;
+my $mhz = 0;
+
+sub get_cpumhz()
+{
+    my $cpu_freq;
+    my $itc_freq; # On Itanium systems use this
+    if (open(CPUINFO, "/proc/cpuinfo")==0) {
+       return;
+    }
+    while (<CPUINFO>) {
+       if (/^cpu MHz\s+:\s*([\d\.]+)/) { $cpu_freq=$1; }
+       elsif (/^itc MHz\s+:\s*([\d\.]+)/) { $itc_freq=$1; }
+    }
+    if (defined($itc_freq)) { $mhz = $itc_freq; }
+    elsif (defined($cpu_freq)) { $mhz = $cpu_freq; }
+    else { $mhz = 1; }
+}
+
+get_cpumhz();
+print "Processor counters run at $mhz MHz\n";
+
+sub readstat()
+{
+    open(STATS, $statspath) || die "Cannot open $statspath: $!\n";
+    while (<STATS>) {
+       chop;
+       ($name, $cumulcount, $samples, $unit, $min, $max, $sum, $sumsquare) 
+           = split(/\s+/, $_);
+
+       $prevcount = $namehash{$name};
+       if (defined($prevcount)) {
+           $diff = $cumulcount - $prevcount;
+           if ($name eq "snapshot_time") {
+               $tdiff = $diff;
+               # printf "%-25s prev=$prevcount, cumul=$cumulcount diff=$diff, tdiff=$tdiff\n", $name;
+               printf "$statspath @ $cumulcount\n";
+               printf "%-25s %-10s %-10s %-10s", "Name", "Cur.Count", "Cur.Rate", "#Events";
+               if ($anysum) {
+                   printf "%-8s %10s %12s %10s", "Unit", "min", "avg", "max";
+               }
+               if ($anysumsquare) {
+                   printf "%10s", "stddev";
+               }
+                printf "\n";
+           }
+           elsif ($cumulcount!=0) {
+               printf "%-25s %-10Lu %-10Lu %-10Lu",
+                      $name, $diff, ($diff/$tdiff), $cumulcount;
+               
+               if (defined($sum)) {
+                   my $sum_orig = $sum;
+                   if (($unit eq "[cycles]") && ($mhz != 1)) {
+                       $unit = "[usecs]";
+                       $min = $min/$mhz;
+                       $sum = $sum/$mhz;
+                       $max = $max/$mhz;
+                   }
+                   printf "%-8s %10Lu %12.2f %10Lu", $unit, $min, ($sum/$cumulcount), $max;
+                   if (defined($sumsquare)) {
+                       my $s = $sumsquare - (($sum_orig*$sum_orig)/$cumulcount);
+                       if ($s >= 0) {
+                           my $cnt = ($cumulcount >= 2) ? $cumulcount : 2 ;
+                           my $stddev = sqrt($s/($cnt - 1));
+                           if (($unit eq "[usecs]") && ($mhz != 1)) {
+                               $stddev = $stddev/$mhz;
+                           }
+                           printf " %10.2f", $stddev;
+                       }
+                   }
+               }
+               printf "\n";
+           }
+       }
+       else {
+           if ($cumulcount!=0) {
+               printf "%-25s $cumulcount\n", $name     
+           }
+           if (defined($sum)) {
+               $anysum = 1;
+           }
+           if (defined($sumsquare)) {
+               $anysumsquare = 1;
+           }
+       }
+       $namehash{$name} = $cumulcount;
+    }
+}
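
The stddev column printed above is derived from the cumulative sum and sum-of-squares columns of the stats file, sqrt((sumsquare - sum*sum/count) / (count - 1)), i.e. the usual sample standard deviation, with cycle-based units first scaled to microseconds by the detected CPU frequency.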
+
+do {
+    readstat();
+    if ($interval) { 
+       sleep($interval);
+    }
+} while ($interval);
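
Typical use (the stats file argument is whatever Lustre /proc stats file is of interest; the interval is optional):

    ./llstat.pl <stats_file> 5     # re-read the file and print per-interval rates every 5 seconds

Without an interval the counters are printed once and the script exits.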
index 76757a7..8ab7278 100755 (executable)
@@ -29,9 +29,22 @@ import sys, os, getopt, string, exceptions
 import xml.dom.minidom
 from xml.dom.ext import PrettyPrint
 
+PYMOD_DIR = "/usr/lib/lustre/python"
+
+def development_mode():
+    base = os.path.dirname(sys.argv[0])
+    if os.access(base+"/Makefile.am", os.R_OK):
+        return 1
+    return 0
+
+if not development_mode():
+    sys.path.append(PYMOD_DIR)
+
+import Lustre
+
 DEFAULT_PORT = 988 
 
-def usage():
+def reference():
     print """usage: lmc --add object [object parameters]
 
 Object creation command summary:
@@ -39,12 +52,15 @@ Object creation command summary:
 --add node
   --node node_name
   --timeout num
-  --recovery_upcall path
+  --upcall path
+  --lustre_upcall path
+  --portals_upcall path
 
 --add net
   --node node_name
   --nid nid
-  --nettype tcp|elan|toe|gm
+  --cluster_id 
+  --nettype tcp|elan|toe|gm|scimac
   --hostaddr addr
   --port port
   --tcpbuf size
@@ -81,7 +97,74 @@ Object creation command summary:
   --mds mds_name
   --ost ost_name OR --lov lov_name
 """
-    sys.exit(1)
+
+PARAM = Lustre.Options.PARAM
+lmc_options = [
+    # lmc input/output options
+    ('reference', "Print short reference for commands"), 
+    ('verbose,v', "Print system commands as they are run"),
+    ('merge,m', "", PARAM),
+    ('output,o', "", PARAM),
+    ('input,i', "", PARAM),
+    ('batch', "", PARAM),
+
+    # commands
+    ('add', "", PARAM),
+    
+    # node options
+    ('node', "", PARAM),
+    ('timeout', "", PARAM),
+    ('upcall', "Set both lustre and portals upcall scripts.", PARAM),
+    ('lustre_upcall', "Set location of lustre upcall script.", PARAM),
+    ('portals_upcall', "Set location of portals upcall script.", PARAM),
+
+    # network 
+    ('nettype', "", PARAM),
+    ('nid', "", PARAM),
+    ('tcpbuf', "", PARAM, 0),
+    ('port', "", PARAM, DEFAULT_PORT),
+    ('nid_exchange', "", PARAM, 0),
+    ('irq_affinity', "", PARAM, 0),
+    ('hostaddr', "", PARAM, ""),
+    ('cluster_id', "", PARAM, "0"),
+
+    # routes
+    ('route', "", PARAM),
+    ('router', ""),
+    ('gw', "", PARAM),
+    ('gw_cluster_id', "", PARAM, "0"),
+    ('target_cluster_id', "", PARAM, "0"),
+    ('lo', "", PARAM),
+    ('hi', "", PARAM, ""),
+
+    # servers: mds and ost
+    ('mds', "", PARAM),
+    ('ost', "", PARAM, ""),
+    ('osdtype', "", PARAM, "obdfilter"),
+    ('failover', ""),
+    ('group', "", PARAM),
+    ('dev', "", PARAM, ""),
+    ('size', "", PARAM, 0),
+    ('journal_size', "", PARAM, 0),
+    ('fstype', "", PARAM, "ext3"),
+    ('ostuuid', "", PARAM, ""),
+    ('format', ""),
+
+    # clients: mountpoint and echo
+    ('echo_client', "", PARAM),
+    ('path', "", PARAM),
+    ('filesystem', "Lustre filesystem name", PARAM, ''),
+
+    # lov
+    ('lov', "", PARAM, ''),
+    ('stripe_sz', "", PARAM),
+    ('stripe_cnt', "", PARAM, 0),
+    ('stripe_pattern', "", PARAM, 0),
+
+    # cobd
+    ('real_obd', "", PARAM),
+    ('cache_obd', "", PARAM),
+    ]
 
 def error(*args):
     msg = string.join(map(str,args))
@@ -118,17 +201,12 @@ def new_uuid(name):
 ldlm_name = 'ldlm'
 ldlm_uuid = 'ldlm_UUID'
 
-ptlrpc_name = 'RPCDEV'
-ptlrpc_uuid = 'RPCDEV_UUID'
-
 def new_lustre(dom):
     """Create a new empty lustre document"""
     # adding ldlm here is a bit of a hack, but one is enough.
-    str = """<lustre>
+    str = """<lustre version="%s">
     <ldlm name="%s" uuid="%s"/>
-    <ptlrpc name="%s" uuid="%s"/>
-    </lustre>""" % (ldlm_name, ldlm_uuid,
-                    ptlrpc_name, ptlrpc_uuid)
+    </lustre>""" % (Lustre.CONFIG_VERSION, ldlm_name, ldlm_uuid)
     return dom.parseString(str)
 
 names = {}
@@ -146,9 +224,8 @@ def init_names(doc):
             init_names(n)
 
 def get_format_flag(options):
-    if options.has_key('format'):
-        if options['format']:
-            return 'yes'
+    if options.format:
+        return 'yes'
     return 'no'
 
 ############################################################
@@ -187,11 +264,13 @@ class GenConfig:
         node.appendChild(new)
         return new
 
-    def network(self, name, uuid, nid, net, hostaddr="", port=0, tcpbuf=0, irq_aff=0, nid_xchg=0):
+    def network(self, name, uuid, nid, cluster_id, net, hostaddr="",
+                port=0, tcpbuf=0, irq_aff=0, nid_xchg=0):
         """create <network> node"""
         network = self.newService("network", name, uuid)
         network.setAttribute("nettype", net);
         self.addElement(network, "nid", nid)
+        self.addElement(network, "clusterid", cluster_id)
         if hostaddr:
             self.addElement(network, "hostaddr", hostaddr)
         if port:
@@ -211,11 +290,13 @@ class GenConfig:
         rtbl = self.newService("routetbl", name, uuid)
         return rtbl
         
-    def route(self, net_type, gw, lo, hi):
+    def route(self, gw_net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi):
         """ create one entry for the route table """
         ref = self.doc.createElement('route')
-        ref.setAttribute("type", net_type)
+        ref.setAttribute("type", gw_net_type)
         ref.setAttribute("gw", gw)
+        ref.setAttribute("gwclusterid", gw_cluster_id)
+        ref.setAttribute("tgtclusterid", tgt_cluster_id)
         ref.setAttribute("lo", lo)
         if hi:
             ref.setAttribute("hi", hi)
@@ -237,7 +318,8 @@ class GenConfig:
         ldlm = self.newService("ldlm", name, uuid)
         return ldlm
 
-    def osd(self, name, uuid, fs, osdtype, devname, format, ost_uuid, node_uuid, dev_size=0):
+    def osd(self, name, uuid, fs, osdtype, devname, format, ost_uuid,
+            node_uuid, dev_size=0, journal_size=0):
         osd = self.newService("osd", name, uuid)
         osd.setAttribute('osdtype', osdtype)
         osd.appendChild(self.ref("target", ost_uuid))
@@ -249,6 +331,8 @@ class GenConfig:
             self.addElement(osd, "autoformat", format)
             if dev_size:
                 self.addElement(osd, "devsize", "%s" % (dev_size))
+            if journal_size:
+                self.addElement(osd, "journalsize", "%s" % (journal_size))
         return osd
 
     def cobd(self, name, uuid, real_uuid, cache_uuid):
@@ -257,9 +341,11 @@ class GenConfig:
         cobd.appendChild(self.ref("cacheobd",cache_uuid))
         return cobd
 
-    def ost(self, name, uuid, osd_uuid):
+    def ost(self, name, uuid, osd_uuid, group=""):
         ost = self.newService("ost", name, uuid)
         ost.appendChild(self.ref("active", osd_uuid))
+        if group:
+            self.addElement(ost, "group", group)
         return ost
 
     def oss(self, name, uuid):
@@ -279,30 +365,39 @@ class GenConfig:
         lovconfig.appendChild(self.ref("lov", lov_uuid))
         return lovconfig
 
-    def mds(self, name, uuid, mdd_uuid):
+    def mds(self, name, uuid, mdd_uuid, group=""):
         mds = self.newService("mds", name, uuid)
         mds.appendChild(self.ref("active",mdd_uuid))
+        if group:
+            self.addElement(mds, "group", group)
         return mds
 
     def mdsdev(self, name, uuid, fs, devname, format, node_uuid,
-            mds_uuid, dev_size=0 ):
+            mds_uuid, dev_size=0, journal_size=0):
         mdd = self.newService("mdsdev", name, uuid)
         self.addElement(mdd, "fstype", fs)
         dev = self.addElement(mdd, "devpath", devname)
         self.addElement(mdd, "autoformat", format)
         if dev_size:
                 self.addElement(mdd, "devsize", "%s" % (dev_size))
+        if journal_size:
+            self.addElement(mdd, "journalsize", "%s" % (journal_size))
         mdd.appendChild(self.ref("node", node_uuid))
         mdd.appendChild(self.ref("target", mds_uuid))
         return mdd
 
-    def mountpoint(self, name, uuid, mds_uuid, osc_uuid, path):
+    def mountpoint(self, name, uuid, fs_uuid, path):
         mtpt = self.newService("mountpoint", name, uuid)
-        mtpt.appendChild(self.ref("mds", mds_uuid))
-        mtpt.appendChild(self.ref("obd", osc_uuid))
+        mtpt.appendChild(self.ref("filesystem", fs_uuid))
         self.addElement(mtpt, "path", path)
         return mtpt
 
+    def filesystem(self, name, uuid, mds_uuid, obd_uuid):
+        fs = self.newService("filesystem", name, uuid)
+        fs.appendChild(self.ref("mds", mds_uuid))
+        fs.appendChild(self.ref("obd", obd_uuid))
+        return fs
+        
     def echo_client(self, name, uuid, osc_uuid):
         ec = self.newService("echoclient", name, uuid)
         ec.appendChild(self.ref("obd", osc_uuid))
@@ -352,6 +447,12 @@ def name2uuid(lustre, name, tag="",  fatal=1):
             return ""
     return getUUID(ret)
     
+def lookup_filesystem(lustre, mds_uuid, ost_uuid):
+    for n in lustre.childNodes:
+        if n.nodeType == n.ELEMENT_NODE and n.nodeName == 'filesystem':
+            if ref_exists(n, mds_uuid) and ref_exists(n, ost_uuid):
+                return getUUID(n)
+    return None
 
 # XXX: assumes only one network element per node. will fix this
 # as soon as support for routers is added
@@ -403,6 +504,27 @@ def get_attr(dom_node, attr, default=""):
 ############################################################
 # Top level commands
 #
+def set_node_options(gen, node, options):
+    if options.router:
+        node.setAttribute('router', '1')
+    if options.timeout:
+        gen.addElement(node, "timeout", get_option(options, 'timeout'))
+    if options.upcall:
+        default_upcall =  get_option(options, 'upcall')
+    else:
+        default_upcall = ''
+    if default_upcall or options.lustre_upcall:
+        if options.lustre_upcall:
+            gen.addElement(node, 'lustreUpcall', options.lustre_upcall)
+        else: 
+            gen.addElement(node, 'lustreUpcall', default_upcall)
+    if default_upcall or options.portals_upcall:
+        if options.portals_upcall:
+            gen.addElement(node, 'portalsUpcall', options.portals_upcall)
+        else:
+            gen.addElement(node, 'portalsUpcall', default_upcall)
+    return node
+
 def do_add_node(gen, lustre,  options, node_name):
     uuid = new_uuid(node_name)
     prof_name = new_name("PROFILE_" + node_name)
@@ -413,13 +535,7 @@ def do_add_node(gen, lustre,  options, node_name):
     lustre.appendChild(profile)
 
     node_add_profile(gen, node, 'ldlm', ldlm_uuid)
-    node_add_profile(gen, node, 'ptlrpc', ptlrpc_uuid)
-    if has_option(options, 'router'):
-        node.setAttribute('router', '1')
-    if has_option(options, 'timeout'):
-        node.setAttribute('timeout', get_option(options, 'timeout'))
-    if has_option(options, 'recovery_upcall'):
-        node.setAttribute('recovery_upcall', get_option(options, 'recovery_upcall'))
+    set_node_options(gen, node, options)
     return node
 
     
@@ -439,15 +555,16 @@ def add_net(gen, lustre, options):
 
     node_name = get_option(options, 'node')
     nid = get_option(options, 'nid')
-    hostaddr = get_option(options, 'hostaddr', '')
+    cluster_id = get_option(options, 'cluster_id')
+    hostaddr = get_option(options, 'hostaddr')
     net_type = get_option(options, 'nettype')
 
     if net_type in ('tcp', 'toe'):
-        port = get_option_int(options, 'port', DEFAULT_PORT)
-        tcpbuf = get_option_int(options, 'tcpbuf', 0)
-        irq_aff = get_option_int(options, 'irq_affinity', 0)
-        nid_xchg = get_option_int(options, 'nid_exchange', 0)
-    elif net_type in ('elan', 'gm'):
+        port = get_option_int(options, 'port')
+        tcpbuf = get_option_int(options, 'tcpbuf')
+        irq_aff = get_option_int(options, 'irq_affinity')
+        nid_xchg = get_option_int(options, 'nid_exchange')
+    elif net_type in ('elan', 'gm', 'scimac'):
         port = 0
         tcpbuf = 0
         irq_aff = 0
@@ -461,9 +578,12 @@ def add_net(gen, lustre, options):
         node = do_add_node(gen, lustre, options, node_name)
     else:
         node = ret
+        set_node_options(gen, node, options)
+
     net_name = new_name('NET_'+ node_name +'_'+ net_type)
     net_uuid = new_uuid(net_name)
-    node.appendChild(gen.network(net_name, net_uuid, nid, net_type, hostaddr, port, tcpbuf, irq_aff, nid_xchg))
+    node.appendChild(gen.network(net_name, net_uuid, nid, cluster_id, net_type,
+                                 hostaddr, port, tcpbuf, irq_aff, nid_xchg))
     node_add_profile(gen, node, "network", net_uuid)
 
 
@@ -471,10 +591,14 @@ def add_route(gen, lustre, options):
     """ create a node with a network config """
 
     node_name = get_option(options, 'node')
-    net_type = get_option(options, 'nettype')
+    gw_net_type = get_option(options, 'nettype')
     gw = get_option(options, 'gw')
+    gw_cluster_id = get_option(options, 'gw_cluster_id')
+    tgt_cluster_id = get_option(options, 'target_cluster_id')
     lo = get_option(options, 'lo')
-    hi = get_option(options, 'hi', '')
+    hi = get_option(options, 'hi')
+    if not hi:
+        hi = lo
 
     node = findByName(lustre, node_name, "node")
     if not node:
@@ -489,7 +613,8 @@ def add_route(gen, lustre, options):
         rtbl = gen.routetbl(rtbl_name, rtbl_uuid)
         node.appendChild(rtbl)
         node_add_profile(gen, node, "routetbl", rtbl_uuid)
-    rtbl.appendChild(gen.route(net_type, gw, lo, hi))
+    rtbl.appendChild(gen.route(gw_net_type, gw, gw_cluster_id, tgt_cluster_id,
+                               lo, hi))
 
 
 def add_mds(gen, lustre, options):
@@ -501,12 +626,17 @@ def add_mds(gen, lustre, options):
     mds_uuid = name2uuid(lustre, mds_name, fatal=0)
     if not mds_uuid:
         mds_uuid = new_uuid(mds_name)
-        mds = gen.mds(mds_name, mds_uuid, mdd_uuid)
+        mds = gen.mds(mds_name, mds_uuid, mdd_uuid, options.group)
         lustre.appendChild(mds)
-        
+    else:
+        mds = lookup(lustre, mds_uuid)
+    if options.failover:
+        mds.setAttribute('failover', "1")
+
     devname = get_option(options, 'dev')
-    size = get_option(options, 'size', 0)
-    fstype = get_option(options, 'fstype', 'extN')
+    size = get_option(options, 'size')
+    fstype = get_option(options, 'fstype')
+    journal_size = get_option(options, 'journal_size')
 
     node_uuid = name2uuid(lustre, node_name, 'node')
 
@@ -516,15 +646,16 @@ def add_mds(gen, lustre, options):
     if not net_uuid:
         error("NODE: ", node_name, "not found")
 
-    mdd = gen.mdsdev(mdd_name, mdd_uuid, fstype, devname, get_format_flag(options),
-                  node_uuid, mds_uuid, dev_size=size)
+    mdd = gen.mdsdev(mdd_name, mdd_uuid, fstype, devname,
+                     get_format_flag(options), node_uuid, mds_uuid,
+                     dev_size=size, journal_size=journal_size)
     lustre.appendChild(mdd)
                    
 
 def add_ost(gen, lustre, options):
     node_name = get_option(options, 'node')
-    lovname = get_option(options, 'lov', '')
-    osdtype = get_option(options, 'osdtype', 'obdfilter', deprecated_tag="obdtype")
+    lovname = get_option(options, 'lov')
+    osdtype = get_option(options, 'osdtype')
 
     node_uuid = name2uuid(lustre, node_name)
 
@@ -533,37 +664,46 @@ def add_ost(gen, lustre, options):
         devname = ''
         size = 0
         fstype = ''
+        journal_size = ''
     else:
-        devname = get_option(options, 'dev', '') # can be unset for bluearcs
-        size = get_option(options, 'size', 0)
-        fstype = get_option(options, 'fstype', 'extN')
+        devname = get_option(options, 'dev') # can be unset for bluearcs
+        size = get_option(options, 'size')
+        fstype = get_option(options, 'fstype')
+        journal_size = get_option(options, 'journal_size')
         
-    ostname = get_option(options, 'ost', '', deprecated_tag='obd')
+    ostname = get_option(options, 'ost')
     if not ostname:
         ostname = new_name('OST_'+ node_name)
 
-    osdname = new_name("OSD_" + ostname)
+    osdname = new_name("OSD_" + ostname + "_" + node_name)
     osd_uuid = new_uuid(osdname)
 
     ost_uuid = name2uuid(lustre, ostname, fatal=0)
     if not ost_uuid:
-        ost_uuid = get_option(options, 'ostuuid', '', deprecated_tag = 'obduuid')
+        ost_uuid = get_option(options, 'ostuuid')
         if ost_uuid:
             if lookup(lustre, ost_uuid):
                 error("Duplicate OST UUID:", ost_uuid)
         else:
             ost_uuid = new_uuid(ostname)
 
-        ost = gen.ost(ostname, ost_uuid, osd_uuid)
+        ost = gen.ost(ostname, ost_uuid, osd_uuid, options.group)
         lustre.appendChild(ost)
         if lovname:
             lov = findByName(lustre, lovname, "lov")
             if not lov:
                 error('add_ost:', '"'+lovname+'"', "lov element not found.")
             lov_add_obd(gen, lov, ost_uuid)
+    else:
+        ost = lookup(lustre, ost_uuid)
 
-    osd = gen.osd(osdname, osd_uuid, fstype, osdtype, devname, get_format_flag(options), ost_uuid,
-                  node_uuid, size)
+    if options.failover:
+        ost.setAttribute('failover', "1")
+    
+
+    osd = gen.osd(osdname, osd_uuid, fstype, osdtype, devname,
+                  get_format_flag(options), ost_uuid, node_uuid, size,
+                  journal_size)
 
     node = findByName(lustre, node_name, "node")
 
@@ -623,8 +763,8 @@ def add_lov(gen, lustre, options):
 
     mds_name = get_option(options, 'mds')
     stripe_sz = get_option_int(options, 'stripe_sz')
-    stripe_cnt = get_option_int(options, 'stripe_cnt', 0)
-    pattern = get_option_int(options, 'stripe_pattern', 0)
+    stripe_cnt = get_option_int(options, 'stripe_cnt')
+    pattern = get_option_int(options, 'stripe_pattern')
     uuid = new_uuid(name)
 
     ret = findByName(lustre, name, "lov")
@@ -643,50 +783,57 @@ def add_lov(gen, lustre, options):
     lovconfig = gen.lovconfig(lovconfig_name, lovconfig_uuid, uuid)
     lustre.appendChild(lovconfig)
 
+def new_filesystem(gen, lustre, mds_uuid, obd_uuid):
+    fs_name = new_name("FS_fsname")
+    fs_uuid = new_uuid(fs_name)
+    mds = lookup(lustre, mds_uuid)
+    mds.appendChild(gen.ref("filesystem", fs_uuid))
+    fs = gen.filesystem(fs_name, fs_uuid, mds_uuid, obd_uuid)
+    lustre.appendChild(fs)
+    return fs_uuid
 
+def get_fs_uuid(gen, lustre, mds_name, obd_name):
+    mds_uuid = name2uuid(lustre, mds_name, tag='mds')
+    obd_uuid = name2uuid(lustre, obd_name, tag='lov', fatal=0)
+    if not obd_uuid:
+        obd_uuid = name2uuid(lustre, obd_name, tag='ost', fatal=1)
+    fs_uuid = lookup_filesystem(lustre, mds_uuid, obd_uuid)
+    if not fs_uuid:
+        fs_uuid = new_filesystem(gen, lustre, mds_uuid, obd_uuid)
+    return fs_uuid
+    
 def add_mtpt(gen, lustre, options):
     """ create mtpt on a node """
     node_name = get_option(options, 'node')
 
     path = get_option(options, 'path')
-    mds_name = get_option(options, 'mds')
-    lov_name = get_option(options, 'lov', '')
-    if lov_name == '':
-        lov_name = get_option(options, 'ost', '', deprecated_tag='obd')
+    fs_name = get_option(options, 'filesystem')
+    if fs_name == '':
+        mds_name = get_option(options, 'mds')
+        lov_name = get_option(options, 'lov')
         if lov_name == '':
-            error("--add mtpt requires either --lov lov_name or --ost ost_name")
+            lov_name = get_option(options, 'ost')
+            if lov_name == '':
+                error("--add mtpt requires either --filesystem or --mds with an  --lov lov_name or --ost ost_name")
+        fs_uuid = get_fs_uuid(gen, lustre, mds_name, lov_name)
+    else:
+        fs_uuid = name2uuid(lustre, fs_name, tag='filesystem')
 
     name = new_name('MNT_'+ node_name)
 
     ret = findByName(lustre, name, "mountpoint")
     if ret:
+        # this can't happen, because new_name creates unique names
         error("MOUNTPOINT: ", name, " already exists.")
 
-    mds_uuid = name2uuid(lustre, mds_name, tag='mds')
-    lov_uuid = name2uuid(lustre, lov_name, tag='lov', fatal=0)
-    if not lov_uuid:
-        lov_uuid = name2uuid(lustre, lov_name, tag='ost', fatal=1)
-
     uuid = new_uuid(name)
-    mtpt = gen.mountpoint(name, uuid, mds_uuid, lov_uuid, path)
+    mtpt = gen.mountpoint(name, uuid, fs_uuid, path)
     node = findByName(lustre, node_name, "node")
     if not node:
         error('node:',  node_name, "not found.")
     node_add_profile(gen, node, "mountpoint", uuid)
     lustre.appendChild(mtpt)
 
-# obsolete, leaving behind for reference 
-def add_oscref(gen, lustre, options):
-    """ create mtpt on a node """
-    node_name = get_option(options, 'node')
-    osc_name = get_option(options, 'osc')
-
-    osc_uuid = name2uuid(lustre, osc_name, tag='osc')
-    node = findByName(lustre, node_name, "node")
-    if not node:
-        error('node:', node_name, "not found")
-    node_add_profile(gen, node, "osc",osc_uuid)
-
 ############################################################
 # Command line processing
 #
@@ -694,161 +841,23 @@ class OptionError (exceptions.Exception):
     def __init__(self, args):
         self.args = args
 
-def has_option(options, tag):
-    """Look for tag in options hash and return the true if set"""
-    if options.has_key(tag):
-        return 1
-    return 0
-
-def get_option(options, tag, default = None, deprecated_tag=None):
+def get_option(options, tag):
     """Look for tag in options hash and return the value if set. If not
     set, then if return default it is set, otherwise exception."""
-    if options.has_key(tag):
-        return options[tag]
-    elif deprecated_tag and options.has_key(deprecated_tag):
-            warning('--'+deprecated_tag, " is deprecated, please use:", '--'+tag)
-            return options[deprecated_tag]
-    elif default != None:
-        return default
+    if options.__getattr__(tag) != None:
+        return options.__getattr__(tag)
     else:
-        raise OptionError("--add %s requires --%s <value>" % (options['add'], tag))
-        # this exception should print an error like '--add blah requires --<tag> value'
+        raise OptionError("--add %s requires --%s <value>" % (options.add, tag))
 
-def get_option_int(options, tag, default = None):
+def get_option_int(options, tag):
     """Return an integer option.  Raise exception if the value is not an int"""
-    val = get_option(options, tag, default)
+    val = get_option(options, tag)
     try:
         n = int(val)
     except ValueError:
         raise OptionError("--%s <num> (value must be integer)" % (tag))        
     return n
 
-def parse_cmdline(argv):
-    short_opts = "ho:i:m:"
-    long_opts = ["add=", "node=", "nettype=", "nid=", "tcpbuf=", "port=",
-                 "echo_client=", "stripe_sz=", "stripe_cnt=", "stripe_pattern=",
-                 "mds=", "route", "router", "merge=", "format", "reformat", "output=",
-                 "dev=", "size=", "obd=", "ost=", "obdtype=", "osdtype=", "obduuid=", "in=",
-                 "ostuuid=", "path=", "help", "batch=", "lov=", "gw=", "lo=", "hi=",
-                 "osc=", "real_obd=", "cache_obd=", "fstype=",
-                 "timeout=", "recovery_upcall=", "nid_exchange=", "irq_affinity=",
-                 "hostaddr=",]
-    opts = []
-    args = []
-    options = {}
-    try:
-        opts, args = getopt.getopt(argv, short_opts, long_opts)
-    except getopt.error, e:
-        panic(string.join(sys.argv), e)
-
-    for o, a in opts:
-        # Commands to create new devices
-        if o == "--add":
-            options['add'] = a
-
-        if o == "--node":
-            options['node'] = a
-
-        # devices names
-        if o == "--lov":
-            options['lov'] = a
-        if o == "--mds":
-            options['mds'] = a
-        if o == "--obd":
-            options['obd'] = a
-        if o == "--ost":
-            options['ost'] = a
-
-        # node options
-        if o == "--timeout":
-            options['timeout'] = a
-        if o == "--recovery_upcall":
-            options['recovery_upcall'] = a
-        if o == "--router":
-            options['router'] = 1
-        
-        # network options
-        if o == "--nid":
-            options['nid'] = a
-        if o == "--hostaddr":
-            options['hostaddr'] = a
-        if o == "--nettype":
-            options['nettype'] = a
-        if o == "--net":
-            options[''] = a
-        if o == "--tcpbuf":
-            options['tcpbuf'] = a
-        if o == "--port":
-            options['port'] = a
-        if o == "--mtpt":
-            options['mtpt'] = 1
-        if o == "--route":
-            options['route'] = 1
-        if o == "--nid_exchange":
-            options['nid_exchange'] = a
-        if o == "--irq_affinity":
-            options['irq_affinity'] = a
-
-        # ost options
-        if o == "--dev":
-            options['dev'] = a
-        if o == "--size":
-            options['size'] = a
-        if o == "--path":
-            options['path'] = a
-        if o == "--osc":
-            options['osc'] = a
-        if o == "--obdtype":
-            options['obdtype'] = a
-        if o == "--osdtype":
-            options['osdtype'] = a
-        if o == "--fstype":
-            options['fstype'] = a
-        if o == "--obduuid":
-            options['obduuid'] = a
-        if o == "--ostuuid":
-            options['ostuuid'] = a
-
-        # lov options
-        if o == "--stripe_sz":
-            options['stripe_sz'] = a
-        if o == "--stripe_cnt":
-            options['stripe_cnt'] = a
-        if o == "--stripe_pattern":
-            options['stripe_pattern'] = a
-        if o == "--gw":
-            options['gw'] = a
-        if o == "--lo":
-            options['lo'] = a
-        if o == "--hi":
-            options['hi'] = a
-
-        # cobd
-        if o == "--cache_obd":
-            options['cache_obd'] = a
-        if o == "--real_obd":
-            options['real_obd'] = a
-
-        # lmc options
-        if o in ("-h", "--help"):
-            usage()
-        if o in ("-o", "--output"):
-            options['output'] = a
-        if o in ("-m", "--merge"):
-            options['merge'] = a
-        if o == "--format":
-            options['format'] = 1
-        if o  == "--reformat":
-            warning("the lmc --reformat option is not supported. Use lconf --reformat")
-            options['reformat'] = 1
-        if o  == "--batch":
-            options['batch'] = a
-        if o  in ("--in" , "-i"):
-            options['in'] = a
-            
-    return options, args
-
-
 # simple class for profiling
 import time
 class chrono:
@@ -868,8 +877,6 @@ class chrono:
         str = '%s: %g secs' % (msg, d)
         print str
 
-
-
 ############################################################
 # Main
 #
@@ -877,8 +884,6 @@ class chrono:
 def add(devtype, gen, lustre, options):
     if devtype == 'net':
         add_net(gen, lustre, options)
-    elif devtype =='osc':
-        add_osc(gen, lustre, options)
     elif devtype == 'mtpt':
         add_mtpt(gen, lustre, options)
     elif devtype == 'mds':
@@ -899,28 +904,40 @@ def add(devtype, gen, lustre, options):
         error("unknown device type:", devtype)
     
 def do_command(gen, lustre, options, args):
-    if options.has_key('add'):
-        add(options['add'], gen, lustre, options)
+    if options.add:
+        add(options.add, gen, lustre, options)
     else:
         error("Missing command")
 
 def main():
-    options, args = parse_cmdline(sys.argv[1:])
+    cl = Lustre.Options("lmc", "", lmc_options)
+    try:
+        options, args = cl.parse(sys.argv[1:])
+    except Lustre.OptionError, e:
+        panic("lmc", e)
+
+    if len(args) > 0:
+        panic(string.join(sys.argv), "Unexpected extra arguments on command line: " + string.join(args))
+
+    if options.reference:
+        reference()
+        sys.exit(0)
+
     outFile = '-'
 
-    if options.has_key('merge'):
-        outFile = options['merge']
+    if options.merge:
+        outFile = options.merge
         if os.access(outFile, os.R_OK):
             doc = xml.dom.minidom.parse(outFile)
         else:
             doc = new_lustre(xml.dom.minidom)
-    elif options.has_key('in'):
-        doc = xml.dom.minidom.parse(options['in'])
+    elif options.input:
+        doc = xml.dom.minidom.parse(options.input)
     else:
         doc = new_lustre(xml.dom.minidom)
 
-    if options.has_key('output'):
-        outFile = options['output']
+    if options.output:
+        outFile = options.output
 
     lustre = doc.documentElement
     init_names(lustre)
@@ -930,21 +947,25 @@ def main():
 
     gen = GenConfig(doc)
 
-    if options.has_key('batch'):
-        fp = open(options['batch'])
+    if options.batch:
+        fp = open(options.batch)
         batchCommands = fp.readlines()
         fp.close()
         for cmd in batchCommands:
-            options, args = parse_cmdline(string.split(cmd))
             try:
+                options, args = cl.parse(string.split(cmd))
                 do_command(gen, lustre, options, args)
             except OptionError, e:
                 panic(cmd, e)
+            except Lustre.OptionError, e:
+                panic(cmd, e)
     else:
         try:
             do_command(gen, lustre, options, args)
         except OptionError, e:
             panic(string.join(sys.argv),e)
+        except Lustre.OptionError, e:
+            panic("lmc", e)
 
     if outFile == '-':
         PrettyPrint(doc)
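
Note: lmc's hand-rolled getopt loop is replaced above by the declarative lmc_options table, where each entry is (name[,short_flag], help_text[, PARAM[, default]]) and parsed values come back as attributes (options.add, options.merge, ...), with get_option() raising OptionError when a required value is missing. The Lustre.Options module itself is not part of this hunk, so the following is only a rough, self-contained sketch of how such a table can drive getopt and attribute-style access; the class and helper names here are assumptions, not the real module:

import getopt

PARAM = 1   # marks an option that takes a value

class OptionTable:
    # Toy option-table parser, illustrative only -- not Lustre.Options.
    def __init__(self, table):
        self.table = table

    def parse(self, argv):
        long_opts = []
        values = {}
        for entry in self.table:
            name = entry[0].split(',')[0]              # "verbose,v" -> "verbose"
            takes_value = len(entry) > 2 and entry[2] == PARAM
            long_opts.append(name + '=' if takes_value else name)
            values[name] = entry[3] if len(entry) > 3 else None
        opts, args = getopt.getopt(argv, "", long_opts)
        for o, a in opts:
            name = o.lstrip('-')
            values[name] = a if (name + '=') in long_opts else 1
        return _ParsedValues(values), args

class _ParsedValues:
    # exposes parsed option values as attributes, e.g. options.add
    def __init__(self, values):
        self.__dict__.update(values)

# usage sketch with placeholder arguments
table = [
    ('add', "", PARAM),
    ('node', "", PARAM),
    ('nettype', "", PARAM),
    ('cluster_id', "", PARAM, "0"),
    ('router', ""),
]
options, args = OptionTable(table).parse(
    ["--add", "net", "--node", "uml1", "--nettype", "tcp"])
print("%s %s %s %s" % (options.add, options.node, options.nettype, options.cluster_id))
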
diff --git a/lustre/utils/load_ldap.sh b/lustre/utils/load_ldap.sh
new file mode 100755 (executable)
index 0000000..531d385
--- /dev/null
@@ -0,0 +1,41 @@
+#!/bin/bash
+#
+# Load a lustre config xml into an openldap database.
+# See https://projects.clusterfs.com/lustre/LustreLDAP
+# for more details.
+#
+# Usage: load_ldap.sh <xml_file>
+set -e
+
+LDAP_BASE=${LDAP_BASE:-fs=lustre}
+LDAP_ROOTDN=${LDAP_ROOTDN:-cn=Manager,fs=lustre}
+LDAP_PW=${LDAP_PW:-secret}
+LDAP_AUTH="-x -D $LDAP_ROOTDN -w $LDAP_PW"
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+
+[ ! -z $LDAPURL ] && LDAP_AUTH="$LDAP_AUTH -H $LDAPURL"
+
+XML=${XML:-$1}
+
+if [ -z "$XML" ] || [  ! -r $XML ]; then
+     echo "usage: $0 xmlfile"
+     exit 1
+fi
+
+NAME=`basename $XML .xml`
+LDIF=/tmp/$NAME.ldif
+
+# add the top level record, if needed
+ldapsearch $LDAP_AUTH -b $LDAP_BASE > /dev/null 2>&1 ||
+    ldapadd $LDAP_AUTH -f $LUSTRE/conf/top.ldif
+
+# If this config already exists, then delete it
+ldapsearch $LDAP_AUTH -b config=$NAME,$LDAP_BASE > /dev/null 2>&1 && 
+    ldapdelete $LDAP_AUTH -r config=$NAME,$LDAP_BASE
+
+4xslt -D config=$NAME $XML $LUSTRE/conf/lustre2ldif.xsl  > $LDIF
+
+echo "Loading config to 'config=$NAME,$LDAP_BASE' ..."
+ldapadd $LDAP_AUTH -f $LDIF
+
+rm -f $LDIF
index 39e2bdf..2cdf5d2 100644 (file)
@@ -46,8 +46,13 @@ int create_file(char *name, long stripe_size, int stripe_offset,
                        name, strerror(errno));
                result = -errno;
        } else if (ioctl(fd, LL_IOC_LOV_SETSTRIPE, &a_striping)) {
+               char *errmsg = "stripe already set";
+
+               if (errno != EEXIST && errno != EALREADY)
+                       errmsg = strerror(errno);
+
                fprintf(stderr, "\nError on ioctl for '%s' (%d): %s\n",
-                       name, fd, strerror(errno));
+                       name, fd, errmsg);
                result = -errno;
        } else if (close(fd) < 0) {
                fprintf(stderr, "\nError on close for '%s' (%d): %s\n",
index 95e5445..a89e15d 100644 (file)
 #include <linux/lustre_idl.h>
 #include <linux/lustre_dlm.h>
 #include <linux/obd.h>          /* for struct lov_stripe_md */
-#include <linux/obd_lov.h>      /* for IOC_LOV_SET_OSC_ACTIVE */
 #include <linux/lustre_build_version.h>
 
 #include <unistd.h>
 #include <sys/un.h>
 #include <time.h>
 #include <sys/time.h>
-#include <netinet/in.h>
 #include <errno.h>
 #include <string.h>
 
@@ -76,8 +74,7 @@ static long long counter_snapshot[2][MAX_SHMEM_COUNT];
 struct timeval prev_time;
 #endif
 
-uint64_t conn_addr = -1;
-uint64_t conn_cookie;
+uint64_t conn_cookie = -1;
 char rawbuf[8192];
 char *buf = rawbuf;
 int max = sizeof(rawbuf);
@@ -94,7 +91,6 @@ static char *cmdname(char *func);
 #define IOC_INIT(data)                                                  \
 do {                                                                    \
         memset(&data, 0, sizeof(data));                                 \
-        data.ioc_addr = conn_addr;                                      \
         data.ioc_cookie = conn_cookie;                                  \
 } while (0)
 
@@ -149,11 +145,27 @@ static int do_name2dev(char *func, char *name)
 
         IOC_PACK(func, data);
         rc = l_ioctl(OBD_DEV_ID, OBD_IOC_NAME2DEV, buf);
-        if (rc < 0) {
-                fprintf(stderr, "error: %s: %s - %s\n", cmdname(func),
-                        name, strerror(rc = errno));
-                return rc;
-        }
+        if (rc < 0)
+                return errno;
+        IOC_UNPACK(func, data);
+
+        return data.ioc_dev + N2D_OFF;
+}
+
+static int do_uuid2dev(char *func, char *uuid)
+{
+        struct obd_ioctl_data data;
+        int rc;
+
+        IOC_INIT(data);
+
+        data.ioc_inllen1 = strlen(uuid) + 1;
+        data.ioc_inlbuf1 = uuid;
+
+        IOC_PACK(func, data);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_UUID2DEV, buf);
+        if (rc < 0)
+                return errno;
         IOC_UNPACK(func, data);
 
         return data.ioc_dev + N2D_OFF;
@@ -161,8 +173,7 @@ static int do_name2dev(char *func, char *name)
 
 /*
  * resolve a device name to a device number.
- * supports a number or name.
- * FIXME: support UUID
+ * supports a number, $name or %uuid.
  */
 static int parse_devname(char *func, char *name)
 {
@@ -172,16 +183,31 @@ static int parse_devname(char *func, char *name)
         if (!name)
                 return ret;
         if (name[0] == '$') {
-                rc = do_name2dev(func, name + 1);
+                name++;
+                rc = do_name2dev(func, name);
                 if (rc >= N2D_OFF) {
                         ret = rc - N2D_OFF;
-                        printf("%s is device %d\n", name, ret);
+                        printf("Name %s is device %d\n", name, ret);
                 } else {
-                        fprintf(stderr, "error: %s: %s: %s\n", cmdname(func),
-                                name, "device not found");
+                        printf("No device found for name %s: %s\n",
+                               name, strerror(rc));
                 }
-        } else
+        } else if (name[0] == '%') {
+                name++;
+                rc = do_uuid2dev(func, name);
+                if (rc >= N2D_OFF) {
+                        ret = rc - N2D_OFF;
+                        printf("UUID %s is device %d\n", name, ret);
+                } else {
+                        printf("No device found for UUID %s: %s\n",
+                               name, strerror(rc));
+                }
+        } else {
+                /* Assume it's a number.  This means that bogus strings become
+                 * 0.  I might care about that some day. */
                 ret = strtoul(name, NULL, 0);
+                printf("Selected device %d\n", ret);
+        }
 
         return ret;
 }
@@ -380,7 +406,7 @@ int do_disconnect(char *func, int verbose)
         int rc;
         struct obd_ioctl_data data;
 
-        if (conn_addr == -1)
+        if (conn_cookie == -1)
                 return 0;
 
         IOC_INIT(data);
@@ -393,8 +419,8 @@ int do_disconnect(char *func, int verbose)
         } else {
                 if (verbose)
                         printf("%s: disconnected conn "LPX64"\n", cmdname(func),
-                               conn_addr);
-                conn_addr = -1;
+                               conn_cookie);
+                conn_cookie = -1;
         }
 
         return rc;
@@ -548,10 +574,8 @@ int jt_obd_connect(int argc, char **argv)
         if (rc < 0)
                 fprintf(stderr, "error: %s: OBD_IOC_CONNECT %s\n",
                         cmdname(argv[0]), strerror(rc = errno));
-        else {
-                conn_addr = data.ioc_addr;
+        else
                 conn_cookie = data.ioc_cookie;
-        }
         return rc;
 }
 
@@ -560,7 +584,7 @@ int jt_obd_disconnect(int argc, char **argv)
         if (argc != 1)
                 return CMD_HELP;
 
-        if (conn_addr == -1)
+        if (conn_cookie == -1)
                 return 0;
 
         return do_disconnect(argv[0], 0);
@@ -705,19 +729,29 @@ int jt_obd_cleanup(int argc, char **argv)
 {
         struct obd_ioctl_data data;
         char force = 'F';
+        char failover = 'A';
+        char flags[3];
+        int flag_cnt = 0, n;
         int rc;
 
         IOC_INIT(data);
 
-        if (argc != 1 && argc != 2)
+        if (argc < 1 || argc > 3)
                 return CMD_HELP;
 
-        if (argc == 2) {
-                if (strcmp(argv[1], "force"))
+        for (n = 1; n < argc; n++) 
+                if (strcmp(argv[n], "force") == 0) {
+                        flags[flag_cnt++] = force;
+                } else if (strcmp(argv[n], "failover") == 0) {
+                        flags[flag_cnt++] = failover;
+                } else {
+                        fprintf(stderr, "unknown option: %s", argv[n]);
                         return CMD_HELP;
-                data.ioc_inllen1 = 1;
-                data.ioc_inlbuf1 = &force;
-        }
+                }
+
+        data.ioc_inllen1 = flag_cnt;
+        if (flag_cnt)
+                data.ioc_inlbuf1 = flags;
 
         IOC_PACK(argv[0], data);
         rc = l_ioctl(OBD_DEV_ID, OBD_IOC_CLEANUP, buf);
@@ -766,6 +800,25 @@ int jt_obd_set_readonly(int argc, char **argv)
         return rc;
 }
 
+int jt_obd_abort_recovery(int argc, char **argv)
+{
+        struct obd_ioctl_data data;
+        int rc;
+
+        IOC_INIT(data);
+
+        if (argc != 1)
+                return CMD_HELP;
+
+        IOC_PACK(argv[0], data);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_ABORT_RECOVERY, buf);
+        if (rc < 0)
+                fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
+                        strerror(rc = errno));
+
+        return rc;
+}
+
 int jt_obd_newdev(int argc, char **argv)
 {
         int rc;
@@ -789,6 +842,29 @@ int jt_obd_newdev(int argc, char **argv)
         return rc;
 }
 
+int jt_obd_mount_option(int argc, char **argv)
+{
+        int rc;
+        struct obd_ioctl_data data;
+
+        IOC_INIT(data);
+
+        if (argc != 2)
+                return CMD_HELP;
+
+        data.ioc_inllen1 = strlen(argv[1]) + 1;
+        data.ioc_inlbuf1 = argv[1];
+
+        IOC_PACK(argv[0], data);
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_MOUNTOPT, buf);
+        if (rc < 0) {
+                fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
+                        strerror(rc = errno));
+        }
+
+        return rc;
+}
+
 int jt_get_version(int argc, char **argv)
 {
         int rc;
@@ -800,8 +876,7 @@ int jt_get_version(int argc, char **argv)
 
         memset(buf, 0, sizeof(buf));
         data->ioc_version = OBD_IOCTL_VERSION;
-        data->ioc_addr = conn_addr;
-        data->ioc_cookie = conn_addr;
+        data->ioc_cookie = conn_cookie;
         data->ioc_inllen1 = sizeof(buf) - size_round(sizeof(*data));
         data->ioc_len = obd_ioctl_packlen(data);
 
@@ -828,8 +903,7 @@ int jt_obd_list(int argc, char **argv)
 
         memset(buf, 0, sizeof(buf));
         data->ioc_version = OBD_IOCTL_VERSION;
-        data->ioc_addr = conn_addr;
-        data->ioc_cookie = conn_addr;
+        data->ioc_cookie = conn_cookie;
         data->ioc_inllen1 = sizeof(buf) - size_round(sizeof(*data));
         data->ioc_len = obd_ioctl_packlen(data);
 
@@ -887,23 +961,6 @@ int jt_obd_attach(int argc, char **argv)
         return rc;
 }
 
-int jt_obd_name2dev(int argc, char **argv)
-{
-        int rc;
-
-        if (argc != 2)
-                return CMD_HELP;
-
-        rc = do_name2dev(argv[0], argv[1]);
-        if (rc >= N2D_OFF) {
-                int dev = rc - N2D_OFF;
-                rc = do_device(argv[0], dev);
-                if (rc == 0)
-                        printf("%d\n", dev);
-        }
-        return rc;
-}
-
 int jt_obd_setup(int argc, char **argv)
 {
         struct obd_ioctl_data data;
@@ -1047,15 +1104,15 @@ int jt_obd_unset_stripe (int argc, char **argv)
         if (argc != 2)
                 return CMD_HELP;
 
-        id = strtoll (argv[1], &end, 0);
-        if (*end == 0) {
+        id = strtoull (argv[1], &end, 0);
+        if (*end != 0) {
                 fprintf (stderr, "error: %s: invalid object id '%s'\n",
                          cmdname (argv[0]), argv[1]);
                 return CMD_HELP;
         }
 
         IOC_INIT (data);
-        data.ioc_obdo1.o_id = lsm_buffer.lsm.lsm_object_id;
+        data.ioc_obdo1.o_id = id;
         data.ioc_obdo1.o_mode = S_IFREG | 0644;
         data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLMODE;
 
@@ -1070,8 +1127,8 @@ int jt_obd_unset_stripe (int argc, char **argv)
         return (0);
 }
 
-/* Create one or more objects, arg[1] may describe stripe meta-data.  If
- * not, defaults assumed.  This echo-client instances stashes the stripe
+/* Create one or more objects, arg[4] may describe stripe meta-data.  If
+ * not, defaults assumed.  This echo-client instance stashes the stripe
  * object ids.  Use get_stripe on this node to print full lsm and
  * set_stripe on another node to cut/paste between nodes.
  */
@@ -1787,23 +1844,20 @@ int jt_obd_ldlm_regress_stop(int argc, char **argv)
         return rc;
 }
 
-int jt_obd_lov_set_osc_active(int argc, char **argv)
+static int do_activate(int argc, char **argv, int flag)
 {
         struct obd_ioctl_data data;
         int rc;
 
         IOC_INIT(data);
-        if (argc != 3)
+        if (argc != 1)
                 return CMD_HELP;
 
-        data.ioc_inlbuf1 = argv[1];
-        data.ioc_inllen1 = strlen(argv[1]) + 1;
-
         /* reuse offset for 'active' */
-        data.ioc_offset = atoi(argv[2]);
+        data.ioc_offset = flag;
 
         IOC_PACK(argv[0], data);
-        rc = l_ioctl(OBD_DEV_ID, IOC_LOV_SET_OSC_ACTIVE, buf);
+        rc = l_ioctl(OBD_DEV_ID, IOC_OSC_SET_ACTIVE, buf);
         if (rc)
                 fprintf(stderr, "error: %s: failed: %s\n",
                         cmdname(argv[0]), strerror(rc = errno));
@@ -1811,49 +1865,36 @@ int jt_obd_lov_set_osc_active(int argc, char **argv)
         return rc;
 }
 
-int jt_obd_newconn(int argc, char **argv)
+int jt_obd_deactivate(int argc, char **argv)
 {
-        int rc;
-        struct obd_ioctl_data data;
-
-        IOC_INIT(data);
-        if (argc < 2 || argc > 3)
-                return CMD_HELP;
-
-        data.ioc_inllen1 = strlen(argv[1]) + 1;
-        data.ioc_inlbuf1 = argv[1];
-
-        if (argc == 3) {
-                data.ioc_inllen2 = strlen(argv[2]) + 1;
-                data.ioc_inlbuf2 = argv[2];
-        }
-
-        IOC_PACK(argv[0], data);
-        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_RECOVD_NEWCONN, buf);
-        if (rc < 0)
-                fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
-                        strerror(rc = errno));
+        return do_activate(argc, argv, 0);
+}
 
-        return rc;
+int jt_obd_activate(int argc, char **argv)
+{
+        return do_activate(argc, argv, 1);
 }
 
-int jt_obd_failconn(int argc, char **argv)
+int jt_obd_recover(int argc, char **argv)
 {
         int rc;
         struct obd_ioctl_data data;
 
         IOC_INIT(data);
-        if (argc < 2)
+        if (argc > 2)
                 return CMD_HELP;
 
-        data.ioc_inllen1 = strlen(argv[1]) + 1;
-        data.ioc_inlbuf1 = argv[1];
+        if (argc == 2) {
+                data.ioc_inllen1 = strlen(argv[1]) + 1;
+                data.ioc_inlbuf1 = argv[1];
+        }
 
         IOC_PACK(argv[0], data);
-        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_RECOVD_FAILCONN, buf);
-        if (rc < 0)
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_CLIENT_RECOVER, buf);
+        if (rc < 0) {
                 fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]),
                         strerror(rc = errno));
+        }
 
         return rc;
 }
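
Note: parse_devname() above now resolves a device given as a plain index, as '$name' (via OBD_IOC_NAME2DEV) or as '%uuid' (via the new OBD_IOC_UUID2DEV). A small Python sketch of just that prefix dispatch, with stand-in resolver callbacks in place of the ioctls (illustrative only):

def parse_devname(spec, name2dev, uuid2dev):
    # Resolve a device spec: plain number, '$name' or '%uuid'.
    # name2dev/uuid2dev stand in for the NAME2DEV/UUID2DEV ioctls.
    if spec.startswith('$'):
        return name2dev(spec[1:])
    if spec.startswith('%'):
        return uuid2dev(spec[1:])
    try:
        return int(spec, 0)       # accepts decimal, octal and 0x..., like strtoul
    except ValueError:
        return 0                  # like the C code, bogus strings become device 0

# usage with dummy lookup tables
names = {"OST_localhost": 3}
uuids = {"OST_localhost_UUID": 3}
print(parse_devname("$OST_localhost", names.get, uuids.get))       # 3
print(parse_devname("%OST_localhost_UUID", names.get, uuids.get))  # 3
print(parse_devname("2", names.get, uuids.get))                    # 2
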
index 860b908..8fd4f7c 100644 (file)
@@ -91,10 +91,11 @@ int main(int argc, char **argv)
         if (obd_initialize(argc, argv) < 0)
                 exit(1);
 
+        Parser_init("obdctl > ", cmdlist);
+
         if (argc > 1) {
                 rc = Parser_execarg(argc - 1, argv + 1, cmdlist);
         } else {
-                Parser_init("obdctl > ", cmdlist);
                 rc = Parser_commands();
         }
 
index f0e1a97..0203579 100644 (file)
@@ -40,10 +40,11 @@ int jt_obd_detach(int argc, char **argv);
 int jt_obd_cleanup(int argc, char **argv);
 int jt_obd_no_transno(int argc, char **argv);
 int jt_obd_set_readonly(int argc, char **argv);
+int jt_obd_abort_recovery(int argc, char **argv);
 int jt_obd_newdev(int argc, char **argv);
+int jt_obd_mount_option(int argc, char **argv);
 int jt_obd_list(int argc, char **argv);
 int jt_obd_attach(int argc, char **argv);
-int jt_obd_name2dev(int argc, char **argv);
 int jt_obd_setup(int argc, char **argv);
 int jt_obd_create(int argc, char **argv);
 int jt_obd_setattr(int argc, char **argv);
@@ -60,9 +61,9 @@ int jt_obd_test_ldlm(int argc, char **argv);
 int jt_obd_ldlm_regress_start(int argc, char **argv);
 int jt_obd_ldlm_regress_stop(int argc, char **argv);
 int jt_obd_dump_ldlm(int argc, char **argv);
-int jt_obd_lov_set_osc_active(int argc, char **argv);
-int jt_obd_newconn(int argc, char **argv);
-int jt_obd_failconn(int argc, char **argv);
+int jt_obd_activate(int argc, char **argv);
+int jt_obd_deactivate(int argc, char **argv);
+int jt_obd_recover(int argc, char **argv);
 int jt_obd_mdc_lookup(int argc, char **argv);
 int jt_get_version(int argc, char **argv);
 int jt_obd_add_uuid(int argc, char **argv);
index 8c79c67..c871818 100644 (file)
@@ -38,7 +38,6 @@ obdio_iocinit (struct obdio_conn *conn)
 {
         memset (&conn->oc_data, 0, sizeof (conn->oc_data));
         conn->oc_data.ioc_version = OBD_IOCTL_VERSION;
-        conn->oc_data.ioc_addr = conn->oc_conn_addr;
         conn->oc_data.ioc_cookie = conn->oc_conn_cookie;
         conn->oc_data.ioc_len = sizeof (conn->oc_data);
 }
@@ -103,12 +102,11 @@ obdio_connect (int device)
         obdio_iocinit (conn);
         rc = obdio_ioctl (conn, OBD_IOC_CONNECT);
         if (rc != 0) {
-                fprintf (stderr, "obdio_connect: Can't connect to device %d: %s\n",
-                         device, strerror (errno));
+                fprintf(stderr, "obdio_connect: Can't connect to device "
+                        "%d: %s\n", device, strerror (errno));
                 goto failed;
         }
 
-        conn->oc_conn_addr = conn->oc_data.ioc_addr;
         conn->oc_conn_cookie = conn->oc_data.ioc_cookie;
         return (conn);
 
index 9b06941..3811b41 100644 (file)
@@ -2,7 +2,7 @@
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
  *  Copyright (C) 2003 Cluster File Systems, Inc.
- *   Author: Eric Barton <eeb@clusterfs.com> 
+ *   Author: Eric Barton <eeb@clusterfs.com>
  *
  *   This file is part of Lustre, http://www.lustre.org.
  *
@@ -33,8 +33,7 @@
 #include <linux/obd_class.h>
 
 struct obdio_conn {
-        int                   oc_fd;
-        uint64_t               oc_conn_addr;
+        int                    oc_fd;
         uint64_t               oc_conn_cookie;
         struct obd_ioctl_data  oc_data;
         char                   oc_buffer[8192];
@@ -42,25 +41,25 @@ struct obdio_conn {
 
 struct obdio_barrier {
         uint64_t               ob_id;
-       uint64_t               ob_oid;
+        uint64_t               ob_oid;
         uint64_t               ob_npeers;
         uint64_t               ob_ordinal;
         uint64_t               ob_count;
 };
-       
+
 extern struct obdio_conn * obdio_connect (int device);
 extern void obdio_disconnect (struct obdio_conn *conn);
-extern int obdio_open (struct obdio_conn *conn, uint64_t oid, 
-                      struct lustre_handle *fh);
-extern int obdio_close (struct obdio_conn *conn, uint64_t oid, 
-                       struct lustre_handle *fh);
-extern int obdio_pread (struct obdio_conn *conn, uint64_t oid, 
-                       char *buffer, uint32_t count, uint64_t offset);
-extern int obdio_pwrite (struct obdio_conn *conn, uint64_t oid, 
-                        char *buffer, uint32_t count, uint64_t offset);
+extern int obdio_open (struct obdio_conn *conn, uint64_t oid,
+                       struct lustre_handle *fh);
+extern int obdio_close (struct obdio_conn *conn, uint64_t oid,
+                        struct lustre_handle *fh);
+extern int obdio_pread (struct obdio_conn *conn, uint64_t oid,
+                        char *buffer, uint32_t count, uint64_t offset);
+extern int obdio_pwrite (struct obdio_conn *conn, uint64_t oid,
+                         char *buffer, uint32_t count, uint64_t offset);
 extern int obdio_enqueue (struct obdio_conn *conn, uint64_t oid,
-                         int mode, uint64_t offset, uint32_t count,
-                         struct lustre_handle *lh);
+                          int mode, uint64_t offset, uint32_t count,
+                          struct lustre_handle *lh);
 extern int obdio_cancel (struct obdio_conn *conn, struct lustre_handle *lh);
 extern void *obdio_alloc_aligned_buffer (void **spacep, int size);
 extern struct obdio_barrier *obdio_new_barrier (uint64_t oid, uint64_t id, int npeers) ;
index 01085b9..8139fb5 100644 (file)
@@ -27,13 +27,13 @@ struct one_stat *close_reqs;
 struct one_stat *punch_reqs;
 
 struct one_stat *
-init_one_stat (char *basename, char *name) 
+init_one_stat (char *basename, char *name)
 {
        char             fname[1024];
        struct one_stat *stat = (struct one_stat *)malloc (sizeof (*stat));
-       
+
        if (stat == NULL) {
-               fprintf (stderr, "Can't allocate stat %s: %s\n", 
+               fprintf (stderr, "Can't allocate stat %s: %s\n",
                         name, strerror (errno));
                abort ();
        }
@@ -45,7 +45,7 @@ init_one_stat (char *basename, char *name)
 
        stat->fd = open (fname, O_RDONLY);
        if (stat->fd < 0 ) {
-               fprintf (stderr, "Can't open stat %s: %s\n", 
+               fprintf (stderr, "Can't open stat %s: %s\n",
                         fname, strerror (errno));
                abort ();
        }
@@ -54,7 +54,7 @@ init_one_stat (char *basename, char *name)
 }
 
 void
-update_one_stat (struct one_stat *stat) 
+update_one_stat (struct one_stat *stat)
 {
         static char buffer[1024];
        long long prev = stat->current;
@@ -67,7 +67,7 @@ update_one_stat (struct one_stat *stat)
                         stat->name, strerror (errno));
                abort ();
        }
-       
+
        buffer[nob] = 0;
        if (sscanf (buffer, "%Ld", &stat->current) != 1) {
                fprintf (stderr, "Can't parse stat %s: %s\n",
@@ -82,7 +82,7 @@ double
 timenow ()
 {
        struct timeval tv;
-   
+
        gettimeofday (&tv, NULL);
        return (tv.tv_sec + tv.tv_usec / 1000000.0);
 }
@@ -93,7 +93,7 @@ do_stat (void)
        static double last = 0.0;
        double now;
        double t;
-   
+
        now = timenow();
 
        update_one_stat (read_bytes);
@@ -108,7 +108,7 @@ do_stat (void)
        update_one_stat (destroy_reqs);
        update_one_stat (statfs_reqs);
        update_one_stat (punch_reqs);
-       
+
        if (last == 0.0) {
                printf ("R %Ld/%Ld W %Ld/%Ld attr %Ld/%Ld open %Ld/%Ld create %Ld/%Ld stat %Ld punch %Ld\n",
                        read_bytes->current, read_reqs->current,
@@ -125,32 +125,32 @@ do_stat (void)
                        read_bytes->delta / ((1<<20) * t),
                        write_reqs->delta, (int)(write_reqs->delta / t),
                        write_bytes->delta / ((1<<20) * t));
-               
+
                if (getattr_reqs->delta != 0)
                        printf (" ga:%Ld,%d/s", getattr_reqs->delta,
                                (int)(getattr_reqs->delta / t));
-               
+
                if (setattr_reqs->delta != 0)
                        printf (" sa:%Ld", setattr_reqs->delta);
 
                if (open_reqs->delta != 0)
                        printf (" op:%Ld", open_reqs->delta);
-               
+
                if (close_reqs->delta != 0)
                        printf (" cl:%Ld", close_reqs->delta);
 
                if (create_reqs->delta != 0)
                        printf (" cx:%Ld", create_reqs->delta);
-               
+
                if (destroy_reqs->delta != 0)
                        printf (" dx:%Ld", destroy_reqs->delta);
 
                if (statfs_reqs->delta != 0)
                        printf (" st:%Ld", statfs_reqs->delta);
-               
+
                if (punch_reqs->delta != 0)
                        printf (" pu:%Ld", punch_reqs->delta);
-               
+
                printf ("\n");
        }
 
@@ -167,9 +167,9 @@ int main (int argc, char **argv)
           fprintf (stderr, "obd type not specified\n");
           return (1);
        }
-       
+
        snprintf (basedir, sizeof (basedir), "/proc/sys/%s", argv[1]);
-   
+
        if (argc > 2)
                interval = atoi (argv[2]);
 
@@ -190,7 +190,7 @@ int main (int argc, char **argv)
 
        if (interval == 0)
                return (0);
-   
+
        for (;;) {
                sleep (interval);
                do_stat ();
index 0e5a9f0..fef987b 100644 (file)
 #define READLINE_LIBRARY
 #include <readline/readline.h>
 
-//extern char **completion_matches __P((char *, rl_compentry_func_t *));
+/* completion_matches() is #if 0-ed out in modern glibc */
+#ifndef completion_matches
+#define completion_matches rl_completion_matches
+#endif
 extern void using_history(void);
 extern void stifle_history(int);
 extern void add_history(char *);
diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c
new file mode 100644 (file)
index 0000000..5b6a589
--- /dev/null
@@ -0,0 +1,588 @@
+#include <stdio.h>
+#include <liblustre.h>
+#include <linux/lustre_lib.h>
+#include <linux/lustre_idl.h>
+
+#define BLANK_LINE()                           \
+do {                                           \
+       printf ("\n");                          \
+} while (0)
+
+#define COMMENT(c)                             \
+do {                                           \
+       printf ("        /* "c" */\n");         \
+} while (0)
+
+#define STRINGIFY(a) #a
+
+#define CHECK_DEFINE(a)                                                \
+do {                                                           \
+       printf("        LASSERT ("#a" == "STRINGIFY(a)");\n");  \
+} while (0)
+
+#define CHECK_VALUE(a)                                 \
+do {                                                   \
+       printf("        LASSERT ("#a" == %d);\n", a);   \
+} while (0)
+
+#define CHECK_MEMBER_OFFSET(s,m)               \
+do {                                           \
+       CHECK_VALUE(offsetof (struct s, m));    \
+} while (0)
+
+#define CHECK_MEMBER_SIZEOF(s,m)                       \
+do {                                                   \
+       CHECK_VALUE((int)sizeof(((struct s *)0)->m));   \
+} while (0)
+
+#define CHECK_MEMBER(s,m)                      \
+do {                                           \
+       CHECK_MEMBER_OFFSET(s, m);              \
+       CHECK_MEMBER_SIZEOF(s, m);              \
+} while (0)
+
+#define CHECK_STRUCT(s)                                \
+do {                                           \
+        COMMENT("Checks for struct "#s);       \
+       CHECK_VALUE((int)sizeof(struct s));     \
+} while (0)
+
+
+
+void check1 (void)
+{
+#define VALUE 1234567
+
+       CHECK_VALUE (VALUE);
+       CHECK_DEFINE (VALUE);
+}
+
+void
+check_lustre_handle (void) 
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (lustre_handle);
+       CHECK_MEMBER (lustre_handle, cookie);
+}
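
Note: wirecheck.c's CHECK_STRUCT/CHECK_MEMBER macros above emit LASSERT() lines that pin down the sizeof() of each wire struct and the offsetof()/sizeof() of every member, so layout drift in the on-wire protocol is caught by the generated checks. The same idea sketched in Python with ctypes, using a made-up single-field handle struct (the layout here is an assumption for illustration, not the real lustre_handle definition):

import ctypes

class wire_handle(ctypes.Structure):
    # made-up layout, purely for illustration
    _fields_ = [("cookie", ctypes.c_uint64)]

def check_member(struct, name, want_offset, want_size):
    # Assert a member's offset and size, like CHECK_MEMBER() does via LASSERT.
    field = getattr(struct, name)
    assert field.offset == want_offset, (name, field.offset, want_offset)
    assert field.size == want_size, (name, field.size, want_size)

assert ctypes.sizeof(wire_handle) == 8       # CHECK_STRUCT analogue
check_member(wire_handle, "cookie", 0, 8)    # CHECK_MEMBER analogue
print("wire_handle layout ok")
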
+
+void
+check_lustre_msg (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (lustre_msg);
+       CHECK_MEMBER (lustre_msg, handle);
+       CHECK_MEMBER (lustre_msg, magic);
+       CHECK_MEMBER (lustre_msg, type);
+       CHECK_MEMBER (lustre_msg, version);
+       CHECK_MEMBER (lustre_msg, opc);
+       CHECK_MEMBER (lustre_msg, last_xid);
+       CHECK_MEMBER (lustre_msg, last_committed);
+       CHECK_MEMBER (lustre_msg, transno);
+       CHECK_MEMBER (lustre_msg, status);
+       CHECK_MEMBER (lustre_msg, flags);
+       CHECK_MEMBER (lustre_msg, bufcount);
+       CHECK_MEMBER (lustre_msg, buflens[7]);
+}
+
+void
+check_obdo (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (obdo);
+       CHECK_MEMBER (obdo, o_id);
+       CHECK_MEMBER (obdo, o_gr);
+       CHECK_MEMBER (obdo, o_atime);
+       CHECK_MEMBER (obdo, o_mtime);
+       CHECK_MEMBER (obdo, o_ctime);
+       CHECK_MEMBER (obdo, o_size);
+       CHECK_MEMBER (obdo, o_blocks);
+       CHECK_MEMBER (obdo, o_rdev);
+       CHECK_MEMBER (obdo, o_blksize);
+       CHECK_MEMBER (obdo, o_mode);
+       CHECK_MEMBER (obdo, o_uid);
+       CHECK_MEMBER (obdo, o_gid);
+       CHECK_MEMBER (obdo, o_flags);
+       CHECK_MEMBER (obdo, o_nlink);
+       CHECK_MEMBER (obdo, o_generation);
+       CHECK_MEMBER (obdo, o_valid);
+       CHECK_MEMBER (obdo, o_obdflags);
+       CHECK_MEMBER (obdo, o_easize);
+       CHECK_MEMBER (obdo, o_inline);
+}
+
+void
+check_obd_statfs (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (obd_statfs);
+       CHECK_MEMBER (obd_statfs, os_type);
+       CHECK_MEMBER (obd_statfs, os_blocks);
+       CHECK_MEMBER (obd_statfs, os_bfree);
+       CHECK_MEMBER (obd_statfs, os_bavail);
+       CHECK_MEMBER (obd_statfs, os_ffree);
+       CHECK_MEMBER (obd_statfs, os_fsid);
+       CHECK_MEMBER (obd_statfs, os_bsize);
+       CHECK_MEMBER (obd_statfs, os_namelen);
+}
+
+void
+check_obd_ioobj (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (obd_ioobj);
+       CHECK_MEMBER (obd_ioobj, ioo_id);
+       CHECK_MEMBER (obd_ioobj, ioo_gr);
+       CHECK_MEMBER (obd_ioobj, ioo_type);
+       CHECK_MEMBER (obd_ioobj, ioo_bufcnt);
+}
+
+void
+check_niobuf_remote (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (niobuf_remote);
+       CHECK_MEMBER (niobuf_remote, offset);
+       CHECK_MEMBER (niobuf_remote, len);
+       CHECK_MEMBER (niobuf_remote, flags);
+}
+
+void
+check_ost_body (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (ost_body);
+       CHECK_MEMBER (ost_body, oa);
+}
+
+void
+check_ll_fid (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (ll_fid);
+       CHECK_MEMBER (ll_fid, id);
+       CHECK_MEMBER (ll_fid, generation);
+       CHECK_MEMBER (ll_fid, f_type);
+}
+
+void
+check_mds_status_req (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (mds_status_req);
+       CHECK_MEMBER (mds_status_req, flags);
+       CHECK_MEMBER (mds_status_req, repbuf);
+}
+
+void
+check_mds_fileh_body (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (mds_fileh_body);
+       CHECK_MEMBER (mds_fileh_body, f_fid);
+}
+
+void
+check_mds_body (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (mds_body);
+       CHECK_MEMBER (mds_body, fid1);
+       CHECK_MEMBER (mds_body, fid2);
+       CHECK_MEMBER (mds_body, handle);
+       CHECK_MEMBER (mds_body, size);
+       CHECK_MEMBER (mds_body, blocks);
+       CHECK_MEMBER (mds_body, ino);
+       CHECK_MEMBER (mds_body, valid);
+       CHECK_MEMBER (mds_body, fsuid);
+       CHECK_MEMBER (mds_body, fsgid);
+       CHECK_MEMBER (mds_body, capability);
+       CHECK_MEMBER (mds_body, mode);
+       CHECK_MEMBER (mds_body, uid);
+       CHECK_MEMBER (mds_body, gid);
+       CHECK_MEMBER (mds_body, mtime);
+       CHECK_MEMBER (mds_body, ctime);
+       CHECK_MEMBER (mds_body, atime);
+       CHECK_MEMBER (mds_body, flags);
+       CHECK_MEMBER (mds_body, rdev);
+       CHECK_MEMBER (mds_body, nlink);
+       CHECK_MEMBER (mds_body, generation);
+       CHECK_MEMBER (mds_body, suppgid);
+}
+
+void
+check_mds_rec_setattr (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (mds_rec_setattr);
+       CHECK_MEMBER (mds_rec_setattr, sa_opcode);
+       CHECK_MEMBER (mds_rec_setattr, sa_fsuid);
+       CHECK_MEMBER (mds_rec_setattr, sa_fsgid);
+       CHECK_MEMBER (mds_rec_setattr, sa_cap);
+       CHECK_MEMBER (mds_rec_setattr, sa_reserved);
+       CHECK_MEMBER (mds_rec_setattr, sa_valid);
+       CHECK_MEMBER (mds_rec_setattr, sa_fid);
+       CHECK_MEMBER (mds_rec_setattr, sa_mode);
+       CHECK_MEMBER (mds_rec_setattr, sa_uid);
+       CHECK_MEMBER (mds_rec_setattr, sa_gid);
+       CHECK_MEMBER (mds_rec_setattr, sa_attr_flags);
+       CHECK_MEMBER (mds_rec_setattr, sa_size);
+       CHECK_MEMBER (mds_rec_setattr, sa_atime);
+       CHECK_MEMBER (mds_rec_setattr, sa_mtime);
+       CHECK_MEMBER (mds_rec_setattr, sa_ctime);
+       CHECK_MEMBER (mds_rec_setattr, sa_suppgid);
+}
+
+void
+check_mds_rec_create (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (mds_rec_create);
+       CHECK_MEMBER (mds_rec_create, cr_opcode);
+       CHECK_MEMBER (mds_rec_create, cr_fsuid);
+       CHECK_MEMBER (mds_rec_create, cr_fsgid);
+       CHECK_MEMBER (mds_rec_create, cr_cap);
+       CHECK_MEMBER (mds_rec_create, cr_flags);
+       CHECK_MEMBER (mds_rec_create, cr_mode);
+       CHECK_MEMBER (mds_rec_create, cr_fid);
+       CHECK_MEMBER (mds_rec_create, cr_replayfid);
+       CHECK_MEMBER (mds_rec_create, cr_uid);
+       CHECK_MEMBER (mds_rec_create, cr_gid);
+       CHECK_MEMBER (mds_rec_create, cr_time);
+       CHECK_MEMBER (mds_rec_create, cr_rdev);
+       CHECK_MEMBER (mds_rec_create, cr_suppgid);
+}
+
+void
+check_mds_rec_link (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (mds_rec_link);
+       CHECK_MEMBER (mds_rec_link, lk_opcode);
+       CHECK_MEMBER (mds_rec_link, lk_fsuid);
+       CHECK_MEMBER (mds_rec_link, lk_fsgid);
+       CHECK_MEMBER (mds_rec_link, lk_cap);
+       CHECK_MEMBER (mds_rec_link, lk_suppgid1);
+       CHECK_MEMBER (mds_rec_link, lk_suppgid2);
+       CHECK_MEMBER (mds_rec_link, lk_fid1);
+       CHECK_MEMBER (mds_rec_link, lk_fid2);
+}
+
+void
+check_mds_rec_unlink (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (mds_rec_unlink);
+       CHECK_MEMBER (mds_rec_unlink, ul_opcode);
+       CHECK_MEMBER (mds_rec_unlink, ul_fsuid);
+       CHECK_MEMBER (mds_rec_unlink, ul_fsgid);
+       CHECK_MEMBER (mds_rec_unlink, ul_cap);
+       CHECK_MEMBER (mds_rec_unlink, ul_reserved);
+       CHECK_MEMBER (mds_rec_unlink, ul_mode);
+       CHECK_MEMBER (mds_rec_unlink, ul_suppgid);
+       CHECK_MEMBER (mds_rec_unlink, ul_fid1);
+       CHECK_MEMBER (mds_rec_unlink, ul_fid2);
+}
+
+void
+check_mds_rec_rename (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (mds_rec_rename);
+       CHECK_MEMBER (mds_rec_rename, rn_opcode);
+       CHECK_MEMBER (mds_rec_rename, rn_fsuid);
+       CHECK_MEMBER (mds_rec_rename, rn_fsgid);
+       CHECK_MEMBER (mds_rec_rename, rn_cap);
+       CHECK_MEMBER (mds_rec_rename, rn_suppgid1);
+       CHECK_MEMBER (mds_rec_rename, rn_suppgid2);
+       CHECK_MEMBER (mds_rec_rename, rn_fid1);
+       CHECK_MEMBER (mds_rec_rename, rn_fid2);
+}
+
+void
+check_lov_desc (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (lov_desc);
+       CHECK_MEMBER (lov_desc, ld_tgt_count);
+       CHECK_MEMBER (lov_desc, ld_active_tgt_count);
+       CHECK_MEMBER (lov_desc, ld_default_stripe_count);
+       CHECK_MEMBER (lov_desc, ld_default_stripe_size);
+       CHECK_MEMBER (lov_desc, ld_default_stripe_offset);
+       CHECK_MEMBER (lov_desc, ld_pattern);
+       CHECK_MEMBER (lov_desc, ld_uuid);
+}
+
+void
+check_ldlm_res_id (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (ldlm_res_id);
+       CHECK_MEMBER (ldlm_res_id, name[RES_NAME_SIZE]);
+}
+
+void
+check_ldlm_extent (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (ldlm_extent);
+       CHECK_MEMBER (ldlm_extent, start);
+       CHECK_MEMBER (ldlm_extent, end);
+}
+
+void
+check_ldlm_intent (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (ldlm_intent);
+       CHECK_MEMBER (ldlm_intent, opc);
+}
+
+void
+check_ldlm_resource_desc (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (ldlm_resource_desc);
+       CHECK_MEMBER (ldlm_resource_desc, lr_type);
+       CHECK_MEMBER (ldlm_resource_desc, lr_name);
+       CHECK_MEMBER (ldlm_resource_desc, lr_version[RES_VERSION_SIZE]);
+}
+
+void
+check_ldlm_lock_desc (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (ldlm_lock_desc);
+       CHECK_MEMBER (ldlm_lock_desc, l_resource);
+       CHECK_MEMBER (ldlm_lock_desc, l_req_mode);
+       CHECK_MEMBER (ldlm_lock_desc, l_granted_mode);
+       CHECK_MEMBER (ldlm_lock_desc, l_extent);
+       CHECK_MEMBER (ldlm_lock_desc, l_version[RES_VERSION_SIZE]);
+}
+
+void
+check_ldlm_request (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (ldlm_request);
+       CHECK_MEMBER (ldlm_request, lock_flags);
+       CHECK_MEMBER (ldlm_request, lock_desc);
+       CHECK_MEMBER (ldlm_request, lock_handle1);
+       CHECK_MEMBER (ldlm_request, lock_handle2);
+}
+
+void
+check_ldlm_reply (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (ldlm_reply);
+       CHECK_MEMBER (ldlm_reply, lock_flags);
+       CHECK_MEMBER (ldlm_reply, lock_mode);
+       CHECK_MEMBER (ldlm_reply, lock_resource_name);
+       CHECK_MEMBER (ldlm_reply, lock_handle);
+       CHECK_MEMBER (ldlm_reply, lock_extent);
+       CHECK_MEMBER (ldlm_reply, lock_policy_res1);
+       CHECK_MEMBER (ldlm_reply, lock_policy_res2);
+}
+
+void
+check_ptlbd_op (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (ptlbd_op);
+       CHECK_MEMBER (ptlbd_op, op_cmd);
+       CHECK_MEMBER (ptlbd_op, op_lun);
+       CHECK_MEMBER (ptlbd_op, op_niob_cnt);
+       CHECK_MEMBER (ptlbd_op, op__padding);
+       CHECK_MEMBER (ptlbd_op, op_block_cnt);
+}
+
+void
+check_ptlbd_niob (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (ptlbd_niob);
+       CHECK_MEMBER (ptlbd_niob, n_xid);
+       CHECK_MEMBER (ptlbd_niob, n_block_nr);
+       CHECK_MEMBER (ptlbd_niob, n_offset);
+       CHECK_MEMBER (ptlbd_niob, n_length);
+}
+
+void
+check_ptlbd_rsp (void)
+{
+       BLANK_LINE ();
+       CHECK_STRUCT (ptlbd_rsp);
+       CHECK_MEMBER (ptlbd_rsp, r_status);
+       CHECK_MEMBER (ptlbd_rsp, r_error_cnt);
+}
+
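+/* Prints the body of lustre_assert_wire_constants(): constant value/define checks first, then the structure size/offset checks above */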
+int
+main (int argc, char **argv)
+{
+       printf ("void lustre_assert_wire_constants (void)\n"
+               "{\n");
+
+       COMMENT ("Wire protocol assertions generated by 'wirecheck'");
+       BLANK_LINE ();
+
+       COMMENT ("Constants...");
+       CHECK_DEFINE (PTLRPC_MSG_MAGIC);
+       CHECK_DEFINE (PTLRPC_MSG_VERSION);
+
+       CHECK_VALUE (PTL_RPC_MSG_REQUEST);
+       CHECK_VALUE (PTL_RPC_MSG_ERR);
+       CHECK_VALUE (PTL_RPC_MSG_REPLY);
+
+       CHECK_VALUE (MSG_LAST_REPLAY);
+       CHECK_VALUE (MSG_RESENT);
+
+       CHECK_VALUE (MSG_CONNECT_RECOVERING);
+       CHECK_VALUE (MSG_CONNECT_RECONNECT);
+       CHECK_VALUE (MSG_CONNECT_REPLAYABLE);
+
+       CHECK_VALUE (OST_REPLY);
+       CHECK_VALUE (OST_GETATTR);
+       CHECK_VALUE (OST_SETATTR);
+       CHECK_VALUE (OST_READ);
+       CHECK_VALUE (OST_WRITE);
+       CHECK_VALUE (OST_CREATE);
+       CHECK_VALUE (OST_DESTROY);
+       CHECK_VALUE (OST_GET_INFO);
+       CHECK_VALUE (OST_CONNECT);
+       CHECK_VALUE (OST_DISCONNECT);
+       CHECK_VALUE (OST_PUNCH);
+       CHECK_VALUE (OST_OPEN);
+       CHECK_VALUE (OST_CLOSE);
+       CHECK_VALUE (OST_STATFS);
+       CHECK_VALUE (OST_SAN_READ);
+       CHECK_VALUE (OST_SAN_WRITE);
+       CHECK_VALUE (OST_SYNCFS);
+       CHECK_VALUE (OST_LAST_OPC);
+       CHECK_VALUE (OST_FIRST_OPC);
+
+       CHECK_VALUE (OBD_FL_INLINEDATA);
+       CHECK_VALUE (OBD_FL_OBDMDEXISTS);
+
+       CHECK_VALUE (LOV_MAGIC);
+
+       CHECK_VALUE (OBD_MD_FLALL);
+       CHECK_VALUE (OBD_MD_FLID);
+       CHECK_VALUE (OBD_MD_FLATIME);
+       CHECK_VALUE (OBD_MD_FLMTIME);
+       CHECK_VALUE (OBD_MD_FLCTIME);
+       CHECK_VALUE (OBD_MD_FLSIZE);
+       CHECK_VALUE (OBD_MD_FLBLOCKS);
+       CHECK_VALUE (OBD_MD_FLBLKSZ);
+       CHECK_VALUE (OBD_MD_FLMODE);
+       CHECK_VALUE (OBD_MD_FLTYPE);
+       CHECK_VALUE (OBD_MD_FLUID);
+       CHECK_VALUE (OBD_MD_FLGID);
+       CHECK_VALUE (OBD_MD_FLFLAGS);
+       CHECK_VALUE (OBD_MD_FLOBDFLG);
+       CHECK_VALUE (OBD_MD_FLNLINK);
+       CHECK_VALUE (OBD_MD_FLGENER);
+       CHECK_VALUE (OBD_MD_FLINLINE);
+       CHECK_VALUE (OBD_MD_FLRDEV);
+       CHECK_VALUE (OBD_MD_FLEASIZE);
+       CHECK_VALUE (OBD_MD_LINKNAME);
+       CHECK_VALUE (OBD_MD_FLHANDLE);
+       CHECK_VALUE (OBD_MD_FLCKSUM);
+
+       CHECK_VALUE (OBD_BRW_READ);
+       CHECK_VALUE (OBD_BRW_WRITE);
+       CHECK_VALUE (OBD_BRW_CREATE);
+       CHECK_VALUE (OBD_BRW_SYNC);
+
+       CHECK_DEFINE (OBD_OBJECT_EOF);
+
+       CHECK_VALUE (OST_REQ_HAS_OA1);
+
+       CHECK_VALUE (MDS_GETATTR);
+       CHECK_VALUE (MDS_GETATTR_NAME);
+       CHECK_VALUE (MDS_CLOSE);
+       CHECK_VALUE (MDS_REINT);
+       CHECK_VALUE (MDS_READPAGE);
+       CHECK_VALUE (MDS_CONNECT);
+       CHECK_VALUE (MDS_DISCONNECT);
+       CHECK_VALUE (MDS_GETSTATUS);
+       CHECK_VALUE (MDS_STATFS);
+       CHECK_VALUE (MDS_GETLOVINFO);
+       CHECK_VALUE (MDS_LAST_OPC);
+       CHECK_VALUE (MDS_FIRST_OPC);
+
+       CHECK_VALUE (REINT_SETATTR);
+       CHECK_VALUE (REINT_CREATE);
+       CHECK_VALUE (REINT_LINK);
+       CHECK_VALUE (REINT_UNLINK);
+       CHECK_VALUE (REINT_RENAME);
+       CHECK_VALUE (REINT_OPEN);
+       CHECK_VALUE (REINT_MAX);
+
+       CHECK_VALUE (IT_INTENT_EXEC);
+       CHECK_VALUE (IT_OPEN_LOOKUP);
+       CHECK_VALUE (IT_OPEN_NEG);
+       CHECK_VALUE (IT_OPEN_POS);
+       CHECK_VALUE (IT_OPEN_CREATE);
+       CHECK_VALUE (IT_OPEN_OPEN);
+
+       CHECK_VALUE (MDS_STATUS_CONN);
+       CHECK_VALUE (MDS_STATUS_LOV);
+
+       CHECK_VALUE (MDS_OPEN_HAS_EA);
+
+       CHECK_VALUE (LOV_RAID0);
+       CHECK_VALUE (LOV_RAIDRR);
+
+       CHECK_VALUE (LDLM_ENQUEUE);
+       CHECK_VALUE (LDLM_CONVERT);
+       CHECK_VALUE (LDLM_CANCEL);
+       CHECK_VALUE (LDLM_BL_CALLBACK);
+       CHECK_VALUE (LDLM_CP_CALLBACK);
+       CHECK_VALUE (LDLM_LAST_OPC);
+       CHECK_VALUE (LDLM_FIRST_OPC);
+
+       CHECK_VALUE (PTLBD_QUERY);
+       CHECK_VALUE (PTLBD_READ);
+       CHECK_VALUE (PTLBD_WRITE);
+       CHECK_VALUE (PTLBD_FLUSH);
+       CHECK_VALUE (PTLBD_CONNECT);
+       CHECK_VALUE (PTLBD_DISCONNECT);
+       CHECK_VALUE (PTLBD_LAST_OPC);
+       CHECK_VALUE (PTLBD_FIRST_OPC);
+
+       CHECK_VALUE (OBD_PING);
+
+       COMMENT ("Sizes and Offsets");
+       BLANK_LINE ();
+       check_lustre_handle ();
+       check_lustre_msg ();
+       check_obdo ();
+       check_obd_statfs ();
+       check_obd_ioobj ();
+       check_niobuf_remote ();
+       check_ost_body ();
+       check_ll_fid ();
+       check_mds_status_req ();
+       check_mds_fileh_body ();
+       check_mds_body ();
+       check_mds_rec_setattr ();
+       check_mds_rec_create ();
+       check_mds_rec_link ();
+       check_mds_rec_unlink ();
+       check_mds_rec_rename ();
+       check_lov_desc ();
+       check_ldlm_res_id ();
+       check_ldlm_extent ();
+       check_ldlm_intent ();
+       check_ldlm_resource_desc ();
+       check_ldlm_lock_desc ();
+       check_ldlm_request ();
+       check_ldlm_reply ();
+       check_ptlbd_op ();
+       check_ptlbd_niob ();
+       check_ptlbd_rsp ();
+
+       printf ("}\n\n");
+
+       return (0);
+}